[v3,4/5] common/octeontx2: fix build with sve enabled

Message ID 20210112025709.1121523-5-ruifeng.wang@arm.com (mailing list archive)
State Accepted, archived
Delegated to: David Marchand
Headers
Series lpm lookup with sve support |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Ruifeng Wang Jan. 12, 2021, 2:57 a.m. UTC
  Building with gcc 10.2 with the SVE extension enabled fails with the following errors:

{standard input}: Assembler messages:
{standard input}:4002: Error: selected processor does not support `mov z3.b,#0'
{standard input}:4003: Error: selected processor does not support `whilelo p1.b,xzr,x7'
{standard input}:4005: Error: selected processor does not support `ld1b z0.b,p1/z,[x8]'
{standard input}:4006: Error: selected processor does not support `whilelo p4.s,wzr,w7'

This is because the inline assembly code explicitly resets the cpu model to
one without SVE support. As a result, the SVE instructions generated by the
compiler's auto-vectorization are rejected by the assembler.

Add SVE to the cpu model specified by the inline assembly when SVE is enabled.
The inline assembly is not replaced with C atomics because the driver relies
on specific LSE instructions to interface with the co-processor [1].

Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
Cc: jerinj@marvell.com
Cc: stable@dpdk.org

[1] https://mails.dpdk.org/archives/dev/2021-January/196092.html

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
v3:
Keep inline assembly and add sve extension to fix issue. (Pavan)

 drivers/common/octeontx2/otx2_io_arm64.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
  

Comments

Jerin Jacob Jan. 12, 2021, 4:38 a.m. UTC | #1
On Tue, Jan 12, 2021 at 8:28 AM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>
> Building with gcc 10.2 with SVE extension enabled got error:
>
> {standard input}: Assembler messages:
> {standard input}:4002: Error: selected processor does not support `mov z3.b,#0'
> {standard input}:4003: Error: selected processor does not support `whilelo p1.b,xzr,x7'
> {standard input}:4005: Error: selected processor does not support `ld1b z0.b,p1/z,[x8]'
> {standard input}:4006: Error: selected processor does not support `whilelo p4.s,wzr,w7'
>
> This is because inline assembly code explicitly resets cpu model to
> not have SVE support. Thus SVE instructions generated by compiler
> auto vectorization got rejected by assembler.
>
> Added SVE to the cpu model specified by inline assembly for SVE support.
> Not replacing the inline assembly with C atomics because the driver relies
> on specific LSE instruction to interface to co-processor [1].
>
> Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
> Cc: jerinj@marvell.com
> Cc: stable@dpdk.org

Reviewed-by: Jerin Jacob <jerinj@marvell.com>



>
> [1] https://mails.dpdk.org/archives/dev/2021-January/196092.html
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
> v3:
> Keep inline assembly and add sve extension to fix issue. (Pavan)
>
>  drivers/common/octeontx2/otx2_io_arm64.h | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/common/octeontx2/otx2_io_arm64.h b/drivers/common/octeontx2/otx2_io_arm64.h
> index b5c85d9a6..34268e3af 100644
> --- a/drivers/common/octeontx2/otx2_io_arm64.h
> +++ b/drivers/common/octeontx2/otx2_io_arm64.h
> @@ -21,6 +21,12 @@
>  #define otx2_prefetch_store_keep(ptr) ({\
>         asm volatile("prfm pstl1keep, [%x0]\n" : : "r" (ptr)); })
>
> +#if defined(__ARM_FEATURE_SVE)
> +#define __LSE_PREAMBLE " .cpu  generic+lse+sve\n"
> +#else
> +#define __LSE_PREAMBLE " .cpu  generic+lse\n"
> +#endif
> +
>  static __rte_always_inline uint64_t
>  otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
>  {
> @@ -28,7 +34,7 @@ otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
>
>         /* Atomic add with no ordering */
>         asm volatile (
> -               ".cpu  generic+lse\n"
> +               __LSE_PREAMBLE
>                 "ldadd %x[i], %x[r], [%[b]]"
>                 : [r] "=r" (result), "+m" (*ptr)
>                 : [i] "r" (incr), [b] "r" (ptr)
> @@ -43,7 +49,7 @@ otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
>
>         /* Atomic add with ordering */
>         asm volatile (
> -               ".cpu  generic+lse\n"
> +               __LSE_PREAMBLE
>                 "ldadda %x[i], %x[r], [%[b]]"
>                 : [r] "=r" (result), "+m" (*ptr)
>                 : [i] "r" (incr), [b] "r" (ptr)
> @@ -57,7 +63,7 @@ otx2_lmt_submit(rte_iova_t io_address)
>         uint64_t result;
>
>         asm volatile (
> -               ".cpu  generic+lse\n"
> +               __LSE_PREAMBLE
>                 "ldeor xzr,%x[rf],[%[rs]]" :
>                  [rf] "=r"(result): [rs] "r"(io_address));
>         return result;
> @@ -69,7 +75,7 @@ otx2_lmt_submit_release(rte_iova_t io_address)
>         uint64_t result;
>
>         asm volatile (
> -               ".cpu  generic+lse\n"
> +               __LSE_PREAMBLE
>                 "ldeorl xzr,%x[rf],[%[rs]]" :
>                  [rf] "=r"(result) : [rs] "r"(io_address));
>         return result;
> @@ -104,4 +110,5 @@ otx2_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
>                 dst128[i] = src128[i];
>  }
>
> +#undef __LSE_PREAMBLE
>  #endif /* _OTX2_IO_ARM64_H_ */
> --
> 2.25.1
>
  

Patch

diff --git a/drivers/common/octeontx2/otx2_io_arm64.h b/drivers/common/octeontx2/otx2_io_arm64.h
index b5c85d9a6..34268e3af 100644
--- a/drivers/common/octeontx2/otx2_io_arm64.h
+++ b/drivers/common/octeontx2/otx2_io_arm64.h
@@ -21,6 +21,12 @@ 
 #define otx2_prefetch_store_keep(ptr) ({\
 	asm volatile("prfm pstl1keep, [%x0]\n" : : "r" (ptr)); })
 
+#if defined(__ARM_FEATURE_SVE)
+#define __LSE_PREAMBLE " .cpu  generic+lse+sve\n"
+#else
+#define __LSE_PREAMBLE " .cpu  generic+lse\n"
+#endif
+
 static __rte_always_inline uint64_t
 otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
 {
@@ -28,7 +34,7 @@  otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
 
 	/* Atomic add with no ordering */
 	asm volatile (
-		".cpu  generic+lse\n"
+		__LSE_PREAMBLE
 		"ldadd %x[i], %x[r], [%[b]]"
 		: [r] "=r" (result), "+m" (*ptr)
 		: [i] "r" (incr), [b] "r" (ptr)
@@ -43,7 +49,7 @@  otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
 
 	/* Atomic add with ordering */
 	asm volatile (
-		".cpu  generic+lse\n"
+		__LSE_PREAMBLE
 		"ldadda %x[i], %x[r], [%[b]]"
 		: [r] "=r" (result), "+m" (*ptr)
 		: [i] "r" (incr), [b] "r" (ptr)
@@ -57,7 +63,7 @@  otx2_lmt_submit(rte_iova_t io_address)
 	uint64_t result;
 
 	asm volatile (
-		".cpu  generic+lse\n"
+		__LSE_PREAMBLE
 		"ldeor xzr,%x[rf],[%[rs]]" :
 		 [rf] "=r"(result): [rs] "r"(io_address));
 	return result;
@@ -69,7 +75,7 @@  otx2_lmt_submit_release(rte_iova_t io_address)
 	uint64_t result;
 
 	asm volatile (
-		".cpu  generic+lse\n"
+		__LSE_PREAMBLE
 		"ldeorl xzr,%x[rf],[%[rs]]" :
 		 [rf] "=r"(result) : [rs] "r"(io_address));
 	return result;
@@ -104,4 +110,5 @@  otx2_lmt_mov_seg(void *out, const void *in, const uint16_t segdw)
 		dst128[i] = src128[i];
 }
 
+#undef __LSE_PREAMBLE
 #endif /* _OTX2_IO_ARM64_H_ */