[v2,4/5] common/octeontx2: fix build with sve enabled

Message ID 20210108082523.1062058-5-ruifeng.wang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Series: lpm lookup with sve support

Checks

ci/checkpatch: warning (coding style issues)

Commit Message

Ruifeng Wang Jan. 8, 2021, 8:25 a.m. UTC
  Building with gcc 10.2 with the SVE extension enabled fails with assembler errors:

{standard input}: Assembler messages:
{standard input}:4002: Error: selected processor does not support `mov z3.b,#0'
{standard input}:4003: Error: selected processor does not support `whilelo p1.b,xzr,x7'
{standard input}:4005: Error: selected processor does not support `ld1b z0.b,p1/z,[x8]'
{standard input}:4006: Error: selected processor does not support `whilelo p4.s,wzr,w7'

This is because the inline assembly explicitly resets the assembler's CPU
model to one without SVE support, so the SVE instructions generated by
compiler auto-vectorization are rejected by the assembler. (A minimal
standalone sketch of this failure mode follows the diffstat below.)

Fix the issue by replacing the inline assembly with equivalent atomic
built-ins. The compiler will generate LSE instructions for CPUs that have
the extension.

Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
Cc: jerinj@marvell.com
Cc: stable@dpdk.org

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/common/octeontx2/otx2_io_arm64.h | 37 +++---------------------
 1 file changed, 4 insertions(+), 33 deletions(-)
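
For context, a minimal standalone sketch of the failure mode described in the
commit message, assuming gcc 10.2 and an SVE-capable target such as
-march=armv8.2-a+sve; the file and function names (sketch.c,
atomic_add_nosync, byte_copy) are illustrative and not taken from the driver:

/* sketch.c - illustrative only; compile with something like:
 *   gcc -O3 -march=armv8.2-a+sve -c sketch.c
 *
 * The ".cpu generic+lse" directive emitted by the inline asm lowers the
 * assembler's target for the rest of the generated assembly, so SVE
 * instructions produced by auto-vectorization of byte_copy() may then be
 * rejected with "selected processor does not support ...".
 */
#include <stdint.h>

static inline uint64_t
atomic_add_nosync(int64_t incr, int64_t *ptr)
{
	uint64_t result;

	asm volatile (
		".cpu  generic+lse\n"
		"ldadd %x[i], %x[r], [%[b]]"
		: [r] "=r" (result), "+m" (*ptr)
		: [i] "r" (incr), [b] "r" (ptr)
		: "memory");
	return result;
}

uint64_t
bump(int64_t *counter)
{
	return atomic_add_nosync(1, counter);
}

void
byte_copy(uint8_t *dst, const uint8_t *src, uint64_t n)
{
	/* A simple copy loop like this is a candidate for SVE
	 * auto-vectorization at -O3.
	 */
	for (uint64_t i = 0; i < n; i++)
		dst[i] = src[i];
}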
  

Comments

Pavan Nikhilesh Bhagavatula Jan. 8, 2021, 10:29 a.m. UTC | #1
Hi Ruifeng,

>Building with gcc 10.2 with SVE extension enabled got error:
>
>{standard input}: Assembler messages:
>{standard input}:4002: Error: selected processor does not support `mov
>z3.b,#0'
>{standard input}:4003: Error: selected processor does not support
>`whilelo p1.b,xzr,x7'
>{standard input}:4005: Error: selected processor does not support `ld1b
>z0.b,p1/z,[x8]'
>{standard input}:4006: Error: selected processor does not support
>`whilelo p4.s,wzr,w7'
>
>This is because inline assembly code explicitly resets cpu model to
>not have SVE support. Thus SVE instructions generated by compiler
>auto vectorization got rejected by assembler.
>
>Fixed the issue by replacing inline assembly with equivalent atomic
>built-ins. Compiler will generate LSE instructions for cpu that has
>the extension.
>
>Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
>Cc: jerinj@marvell.com
>Cc: stable@dpdk.org
>
>Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
>---
> drivers/common/octeontx2/otx2_io_arm64.h | 37 +++--------------------
>-
> 1 file changed, 4 insertions(+), 33 deletions(-)
>
>diff --git a/drivers/common/octeontx2/otx2_io_arm64.h
>b/drivers/common/octeontx2/otx2_io_arm64.h
>index b5c85d9a6..8843a79b5 100644
>--- a/drivers/common/octeontx2/otx2_io_arm64.h
>+++ b/drivers/common/octeontx2/otx2_io_arm64.h
>@@ -24,55 +24,26 @@
> static __rte_always_inline uint64_t
> otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
> {
>-	uint64_t result;
>-
> 	/* Atomic add with no ordering */
>-	asm volatile (
>-		".cpu  generic+lse\n"
>-		"ldadd %x[i], %x[r], [%[b]]"
>-		: [r] "=r" (result), "+m" (*ptr)
>-		: [i] "r" (incr), [b] "r" (ptr)
>-		: "memory");
>-	return result;
>+	return (uint64_t)__atomic_fetch_add(ptr, incr,
>__ATOMIC_RELAXED);
> }
>

Here LDADD acts as an interface to co-processors, i.e. the LDADD
instruction opcode plus a specific I/O address is recognized by a HW
interceptor and dispatched to the specific coprocessor.

Leaving it to the compiler to pick the correct instruction is a bad idea.
This breaks the arm64_armv8_linux_gcc build, as it doesn't have +lse
enabled, and __atomic_fetch_add will generate a different instruction with
SVE enabled.

Instead, can we add +sve to the first line to prevent the outer loop from
optimizing out the trap?

I tested with gcc 10.2 and the n2 config; the change below works fine (a
fuller sketch of the resulting function follows after the quoted patch).
-" .cpu          generic+lse\n"
+" .cpu		generic+lse+sve\n"

Regards,
Pavan.

> static __rte_always_inline uint64_t
> otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
> {
>-	uint64_t result;
>-
>-	/* Atomic add with ordering */
>-	asm volatile (
>-		".cpu  generic+lse\n"
>-		"ldadda %x[i], %x[r], [%[b]]"
>-		: [r] "=r" (result), "+m" (*ptr)
>-		: [i] "r" (incr), [b] "r" (ptr)
>-		: "memory");
>-	return result;
>+	return (uint64_t)__atomic_fetch_add(ptr, incr,
>__ATOMIC_ACQUIRE);
> }
>
> static __rte_always_inline uint64_t
> otx2_lmt_submit(rte_iova_t io_address)
> {
>-	uint64_t result;
>-
>-	asm volatile (
>-		".cpu  generic+lse\n"
>-		"ldeor xzr,%x[rf],[%[rs]]" :
>-		 [rf] "=r"(result): [rs] "r"(io_address));
>-	return result;
>+	return __atomic_fetch_xor((uint64_t *)io_address, 0,
>__ATOMIC_RELAXED);
> }
>
> static __rte_always_inline uint64_t
> otx2_lmt_submit_release(rte_iova_t io_address)
> {
>-	uint64_t result;
>-
>-	asm volatile (
>-		".cpu  generic+lse\n"
>-		"ldeorl xzr,%x[rf],[%[rs]]" :
>-		 [rf] "=r"(result) : [rs] "r"(io_address));
>-	return result;
>+	return __atomic_fetch_xor((uint64_t *)io_address, 0,
>__ATOMIC_RELEASE);
> }
>
> static __rte_always_inline void
>--
>2.25.1
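  
For reference, a sketch of how the first accessor would look with the
directive Pavan suggests; the exact spelling that lands in the next revision
is not settled in this thread, and the same one-line change would apply to
the other three functions:

static __rte_always_inline uint64_t
otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
{
	uint64_t result;

	/* Atomic add with no ordering. The explicit LDADD opcode is kept so
	 * the HW interceptor still recognizes it; +sve keeps the assembler
	 * from rejecting compiler-generated SVE code later in the file.
	 */
	asm volatile (
		".cpu  generic+lse+sve\n"
		"ldadd %x[i], %x[r], [%[b]]"
		: [r] "=r" (result), "+m" (*ptr)
		: [i] "r" (incr), [b] "r" (ptr)
		: "memory");
	return result;
}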
  
Ruifeng Wang Jan. 11, 2021, 9:51 a.m. UTC | #2
> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
> Sent: Friday, January 8, 2021 6:29 PM
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>; jerinj@marvell.com; Nithin
> Kumar Dabilpuram <ndabilpuram@marvell.com>
> Cc: dev@dpdk.org; vladimir.medvedkin@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; stable@dpdk.org
> Subject: RE: [EXT] [PATCH v2 4/5] common/octeontx2: fix build with sve
> enabled
> 
> Hi Ruifeng,
> 
> >Building with gcc 10.2 with SVE extension enabled got error:
> >
> >{standard input}: Assembler messages:
> >{standard input}:4002: Error: selected processor does not support `mov
> >z3.b,#0'
> >{standard input}:4003: Error: selected processor does not support
> >`whilelo p1.b,xzr,x7'
> >{standard input}:4005: Error: selected processor does not support `ld1b
> >z0.b,p1/z,[x8]'
> >{standard input}:4006: Error: selected processor does not support
> >`whilelo p4.s,wzr,w7'
> >
> >This is because inline assembly code explicitly resets cpu model to not
> >have SVE support. Thus SVE instructions generated by compiler auto
> >vectorization got rejected by assembler.
> >
> >Fixed the issue by replacing inline assembly with equivalent atomic
> >built-ins. Compiler will generate LSE instructions for cpu that has the
> >extension.
> >
> >Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
> >Cc: jerinj@marvell.com
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >---
> > drivers/common/octeontx2/otx2_io_arm64.h | 37 +++--------------------
> >-
> > 1 file changed, 4 insertions(+), 33 deletions(-)
> >
> >diff --git a/drivers/common/octeontx2/otx2_io_arm64.h
> >b/drivers/common/octeontx2/otx2_io_arm64.h
> >index b5c85d9a6..8843a79b5 100644
> >--- a/drivers/common/octeontx2/otx2_io_arm64.h
> >+++ b/drivers/common/octeontx2/otx2_io_arm64.h
> >@@ -24,55 +24,26 @@
> > static __rte_always_inline uint64_t
> > otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)  {
> >-	uint64_t result;
> >-
> > 	/* Atomic add with no ordering */
> >-	asm volatile (
> >-		".cpu  generic+lse\n"
> >-		"ldadd %x[i], %x[r], [%[b]]"
> >-		: [r] "=r" (result), "+m" (*ptr)
> >-		: [i] "r" (incr), [b] "r" (ptr)
> >-		: "memory");
> >-	return result;
> >+	return (uint64_t)__atomic_fetch_add(ptr, incr,
> >__ATOMIC_RELAXED);
> > }
> >
> 
> Here LDADD acts as a way to interface to co-processors i.e.
> LDADD instruction opcode + specific io address are recognized by HW
> interceptor and dispatched to the specific coprocessor.

OK. Now I understand the background.
> 
> Leaving it to the compiler to use the correct instruction is a bad idea.
> This breaks the arm64_armv8_linux_gcc build as it doesn't have the
> +lse enabled.
> __atomic_fetch_add will generate a different instruction with SVE enabled.
> 
> Instead can we add +sve to the first line to prevent outer loop from
> optimizing out the trap?

Since the inline assembly needs to be preserved, we have to tune the enabled extensions.
I will change it in the next version.

Thanks,
Ruifeng
> 
> I tested with 10.2 and n2 config below change works fine.
> -" .cpu          generic+lse\n"
> +" .cpu		generic+lse+sve\n"
> 
> Regards,
> Pavan.
> 
> > static __rte_always_inline uint64_t
> > otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)  {
> >-	uint64_t result;
> >-
> >-	/* Atomic add with ordering */
> >-	asm volatile (
> >-		".cpu  generic+lse\n"
> >-		"ldadda %x[i], %x[r], [%[b]]"
> >-		: [r] "=r" (result), "+m" (*ptr)
> >-		: [i] "r" (incr), [b] "r" (ptr)
> >-		: "memory");
> >-	return result;
> >+	return (uint64_t)__atomic_fetch_add(ptr, incr,
> >__ATOMIC_ACQUIRE);
> > }
> >
> > static __rte_always_inline uint64_t
> > otx2_lmt_submit(rte_iova_t io_address)  {
> >-	uint64_t result;
> >-
> >-	asm volatile (
> >-		".cpu  generic+lse\n"
> >-		"ldeor xzr,%x[rf],[%[rs]]" :
> >-		 [rf] "=r"(result): [rs] "r"(io_address));
> >-	return result;
> >+	return __atomic_fetch_xor((uint64_t *)io_address, 0,
> >__ATOMIC_RELAXED);
> > }
> >
> > static __rte_always_inline uint64_t
> > otx2_lmt_submit_release(rte_iova_t io_address)  {
> >-	uint64_t result;
> >-
> >-	asm volatile (
> >-		".cpu  generic+lse\n"
> >-		"ldeorl xzr,%x[rf],[%[rs]]" :
> >-		 [rf] "=r"(result) : [rs] "r"(io_address));
> >-	return result;
> >+	return __atomic_fetch_xor((uint64_t *)io_address, 0,
> >__ATOMIC_RELEASE);
> > }
> >
> > static __rte_always_inline void
> >--
> >2.25.1
  

Patch

diff --git a/drivers/common/octeontx2/otx2_io_arm64.h b/drivers/common/octeontx2/otx2_io_arm64.h
index b5c85d9a6..8843a79b5 100644
--- a/drivers/common/octeontx2/otx2_io_arm64.h
+++ b/drivers/common/octeontx2/otx2_io_arm64.h
@@ -24,55 +24,26 @@ 
 static __rte_always_inline uint64_t
 otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
 {
-	uint64_t result;
-
 	/* Atomic add with no ordering */
-	asm volatile (
-		".cpu  generic+lse\n"
-		"ldadd %x[i], %x[r], [%[b]]"
-		: [r] "=r" (result), "+m" (*ptr)
-		: [i] "r" (incr), [b] "r" (ptr)
-		: "memory");
-	return result;
+	return (uint64_t)__atomic_fetch_add(ptr, incr, __ATOMIC_RELAXED);
 }
 
 static __rte_always_inline uint64_t
 otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
 {
-	uint64_t result;
-
-	/* Atomic add with ordering */
-	asm volatile (
-		".cpu  generic+lse\n"
-		"ldadda %x[i], %x[r], [%[b]]"
-		: [r] "=r" (result), "+m" (*ptr)
-		: [i] "r" (incr), [b] "r" (ptr)
-		: "memory");
-	return result;
+	return (uint64_t)__atomic_fetch_add(ptr, incr, __ATOMIC_ACQUIRE);
 }
 
 static __rte_always_inline uint64_t
 otx2_lmt_submit(rte_iova_t io_address)
 {
-	uint64_t result;
-
-	asm volatile (
-		".cpu  generic+lse\n"
-		"ldeor xzr,%x[rf],[%[rs]]" :
-		 [rf] "=r"(result): [rs] "r"(io_address));
-	return result;
+	return __atomic_fetch_xor((uint64_t *)io_address, 0, __ATOMIC_RELAXED);
 }
 
 static __rte_always_inline uint64_t
 otx2_lmt_submit_release(rte_iova_t io_address)
 {
-	uint64_t result;
-
-	asm volatile (
-		".cpu  generic+lse\n"
-		"ldeorl xzr,%x[rf],[%[rs]]" :
-		 [rf] "=r"(result) : [rs] "r"(io_address));
-	return result;
+	return __atomic_fetch_xor((uint64_t *)io_address, 0, __ATOMIC_RELEASE);
 }
 
 static __rte_always_inline void
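
A closing note on the objection raised in the thread: what the replacement
built-ins compile to depends entirely on the -march flags, which is exactly
the concern with dropping the explicit opcode. A rough illustration (actual
instruction selection varies with compiler version and options):

/* illustration only, not from the thread:
 *
 *   gcc -O2 -march=armv8-a+lse  ->  a single ldadd, which the HW
 *                                   interceptor can recognize
 *   gcc -O2 -march=armv8-a      ->  an ldxr/stxr retry loop, or a call to
 *                                   an outlined libgcc helper when
 *                                   -moutline-atomics is in effect; neither
 *                                   is the ldadd opcode the interceptor
 *                                   matches on
 */
#include <stdint.h>

uint64_t
add_relaxed(int64_t *ptr, int64_t incr)
{
	return (uint64_t)__atomic_fetch_add(ptr, incr, __ATOMIC_RELAXED);
}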