[v3,25/27] mempool/octeontx2: add optimized dequeue operation for arm64

Message ID 20190617155537.36144-26-jerinj@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: OCTEON TX2 common and mempool driver

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Jerin Jacob Kollanukkaran June 17, 2019, 3:55 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

This patch adds an optimized arm64 instruction-based routine that
leverages the CPU pipeline characteristics of octeontx2. The theme is to
fill the pipeline with as many CASP operations as the HW can take, so
that the HW can run alloc() ops at full throttle.

Cc: Olivier Matz <olivier.matz@6wind.com>
Cc: Aaron Conole <aconole@redhat.com>

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Signed-off-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 drivers/mempool/octeontx2/otx2_mempool_ops.c | 291 +++++++++++++++++++
 1 file changed, 291 insertions(+)
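
For orientation: the new dequeue path splits a request for n objects into
the power-of-two burst sizes the CASP cases handle (32, 16, 8, 4, 2, 1).
A minimal sketch of that decomposition, mirroring the loop in
otx2_npa_deq_arm64() from the patch below (illustrative only, error
handling omitted):

	/* E.g. n = 45 issues CASP bursts of 32 + 8 + 4 + 1 objects. */
	while (n) {
		/* Largest supported burst is 32; otherwise round n down
		 * to the previous power of two.
		 */
		parts = n > 31 ? 32 : rte_align32prevpow2(n);
		/* ... issue one CASP burst allocating 'parts' objects ... */
		n -= parts;
		obj_table += parts;
	}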
  

Comments

Aaron Conole June 17, 2019, 9:25 p.m. UTC | #1
<jerinj@marvell.com> writes:

> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> This patch adds an optimized arm64 instruction-based routine that
> leverages the CPU pipeline characteristics of octeontx2. The theme is to
> fill the pipeline with as many CASP operations as the HW can take, so
> that the HW can run alloc() ops at full throttle.
>
> Cc: Olivier Matz <olivier.matz@6wind.com>
> Cc: Aaron Conole <aconole@redhat.com>
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> ---
>  drivers/mempool/octeontx2/otx2_mempool_ops.c | 291 +++++++++++++++++++
>  1 file changed, 291 insertions(+)
>
> diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> index c59bd73c0..e6737abda 100644
> --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> @@ -37,6 +37,293 @@ npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
>  	return -ENOENT;
>  }
>  
> +#if defined(RTE_ARCH_ARM64)
> +static __rte_noinline int
> +npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
> +		void **obj_table, unsigned int n)
> +{
> +	uint8_t i;
> +
> +	for (i = 0; i < n; i++) {
> +		if (obj_table[i] != NULL)
> +			continue;
> +		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
> +			return -ENOENT;
> +	}
> +
> +	return 0;
> +}
> +
> +static  __attribute__((optimize("-O3"))) __rte_noinline int __hot

Sorry if I missed this before.

Is there a good reason to hard-code this optimization, rather than let
the build system provide it?

Pavan Nikhilesh Bhagavatula June 18, 2019, 7:39 a.m. UTC | #2
Hi Aaron,

>> +static  __attribute__((optimize("-O3"))) __rte_noinline int __hot
>
> Sorry if I missed this before.
>
> Is there a good reason to hard-code this optimization, rather than let
> the build system provide it?

Some versions of the compiler don't support __int128_t operands for CASP
inline asm. That is, if the optimization level is reduced to -O0, the
CASP register-pairing restrictions aren't honoured and the compiler can
emit register pairs that violate them, for example:

/tmp/ccSPMGzq.s:1648: Error: reg pair must start from even reg at operand 1 - `casp x21,x22,x0,x1,[x19]'
/tmp/ccSPMGzq.s:1706: Error: reg pair must start from even reg at operand 1 - `casp x13,x14,x0,x1,[x11]'
/tmp/ccSPMGzq.s:1745: Error: reg pair must start from even reg at operand 1 - `casp x9,x10,x0,x1,[x7]'
/tmp/ccSPMGzq.s:1775: Error: reg pair must start from even reg at operand 1 - `casp x7,x8,x0,x1,[x5]'

Forcing -O3 with __rte_noinline in place fixes it, as the register pairs
then start on even registers.
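
A minimal standalone sketch of the pattern in question, distilled from
the two-object (case 2) path of the patch — illustrative only, and the
helper name is made up:

	/*
	 * CASP requires both the result pair (t) and the compare/swap data
	 * pair (wdata) to start on an even-numbered X register.  GCC maps a
	 * __uint128_t "r" operand onto an X-register pair, but at -O0 it
	 * may pick an odd starting register, which the assembler rejects.
	 */
	static inline __uint128_t
	casp_alloc_once(const __uint128_t wdata, __int128 * const loc)
	{
		__uint128_t t;

		asm volatile (
		".cpu  generic+lse\n"
		"casp %[t], %H[t], %[wdata], %H[wdata], [%[loc]]\n"
		: "+Q" (*loc), [t] "=&r" (t)
		: [wdata] "r" (wdata), [loc] "r" (loc)
		: "memory"
		);
		return t;
	}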

Regards,
Pavan.

Aaron Conole June 21, 2019, 7:26 p.m. UTC | #3
Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com> writes:

> Hi Aaron,
>
>>> +static  __attribute__((optimize("-O3"))) __rte_noinline int __hot
>>
>> Sorry if I missed this before.
>>
>> Is there a good reason to hard-code this optimization, rather than let
>> the build system provide it?
>
> Some versions of the compiler don't support __int128_t operands for
> CASP inline asm. That is, if the optimization level is reduced to -O0,
> the CASP register-pairing restrictions aren't honoured and the compiler
> can emit register pairs that violate them, for example:
>
> /tmp/ccSPMGzq.s:1648: Error: reg pair must start from even reg at
> operand 1 - `casp x21,x22,x0,x1,[x19]'
> /tmp/ccSPMGzq.s:1706: Error: reg pair must start from even reg at
> operand 1 - `casp x13,x14,x0,x1,[x11]'
> /tmp/ccSPMGzq.s:1745: Error: reg pair must start from even reg at
> operand 1 - `casp x9,x10,x0,x1,[x7]'
> /tmp/ccSPMGzq.s:1775: Error: reg pair must start from even reg at
> operand 1 - `casp x7,x8,x0,x1,[x5]'
>
> Forcing -O3 with __rte_noinline in place fixes it, as the register
> pairs then start on even registers.

It makes sense to document this - it isn't apparent that it is needed.
It would be good to put a comment just before it that explains this,
preferably naming the compilers that misbehave.  This would help in the
future to determine when it would be safe to drop the flag.
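
A sketch of what such a comment could look like — the wording and the
exact affected compiler versions are placeholders, not verified:

	/*
	 * Some GCC versions mis-allocate the __int128_t register pairs
	 * used by the CASP inline asm below when built at lower
	 * optimization levels (e.g. -O0), triggering "reg pair must start
	 * from even reg" assembler errors.  Force -O3, which in practice
	 * keeps the pairs on even registers.  TODO: record the affected
	 * compiler versions so the flag can be dropped once they no
	 * longer need to be supported.
	 */
	static __attribute__((optimize("-O3"))) __rte_noinline int __hot
	npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
				  unsigned int n, void **obj_table)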


Patch

diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index c59bd73c0..e6737abda 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -37,6 +37,293 @@  npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
 	return -ENOENT;
 }
 
+#if defined(RTE_ARCH_ARM64)
+static __rte_noinline int
+npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
+		void **obj_table, unsigned int n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		if (obj_table[i] != NULL)
+			continue;
+		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
+			return -ENOENT;
+	}
+
+	return 0;
+}
+
+static  __attribute__((optimize("-O3"))) __rte_noinline int __hot
+npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
+			  unsigned int n, void **obj_table)
+{
+	const __uint128_t wdata128 = ((__uint128_t)wdata << 64) | wdata;
+	uint64x2_t failed = vdupq_n_u64(~0);
+
+	switch (n) {
+	case 32:
+	{
+		__uint128_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+		__uint128_t t10, t11;
+
+		asm volatile (
+		".cpu  generic+lse\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t8], %H[t8], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t9], %H[t9], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t10], %H[t10], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t11], %H[t11], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d16, %[t0]\n"
+		"fmov v16.D[1], %H[t0]\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d17, %[t1]\n"
+		"fmov v17.D[1], %H[t1]\n"
+		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d18, %[t2]\n"
+		"fmov v18.D[1], %H[t2]\n"
+		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d19, %[t3]\n"
+		"fmov v19.D[1], %H[t3]\n"
+		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, %[t4]\n"
+		"fmov v20.D[1], %H[t4]\n"
+		"fmov d21, %[t5]\n"
+		"fmov v21.D[1], %H[t5]\n"
+		"fmov d22, %[t6]\n"
+		"fmov v22.D[1], %H[t6]\n"
+		"fmov d23, %[t7]\n"
+		"fmov v23.D[1], %H[t7]\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		"fmov d16, %[t8]\n"
+		"fmov v16.D[1], %H[t8]\n"
+		"fmov d17, %[t9]\n"
+		"fmov v17.D[1], %H[t9]\n"
+		"fmov d18, %[t10]\n"
+		"fmov v18.D[1], %H[t10]\n"
+		"fmov d19, %[t11]\n"
+		"fmov v19.D[1], %H[t11]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, %[t0]\n"
+		"fmov v20.D[1], %H[t0]\n"
+		"fmov d21, %[t1]\n"
+		"fmov v21.D[1], %H[t1]\n"
+		"fmov d22, %[t2]\n"
+		"fmov v22.D[1], %H[t2]\n"
+		"fmov d23, %[t3]\n"
+		"fmov v23.D[1], %H[t3]\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed),
+		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
+		[t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
+		[t6] "=&r" (t6), [t7] "=&r" (t7), [t8] "=&r" (t8),
+		[t9] "=&r" (t9), [t10] "=&r" (t10), [t11] "=&r" (t11)
+		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
+		[loc] "r" (addr)
+		: "memory", "v16", "v17", "v18",
+		"v19", "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 16:
+	{
+		__uint128_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+		asm volatile (
+		".cpu  generic+lse\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d16, %[t0]\n"
+		"fmov v16.D[1], %H[t0]\n"
+		"fmov d17, %[t1]\n"
+		"fmov v17.D[1], %H[t1]\n"
+		"fmov d18, %[t2]\n"
+		"fmov v18.D[1], %H[t2]\n"
+		"fmov d19, %[t3]\n"
+		"fmov v19.D[1], %H[t3]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, %[t4]\n"
+		"fmov v20.D[1], %H[t4]\n"
+		"fmov d21, %[t5]\n"
+		"fmov v21.D[1], %H[t5]\n"
+		"fmov d22, %[t6]\n"
+		"fmov v22.D[1], %H[t6]\n"
+		"fmov d23, %[t7]\n"
+		"fmov v23.D[1], %H[t7]\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed),
+		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
+		[t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
+		[t6] "=&r" (t6), [t7] "=&r" (t7)
+		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
+		[loc] "r" (addr)
+		: "memory", "v16", "v17", "v18", "v19",
+		  "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 8:
+	{
+		__uint128_t t0, t1, t2, t3;
+
+		asm volatile (
+		".cpu  generic+lse\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d16, %[t0]\n"
+		"fmov v16.D[1], %H[t0]\n"
+		"fmov d17, %[t1]\n"
+		"fmov v17.D[1], %H[t1]\n"
+		"fmov d18, %[t2]\n"
+		"fmov v18.D[1], %H[t2]\n"
+		"fmov d19, %[t3]\n"
+		"fmov v19.D[1], %H[t3]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed),
+		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
+		[t3] "=&r" (t3)
+		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
+		[loc] "r" (addr)
+		: "memory", "v16", "v17", "v18", "v19"
+		);
+		break;
+	}
+	case 4:
+	{
+		__uint128_t t0, t1;
+
+		asm volatile (
+		".cpu  generic+lse\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d16, %[t0]\n"
+		"fmov v16.D[1], %H[t0]\n"
+		"fmov d17, %[t1]\n"
+		"fmov v17.D[1], %H[t1]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
+		: "+Q" (*addr), [failed] "=&w" (failed),
+		[t0] "=&r" (t0), [t1] "=&r" (t1)
+		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
+		[loc] "r" (addr)
+		: "memory", "v16", "v17"
+		);
+		break;
+	}
+	case 2:
+	{
+		__uint128_t t0;
+
+		asm volatile (
+		".cpu  generic+lse\n"
+		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
+		"fmov d16, %[t0]\n"
+		"fmov v16.D[1], %H[t0]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"st1 { v16.2d}, [%[dst]], 16\n"
+		: "+Q" (*addr), [failed] "=&w" (failed),
+		[t0] "=&r" (t0)
+		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
+		[loc] "r" (addr)
+		: "memory", "v16"
+		);
+		break;
+	}
+	case 1:
+		return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0);
+	}
+
+	if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1))))
+		return npa_lf_aura_op_search_alloc(wdata, addr, (void **)
+			((char *)obj_table - (sizeof(uint64_t) * n)), n);
+
+	return 0;
+}
+
+static __rte_noinline void
+otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		if (obj_table[i] != NULL) {
+			otx2_npa_enq(mp, &obj_table[i], 1);
+			obj_table[i] = NULL;
+		}
+	}
+}
+
+static inline int __hot
+otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
+	void **obj_table_bak = obj_table;
+	const unsigned int nfree = n;
+	unsigned int parts;
+
+	int64_t * const addr = (int64_t * const)
+			(npa_lf_aura_handle_to_base(mp->pool_id) +
+				NPA_LF_AURA_OP_ALLOCX(0));
+	while (n) {
+		parts = n > 31 ? 32 : rte_align32prevpow2(n);
+		n -= parts;
+		if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr,
+				parts, obj_table))) {
+			otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n);
+			return -ENOENT;
+		}
+		obj_table += parts;
+	}
+
+	return 0;
+}
+#endif
+
 static inline int __hot
 otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
@@ -463,7 +750,11 @@  static struct rte_mempool_ops otx2_npa_ops = {
 	.get_count = otx2_npa_get_count,
 	.calc_mem_size = otx2_npa_calc_mem_size,
 	.populate = otx2_npa_populate,
+#if defined(RTE_ARCH_ARM64)
+	.dequeue = otx2_npa_deq_arm64,
+#else
 	.dequeue = otx2_npa_deq,
+#endif
 };
 
 MEMPOOL_REGISTER_OPS(otx2_npa_ops);