[1/1] mempool: implement index-based per core cache

Message ID 20211224225923.806498-2-dharmik.thakkar@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: mempool: implement index-based per core cache

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/github-robot: build success github build: passed
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-abi-testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Dharmik Thakkar Dec. 24, 2021, 10:59 p.m. UTC
Current mempool per core cache implementation stores pointers to mbufs.
On 64-bit architectures, each pointer consumes 8B.
This patch replaces it with an index-based implementation,
wherein each buffer is addressed by (pool base address + index).
It reduces the amount of memory/cache required for the per core cache.

L3Fwd performance testing reveals minor improvements in the cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput.
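
For illustration, here is a minimal standalone sketch of the addressing scheme described above (the helper names are invented for this example and are not the patch's actual API): the cache stores a 32-bit offset from a per-pool base address instead of an 8B pointer, and the object pointer is recovered by adding the offset back to the base, much like the RTE_PTR_DIFF()/RTE_PTR_ADD() pairs used in the patch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative helpers only; not actual DPDK code. */
static inline uint32_t obj_to_index(void *base, void *obj)
{
	/* Only valid while obj lies within 4GB above base. */
	return (uint32_t)((uintptr_t)obj - (uintptr_t)base);
}

static inline void *index_to_obj(void *base, uint32_t idx)
{
	return (void *)((uintptr_t)base + idx);
}

int main(void)
{
	static char pool[1 << 20];       /* stand-in for the pool's memory */
	void *base = pool;
	void *obj = &pool[4096];         /* some object inside the pool */
	uint32_t idx = obj_to_index(base, obj);

	printf("index=%u round-trip-ok=%d\n", idx, index_to_obj(base, idx) == obj);
	return 0;
}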

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 119 insertions(+), 2 deletions(-)
  

Comments

Ananyev, Konstantin Jan. 11, 2022, 2:26 a.m. UTC | #1
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput

I feel really sceptical about that patch and the whole idea in general:
- From what I read above there is no real performance improvement observed.
  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
  see below for more details). 
- Space utilization difference looks negligible too.
- The change introduces a new build-time config option with a major limitation:
   all memzones in a pool have to be within the same 4GB boundary
   (a minimal sketch of that constraint follows below).
   To address it properly, extra changes will be required in the init(/populate) part of the code.
   All that will complicate the mempool code, will make it more error-prone
   and harder to maintain.
But as there is no real gain in return, there is no point in adding such extra complexity at all.
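
To make the 4GB limitation above concrete, here is a hedged sketch (illustrative only, not code from the patch) of the invariant a populate path would have to enforce before an object can be cached as a 32-bit index:

#include <stdbool.h>
#include <stdint.h>

/* An object is representable as a 32-bit index only if its offset from
 * the pool base fits in uint32_t, i.e. all memory backing the pool must
 * lie within the same 4GB window starting at pool_base. */
static bool obj_fits_index_cache(const void *pool_base, const void *obj)
{
	uintptr_t b = (uintptr_t)pool_base;
	uintptr_t o = (uintptr_t)obj;

	return o >= b && (o - b) <= UINT32_MAX;
}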

Konstantin

CSX 2.1 GHz
==========

echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci

params                                                           rate_persec (normal/index-based/diff %)
(with cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65

(with user-owned cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5054529533.00/4028052273.00/-20.31
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92

  
Dharmik Thakkar Jan. 13, 2022, 5:17 a.m. UTC | #2
Hi Konstantin,

Thank you for your comments and the test report!

> On Jan 10, 2022, at 8:26 PM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> 
> 
>> Current mempool per core cache implementation stores pointers to mbufs
>> On 64b architectures, each pointer consumes 8B
>> This patch replaces it with index-based implementation,
>> where in each buffer is addressed by (pool base address + index)
>> It reduces the amount of memory/cache required for per core cache
>> 
>> L3Fwd performance testing reveals minor improvements in the cache
>> performance (L1 and L2 misses reduced by 0.60%)
>> with no change in throughput
> 
> I feel really sceptical about that patch and the whole idea in general:
> - From what I read above there is no real performance improvement observed.
>  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>  see below for more details). 

Currently, the optimizations (loop unrolling and vectorization) are only implemented for ARM64.
Similar optimizations can be implemented for x86 platforms, which should close the performance gap
and, in my understanding, should give better performance for a bulk size of 32.
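
For example, a hypothetical AVX2 counterpart of the NEON put-path loop could look roughly like the sketch below (this code is not part of the patch or of DPDK; it only illustrates the kind of x86 vectorization being suggested). It converts four object pointers per iteration into 32-bit offsets from the pool base, and would need to be built with AVX2 enabled (e.g. -mavx2) and guarded the same way the NEON path is.

#include <stdint.h>
#include <immintrin.h>

/* Hypothetical AVX2 sketch (not DPDK code): convert 'n' object pointers
 * into 32-bit offsets from 'base', four pointers per iteration. */
static void ptrs_to_indices_avx2(uint32_t *cache_objs, void * const *obj_table,
				 unsigned int n, void *base)
{
	const __m256i v_base = _mm256_set1_epi64x((int64_t)(uintptr_t)base);
	/* lane selector that keeps the low 32 bits of each 64-bit lane */
	const __m256i v_pack = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
	unsigned int i;

	for (i = 0; i + 4 <= n; i += 4) {
		__m256i v_objs = _mm256_loadu_si256((const __m256i *)&obj_table[i]);
		__m256i v_off = _mm256_sub_epi64(v_objs, v_base);
		__m256i v_idx = _mm256_permutevar8x32_epi32(v_off, v_pack);

		_mm_storeu_si128((__m128i *)&cache_objs[i],
				 _mm256_castsi256_si128(v_idx));
	}
	for (; i < n; i++) /* scalar tail */
		cache_objs[i] = (uint32_t)((uintptr_t)obj_table[i] -
					   (uintptr_t)base);
}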

> - Space utilization difference looks neglectable too.

Sorry, I did not understand this point.

> - The change introduces a new build time config option with a major limitation:
>   All memzones in a pool have to be within the same 4GB boundary. 
>   To address it properly, extra changes will be required in init(/populate) part of the code.

I agree with the above-mentioned challenges and I am currently working on resolving these issues.

>   All that will complicate mempool code, will make it more error prone
>   and harder to maintain.
> But, as there is no real gain in return - no point to add such extra complexity at all.
> 
> Konstantin
  
Ananyev, Konstantin Jan. 13, 2022, 10:37 a.m. UTC | #3
Hi Dharmik,

> >
> >> Current mempool per core cache implementation stores pointers to mbufs
> >> On 64b architectures, each pointer consumes 8B
> >> This patch replaces it with index-based implementation,
> >> where in each buffer is addressed by (pool base address + index)
> >> It reduces the amount of memory/cache required for per core cache
> >>
> >> L3Fwd performance testing reveals minor improvements in the cache
> >> performance (L1 and L2 misses reduced by 0.60%)
> >> with no change in throughput
> >
> > I feel really sceptical about that patch and the whole idea in general:
> > - From what I read above there is no real performance improvement observed.
> >  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> >  see below for more details).
> 
> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> and in my understanding should give better performance for a bulk size of 32.

Might be, but I still don't see the reason for such an effort.
As you mentioned, there is no performance improvement in 'real' apps: l3fwd, etc.,
on ARM64 even with the vectorized version of the code.

> > - Space utilization difference looks neglectable too.
> 
> Sorry, I did not understand this point.

As I understand it, one of the expectations from that patch was:
reduce the memory/cache required, which should improve cache utilization
(fewer misses, etc.).
Though I think such improvements would be negligible and wouldn't
cause any real performance gain.

> > - The change introduces a new build time config option with a major limitation:
> >   All memzones in a pool have to be within the same 4GB boundary.
> >   To address it properly, extra changes will be required in init(/populate) part of the code.
> 
> I agree to the above mentioned challenges and I am currently working on resolving these issues.

I still think that to justify such changes some really noticeable performance
improvement needs to be demonstrated: a double-digit speedup for l3fwd/ipsec-secgw/...
Otherwise it is just not worth the hassle.
 
> >   All that will complicate mempool code, will make it more error prone
> >   and harder to maintain.
> > But, as there is no real gain in return - no point to add such extra complexity at all.
> >
> > Konstantin
> >
  
Dharmik Thakkar Jan. 19, 2022, 3:32 p.m. UTC | #4
Hi Konstantin,

> On Jan 13, 2022, at 4:37 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> Hi Dharmik,
> 
>>> 
>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>> On 64b architectures, each pointer consumes 8B
>>>> This patch replaces it with index-based implementation,
>>>> where in each buffer is addressed by (pool base address + index)
>>>> It reduces the amount of memory/cache required for per core cache
>>>> 
>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>> with no change in throughput
>>> 
>>> I feel really sceptical about that patch and the whole idea in general:
>>> - From what I read above there is no real performance improvement observed.
>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>> see below for more details).
>> 
>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>> and in my understanding should give better performance for a bulk size of 32.
> 
> Might be, but I still don't see the reason for such effort.
> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> on ARM64 even with vectorized version of the code.
> 

IMO, even without performance improvement, it is advantageous because the same performance is being achieved
with less memory and cache utilization using the patch.

>>> - Space utilization difference looks neglectable too.
>> 
>> Sorry, I did not understand this point.
> 
> As I understand one of the expectations from that patch was:
> reduce memory/cache required, which should improve cache utilization
> (less misses, etc.).
> Though I think such improvements would be neglectable and wouldn't
> cause any real performance gain.

The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
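
As a rough sketch of that claim (assuming RTE_MEMPOOL_CACHE_MAX_SIZE is 512, its default in the DPDK config, and the cache->objs array of RTE_MEMPOOL_CACHE_MAX_SIZE * 3 entries declared in rte_mempool.h): halving the per-entry size from 8B to 4B either halves the bytes the cache touches for a given fill level, or lets twice as many objects fit in the same per-lcore footprint.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_MAX_SIZE 512 /* assumed value of RTE_MEMPOOL_CACHE_MAX_SIZE */

int main(void)
{
	/* cache->objs holds CACHE_MAX_SIZE * 3 entries */
	size_t entries = CACHE_MAX_SIZE * 3;
	size_t ptr_bytes = entries * sizeof(void *);   /* 8B per entry on 64-bit */
	size_t idx_bytes = entries * sizeof(uint32_t); /* 4B per entry */

	printf("per-lcore cache storage: %zu B with pointers, %zu B with indices\n",
	       ptr_bytes, idx_bytes);
	return 0;
}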

> 
>>> - The change introduces a new build time config option with a major limitation:
>>>  All memzones in a pool have to be within the same 4GB boundary.
>>>  To address it properly, extra changes will be required in init(/populate) part of the code.
>> 
>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> 
> I still think that to justify such changes some really noticeable performance
> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...  
> Otherwise it just not worth the hassle. 
> 

Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
I’m seeing a double-digit improvement with mempool_perf_autotest, which should not be ignored.

>>>  All that will complicate mempool code, will make it more error prone
>>>  and harder to maintain.
>>> But, as there is no real gain in return - no point to add such extra complexity at all.
>>> 
>>> Konstantin
>>>
  
Ananyev, Konstantin Jan. 21, 2022, 11:25 a.m. UTC | #5
Hi Dharmik,
> >
> >>>
> >>>> Current mempool per core cache implementation stores pointers to mbufs
> >>>> On 64b architectures, each pointer consumes 8B
> >>>> This patch replaces it with index-based implementation,
> >>>> where in each buffer is addressed by (pool base address + index)
> >>>> It reduces the amount of memory/cache required for per core cache
> >>>>
> >>>> L3Fwd performance testing reveals minor improvements in the cache
> >>>> performance (L1 and L2 misses reduced by 0.60%)
> >>>> with no change in throughput
> >>>
> >>> I feel really sceptical about that patch and the whole idea in general:
> >>> - From what I read above there is no real performance improvement observed.
> >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> >>> see below for more details).
> >>
> >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> >> and in my understanding should give better performance for a bulk size of 32.
> >
> > Might be, but I still don't see the reason for such effort.
> > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > on ARM64 even with vectorized version of the code.
> >
> 
> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> with less memory and cache utilization using the patch.
> 
> >>> - Space utilization difference looks neglectable too.
> >>
> >> Sorry, I did not understand this point.
> >
> > As I understand one of the expectations from that patch was:
> > reduce memory/cache required, which should improve cache utilization
> > (less misses, etc.).
> > Though I think such improvements would be neglectable and wouldn't
> > cause any real performance gain.
> 
> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.

It saves you just 4 bytes per mbuf.
Even for a simple l2fwd-like workload we access ~100 bytes per mbuf.
Let's do a simplistic estimation of the number of cache-lines affected for l2fwd.
For a bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:

                                            number of cache-lines accessed
                                            cache with pointers / cache with indexes
mempool_get:                                    (32*8)/64=4    /   (32*4)/64=2
RX (read HW desc):                              (32*16)/64=8   /   (32*16)/64=8
RX (write mbuf fields, 1st cache line):         (32*64)/64=3   /   (32*64)/64=32
update mac addrs:                               (32*64)/64=32  /   (32*64)/64=32
TX (write HW desc):                             (32*16)/64=8   /   (32*16)/64=8
free mbufs (read 2nd mbuf cache line):          (32*64)/64=32  /   (32*64)/64=32
mempool_put:                                    (32*8)/64=4    /   (32*4)/64=2
total:                                          120                116

So, if my calculations are correct, the max estimated gain in cache utilization would be:
(120-116)*100/120 = 3.33%
Note that these numbers are for an over-simplistic usage scenario.
In more realistic ones, when we have to touch more cache-lines per packet,
that difference would be even less noticeable.
So I really doubt we will see any noticeable improvement in terms of cache utilization
with that patch.
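
For reference, the estimate above can be restated as a small calculation (using the per-row values that produce the 120/116 totals); this is just the same back-of-the-envelope arithmetic in code, not a measurement:

#include <stdio.h>

int main(void)
{
	const int burst = 32, line = 64, desc = 16;
	int mbuf_line = (burst * 64) / line;   /* one mbuf cache line per packet: 32 */
	int hw_desc   = (burst * desc) / line; /* 8 */
	int cache_ptr = (burst * 8) / line;    /* mempool cache, 8B pointers: 4 */
	int cache_idx = (burst * 4) / line;    /* mempool cache, 4B indices: 2 */

	/* get + RX desc + 3 mbuf-line accesses + TX desc + put */
	int with_ptrs = cache_ptr + hw_desc + 3 * mbuf_line + hw_desc + cache_ptr;
	int with_idx  = cache_idx + hw_desc + 3 * mbuf_line + hw_desc + cache_idx;

	printf("%d vs %d cache lines -> %.2f%% fewer\n", with_ptrs, with_idx,
	       (with_ptrs - with_idx) * 100.0 / with_ptrs);
	return 0;
}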

> >
> >>> - The change introduces a new build time config option with a major limitation:
> >>>  All memzones in a pool have to be within the same 4GB boundary.
> >>>  To address it properly, extra changes will be required in init(/populate) part of the code.
> >>
> >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> >
> > I still think that to justify such changes some really noticeable performance
> > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > Otherwise it just not worth the hassle.
> >
> 
> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.

As I said above, I don’t think we'll see any real advantage here.
But feel free to pick up a different app and prove me wrong.
After all, we have plenty of sample apps that do put enough
pressure on the cache: l3fwd-acl, ipsec-secgw.
Or you can even apply these patches from Sean:
https://patches.dpdk.org/project/dpdk/list/?series=20999
to run l3fwd with configurable routes.
That should help you to make it cache-bound.

> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.

And for others we are seeing double-digit degradation.
So far the whole idea doesn't look promising at all, at least to me.
Konstantin
  
Ananyev, Konstantin Jan. 21, 2022, 11:31 a.m. UTC | #6
> 
> 
> Hi Dharmik,
> > >
> > >>>
> > >>>> Current mempool per core cache implementation stores pointers to mbufs
> > >>>> On 64b architectures, each pointer consumes 8B
> > >>>> This patch replaces it with index-based implementation,
> > >>>> where in each buffer is addressed by (pool base address + index)
> > >>>> It reduces the amount of memory/cache required for per core cache
> > >>>>
> > >>>> L3Fwd performance testing reveals minor improvements in the cache
> > >>>> performance (L1 and L2 misses reduced by 0.60%)
> > >>>> with no change in throughput
> > >>>
> > >>> I feel really sceptical about that patch and the whole idea in general:
> > >>> - From what I read above there is no real performance improvement observed.
> > >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> > >>> see below for more details).
> > >>
> > >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> > >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> > >> and in my understanding should give better performance for a bulk size of 32.
> > >
> > > Might be, but I still don't see the reason for such effort.
> > > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > > on ARM64 even with vectorized version of the code.
> > >
> >
> > IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> > with less memory and cache utilization using the patch.
> >
> > >>> - Space utilization difference looks neglectable too.
> > >>
> > >> Sorry, I did not understand this point.
> > >
> > > As I understand one of the expectations from that patch was:
> > > reduce memory/cache required, which should improve cache utilization
> > > (less misses, etc.).
> > > Though I think such improvements would be neglectable and wouldn't
> > > cause any real performance gain.
> >
> > The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> > Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
> 
> It saves you 4 just bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of  number of affected cache-lines l for l2fwd.
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
> 
>                                                                        number of cache-lines accessed
>                                                                    cache with pointers / cache with indexes
> mempool_get:                                            (32*8)/64=4          /  (32*4)/64=2
> RX (read HW desc):                                    (32*16)/64=8       /   (32*16)/64=8
> RX (write mbuf fields, 1st cache line):    (32*64)/64=3       /   (32*64)/64=32

Should be:
RX (write mbuf fields, 1st cache line):    (32*64)/64=32       /   (32*64)/64=32
of course

> update mac addrs:                                     (32*64)/64=32     /   (32*64)/64=32
> TX (write HW desc):                                   (32*16)/64=8       /   (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32    /   (32*64)/64=32
> mempool_put:                                            (32*8)/64=4        /    (32*4)/64=2
> total:                                                             120                             116
> 
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33%
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
> 
> > >
> > >>> - The change introduces a new build time config option with a major limitation:
> > >>>  All memzones in a pool have to be within the same 4GB boundary.
> > >>>  To address it properly, extra changes will be required in init(/populate) part of the code.
> > >>
> > >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> > >
> > > I still think that to justify such changes some really noticeable performance
> > > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > > Otherwise it just not worth the hassle.
> > >
> >
> > Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> > That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
> 
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
> 
> > I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
> 
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin
  
Dharmik Thakkar March 24, 2022, 7:51 p.m. UTC | #7
Hi,

Thank you for the comments!

Based on the suggestions, I tested the patch for single-core L3Fwd performance with an increased number of routes/flows (maximum 8K) to increase the cache footprint.
However, I don’t see much improvement with the patch.

> On Jan 21, 2022, at 5:25 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> 
> Hi Dharmik,
>>> 
>>>>> 
>>>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>>>> On 64b architectures, each pointer consumes 8B
>>>>>> This patch replaces it with index-based implementation,
>>>>>> where in each buffer is addressed by (pool base address + index)
>>>>>> It reduces the amount of memory/cache required for per core cache
>>>>>> 
>>>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>>>> with no change in throughput
>>>>> 
>>>>> I feel really sceptical about that patch and the whole idea in general:
>>>>> - From what I read above there is no real performance improvement observed.
>>>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>>>> see below for more details).
>>>> 
>>>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>>>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>>>> and in my understanding should give better performance for a bulk size of 32.
>>> 
>>> Might be, but I still don't see the reason for such effort.
>>> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
>>> on ARM64 even with vectorized version of the code.
>>> 
>> 
>> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
>> with less memory and cache utilization using the patch.
>> 
>>>>> - Space utilization difference looks neglectable too.
>>>> 
>>>> Sorry, I did not understand this point.
>>> 
>>> As I understand one of the expectations from that patch was:
>>> reduce memory/cache required, which should improve cache utilization
>>> (less misses, etc.).
>>> Though I think such improvements would be neglectable and wouldn't
>>> cause any real performance gain.
>> 
>> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
>> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
> 
> It saves you 4 just bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of  number of affected cache-lines l for l2fwd. 
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
> 
>                                                                       number of cache-lines accessed 
>                                                                   cache with pointers / cache with indexes 
> mempool_get:                                            (32*8)/64=4          /  (32*4)/64=2
> RX (read HW desc):                                    (32*16)/64=8       /   (32*16)/64=8
> RX (write mbuf fields, 1st cache line):    (32*64)/64=3       /   (32*64)/64=32
> update mac addrs:                                     (32*64)/64=32     /   (32*64)/64=32   
> TX (write HW desc):                                   (32*16)/64=8       /   (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32    /   (32*64)/64=32   
> mempool_put:                                            (32*8)/64=4        /    (32*4)/64=2
> total:                                                             120                             116
> 
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33% 
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
> 
>>> 
>>>>> - The change introduces a new build time config option with a major limitation:
>>>>> All memzones in a pool have to be within the same 4GB boundary.
>>>>> To address it properly, extra changes will be required in init(/populate) part of the code.
>>>> 
>>>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
>>> 
>>> I still think that to justify such changes some really noticeable performance
>>> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
>>> Otherwise it just not worth the hassle.
>>> 
>> 
>> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
>> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
> 
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
> 

Thank you, Konstantin! This patch was helpful.

>> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
> 
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin
>
  

Patch

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1e7a3c15273c..4fabd3b1920b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -50,6 +50,10 @@ 
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include <rte_vect.h>
+#endif
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -239,6 +243,9 @@  struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *pool_base_value; /**< Base value to calculate indices */
+#endif
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,19 @@  rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = mp->pool_base_value;
+	uint32_t *cache_objs = (uint32_t *) cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
 	cache->len = 0;
 }
 
@@ -1334,8 +1353,13 @@  static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
+#else
 	void **cache_objs;
-
+#endif
 	/* increment stat now, adding in mempool always success */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
@@ -1344,7 +1368,13 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
 		goto ring_enqueue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	cache_objs = (uint32_t *) cache->objs;
+	cache_objs = &cache_objs[cache->len];
+	base_value = mp->pool_base_value;
+#else
 	cache_objs = &cache->objs[cache->len];
+#endif
 
 	/*
 	 * The cache follows the following algorithm
@@ -1354,13 +1384,40 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
+	for (i = 0; i < (n & ~0x1); i += 2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
+#else
 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+				cache->len - cache->size);
+#else
 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
 				cache->len - cache->size);
+#endif
 		cache->len = cache->size;
 	}
 
@@ -1461,13 +1518,22 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 {
 	int ret;
 	uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t i;
+	uint32_t *cache_objs;
+#else
 	void **cache_objs;
-
+#endif
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *base_value = mp->pool_base_value;
+	cache_objs = (uint32_t *) cache->objs;
+#else
 	cache_objs = cache->objs;
+#endif
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
@@ -1475,8 +1541,14 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		uint32_t req = n + (cache->size - cache->len);
 
 		/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+		ret = rte_mempool_ops_dequeue_bulk(mp,
+			temp_objs, req);
+#else
 		ret = rte_mempool_ops_dequeue_bulk(mp,
 			&cache->objs[cache->len], req);
+#endif
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1487,12 +1559,50 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+								base_value);
+		}
+#endif
 		cache->len += req;
 	}
 
 	/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+						len -= 4, obj_table += 4) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+	}
+	switch (n & 0x3) {
+	case 3:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+								/* fallthrough */
+	case 2:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+								/* fallthrough */
+	case 1:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+	}
+#else
+	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
 		*obj_table = cache_objs[len];
+#endif
 
 	cache->len -= n;
 
diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d7619..3543cad9d4ce 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -127,6 +127,13 @@  rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
 		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		/* Store pool base value to calculate indices for index-based
+		 * lcore cache implementation
+		 */
+		if (i == 0)
+			mp->pool_base_value = obj;
+#endif
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 		off += mp->elt_size + mp->trailer_size;
 	}