[RFC,1/2] eal: add pointer compression functions

Message ID 20230927150854.3670391-2-paul.szczepanek@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: add pointer compression API

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Paul Szczepanek Sept. 27, 2023, 3:08 p.m. UTC
  Add a new utility header for compressing pointers. Pointers are
compressed by taking advantage of their locality. Instead of
storing the full address, only an offset from a known base is stored.

The provided functions can store pointers in 32-bit offsets.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Paul Szczepanek <paul.szczepanek@arm.com>
Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 .mailmap                           |   1 +
 lib/eal/include/meson.build        |   1 +
 lib/eal/include/rte_ptr_compress.h | 158 +++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 lib/eal/include/rte_ptr_compress.h
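
In essence, each pointer is replaced by the 32-bit offset of its target from a
single base address, optionally right-shifted to exploit alignment. A minimal
scalar sketch of the idea (illustrative helper names, not the API added by
this patch):

#include <stdint.h>
#include <assert.h>

/* Compress: store a pointer as a 32-bit offset from a common base.
 * bit_shift exploits alignment: 8-byte aligned pointers can drop 3 bits. */
static inline uint32_t
ptr_to_offset(void *base, void *ptr, unsigned int bit_shift)
{
	uintptr_t diff = (uintptr_t)ptr - (uintptr_t)base;
	assert((diff >> bit_shift) <= UINT32_MAX);
	return (uint32_t)(diff >> bit_shift);
}

/* Decompress: rebuild the full pointer from the base and the widened offset. */
static inline void *
offset_to_ptr(void *base, uint32_t offset, unsigned int bit_shift)
{
	return (void *)((uintptr_t)base + ((uintptr_t)offset << bit_shift));
}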
  

Comments

Thomas Monjalon Oct. 9, 2023, 3:54 p.m. UTC | #1
27/09/2023 17:08, Paul Szczepanek:
> Add a new utility header for compressing pointers. Pointers are
> compressed by taking advantage of their locality. Instead of
> storing the full address, only an offset from a known base is stored.

You probably need to insert some explanations from the cover letter.

> The provided functions can store pointers in 32-bit offsets.
> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Paul Szczepanek <paul.szczepanek@arm.com>
> Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
[...]
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -35,6 +35,7 @@ headers += files(
>          'rte_pci_dev_feature_defs.h',
>          'rte_pci_dev_features.h',
>          'rte_per_lcore.h',
> +	'rte_ptr_compress.h',
>          'rte_pflock.h',
>          'rte_random.h',
>          'rte_reciprocal.h',

Did you try to sort alphabetically? failed :)

> +#ifndef _RTE_PTR_COMPRESS_H_
> +#define _RTE_PTR_COMPRESS_H_

No need for extra underscores.

> +
> +/**
> + * @file
> + * RTE pointer compression and decompression.

RTE has no meaning here, I think.

> + */
> +
> +#include <stdint.h>
> +#include <inttypes.h>
> +
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_debug.h>
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compress pointers into 32 bit offsets from base pointer.

I think it should be "32-bit".

> + *
> + * @note Offsets from the base pointer must fit within 32bits. Alignment allows
> + * us to drop bits from the offsets - this means that for pointers aligned by
> + * 8 bytes they must be within 32GB of the base pointer. Unaligned pointers
> + * must be within 4GB.

It is not clear what "alignment" means here.
> + *
> + * @param ptr_base
> + *   A pointer used to calculate offsets of pointers in src_table.
> + * @param src_table
> + *   A pointer to an array of pointers.
> + * @param dest_table
> + *   A pointer to an array of compressed pointers returned by this function.
> + * @param n
> + *   The number of objects to compress, must be strictly positive.
> + * @param bit_shift
> + *   Byte alignment of memory pointed to by the pointers allows for
> + *   bits to be dropped from the offset and hence widen the memory region that
> + *   can be covered. This controls how many bits are right shifted.
> + **/
> +static __rte_always_inline void
> +rte_ptr_compress_32(void *ptr_base, void **src_table,
> +		uint32_t *dest_table, unsigned int n, unsigned int bit_shift)
> +{
> +	unsigned int i = 0;
> +#if defined RTE_HAS_SVE_ACLE
> +	svuint64_t v_src_table;
> +	svuint64_t v_dest_table;
> +	svbool_t pg = svwhilelt_b64(i, n);
> +	do {
> +		v_src_table = svld1_u64(pg, (uint64_t *)src_table + i);
> +		v_dest_table = svsub_x(pg, v_src_table, (uint64_t)ptr_base);
> +		v_dest_table = svlsr_x(pg, v_dest_table, bit_shift);
> +		svst1w(pg, &dest_table[i], v_dest_table);
> +		i += svcntd();
> +		pg = svwhilelt_b64(i, n);
> +	} while (svptest_any(svptrue_b64(), pg));
> +#elif defined __ARM_NEON
> +	uint64_t ptr_diff;
> +	uint64x2_t v_src_table;
> +	uint64x2_t v_dest_table;
> +	/* right shift is done by left shifting by negative int */
> +	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
> +	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
> +	for (; i < (n & ~0x1); i += 2) {
> +		v_src_table = vld1q_u64((const uint64_t *)src_table + i);
> +		v_dest_table = vsubq_u64(v_src_table, v_ptr_base);
> +		v_dest_table = vshlq_u64(v_dest_table, v_shift);
> +		vst1_u32(dest_table + i, vqmovn_u64(v_dest_table));
> +	}
> +	/* process leftover single item in case of odd number of n */
> +	if (unlikely(n & 0x1)) {
> +		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> +		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
> +	}
> +#else
> +	uint64_t ptr_diff;
> +	for (; i < n; i++) {
> +		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> +		/* save extra bits that are redundant due to alignment */
> +		ptr_diff = ptr_diff >> bit_shift;
> +		/* make sure no truncation will happen when casting */
> +		RTE_ASSERT(ptr_diff <= UINT32_MAX);
> +		dest_table[i] = (uint32_t) ptr_diff;
> +	}
> +#endif
> +}

I see it is providing some per-CPU optimizations,
which is in favor of having it in DPDK.
Other than that, it looks very generic, so it is questionable whether it belongs in DPDK.
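
To put numbers on the alignment trade-off (a worked example consistent with the
@note in the header): with 8-byte aligned pointers the low 3 bits of every
offset are zero, so bit_shift = 3 widens the reachable range eightfold:

/* Range covered by a 32-bit offset, before and after dropping alignment bits. */
uint64_t range_unaligned = (uint64_t)UINT32_MAX + 1;        /* bit_shift = 0: 4GB  */
uint64_t range_aligned_8 = ((uint64_t)UINT32_MAX + 1) << 3; /* bit_shift = 3: 32GB */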
  
Honnappa Nagarahalli Oct. 11, 2023, 1:36 p.m. UTC | #2
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Monday, October 9, 2023 10:54 AM
> To: Paul Szczepanek <Paul.Szczepanek@arm.com>
> Cc: dev@dpdk.org; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> Kamalakshitha Aligeri <Kamalakshitha.Aligeri@arm.com>
> Subject: Re: [RFC 1/2] eal: add pointer compression functions
> 
> 27/09/2023 17:08, Paul Szczepanek:
> > Add a new utility header for compressing pointers. Pointers are
> > compressed by taking advantage of their locality. Instead of storing
> > the full address, only an offset from a known base is stored.
> 
> You probably need to insert some explanations from the cover letter.
> 
> > The provided functions can store pointers in 32-bit offsets.
> >
> > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Signed-off-by: Paul Szczepanek <paul.szczepanek@arm.com>
> > Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> [...]
> > --- a/lib/eal/include/meson.build
> > +++ b/lib/eal/include/meson.build
> > @@ -35,6 +35,7 @@ headers += files(
> >          'rte_pci_dev_feature_defs.h',
> >          'rte_pci_dev_features.h',
> >          'rte_per_lcore.h',
> > +	'rte_ptr_compress.h',
> >          'rte_pflock.h',
> >          'rte_random.h',
> >          'rte_reciprocal.h',
> 
> Did you try to sort alphabetically? failed :)
> 
> > +#ifndef _RTE_PTR_COMPRESS_H_
> > +#define _RTE_PTR_COMPRESS_H_
> 
> No need for extra underscores.
> 
> > +
> > +/**
> > + * @file
> > + * RTE pointer compression and decompression.
> 
> RTE has no meaning here, I think.
> 
> > + */
> > +
> > +#include <stdint.h>
> > +#include <inttypes.h>
> > +
> > +#include <rte_branch_prediction.h>
> > +#include <rte_common.h>
> > +#include <rte_debug.h>
> > +#include <rte_vect.h>
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +/**
> > + * Compress pointers into 32 bit offsets from base pointer.
> 
> I think it should be "32-bit".
> 
> > + *
> > + * @note Offsets from the base pointer must fit within 32bits. Alignment allows
> > + * us to drop bits from the offsets - this means that for pointers aligned by
> > + * 8 bytes they must be within 32GB of the base pointer. Unaligned pointers
> > + * must be within 4GB.
> 
> It is not clear what "alignment" means here.
> > + *
> > + * @param ptr_base
> > + *   A pointer used to calculate offsets of pointers in src_table.
> > + * @param src_table
> > + *   A pointer to an array of pointers.
> > + * @param dest_table
> > + *   A pointer to an array of compressed pointers returned by this function.
> > + * @param n
> > + *   The number of objects to compress, must be strictly positive.
> > + * @param bit_shift
> > + *   Byte alignment of memory pointed to by the pointers allows for
> > + *   bits to be dropped from the offset and hence widen the memory region that
> > + *   can be covered. This controls how many bits are right shifted.
> > + **/
> > +static __rte_always_inline void
> > +rte_ptr_compress_32(void *ptr_base, void **src_table,
> > +		uint32_t *dest_table, unsigned int n, unsigned int bit_shift)
> > +{
> > +	unsigned int i = 0;
> > +#if defined RTE_HAS_SVE_ACLE
> > +	svuint64_t v_src_table;
> > +	svuint64_t v_dest_table;
> > +	svbool_t pg = svwhilelt_b64(i, n);
> > +	do {
> > +		v_src_table = svld1_u64(pg, (uint64_t *)src_table + i);
> > +		v_dest_table = svsub_x(pg, v_src_table, (uint64_t)ptr_base);
> > +		v_dest_table = svlsr_x(pg, v_dest_table, bit_shift);
> > +		svst1w(pg, &dest_table[i], v_dest_table);
> > +		i += svcntd();
> > +		pg = svwhilelt_b64(i, n);
> > +	} while (svptest_any(svptrue_b64(), pg));
> > +#elif defined __ARM_NEON
> > +	uint64_t ptr_diff;
> > +	uint64x2_t v_src_table;
> > +	uint64x2_t v_dest_table;
> > +	/* right shift is done by left shifting by negative int */
> > +	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
> > +	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
> > +	for (; i < (n & ~0x1); i += 2) {
> > +		v_src_table = vld1q_u64((const uint64_t *)src_table + i);
> > +		v_dest_table = vsubq_u64(v_src_table, v_ptr_base);
> > +		v_dest_table = vshlq_u64(v_dest_table, v_shift);
> > +		vst1_u32(dest_table + i, vqmovn_u64(v_dest_table));
> > +	}
> > +	/* process leftover single item in case of odd number of n */
> > +	if (unlikely(n & 0x1)) {
> > +		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> > +		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
> > +	}
> > +#else
> > +	uint64_t ptr_diff;
> > +	for (; i < n; i++) {
> > +		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> > +		/* save extra bits that are redundant due to alignment */
> > +		ptr_diff = ptr_diff >> bit_shift;
> > +		/* make sure no truncation will happen when casting */
> > +		RTE_ASSERT(ptr_diff <= UINT32_MAX);
> > +		dest_table[i] = (uint32_t) ptr_diff;
> > +	}
> > +#endif
> > +}
> 
> I see it is providing some per-CPU optimizations, which is in favor of having it in
> DPDK.
> Other than that, it looks very generic, so it is questionable whether it belongs in DPDK.
We had done it for mbuf pointers. But then, we thought it could be generic.

Right now the API results in 32b indices. We could make it more generic by allowing for 16b indices. 8b indices do not make sense.
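
For reference, 16-bit versions were indeed added later (in v3 of this series);
a scalar 16b variant could follow the same pattern as the generic fallback
above. The sketch below is illustrative, with assumed names, not the submitted
code:

#include <stdint.h>
#include <assert.h>

/* Sketch of a 16b compress: with bit_shift = 3, offsets cover 512KB from base. */
static inline void
ptr_compress_16_sketch(void *ptr_base, void **src_table,
		uint16_t *dest_table, unsigned int n, unsigned int bit_shift)
{
	for (unsigned int i = 0; i < n; i++) {
		uintptr_t diff = (uintptr_t)src_table[i] - (uintptr_t)ptr_base;
		diff >>= bit_shift;
		assert(diff <= UINT16_MAX); /* offset must fit in 16 bits */
		dest_table[i] = (uint16_t)diff;
	}
}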
  
Paul Szczepanek Oct. 11, 2023, 4:43 p.m. UTC | #3
On 11/10/2023 14:36, Honnappa Nagarahalli wrote:
>> -----Original Message-----
>> From: Thomas Monjalon <thomas@monjalon.net>
>> Sent: Monday, October 9, 2023 10:54 AM
>> To: Paul Szczepanek <Paul.Szczepanek@arm.com>
>> Cc: dev@dpdk.org; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
>> Kamalakshitha Aligeri <Kamalakshitha.Aligeri@arm.com>
>> Subject: Re: [RFC 1/2] eal: add pointer compression functions
[...]
>> I see it is providing some per-CPU optimizations, which is in favor of having it in
>> DPDK.
>> Other than that, it looks very generic, so it is questionable whether it belongs in DPDK.
> We had done it for mbuf pointers. But then, we thought it could be generic.
>
> Right now the API results in 32b indices. We could make it more generic by allowing for 16b indices. 8b indices do not make sense.

To add to this, I think this being generic is a good thing.

I think it belongs in DPDK as it will make it easy for other
architectures to add their versions and maintain the abstraction.
  
Morten Brørup Nov. 1, 2023, 7:42 a.m. UTC | #4
> From: Paul Szczepanek [mailto:paul.szczepanek@arm.com]
> Sent: Tuesday, 31 October 2023 19.11

[...]

> Test is added that shows potential performance gain from compression. In
> this test an array of pointers is passed through a ring between two cores.
> It shows the gain which is dependent on the bulk operation size. In this
> synthetic test run on ampere altra a substantial (up to 25%) performance
> gain is seen if done in bulk size larger than 32. At 32 it breaks even and
> lower sizes create a small (less than 5%) slowdown due to overhead.
>
> In a more realistic mock application running the l3 forwarding dpdk
> example that works in pipeline mode this translated into a ~5% throughput
> increase on an ampere altra.

What was the bulk size in this test?

And were the pipeline stages running on the same lcore or individual lcores per pipeline stage?
  
Paul Szczepanek Nov. 1, 2023, 12:52 p.m. UTC | #5
On 01/11/2023 07:42, Morten Brørup wrote:
>> From: Paul Szczepanek [mailto:paul.szczepanek@arm.com]
>> Sent: Tuesday, 31 October 2023 19.11
> [...]
>
>> In a more realistic mock application running the l3 forwarding dpdk
>> example that works in pipeline mode this translated into a ~5% throughput
>> increase on an ampere altra.
> What was the bulk size in this test?
>
> And were the pipeline stages running on the same lcore or individual lcores per pipeline stage?
>

The pipeline mode was run on separate cores and used 128 as the bulk size.
  
Paul Szczepanek Feb. 22, 2024, 8:15 a.m. UTC | #6
For some reason your email is not visible to me, even though it's in the 
archive.

On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:

> From one side the code itself is very small and straightforward,
> from the other side - it is not clear to me what is the intended usage for it
> within DPDK and its appliances?
> Konstantin

The intended usage is explained in the cover email (see below) and demonstrated
in the test supplied in the following patch - when sending arrays of pointers
between cores as it happens in a forwarding example.

On 01/11/2023 18:12, Paul Szczepanek wrote:

> This patchset is proposing adding a new EAL header with utility functions
> that allow compression of arrays of pointers.
>
> When passing caches full of pointers between threads, memory containing
> the pointers is copied multiple times which is especially costly between
> cores. A compression method will allow us to shrink the memory size
> copied.
>
> The compression takes advantage of the fact that pointers are usually
> located in a limited memory region (like a mempool). We can compress them
> by converting them to offsets from a base memory address.
>
> Offsets can be stored in fewer bytes (dictated by the memory region size
> and alignment of the pointer). For example: an 8 byte aligned pointer
> which is part of a 32GB memory pool can be stored in 4 bytes. The API is
> very generic and does not assume mempool pointers, any pointer can be
> passed in.
>
> Compression is based on few and fast operations and especially with vector
> instructions leveraged creates minimal overhead.
>
> The API accepts and returns arrays because the overhead means it only is
> worth it when done in bulk.
>
> Test is added that shows potential performance gain from compression. In
> this test an array of pointers is passed through a ring between two cores.
> It shows the gain which is dependent on the bulk operation size. In this
> synthetic test run on ampere altra a substantial (up to 25%) performance
> gain is seen if done in bulk size larger than 32. At 32 it breaks even and
> lower sizes create a small (less than 5%) slowdown due to overhead.
>
> In a more realistic mock application running the l3 forwarding dpdk
> example that works in pipeline mode on two cores this translated into a
> ~5% throughput increase on an ampere altra.
>
> v2:
> * addressed review comments (style, explanations and typos)
> * lowered bulk iterations closer to original numbers to keep runtime short
> * fixed pointer size warning on 32-bit arch
> v3:
> * added 16-bit versions of compression functions and tests
> * added documentation of these new utility functions in the EAL guide
> v4:
> * added unit test
> * fix bug in NEON implementation of 32-bit decompress
> v5:
> * disable NEON and SVE implementation on AARCH32 due to wrong pointer size
>
> Paul Szczepanek (4):
>    eal: add pointer compression functions
>    test: add pointer compress tests to ring perf test
>    docs: add pointer compression to the EAL guide
>    test: add unit test for ptr compression
>
>   .mailmap                                      |   1 +
>   app/test/meson.build                          |   1 +
>   app/test/test_eal_ptr_compress.c              | 108 ++++++
>   app/test/test_ring.h                          |  94 ++++-
>   app/test/test_ring_perf.c                     | 354 ++++++++++++------
>   .../prog_guide/env_abstraction_layer.rst      | 142 +++++++
>   lib/eal/include/meson.build                   |   1 +
>   lib/eal/include/rte_ptr_compress.h            | 266 +++++++++++++
>   8 files changed, 843 insertions(+), 124 deletions(-)
>   create mode 100644 app/test/test_eal_ptr_compress.c
>   create mode 100644 lib/eal/include/rte_ptr_compress.h
>
> --
> 2.25.1
>
  
Konstantin Ananyev Feb. 22, 2024, 4:16 p.m. UTC | #7
> For some reason your email is not visible to me, even though it's in the
> archive.

No worries.

> 
> On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:
> 
> > From one side the code itself is very small and straightforward,
> > from the other side - it is not clear to me what is the intended usage for it
> > within DPDK and its appliances?
> > Konstantin
> 
> The intended usage is explained in the cover email (see below) and demonstrated
> in the test supplied in the following patch - when sending arrays of pointers
> between cores as it happens in a forwarding example.

Yes, I saw that. The thing is that the test is a 'synthetic' one.
My question was about how you expect people to use it in more realistic scenarios.
Let's say a user has a bunch of mbuf pointers, possibly from different mempools.
How can he use this API: how does he deduce the base pointer for all of them,
and what should he do if it can't be done?
 
> [...]
  
Morten Brørup March 1, 2024, 11:16 a.m. UTC | #8
> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Thursday, 22 February 2024 17.16
> 
> > For some reason your email is not visible to me, even though it's in the
> > archive.
> 
> No worries.
> 
> >
> > On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:
> >
> > > From one side the code itself is very small and straightforward,
> > > from the other side - it is not clear to me what is the intended usage for it
> > > within DPDK and its appliances?
> > > Konstantin
> >
> > The intended usage is explained in the cover email (see below) and demonstrated
> > in the test supplied in the following patch - when sending arrays of pointers
> > between cores as it happens in a forwarding example.
> 
> Yes, I saw that. The thing is that the test is a 'synthetic' one.
> My question was about how you expect people to use it in more realistic
> scenarios.
> Let's say a user has a bunch of mbuf pointers, possibly from different mempools.
> How can he use this API: how does he deduce the base pointer for all of them,
> and what should he do if it can't be done?

I share Konstantin's concerns with this feature.

If we want to compress mbuf pointers in applications with a few mbuf pools, e.g. an mbuf pool per CPU socket, the compression algorithm would be different.

I would like to add:
If we want to offer optimizations specifically for applications with a single mbuf pool, I think it should be considered in a system-wide context to determine if performance could be improved in more areas.
E.g. removing the pool field from the rte_mbuf structure might free up space to move hot fields from the second cache line to the first, so the second cache line rarely needs to be touched. (As an alternative to removing the pool field, it could be moved to the second cache line, only to be used if the global "single mbuf pool" is NULL.)

On the other hand, I agree that pointer compression can be useful for some applications, so we should accept it.

However, pointer compression has nothing to do with the underlying hardware or operating system, so it does not belong in the EAL (which is already too bloated). It should be a separate library.

> 
> > On 01/11/2023 18:12, Paul Szczepanek wrote:
> >
> > > This patchset is proposing adding a new EAL header with utility functions
> > > that allow compression of arrays of pointers.
> > >
> > > When passing caches full of pointers between threads, memory containing
> > > the pointers is copied multiple times which is especially costly between
> > > cores. A compression method will allow us to shrink the memory size
> > > copied.
> > >
> > > The compression takes advantage of the fact that pointers are usually
> > > located in a limited memory region (like a mempool). We can compress them
> > > by converting them to offsets from a base memory address.
> > >
> > > Offsets can be stored in fewer bytes (dictated by the memory region size
> > > and alignment of the pointer). For example: an 8 byte aligned pointer
> > > which is part of a 32GB memory pool can be stored in 4 bytes. The API is
> > > very generic and does not assume mempool pointers, any pointer can be
> > > passed in.
> > >
> > > Compression is based on few and fast operations and especially with vector
> > > instructions leveraged creates minimal overhead.
> > >
> > > The API accepts and returns arrays because the overhead means it only is
> > > worth it when done in bulk.
> > >
> > > Test is added that shows potential performance gain from compression. In
> > > this test an array of pointers is passed through a ring between two cores.
> > > It shows the gain which is dependent on the bulk operation size. In this
> > > synthetic test run on ampere altra a substantial (up to 25%) performance
> > > gain is seen if done in bulk size larger than 32. At 32 it breaks even and
> > > lower sizes create a small (less than 5%) slowdown due to overhead.
> > >
> > > In a more realistic mock application running the l3 forwarding dpdk
> > > example that works in pipeline mode on two cores this translated into a
> > > ~5% throughput increase on an ampere altra.

Which burst size was used to achieve this ~5% throughput increase?

> > > [...]
  
Patrick Robb March 1, 2024, 4:12 p.m. UTC | #9
The Community CI Testing Lab had an infra failure this morning, and some
patches including yours were affected by false failures. The issue is now
resolved and we are rerunning the tests in question for all patches
submitted today.

On Fri, Mar 1, 2024 at 6:16 AM Morten Brørup <mb@smartsharesystems.com> wrote:

> [...]
  
Honnappa Nagarahalli March 1, 2024, 7:57 p.m. UTC | #10
> On Mar 1, 2024, at 5:16 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
>> Sent: Thursday, 22 February 2024 17.16
>> 
>>> For some reason your email is not visible to me, even though it's in the
>>> archive.
>> 
>> No worries.
>> 
>>> 
>>> On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:
>>> 
>>>> From one side the code itself is very small and straightforward,
>>>> from the other side - it is not clear to me what is the intended usage for it
>>>> within DPDK and its appliances?
>>>> Konstantin
>>> 
>>> The intended usage is explained in the cover email (see below) and demonstrated
>>> in the test supplied in the following patch - when sending arrays of pointers
>>> between cores as it happens in a forwarding example.
>> 
>> Yes, I saw that. The thing is that the test is a 'synthetic' one.
>> My question was about how you expect people to use it in more realistic
>> scenarios.
>> Let's say a user has a bunch of mbuf pointers, possibly from different mempools.
>> How can he use this API: how does he deduce the base pointer for all of them,
>> and what should he do if it can't be done?
> 
> I share Konstantin's concerns with this feature.
> 
> If we want to compress mbuf pointers in applications with a few mbuf pools, e.g. an mbuf pool per CPU socket, the compression algorithm would be different.
This feature is targeted at applications using pipeline mode. We see many customers using pipeline mode. This feature helps in reducing the cost of transferring packets between cores by reducing the copies involved.
For an application with multiple pools, it depends on how the application is using the multiple pools. If there is a bunch of packets belonging to multiple mempools, compressing those mbufs may not be possible. But if those mbufs are grouped per mempool and are transferred on different queues, then it is possible. Hence the APIs are implemented very generically.

> 
> I would like to add:
> If we want to offer optimizations specifically for applications with a single mbuf pool, I think it should be considered in a system-wide context to determine if performance could be improved in more areas.
> E.g. removing the pool field from the rte_mbuf structure might free up space to move hot fields from the second cache line to the first, so the second cache line rarely needs to be touched. (As an alternative to removing the pool field, it could be moved to the second cache line, only to be used if the global "single mbuf pool" is NULL.)
Agree on this. The feedback I have received is on similar lines, many are using simple features. I also received feedback that 90% of the applications use less than 4GB of memory for mbuf and burst sizes are up to 256.

> 
> On the other hand, I agree that pointer compression can be useful for some applications, so we should accept it.
> 
> However, pointer compression has nothing to do with the underlying hardware or operating system, so it does not belong in the EAL (which is already too bloated). It should be a separate library.
Yes, this is generic (though there is SIMD code). We could move it out of EAL.

> 
>> 
>>> On 01/11/2023 18:12, Paul Szczepanek wrote:
>>> 
>>>> [...]
>>>>
>>>> In a more realistic mock application running the l3 forwarding dpdk
>>>> example that works in pipeline mode on two cores this translated into a
>>>> ~5% throughput increase on an ampere altra.
> 
> Which burst size was used to achieve this ~5% throughput increase?
This is the stock L3fwd application which is split into 2 stages: RX, L3fwd, TX. The default burst size 32 is used.

> 
>>>> [...]
  
Morten Brørup March 2, 2024, 10:33 a.m. UTC | #11
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Friday, 1 March 2024 20.57
> 
> > On Mar 1, 2024, at 5:16 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> >> Sent: Thursday, 22 February 2024 17.16
> >>
> >>> For some reason your email is not visible to me, even though it's in the
> >>> archive.
> >>
> >> No worries.
> >>
> >>>
> >>> On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:
> >>>
> >>>> From one side the code itself is very small and straightforward,
> >>>> from the other side - it is not clear to me what is the intended usage for it
> >>>> within DPDK and its appliances?
> >>>> Konstantin
> >>>
> >>> The intended usage is explained in the cover email (see below) and demonstrated
> >>> in the test supplied in the following patch - when sending arrays of pointers
> >>> between cores as it happens in a forwarding example.
> >>
> >> Yes, I saw that. The thing is that the test is a 'synthetic' one.
> >> My question was about how you expect people to use it in more realistic
> >> scenarios.
> >> Let's say a user has a bunch of mbuf pointers, possibly from different
> >> mempools.
> >> How can he use this API: how does he deduce the base pointer for all of
> >> them, and what should he do if it can't be done?
> >
> > I share Konstantin's concerns with this feature.
> >
> > If we want to compress mbuf pointers in applications with a few mbuf
> > pools, e.g. an mbuf pool per CPU socket, the compression algorithm would
> > be different.
> This feature is targeted at applications using pipeline mode. We see many
> customers using pipeline mode. This feature helps in reducing the cost
> of transferring packets between cores by reducing the copies involved.

OK. I agree this is a very common use case, worth optimizing for.

> For an application with multiple pools, it depends on how the
> application is using the multiple pools. If there is a bunch of
> packets belonging to multiple mempools, compressing those mbufs may not
> be possible. But if those mbufs are grouped per mempool and are
> transferred on different queues, then it is possible. Hence the APIs are
> implemented very generically.

OK.

<feature creep>
And for a possible future extension:
If there are very few mbuf pools, such as 2 or 4, it might be possible to develop similar functions to efficiently compress/decompress pointers in a shared queue. E.g. the highest bits could identify the pool, and the lowest bits could identify the pointer offset (with bit shift) in that pool. Or if the pools are less than 4 GB each, the lowest bits could identify the pool, and be masked away for getting the offset (no bit shift), taking advantage of lowest bits of the pointer address always being zero anyway.
I am mentioning this, so it can be taken into consideration when designing the pointer compression library and its API. I don't expect it to be implemented at this time. Also, it might not lead to any changes of the already proposed pointer compression API - just give it a few thoughts.
</feature creep>
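
A hypothetical sketch of the low-bits variant described above, assuming up to
four pools, each spanning less than 4 GB, with elements at least 4-byte aligned
so the low 2 bits of every offset are free:

#include <stdint.h>

/* Hypothetical: tag a 2-bit pool id into the low bits of a 32-bit offset. */
static inline uint32_t
compress_tagged(void * const pool_base[], unsigned int pool_id, void *ptr)
{
	uint32_t ofs = (uint32_t)((uintptr_t)ptr - (uintptr_t)pool_base[pool_id]);
	return ofs | pool_id; /* low 2 bits of ofs are zero due to alignment */
}

static inline void *
decompress_tagged(void * const pool_base[], uint32_t val)
{
	unsigned int pool_id = val & 0x3;
	return (void *)((uintptr_t)pool_base[pool_id] + (val & ~(uint32_t)0x3));
}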

+1 for the simplicity of the functions and the API in this patch.
E.g. the bit_shift is most likely a known constant at build time, so inlining allows the compiler to optimize for this. In many use cases, it might be 1, and thus optimized away.

> 
> >
> > I would like to add:
> > If we want to offer optimizations specifically for applications with a
> > single mbuf pool, I think it should be considered in a system-wide
> > context to determine if performance could be improved in more areas.
> > E.g. removing the pool field from the rte_mbuf structure might free up
> > space to move hot fields from the second cache line to the first, so the
> > second cache line rarely needs to be touched. (As an alternative to
> > removing the pool field, it could be moved to the second cache line,
> > only to be used if the global "single mbuf pool" is NULL.)
> Agree on this. The feedback I have received is on similar lines, many
> are using simple features. I also received feedback that 90% of the
> applications use less than 4GB of memory for mbuf and burst sizes are up
> to 256.

Interesting.
Keeping the most common use cases in mind is important for steering DPDK in the right direction as it evolves.

If a very large percentage of use cases use one mbuf pool of less than 4 GB, we should seriously consider the broader opportunity for optimizing by generally referencing mbufs by a uint32_t pointer offset (no bit shifting) instead of by pointers.
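
A hypothetical sketch of what such uint32_t mbuf references could look like,
assuming a single global pool whose virtual address range fits within 4 GB:

#include <stdint.h>

static void *g_mbuf_base; /* hypothetical single pool base, set once at init */

static inline uint32_t
mbuf_to_ref(void *mbuf) /* no bit shifting, per the <4GB assumption */
{
	return (uint32_t)((uintptr_t)mbuf - (uintptr_t)g_mbuf_base);
}

static inline void *
ref_to_mbuf(uint32_t ref)
{
	return (void *)((uintptr_t)g_mbuf_base + ref);
}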

> 
> >
> > On the other hand, I agree that pointer compression can be useful for
> > some applications, so we should accept it.
> >
> > However, pointer compression has nothing to do with the underlying
> > hardware or operating system, so it does not belong in the EAL (which is
> > already too bloated). It should be a separate library.
> Yes, this is generic (though there is SIMD code). We could move it out
> of EAL.

Thank you.

I think a misconception has been prevailing that arch-specific optimizations (such as SIMD code) require stuff to go into EAL, and this misconception is a main reason why EAL has become so bloated.
Moving features like pointer compression out of EAL, thereby showing alternative design patterns for code containing arch-specific optimizations, will help eliminate that misconception.

> 
> >
> >>
> >>> On 01/11/2023 18:12, Paul Szczepanek wrote:
> >>>
> >>>> [...]
> >>>>
> >>>> In a more realistic mock application running the l3 forwarding dpdk
> >>>> example that works in pipeline mode on two cores this translated into a
> >>>> ~5% throughput increase on an ampere altra.
> >
> > Which burst size was used to achieve this ~5% throughput increase?
> This is the stock L3fwd application which is split into 2 stages: RX,
> L3fwd, TX. The default burst size 32 is used.

Impressive.
It proves the point that synthetic tests often are too simple to show the benefits of optimizations for reducing cache misses.
  
Konstantin Ananyev March 4, 2024, 2:44 p.m. UTC | #12
> > On Mar 1, 2024, at 5:16 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> >
> >> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> >> Sent: Thursday, 22 February 2024 17.16
> >>
> >>> For some reason your email is not visible to me, even though it's in the
> >>> archive.
> >>
> >> No worries.
> >>
> >>>
> >>> On 02/11/2024 16:32, Konstantin Ananyev (konstantin.v.ananyev) wrote:
> >>>
> >>>> From one side the code itself is very small and straightforward,
> >>>> from the other side - it is not clear to me what is the intended usage for it
> >>>> within DPDK and its appliances?
> >>>> Konstantin
> >>>
> >>> The intended usage is explained in the cover email (see below) and demonstrated
> >>> in the test supplied in the following patch - when sending arrays of pointers
> >>> between cores as it happens in a forwarding example.
> >>
> >> Yes, I saw that. The thing is that the test is a 'synthetic' one.
> >> My question was about how you expect people to use it in more realistic
> >> scenarios.
> >> Let's say a user has a bunch of mbuf pointers, possibly from different mempools.
> >> How can he use this API: how does he deduce the base pointer for all of them,
> >> and what should he do if it can't be done?
> >
> > I share Konstantin's concerns with this feature.
> >
> > If we want to compress mbuf pointers in applications with a few mbuf pools,
> > e.g. an mbuf pool per CPU socket, the compression algorithm would be different.
> This feature is targeted at applications using pipeline mode. We see many customers
> using pipeline mode. This feature helps in reducing the cost of transferring packets
> between cores by reducing the copies involved.

I do understand the intention, and I am not arguing about the usefulness of the pipeline model.
My point is that you are introducing a new API, compress/decompress pointers,
but don't provide (or even describe) any proper way for the developer to use it in a safe and predictable manner,
which from my perspective makes it nearly useless and misleading.

> For an application with multiple pools, it depends on how the application is using the multiple pools. If there is a bunch of packets
> belonging to multiple mempools, compressing those mbufs may not be possible. But if those mbufs are grouped per mempool and
> are transferred on different queues, then it is possible. Hence the APIs are implemented very generically.

Ok, let's consider an even more simplistic scenario - all pointers belong to one mempool.
AFAIK, even one mempool can contain elements from different memzones,
and these memzones are not guaranteed to have consecutive VAs.
So even one mempool, with a total size <=4GB, can contain elements with distances between them of more than 4GB.
Now let's say at startup the user created a mempool; how can he determine programmatically
whether he can apply your compress API safely on it or not?
I presume that if you are serious about this API usage, then such an ability has to be provided.
Something like:

int compress_pointer_deduce_mempool_base(const struct rte_mempool *mp[],
	uint32_t nb_mp, uint32_t compress_size, uintptr_t *base_ptr);

Or probably an even more generic one:

struct mem_buf { uintptr_t base; size_t len; };
int compress_pointer_deduce_base(const struct mem_buf *mem_buf[],
	uint32_t nb_membuf, uint32_t compress_size, uintptr_t *base_ptr);

Even with these functions in place, the user has to be extra careful:
 - he can't add new memory chunks to these mempools (or he'll need to re-calculate the new base_ptr)
 - he needs to make sure that pointers from only these mempools will be used by compress/decompress.
But at least it provides some ability to use this feature in real apps.
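
For illustration, a minimal sketch of such a deduction helper (the flat
mem_buf array and the return convention here are assumptions, not proposed
code for the series):

#include <stdint.h>
#include <stddef.h>

struct mem_buf { uintptr_t base; size_t len; };

/* Returns 0 and sets *base_ptr if every chunk fits within the range
 * addressable by a compress_size-bit offset; returns -1 otherwise. */
static int
compress_pointer_deduce_base(const struct mem_buf *mb, uint32_t nb_membuf,
	uint32_t compress_size, uintptr_t *base_ptr)
{
	uintptr_t lo = UINTPTR_MAX, hi = 0;
	uint64_t span_max = (compress_size >= 64) ?
		UINT64_MAX : (((uint64_t)1 << compress_size) - 1);

	if (nb_membuf == 0)
		return -1;
	for (uint32_t i = 0; i < nb_membuf; i++) {
		if (mb[i].base < lo)
			lo = mb[i].base;
		if (mb[i].base + mb[i].len > hi)
			hi = mb[i].base + mb[i].len;
	}
	if ((uint64_t)(hi - lo) > span_max)
		return -1; /* chunks span more than the offset width allows */
	*base_ptr = lo;
	return 0;
}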

With such an API in place, it should be possible to make the auto-test more realistic:
- allocate mempool 
- deduce base_pointer
- then we can have a loop with producer/consumer to mimic realistic workload.
    As an example:
     producer(s):  mempool_alloc(); <fill mbuf with some values>; ring_enqueue();  
     consumer(s): ring_dequeue(); <read_and_check_mbuf_data>; free_mbuf();
- free mempool

Or probably you can go even further: take some existing pipeline sample app and make it use the compress/decompress API.
That will provide people with some ability to test it and measure its perf impact.
Again, it will provide an example of the amount of changes required to enable it.
My speculation here is that the majority of users will find the effort too big,
while the gain is way too limited and fragile.
But at least there would be some realistic reference point for it, and users can decide for themselves whether it is worth it or not.
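
A condensed sketch of one such pipeline hop, assuming the series'
rte_ptr_compress_32 plus a matching rte_ptr_decompress_32, and DPDK's rte_ring
element API; setup, sizing checks and error handling are omitted:

#include <rte_ring_elem.h>
#include <rte_ptr_compress.h> /* proposed header from this series */

#define BURST 32 /* default burst size, as in the L3fwd experiment above */

/* Compress a burst, move it through a ring as 4-byte elements, expand it. */
static void
pass_burst(struct rte_ring *r, void *pool_base, void **mbufs, void **out)
{
	uint32_t comp[BURST], in[BURST];

	/* producer core: 8-byte aligned objects, so bit_shift = 3 */
	rte_ptr_compress_32(pool_base, mbufs, comp, BURST, 3);
	rte_ring_enqueue_bulk_elem(r, comp, sizeof(uint32_t), BURST, NULL);

	/* consumer core */
	rte_ring_dequeue_bulk_elem(r, in, sizeof(uint32_t), BURST, NULL);
	rte_ptr_decompress_32(pool_base, in, out, BURST, 3); /* assumed name/signature */
}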

> >
> > I would like to add:
> > If we want to offer optimizations specifically for applications with a single mbuf pool,
> > I think it should be considered in a system-wide context to determine if performance
> > could be improved in more areas.
> > E.g. removing the pool field from the rte_mbuf structure might free up space to move
> > hot fields from the second cache line to the first, so the second cache line rarely
> > needs to be touched. (As an alternative to removing the pool field, it could be moved
> > to the second cache line, only to be used if the global "single mbuf pool" is NULL.)
> Agree on this. The feedback I have received is on similar lines, many are using
> simple features. I also received feedback that 90% of the applications use less
> than 4GB of memory for mbuf and burst sizes are up to 256.

Well, from my perspective the story is completely different:
the majority of real-world apps I am aware of do use multiple mempools,
and it is also not uncommon to have mempools with a size bigger than 4GB (8/16).
Again, there are requests to make mempools growable/shrinkable on demand.

> [...]
  
Paul Szczepanek March 6, 2024, 10:31 p.m. UTC | #13
On 02/03/2024 10:33, Morten Brørup wrote:
> I think the misconception that arch-specific optimizations (such as SIMD code) require stuff to go into EAL has been prevailing, and this misconception is a main reason why EAL has become so bloated.
> Moving features like pointer compression out of EAL, thereby showing alternative design patterns for code containing arch-specific optimizations, will help eliminate that misconception.

I have a patch ready that moves ptr compress into its own library,
but I must first land this patch:
https://patches.dpdk.org/project/dpdk/patch/20240306221709.166722-2-paul.szczepanek@arm.com/
which is required to support header-only libraries - otherwise errors stop
the build.
  
Honnappa Nagarahalli March 7, 2024, 2:13 a.m. UTC | #14
> On Mar 6, 2024, at 4:31 PM, Paul Szczepanek <Paul.Szczepanek@arm.com> wrote:
> 
> On 02/03/2024 10:33, Morten Brørup wrote:
>> I think the misconception that arch-specific optimizations (such as SIMD code) require stuff to go into EAL has been prevailing, and this misconception is a main reason why EAL has become so bloated.
>> Moving features like pointer compression out of EAL, thereby showing alternative design patterns for code containing arch-specific optimizations, will help eliminate that misconception.
> 
> I have a patch ready that moves ptr compress into its own library, but I must first land this patch:
> https://patches.dpdk.org/project/dpdk/patch/20240306221709.166722-2-paul.szczepanek@arm.com/
> which is required to support header-only libraries - otherwise errors stop the build.
You can add dependencies to your patch. You do not have to wait for the patch to be merged. I believe the CI takes care of the dependencies as well. Please check: https://doc.dpdk.org/guides/contributing/patches.html#patch-dependencies
  
Paul Szczepanek March 7, 2024, 8:39 p.m. UTC | #15
This patchset proposes adding a new library with utility functions
that allow compression of arrays of pointers.

When passing caches full of pointers between threads, memory containing
the pointers is copied multiple times, which is especially costly between
cores. A compression method allows us to shrink the amount of memory
copied.

The compression takes advantage of the fact that pointers are usually
located in a limited memory region (like a mempool). We can compress them
by converting them to offsets from a base memory address.

Offsets can be stored in fewer bytes (the width is dictated by the memory
region size and the alignment of the pointers). For example, an
8-byte-aligned pointer that is part of a 32GB memory pool can be stored
in 4 bytes. The API is very generic and does not assume mempool pointers;
any pointer can be passed in.
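
To make the arithmetic concrete, here is a minimal sketch of the idea in
plain C (compress_one, decompress_one and pool_base are hypothetical names
used for illustration only, not part of the proposed API):

#include <stdint.h>

/* Illustration: one pointer into a 32GB pool of 8-byte-aligned objects.
 * A 32GB pool spans 2^35 bytes, so a raw offset needs 35 bits; the
 * alignment guarantees the low 3 bits are zero, leaving 32 bits. */
static uint32_t compress_one(void *pool_base, void *obj)
{
	uint64_t offset = (uintptr_t)obj - (uintptr_t)pool_base; /* < 2^35 */
	return (uint32_t)(offset >> 3); /* drop the 3 known-zero bits */
}

static void *decompress_one(void *pool_base, uint32_t compressed)
{
	/* shift the offset back and add the base to recover the pointer */
	return (void *)((uintptr_t)pool_base + ((uint64_t)compressed << 3));
}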

Compression is based on a few fast operations and, especially when vector
instructions are leveraged, creates minimal overhead.

The API accepts and returns arrays because the overhead is only worth
paying when the operation is done in bulk.

A test is added that shows the potential performance gain from compression.
In this test an array of pointers is passed through a ring between two
cores. The gain depends on the bulk operation size. In this synthetic test,
run on an Ampere Altra, a substantial (up to 25%) performance gain is seen
with bulk sizes larger than 32. At 32 it breaks even, and lower sizes cause
a small (less than 5%) slowdown due to overhead.

In a more realistic mock application, running the l3fwd DPDK example in
pipeline mode on two cores, this translated into a ~5% throughput increase
on an Ampere Altra.

v2:
* addressed review comments (style, explanations and typos)
* lowered bulk iterations closer to original numbers to keep runtime short
* fixed pointer size warning on 32-bit arch
v3:
* added 16-bit versions of compression functions and tests
* added documentation of these new utility functions in the EAL guide
v4:
* added unit test
* fix bug in NEON implementation of 32-bit decompress
v5:
* disable NEON and SVE implementation on AARCH32 due to wrong pointer size
v6:
* added example usage to commit message of the initial commit
v7:
* rebase to remove clashing mailmap changes
v8:
* put ptr compress into its own library
* add depends-on tag
* remove copyright bumps
* typos

Paul Szczepanek (4):
  ptr_compress: add pointer compression library
  test: add pointer compress tests to ring perf test
  docs: add pointer compression guide
  test: add unit test for ptr compression

 app/test/meson.build                       |  21 +-
 app/test/test_ptr_compress.c               | 108 +++++++
 app/test/test_ring.h                       |  92 ++++++
 app/test/test_ring_perf.c                  | 352 ++++++++++++++-------
 doc/guides/prog_guide/ptr_compress_lib.rst | 144 +++++++++
 lib/meson.build                            |   1 +
 lib/ptr_compress/meson.build               |   4 +
 lib/ptr_compress/rte_ptr_compress.h        | 266 ++++++++++++++++
 lib/ptr_compress/version.map               |   3 +
 9 files changed, 859 insertions(+), 132 deletions(-)
 create mode 100644 app/test/test_ptr_compress.c
 create mode 100644 doc/guides/prog_guide/ptr_compress_lib.rst
 create mode 100644 lib/ptr_compress/meson.build
 create mode 100644 lib/ptr_compress/rte_ptr_compress.h
 create mode 100644 lib/ptr_compress/version.map

--
2.25.1
  
David Marchand March 8, 2024, 8:27 a.m. UTC | #16
Hello Paul,

On Thu, Mar 7, 2024 at 9:40 PM Paul Szczepanek <paul.szczepanek@arm.com> wrote:
> [v8 cover letter and diffstat snipped]

We mentioned during the weekly release meeting that it seemed too late
to merge this work in the 24.03 release.

Looking at v8, I have comments on this series:
- rather than putting a Depends-on: tag, take the lib: patch as part of
your series; there is no need for this patch without the ptr_compress
lib, and it will avoid any CI issues (ovsrobot does not support
Depends-on: patch- for example),
- lib/ptr_compress/version.map is unneeded now,
- lib/ptr_compress/, app/test/test_ptr_compress.c and
doc/guides/prog_guide/ptr_compress_lib.rst need a MAINTAINERS entry,
- prefer lowercase characters for mail addresses in commitlogs,
- the documentation is not referenced in doc/guides/prog_guide/index.rst,
- doxygen does not know of this new library, you must update
doc/api/doxy-api-index.md and doc/api/doxy-api.conf.in,
- an RN entry is missing,

There were also comments on the lib: patch.

At this point, it is better to take our time to finish putting this
work in good form and merge it in 24.07.

Thanks.
  
Honnappa Nagarahalli March 10, 2024, 7:34 p.m. UTC | #17
+ Wathsala


> On Mar 8, 2024, at 2:27 AM, David Marchand <david.marchand@redhat.com> wrote:
> 
> Hello Paul,
> 
> On Thu, Mar 7, 2024 at 9:40 PM Paul Szczepanek <paul.szczepanek@arm.com> wrote:
>> [v8 cover letter and diffstat snipped]
> 
> We mentioned during the weekly release meeting that it seemed too late
> to merge this work in the 24.03 release.
> 
> Looking at v8, I have comments on this series:
> - rather than putting a Depends-on: tag, take the lib: patch as part of
> your series; there is no need for this patch without the ptr_compress
> lib, and it will avoid any CI issues (ovsrobot does not support
> Depends-on: patch- for example),
Agree, this is a better solution

> - lib/ptr_compress/version.map is unneeded now,
> - lib/ptr_compress/, app/test/test_ptr_compress.c and
> doc/guides/prog_guide/ptr_compress_lib.rst need a MAINTAINERS entry,
> - prefer lowercase characters for mail addresses in commitlogs,
> - the documentation is not referenced in doc/guides/prog_guide/index.rst,
> - doxygen does not know of this new library, you must update
> doc/api/doxy-api-index.md and doc/api/doxy-api.conf.in,
> - an RN entry is missing,
Apologies for missing these.

> 
> There were also comments on the lib: patch.
Not sure which comments you are talking about. Your comments on V7 were addressed in V8.

> 
> At this point, it is better to take our time to finish putting this
> work in good form and merge it in 24.07.
Given that your comments do not affect the code and the changes are pretty straightforward, I request that you reconsider the decision.
In any case, we will get these changes pushed to the community on Monday.

> 
> Thanks.
> 
> -- 
> David Marchand
>
  
David Marchand March 11, 2024, 7:44 a.m. UTC | #18
On Sun, Mar 10, 2024 at 8:35 PM Honnappa Nagarahalli
<Honnappa.Nagarahalli@arm.com> wrote:
> > We mentioned during the weekly release meeting that it seemed too late
> > to merge this work in the 24.03 release.
> >
> > Looking at v8, I have comments on this series:
> > - rather than putting a Depends-on: tag, take the lib: patch as part of
> > your series; there is no need for this patch without the ptr_compress
> > lib, and it will avoid any CI issues (ovsrobot does not support
> > Depends-on: patch- for example),
> Agree, this is a better solution
>
> > - lib/ptr_compress/version.map is unneeded now,
> > - lib/ptr_compress/, app/test/test_ptr_compress.c and
> > doc/guides/prog_guide/ptr_compress_lib.rst need a MAINTAINERS entry,
> > - prefer lowercase characters for mail addresses in commitlogs,
> > - the documentation is not referenced in doc/guides/prog_guide/index.rst,
> > - doxygen does not know of this new library, you must update
> > doc/api/doxy-api-index.md and doc/api/doxy-api.conf.in,
> > - an RN entry is missing,
> Apologies for missing these.
>
> >
> > There were also comments on the lib: patch.
> Not sure which comments you are talking about. Your comments on V7 were addressed in V8.

http://inbox.dpdk.org/dev/ZemoDBHt6fMBUqne@bricha3-mobl1.ger.corp.intel.com/T/#m665026fb0c7ed832d3e80b68b16d7549124f6880
  

Patch

diff --git a/.mailmap b/.mailmap
index 864d33ee46..3f0c9d32f5 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1058,6 +1058,7 @@  Paul Greenwalt <paul.greenwalt@intel.com>
 Paulis Gributs <paulis.gributs@intel.com>
 Paul Luse <paul.e.luse@intel.com>
 Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
+Paul Szczepanek <paul.szczepanek@arm.com>
 Pavan Kumar Linga <pavan.kumar.linga@intel.com>
 Pavan Nikhilesh <pbhagavatula@marvell.com> <pbhagavatula@caviumnetworks.com>
 Pavel Belous <pavel.belous@aquantia.com>
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index a0463efac7..60b056ef96 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -35,6 +35,7 @@  headers += files(
         'rte_pci_dev_feature_defs.h',
         'rte_pci_dev_features.h',
         'rte_per_lcore.h',
+	'rte_ptr_compress.h',
         'rte_pflock.h',
         'rte_random.h',
         'rte_reciprocal.h',
diff --git a/lib/eal/include/rte_ptr_compress.h b/lib/eal/include/rte_ptr_compress.h
new file mode 100644
index 0000000000..6498587c0b
--- /dev/null
+++ b/lib/eal/include/rte_ptr_compress.h
@@ -0,0 +1,158 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Arm Limited
+ */
+
+#ifndef _RTE_PTR_COMPRESS_H_
+#define _RTE_PTR_COMPRESS_H_
+
+/**
+ * @file
+ * RTE pointer compression and decompression.
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Compress pointers into 32 bit offsets from base pointer.
+ *
+ * @note Offsets from the base pointer must fit within 32bits. Alignment allows
+ * us to drop bits from the offsets - this means that for pointers aligned by
+ * 8 bytes they must be within 32GB of the base pointer. Unaligned pointers
+ * must be within 4GB.
+ *
+ * @param ptr_base
+ *   A pointer used to calculate offsets of pointers in src_table.
+ * @param src_table
+ *   A pointer to an array of pointers.
+ * @param dest_table
+ *   A pointer to an array of compressed pointers returned by this function.
+ * @param n
+ *   The number of objects to compress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are right shifted.
+ **/
+static __rte_always_inline void
+rte_ptr_compress_32(void *ptr_base, void **src_table,
+		uint32_t *dest_table, unsigned int n, unsigned int bit_shift)
+{
+	unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE
+	svuint64_t v_src_table;
+	svuint64_t v_dest_table;
+	svbool_t pg = svwhilelt_b64(i, n);
+	do {
+		v_src_table = svld1_u64(pg, (uint64_t *)src_table + i);
+		v_dest_table = svsub_x(pg, v_src_table, (uint64_t)ptr_base);
+		v_dest_table = svlsr_x(pg, v_dest_table, bit_shift);
+		svst1w(pg, &dest_table[i], v_dest_table);
+		i += svcntd();
+		pg = svwhilelt_b64(i, n);
+	} while (svptest_any(svptrue_b64(), pg));
+#elif defined __ARM_NEON
+	uint64_t ptr_diff;
+	uint64x2_t v_src_table;
+	uint64x2_t v_dest_table;
+	/* right shift is done by left shifting by negative int */
+	int64x2_t v_shift = vdupq_n_s64(-bit_shift);
+	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+	for (; i < (n & ~0x1); i += 2) {
+		v_src_table = vld1q_u64((const uint64_t *)src_table + i);
+		v_dest_table = vsubq_u64(v_src_table, v_ptr_base);
+		v_dest_table = vshlq_u64(v_dest_table, v_shift);
+		vst1_u32(dest_table + i, vqmovn_u64(v_dest_table));
+	}
+	/* process the leftover item if n is odd */
+	if (unlikely(n & 0x1)) {
+		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+		dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
+	}
+#else
+	uint64_t ptr_diff;
+	for (; i < n; i++) {
+		ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+		/* save extra bits that are redundant due to alignment */
+		ptr_diff = ptr_diff >> bit_shift;
+		/* make sure no truncation will happen when casting */
+		RTE_ASSERT(ptr_diff <= UINT32_MAX);
+		dest_table[i] = (uint32_t) ptr_diff;
+	}
+#endif
+}
+
+/**
+ * Decompress pointers from 32 bit offsets from base pointer.
+ *
+ * @param ptr_base
+ *   A pointer which was used to calculate offsets in src_table.
+ * @param src_table
+ *   A pointer to an array of compressed pointers.
+ * @param dest_table
+ *   A pointer to an array of decompressed pointers returned by this function.
+ * @param n
+ *   The number of objects to decompress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are left shifted when pointers
+ *   are recovered from the offsets.
+ **/
+static __rte_always_inline void
+rte_ptr_decompress_32(void *ptr_base, uint32_t *src_table,
+		void **dest_table, unsigned int n, unsigned int bit_shift)
+{
+	unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE
+	svuint64_t v_src_table;
+	svuint64_t v_dest_table;
+	svbool_t pg = svwhilelt_b64(i, n);
+	do {
+		v_src_table = svld1uw_u64(pg, &src_table[i]);
+		v_src_table = svlsl_x(pg, v_src_table, bit_shift);
+		v_dest_table = svadd_x(pg, v_src_table, (uint64_t)ptr_base);
+		svst1(pg, (uint64_t *)dest_table + i, v_dest_table);
+		i += svcntd();
+		pg = svwhilelt_b64(i, n);
+	} while (svptest_any(svptrue_b64(), pg));
+#elif defined __ARM_NEON
+	uint64_t ptr_diff;
+	uint64x2_t v_src_table;
+	uint64x2_t v_dest_table;
+	int64x2_t v_shift = vdupq_n_s64(bit_shift);
+	uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+	for (; i < (n & ~0x1); i += 2) {
+		v_src_table = vmovl_u32(vld1_u32(src_table + i));
+		v_src_table = vshlq_u64(v_src_table, v_shift);
+		v_dest_table = vaddq_u64(v_src_table, v_ptr_base);
+		vst1q_u64((uint64_t *)dest_table + i, v_dest_table);
+	}
+	/* process the leftover item if n is odd */
+	if (unlikely(n & 0x1)) {
+		ptr_diff = ((uint64_t) src_table[i]) << bit_shift;
+		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+	}
+#else
+	uint64_t ptr_diff;
+	for (; i < n; i++) {
+		ptr_diff = ((uint64_t) src_table[i]) << bit_shift;
+		dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+	}
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PTR_COMPRESS_H_ */
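
For illustration, here is a minimal usage sketch of the two functions added
by this patch (example_round_trip, pool_base and BURST are hypothetical
names; the ring transfer between cores is elided):

#include <rte_ptr_compress.h>

#define BURST 64

/* All pointers come from one region of 8-byte-aligned objects, so
 * bit_shift is 3 and the region may span up to 32GB. */
static void
example_round_trip(void *pool_base, void **burst)
{
	uint32_t compressed[BURST]; /* half the size of the pointer array */
	void *restored[BURST];

	/* producer side: compress before enqueuing to a ring */
	rte_ptr_compress_32(pool_base, burst, compressed, BURST, 3);

	/* ... pass 'compressed' through an rte_ring, copying half the bytes ... */

	/* consumer side: decompress after dequeuing;
	 * 'restored' now holds the same pointers as 'burst' */
	rte_ptr_decompress_32(pool_base, compressed, restored, BURST, 3);
}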