[v6] eal: add cache-line demote support

Message ID 1602497980-20680-2-git-send-email-omkar.maslekar@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series [v6] eal: add cache-line demote support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/Intel-compilation fail apply issues

Commit Message

Omkar Maslekar Oct. 12, 2020, 10:19 a.m. UTC
  rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
enables software to hint to hardware that a cache line is likely to be shared.
It is useful in core-to-core communication, where cache lines are likely to be
shared. The ARM and PPC implementations are provided as NOPs; real
implementations can be added if equivalent instructions become available on
those architectures.

Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

---
v6: marked rte_cldemote as experimental
    added rte_cldemote call in existing app/test/test_prefetch.c

v5: documentation updated
    fixed formatting issue in release notes
    added Acked-by: Bruce Richardson <bruce.richardson@intel.com>

v4: updated bold text for title and fixed margin in release notes

v3: fixed warning regarding whitespace

v2: documentation updated
---
---
 app/test/test_prefetch.c                      |  4 ++++
 doc/guides/rel_notes/release_20_11.rst        |  7 +++++++
 lib/librte_eal/arm/include/rte_prefetch_32.h  |  8 ++++++++
 lib/librte_eal/arm/include/rte_prefetch_64.h  |  8 ++++++++
 lib/librte_eal/include/generic/rte_prefetch.h | 16 ++++++++++++++++
 lib/librte_eal/ppc/include/rte_prefetch.h     |  8 ++++++++
 lib/librte_eal/x86/include/rte_prefetch.h     | 12 ++++++++++++
 7 files changed, 63 insertions(+)
  

Comments

David Christensen Oct. 12, 2020, 7:31 p.m. UTC | #1
On 10/12/20 3:19 AM, Omkar Maslekar wrote:
> rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
> enables software to hint to hardware that line is likely to be shared.
> Useful in core-to-core communications where cache-line is likely to be
> shared. ARM and PPC implementation is provided with NOP and can be added
> if any equivalent instructions could be used for implementation on those
> architectures.
> 
> Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> 
> ---
> v6: marked rte_cldemote as experimental
>      added rte_cldemote call in existing app/test_prefetch.c
> 
> v5: documentation updated
>      fixed formatting issue in release notes
>      added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> *
> v4: updated bold text for title and fixed margin in release notes
> *
> v3: fixed warning regarding whitespace
> *
> v2: documentation updated
> ---
> ---
>   app/test/test_prefetch.c                      |  4 ++++
>   doc/guides/rel_notes/release_20_11.rst        |  7 +++++++
>   lib/librte_eal/arm/include/rte_prefetch_32.h  |  8 ++++++++
>   lib/librte_eal/arm/include/rte_prefetch_64.h  |  8 ++++++++
>   lib/librte_eal/include/generic/rte_prefetch.h | 16 ++++++++++++++++
>   lib/librte_eal/ppc/include/rte_prefetch.h     |  8 ++++++++
>   lib/librte_eal/x86/include/rte_prefetch.h     | 12 ++++++++++++
>   7 files changed, 63 insertions(+)

...snip...

> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
> index 9ba07c8..9630227 100644
> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
> @@ -11,6 +11,7 @@
>   #endif
> 
>   #include <rte_common.h>
> +#include <rte_compat.h>
>   #include "generic/rte_prefetch.h"
> 
>   static inline void rte_prefetch0(const volatile void *p)
> @@ -34,6 +35,13 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>   	rte_prefetch0(p);
>   }
> 
> +static inline void
> +__rte_experimental
> +rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>   #ifdef __cplusplus
>   }
>   #endif

Don't see an equivalent operation in the 3.1 ISA for POWER processors, 
so NOP is the right implementation.

Acked-by: David Christensen <drc@linux.vnet.ibm.com>
  
Ruifeng Wang Oct. 13, 2020, 2:59 a.m. UTC | #2
> -----Original Message-----
> From: Omkar Maslekar <omkar.maslekar@intel.com>
> Sent: Monday, October 12, 2020 6:20 PM
> To: dev@dpdk.org
> Cc: bruce.richardson@intel.com; ciara.loftus@intel.com;
> omkar.maslekar@intel.com; drc@linux.vnet.ibm.com; jerinj@marvell.com;
> Ruifeng Wang <Ruifeng.Wang@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: [PATCH v6] eal: add cache-line demote support
> 
> rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
> enables software to hint to hardware that line is likely to be shared.
> Useful in core-to-core communications where cache-line is likely to be
> shared. ARM and PPC implementation is provided with NOP and can be
> added if any equivalent instructions could be used for implementation on
> those architectures.
> 
> Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> 
> ---
> v6: marked rte_cldemote as experimental
>     added rte_cldemote call in existing app/test_prefetch.c
> 
> v5: documentation updated
>     fixed formatting issue in release notes
>     added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> *
> v4: updated bold text for title and fixed margin in release notes
> *
> v3: fixed warning regarding whitespace
> *
> v2: documentation updated
> ---
> ---
>  app/test/test_prefetch.c                      |  4 ++++
>  doc/guides/rel_notes/release_20_11.rst        |  7 +++++++
>  lib/librte_eal/arm/include/rte_prefetch_32.h  |  8 ++++++++
> lib/librte_eal/arm/include/rte_prefetch_64.h  |  8 ++++++++
> lib/librte_eal/include/generic/rte_prefetch.h | 16 ++++++++++++++++
>  lib/librte_eal/ppc/include/rte_prefetch.h     |  8 ++++++++
>  lib/librte_eal/x86/include/rte_prefetch.h     | 12 ++++++++++++
>  7 files changed, 63 insertions(+)
> 
> diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c index
> 41f219a..5c58d0c 100644
> --- a/app/test/test_prefetch.c
> +++ b/app/test/test_prefetch.c
> @@ -26,7 +26,11 @@
>  	rte_prefetch1(&a);
>  	rte_prefetch2(&a);
> 
> +/* test for marking a line as shared to test cldemote functionality */
> +	rte_cldemote(&a);
> +
>  	return 0;
>  }
> 
> +
>  REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch); diff --git
> a/doc/guides/rel_notes/release_20_11.rst
> b/doc/guides/rel_notes/release_20_11.rst
> index df227a1..dc402ab 100644
> --- a/doc/guides/rel_notes/release_20_11.rst
> +++ b/doc/guides/rel_notes/release_20_11.rst
> @@ -55,6 +55,13 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
> 
> +* **Added new function rte_cldemote in rte_prefetch.h.**
> +
> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
> +  CLDEMOTE moves the cache line to the more remote cache, where it
> + expects  sharing to be efficient. Moving the cache line to a level
> + more distant from  the processor helps to accelerate core-to-core
> communication.
> +

The patch does not apply cleanly; a rebase may be needed.

> 
>  Removed Items
>  -------------
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h
> b/lib/librte_eal/arm/include/rte_prefetch_32.h
> index e53420a..062ed27 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -33,6 +34,13 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
> 
> +static inline void
> +__rte_experimental

See below.

> +rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h
> b/lib/librte_eal/arm/include/rte_prefetch_64.h
> index fc2b391..6e5ee07 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6 +33,13 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));  }
> 
> +static inline void
> +__rte_experimental
> +rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h
> b/lib/librte_eal/include/generic/rte_prefetch.h
> index 6e47bdf..3474548 100644
> --- a/lib/librte_eal/include/generic/rte_prefetch.h
> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
> @@ -51,4 +51,20 @@
>   */
>  static inline void rte_prefetch_non_temporal(const volatile void *p);
> 
> +/**
> + * Demote a cache line to a more distant level of cache from the processor.
> + *
> + * CLDEMOTE hints to hardware to move (demote) a cache line from the
> +closest to
> + * the processor to a level more distant from the processor. It is a
> +hint and
> + * not guarantee. rte_cldemote is intended to move the cache line to
> +the more
> + * remote cache, where it expects sharing to be efficient and to
> +indicate that a
> + * line may be accessed by a different core in the future.
> + *
> + * @param p
> + *   Address to demote
> + */
> +static inline void
> +__rte_experimental

1. The experimental tag is only needed in this file; the tags in the other files can be removed.
2. To be consistent with other code, the experimental tag can be placed on the line above 'static inline void'.

> +rte_cldemote(const volatile void *p);
> +
>  #endif /* _RTE_PREFETCH_H_ */
> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h
> b/lib/librte_eal/ppc/include/rte_prefetch.h
> index 9ba07c8..9630227 100644
> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
> @@ -11,6 +11,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -34,6 +35,13 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
> 
> +static inline void
> +__rte_experimental
> +rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h
> b/lib/librte_eal/x86/include/rte_prefetch.h
> index 384c6b3..e1e120e 100644
> --- a/lib/librte_eal/x86/include/rte_prefetch.h
> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6 +33,17 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char
> *)p));  }
> 
> +/*
> + * we're using raw byte codes for now as only the newest compiler
> + * versions support this instruction natively.
> + */
> +static inline void
> +__rte_experimental
> +rte_cldemote(const volatile void *p)
> +{
> +	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p)); }
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 1.8.3.1
  
Bruce Richardson Oct. 13, 2020, 4:20 p.m. UTC | #3
On Tue, Oct 13, 2020 at 02:59:24AM +0000, Ruifeng Wang wrote:
> 
> > -----Original Message-----
> > From: Omkar Maslekar <omkar.maslekar@intel.com>
> > Sent: Monday, October 12, 2020 6:20 PM
> > To: dev@dpdk.org
> > Cc: bruce.richardson@intel.com; ciara.loftus@intel.com;
> > omkar.maslekar@intel.com; drc@linux.vnet.ibm.com; jerinj@marvell.com;
> > Ruifeng Wang <Ruifeng.Wang@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>
> > Subject: [PATCH v6] eal: add cache-line demote support
> > 
> > rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
> > enables software to hint to hardware that line is likely to be shared.
> > Useful in core-to-core communications where cache-line is likely to be
> > shared. ARM and PPC implementation is provided with NOP and can be
> > added if any equivalent instructions could be used for implementation on
> > those architectures.
> > 
> > Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > 
> > ---
> > v6: marked rte_cldemote as experimental
> >     added rte_cldemote call in existing app/test_prefetch.c
> > 
> > v5: documentation updated
> >     fixed formatting issue in release notes
> >     added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > *
> > v4: updated bold text for title and fixed margin in release notes
> > *
> > v3: fixed warning regarding whitespace
> > *
> > v2: documentation updated
> > ---
> > ---
<snip>
> 
> > +/**
> > + * Demote a cache line to a more distant level of cache from the processor.
> > + *
> > + * CLDEMOTE hints to hardware to move (demote) a cache line from the
> > +closest to
> > + * the processor to a level more distant from the processor. It is a
> > +hint and
> > + * not guarantee. rte_cldemote is intended to move the cache line to
> > +the more
> > + * remote cache, where it expects sharing to be efficient and to
> > +indicate that a
> > + * line may be accessed by a different core in the future.
> > + *
> > + * @param p
> > + *   Address to demote
> > + */
> > +static inline void
> > +__rte_experimental
> 
> 1. Experimental tag is only needed in this file. Tags at other places can be removed.

I'm not sure that is the case. The generic file is used when preparing the
docs, so the experimental tag needs to go there for the docs, but when
actually using the function in compiled code the "generic" version is
unused. Therefore we also need the experimental tag in the arch-specific
headers, to trigger a build warning about using the function if the
appropriate ALLOW_EXPERIMENTAL_APIS flag is not set.

/Bruce
  
Ruifeng Wang Oct. 14, 2020, 1:55 a.m. UTC | #4
> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Wednesday, October 14, 2020 12:20 AM
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: Omkar Maslekar <omkar.maslekar@intel.com>; dev@dpdk.org;
> ciara.loftus@intel.com; drc@linux.vnet.ibm.com; jerinj@marvell.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v6] eal: add cache-line demote support
> 
> On Tue, Oct 13, 2020 at 02:59:24AM +0000, Ruifeng Wang wrote:
> >
> > > -----Original Message-----
> > > From: Omkar Maslekar <omkar.maslekar@intel.com>
> > > Sent: Monday, October 12, 2020 6:20 PM
> > > To: dev@dpdk.org
> > > Cc: bruce.richardson@intel.com; ciara.loftus@intel.com;
> > > omkar.maslekar@intel.com; drc@linux.vnet.ibm.com;
> > > jerinj@marvell.com; Ruifeng Wang <Ruifeng.Wang@arm.com>;
> Honnappa
> > > Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > Subject: [PATCH v6] eal: add cache-line demote support
> > >
> > > rte_cldemote is similar to a prefetch hint - in reverse.
> > > cldemote(addr) enables software to hint to hardware that line is likely to
> be shared.
> > > Useful in core-to-core communications where cache-line is likely to
> > > be shared. ARM and PPC implementation is provided with NOP and can
> > > be added if any equivalent instructions could be used for
> > > implementation on those architectures.
> > >
> > > Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > >
> > > ---
> > > v6: marked rte_cldemote as experimental
> > >     added rte_cldemote call in existing app/test_prefetch.c
> > >
> > > v5: documentation updated
> > >     fixed formatting issue in release notes
> > >     added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > *
> > > v4: updated bold text for title and fixed margin in release notes
> > > *
> > > v3: fixed warning regarding whitespace
> > > *
> > > v2: documentation updated
> > > ---
> > > ---
> <snip>
> >
> > > +/**
> > > + * Demote a cache line to a more distant level of cache from the
> processor.
> > > + *
> > > + * CLDEMOTE hints to hardware to move (demote) a cache line from
> > > +the closest to
> > > + * the processor to a level more distant from the processor. It is
> > > +a hint and
> > > + * not guarantee. rte_cldemote is intended to move the cache line
> > > +to the more
> > > + * remote cache, where it expects sharing to be efficient and to
> > > +indicate that a
> > > + * line may be accessed by a different core in the future.
> > > + *
> > > + * @param p
> > > + *   Address to demote
> > > + */
> > > +static inline void
> > > +__rte_experimental
> >
> > 1. Experimental tag is only needed in this file. Tags at other places can be
> removed.
> 
> I'm not sure that is the case. The generic file is used when preparing the docs,
> so the experimental tag needs to go there for the docs, but when actually
> using the function in compiled code the "generic" version is unused.
> Therefore we need the experimental tag there to trigger a build warning
> about using the function if the appropriate ALLOW_EXPERIMENTAL_APIS flag
> is not set.
> 
+David in cc.

I learnt this from David's comment in thread:
http://patches.dpdk.org/patch/61573/
"We only need it in the function prototype"

Hi David,
Can you comment if my understanding of experimental tag usage is correct?

/Ruifeng
> /Bruce
  
David Marchand Oct. 14, 2020, 7:14 a.m. UTC | #5
On Tue, Oct 13, 2020 at 6:21 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
> > 1. Experimental tag is only needed in this file. Tags at other places can be removed.
>
> I'm not sure that is the case. The generic file is used when preparing the
> docs, so the experimental tag needs to go there for the docs, but when
> actually using the function in compiled code the "generic" version is
> unused. Therefore we need the experimental tag there to trigger a build
> warning about using the function if the appropriate ALLOW_EXPERIMENTAL_APIS
> flag is not set.

It is enough to put an experimental tag when declaring a symbol.
Here, the generic/ header only contains the doxygen part and there is
no common declaration: the tag is needed in the arch specific header.
  
Ruifeng Wang Oct. 14, 2020, 7:51 a.m. UTC | #6
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Wednesday, October 14, 2020 3:14 PM
> To: Bruce Richardson <bruce.richardson@intel.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: Omkar Maslekar <omkar.maslekar@intel.com>; dev@dpdk.org;
> ciara.loftus@intel.com; drc@linux.vnet.ibm.com; jerinj@marvell.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v6] eal: add cache-line demote support
> 
> On Tue, Oct 13, 2020 at 6:21 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> > > 1. Experimental tag is only needed in this file. Tags at other places can be
> removed.
> >
> > I'm not sure that is the case. The generic file is used when preparing
> > the docs, so the experimental tag needs to go there for the docs, but
> > when actually using the function in compiled code the "generic"
> > version is unused. Therefore we need the experimental tag there to
> > trigger a build warning about using the function if the appropriate
> > ALLOW_EXPERIMENTAL_APIS flag is not set.
> 
> It is enough to put an experimental tag when declaring a symbol.
> Here, the generic/ header only contains the doxygen part and there is no
> common declaration: the tag is needed in the arch specific header.
> 
Thank you David for the clarification.

I added my reviewed-by tag to v7.
> 
> --
> David Marchand
  

Patch

diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c
index 41f219a..5c58d0c 100644
--- a/app/test/test_prefetch.c
+++ b/app/test/test_prefetch.c
@@ -26,7 +26,11 @@ 
 	rte_prefetch1(&a);
 	rte_prefetch2(&a);
 
+/* test for marking a line as shared to test cldemote functionality */
+	rte_cldemote(&a);
+
 	return 0;
 }
 
+
 REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch);
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index df227a1..dc402ab 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,13 @@  New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added new function rte_cldemote in rte_prefetch.h.**
+
+  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
+  CLDEMOTE hints that a cache line be moved to a more remote cache level, where
+  sharing is expected to be efficient. Moving the cache line to a level more
+  distant from the processor helps to accelerate core-to-core communication.
+
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h b/lib/librte_eal/arm/include/rte_prefetch_32.h
index e53420a..062ed27 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_32.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -33,6 +34,13 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+static inline void
+__rte_experimental
+rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h b/lib/librte_eal/arm/include/rte_prefetch_64.h
index fc2b391..6e5ee07 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_64.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -32,6 +33,13 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));
 }
 
+static inline void
+__rte_experimental
+rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_prefetch.h b/lib/librte_eal/include/generic/rte_prefetch.h
index 6e47bdf..3474548 100644
--- a/lib/librte_eal/include/generic/rte_prefetch.h
+++ b/lib/librte_eal/include/generic/rte_prefetch.h
@@ -51,4 +51,20 @@ 
  */
 static inline void rte_prefetch_non_temporal(const volatile void *p);
 
+/**
+ * Demote a cache line to a more distant level of cache from the processor.
+ *
+ * CLDEMOTE hints to hardware to move (demote) a cache line from the level
+ * closest to the processor to a more distant level. It is a hint and not a
+ * guarantee. rte_cldemote is intended to move the cache line to a more remote
+ * cache, where sharing is expected to be efficient, and to indicate that the
+ * line may be accessed by a different core in the future.
+ *
+ * @param p
+ *   Address to demote
+ */
+static inline void
+__rte_experimental
+rte_cldemote(const volatile void *p);
+
 #endif /* _RTE_PREFETCH_H_ */
diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
index 9ba07c8..9630227 100644
--- a/lib/librte_eal/ppc/include/rte_prefetch.h
+++ b/lib/librte_eal/ppc/include/rte_prefetch.h
@@ -11,6 +11,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -34,6 +35,13 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+static inline void
+__rte_experimental
+rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/x86/include/rte_prefetch.h b/lib/librte_eal/x86/include/rte_prefetch.h
index 384c6b3..e1e120e 100644
--- a/lib/librte_eal/x86/include/rte_prefetch.h
+++ b/lib/librte_eal/x86/include/rte_prefetch.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -32,6 +33,17 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
 }
 
+/*
+ * we're using raw byte codes for now as only the newest compiler
+ * versions support this instruction natively.
+ */
+static inline void
+__rte_experimental
+rte_cldemote(const volatile void *p)
+{
+	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
+}
+
 #ifdef __cplusplus
 }
 #endif