mempool: split statistics from debug

Message ID 20221030115445.2115-1-mb@smartsharesystems.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: mempool: split statistics from debug

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance fail Performance Testing issues
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-x86_64-unit-testing fail Testing issues
ci/github-robot: build fail github build: failed
ci/iol-aarch64-compile-testing success Testing PASS

Commit Message

Morten Brørup Oct. 30, 2022, 11:54 a.m. UTC
  Split statistics from debug, to make mempool statistics available without
the performance cost of continuously validating the cookies in the mempool
elements.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/rte_config.h       |  2 ++
 lib/mempool/rte_mempool.c | 10 +++++-----
 lib/mempool/rte_mempool.h | 14 +++++++-------
 3 files changed, 14 insertions(+), 12 deletions(-)
  

Comments

Morten Brørup Oct. 30, 2022, 2:04 p.m. UTC | #1
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Sunday, 30 October 2022 12.55
> 
> Split statistics from debug, to make mempool statistics available
> without
> the performance cost of continuously validating the cookies in the
> mempool
> elements.

mempool_perf_autotest shows that the rate_persec drops to a third (-66 %) when enabling full mempool debug - quite prohibitive!

With this patch, the performance cost is much lower. I don't have detailed test results, but the initial tests without mempool cache show a performance drop of -11 %. Significant, but not prohibitive.
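
For reference, with this patch the two options can be toggled independently in config/rte_config.h. A minimal sketch of a local edit (illustration only, not part of the patch):

/* Local edit of config/rte_config.h - illustration only, not part of the patch. */
#define RTE_LIBRTE_MEMPOOL_STATS 1      /* per-lcore counters, no cookie validation */
/* #define RTE_LIBRTE_MEMPOOL_DEBUG 1      cookie validation, independent of the stats */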

> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  config/rte_config.h       |  2 ++
>  lib/mempool/rte_mempool.c | 10 +++++-----
>  lib/mempool/rte_mempool.h | 14 +++++++-------
>  3 files changed, 14 insertions(+), 12 deletions(-)
> 
> diff --git a/config/rte_config.h b/config/rte_config.h
> index ae56a86394..1b12d30557 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -47,6 +47,8 @@
> 
>  /* mempool defines */
>  #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
> +// RTE_LIBRTE_MEMPOOL_STATS is not set
> +// RTE_LIBRTE_MEMPOOL_DEBUG is not set
> 
>  /* mbuf defines */
>  #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 21c94a2b9f..f180257a54 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -818,8 +818,8 @@ rte_mempool_create_empty(const char *name, unsigned
> n, unsigned elt_size,
>  			  RTE_CACHE_LINE_MASK) != 0);
>  	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
>  			  RTE_CACHE_LINE_MASK) != 0);
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> -	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_stats) &
>  			  RTE_CACHE_LINE_MASK) != 0);
>  	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
>  			  RTE_CACHE_LINE_MASK) != 0);
> @@ -1221,9 +1221,9 @@ rte_mempool_audit(struct rte_mempool *mp)
>  void
>  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>  {
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	struct rte_mempool_info info;
> -	struct rte_mempool_debug_stats sum;
> +	struct rte_mempool_stats sum;
>  	unsigned lcore_id;
>  #endif
>  	struct rte_mempool_memhdr *memhdr;
> @@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>  	fprintf(f, "  common_pool_count=%u\n", common_count);
> 
>  	/* sum and dump statistics */
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	rte_mempool_ops_get_info(mp, &info);
>  	memset(&sum, 0, sizeof(sum));
>  	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 3725a72951..89640e48e5 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -56,14 +56,14 @@ extern "C" {
>  #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header
> cookie. */
>  #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer
> cookie.*/
> 
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  /**
>   * A structure that stores the mempool statistics (per-lcore).
>   * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are
> not
>   * captured since they can be calculated from other stats.
>   * For example: put_cache_objs = put_objs - put_common_pool_objs.
>   */
> -struct rte_mempool_debug_stats {
> +struct rte_mempool_stats {
>  	uint64_t put_bulk;             /**< Number of puts. */
>  	uint64_t put_objs;             /**< Number of objects
> successfully put. */
>  	uint64_t put_common_pool_bulk; /**< Number of bulks enqueued in
> common pool. */
> @@ -237,9 +237,9 @@ struct rte_mempool {
>  	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
>  	struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> 
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	/** Per-lcore statistics. */
> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +	struct rte_mempool_stats stats[RTE_MAX_LCORE];
>  #endif
>  }  __rte_cache_aligned;
> 
> @@ -293,16 +293,16 @@ struct rte_mempool {
>  	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
>  	)
>  /**
> - * @internal When debug is enabled, store some statistics.
> + * @internal When stats are enabled, store some statistics.
>   *
>   * @param mp
>   *   Pointer to the memory pool.
>   * @param name
>   *   Name of the statistics field to increment in the memory pool.
>   * @param n
> - *   Number to add to the object-oriented statistics.
> + *   Number to add to the statistics.
>   */
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
>  		unsigned __lcore_id = rte_lcore_id();           \
>  		if (__lcore_id < RTE_MAX_LCORE) {               \
> --
> 2.17.1
  
Stephen Hemminger Oct. 30, 2022, 4:12 p.m. UTC | #2
On Sun, 30 Oct 2022 15:04:18 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > Sent: Sunday, 30 October 2022 12.55
> > 
> > Split statistics from debug, to make mempool statistics available
> > without
> > the performance cost of continuously validating the cookies in the
> > mempool
> > elements.  
> 
> mempool_perf_autotest shows that the rate_persec drops to a third (-66 %) when enabling full mempool debug - quite prohibitive!
> 
> With this patch, the performance cost is much lower. I don't have detailed test results, but the initial tests without mempool cache show a performance drop of -11 %. Significant, but not prohibitive.


One trick to avoid the conditional in the fast path would be to add a dummy stats[] slot for non-DPDK threads.

Another would be to move the fast-path get/put stats into the mempool_cache.
The current model has the per-CPU stats[] always on another cache line.

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1f5707f46a21..87905b7286a6 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -236,8 +236,10 @@ struct rte_mempool {
        struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-       /** Per-lcore statistics. */
-       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+       /** Per-lcore statistics.
+        * Allocate one additional per-cpu slot for non-DPDK threads
+        */
+       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -302,10 +304,7 @@ struct rte_mempool {
  */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-               unsigned __lcore_id = rte_lcore_id();           \
-               if (__lcore_id < RTE_MAX_LCORE) {               \
-                       mp->stats[__lcore_id].name += n;        \
-               }                                               \
+       (mp)->stats[rte_lcore_id()].name += n;
        } while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
  
Morten Brørup Oct. 30, 2022, 8:29 p.m. UTC | #3
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Sunday, 30 October 2022 17.13
> 
> On Sun, 30 Oct 2022 15:04:18 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > > Sent: Sunday, 30 October 2022 12.55
> > >
> > > Split statistics from debug, to make mempool statistics available
> > > without
> > > the performance cost of continuously validating the cookies in the
> > > mempool
> > > elements.
> >
> > mempool_perf_autotest shows that the rate_persec drops to a third (-
> 66 %) when enabling full mempool debug - quite prohibitive!
> >
> > With this patch, the performance cost is much lower. I don't have
> detailed test results, but the initial tests without mempool cache show
> a performance drop of -11 %. Significant, but not prohibitive.
> 
> 
> One trick to avoid conditional in fast path would be to add a dummy
> stats[] per core.
> 
> Another would be to move the fast path get/put stats into the
> mempool_cache.
> The current model has stats[] per cpu always on another cache line.

I submitted a patch to move the likely (fast path) get/put stats to the mempool cache [1], but retracted it shortly after. I realized that splitting the stats from the debug cookies has a much bigger performance effect, so we should do that first. We can move the statistics counters into the mempool_cache as part two.

[1]: https://patchwork.dpdk.org/project/dpdk/patch/20221028064152.98341-2-mb@smartsharesystems.com/

Also, it might be too late for 22.11 to move stats to the mempool cache. However, splitting the stats from the debug cookies is extremely simple, so that should be accepted for 22.11 (if reviewed properly).
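
To illustrate the "part two" idea in concept only (the field names below are illustrative; this is neither DPDK's actual struct rte_mempool_cache nor the retracted patch):

/* Conceptual sketch: the hot put/get counters sit next to the cache fields
 * that the fast path already touches, instead of in a separate per-lcore
 * stats[] array in struct rte_mempool on another cache line.
 */
struct mempool_cache_sketch {
	uint32_t len;                 /* number of cached object pointers */
#ifdef RTE_LIBRTE_MEMPOOL_STATS
	uint64_t put_bulk;            /* counted on an already-hot cache line */
	uint64_t put_objs;
	uint64_t get_success_bulk;
	uint64_t get_success_objs;
#endif
	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];  /* cached object pointers */
} __rte_cache_aligned;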

> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1f5707f46a21..87905b7286a6 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -236,8 +236,10 @@ struct rte_mempool {
>         struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> 
>  #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> -       /** Per-lcore statistics. */
> -       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +       /** Per-lcore statistics.
> +        * Allocate one additional per-cpu slot for non-DPDK threads
> +        */
> +       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];

Excellent! As a bonus, this also fixes a bug in the statistics: Non-DPDK thread operations were not counted.

>  #endif
>  }  __rte_cache_aligned;
> 
> @@ -302,10 +304,7 @@ struct rte_mempool {
>   */
>  #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> -               unsigned __lcore_id = rte_lcore_id();           \
> -               if (__lcore_id < RTE_MAX_LCORE) {               \
> -                       mp->stats[__lcore_id].name += n;        \
> -               }                                               \
> +       (mp)->stats[rte_lcore_id()].name += n;

I suppose the index must be offset by one, so that unregistered threads, where rte_lcore_id() returns LCORE_ID_ANY (= UINT32_MAX), end up in slot zero via unsigned wraparound:
+       (mp)->stats[rte_lcore_id() + 1].name += n;

>         } while (0)
>  #else
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
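
Putting the extra slot and the +1 index together, the macro could end up roughly like this (a sketch of the idea only, not the actual v2 submission):

#ifdef RTE_LIBRTE_MEMPOOL_STATS
/* rte_lcore_id() returns LCORE_ID_ANY (UINT32_MAX) for unregistered threads;
 * the unsigned +1 wraps that to index 0, and lcore N lands in slot N + 1,
 * so no bounds check is needed in the fast path.
 */
#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
		(mp)->stats[rte_lcore_id() + 1].name += n;      \
	} while (0)
#else
#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
#endif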

I have marked this patch as Changes Requested, and will submit a v2 patch series with this improvement - but not today, considering the local time.

NB: I am aware that the loop in rte_mempool_dump() in rte_mempool.c must also be updated to account for the additional slot in the stats array. I'll check if there are similar effects elsewhere in the library.
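
For completeness, a sketch of what that change in rte_mempool_dump() could look like (my assumption, counter list abbreviated):

/* The summing loop must also cover the extra slot used by unregistered threads. */
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
	sum.put_bulk += mp->stats[lcore_id].put_bulk;
	sum.put_objs += mp->stats[lcore_id].put_objs;
	/* ... likewise for the remaining counters ... */
}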
  

Patch

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..1b12d30557 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@ 
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+// RTE_LIBRTE_MEMPOOL_STATS is not set
+// RTE_LIBRTE_MEMPOOL_DEBUG is not set
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..f180257a54 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,8 +818,8 @@  rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
@@ -1221,9 +1221,9 @@  rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
-	struct rte_mempool_debug_stats sum;
+	struct rte_mempool_stats sum;
 	unsigned lcore_id;
 #endif
 	struct rte_mempool_memhdr *memhdr;
@@ -1269,7 +1269,7 @@  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..89640e48e5 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,14 +56,14 @@  extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
  * captured since they can be calculated from other stats.
  * For example: put_cache_objs = put_objs - put_common_pool_objs.
  */
-struct rte_mempool_debug_stats {
+struct rte_mempool_stats {
 	uint64_t put_bulk;             /**< Number of puts. */
 	uint64_t put_objs;             /**< Number of objects successfully put. */
 	uint64_t put_common_pool_bulk; /**< Number of bulks enqueued in common pool. */
@@ -237,9 +237,9 @@  struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	struct rte_mempool_stats stats[RTE_MAX_LCORE];
 #endif
 }  __rte_cache_aligned;
 
@@ -293,16 +293,16 @@  struct rte_mempool {
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats are enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \