[v2,3/3] mempool: use cache for frequently updated statistics

Message ID 20221031112634.18329-3-mb@smartsharesystems.com (mailing list archive)
State Superseded, archived
Series [v2,1/3] mempool: split statistics from debug

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS

Commit Message

Morten Brørup Oct. 31, 2022, 11:26 a.m. UTC
  When built with statistics enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
performance of mempools with caches is improved as follows.

When accessing objects in the mempool, either the put_bulk and put_objs or
the get_success_bulk and get_success_objs statistics counters are likely
to be incremented.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated statistics structure is avoided in the
likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to read the 'len' or 'flushthresh' fields.
Updating some statistics counters in the same cache line has lower
performance cost than accessing the statistics counters in the dedicated
statistics structure, which resides in another cache line.
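
For illustration, a minimal sketch of the idea (simplified; the struct
name and hot_path_put() are hypothetical, for exposition only; the real
definitions are in the patch below):

#include <stdint.h>

/* Hot metadata and hot counters packed into one cache line (32 bytes
 * in total here, well within a 64-byte line).
 */
struct cache_sketch {
	uint32_t size;
	uint32_t flushthresh;
	uint32_t len;         /* read/written on every put/get */
	uint32_t unused;
	struct {
		uint64_t put_bulk;
		uint64_t put_objs;
	} stats;              /* shares the cache line with 'len' */
};

static inline void
hot_path_put(struct cache_sketch *c, unsigned int n)
{
	/* Reading 'len' has already pulled the cache line in, so the
	 * counter updates below touch no additional cache line.
	 */
	c->stats.put_bulk += 1;
	c->stats.put_objs += n;
	c->len += n;
}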

mempool_perf_autotest with this patch shows the following change in
rate_persec.

Compared to only splitting statistics from debug:
+1.5 % and +14.4 %, without and with cache, respectively.

Compared to not enabling mempool stats:
-4.4 % and -9.9 %, without and with cache, respectively.

v2:
* Move the statistics counters into a stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  9 +++++
 lib/mempool/rte_mempool.h | 73 ++++++++++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 13 deletions(-)
  

Comments

Mattias Rönnblom Nov. 2, 2022, 8:01 a.m. UTC | #1
On 2022-10-31 12:26, Morten Brørup wrote:
> When built with statistics enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.
> 
> When accessing objects in the mempool, either the put_bulk and put_objs or
> the get_success_bulk and get_success_objs statistics counters are likely
> to be incremented.
> 
> By adding an alternative set of these counters to the mempool cache
> structure, accessing the dedicated statistics structure is avoided in the
> likely cases where these counters are incremented.
> 
> The trick here is that the cache line holding the mempool cache structure
> is accessed anyway, in order to read the 'len' or 'flushthresh' fields.
> Updating some statistics counters in the same cache line has lower
> performance cost than accessing the statistics counters in the dedicated
> statistics structure, which resides in another cache line.
> 
> mempool_perf_autotest with this patch shows the following change in
> rate_persec.
> 
> Compared to only splitting statistics from debug:
> +1.5 % and +14.4 %, without and with cache, respectively.
> 
> Compared to not enabling mempool stats:
> -4.4 % and -9.9 %, without and with cache, respectively.
> 
> v2:
> * Move the statistics counters into a stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/mempool/rte_mempool.c |  9 +++++
>   lib/mempool/rte_mempool.h | 73 ++++++++++++++++++++++++++++++++-------
>   2 files changed, 69 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index e6208125e0..a18e39af04 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>   		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
>   		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
>   	}
> +	if (mp->cache_size != 0) {
> +		/* Add the statistics stored in the mempool caches. */
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
> +			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
> +			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
> +			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
> +		}
> +	}
>   	fprintf(f, "  stats:\n");
>   	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
>   	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 16e7e62e3c..5806e75609 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -86,6 +86,21 @@ struct rte_mempool_cache {
>   	uint32_t size;	      /**< Size of the cache */
>   	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>   	uint32_t len;	      /**< Current cache count */
> +	uint32_t unused0;
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	/*
> +	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
> +	 * providing faster update access when using a mempool cache.
> +	 */
> +	struct {
> +		uint64_t put_bulk;          /**< Number of puts. */
> +		uint64_t put_objs;          /**< Number of objects successfully put. */
> +		uint64_t get_success_bulk;  /**< Successful allocation number. */
> +		uint64_t get_success_objs;  /**< Objects successfully allocated. */
> +	} stats;                        /**< Statistics */
> +#else
> +	uint64_t unused1[4];

Is a particular DPDK version supposed to be ABI compatible with itself, 
with different configuration options? E.g., with and without 
RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused uint64_ts?
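
For context, the unused1[4] padding keeps the structure layout identical
whether or not RTE_LIBRTE_MEMPOOL_STATS is defined. A hypothetical
compile-time guard for that invariant (not part of the patch) might read:

#ifdef RTE_LIBRTE_MEMPOOL_STATS
/* If a counter is ever added to 'stats', 'unused1' in the #else arm
 * must grow to match, or the two builds disagree on the layout of the
 * fields that follow.
 */
_Static_assert(sizeof(((struct rte_mempool_cache *)0)->stats) ==
	       sizeof(uint64_t[4]),
	       "mempool cache stats/padding size mismatch");
#endif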

> +#endif
>   	/**
>   	 * Cache objects
>   	 *
> @@ -296,14 +311,14 @@ struct rte_mempool {
>   	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
>   	)
>   /**
> - * @internal When debug is enabled, store some statistics.
> + * @internal When stats is enabled, store some statistics.
>    *
>    * @param mp
>    *   Pointer to the memory pool.
>    * @param name
>    *   Name of the statistics field to increment in the memory pool.
>    * @param n
> - *   Number to add to the object-oriented statistics.
> + *   Number to add to the statistics.
>    */
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> @@ -312,6 +327,23 @@ struct rte_mempool {
>   #else
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
>   #endif
> +/**
> + * @internal When stats is enabled, store some statistics.
> + *
> + * @param cache
> + *   Pointer to the memory pool cache.
> + * @param name
> + *   Name of the statistics field to increment in the memory pool cache.
> + * @param n
> + *   Number to add to the statistics.
> + */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
> +		(cache)->stats.name += n;               \
> +	} while (0)
> +#else
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)

Somewhat unrelated comment: maybe <rte_common.h> should have a RTE_NOP 
macro.
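
For illustration, such a helper (hypothetical; no RTE_NOP exists in
<rte_common.h> today) might be:

/* A statement that compiles to nothing, usable wherever the grammar
 * requires a statement.
 */
#define RTE_NOP do {} while (0)

/* The stats-disabled branch above could then shrink to: */
#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) RTE_NOP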

> +#endif
>   
>   /**
>    * @internal Calculate the size of the mempool header.
> @@ -1327,13 +1359,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   {
>   	void **cache_objs;
>   
> +	/* No cache provided */
> +	if (unlikely(cache == NULL))
> +		goto driver_enqueue;
> +
>   	/* increment stat now, adding in mempool always success */
> -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>   
> -	/* No cache provided or the request itself is too big for the cache */
> -	if (unlikely(cache == NULL || n > cache->flushthresh))
> -		goto driver_enqueue;
> +	/* The request itself is too big for the cache */
> +	if (unlikely(n > cache->flushthresh))
> +		goto driver_enqueue_stats_incremented;
>   
>   	/*
>   	 * The cache follows the following algorithm:
> @@ -1358,6 +1394,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   
>   driver_enqueue:
>   
> +	/* increment stat now, adding in mempool always success */
> +	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> +	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +
> +driver_enqueue_stats_incremented:
> +
>   	/* push objects to the backend */
>   	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
>   }
> @@ -1464,8 +1506,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   	if (remaining == 0) {
>   		/* The entire request is satisfied from the cache. */
>   
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   		return 0;
>   	}
> @@ -1494,8 +1536,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   
>   	cache->len = cache->size;
>   
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   	return 0;
>   
> @@ -1517,8 +1559,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>   	} else {
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		if (likely(cache != NULL)) {
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +		} else {
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		}
>   	}
>   
>   	return ret;
  
Morten Brørup Nov. 2, 2022, 9:29 a.m. UTC | #2
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 09.01
> 
> On 2022-10-31 12:26, Morten Brørup wrote:

[...]

> > +++ b/lib/mempool/rte_mempool.h
> > @@ -86,6 +86,21 @@ struct rte_mempool_cache {
> >   	uint32_t size;	      /**< Size of the cache */
> >   	uint32_t flushthresh; /**< Threshold before we flush excess
> elements */
> >   	uint32_t len;	      /**< Current cache count */
> > +	uint32_t unused0;
> > +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> > +	/*
> > +	 * Alternative location for the most frequently updated mempool
> statistics (per-lcore),
> > +	 * providing faster update access when using a mempool cache.
> > +	 */
> > +	struct {
> > +		uint64_t put_bulk;          /**< Number of puts. */
> > +		uint64_t put_objs;          /**< Number of objects
> successfully put. */
> > +		uint64_t get_success_bulk;  /**< Successful allocation
> number. */
> > +		uint64_t get_success_objs;  /**< Objects successfully
> allocated. */
> > +	} stats;                        /**< Statistics */
> > +#else
> > +	uint64_t unused1[4];
> 
> Is a particular DPDK version supposed to be ABI compatible with
> itself,
> with different configuration options? E.g., with and without
> RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused
> uint64_ts?

Valid point: There was no ABI compatibility between with and without RTE_LIBRTE_MEMPOOL_STATS before this patch, so there is no need to add partial ABI compatibility here.

The #else part of this structure should be removed.

Does anyone disagree?

> > +#endif
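
For reference, dropping the #else arm (and possibly 'unused0' as well,
see the follow-up below) would leave the structure roughly as follows
(a sketch, not the final patch):

struct rte_mempool_cache {
	uint32_t size;        /**< Size of the cache */
	uint32_t flushthresh; /**< Threshold before we flush excess elements */
	uint32_t len;         /**< Current cache count */
#ifdef RTE_LIBRTE_MEMPOOL_STATS
	struct {
		uint64_t put_bulk;
		uint64_t put_objs;
		uint64_t get_success_bulk;
		uint64_t get_success_objs;
	} stats;              /**< Statistics */
#endif
	/* ... cache objects follow, as before ... */
};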
  
Mattias Rönnblom Nov. 2, 2022, 5:55 p.m. UTC | #3
On 2022-11-02 10:29, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 09.01
>>
>> On 2022-10-31 12:26, Morten Brørup wrote:
> 
> [...]
> 
>>> +++ b/lib/mempool/rte_mempool.h
>>> @@ -86,6 +86,21 @@ struct rte_mempool_cache {
>>>    	uint32_t size;	      /**< Size of the cache */
>>>    	uint32_t flushthresh; /**< Threshold before we flush excess
>> elements */
>>>    	uint32_t len;	      /**< Current cache count */
>>> +	uint32_t unused0;
>>> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>>> +	/*
>>> +	 * Alternative location for the most frequently updated mempool
>> statistics (per-lcore),
>>> +	 * providing faster update access when using a mempool cache.
>>> +	 */
>>> +	struct {
>>> +		uint64_t put_bulk;          /**< Number of puts. */
>>> +		uint64_t put_objs;          /**< Number of objects
>> successfully put. */
>>> +		uint64_t get_success_bulk;  /**< Successful allocation
>> number. */
>>> +		uint64_t get_success_objs;  /**< Objects successfully
>> allocated. */
>>> +	} stats;                        /**< Statistics */
>>> +#else
>>> +	uint64_t unused1[4];
>>
>> Is a particular DPDK version supposed to be ABI compatible with
>> itself,
>> with different configuration options? E.g., with and without
>> RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused
>> uint64_ts?
> 
> Valid point: There was no ABI compatibility between with and without RTE_LIBRTE_MEMPOOL_STATS before this patch, so there is no need to add partial ABI compatibility here.
> 
> The #else part of this structure should be removed.
> 
> Does anyone disagree?
> 
>>> +#endif
> 

I have no opinion on that matter, but I have another question: if you 
remove 'unused1', should you also remove the unused0 field?
  

Patch

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index e6208125e0..a18e39af04 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1286,6 +1286,15 @@  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 16e7e62e3c..5806e75609 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,21 @@  struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	uint32_t unused0;
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#else
+	uint64_t unused1[4];
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -296,14 +311,14 @@  struct rte_mempool {
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats is enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
@@ -312,6 +327,23 @@  struct rte_mempool {
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif
+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
+		(cache)->stats.name += n;               \
+	} while (0)
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
 
 /**
  * @internal Calculate the size of the mempool header.
@@ -1327,13 +1359,17 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1358,6 +1394,12 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1464,8 +1506,8 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 		return 0;
 	}
@@ -1494,8 +1536,8 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
@@ -1517,8 +1559,13 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}
 
 	return ret;