mempool: optimized debug statistics

Message ID: 20221027182239.89480-1-mb@smartsharesystems.com (mailing list archive)
State: Superseded, archived
Delegated to: Thomas Monjalon
Series: mempool: optimized debug statistics

Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  warning   apply issues

Commit Message

Morten Brørup Oct. 27, 2022, 6:22 p.m. UTC
  When built with debug enabled (RTE_LIBRTE_MEMPOOL_DEBUG defined), the
performance of mempools with caches is improved as follows.

Accessing objects in the mempool is likely to increment either the
put_bulk and put_objs or the get_success_bulk and get_success_objs
debug statistics counters.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated debug statistics structure is avoided in
the likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to update the "len" field. Updating some
debug statistics counters in that same cache line has a lower performance
cost than accessing the counters in the dedicated debug statistics
structure, i.e. in another cache line.
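
To illustrate the point, here is a minimal sketch (not the actual
rte_mempool code; the names cache_sketch and put_fast_path are made up
for this example). The fast path already writes "len", so counters kept
next to it are dirtied in the same cache line, instead of also touching
a separate per-lcore stats structure:

    #include <stdint.h>
    #include <string.h>

    /* Hot counters are kept next to "len", which the fast path writes anyway. */
    struct cache_sketch {
    	uint32_t len;       /* current cache count; updated on every put/get */
    	uint64_t put_bulk;  /* likely-path statistics, co-located with len */
    	uint64_t put_objs;
    	void *objs[512];    /* cached object pointers */
    };

    static inline void
    put_fast_path(struct cache_sketch *cache, void * const *objs, unsigned int n)
    {
    	/* len and the counters are updated together in the same cache line. */
    	cache->put_bulk += 1;
    	cache->put_objs += n;
    	memcpy(&cache->objs[cache->len], objs, sizeof(objs[0]) * n);
    	cache->len += n;
    }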

Running mempool_perf_autotest on a VMware virtual server shows an avg.
increase of 6.4 % in rate_persec for the tests with cache. (Only when
built with debug enabled, obviously!)

For the tests without cache, the avg. increase in rate_persec is 0.8 %. I
assume this is noise from the test environment.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  7 +++++
 lib/mempool/rte_mempool.h | 55 +++++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 11 deletions(-)
  

Patch

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..7b8c00a022 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1285,6 +1285,13 @@  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs;
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
+		/* Add the fast access statistics, if local caches exist */
+		if (mp->cache_size != 0) {
+			sum.put_bulk += mp->local_cache[lcore_id].put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].get_success_objs;
+		}
 	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..d84087bc92 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,14 @@  struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	uint32_t unused;
+	/* Fast access statistics, only for likely events */
+	uint64_t put_bulk;             /**< Number of puts. */
+	uint64_t put_objs;             /**< Number of objects successfully put. */
+	uint64_t get_success_bulk;     /**< Successful allocation number. */
+	uint64_t get_success_objs;     /**< Objects successfully allocated. */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -1327,13 +1335,19 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	cache->put_bulk += 1;
+	cache->put_objs += n;
+#endif
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1358,6 +1372,12 @@  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1464,8 +1484,10 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		cache->get_success_bulk += 1;
+		cache->get_success_objs += n;
+#endif
 
 		return 0;
 	}
@@ -1494,8 +1516,10 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	cache->get_success_bulk += 1;
+	cache->get_success_objs += n;
+#endif
 
 	return 0;
 
@@ -1517,8 +1541,17 @@  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (likely(cache != NULL)) {
+			cache->get_success_bulk += 1;
+			cache->get_success_objs += n;
+		} else {
+#endif
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		}
+#endif
 	}
 
 	return ret;
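
For reference, the summed statistics (which after this patch also include
the fast-access counters kept in the per-lcore caches) are printed by
rte_mempool_dump(). A minimal usage sketch; print_pool_stats is just an
example name:

    #include <stdio.h>
    #include <rte_mempool.h>

    /* Dump the pool's statistics; with RTE_LIBRTE_MEMPOOL_DEBUG defined,
     * the "stats:" section includes the aggregated debug counters. */
    static void
    print_pool_stats(struct rte_mempool *mp)
    {
    	rte_mempool_dump(stdout, mp);
    }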