From patchwork Fri Oct 28 06:35:03 2022
From: Morten Brørup <mb@smartsharesystems.com>
To: olivier.matz@6wind.com, andrew.rybchenko@oktetlabs.ru
Cc: jerinj@marvell.com, thomas@monjalon.net, bruce.richardson@intel.com,
 dev@dpdk.org, Morten Brørup <mb@smartsharesystems.com>
Subject: [PATCH v3 1/2] mempool: cache align mempool cache objects
Date: Fri, 28 Oct 2022 08:35:03 +0200
Message-Id: <20221028063504.98184-1-mb@smartsharesystems.com>
In-Reply-To: <20221026144436.71068-1-mb@smartsharesystems.com>
References: <20221026144436.71068-1-mb@smartsharesystems.com>

Add __rte_cache_aligned to the objs array.

It makes no difference in the general case, but if get/put operations are
always 32 objects, it will reduce the number of memory (or last level
cache) accesses from five to four 64 B cache lines for every get/put
operation.

For readability reasons, an example using 16 objects follows:

Currently, with 16 objects (128 B), we access 3 cache lines:

      ┌────────┐
      │len     │
cache │********│---
line0 │********│ ^
      │********│ |
      ├────────┤ | 16 objects
      │********│ | 128B
cache │********│ |
line1 │********│ |
      │********│ |
      ├────────┤ |
      │********│_v_
cache │        │
line2 │        │
      │        │
      └────────┘

With the alignment, it is also 3 cache lines:

      ┌────────┐
      │len     │
cache │        │
line0 │        │
      │        │
      ├────────┤---
      │********│ ^
cache │********│ |
line1 │********│ |
      │********│ |
      ├────────┤ | 16 objects
      │********│ | 128B
cache │********│ |
line2 │********│ |
      │********│ v
      └────────┘---

However, accessing the objects at the bottom of the mempool cache is a
special case, where cache line0 is also used for objects.

Consider the next burst (and any following bursts):

Current:

      ┌────────┐
      │len     │
cache │        │
line0 │        │
      │        │
      ├────────┤
      │        │
cache │        │
line1 │        │
      │        │
      ├────────┤
      │        │
cache │********│---
line2 │********│ ^
      │********│ |
      ├────────┤ | 16 objects
      │********│ | 128B
cache │********│ |
line3 │********│ |
      │********│ |
      ├────────┤ |
      │********│_v_
cache │        │
line4 │        │
      │        │
      └────────┘

4 cache lines touched, incl. line0 for len.

With the proposed alignment:

      ┌────────┐
      │len     │
cache │        │
line0 │        │
      │        │
      ├────────┤
      │        │
cache │        │
line1 │        │
      │        │
      ├────────┤
      │        │
cache │        │
line2 │        │
      │        │
      ├────────┤
      │********│---
cache │********│ ^
line3 │********│ |
      │********│ | 16 objects
      ├────────┤ | 128B
      │********│ |
cache │********│ |
line4 │********│ |
      │********│_v_
      └────────┘

Only 3 cache lines touched, incl. line0 for len.

Credits go to Olivier Matz for the nice ASCII graphics.
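The resulting layout can be sanity checked with a few lines of C. The
following is a minimal sketch, not part of the patch: it mirrors the
structure using GCC-style attributes, assumes a 64 B cache line, and uses
a stand-in value for RTE_MEMPOOL_CACHE_MAX_SIZE.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SIZE 64
#define CACHE_MAX_SIZE  512 /* stand-in for RTE_MEMPOOL_CACHE_MAX_SIZE */

/* Simplified mirror of struct rte_mempool_cache with the proposed
 * alignment of the objs array. */
struct cache_mirror {
	uint32_t size;
	uint32_t flushthresh;
	uint32_t len;
	void *objs[CACHE_MAX_SIZE * 2]
		__attribute__((aligned(CACHE_LINE_SIZE)));
} __attribute__((aligned(CACHE_LINE_SIZE)));

int main(void)
{
	/* With the alignment, objs starts on its own cache line and no
	 * longer shares cache line0 with the len field. */
	printf("offsetof(objs) = %zu\n",
	       offsetof(struct cache_mirror, objs));
	assert(offsetof(struct cache_mirror, objs) % CACHE_LINE_SIZE == 0);
	return 0;
}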
v3:

* No changes. Made part of a series.

v2:

* No such version.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1f5707f46a..3725a72951 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,11 +86,13 @@ struct rte_mempool_cache {
 	uint32_t size;        /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;         /**< Current cache count */
-	/*
+	/**
+	 * Cache objects
+	 *
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2]; /**< Cache objects */
+	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2] __rte_cache_aligned;
 } __rte_cache_aligned;
 
 /**

From patchwork Fri Oct 28 06:35:04 2022
From: Morten Brørup <mb@smartsharesystems.com>
To: olivier.matz@6wind.com, andrew.rybchenko@oktetlabs.ru
Cc: jerinj@marvell.com, thomas@monjalon.net, bruce.richardson@intel.com,
 dev@dpdk.org, Morten Brørup <mb@smartsharesystems.com>
Subject: [PATCH v3 2/2] mempool: optimized debug statistics
Date: Fri, 28 Oct 2022 08:35:04 +0200
Message-Id: <20221028063504.98184-2-mb@smartsharesystems.com>
In-Reply-To: <20221028063504.98184-1-mb@smartsharesystems.com>
References: <20221026144436.71068-1-mb@smartsharesystems.com>
 <20221028063504.98184-1-mb@smartsharesystems.com>

When built with debug enabled (RTE_LIBRTE_MEMPOOL_DEBUG defined), the
performance of mempools with caches is improved as follows.

Accessing objects in the mempool is likely to increment either the
put_bulk and put_objs or the get_success_bulk and get_success_objs
debug statistics counters.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated debug statistics structure is avoided
in the likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to update the "len" field. Updating some
debug statistics counters in the same cache line has lower performance
cost than accessing the debug statistics counters in the dedicated debug
statistics structure, i.e. in another cache line.
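To illustrate the idea, here is a simplified sketch with assumed names
and a 64 B line size; it is not the actual DPDK structures, only a
self-contained model of the layout being described:

#include <stdint.h>

#define CACHE_LINE_SIZE 64

/* The counters for the likely events sit next to len, in the cache
 * line that every cached get/put touches anyway. Total payload here is
 * 48 B, so everything fits in one 64 B line. */
struct cache_sketch {
	uint32_t size;
	uint32_t flushthresh;
	uint32_t len;    /* updated on every get/put */
	uint32_t unused; /* explicit padding for 8 B alignment */
	uint64_t put_bulk;
	uint64_t put_objs;
	uint64_t get_success_bulk;
	uint64_t get_success_objs;
} __attribute__((aligned(CACHE_LINE_SIZE)));

/* The unlikely events (failures, cache bypass) still go to a dedicated
 * statistics structure, i.e. a different cache line. */
struct stats_sketch {
	uint64_t get_fail_bulk;
	uint64_t get_fail_objs;
	/* ... */
} __attribute__((aligned(CACHE_LINE_SIZE)));

static inline void count_get_success(struct cache_sketch *c, unsigned int n)
{
	/* c->len was just updated, so this line is already dirty; these
	 * increments add no extra cache line traffic. */
	c->get_success_bulk += 1;
	c->get_success_objs += n;
}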
Running mempool_perf_autotest on a VMware virtual server shows an avg.
increase of 6.4 % in rate_persec for the tests with cache. (Only when
built with debug enabled, obviously!)

For the tests without cache, the avg. increase in rate_persec is 0.8 %.
I assume this is noise from the test environment.

v3:

* Try to fix git reference by making it part of a series.
* Add --in-reply-to v1 when sending email.

v2:

* Fix spelling and repeated word in commit message, caught by checkpatch.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
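For reviewers, the reworked control flow of the put path condenses to
the sketch below. This is a simplified, self-contained mock-up, not the
DPDK code: the pool and pool_cache types, the stats field, and
pool_enqueue_bulk() are placeholders for the real structures,
RTE_MEMPOOL_STAT_ADD(), and rte_mempool_ops_enqueue_bulk().

#include <stdint.h>

struct pool_cache {
	uint32_t flushthresh;
	uint32_t len;
	uint64_t put_bulk;
	uint64_t put_objs;
};

struct pool {
	struct { uint64_t put_bulk, put_objs; } stats;
};

static void pool_enqueue_bulk(struct pool *mp, void * const *objs,
			      unsigned int n)
{
	(void)mp; (void)objs; (void)n; /* backend stub */
}

static void generic_put_sketch(struct pool *mp, void * const *obj_table,
			       unsigned int n, struct pool_cache *cache)
{
	if (cache == NULL)
		goto driver_enqueue;

	/* Likely path: count in the cache line that holds len anyway. */
	cache->put_bulk += 1;
	cache->put_objs += n;

	if (n > cache->flushthresh)
		goto driver_enqueue_stats_incremented;

	/* ... store the objects in the cache, flushing any excess ... */
	return;

driver_enqueue:
	/* No cache: count in the dedicated statistics structure. */
	mp->stats.put_bulk += 1;
	mp->stats.put_objs += n;

driver_enqueue_stats_incremented:
	/* The second label lets the too-big-for-cache path skip the
	 * slow-path statistics, which were already counted above. */
	pool_enqueue_bulk(mp, obj_table, n);
}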
 lib/mempool/rte_mempool.c |  7 +++++
 lib/mempool/rte_mempool.h | 55 +++++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..7b8c00a022 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1285,6 +1285,13 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs;
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
+		/* Add the fast access statistics, if local caches exist */
+		if (mp->cache_size != 0) {
+			sum.put_bulk += mp->local_cache[lcore_id].put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].get_success_objs;
+		}
 	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..d84087bc92 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,14 @@ struct rte_mempool_cache {
 	uint32_t size;        /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;         /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	uint32_t unused;           /**< Explicit padding for alignment. */
+	/* Fast access statistics, only for likely events */
+	uint64_t put_bulk;         /**< Number of puts. */
+	uint64_t put_objs;         /**< Number of objects successfully put. */
+	uint64_t get_success_bulk; /**< Successful allocation number. */
+	uint64_t get_success_objs; /**< Objects successfully allocated. */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -1327,13 +1335,19 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	cache->put_bulk += 1;
+	cache->put_objs += n;
+#endif
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1358,6 +1372,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1464,8 +1484,10 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		cache->get_success_bulk += 1;
+		cache->get_success_objs += n;
+#endif
 
 		return 0;
 	}
@@ -1494,8 +1516,10 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	cache->get_success_bulk += 1;
+	cache->get_success_objs += n;
+#endif
 
 	return 0;
 
@@ -1517,8 +1541,17 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (likely(cache != NULL)) {
+			cache->get_success_bulk += 1;
+			cache->get_success_objs += n;
+		} else {
+#endif
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		}
+#endif
 	}
 
 	return ret;
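As a usage note, the aggregated counters, including the cache-resident
ones folded in by the rte_mempool_dump() hunk above, can be observed with
the existing dump API. A minimal sketch, assuming an initialized EAL, a
mempool created with a per-lcore cache, and a build with
RTE_LIBRTE_MEMPOOL_DEBUG defined:

#include <stdio.h>
#include <rte_mempool.h>

/* Print the aggregated statistics for a mempool; with this series the
 * per-lcore cache counters are included in the summed totals. */
static void print_pool_stats(struct rte_mempool *mp)
{
	rte_mempool_dump(stdout, mp);
}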