[v9,04/10] eal: implement functions for thread affinity management

Message ID 1622850274-6946-5-git-send-email-navasile@linux.microsoft.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Series: eal: Add EAL API for threading

Checks

Context        Check     Description
ci/checkpatch  warning   coding style issues

Commit Message

Narcisa Ana Maria Vasile June 4, 2021, 11:44 p.m. UTC
  From: Narcisa Vasile <navasile@microsoft.com>

Implement functions for getting/setting thread affinity.
Threads can be pinned to specific cores by setting their
affinity attribute.

Signed-off-by: Narcisa Vasile <navasile@microsoft.com>
Signed-off-by: Dmitry Malloy <dmitrym@microsoft.com>
---
 lib/eal/common/rte_thread.c   |  14 +++
 lib/eal/include/rte_thread.h  |  36 ++++++++
 lib/eal/windows/eal_lcore.c   | 169 +++++++++++++++++++++++++---------
 lib/eal/windows/eal_windows.h |  10 ++
 lib/eal/windows/rte_thread.c  | 127 ++++++++++++++++++++++++-
 5 files changed, 310 insertions(+), 46 deletions(-)
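
As a usage note (not part of the patch): a minimal sketch built only on the
declarations this series adds to rte_thread.h, pinning the calling thread to
CPU 2 and then reading the affinity back. The function name is illustrative.

#include <rte_thread.h>

static int
pin_self_to_cpu2(void)
{
	rte_cpuset_t cpuset;
	int ret;

	CPU_ZERO(&cpuset);
	CPU_SET(2, &cpuset);

	/* Returns 0 on success, a positive errno-style number on failure. */
	ret = rte_thread_set_affinity_by_id(rte_thread_self(), &cpuset);
	if (ret != 0)
		return ret;

	/* Read back the affinity that was just applied. */
	CPU_ZERO(&cpuset);
	return rte_thread_get_affinity_by_id(rte_thread_self(), &cpuset);
}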
  

Comments

Dmitry Kozlyuk June 8, 2021, 11:03 p.m. UTC | #1
2021-06-04 16:44 (UTC-0700), Narcisa Ana Maria Vasile:
[...]
> diff --git a/lib/eal/windows/rte_thread.c b/lib/eal/windows/rte_thread.c
> index 6ea1dc2a05..9e74a538c2 100644
> --- a/lib/eal/windows/rte_thread.c
> +++ b/lib/eal/windows/rte_thread.c
> @@ -7,7 +7,8 @@
>  #include <rte_errno.h>
>  #include <rte_debug.h>
>  #include <rte_thread.h>
> -#include <rte_windows.h>
> +
> +#include "eal_windows.h"
>  
>  struct eal_tls_key {
>  	DWORD thread_index;
> @@ -77,6 +78,130 @@ rte_thread_equal(rte_thread_t t1, rte_thread_t t2)
>  	return t1.opaque_id == t2.opaque_id;
>  }
>  
> +static int
> +rte_convert_cpuset_to_affinity(const rte_cpuset_t *cpuset,
> +			       PGROUP_AFFINITY affinity)
> +{
> +	int ret = 0;
> +	PGROUP_AFFINITY cpu_affinity = NULL;
> +
> +	memset(affinity, 0, sizeof(GROUP_AFFINITY));
> +	affinity->Group = (USHORT)-1;
> +
> +	/* Check that all cpus of the set belong to the same processor group and
> +	 * accumulate thread affinity to be applied.
> +	 */
> +	for (unsigned int cpu_idx = 0; cpu_idx < CPU_SETSIZE; cpu_idx++) {
> +		if (!CPU_ISSET(cpu_idx, cpuset))
> +			continue;
> +
> +		cpu_affinity = eal_get_cpu_affinity(cpu_idx);
> +
> +		if (affinity->Group == (USHORT)-1) {
> +			affinity->Group = cpu_affinity->Group;
> +		} else if (affinity->Group != cpu_affinity->Group) {
> +			ret = EINVAL;
> +			goto cleanup;
> +		}
> +
> +		affinity->Mask |= cpu_affinity->Mask;
> +	}
> +
> +	if (affinity->Mask == 0) {
> +		ret = EINVAL;
> +		goto cleanup;
> +	}
> +
> +cleanup:
> +	return ret;
> +}

For v5 I asked a question that possibly got lost among other comments.
Repeating the question for convenience:

	Just to be clear: is it a kernel limitation that a thread can only
	run on cores of one processor group, or do we impose it so that API
	is atomic (transactional), i.e. because one of multiple
	SetThreadGroupAffinity() calls may fail and leave thread partially
	affinitized?
  
Narcisa Ana Maria Vasile June 18, 2021, 9:44 p.m. UTC | #2
On Wed, Jun 09, 2021 at 02:03:57AM +0300, Dmitry Kozlyuk wrote:
> 2021-06-04 16:44 (UTC-0700), Narcisa Ana Maria Vasile:
> [...]
> > [...]
> 
> For v5 I asked a question that possibly got lost among other comments.
> Repeating the question for convenience:
> 
> 	Just to be clear: is it a kernel limitation that a thread can only
> 	run on cores of one processor group, or do we impose it so that API
> 	is atomic (transactional), i.e. because one of multiple
> 	SetThreadGroupAffinity() calls may fail and leave thread partially
> 	affinitized?

The second reason (to ensure full affinitization). I am not aware of a kernel
limitation, but I'll double-check with Dmitry, as we co-engineered this patch.
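
To make the single-group constraint concrete, an illustrative sketch (not
part of the patch), assuming 64 logical processors per group so that CPU 0
falls in processor group 0 and CPU 64 in group 1:

	rte_cpuset_t cpuset;
	int ret;

	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);	/* processor group 0 */
	CPU_SET(64, &cpuset);	/* processor group 1 */

	/* Fails with EINVAL: rte_convert_cpuset_to_affinity() rejects the
	 * mismatched Group values before SetThreadGroupAffinity() is ever
	 * invoked, so the thread is never left partially affinitized.
	 */
	ret = rte_thread_set_affinity_by_id(rte_thread_self(), &cpuset);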
  

Patch

diff --git a/lib/eal/common/rte_thread.c b/lib/eal/common/rte_thread.c
index 4b1e8f995e..ceb27feaa7 100644
--- a/lib/eal/common/rte_thread.c
+++ b/lib/eal/common/rte_thread.c
@@ -34,6 +34,20 @@  rte_thread_equal(rte_thread_t t1, rte_thread_t t2)
 	return pthread_equal(t1.opaque_id, t2.opaque_id);
 }
 
+int
+rte_thread_set_affinity_by_id(rte_thread_t thread_id,
+		const rte_cpuset_t *cpuset)
+{
+	return pthread_setaffinity_np(thread_id.opaque_id, sizeof(*cpuset), cpuset);
+}
+
+int
+rte_thread_get_affinity_by_id(rte_thread_t thread_id,
+		rte_cpuset_t *cpuset)
+{
+	return pthread_getaffinity_np(thread_id.opaque_id, sizeof(*cpuset), cpuset);
+}
+
 int
 rte_thread_attr_init(rte_thread_attr_t *attr)
 {
diff --git a/lib/eal/include/rte_thread.h b/lib/eal/include/rte_thread.h
index f3eeb28753..1f02962146 100644
--- a/lib/eal/include/rte_thread.h
+++ b/lib/eal/include/rte_thread.h
@@ -86,6 +86,42 @@  rte_thread_t rte_thread_self(void);
 __rte_experimental
 int rte_thread_equal(rte_thread_t t1, rte_thread_t t2);
 
+/**
+ * Set the affinity of thread 'thread_id' to the cpu set
+ * specified by 'cpuset'.
+ *
+ * @param thread_id
+ *    Id of the thread for which to set the affinity.
+ *
+ * @param cpuset
+ *   Pointer to CPU affinity to set.
+ *
+ * @return
+ *   On success, return 0.
+ *   On failure, return a positive errno-style error number.
+ */
+__rte_experimental
+int rte_thread_set_affinity_by_id(rte_thread_t thread_id,
+		const rte_cpuset_t *cpuset);
+
+/**
+ * Get the affinity of thread 'thread_id' and store it
+ * in 'cpuset'.
+ *
+ * @param thread_id
+ *    Id of the thread for which to get the affinity.
+ *
+ * @param cpuset
+ *   Pointer for storing the affinity value.
+ *
+ * @return
+ *   On success, return 0.
+ *   On failure, return a positive errno-style error number.
+ */
+__rte_experimental
+int rte_thread_get_affinity_by_id(rte_thread_t thread_id,
+		rte_cpuset_t *cpuset);
+
 /**
  * Initialize the attributes of a thread.
  * These attributes can be passed to the rte_thread_create() function
diff --git a/lib/eal/windows/eal_lcore.c b/lib/eal/windows/eal_lcore.c
index 476c2d2bdf..519a62b96d 100644
--- a/lib/eal/windows/eal_lcore.c
+++ b/lib/eal/windows/eal_lcore.c
@@ -2,7 +2,6 @@ 
  * Copyright(c) 2019 Intel Corporation
  */
 
-#include <pthread.h>
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -27,13 +26,15 @@  struct socket_map {
 };
 
 struct cpu_map {
-	unsigned int socket_count;
 	unsigned int lcore_count;
+	unsigned int socket_count;
+	unsigned int cpu_count;
 	struct lcore_map lcores[RTE_MAX_LCORE];
 	struct socket_map sockets[RTE_MAX_NUMA_NODES];
+	GROUP_AFFINITY cpus[CPU_SETSIZE];
 };
 
-static struct cpu_map cpu_map = { 0 };
+static struct cpu_map cpu_map;
 
 /* eal_create_cpu_map() is called before logging is initialized */
 static void
@@ -47,13 +48,111 @@  log_early(const char *format, ...)
 	va_end(va);
 }
 
+static int
+eal_query_group_affinity(void)
+{
+	SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *infos = NULL;
+	DWORD infos_size = 0;
+	int ret = 0;
+
+	if (!GetLogicalProcessorInformationEx(RelationGroup, NULL,
+					      &infos_size)) {
+		DWORD error = GetLastError();
+		if (error != ERROR_INSUFFICIENT_BUFFER) {
+			log_early("Cannot get group information size, "
+				  "error %lu\n", error);
+			rte_errno = EINVAL;
+			ret = -1;
+			goto cleanup;
+		}
+	}
+
+	infos = malloc(infos_size);
+	if (infos == NULL) {
+		log_early("Cannot allocate memory for NUMA node information\n");
+		rte_errno = ENOMEM;
+		ret = -1;
+		goto cleanup;
+	}
+
+	if (!GetLogicalProcessorInformationEx(RelationGroup, infos,
+					      &infos_size)) {
+		log_early("Cannot get group information, error %lu\n",
+			  GetLastError());
+		rte_errno = EINVAL;
+		ret = -1;
+		goto cleanup;
+	}
+
+	cpu_map.cpu_count = 0;
+	USHORT group_count = infos->Group.ActiveGroupCount;
+	for (USHORT group_number = 0; group_number < group_count; group_number++) {
+		KAFFINITY affinity = infos->Group.GroupInfo[group_number].ActiveProcessorMask;
+
+		for (unsigned int i = 0; i < EAL_PROCESSOR_GROUP_SIZE; i++) {
+			if ((affinity & ((KAFFINITY)1 << i)) == 0)
+				continue;
+			cpu_map.cpus[cpu_map.cpu_count].Group = group_number;
+			cpu_map.cpus[cpu_map.cpu_count].Mask = (KAFFINITY)1 << i;
+			cpu_map.cpu_count++;
+		}
+	}
+
+cleanup:
+	free(infos);
+	return ret;
+}
+
+static bool
+eal_create_lcore_map(const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *info)
+{
+	const unsigned int node_id = info->NumaNode.NodeNumber;
+	const GROUP_AFFINITY *cores = &info->NumaNode.GroupMask;
+	struct lcore_map *lcore;
+	unsigned int socket_id;
+
+	/* NUMA node may be reported multiple times if it includes
+	 * cores from different processor groups, e. g. 80 cores
+	 * of a physical processor comprise one NUMA node, but two
+	 * processor groups, because group size is limited by 32/64.
+	 */
+	for (socket_id = 0; socket_id < cpu_map.socket_count; socket_id++) {
+		if (cpu_map.sockets[socket_id].node_id == node_id)
+			break;
+	}
+
+	if (socket_id == cpu_map.socket_count) {
+		if (socket_id == RTE_DIM(cpu_map.sockets))
+			return true;
+
+		cpu_map.sockets[socket_id].node_id = node_id;
+		cpu_map.socket_count++;
+	}
+
+	for (unsigned int i = 0; i < EAL_PROCESSOR_GROUP_SIZE; i++) {
+		if ((cores->Mask & ((KAFFINITY)1 << i)) == 0)
+			continue;
+
+		if (cpu_map.lcore_count == RTE_DIM(cpu_map.lcores))
+			return true;
+
+		lcore = &cpu_map.lcores[cpu_map.lcore_count];
+		lcore->socket_id = socket_id;
+		lcore->core_id = cores->Group * EAL_PROCESSOR_GROUP_SIZE + i;
+		cpu_map.lcore_count++;
+	}
+	return false;
+}
+
 int
 eal_create_cpu_map(void)
 {
 	SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *infos, *info;
 	DWORD infos_size;
 	bool full = false;
+	int ret = 0;
 
+	infos = NULL;
 	infos_size = 0;
 	if (!GetLogicalProcessorInformationEx(
 			RelationNumaNode, NULL, &infos_size)) {
@@ -78,57 +177,29 @@  eal_create_cpu_map(void)
 		log_early("Cannot get NUMA node information, error %lu\n",
 			GetLastError());
 		rte_errno = EINVAL;
-		return -1;
+		ret = -1;
+		goto exit;
 	}
 
 	info = infos;
 	while ((uint8_t *)info - (uint8_t *)infos < infos_size) {
-		unsigned int node_id = info->NumaNode.NodeNumber;
-		GROUP_AFFINITY *cores = &info->NumaNode.GroupMask;
-		struct lcore_map *lcore;
-		unsigned int i, socket_id;
-
-		/* NUMA node may be reported multiple times if it includes
-		 * cores from different processor groups, e. g. 80 cores
-		 * of a physical processor comprise one NUMA node, but two
-		 * processor groups, because group size is limited by 32/64.
-		 */
-		for (socket_id = 0; socket_id < cpu_map.socket_count;
-		    socket_id++) {
-			if (cpu_map.sockets[socket_id].node_id == node_id)
-				break;
-		}
-
-		if (socket_id == cpu_map.socket_count) {
-			if (socket_id == RTE_DIM(cpu_map.sockets)) {
-				full = true;
-				goto exit;
-			}
-
-			cpu_map.sockets[socket_id].node_id = node_id;
-			cpu_map.socket_count++;
-		}
-
-		for (i = 0; i < EAL_PROCESSOR_GROUP_SIZE; i++) {
-			if ((cores->Mask & ((KAFFINITY)1 << i)) == 0)
-				continue;
-
-			if (cpu_map.lcore_count == RTE_DIM(cpu_map.lcores)) {
-				full = true;
-				goto exit;
-			}
-
-			lcore = &cpu_map.lcores[cpu_map.lcore_count];
-			lcore->socket_id = socket_id;
-			lcore->core_id =
-				cores->Group * EAL_PROCESSOR_GROUP_SIZE + i;
-			cpu_map.lcore_count++;
+		if (eal_create_lcore_map(info)) {
+			full = true;
+			break;
 		}
 
 		info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)(
 			(uint8_t *)info + info->Size);
 	}
 
+	if (eal_query_group_affinity()) {
+		/*
+		 * No need to set rte_errno here.
+		 * It is set by eal_query_group_affinity().
+		 */
+		ret = -1;
+		goto exit;
+	}
 exit:
 	if (full) {
 		/* Not a fatal error, but important for troubleshooting. */
@@ -138,7 +209,7 @@  eal_create_cpu_map(void)
 
 	free(infos);
 
-	return 0;
+	return ret;
 }
 
 int
@@ -164,3 +235,11 @@  eal_socket_numa_node(unsigned int socket_id)
 {
 	return cpu_map.sockets[socket_id].node_id;
 }
+
+PGROUP_AFFINITY
+eal_get_cpu_affinity(size_t cpu_index)
+{
+	RTE_VERIFY(cpu_index < CPU_SETSIZE);
+
+	return &cpu_map.cpus[cpu_index];
+}
diff --git a/lib/eal/windows/eal_windows.h b/lib/eal/windows/eal_windows.h
index 478accc1b9..dc5dc8240a 100644
--- a/lib/eal/windows/eal_windows.h
+++ b/lib/eal/windows/eal_windows.h
@@ -55,6 +55,16 @@  int eal_thread_create(pthread_t *thread);
  */
 unsigned int eal_socket_numa_node(unsigned int socket_id);
 
+/**
+ * Get pointer to the group affinity for the cpu.
+ *
+ * @param cpu_index
+ *  Index of the cpu, as it comes from rte_cpuset_t.
+ * @return
+ *  Pointer to the group affinity for the cpu.
+ */
+PGROUP_AFFINITY eal_get_cpu_affinity(size_t cpu_index);
+
 /**
  * Schedule code for execution in the interrupt thread.
  *
diff --git a/lib/eal/windows/rte_thread.c b/lib/eal/windows/rte_thread.c
index 6ea1dc2a05..9e74a538c2 100644
--- a/lib/eal/windows/rte_thread.c
+++ b/lib/eal/windows/rte_thread.c
@@ -7,7 +7,8 @@ 
 #include <rte_errno.h>
 #include <rte_debug.h>
 #include <rte_thread.h>
-#include <rte_windows.h>
+
+#include "eal_windows.h"
 
 struct eal_tls_key {
 	DWORD thread_index;
@@ -77,6 +78,130 @@  rte_thread_equal(rte_thread_t t1, rte_thread_t t2)
 	return t1.opaque_id == t2.opaque_id;
 }
 
+static int
+rte_convert_cpuset_to_affinity(const rte_cpuset_t *cpuset,
+			       PGROUP_AFFINITY affinity)
+{
+	int ret = 0;
+	PGROUP_AFFINITY cpu_affinity = NULL;
+
+	memset(affinity, 0, sizeof(GROUP_AFFINITY));
+	affinity->Group = (USHORT)-1;
+
+	/* Check that all cpus of the set belong to the same processor group and
+	 * accumulate thread affinity to be applied.
+	 */
+	for (unsigned int cpu_idx = 0; cpu_idx < CPU_SETSIZE; cpu_idx++) {
+		if (!CPU_ISSET(cpu_idx, cpuset))
+			continue;
+
+		cpu_affinity = eal_get_cpu_affinity(cpu_idx);
+
+		if (affinity->Group == (USHORT)-1) {
+			affinity->Group = cpu_affinity->Group;
+		} else if (affinity->Group != cpu_affinity->Group) {
+			ret = EINVAL;
+			goto cleanup;
+		}
+
+		affinity->Mask |= cpu_affinity->Mask;
+	}
+
+	if (affinity->Mask == 0) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+cleanup:
+	return ret;
+}
+
+int
+rte_thread_set_affinity_by_id(rte_thread_t thread_id,
+		const rte_cpuset_t *cpuset)
+{
+	int ret = 0;
+	GROUP_AFFINITY thread_affinity;
+	HANDLE thread_handle = NULL;
+
+	if (cpuset == NULL) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+	ret = rte_convert_cpuset_to_affinity(cpuset, &thread_affinity);
+	if (ret != 0) {
+		RTE_LOG(DEBUG, EAL, "Unable to convert cpuset to thread affinity\n");
+		goto cleanup;
+	}
+
+	thread_handle = OpenThread(THREAD_ALL_ACCESS, FALSE, thread_id.opaque_id);
+	if (thread_handle == NULL) {
+		ret = thread_log_last_error("OpenThread()");
+		goto cleanup;
+	}
+
+	if (!SetThreadGroupAffinity(thread_handle, &thread_affinity, NULL)) {
+		ret = thread_log_last_error("SetThreadGroupAffinity()");
+		goto cleanup;
+	}
+
+cleanup:
+	if (thread_handle != NULL) {
+		CloseHandle(thread_handle);
+		thread_handle = NULL;
+	}
+
+	return ret;
+}
+
+int
+rte_thread_get_affinity_by_id(rte_thread_t thread_id,
+		rte_cpuset_t *cpuset)
+{
+	HANDLE thread_handle = NULL;
+	PGROUP_AFFINITY cpu_affinity;
+	GROUP_AFFINITY thread_affinity;
+	int ret = 0;
+
+	if (cpuset == NULL) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+	thread_handle = OpenThread(THREAD_ALL_ACCESS, FALSE, thread_id.opaque_id);
+	if (thread_handle == NULL) {
+		ret = thread_log_last_error("OpenThread()");
+		goto cleanup;
+	}
+
+	/* obtain previous thread affinity */
+	if (!GetThreadGroupAffinity(thread_handle, &thread_affinity)) {
+		ret = thread_log_last_error("GetThreadGroupAffinity()");
+		goto cleanup;
+	}
+
+	CPU_ZERO(cpuset);
+
+	/* Convert affinity to DPDK cpu set */
+	for (unsigned int cpu_idx = 0; cpu_idx < CPU_SETSIZE; cpu_idx++) {
+
+		cpu_affinity = eal_get_cpu_affinity(cpu_idx);
+
+		if ((cpu_affinity->Group == thread_affinity.Group) &&
+		   ((cpu_affinity->Mask & thread_affinity.Mask) != 0)) {
+			CPU_SET(cpu_idx, cpuset);
+		}
+	}
+
+cleanup:
+	if (thread_handle != NULL) {
+		CloseHandle(thread_handle);
+		thread_handle = NULL;
+	}
+	return ret;
+}
+
 int
 rte_thread_attr_init(rte_thread_attr_t *attr)
 {
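
A closing illustration (not part of the patch) of the cpu-index mapping:
assuming EAL_PROCESSOR_GROUP_SIZE is 64 and every processor is active, the
flat index used by rte_cpuset_t decomposes arithmetically into a
(Group, Mask) pair.

	GROUP_AFFINITY ga;
	unsigned int cpu_idx = 70;	/* hypothetical CPU index */

	ga.Group = cpu_idx / EAL_PROCESSOR_GROUP_SIZE;	/* group 1 */
	ga.Mask = (KAFFINITY)1 << (cpu_idx % EAL_PROCESSOR_GROUP_SIZE);	/* bit 6 */

When ActiveProcessorMask has gaps, eal_query_group_affinity() counts only the
active processors, so this arithmetic no longer holds; that is why the patch
materializes the table in cpu_map.cpus[] and resolves indices through
eal_get_cpu_affinity() instead.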