diff mbox series

[v2,9/9] mem: support in-memory mode

Message ID 8fef5019ebdb9d941c1ade936fcdda1ace303738.1531477505.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Headers show
Series Support running DPDK without hugetlbfs mountpoint | expand

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Burakov, Anatoly July 13, 2018, 10:27 a.m. UTC
Implement the final piece of the in-memory mode puzzle - enable running
DPDK entirely in memory, without creating any files.

To do it, use mmap with MAP_HUGETLB and size flags to enable DPDK to work
without hugetlbfs mountpoints. In order to enable this, a few things needed
to be changed.

First of all, we need to allow empty hugetlbfs mountpoints in
hugepage_info, and handle them correctly (by not trying to create any
files and lock any directories).

Next, we need to reorder the mapping sequence, because the page is not
really allocated until the page fault, and we cannot get its IOVA
address before we trigger the page fault.

Finally, decide at compile time whether we are going to be supporting
anonymous hugepages or not, because we cannot check for it at runtime.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    RFC->v1:
    - Drop memfd and instead use mmap() with MAP_HUGETLB. This will drop the
      kernel requirements down to 3.8, and does not impose any restrictions
      glibc (as far as i known).
    
      Unfortunately, there's a bit of an issue with this approach, because
      mmap() is stupid and will happily ignore unsupported arguments. This
      means that if the binary were to be compiled on a 3.8+ kernel but run
      on a pre-3.8 kernel (such as currently supported minimum of 3.2), then
      most likely the memory would be allocated using regular pages, causing
      unthinkable performance degradation. No solution to this problem is
      currently known to me.

 .../linuxapp/eal/eal_hugepage_info.c          |  91 +++++++-----
 lib/librte_eal/linuxapp/eal/eal_memalloc.c    | 130 +++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_memory.c      |   3 +-
 3 files changed, 139 insertions(+), 85 deletions(-)

Comments

Thomas Monjalon July 13, 2018, 12:15 p.m. UTC | #1
There is a compilation error:

../lib/librte_eal/linuxapp/eal/eal_memalloc.c: In function ‘alloc_seg’:
../lib/librte_eal/linuxapp/eal/eal_memalloc.c:619:3: error:
‘map_offset’ may be used uninitialized in this function


13/07/2018 12:27, Anatoly Burakov:
> Implement the final piece of the in-memory mode puzzle - enable running
> DPDK entirely in memory, without creating any files.
> 
> To do it, use mmap with MAP_HUGETLB and size flags to enable DPDK to work
> without hugetlbfs mountpoints. In order to enable this, a few things needed
> to be changed.
> 
> First of all, we need to allow empty hugetlbfs mountpoints in
> hugepage_info, and handle them correctly (by not trying to create any
> files and lock any directories).
> 
> Next, we need to reorder the mapping sequence, because the page is not
> really allocated until the page fault, and we cannot get its IOVA
> address before we trigger the page fault.
> 
> Finally, decide at compile time whether we are going to be supporting
> anonymous hugepages or not, because we cannot check for it at runtime.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
> 
> Notes:
>     RFC->v1:
>     - Drop memfd and instead use mmap() with MAP_HUGETLB. This will drop the
>       kernel requirements down to 3.8, and does not impose any restrictions
>       glibc (as far as i known).
>     
>       Unfortunately, there's a bit of an issue with this approach, because
>       mmap() is stupid and will happily ignore unsupported arguments. This
>       means that if the binary were to be compiled on a 3.8+ kernel but run
>       on a pre-3.8 kernel (such as currently supported minimum of 3.2), then
>       most likely the memory would be allocated using regular pages, causing
>       unthinkable performance degradation. No solution to this problem is
>       currently known to me.
diff mbox series

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 7f8e2fd9c..3a7d4b222 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -18,6 +18,8 @@ 
 #include <sys/queue.h>
 #include <sys/stat.h>
 
+#include <linux/mman.h> /* for hugetlb-related flags */
+
 #include <rte_memory.h>
 #include <rte_eal.h>
 #include <rte_launch.h>
@@ -313,11 +315,49 @@  compare_hpi(const void *a, const void *b)
 	return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+	uint64_t total_pages = 0;
+	unsigned int i;
+
+	/*
+	 * first, try to put all hugepages into relevant sockets, but
+	 * if first attempts fails, fall back to collecting all pages
+	 * in one socket and sorting them later
+	 */
+	total_pages = 0;
+	/* we also don't want to do this for legacy init */
+	if (!internal_config.legacy_mem)
+		for (i = 0; i < rte_socket_count(); i++) {
+			int socket = rte_socket_id_by_idx(i);
+			unsigned int num_pages =
+					get_num_hugepages_on_node(
+						dirent->d_name, socket);
+			hpi->num_pages[socket] = num_pages;
+			total_pages += num_pages;
+		}
+	/*
+	 * we failed to sort memory from the get go, so fall
+	 * back to old way
+	 */
+	if (total_pages == 0) {
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, limit number of hugepages to
+		 * 1GB per page size */
+		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+				RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+	}
+}
+
 static int
 hugepage_info_init(void)
 {	const char dirent_start_text[] = "hugepages-";
 	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-	unsigned int i, total_pages, num_sizes = 0;
+	unsigned int i, num_sizes = 0;
 	DIR *dir;
 	struct dirent *dirent;
 
@@ -355,6 +395,22 @@  hugepage_info_init(void)
 					"%" PRIu64 " reserved, but no mounted "
 					"hugetlbfs found for that size\n",
 					num_pages, hpi->hugepage_sz);
+			/* if we have kernel support for reserving hugepages
+			 * through mmap, and we're in in-memory mode, treat this
+			 * page size as valid. we cannot be in legacy mode at
+			 * this point because we've checked this earlier in the
+			 * init process.
+			 */
+#ifdef MAP_HUGE_SHIFT
+			if (internal_config.in_memory) {
+				RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
+					"hugepages of size %" PRIu64 " bytes "
+					"will be allocated anonymously\n",
+					hpi->hugepage_sz);
+				calc_num_pages(hpi, dirent);
+				num_sizes++;
+			}
+#endif
 			continue;
 		}
 
@@ -371,35 +427,7 @@  hugepage_info_init(void)
 		if (clear_hugedir(hpi->hugedir) == -1)
 			break;
 
-		/*
-		 * first, try to put all hugepages into relevant sockets, but
-		 * if first attempts fails, fall back to collecting all pages
-		 * in one socket and sorting them later
-		 */
-		total_pages = 0;
-		/* we also don't want to do this for legacy init */
-		if (!internal_config.legacy_mem)
-			for (i = 0; i < rte_socket_count(); i++) {
-				int socket = rte_socket_id_by_idx(i);
-				unsigned int num_pages =
-						get_num_hugepages_on_node(
-							dirent->d_name, socket);
-				hpi->num_pages[socket] = num_pages;
-				total_pages += num_pages;
-			}
-		/*
-		 * we failed to sort memory from the get go, so fall
-		 * back to old way
-		 */
-		if (total_pages == 0)
-			hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
-
-#ifndef RTE_ARCH_64
-		/* for 32-bit systems, limit number of hugepages to
-		 * 1GB per page size */
-		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-					    RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+		calc_num_pages(hpi, dirent);
 
 		num_sizes++;
 	}
@@ -423,8 +451,7 @@  hugepage_info_init(void)
 
 		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
 			num_pages += hpi->num_pages[j];
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
-				num_pages > 0)
+		if (num_pages > 0)
 			return 0;
 	}
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index d610923b8..10c959da4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -28,6 +28,7 @@ 
 #include <numaif.h>
 #endif
 #include <linux/falloc.h>
+#include <linux/mman.h> /* for hugetlb-related mmap flags */
 
 #include <rte_common.h>
 #include <rte_log.h>
@@ -41,6 +42,15 @@ 
 #include "eal_memalloc.h"
 #include "eal_private.h"
 
+const int anonymous_hugepages_supported =
+#ifdef MAP_HUGE_SHIFT
+		1;
+#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
+#else
+		0;
+#define RTE_MAP_HUGE_SHIFT 26
+#endif
+
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
  * ftruncate and disallow deallocation if fallocate is not supported.
@@ -461,6 +471,8 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	int cur_socket_id = 0;
 #endif
 	uint64_t map_offset;
+	rte_iova_t iova;
+	void *va;
 	char path[PATH_MAX];
 	int ret = 0;
 	int fd;
@@ -468,43 +480,57 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	int flags;
 	void *new_addr;
 
-	/* takes out a read lock on segment or segment list */
-	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-	if (fd < 0) {
-		RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
-		return -1;
-	}
-
 	alloc_sz = hi->hugepage_sz;
-	if (internal_config.single_file_segments) {
-		map_offset = seg_idx * alloc_sz;
-		ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
-				alloc_sz, true);
-		if (ret < 0)
-			goto resized;
+	if (internal_config.in_memory && anonymous_hugepages_supported) {
+		int log2, flags;
+
+		log2 = rte_log2_u32(alloc_sz);
+		/* as per mmap() manpage, all page sizes are log2 of page size
+		 * shifted by MAP_HUGE_SHIFT
+		 */
+		flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+				MAP_PRIVATE | MAP_ANONYMOUS;
+		fd = -1;
+		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
 	} else {
-		map_offset = 0;
-		if (ftruncate(fd, alloc_sz) < 0) {
-			RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-				__func__, strerror(errno));
-			goto resized;
+		/* takes out a read lock on segment or segment list */
+		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
+			return -1;
 		}
-		if (internal_config.hugepage_unlink) {
-			if (unlink(path)) {
-				RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+
+		if (internal_config.single_file_segments) {
+			map_offset = seg_idx * alloc_sz;
+			ret = resize_hugefile(fd, path, list_idx, seg_idx,
+					map_offset, alloc_sz, true);
+			if (ret < 0)
+				goto resized;
+		} else {
+			map_offset = 0;
+			if (ftruncate(fd, alloc_sz) < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
 					__func__, strerror(errno));
 				goto resized;
 			}
+			if (internal_config.hugepage_unlink) {
+				if (unlink(path)) {
+					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+						__func__, strerror(errno));
+					goto resized;
+				}
+			}
 		}
+
+		/*
+		 * map the segment, and populate page tables, the kernel fills
+		 * this segment with zeros if it's a new page.
+		 */
+		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
+				map_offset);
 	}
 
-	/*
-	 * map the segment, and populate page tables, the kernel fills this
-	 * segment with zeros if it's a new page.
-	 */
-	void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
-			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
-
 	if (va == MAP_FAILED) {
 		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
 			strerror(errno));
@@ -519,24 +545,6 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 		goto resized;
 	}
 
-	rte_iova_t iova = rte_mem_virt2iova(addr);
-	if (iova == RTE_BAD_PHYS_ADDR) {
-		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
-			__func__);
-		goto mapped;
-	}
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
-
-	if (cur_socket_id != socket_id) {
-		RTE_LOG(DEBUG, EAL,
-				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
-			__func__, socket_id, cur_socket_id);
-		goto mapped;
-	}
-#endif
-
 	/* In linux, hugetlb limitations, like cgroup, are
 	 * enforced at fault time instead of mmap(), even
 	 * with the option of MAP_POPULATE. Kernel will send
@@ -549,9 +557,6 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 			(unsigned int)(alloc_sz >> 20));
 		goto mapped;
 	}
-	/* for non-single file segments, we can close fd here */
-	if (!internal_config.single_file_segments)
-		close(fd);
 
 	/* we need to trigger a write to the page to enforce page fault and
 	 * ensure that page is accessible to us, but we can't overwrite value
@@ -560,6 +565,28 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	 */
 	*(volatile int *)addr = *(volatile int *)addr;
 
+	iova = rte_mem_virt2iova(addr);
+	if (iova == RTE_BAD_PHYS_ADDR) {
+		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+			__func__);
+		goto mapped;
+	}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+	if (cur_socket_id != socket_id) {
+		RTE_LOG(DEBUG, EAL,
+				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+			__func__, socket_id, cur_socket_id);
+		goto mapped;
+	}
+#endif
+	/* for non-single file segments that aren't in-memory, we can close fd
+	 * here */
+	if (!internal_config.single_file_segments && !internal_config.in_memory)
+		close(fd);
+
 	ms->addr = addr;
 	ms->hugepage_sz = alloc_sz;
 	ms->len = alloc_sz;
@@ -595,6 +622,7 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	} else {
 		/* only remove file if we can take out a write lock */
 		if (internal_config.hugepage_unlink == 0 &&
+				internal_config.in_memory == 0 &&
 				lock(fd, LOCK_EX) == 1)
 			unlink(path);
 		close(fd);
@@ -705,7 +733,7 @@  alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -809,7 +837,7 @@  free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index ddfa8b133..dbf19499e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1088,8 +1088,7 @@  get_socket_mem_size(int socket)
 
 	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
-			size += hpi->hugepage_sz * hpi->num_pages[socket];
+		size += hpi->hugepage_sz * hpi->num_pages[socket];
 	}
 
 	return size;