diff mbox series

mem: accelerate dpdk program startup by reuse page from page cache

Message ID 20181109075830.27265-1-jianmingfan@126.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series mem: accelerate dpdk program startup by reuse page from page cache | expand

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

建明 Nov. 9, 2018, 7:58 a.m. UTC
During process startup, dpdk invokes clear_hugedir() to unlink all hugepage files under /dev/hugepages.
 Then, in map_all_hugepages(), it invokes mmap to allocate and zero all the huge pages as
 configured in /sys/kernel/mm/hugepages/xxx/nr_hugepages.

 This makes process startup extremely slow when a large amount of hugepage memory is configured.

 In our use case, we usually configure as much as 200GB of hugepages in our router. It takes more than 50s each time the dpdk
 process starts up to clear the pages.

 To address this issue, the user can turn on the --reuse-map switch. With it, dpdk will check the validity of the existing page cache
 under /dev/hugepages. If valid, the cache will be reused rather than deleted, so that the OS doesn't need to zero the pages again.

However, a lot of users, e.g. rte_kni_alloc, rely on the OS zero-page behavior. To keep things working,
I add a memset during malloc_heap_alloc(). This makes sense for the following reasons.
 1) users often configure far more hugepage memory than the program uses. In our router, 200GB is configured, but less than 2GB is actually used.
2) dpdk users don't call heap allocation in the performance-critical path; they allocate memory during process bootup.

Note, the patch is tested based on dpdk-16.07. Feel free to inform me if
you would like me to rebase it on the latest version.

Signed-off-by jianmingfan@126.com
---
 lib/librte_eal/common/eal_common_options.c    |   5 +-
 lib/librte_eal/common/eal_hugepages.h         |   2 +-
 lib/librte_eal/common/eal_internal_cfg.h      |   1 +
 lib/librte_eal/common/eal_options.h           |   2 +
 lib/librte_eal/common/malloc_heap.c           |  17 +-
 lib/librte_eal/linuxapp/eal/eal.c             |   3 +-
 .../linuxapp/eal/eal_hugepage_info.c          | 207 +++++++++++++++++-
 7 files changed, 226 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 1a1bab36e..aee8aeded 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@  eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_REUSE_MAP,         0, NULL, OPT_REUSE_MAP_NUM        },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -850,7 +851,9 @@  eal_parse_common_option(int opt, const char *optarg,
 	case OPT_NO_HUGE_NUM:
 		conf->no_hugetlbfs = 1;
 		break;
-
+	case OPT_REUSE_MAP_NUM:
+		conf->reuse_map = 1;
+		break;
 	case OPT_NO_PCI_NUM:
 		conf->no_pci = 1;
 		break;
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index 38edac03f..ab818002e 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -62,6 +62,6 @@  struct hugepage_file {
  * Read the information from linux on what hugepages are available
  * for the EAL to use
  */
-int eal_hugepage_info_init(void);
+int eal_hugepage_info_init(int reuse);
 
 #endif /* EAL_HUGEPAGES_H */
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367eb7..68cff4fa7 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@  struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned reuse_map;
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62e2..e4e6677d7 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@  enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_REUSE_MAP          "reuse-map"
+	OPT_REUSE_MAP_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index bc75de34b..34b72ef85 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -171,7 +171,22 @@  malloc_heap_alloc(struct malloc_heap *heap,
 	}
 	rte_spinlock_unlock(&heap->lock);
 
-	return elem == NULL ? NULL : (void *)(&elem[1]);
+	if (elem == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * It's ugly here. The reason is that with reuse-map opt,
+	 * the memory may not be zeroed by hugepagefs during process boot.
+	 * However,some user of memzone alloc ,eg. rte_kni_alloc, assumes
+	 * the page is zeroed.
+	 */
+
+	/*
+	 * No need to memset in rte_free() now, may delete it later.
+	 */
+	memset(&elem[1], 0, size);
+	return (void *)(&elem[1]);
 }
 
 /*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 3fb2188ff..a136abc8c 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -344,6 +344,7 @@  eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_REUSE_MAP"         Reuse exist page cache mapping for fast startup\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
@@ -766,7 +767,7 @@  rte_eal_init(int argc, char **argv)
 	if (internal_config.no_hugetlbfs == 0 &&
 			internal_config.process_type != RTE_PROC_SECONDARY &&
 			internal_config.xen_dom0_support == 0 &&
-			eal_hugepage_info_init() < 0)
+			eal_hugepage_info_init(internal_config.reuse_map) < 0)
 		rte_panic("Cannot get hugepage information\n");
 
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 18858e2dd..ede712ef1 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -58,19 +58,24 @@ 
 #include "eal_internal_cfg.h"
 #include "eal_hugepages.h"
 #include "eal_filesystem.h"
+#include <sys/mman.h>
+#include <sys/stat.h>
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
 
 /* this function is only called from eal_hugepage_info_init which itself
  * is only called from a primary process */
 static uint32_t
-get_num_hugepages(const char *subdir)
+get_num_hugepages(const char *subdir, int reuse)
 {
 	char path[PATH_MAX];
 	long unsigned resv_pages, num_pages = 0;
 	const char *nr_hp_file = "free_hugepages";
 	const char *nr_rsvd_file = "resv_hugepages";
 
+        if (reuse == 1) {
+            nr_hp_file = "nr_hugepages";
+        }
 	/* first, check how many reserved pages kernel reports */
 	snprintf(path, sizeof(path), "%s/%s/%s",
 			sys_dir_path, subdir, nr_rsvd_file);
@@ -124,6 +129,190 @@  get_default_hp_size(void)
 	return size;
 }
 
+
+/*
+ * Validate the hugepage backing files found under mount point mp.
+ *
+ * Every entry other than "." and ".." must be named "rtemap*", be exactly
+ * pagesize bytes long, and map successfully with mincore() reporting its
+ * first page resident; the number of such files must equal nr_pages.
+ *
+ * Returns 1 when every check passes and a negative value otherwise:
+ * -EINVAL for bad arguments, -ENOMEM when the mincore vector cannot be
+ * allocated, and -1 .. -9 identifying the individual check that failed.
+ *
+ * NOTE(review): MAP_POPULATE presumably faults the page in before mincore()
+ * runs, so the residency test may never fail - confirm on hugetlbfs.
+ */
+static int check_mp(const char *mp, int nr_pages, uint64_t pagesize)
+{
+	const char dirent_start_text[] = "rtemap";
+	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
+	struct dirent *dirent;
+	struct stat file_stat;
+	unsigned char *mincore_vec;
+	void *file_mmap;
+	uint64_t base_pagesize;
+	DIR *dir;
+	int fd;
+	int cnt = 0;
+	int rc;
+
+	if (mp == NULL || nr_pages <= 0 || pagesize == 0)
+		return -EINVAL;
+
+	/* mincore() fills in one byte per base (non-huge) page of the range */
+	base_pagesize = (uint64_t)getpagesize();
+	mincore_vec = calloc((pagesize + base_pagesize - 1) / base_pagesize,
+			sizeof(*mincore_vec));
+	if (mincore_vec == NULL) {
+		RTE_LOG(ERR, EAL, "calloc failed\n");
+		return -ENOMEM;
+	}
+
+	dir = opendir(mp);
+	if (dir == NULL) {
+		rc = -1;
+		goto out_free;
+	}
+
+	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
+		/* exact match only: a prefix test would also skip ".foo" */
+		if (strcmp(dirent->d_name, ".") == 0 ||
+				strcmp(dirent->d_name, "..") == 0)
+			continue;
+
+		/* a foreign file name in the mount point fails the check */
+		if (strncmp(dirent->d_name, dirent_start_text,
+					dirent_start_len) != 0) {
+			rc = -2;
+			goto out_closedir;
+		}
+
+		/* Each file must back exactly one huge page; one-segment
+		 * layouts are not supported currently */
+		fd = openat(dirfd(dir), dirent->d_name, O_RDONLY);
+		if (fd == -1) {
+			RTE_LOG(ERR, EAL, "open file failed %s\n", dirent->d_name);
+			rc = -3;
+			goto out_closedir;
+		}
+
+		if (fstat(fd, &file_stat) < 0) {
+			RTE_LOG(ERR, EAL, "Could not stat file %s\n", dirent->d_name);
+			rc = -4;
+			goto out_close;
+		}
+
+		if ((uint64_t)file_stat.st_size != pagesize) {
+			RTE_LOG(ERR, EAL, "%s file size %" PRIu64
+					" pagesize %" PRIu64 "\n",
+					dirent->d_name,
+					(uint64_t)file_stat.st_size, pagesize);
+			rc = -5;
+			goto out_close;
+		}
+
+		file_mmap = mmap(NULL, pagesize, PROT_READ,
+				MAP_SHARED | MAP_POPULATE, fd, 0);
+		if (file_mmap == MAP_FAILED) {
+			RTE_LOG(ERR, EAL,  "Could not mmap file %s\n", dirent->d_name);
+			rc = -6;
+			goto out_close;
+		}
+
+		if (mincore(file_mmap, pagesize, mincore_vec) != 0) {
+			RTE_LOG(ERR, EAL, "Could not call mincore for file %s\n", dirent->d_name);
+			rc = -7;
+			goto out_unmap;
+		}
+
+		if (!(mincore_vec[0] & 1)) {
+			rc = -8;
+			goto out_unmap;
+		}
+
+		/* unmap so that the /proc/self/numa_maps parse will not fail */
+		munmap(file_mmap, pagesize);
+		close(fd);
+		cnt++;
+	}
+
+	rc = (cnt == nr_pages) ? 1 : -9;
+	goto out_closedir;
+
+out_unmap:
+	munmap(file_mmap, pagesize);
+out_close:
+	close(fd);
+out_closedir:
+	/* closedir() also releases the descriptor behind dirfd(dir) */
+	closedir(dir);
+out_free:
+	free(mincore_vec);
+	return rc;
+}
+
+typedef int (*visit_cb) (const char *mp, int nr_pages, uint64_t pagesize);
+static const char *
+get_hugepage_dir(uint64_t hugepage_sz);
+/*
+ * Call cb_fn for every sysfs hugepage size that has a mount point and at
+ * least one page. Returns 1 if every callback returned > 0; returns 0 if
+ * one did not, sysfs is unreadable, or MAX_HUGEPAGE_SIZES is exceeded.
+ */
+static inline int visit_each_sysdir_entry(visit_cb cb_fn)
+{
+	const char dirent_start_text[] = "hugepages-";
+	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
+	struct dirent *dirent;
+	const char *hugedir;
+	uint64_t hugepage_sz;
+	uint32_t nr;
+	int num_sizes = 0;
+	int rc = 0;
+	DIR *dir;
+
+	dir = opendir(sys_dir_path);
+	if (dir == NULL)
+		return 0;
+
+	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
+		if (strncmp(dirent->d_name, dirent_start_text,
+					dirent_start_len) != 0)
+			continue;
+
+		/* shall not happen */
+		if (num_sizes++ >= MAX_HUGEPAGE_SIZES)
+			goto out;
+		hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);
+		hugedir = get_hugepage_dir(hugepage_sz);
+		nr = get_num_hugepages(dirent->d_name, 1);
+		/* check_mp reports failure as a negative value, never as 0 */
+		if (hugedir != NULL && nr != 0 &&
+				cb_fn(hugedir, nr, hugepage_sz) <= 0)
+			goto out;
+	}
+	rc = 1;
+out:
+	closedir(dir);
+	return rc;
+}
+
+/*
+ * Decide whether the hugepage files already in place can be trusted.
+ * Without the reuse option (reuse_opt == 0) the answer is always 0;
+ * otherwise it is whatever visit_each_sysdir_entry(check_mp) reports.
+ */
+static int eal_trust_exist_mapping(int reuse_opt)
+{
+	int trusted = 0;
+
+	if (reuse_opt != 0)
+		trusted = visit_each_sysdir_entry(check_mp);
+	return trusted;
+}
+
 static const char *
 get_hugepage_dir(uint64_t hugepage_sz)
 {
@@ -274,7 +463,7 @@  compare_hpi(const void *a, const void *b)
  * initialization procedure.
  */
 int
-eal_hugepage_info_init(void)
+eal_hugepage_info_init(int user_opt)
 {
 	const char dirent_start_text[] = "hugepages-";
 	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
@@ -306,7 +495,7 @@  eal_hugepage_info_init(void)
 		if (hpi->hugedir == NULL) {
 			uint32_t num_pages;
 
-			num_pages = get_num_hugepages(dirent->d_name);
+			num_pages = get_num_hugepages(dirent->d_name, 0);
 			if (num_pages > 0)
 				RTE_LOG(NOTICE, EAL,
 					"%" PRIu32 " hugepages of size "
@@ -325,13 +514,17 @@  eal_hugepage_info_init(void)
 				"Failed to lock hugepage directory!\n");
 			break;
 		}
-		/* clear out the hugepages dir from unused pages */
-		if (clear_hugedir(hpi->hugedir) == -1)
-			break;
+
+		int reuse = eal_trust_exist_mapping(user_opt);
+		if (reuse == 0) {
+			/* clear out the hugepages dir from unused pages */
+			if (clear_hugedir(hpi->hugedir) == -1)
+				break;
+		}
 
 		/* for now, put all pages into socket 0,
 		 * later they will be sorted */
-		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name, reuse);
 
 #ifndef RTE_ARCH_64
 		/* for 32-bit systems, limit number of hugepages to