From patchwork Sun Apr 8 20:17:34 2018
X-Patchwork-Submitter: "Burakov, Anatoly" <anatoly.burakov@intel.com>
X-Patchwork-Id: 37558
From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org
Cc: keith.wiles@intel.com, jianfeng.tan@intel.com, andras.kovacs@ericsson.com,
 laszlo.vadkeri@ericsson.com, benjamin.walker@intel.com,
 bruce.richardson@intel.com, thomas@monjalon.net, konstantin.ananyev@intel.com,
 kuralamudhan.ramakrishnan@intel.com, louise.m.daly@intel.com,
 nelio.laranjeiro@6wind.com, yskoh@mellanox.com, pepperjo@japf.ch,
 jerin.jacob@caviumnetworks.com, hemant.agrawal@nxp.com, olivier.matz@6wind.com,
 shreyansh.jain@nxp.com, gowrishankar.m@linux.vnet.ibm.com
Date: Sun, 8 Apr 2018 21:17:34 +0100
Message-Id: <95392c6d2958cbc185bc07fd74d8f334143565dd.1523218215.git.anatoly.burakov@intel.com>
Subject: [dpdk-dev] [PATCH v4 01/70] eal: move get_virtual_area out of linuxapp eal_memory.c

Move get_virtual_area out of linuxapp EAL memory and make it common to
EAL, so that other code can reserve virtual areas as well.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v3: replace uint64_t with size_t for size variables

 lib/librte_eal/common/eal_common_memory.c | 101 ++++++++++++++++++++++
 lib/librte_eal/common/eal_private.h       |  33 +++++++
 lib/librte_eal/linuxapp/eal/eal_memory.c  | 137 ++++++------------------------
 3 files changed, 161 insertions(+), 110 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 852f3bb..5b8ced4 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,10 +2,12 @@
  * Copyright(c) 2010-2014 Intel Corporation
  */
 
+#include <errno.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <string.h>
 #include <unistd.h>
 #include <inttypes.h>
 #include <sys/mman.h>
@@ -14,12 +16,111 @@
 #include <rte_memory.h>
 #include <rte_eal.h>
 #include <rte_eal_memconfig.h>
+#include <rte_errno.h>
 #include <rte_log.h>
 
 #include "eal_private.h"
 #include "eal_internal_cfg.h"
 
 /*
+ * Try to reserve *size bytes of anonymous virtual memory. On success,
+ * return the pointer to the reserved area and keep *size unmodified.
+ * Otherwise, if EAL_VIRTUAL_AREA_ALLOW_SHRINK is set, retry with a
+ * smaller area: decrease *size by page_sz until it reaches 0, and
+ * return NULL in that case. Note: this function returns an address
+ * which is a multiple of the requested page size.
+ */
+
+static uint64_t baseaddr_offset;
+static uint64_t system_page_sz;
+
+void *
+eal_get_virtual_area(void *requested_addr, size_t *size,
+		size_t page_sz, int flags, int mmap_flags)
+{
+	bool addr_is_hint, allow_shrink, unmap, no_align;
+	uint64_t map_sz;
+	void *mapped_addr, *aligned_addr;
+
+	if (system_page_sz == 0)
+		system_page_sz = sysconf(_SC_PAGESIZE);
+
+	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+
+	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
+	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
+	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
+
+	if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
+		requested_addr = (void *) (internal_config.base_virtaddr +
+				(size_t)baseaddr_offset);
+		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
+		addr_is_hint = true;
+	}
+
+	/* if requested address is not aligned by page size, or if requested
+	 * address is NULL, add page size to requested length as we may get an
+	 * address that's aligned by system page size, which can be smaller
+	 * than our requested page size. additionally, we shouldn't try to
+	 * align if system page size is the same as requested page size.
+	 */
+	no_align = (requested_addr != NULL &&
+		((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
+		page_sz == system_page_sz;
+
+	do {
+		map_sz = no_align ? *size : *size + page_sz;
+
+		mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
+				mmap_flags, -1, 0);
+		if (mapped_addr == MAP_FAILED && allow_shrink)
+			*size -= page_sz;
+	} while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+	/* align resulting address - if map failed, we will ignore the value
+	 * anyway, so no need to add additional checks.
+	 */
+	aligned_addr = no_align ? mapped_addr :
+			RTE_PTR_ALIGN(mapped_addr, page_sz);
+
+	if (*size == 0) {
+		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
+			strerror(errno));
+		rte_errno = errno;
+		return NULL;
+	} else if (mapped_addr == MAP_FAILED) {
+		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+			strerror(errno));
+		/* pass errno up the call chain */
+		rte_errno = errno;
+		return NULL;
+	} else if (requested_addr != NULL && !addr_is_hint &&
+			aligned_addr != requested_addr) {
+		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
+			requested_addr, aligned_addr);
+		munmap(mapped_addr, map_sz);
+		rte_errno = EADDRNOTAVAIL;
+		return NULL;
+	} else if (requested_addr != NULL && addr_is_hint &&
+			aligned_addr != requested_addr) {
+		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
+			requested_addr, aligned_addr);
+		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
+	}
+
+	if (unmap)
+		munmap(mapped_addr, map_sz);
+
+	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+		aligned_addr, *size);
+
+	baseaddr_offset += *size;
+
+	return aligned_addr;
+}
+
+/*
  * Return a pointer to a read-only table of struct rte_physmem_desc
  * elements, containing the layout of all addressable physical
  * memory. The last element of the table contains a NULL address.
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0b28770..3fed436 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -127,6 +127,39 @@ int rte_eal_alarm_init(void);
 int rte_eal_check_module(const char *module_name);
 
 /**
+ * Get virtual area of specified size from the OS.
+ *
+ * This function is private to the EAL.
+ *
+ * @param requested_addr
+ *   Address where to request address space.
+ * @param size
+ *   Size of requested area.
+ * @param page_sz
+ *   Page size on which to align requested virtual area.
+ * @param flags
+ *   EAL_VIRTUAL_AREA_* flags.
+ * @param mmap_flags
+ *   Extra flags passed directly to mmap().
+ *
+ * @return
+ *   Virtual area address if successful.
+ *   NULL if unsuccessful.
+ */
+
+#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0)
+/**< don't fail if cannot get exact requested address. */
+#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1)
+/**< try getting smaller sized (decrement by page size) virtual areas if cannot
+ * get area of requested size.
+ */
+#define EAL_VIRTUAL_AREA_UNMAP (1 << 2)
+/**< immediately unmap reserved virtual area. */
+void *
+eal_get_virtual_area(void *requested_addr, size_t *size,
+		size_t page_sz, int flags, int mmap_flags);
+
+/**
  * Get cpu core_id.
  *
  * This function is private to the EAL.
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index b412fc1..24e6b50 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -28,6 +28,7 @@
 #include <numaif.h>
 #endif
 
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_memory.h>
 #include <rte_memzone.h>
@@ -57,8 +58,6 @@
  * zone as well as a physical contiguous zone.
  */
 
-static uint64_t baseaddr_offset;
-
 static bool phys_addrs_available = true;
 
 #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
@@ -221,82 +220,6 @@ aslr_enabled(void)
 	}
 }
 
-/*
- * Try to mmap *size bytes in /dev/zero. If it is successful, return the
- * pointer to the mmap'd area and keep *size unmodified. Else, retry
- * with a smaller zone: decrease *size by hugepage_sz until it reaches
- * 0. In this case, return NULL. Note: this function returns an address
- * which is a multiple of hugepage size.
- */
-static void *
-get_virtual_area(size_t *size, size_t hugepage_sz)
-{
-	void *addr;
-	void *addr_hint;
-	int fd;
-	long aligned_addr;
-
-	if (internal_config.base_virtaddr != 0) {
-		int page_size = sysconf(_SC_PAGE_SIZE);
-		addr_hint = (void *) (uintptr_t)
-			(internal_config.base_virtaddr + baseaddr_offset);
-		addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size);
-	} else {
-		addr_hint = NULL;
-	}
-
-	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
-
-
-	fd = open("/dev/zero", O_RDONLY);
-	if (fd < 0){
-		RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
-		return NULL;
-	}
-	do {
-		addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
-				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				MAP_PRIVATE,
-#endif
-				fd, 0);
-		if (addr == MAP_FAILED) {
-			*size -= hugepage_sz;
-		} else if (addr_hint != NULL && addr != addr_hint) {
-			RTE_LOG(WARNING, EAL, "WARNING! Base virtual address "
-				"hint (%p != %p) not respected!\n",
-				addr_hint, addr);
-			RTE_LOG(WARNING, EAL, "   This may cause issues with "
-				"mapping memory into secondary processes\n");
-		}
-	} while (addr == MAP_FAILED && *size > 0);
-
-	if (addr == MAP_FAILED) {
-		close(fd);
-		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
-			strerror(errno));
-		return NULL;
-	}
-
-	munmap(addr, (*size) + hugepage_sz);
-	close(fd);
-
-	/* align addr to a huge page size boundary */
-	aligned_addr = (long)addr;
-	aligned_addr += (hugepage_sz - 1);
-	aligned_addr &= (~(hugepage_sz - 1));
-	addr = (void *)(aligned_addr);
-
-	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
-		addr, *size);
-
-	/* increment offset */
-	baseaddr_offset += *size;
-
-	return addr;
-}
-
 static sigjmp_buf huge_jmpenv;
 
 static void huge_sigbus_handler(int signo __rte_unused)
@@ -445,7 +368,16 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 			/* get the biggest virtual memory area up to
 			 * vma_len. If it fails, vma_addr is NULL, so
 			 * let the kernel provide the address. */
-			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+			vma_addr = eal_get_virtual_area(NULL, &vma_len,
+					hpi->hugepage_sz,
+					EAL_VIRTUAL_AREA_ALLOW_SHRINK |
+					EAL_VIRTUAL_AREA_UNMAP,
+#ifdef RTE_ARCH_PPC_64
+					MAP_HUGETLB
+#else
+					0
+#endif
+					);
 			if (vma_addr == NULL)
 				vma_len = hugepage_sz;
 		}
@@ -1343,7 +1275,7 @@ rte_eal_hugepage_attach(void)
 	unsigned i, s = 0; /* s used to track the segment number */
 	unsigned max_seg = RTE_MAX_MEMSEG;
 	off_t size = 0;
-	int fd, fd_zero = -1, fd_hugepage = -1;
+	int fd, fd_hugepage = -1;
 
 	if (aslr_enabled() > 0) {
 		RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
@@ -1354,11 +1286,6 @@ rte_eal_hugepage_attach(void)
 
 	test_phys_addrs_available();
 
-	fd_zero = open("/dev/zero", O_RDONLY);
-	if (fd_zero < 0) {
-		RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
-		goto error;
-	}
 	fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
 	if (fd_hugepage < 0) {
 		RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
@@ -1368,6 +1295,8 @@ rte_eal_hugepage_attach(void)
 	/* map all segments into memory to make sure we get the addrs */
 	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
 		void *base_addr;
+		size_t mmap_sz;
+		int mmap_flags = 0;
 
 		/*
 		 * the first memory segment with len==0 is the one that
@@ -1376,35 +1305,26 @@ rte_eal_hugepage_attach(void)
 		if (mcfg->memseg[s].len == 0)
 			break;
 
-		/*
-		 * fdzero is mmapped to get a contiguous block of virtual
-		 * addresses of the appropriate memseg size.
-		 * use mmap to get identical addresses as the primary process.
+		/* get identical addresses as the primary process.
 		 */
-		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
-				 PROT_READ,
 #ifdef RTE_ARCH_PPC_64
-				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				 MAP_PRIVATE,
+		mmap_flags |= MAP_HUGETLB;
 #endif
-				 fd_zero, 0);
-		if (base_addr == MAP_FAILED ||
-		    base_addr != mcfg->memseg[s].addr) {
+		mmap_sz = mcfg->memseg[s].len;
+		base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
+				&mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
+				mmap_flags);
+		if (base_addr == NULL) {
 			max_seg = s;
-			if (base_addr != MAP_FAILED) {
-				/* errno is stale, don't use */
-				RTE_LOG(ERR, EAL, "Could not mmap %zu bytes "
-					"in /dev/zero at [%p], got [%p] - "
-					"please use '--base-virtaddr' option\n",
+			if (rte_errno == EADDRNOTAVAIL) {
+				RTE_LOG(ERR, EAL, "Could not mmap %zu bytes at [%p] - please use '--base-virtaddr' option\n",
 					mcfg->memseg[s].len,
-					mcfg->memseg[s].addr, base_addr);
-				munmap(base_addr, mcfg->memseg[s].len);
+					mcfg->memseg[s].addr);
 			} else {
-				RTE_LOG(ERR, EAL, "Could not mmap %zu bytes "
-					"in /dev/zero at [%p]: '%s'\n",
+				RTE_LOG(ERR, EAL, "Could not mmap %zu bytes at [%p]: '%s'\n",
 					mcfg->memseg[s].len,
-					mcfg->memseg[s].addr, strerror(errno));
+					mcfg->memseg[s].addr,
+					rte_strerror(rte_errno));
 			}
 			if (aslr_enabled() > 0) {
 				RTE_LOG(ERR, EAL, "It is recommended to "
@@ -1469,7 +1389,6 @@ rte_eal_hugepage_attach(void)
 	}
 	/* unmap the hugepage config file, since we are done using it */
 	munmap(hp, size);
-	close(fd_zero);
 	close(fd_hugepage);
 
 	return 0;
@@ -1478,8 +1397,6 @@ rte_eal_hugepage_attach(void)
 			munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
 	if (hp != NULL && hp != MAP_FAILED)
 		munmap(hp, size);
-	if (fd_zero >= 0)
-		close(fd_zero);
 	if (fd_hugepage >= 0)
 		close(fd_hugepage);
 	return -1;
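
For reference, a minimal sketch of how an EAL-internal caller might use the
new interface; it is illustrative only and not part of the patch. The helper
name reserve_va() is hypothetical; the sketch assumes only the eal_private.h
declarations added above and the RTE_PGSIZE_2M constant from rte_memory.h.

/* Hypothetical usage sketch -- illustrative only, not part of this patch. */
#include <stddef.h>

#include <rte_memory.h>		/* RTE_PGSIZE_2M */

#include "eal_private.h"	/* eal_get_virtual_area(), EAL_VIRTUAL_AREA_* */

/* Reserve *len bytes of address space aligned to a 2M boundary. */
static void *
reserve_va(size_t *len)
{
	/* requested_addr == NULL: placement comes from --base-virtaddr
	 * (if set) or from the kernel.
	 * EAL_VIRTUAL_AREA_ALLOW_SHRINK: on failure, retry with smaller
	 * areas; *len is updated to the size actually reserved.
	 * No EAL_VIRTUAL_AREA_UNMAP: the PROT_READ reservation stays in
	 * place, so the caller can later mmap() real pages over it with
	 * MAP_FIXED; rte_errno is set on outright failure.
	 */
	return eal_get_virtual_area(NULL, len, RTE_PGSIZE_2M,
			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
}

Callers that only need to probe for a usable address, as map_all_hugepages()
does above, additionally pass EAL_VIRTUAL_AREA_UNMAP so the reservation is
released as soon as the address is known.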