[v5,03/11] eal: introduce memory management wrappers
diff mbox series

Message ID 20200525003720.6410-4-dmitry.kozliuk@gmail.com
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series
  • Windows basic memory management
Related show

Checks

Context Check Description
ci/Intel-compilation fail Compilation issues
ci/checkpatch success coding style OK

Commit Message

Dmitry Kozlyuk May 25, 2020, 12:37 a.m. UTC
Introduce OS-independent wrappers for memory management operations used
across DPDK and specifically in common code of EAL:

* rte_mem_map()
* rte_mem_unmap()
* rte_get_page_size()
* rte_mem_lock()

Windows uses different APIs for memory mapping and reservation, while
Unices reserve memory by mapping it. Introduce EAL private functions to
support memory reservation in common code:

* eal_mem_reserve()
* eal_mem_free()
* eal_mem_set_dump()

Wrappers follow POSIX semantics limited to DPDK tasks, but their
signatures deliberately differ from POSIX ones to be more safe and
expressive.

Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
---
 lib/librte_eal/common/eal_common_fbarray.c |  37 +++--
 lib/librte_eal/common/eal_common_memory.c  |  60 +++-----
 lib/librte_eal/common/eal_private.h        |  78 ++++++++++-
 lib/librte_eal/freebsd/Makefile            |   1 +
 lib/librte_eal/include/rte_memory.h        |  88 ++++++++++++
 lib/librte_eal/linux/Makefile              |   1 +
 lib/librte_eal/linux/eal_memalloc.c        |   5 +-
 lib/librte_eal/rte_eal_version.map         |   6 +
 lib/librte_eal/unix/eal_unix_memory.c      | 152 +++++++++++++++++++++
 lib/librte_eal/unix/meson.build            |   1 +
 10 files changed, 365 insertions(+), 64 deletions(-)
 create mode 100644 lib/librte_eal/unix/eal_unix_memory.c

Comments

Kinsella, Ray May 27, 2020, 6:33 a.m. UTC | #1
Are wrappers 100% required?
Would it be simpler (and less invasive) to have a windows_compat.h that plugged these holes?
I am not sure of the standard approach here - so I will leave this to others. 

Outside of that - do these symbols really require experimental status.
Are they really likely to change?

Ray K

On 25/05/2020 01:37, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
> 
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
> 
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
> 
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
> 
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
> 
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---
>  lib/librte_eal/common/eal_common_fbarray.c |  37 +++--
>  lib/librte_eal/common/eal_common_memory.c  |  60 +++-----
>  lib/librte_eal/common/eal_private.h        |  78 ++++++++++-
>  lib/librte_eal/freebsd/Makefile            |   1 +
>  lib/librte_eal/include/rte_memory.h        |  88 ++++++++++++
>  lib/librte_eal/linux/Makefile              |   1 +
>  lib/librte_eal/linux/eal_memalloc.c        |   5 +-
>  lib/librte_eal/rte_eal_version.map         |   6 +
>  lib/librte_eal/unix/eal_unix_memory.c      | 152 +++++++++++++++++++++
>  lib/librte_eal/unix/meson.build            |   1 +
>  10 files changed, 365 insertions(+), 64 deletions(-)
>  create mode 100644 lib/librte_eal/unix/eal_unix_memory.c
> 
> diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
> index cfcab63e9..a41e8ce5f 100644
> --- a/lib/librte_eal/common/eal_common_fbarray.c
> +++ b/lib/librte_eal/common/eal_common_fbarray.c
> @@ -5,15 +5,15 @@
>  #include <fcntl.h>
>  #include <inttypes.h>
>  #include <limits.h>
> -#include <sys/mman.h>
>  #include <stdint.h>
>  #include <errno.h>
>  #include <string.h>
>  #include <unistd.h>
>  
>  #include <rte_common.h>
> -#include <rte_log.h>
>  #include <rte_errno.h>
> +#include <rte_log.h>
> +#include <rte_memory.h>
>  #include <rte_spinlock.h>
>  #include <rte_tailq.h>
>  
> @@ -90,12 +90,9 @@ resize_and_map(int fd, void *addr, size_t len)
>  		return -1;
>  	}
>  
> -	map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
> -			MAP_SHARED | MAP_FIXED, fd, 0);
> +	map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
> +			RTE_MAP_SHARED | RTE_MAP_FORCE_ADDRESS, fd, 0);
>  	if (map_addr != addr) {
> -		RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
> -		/* pass errno up the chain */
> -		rte_errno = errno;
>  		return -1;
>  	}
>  	return 0;
> @@ -733,7 +730,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>  		return -1;
>  	}
>  
> -	page_sz = sysconf(_SC_PAGESIZE);
> +	page_sz = rte_get_page_size();
>  	if (page_sz == (size_t)-1) {
>  		free(ma);
>  		return -1;
> @@ -754,9 +751,11 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>  
>  	if (internal_config.no_shconf) {
>  		/* remap virtual area as writable */
> -		void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
> -				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
> -		if (new_data == MAP_FAILED) {
> +		static const int flags = RTE_MAP_FORCE_ADDRESS |
> +			RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
> +		void *new_data = rte_mem_map(data, mmap_len,
> +			RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
> +		if (new_data == NULL) {
>  			RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
>  					__func__, strerror(errno));
>  			goto fail;
> @@ -821,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>  	return 0;
>  fail:
>  	if (data)
> -		munmap(data, mmap_len);
> +		rte_mem_unmap(data, mmap_len);
>  	if (fd >= 0)
>  		close(fd);
>  	free(ma);
> @@ -859,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
>  		return -1;
>  	}
>  
> -	page_sz = sysconf(_SC_PAGESIZE);
> +	page_sz = rte_get_page_size();
>  	if (page_sz == (size_t)-1) {
>  		free(ma);
>  		return -1;
> @@ -911,7 +910,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
>  	return 0;
>  fail:
>  	if (data)
> -		munmap(data, mmap_len);
> +		rte_mem_unmap(data, mmap_len);
>  	if (fd >= 0)
>  		close(fd);
>  	free(ma);
> @@ -939,8 +938,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
>  	 * really do anything about it, things will blow up either way.
>  	 */
>  
> -	size_t page_sz = sysconf(_SC_PAGESIZE);
> -
> +	size_t page_sz = rte_get_page_size();
>  	if (page_sz == (size_t)-1)
>  		return -1;
>  
> @@ -959,7 +957,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
>  		goto out;
>  	}
>  
> -	munmap(arr->data, mmap_len);
> +	rte_mem_unmap(arr->data, mmap_len);
>  
>  	/* area is unmapped, close fd and remove the tailq entry */
>  	if (tmp->fd >= 0)
> @@ -994,8 +992,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
>  	 * really do anything about it, things will blow up either way.
>  	 */
>  
> -	size_t page_sz = sysconf(_SC_PAGESIZE);
> -
> +	size_t page_sz = rte_get_page_size();
>  	if (page_sz == (size_t)-1)
>  		return -1;
>  
> @@ -1044,7 +1041,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
>  		}
>  		close(fd);
>  	}
> -	munmap(arr->data, mmap_len);
> +	rte_mem_unmap(arr->data, mmap_len);
>  
>  	/* area is unmapped, remove the tailq entry */
>  	TAILQ_REMOVE(&mem_area_tailq, tmp, next);
> diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
> index 4c897a13f..c6243aca1 100644
> --- a/lib/librte_eal/common/eal_common_memory.c
> +++ b/lib/librte_eal/common/eal_common_memory.c
> @@ -11,7 +11,6 @@
>  #include <string.h>
>  #include <unistd.h>
>  #include <inttypes.h>
> -#include <sys/mman.h>
>  #include <sys/queue.h>
>  
>  #include <rte_fbarray.h>
> @@ -40,18 +39,10 @@
>  static void *next_baseaddr;
>  static uint64_t system_page_sz;
>  
> -#ifdef RTE_EXEC_ENV_LINUX
> -#define RTE_DONTDUMP MADV_DONTDUMP
> -#elif defined RTE_EXEC_ENV_FREEBSD
> -#define RTE_DONTDUMP MADV_NOCORE
> -#else
> -#error "madvise doesn't support this OS"
> -#endif
> -
>  #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
>  void *
>  eal_get_virtual_area(void *requested_addr, size_t *size,
> -		size_t page_sz, int flags, int mmap_flags)
> +	size_t page_sz, int flags, int reserve_flags)
>  {
>  	bool addr_is_hint, allow_shrink, unmap, no_align;
>  	uint64_t map_sz;
> @@ -59,9 +50,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>  	uint8_t try = 0;
>  
>  	if (system_page_sz == 0)
> -		system_page_sz = sysconf(_SC_PAGESIZE);
> -
> -	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
> +		system_page_sz = rte_get_page_size();
>  
>  	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
>  
> @@ -105,24 +94,24 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>  			return NULL;
>  		}
>  
> -		mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
> -				mmap_flags, -1, 0);
> -		if (mapped_addr == MAP_FAILED && allow_shrink)
> +		mapped_addr = eal_mem_reserve(
> +			requested_addr, (size_t)map_sz, reserve_flags);
> +		if ((mapped_addr == NULL) && allow_shrink)
>  			*size -= page_sz;
>  
> -		if (mapped_addr != MAP_FAILED && addr_is_hint &&
> -		    mapped_addr != requested_addr) {
> +		if ((mapped_addr != NULL) && addr_is_hint &&
> +				(mapped_addr != requested_addr)) {
>  			try++;
>  			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
>  			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
>  				/* hint was not used. Try with another offset */
> -				munmap(mapped_addr, map_sz);
> -				mapped_addr = MAP_FAILED;
> +				eal_mem_free(mapped_addr, map_sz);
> +				mapped_addr = NULL;
>  				requested_addr = next_baseaddr;
>  			}
>  		}
>  	} while ((allow_shrink || addr_is_hint) &&
> -		 mapped_addr == MAP_FAILED && *size > 0);
> +		(mapped_addr == NULL) && (*size > 0));
>  
>  	/* align resulting address - if map failed, we will ignore the value
>  	 * anyway, so no need to add additional checks.
> @@ -132,20 +121,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>  
>  	if (*size == 0) {
>  		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
> -			strerror(errno));
> -		rte_errno = errno;
> +			strerror(rte_errno));
>  		return NULL;
> -	} else if (mapped_addr == MAP_FAILED) {
> +	} else if (mapped_addr == NULL) {
>  		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
> -			strerror(errno));
> -		/* pass errno up the call chain */
> -		rte_errno = errno;
> +			strerror(rte_errno));
>  		return NULL;
>  	} else if (requested_addr != NULL && !addr_is_hint &&
>  			aligned_addr != requested_addr) {
>  		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
>  			requested_addr, aligned_addr);
> -		munmap(mapped_addr, map_sz);
> +		eal_mem_free(mapped_addr, map_sz);
>  		rte_errno = EADDRNOTAVAIL;
>  		return NULL;
>  	} else if (requested_addr != NULL && addr_is_hint &&
> @@ -161,7 +147,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>  		aligned_addr, *size);
>  
>  	if (unmap) {
> -		munmap(mapped_addr, map_sz);
> +		eal_mem_free(mapped_addr, map_sz);
>  	} else if (!no_align) {
>  		void *map_end, *aligned_end;
>  		size_t before_len, after_len;
> @@ -179,19 +165,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>  		/* unmap space before aligned mmap address */
>  		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
>  		if (before_len > 0)
> -			munmap(mapped_addr, before_len);
> +			eal_mem_free(mapped_addr, before_len);
>  
>  		/* unmap space after aligned end mmap address */
>  		after_len = RTE_PTR_DIFF(map_end, aligned_end);
>  		if (after_len > 0)
> -			munmap(aligned_end, after_len);
> +			eal_mem_free(aligned_end, after_len);
>  	}
>  
>  	if (!unmap) {
>  		/* Exclude these pages from a core dump. */
> -		if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
> -			RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
> -				strerror(errno));
> +		eal_mem_set_dump(aligned_addr, *size, false);
>  	}
>  
>  	return aligned_addr;
> @@ -547,10 +531,10 @@ rte_eal_memdevice_init(void)
>  int
>  rte_mem_lock_page(const void *virt)
>  {
> -	unsigned long virtual = (unsigned long)virt;
> -	int page_size = getpagesize();
> -	unsigned long aligned = (virtual & ~(page_size - 1));
> -	return mlock((void *)aligned, page_size);
> +	uintptr_t virtual = (uintptr_t)virt;
> +	size_t page_size = rte_get_page_size();
> +	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
> +	return rte_mem_lock((void *)aligned, page_size);
>  }
>  
>  int
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index cef73d6fe..a93850c09 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -11,6 +11,7 @@
>  
>  #include <rte_dev.h>
>  #include <rte_lcore.h>
> +#include <rte_memory.h>
>  
>  /**
>   * Structure storing internal configuration (per-lcore)
> @@ -202,6 +203,24 @@ int rte_eal_alarm_init(void);
>   */
>  int rte_eal_check_module(const char *module_name);
>  
> +/**
> + * Memory reservation flags.
> + */
> +enum eal_mem_reserve_flags {
> +	/**
> +	 * Reserve hugepages. May be unsupported by some platforms.
> +	 */
> +	EAL_RESERVE_HUGEPAGES = 1 << 0,
> +	/**
> +	 * Force reserving memory at the requested address.
> +	 * This can be a destructive action depending on the implementation.
> +	 *
> +	 * @see RTE_MAP_FORCE_ADDRESS for description of possible consequences
> +	 *      (although implementations are not required to use it).
> +	 */
> +	EAL_RESERVE_FORCE_ADDRESS = 1 << 1
> +};
> +
>  /**
>   * Get virtual area of specified size from the OS.
>   *
> @@ -215,8 +234,8 @@ int rte_eal_check_module(const char *module_name);
>   *   Page size on which to align requested virtual area.
>   * @param flags
>   *   EAL_VIRTUAL_AREA_* flags.
> - * @param mmap_flags
> - *   Extra flags passed directly to mmap().
> + * @param reserve_flags
> + *   Extra flags passed directly to rte_mem_reserve().
>   *
>   * @return
>   *   Virtual area address if successful.
> @@ -233,7 +252,7 @@ int rte_eal_check_module(const char *module_name);
>  /**< immediately unmap reserved virtual area. */
>  void *
>  eal_get_virtual_area(void *requested_addr, size_t *size,
> -		size_t page_sz, int flags, int mmap_flags);
> +		size_t page_sz, int flags, int reserve_flags);
>  
>  /**
>   * Get cpu core_id.
> @@ -467,4 +486,57 @@ eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode);
>  int
>  eal_file_truncate(int fd, ssize_t size);
>  
> +/**
> + * Reserve a region of virtual memory.
> + *
> + * Use eal_mem_free() to free reserved memory.
> + *
> + * @param requested_addr
> + *  A desired reservation address, which must be page-aligned.
> + *  The system might not respect it.
> + *  NULL means the address will be chosen by the system.
> + * @param size
> + *  Reservation size. Must be a multiple of system page size.
> + * @param flags
> + *  Reservation options, a combination of eal_mem_reserve_flags.
> + * @return
> + *  Starting address of the reserved area on success, NULL on failure.
> + *  Callers must not access this memory until remapping it.
> + */
> +void *
> +eal_mem_reserve(void *requested_addr, size_t size, int flags);
> +
> +/**
> + * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
> + *
> + * If *virt* and *size* describe a part of the reserved region,
> + * only this part of the region is freed (accurately up to the system
> + * page size). If *virt* points to allocated memory, *size* must match
> + * the one specified on allocation. The behavior is undefined
> + * if the memory pointed by *virt* is obtained from another source
> + * than listed above.
> + *
> + * @param virt
> + *  A virtual address in a region previously reserved.
> + * @param size
> + *  Number of bytes to unreserve.
> + */
> +void
> +eal_mem_free(void *virt, size_t size);
> +
> +/**
> + * Configure memory region inclusion into core dumps.
> + *
> + * @param virt
> + *  Starting address of the region.
> + * @param size
> + *  Size of the region.
> + * @param dump
> + *  True to include memory into core dumps, false to exclude.
> + * @return
> + *  0 on success, (-1) on failure and rte_errno is set.
> + */
> +int
> +eal_mem_set_dump(void *virt, size_t size, bool dump);
> +
>  #endif /* _EAL_PRIVATE_H_ */
> diff --git a/lib/librte_eal/freebsd/Makefile b/lib/librte_eal/freebsd/Makefile
> index 4654ca2b3..f64a3994c 100644
> --- a/lib/librte_eal/freebsd/Makefile
> +++ b/lib/librte_eal/freebsd/Makefile
> @@ -77,6 +77,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
>  
>  # from unix dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix_memory.c
>  
>  # from arch dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
> diff --git a/lib/librte_eal/include/rte_memory.h b/lib/librte_eal/include/rte_memory.h
> index 65374d53a..63ff0773d 100644
> --- a/lib/librte_eal/include/rte_memory.h
> +++ b/lib/librte_eal/include/rte_memory.h
> @@ -82,6 +82,94 @@ struct rte_memseg_list {
>  	struct rte_fbarray memseg_arr;
>  };
>  
> +/**
> + * Memory protection flags.
> + */
> +enum rte_mem_prot {
> +	RTE_PROT_READ = 1 << 0,   /**< Read access. */
> +	RTE_PROT_WRITE = 1 << 1,  /**< Write access. */
> +	RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> +};
> +
> +/**
> + * Additional flags for memory mapping.
> + */
> +enum rte_map_flags {
> +	/** Changes to the mapped memory are visible to other processes. */
> +	RTE_MAP_SHARED = 1 << 0,
> +	/** Mapping is not backed by a regular file. */
> +	RTE_MAP_ANONYMOUS = 1 << 1,
> +	/** Copy-on-write mapping, changes are invisible to other processes. */
> +	RTE_MAP_PRIVATE = 1 << 2,
> +	/**
> +	 * Force mapping to the requested address. This flag should be used
> +	 * with caution, because to fulfill the request implementation
> +	 * may remove all other mappings in the requested region. However,
> +	 * it is not required to do so, thus mapping with this flag may fail.
> +	 */
> +	RTE_MAP_FORCE_ADDRESS = 1 << 3
> +};
> +
> +/**
> + * Map a portion of an opened file or the page file into memory.
> + *
> + * This function is similar to POSIX mmap(3) with common MAP_ANONYMOUS
> + * extension, except for the return value.
> + *
> + * @param requested_addr
> + *  Desired virtual address for mapping. Can be NULL to let OS choose.
> + * @param size
> + *  Size of the mapping in bytes.
> + * @param prot
> + *  Protection flags, a combination of rte_mem_prot values.
> + * @param flags
> + *  Additional mapping flags, a combination of rte_map_flags.
> + * @param fd
> + *  Mapped file descriptor. Can be negative for anonymous mapping.
> + * @param offset
> + *  Offset of the mapped region in fd. Must be 0 for anonymous mappings.
> + * @return
> + *  Mapped address or NULL on failure and rte_errno is set to OS error.
> + */
> +__rte_experimental
> +void *
> +rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
> +	int fd, size_t offset);
> +
> +/**
> + * OS-independent implementation of POSIX munmap(3).
> + */
> +__rte_experimental
> +int
> +rte_mem_unmap(void *virt, size_t size);
> +
> +/**
> + * Get system page size. This function never fails.
> + *
> + * @return
> + *   Page size in bytes.
> + */
> +__rte_experimental
> +size_t
> +rte_get_page_size(void);
> +
> +/**
> + * Lock in physical memory all pages crossed by the address region.
> + *
> + * @param virt
> + *   Base virtual address of the region.
> + * @param size
> + *   Size of the region.
> + * @return
> + *   0 on success, negative on error.
> + *
> + * @see rte_get_page_size() to retrieve the page size.
> + * @see rte_mem_lock_page() to lock an entire single page.
> + */
> +__rte_experimental
> +int
> +rte_mem_lock(const void *virt, size_t size);
> +
>  /**
>   * Lock page in physical memory and prevent from swapping.
>   *
> diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile
> index 4f39d462c..d314648cb 100644
> --- a/lib/librte_eal/linux/Makefile
> +++ b/lib/librte_eal/linux/Makefile
> @@ -84,6 +84,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
>  
>  # from unix dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix_memory.c
>  
>  # from arch dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
> diff --git a/lib/librte_eal/linux/eal_memalloc.c b/lib/librte_eal/linux/eal_memalloc.c
> index 2c717f8bd..bf29b83c6 100644
> --- a/lib/librte_eal/linux/eal_memalloc.c
> +++ b/lib/librte_eal/linux/eal_memalloc.c
> @@ -630,7 +630,7 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
>  mapped:
>  	munmap(addr, alloc_sz);
>  unmapped:
> -	flags = MAP_FIXED;
> +	flags = EAL_RESERVE_FORCE_ADDRESS;
>  	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
>  	if (new_addr != addr) {
>  		if (new_addr != NULL)
> @@ -687,8 +687,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
>  		return -1;
>  	}
>  
> -	if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0)
> -		RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno));
> +	eal_mem_set_dump(ms->addr, ms->len, false);
>  
>  	exit_early = false;
>  
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index d8038749a..dff51b13d 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -386,4 +386,10 @@ EXPERIMENTAL {
>  	rte_trace_point_lookup;
>  	rte_trace_regexp;
>  	rte_trace_save;
> +
> +	# added in 20.08
> +	rte_get_page_size;
> +	rte_mem_lock;
> +	rte_mem_map;
> +	rte_mem_unmap;
>  };
> diff --git a/lib/librte_eal/unix/eal_unix_memory.c b/lib/librte_eal/unix/eal_unix_memory.c
> new file mode 100644
> index 000000000..658595b6e
> --- /dev/null
> +++ b/lib/librte_eal/unix/eal_unix_memory.c
> @@ -0,0 +1,152 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Dmitry Kozlyuk
> + */
> +
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +
> +#include <rte_errno.h>
> +#include <rte_log.h>
> +#include <rte_memory.h>
> +
> +#include "eal_private.h"
> +
> +#ifdef RTE_EXEC_ENV_LINUX
> +#define EAL_DONTDUMP MADV_DONTDUMP
> +#define EAL_DODUMP   MADV_DODUMP
> +#elif defined RTE_EXEC_ENV_FREEBSD
> +#define EAL_DONTDUMP MADV_NOCORE
> +#define EAL_DODUMP   MADV_CORE
> +#else
> +#error "madvise doesn't support this OS"
> +#endif
> +
> +static void *
> +mem_map(void *requested_addr, size_t size, int prot, int flags,
> +	int fd, size_t offset)
> +{
> +	void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
> +	if (virt == MAP_FAILED) {
> +		RTE_LOG(DEBUG, EAL,
> +			"Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
> +			requested_addr, size, prot, flags, fd, offset,
> +			strerror(errno));
> +		rte_errno = errno;
> +		return NULL;
> +	}
> +	return virt;
> +}
> +
> +static int
> +mem_unmap(void *virt, size_t size)
> +{
> +	int ret = munmap(virt, size);
> +	if (ret < 0) {
> +		RTE_LOG(DEBUG, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
> +			virt, size, strerror(errno));
> +		rte_errno = errno;
> +	}
> +	return ret;
> +}
> +
> +void *
> +eal_mem_reserve(void *requested_addr, size_t size, int flags)
> +{
> +	int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +
> +	if (flags & EAL_RESERVE_HUGEPAGES) {
> +#ifdef MAP_HUGETLB
> +		sys_flags |= MAP_HUGETLB;
> +#else
> +		rte_errno = ENOTSUP;
> +		return NULL;
> +#endif
> +	}
> +
> +	if (flags & EAL_RESERVE_FORCE_ADDRESS)
> +		sys_flags |= MAP_FIXED;
> +
> +	return mem_map(requested_addr, size, PROT_NONE, sys_flags, -1, 0);
> +}
> +
> +void
> +eal_mem_free(void *virt, size_t size)
> +{
> +	mem_unmap(virt, size);
> +}
> +
> +int
> +eal_mem_set_dump(void *virt, size_t size, bool dump)
> +{
> +	int flags = dump ? EAL_DODUMP : EAL_DONTDUMP;
> +	int ret = madvise(virt, size, flags);
> +	if (ret) {
> +		RTE_LOG(DEBUG, EAL, "madvise(%p, %#zx, %d) failed: %s\n",
> +				virt, size, flags, strerror(errno));
> +		rte_errno = errno;
> +	}
> +	return ret;
> +}
> +
> +static int
> +mem_rte_to_sys_prot(int prot)
> +{
> +	int sys_prot = PROT_NONE;
> +
> +	if (prot & RTE_PROT_READ)
> +		sys_prot |= PROT_READ;
> +	if (prot & RTE_PROT_WRITE)
> +		sys_prot |= PROT_WRITE;
> +	if (prot & RTE_PROT_EXECUTE)
> +		sys_prot |= PROT_EXEC;
> +
> +	return sys_prot;
> +}
> +
> +void *
> +rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
> +	int fd, size_t offset)
> +{
> +	int sys_flags = 0;
> +	int sys_prot;
> +
> +	sys_prot = mem_rte_to_sys_prot(prot);
> +
> +	if (flags & RTE_MAP_SHARED)
> +		sys_flags |= MAP_SHARED;
> +	if (flags & RTE_MAP_ANONYMOUS)
> +		sys_flags |= MAP_ANONYMOUS;
> +	if (flags & RTE_MAP_PRIVATE)
> +		sys_flags |= MAP_PRIVATE;
> +	if (flags & RTE_MAP_FORCE_ADDRESS)
> +		sys_flags |= MAP_FIXED;
> +
> +	return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
> +}
> +
> +int
> +rte_mem_unmap(void *virt, size_t size)
> +{
> +	return mem_unmap(virt, size);
> +}
> +
> +size_t
> +rte_get_page_size(void)
> +{
> +	static size_t page_size;
> +
> +	if (!page_size)
> +		page_size = sysconf(_SC_PAGESIZE);
> +
> +	return page_size;
> +}
> +
> +int
> +rte_mem_lock(const void *virt, size_t size)
> +{
> +	int ret = mlock(virt, size);
> +	if (ret)
> +		rte_errno = errno;
> +	return ret;
> +}
> diff --git a/lib/librte_eal/unix/meson.build b/lib/librte_eal/unix/meson.build
> index cfa1b4ef9..5734f26ad 100644
> --- a/lib/librte_eal/unix/meson.build
> +++ b/lib/librte_eal/unix/meson.build
> @@ -3,4 +3,5 @@
>  
>  sources += files(
>  	'eal_unix.c',
> +	'eal_unix_memory.c',
>  )
>
Dmitry Kozlyuk May 27, 2020, 4:34 p.m. UTC | #2
The answers below are a summary of the discussion with Thomas, Ranjit, Tal, et al.

On Wed, 27 May 2020 07:33:32 +0100 Ray Kinsella <mdr@ashroe.eu> wrote:
> Are wrappers 100% required?
> Would it be simpler (and less invasive) to have a windows_compat.h that plugged these holes?
> I am not sure of the standard approach here - so I will leave this to others. 

With wrappers, we control API and semantics, which is limited compared to the
underlying syscalls. It is also cleaner not to export non-RTE symbols from
DPDK libraries. Regarding invasion: it requires little change, factoring
out some common error logging in the process.

> Outside of that - do these symbols really require experimental status.
> Are they really likely to change?

Indeed, the wrappers should be internal, not experimental. Will fix in v6.
Burakov, Anatoly May 28, 2020, 11:26 a.m. UTC | #3
On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
> 
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
> 
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
> 
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
> 
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
> 
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---

<snip>

> -	page_sz = sysconf(_SC_PAGESIZE);
> +	page_sz = rte_get_page_size();
>   	if (page_sz == (size_t)-1) {
>   		free(ma);
>   		return -1;
> @@ -754,9 +751,11 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>   
>   	if (internal_config.no_shconf) {
>   		/* remap virtual area as writable */
> -		void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
> -				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
> -		if (new_data == MAP_FAILED) {
> +		static const int flags = RTE_MAP_FORCE_ADDRESS |
> +			RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
> +		void *new_data = rte_mem_map(data, mmap_len,
> +			RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
> +		if (new_data == NULL) {
>   			RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
>   					__func__, strerror(errno));

I believe this should be rte_strerror(rte_errno) instead of strerror(errno).

>   			goto fail;
> @@ -821,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>   	return 0;
>   fail:
>   	if (data)
> -		munmap(data, mmap_len);
> +		rte_mem_unmap(data, mmap_len);
>   	if (fd >= 0)
>   		close(fd);
>   	free(ma);
> @@ -859,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
>   		return -1;

<snip>

>   
> +/**
> + * Memory protection flags.
> + */
> +enum rte_mem_prot {
> +	RTE_PROT_READ = 1 << 0,   /**< Read access. */
> +	RTE_PROT_WRITE = 1 << 1,  /**< Write access. */
> +	RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> +};
> +
> +/**
> + * Additional flags for memory mapping.
> + */
> +enum rte_map_flags {
> +	/** Changes to the mapped memory are visible to other processes. */
> +	RTE_MAP_SHARED = 1 << 0,
> +	/** Mapping is not backed by a regular file. */
> +	RTE_MAP_ANONYMOUS = 1 << 1,
> +	/** Copy-on-write mapping, changes are invisible to other processes. */
> +	RTE_MAP_PRIVATE = 1 << 2,
> +	/**
> +	 * Force mapping to the requested address. This flag should be used
> +	 * with caution, because to fulfill the request implementation
> +	 * may remove all other mappings in the requested region. However,
> +	 * it is not required to do so, thus mapping with this flag may fail.
> +	 */
> +	RTE_MAP_FORCE_ADDRESS = 1 << 3
> +};

I have no strong opinion on this, but it feels like the fact that these 
are enums is a relic from the times where you used enum everywhere :) i 
have a feeling that DPDK codebase prefers #define's for this usage, 
while what you have here is more of a C++ thing.
Burakov, Anatoly May 28, 2020, 11:52 a.m. UTC | #4
On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
> 
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
> 
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
> 
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
> 
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
> 
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---

<snip>

> +	} else if (mapped_addr == NULL) {
>   		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
> -			strerror(errno));
> -		/* pass errno up the call chain */
> -		rte_errno = errno;
> +			strerror(rte_errno));

Also, please check that you're using rte_strerror with rte_errno :)
Thomas Monjalon June 1, 2020, 9:08 p.m. UTC | #5
28/05/2020 13:26, Burakov, Anatoly:
> On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> > +/**
> > + * Memory protection flags.
> > + */
> > +enum rte_mem_prot {
> > +	RTE_PROT_READ = 1 << 0,   /**< Read access. */
> > +	RTE_PROT_WRITE = 1 << 1,  /**< Write access. */
> > +	RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> > +};
> > +
> > +/**
> > + * Additional flags for memory mapping.
> > + */
> > +enum rte_map_flags {
> > +	/** Changes to the mapped memory are visible to other processes. */
> > +	RTE_MAP_SHARED = 1 << 0,
> > +	/** Mapping is not backed by a regular file. */
> > +	RTE_MAP_ANONYMOUS = 1 << 1,
> > +	/** Copy-on-write mapping, changes are invisible to other processes. */
> > +	RTE_MAP_PRIVATE = 1 << 2,
> > +	/**
> > +	 * Force mapping to the requested address. This flag should be used
> > +	 * with caution, because to fulfill the request implementation
> > +	 * may remove all other mappings in the requested region. However,
> > +	 * it is not required to do so, thus mapping with this flag may fail.
> > +	 */
> > +	RTE_MAP_FORCE_ADDRESS = 1 << 3
> > +};
> 
> I have no strong opinion on this, but it feels like the fact that these 
> are enums is a relic from the times where you used enum everywhere :) i 
> have a feeling that DPDK codebase prefers #define's for this usage, 
> while what you have here is more of a C++ thing.

The benefit of using an enum is to explicitly name the type
of the variables, serving documentation purpose.

+1 for the enums

Patch
diff mbox series

diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
index cfcab63e9..a41e8ce5f 100644
--- a/lib/librte_eal/common/eal_common_fbarray.c
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -5,15 +5,15 @@ 
 #include <fcntl.h>
 #include <inttypes.h>
 #include <limits.h>
-#include <sys/mman.h>
 #include <stdint.h>
 #include <errno.h>
 #include <string.h>
 #include <unistd.h>
 
 #include <rte_common.h>
-#include <rte_log.h>
 #include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
 #include <rte_spinlock.h>
 #include <rte_tailq.h>
 
@@ -90,12 +90,9 @@  resize_and_map(int fd, void *addr, size_t len)
 		return -1;
 	}
 
-	map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
-			MAP_SHARED | MAP_FIXED, fd, 0);
+	map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
+			RTE_MAP_SHARED | RTE_MAP_FORCE_ADDRESS, fd, 0);
 	if (map_addr != addr) {
-		RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
-		/* pass errno up the chain */
-		rte_errno = errno;
 		return -1;
 	}
 	return 0;
@@ -733,7 +730,7 @@  rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
 		return -1;
 	}
 
-	page_sz = sysconf(_SC_PAGESIZE);
+	page_sz = rte_get_page_size();
 	if (page_sz == (size_t)-1) {
 		free(ma);
 		return -1;
@@ -754,9 +751,11 @@  rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
 
 	if (internal_config.no_shconf) {
 		/* remap virtual area as writable */
-		void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
-				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
-		if (new_data == MAP_FAILED) {
+		static const int flags = RTE_MAP_FORCE_ADDRESS |
+			RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
+		void *new_data = rte_mem_map(data, mmap_len,
+			RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
+		if (new_data == NULL) {
 			RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
 					__func__, strerror(errno));
 			goto fail;
@@ -821,7 +820,7 @@  rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
 	return 0;
 fail:
 	if (data)
-		munmap(data, mmap_len);
+		rte_mem_unmap(data, mmap_len);
 	if (fd >= 0)
 		close(fd);
 	free(ma);
@@ -859,7 +858,7 @@  rte_fbarray_attach(struct rte_fbarray *arr)
 		return -1;
 	}
 
-	page_sz = sysconf(_SC_PAGESIZE);
+	page_sz = rte_get_page_size();
 	if (page_sz == (size_t)-1) {
 		free(ma);
 		return -1;
@@ -911,7 +910,7 @@  rte_fbarray_attach(struct rte_fbarray *arr)
 	return 0;
 fail:
 	if (data)
-		munmap(data, mmap_len);
+		rte_mem_unmap(data, mmap_len);
 	if (fd >= 0)
 		close(fd);
 	free(ma);
@@ -939,8 +938,7 @@  rte_fbarray_detach(struct rte_fbarray *arr)
 	 * really do anything about it, things will blow up either way.
 	 */
 
-	size_t page_sz = sysconf(_SC_PAGESIZE);
-
+	size_t page_sz = rte_get_page_size();
 	if (page_sz == (size_t)-1)
 		return -1;
 
@@ -959,7 +957,7 @@  rte_fbarray_detach(struct rte_fbarray *arr)
 		goto out;
 	}
 
-	munmap(arr->data, mmap_len);
+	rte_mem_unmap(arr->data, mmap_len);
 
 	/* area is unmapped, close fd and remove the tailq entry */
 	if (tmp->fd >= 0)
@@ -994,8 +992,7 @@  rte_fbarray_destroy(struct rte_fbarray *arr)
 	 * really do anything about it, things will blow up either way.
 	 */
 
-	size_t page_sz = sysconf(_SC_PAGESIZE);
-
+	size_t page_sz = rte_get_page_size();
 	if (page_sz == (size_t)-1)
 		return -1;
 
@@ -1044,7 +1041,7 @@  rte_fbarray_destroy(struct rte_fbarray *arr)
 		}
 		close(fd);
 	}
-	munmap(arr->data, mmap_len);
+	rte_mem_unmap(arr->data, mmap_len);
 
 	/* area is unmapped, remove the tailq entry */
 	TAILQ_REMOVE(&mem_area_tailq, tmp, next);
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 4c897a13f..c6243aca1 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -11,7 +11,6 @@ 
 #include <string.h>
 #include <unistd.h>
 #include <inttypes.h>
-#include <sys/mman.h>
 #include <sys/queue.h>
 
 #include <rte_fbarray.h>
@@ -40,18 +39,10 @@ 
 static void *next_baseaddr;
 static uint64_t system_page_sz;
 
-#ifdef RTE_EXEC_ENV_LINUX
-#define RTE_DONTDUMP MADV_DONTDUMP
-#elif defined RTE_EXEC_ENV_FREEBSD
-#define RTE_DONTDUMP MADV_NOCORE
-#else
-#error "madvise doesn't support this OS"
-#endif
-
 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
 void *
 eal_get_virtual_area(void *requested_addr, size_t *size,
-		size_t page_sz, int flags, int mmap_flags)
+	size_t page_sz, int flags, int reserve_flags)
 {
 	bool addr_is_hint, allow_shrink, unmap, no_align;
 	uint64_t map_sz;
@@ -59,9 +50,7 @@  eal_get_virtual_area(void *requested_addr, size_t *size,
 	uint8_t try = 0;
 
 	if (system_page_sz == 0)
-		system_page_sz = sysconf(_SC_PAGESIZE);
-
-	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+		system_page_sz = rte_get_page_size();
 
 	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
 
@@ -105,24 +94,24 @@  eal_get_virtual_area(void *requested_addr, size_t *size,
 			return NULL;
 		}
 
-		mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
-				mmap_flags, -1, 0);
-		if (mapped_addr == MAP_FAILED && allow_shrink)
+		mapped_addr = eal_mem_reserve(
+			requested_addr, (size_t)map_sz, reserve_flags);
+		if ((mapped_addr == NULL) && allow_shrink)
 			*size -= page_sz;
 
-		if (mapped_addr != MAP_FAILED && addr_is_hint &&
-		    mapped_addr != requested_addr) {
+		if ((mapped_addr != NULL) && addr_is_hint &&
+				(mapped_addr != requested_addr)) {
 			try++;
 			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
 			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
 				/* hint was not used. Try with another offset */
-				munmap(mapped_addr, map_sz);
-				mapped_addr = MAP_FAILED;
+				eal_mem_free(mapped_addr, map_sz);
+				mapped_addr = NULL;
 				requested_addr = next_baseaddr;
 			}
 		}
 	} while ((allow_shrink || addr_is_hint) &&
-		 mapped_addr == MAP_FAILED && *size > 0);
+		(mapped_addr == NULL) && (*size > 0));
 
 	/* align resulting address - if map failed, we will ignore the value
 	 * anyway, so no need to add additional checks.
@@ -132,20 +121,17 @@  eal_get_virtual_area(void *requested_addr, size_t *size,
 
 	if (*size == 0) {
 		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
-			strerror(errno));
-		rte_errno = errno;
+			strerror(rte_errno));
 		return NULL;
-	} else if (mapped_addr == MAP_FAILED) {
+	} else if (mapped_addr == NULL) {
 		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
-			strerror(errno));
-		/* pass errno up the call chain */
-		rte_errno = errno;
+			strerror(rte_errno));
 		return NULL;
 	} else if (requested_addr != NULL && !addr_is_hint &&
 			aligned_addr != requested_addr) {
 		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
 			requested_addr, aligned_addr);
-		munmap(mapped_addr, map_sz);
+		eal_mem_free(mapped_addr, map_sz);
 		rte_errno = EADDRNOTAVAIL;
 		return NULL;
 	} else if (requested_addr != NULL && addr_is_hint &&
@@ -161,7 +147,7 @@  eal_get_virtual_area(void *requested_addr, size_t *size,
 		aligned_addr, *size);
 
 	if (unmap) {
-		munmap(mapped_addr, map_sz);
+		eal_mem_free(mapped_addr, map_sz);
 	} else if (!no_align) {
 		void *map_end, *aligned_end;
 		size_t before_len, after_len;
@@ -179,19 +165,17 @@  eal_get_virtual_area(void *requested_addr, size_t *size,
 		/* unmap space before aligned mmap address */
 		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
 		if (before_len > 0)
-			munmap(mapped_addr, before_len);
+			eal_mem_free(mapped_addr, before_len);
 
 		/* unmap space after aligned end mmap address */
 		after_len = RTE_PTR_DIFF(map_end, aligned_end);
 		if (after_len > 0)
-			munmap(aligned_end, after_len);
+			eal_mem_free(aligned_end, after_len);
 	}
 
 	if (!unmap) {
 		/* Exclude these pages from a core dump. */
-		if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
-			RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
-				strerror(errno));
+		eal_mem_set_dump(aligned_addr, *size, false);
 	}
 
 	return aligned_addr;
@@ -547,10 +531,10 @@  rte_eal_memdevice_init(void)
 int
 rte_mem_lock_page(const void *virt)
 {
-	unsigned long virtual = (unsigned long)virt;
-	int page_size = getpagesize();
-	unsigned long aligned = (virtual & ~(page_size - 1));
-	return mlock((void *)aligned, page_size);
+	uintptr_t virtual = (uintptr_t)virt;
+	size_t page_size = rte_get_page_size();
+	uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
+	return rte_mem_lock((void *)aligned, page_size);
 }
 
 int
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index cef73d6fe..a93850c09 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -11,6 +11,7 @@ 
 
 #include <rte_dev.h>
 #include <rte_lcore.h>
+#include <rte_memory.h>
 
 /**
  * Structure storing internal configuration (per-lcore)
@@ -202,6 +203,24 @@  int rte_eal_alarm_init(void);
  */
 int rte_eal_check_module(const char *module_name);
 
+/**
+ * Memory reservation flags.
+ */
+enum eal_mem_reserve_flags {
+	/**
+	 * Reserve hugepages. May be unsupported by some platforms.
+	 */
+	EAL_RESERVE_HUGEPAGES = 1 << 0,
+	/**
+	 * Force reserving memory at the requested address.
+	 * This can be a destructive action depending on the implementation.
+	 *
+	 * @see RTE_MAP_FORCE_ADDRESS for description of possible consequences
+	 *      (although implementations are not required to use it).
+	 */
+	EAL_RESERVE_FORCE_ADDRESS = 1 << 1
+};
+
 /**
  * Get virtual area of specified size from the OS.
  *
@@ -215,8 +234,8 @@  int rte_eal_check_module(const char *module_name);
  *   Page size on which to align requested virtual area.
  * @param flags
  *   EAL_VIRTUAL_AREA_* flags.
- * @param mmap_flags
- *   Extra flags passed directly to mmap().
+ * @param reserve_flags
+ *   Extra flags passed directly to eal_mem_reserve().
  *
  * @return
  *   Virtual area address if successful.
@@ -233,7 +252,7 @@  int rte_eal_check_module(const char *module_name);
 /**< immediately unmap reserved virtual area. */
 void *
 eal_get_virtual_area(void *requested_addr, size_t *size,
-		size_t page_sz, int flags, int mmap_flags);
+		size_t page_sz, int flags, int reserve_flags);
 
 /**
  * Get cpu core_id.
@@ -467,4 +486,57 @@  eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode);
 int
 eal_file_truncate(int fd, ssize_t size);
 
+/**
+ * Reserve a region of virtual memory.
+ *
+ * Use eal_mem_free() to free reserved memory.
+ *
+ * @param requested_addr
+ *  A desired reservation address, which must be page-aligned.
+ *  The system might not respect it.
+ *  NULL means the address will be chosen by the system.
+ * @param size
+ *  Reservation size. Must be a multiple of system page size.
+ * @param flags
+ *  Reservation options, a combination of eal_mem_reserve_flags.
+ * @return
+ *  Starting address of the reserved area on success, NULL on failure.
+ *  Callers must not access this memory until remapping it.
+ */
+void *
+eal_mem_reserve(void *requested_addr, size_t size, int flags);
+
+/**
+ * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
+ *
+ * If *virt* and *size* describe a part of the reserved region,
+ * only this part of the region is freed (accurately up to the system
+ * page size). If *virt* points to allocated memory, *size* must match
+ * the one specified on allocation. The behavior is undefined
+ * if the memory pointed by *virt* is obtained from another source
+ * than listed above.
+ *
+ * @param virt
+ *  A virtual address in a region previously reserved.
+ * @param size
+ *  Number of bytes to unreserve.
+ */
+void
+eal_mem_free(void *virt, size_t size);
+
+/**
+ * Configure memory region inclusion into core dumps.
+ *
+ * @param virt
+ *  Starting address of the region.
+ * @param size
+ *  Size of the region.
+ * @param dump
+ *  True to include memory into core dumps, false to exclude.
+ * @return
+ *  0 on success, (-1) on failure and rte_errno is set.
+ */
+int
+eal_mem_set_dump(void *virt, size_t size, bool dump);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/freebsd/Makefile b/lib/librte_eal/freebsd/Makefile
index 4654ca2b3..f64a3994c 100644
--- a/lib/librte_eal/freebsd/Makefile
+++ b/lib/librte_eal/freebsd/Makefile
@@ -77,6 +77,7 @@  SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
 
 # from unix dir
 SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix_memory.c
 
 # from arch dir
 SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
diff --git a/lib/librte_eal/include/rte_memory.h b/lib/librte_eal/include/rte_memory.h
index 65374d53a..63ff0773d 100644
--- a/lib/librte_eal/include/rte_memory.h
+++ b/lib/librte_eal/include/rte_memory.h
@@ -82,6 +82,94 @@  struct rte_memseg_list {
 	struct rte_fbarray memseg_arr;
 };
 
+/**
+ * Memory protection flags.
+ */
+enum rte_mem_prot {
+	RTE_PROT_READ = 1 << 0,   /**< Read access. */
+	RTE_PROT_WRITE = 1 << 1,  /**< Write access. */
+	RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
+};
+
+/**
+ * Additional flags for memory mapping.
+ */
+enum rte_map_flags {
+	/** Changes to the mapped memory are visible to other processes. */
+	RTE_MAP_SHARED = 1 << 0,
+	/** Mapping is not backed by a regular file. */
+	RTE_MAP_ANONYMOUS = 1 << 1,
+	/** Copy-on-write mapping, changes are invisible to other processes. */
+	RTE_MAP_PRIVATE = 1 << 2,
+	/**
+	 * Force mapping to the requested address. This flag should be used
+	 * with caution, because to fulfill the request implementation
+	 * may remove all other mappings in the requested region. However,
+	 * it is not required to do so, thus mapping with this flag may fail.
+	 */
+	RTE_MAP_FORCE_ADDRESS = 1 << 3
+};
+
+/**
+ * Map a portion of an opened file or the page file into memory.
+ *
+ * This function is similar to POSIX mmap(3) with common MAP_ANONYMOUS
+ * extension, except for the return value.
+ *
+ * @param requested_addr
+ *  Desired virtual address for mapping. Can be NULL to let OS choose.
+ * @param size
+ *  Size of the mapping in bytes.
+ * @param prot
+ *  Protection flags, a combination of rte_mem_prot values.
+ * @param flags
+ *  Additional mapping flags, a combination of rte_map_flags.
+ * @param fd
+ *  Mapped file descriptor. Can be negative for anonymous mapping.
+ * @param offset
+ *  Offset of the mapped region in fd. Must be 0 for anonymous mappings.
+ * @return
+ *  Mapped address or NULL on failure and rte_errno is set to OS error.
+ */
+__rte_experimental
+void *
+rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
+	int fd, size_t offset);
+
+/**
+ * OS-independent implementation of POSIX munmap(3).
+ */
+__rte_experimental
+int
+rte_mem_unmap(void *virt, size_t size);
+
+/**
+ * Get system page size. This function never fails.
+ *
+ * @return
+ *   Page size in bytes.
+ */
+__rte_experimental
+size_t
+rte_get_page_size(void);
+
+/**
+ * Lock in physical memory all pages crossed by the address region.
+ *
+ * @param virt
+ *   Base virtual address of the region.
+ * @param size
+ *   Size of the region.
+ * @return
+ *   0 on success, negative on error.
+ *
+ * @see rte_get_page_size() to retrieve the page size.
+ * @see rte_mem_lock_page() to lock an entire single page.
+ */
+__rte_experimental
+int
+rte_mem_lock(const void *virt, size_t size);
+
 /**
  * Lock page in physical memory and prevent from swapping.
  *
diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile
index 4f39d462c..d314648cb 100644
--- a/lib/librte_eal/linux/Makefile
+++ b/lib/librte_eal/linux/Makefile
@@ -84,6 +84,7 @@  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
 
 # from unix dir
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix_memory.c
 
 # from arch dir
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
diff --git a/lib/librte_eal/linux/eal_memalloc.c b/lib/librte_eal/linux/eal_memalloc.c
index 2c717f8bd..bf29b83c6 100644
--- a/lib/librte_eal/linux/eal_memalloc.c
+++ b/lib/librte_eal/linux/eal_memalloc.c
@@ -630,7 +630,7 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 mapped:
 	munmap(addr, alloc_sz);
 unmapped:
-	flags = MAP_FIXED;
+	flags = EAL_RESERVE_FORCE_ADDRESS;
 	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
 	if (new_addr != addr) {
 		if (new_addr != NULL)
@@ -687,8 +687,7 @@  free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 		return -1;
 	}
 
-	if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0)
-		RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno));
+	eal_mem_set_dump(ms->addr, ms->len, false);
 
 	exit_early = false;
 
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index d8038749a..dff51b13d 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -386,4 +386,10 @@  EXPERIMENTAL {
 	rte_trace_point_lookup;
 	rte_trace_regexp;
 	rte_trace_save;
+
+	# added in 20.08
+	rte_get_page_size;
+	rte_mem_lock;
+	rte_mem_map;
+	rte_mem_unmap;
 };
diff --git a/lib/librte_eal/unix/eal_unix_memory.c b/lib/librte_eal/unix/eal_unix_memory.c
new file mode 100644
index 000000000..658595b6e
--- /dev/null
+++ b/lib/librte_eal/unix/eal_unix_memory.c
@@ -0,0 +1,152 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Dmitry Kozlyuk
+ */
+
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_private.h"
+
+#ifdef RTE_EXEC_ENV_LINUX
+#define EAL_DONTDUMP MADV_DONTDUMP
+#define EAL_DODUMP   MADV_DODUMP
+#elif defined RTE_EXEC_ENV_FREEBSD
+#define EAL_DONTDUMP MADV_NOCORE
+#define EAL_DODUMP   MADV_CORE
+#else
+#error "madvise doesn't support this OS"
+#endif
+
+static void *
+mem_map(void *requested_addr, size_t size, int prot, int flags,
+	int fd, size_t offset)
+{
+	void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
+	if (virt == MAP_FAILED) {
+		RTE_LOG(DEBUG, EAL,
+			"Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
+			requested_addr, size, prot, flags, fd, offset,
+			strerror(errno));
+		rte_errno = errno;
+		return NULL;
+	}
+	return virt;
+}
+
+static int
+mem_unmap(void *virt, size_t size)
+{
+	int ret = munmap(virt, size);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
+			virt, size, strerror(errno));
+		rte_errno = errno;
+	}
+	return ret;
+}
+
+void *
+eal_mem_reserve(void *requested_addr, size_t size, int flags)
+{
+	int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	if (flags & EAL_RESERVE_HUGEPAGES) {
+#ifdef MAP_HUGETLB
+		sys_flags |= MAP_HUGETLB;
+#else
+		rte_errno = ENOTSUP;
+		return NULL;
+#endif
+	}
+
+	if (flags & EAL_RESERVE_FORCE_ADDRESS)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, PROT_NONE, sys_flags, -1, 0);
+}
+
+void
+eal_mem_free(void *virt, size_t size)
+{
+	mem_unmap(virt, size);
+}
+
+int
+eal_mem_set_dump(void *virt, size_t size, bool dump)
+{
+	int flags = dump ? EAL_DODUMP : EAL_DONTDUMP;
+	int ret = madvise(virt, size, flags);
+	if (ret) {
+		RTE_LOG(DEBUG, EAL, "madvise(%p, %#zx, %d) failed: %s\n",
+				virt, size, flags, strerror(errno));
+		rte_errno = errno;
+	}
+	return ret;
+}
+
+static int
+mem_rte_to_sys_prot(int prot)
+{
+	int sys_prot = PROT_NONE;
+
+	if (prot & RTE_PROT_READ)
+		sys_prot |= PROT_READ;
+	if (prot & RTE_PROT_WRITE)
+		sys_prot |= PROT_WRITE;
+	if (prot & RTE_PROT_EXECUTE)
+		sys_prot |= PROT_EXEC;
+
+	return sys_prot;
+}
+
+void *
+rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
+	int fd, size_t offset)
+{
+	int sys_flags = 0;
+	int sys_prot;
+
+	sys_prot = mem_rte_to_sys_prot(prot);
+
+	if (flags & RTE_MAP_SHARED)
+		sys_flags |= MAP_SHARED;
+	if (flags & RTE_MAP_ANONYMOUS)
+		sys_flags |= MAP_ANONYMOUS;
+	if (flags & RTE_MAP_PRIVATE)
+		sys_flags |= MAP_PRIVATE;
+	if (flags & RTE_MAP_FORCE_ADDRESS)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
+}
+
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+	return mem_unmap(virt, size);
+}
+
+size_t
+rte_get_page_size(void)
+{
+	static size_t page_size;
+
+	if (!page_size)
+		page_size = sysconf(_SC_PAGESIZE);
+
+	return page_size;
+}
+
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+	int ret = mlock(virt, size);
+	if (ret)
+		rte_errno = errno;
+	return ret;
+}
diff --git a/lib/librte_eal/unix/meson.build b/lib/librte_eal/unix/meson.build
index cfa1b4ef9..5734f26ad 100644
--- a/lib/librte_eal/unix/meson.build
+++ b/lib/librte_eal/unix/meson.build
@@ -3,4 +3,5 @@ 
 
 sources += files(
 	'eal_unix.c',
+	'eal_unix_memory.c',
 )