@@ -270,7 +270,7 @@ if is_windows
add_project_link_arguments('-lmincore', language: 'c')
endif
- add_project_link_arguments('-ladvapi32', language: 'c')
+ add_project_link_arguments('-ladvapi32', '-lsetupapi', language: 'c')
endif
if get_option('b_lto')
@@ -7,10 +7,10 @@ Running DPDK Applications
Grant *Lock pages in memory* Privilege
--------------------------------------
-Use of hugepages ("large pages" in Windows terminolocy) requires
+Use of hugepages ("large pages" in Windows terminology) requires
``SeLockMemoryPrivilege`` for the user running an application.
-1. Open *Local Security Policy* snap in, either:
+1. Open *Local Security Policy* snap-in, either:
* Control Panel / Computer Management / Local Security Policy;
* or Win+R, type ``secpol``, press Enter.
@@ -24,7 +24,44 @@ Use of hugepages ("large pages" in Windows terminolocy) requires
See `Large-Page Support`_ in MSDN for details.
-.. _Large-page Support: https://docs.microsoft.com/en-us/windows/win32/memory/large-page-support
+.. _Large-Page Support: https://docs.microsoft.com/en-us/windows/win32/memory/large-page-support
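+
+As an illustration of why this privilege matters (the sketch below is not part
+of DPDK; it uses plain Win32 APIs, and the token functions require linking
+with ``advapi32``), allocating memory with ``MEM_LARGE_PAGES`` fails unless
+``SeLockMemoryPrivilege`` is both granted to the user and enabled in the
+process token:
+
+.. code-block:: c
+
+   #include <windows.h>
+   #include <stdio.h>
+
+   int main(void)
+   {
+       HANDLE token;
+       TOKEN_PRIVILEGES tp;
+       void *mem;
+
+       /* Enable SeLockMemoryPrivilege; this only takes effect if the
+        * privilege was granted to the user via Local Security Policy.
+        */
+       OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &token);
+       LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid);
+       tp.PrivilegeCount = 1;
+       tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+       AdjustTokenPrivileges(token, FALSE, &tp, 0, NULL, NULL);
+       if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) {
+           printf("SeLockMemoryPrivilege is not granted to this user\n");
+           return 1;
+       }
+
+       /* Without the privilege this fails with ERROR_PRIVILEGE_NOT_HELD. */
+       mem = VirtualAlloc(NULL, GetLargePageMinimum(),
+           MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);
+       printf("Large page allocation %s\n", mem != NULL ? "succeeded" : "failed");
+       return 0;
+   }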
+
+
+Load virt2phys Driver
+---------------------
+
+Access to physical addresses is provided by a kernel-mode driver, virt2phys.
+It is mandatory at least for using hardware PMDs, but may also be required
+for mempools.
+
+Refer to the documentation in the ``dpdk-kmods`` repository for details on
+system setup, driver build, and installation. This driver is not signed,
+so signature checking must be disabled to load it.
+
+.. warning::
+
+ Disabling driver signature enforcement weakens OS security.
+ It is discouraged in production environments.
+
+The compiled package consists of ``virt2phys.inf``, ``virt2phys.cat``,
+and ``virt2phys.sys``. It can be installed as follows
+from an elevated command prompt:
+
+.. code-block:: console
+
+ pnputil /add-driver Z:\path\to\virt2phys.inf /install
+
+When loaded successfully, the driver is shown in *Device Manager* as a
+*Virtual to physical address translator* device under the *Kernel bypass*
+category. The installed driver persists across reboots.
+
+If DPDK is unable to communicate with the driver, a warning is printed
+on initialization (debug-level logs provide more details):
+
+.. code-block:: text
+
+ EAL: Cannot open virt2phys driver interface
+
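+An application can also verify at run time whether physical addresses are
+actually available. A minimal sketch of such a check (illustrative only,
+using ``rte_malloc()`` and ``rte_mem_virt2phy()`` exported by the Windows EAL,
+and assuming ``rte_eal_init()`` succeeds):
+
+.. code-block:: c
+
+   #include <stdio.h>
+
+   #include <rte_eal.h>
+   #include <rte_malloc.h>
+   #include <rte_memory.h>
+
+   int main(int argc, char **argv)
+   {
+       void *buf;
+
+       if (rte_eal_init(argc, argv) < 0)
+           return 1;
+
+       buf = rte_malloc(NULL, 4096, 0);
+       if (buf != NULL && rte_mem_virt2phy(buf) == RTE_BAD_PHYS_ADDR)
+           printf("virt2phys driver unavailable, PA cannot be used\n");
+
+       rte_free(buf);
+       return 0;
+   }
+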
Run the ``helloworld`` Example
@@ -5,15 +5,15 @@
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
-#include <sys/mman.h>
#include <stdint.h>
#include <errno.h>
-#include <sys/file.h>
#include <string.h>
+#include <unistd.h>
#include <rte_common.h>
-#include <rte_log.h>
#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>
@@ -85,19 +85,16 @@ resize_and_map(int fd, void *addr, size_t len)
char path[PATH_MAX];
void *map_addr;
- if (ftruncate(fd, len)) {
+ if (eal_file_truncate(fd, len)) {
RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path);
/* pass errno up the chain */
rte_errno = errno;
return -1;
}
- map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, fd, 0);
+ map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
+ RTE_MAP_SHARED | RTE_MAP_FIXED, fd, 0);
if (map_addr != addr) {
- RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
- /* pass errno up the chain */
- rte_errno = errno;
return -1;
}
return 0;
@@ -735,7 +732,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return -1;
}
- page_sz = sysconf(_SC_PAGESIZE);
+ page_sz = rte_get_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@@ -756,9 +753,12 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
if (internal_config.no_shconf) {
/* remap virtual area as writable */
- void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
- MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
- if (new_data == MAP_FAILED) {
+ void *new_data = rte_mem_map(
+ data, mmap_len,
+ RTE_PROT_READ | RTE_PROT_WRITE,
+ RTE_MAP_FIXED | RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS,
+ fd, 0);
+ if (new_data == NULL) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
__func__, strerror(errno));
goto fail;
@@ -778,7 +778,8 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
__func__, path, strerror(errno));
rte_errno = errno;
goto fail;
- } else if (flock(fd, LOCK_EX | LOCK_NB)) {
+ } else if (eal_file_lock(
+ fd, EAL_FLOCK_EXCLUSIVE, EAL_FLOCK_RETURN)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n",
__func__, path, strerror(errno));
rte_errno = EBUSY;
@@ -789,10 +790,8 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
* still attach to it, but no other process could reinitialize
* it.
*/
- if (flock(fd, LOCK_SH | LOCK_NB)) {
- rte_errno = errno;
+ if (eal_file_lock(fd, EAL_FLOCK_SHARED, EAL_FLOCK_RETURN))
goto fail;
- }
if (resize_and_map(fd, data, mmap_len))
goto fail;
@@ -824,7 +823,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return 0;
fail:
if (data)
- munmap(data, mmap_len);
+ rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@@ -862,7 +861,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return -1;
}
- page_sz = sysconf(_SC_PAGESIZE);
+ page_sz = rte_get_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@@ -895,10 +894,8 @@ rte_fbarray_attach(struct rte_fbarray *arr)
}
/* lock the file, to let others know we're using it */
- if (flock(fd, LOCK_SH | LOCK_NB)) {
- rte_errno = errno;
+ if (eal_file_lock(fd, EAL_FLOCK_SHARED, EAL_FLOCK_RETURN))
goto fail;
- }
if (resize_and_map(fd, data, mmap_len))
goto fail;
@@ -916,7 +913,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return 0;
fail:
if (data)
- munmap(data, mmap_len);
+ rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@@ -944,8 +941,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
- size_t page_sz = sysconf(_SC_PAGESIZE);
-
+ size_t page_sz = rte_get_page_size();
if (page_sz == (size_t)-1)
return -1;
@@ -964,7 +960,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
goto out;
}
- munmap(arr->data, mmap_len);
+ rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, close fd and remove the tailq entry */
if (tmp->fd >= 0)
@@ -999,8 +995,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
- size_t page_sz = sysconf(_SC_PAGESIZE);
-
+ size_t page_sz = rte_get_page_size();
if (page_sz == (size_t)-1)
return -1;
@@ -1025,7 +1020,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
* has been detached by all other processes
*/
fd = tmp->fd;
- if (flock(fd, LOCK_EX | LOCK_NB)) {
+ if (eal_file_lock(fd, EAL_FLOCK_EXCLUSIVE, EAL_FLOCK_RETURN)) {
RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n");
rte_errno = EBUSY;
ret = -1;
@@ -1042,14 +1037,14 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
* we're still holding an exclusive lock, so drop it to
* shared.
*/
- flock(fd, LOCK_SH | LOCK_NB);
+ eal_file_lock(fd, EAL_FLOCK_SHARED, EAL_FLOCK_RETURN);
ret = -1;
goto out;
}
close(fd);
}
- munmap(arr->data, mmap_len);
+ rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, remove the tailq entry */
TAILQ_REMOVE(&mem_area_tailq, tmp, next);
@@ -11,7 +11,6 @@
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
-#include <sys/mman.h>
#include <sys/queue.h>
#include <rte_fbarray.h>
@@ -44,7 +43,7 @@ static uint64_t system_page_sz;
#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
- size_t page_sz, int flags, int mmap_flags)
+ size_t page_sz, int flags, enum eal_mem_reserve_flags reserve_flags)
{
bool addr_is_hint, allow_shrink, unmap, no_align;
uint64_t map_sz;
@@ -52,9 +51,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
uint8_t try = 0;
if (system_page_sz == 0)
- system_page_sz = sysconf(_SC_PAGESIZE);
-
- mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+ system_page_sz = rte_get_page_size();
RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
@@ -98,24 +95,24 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return NULL;
}
- mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
- mmap_flags, -1, 0);
- if (mapped_addr == MAP_FAILED && allow_shrink)
- *size -= page_sz;
+ mapped_addr = eal_mem_reserve(
+ requested_addr, (size_t)map_sz, reserve_flags);
+ if ((mapped_addr == NULL) && allow_shrink)
+ *size -= page_sz;
- if (mapped_addr != MAP_FAILED && addr_is_hint &&
- mapped_addr != requested_addr) {
+ if ((mapped_addr != NULL) && addr_is_hint &&
+ (mapped_addr != requested_addr)) {
try++;
next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
/* hint was not used. Try with another offset */
- munmap(mapped_addr, map_sz);
- mapped_addr = MAP_FAILED;
+ eal_mem_free(mapped_addr, map_sz);
+ mapped_addr = NULL;
requested_addr = next_baseaddr;
}
}
} while ((allow_shrink || addr_is_hint) &&
- mapped_addr == MAP_FAILED && *size > 0);
+ (mapped_addr == NULL) && (*size > 0));
/* align resulting address - if map failed, we will ignore the value
* anyway, so no need to add additional checks.
@@ -125,20 +122,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
if (*size == 0) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
- strerror(errno));
- rte_errno = errno;
+ strerror(rte_errno));
return NULL;
- } else if (mapped_addr == MAP_FAILED) {
+ } else if (mapped_addr == NULL) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
- strerror(errno));
- /* pass errno up the call chain */
- rte_errno = errno;
+ strerror(rte_errno));
return NULL;
} else if (requested_addr != NULL && !addr_is_hint &&
aligned_addr != requested_addr) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
requested_addr, aligned_addr);
- munmap(mapped_addr, map_sz);
+ eal_mem_free(mapped_addr, map_sz);
rte_errno = EADDRNOTAVAIL;
return NULL;
} else if (requested_addr != NULL && addr_is_hint &&
@@ -154,7 +148,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
aligned_addr, *size);
if (unmap) {
- munmap(mapped_addr, map_sz);
+ eal_mem_free(mapped_addr, map_sz);
} else if (!no_align) {
void *map_end, *aligned_end;
size_t before_len, after_len;
@@ -172,12 +166,12 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
/* unmap space before aligned mmap address */
before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
if (before_len > 0)
- munmap(mapped_addr, before_len);
+ eal_mem_free(mapped_addr, before_len);
/* unmap space after aligned end mmap address */
after_len = RTE_PTR_DIFF(map_end, aligned_end);
if (after_len > 0)
- munmap(aligned_end, after_len);
+ eal_mem_free(aligned_end, after_len);
}
return aligned_addr;
@@ -586,10 +580,10 @@ rte_eal_memdevice_init(void)
int
rte_mem_lock_page(const void *virt)
{
- unsigned long virtual = (unsigned long)virt;
- int page_size = getpagesize();
- unsigned long aligned = (virtual & ~(page_size - 1));
- return mlock((void *)aligned, page_size);
+ uintptr_t virtual = (uintptr_t)virt;
+ int page_size = rte_get_page_size();
+ uintptr_t aligned = (virtual & ~(page_size - 1));
+ return rte_mem_lock((void *)aligned, page_size);
}
int
@@ -226,8 +226,8 @@ enum eal_mem_reserve_flags {
* Page size on which to align requested virtual area.
* @param flags
* EAL_VIRTUAL_AREA_* flags.
- * @param mmap_flags
- * Extra flags passed directly to mmap().
+ * @param reserve_flags
+ * Extra flags passed directly to rte_mem_reserve().
*
* @return
* Virtual area address if successful.
@@ -244,7 +244,7 @@ enum eal_mem_reserve_flags {
/**< immediately unmap reserved virtual area. */
void *
eal_get_virtual_area(void *requested_addr, size_t *size, size_t page_sz,
- int flags, int mmap_flags);
+ int flags, enum eal_mem_reserve_flags reserve_flags);
/**
* Reserve VA space for a memory segment list.
@@ -729,6 +729,7 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
if (ret != NULL)
return ret;
}
+
return NULL;
}
@@ -9,11 +9,20 @@ if is_windows
'eal_common_class.c',
'eal_common_devargs.c',
'eal_common_errno.c',
+ 'eal_common_fbarray.c',
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
+ 'eal_common_mcfg.c',
+ 'eal_common_memalloc.c',
+ 'eal_common_memory.c',
+ 'eal_common_memzone.c',
'eal_common_options.c',
+ 'eal_common_tailqs.c',
'eal_common_thread.c',
+ 'malloc_elem.c',
+ 'malloc_heap.c',
+ 'rte_malloc.c',
'rte_option.c',
)
subdir_done()
@@ -355,7 +355,6 @@ memseg_list_reserve(struct rte_memseg_list *msl)
return eal_reserve_memseg_list(msl, flags);
}
-
static int
memseg_primary_init(void)
{
@@ -1,13 +1,128 @@
EXPORTS
__rte_panic
+ rte_calloc
+ rte_calloc_socket
rte_eal_get_configuration
+ rte_eal_has_hugepages
rte_eal_init
+ rte_eal_iova_mode
rte_eal_mp_remote_launch
rte_eal_mp_wait_lcore
+ rte_eal_process_type
rte_eal_remote_launch
- rte_get_page_size
+ rte_eal_tailq_lookup
+ rte_eal_tailq_register
+ rte_eal_using_phys_addrs
+ rte_free
rte_log
+ rte_malloc
+ rte_malloc_dump_stats
+ rte_malloc_get_socket_stats
+ rte_malloc_set_limit
+ rte_malloc_socket
+ rte_malloc_validate
+ rte_malloc_virt2iova
+ rte_mcfg_mem_read_lock
+ rte_mcfg_mem_read_unlock
+ rte_mcfg_mem_write_lock
+ rte_mcfg_mem_write_unlock
+ rte_mcfg_mempool_read_lock
+ rte_mcfg_mempool_read_unlock
+ rte_mcfg_mempool_write_lock
+ rte_mcfg_mempool_write_unlock
+ rte_mcfg_tailq_read_lock
+ rte_mcfg_tailq_read_unlock
+ rte_mcfg_tailq_write_lock
+ rte_mcfg_tailq_write_unlock
+ rte_mem_lock_page
+ rte_mem_virt2iova
+ rte_mem_virt2phy
+ rte_memory_get_nchannel
+ rte_memory_get_nrank
+ rte_memzone_dump
+ rte_memzone_free
+ rte_memzone_lookup
+ rte_memzone_reserve
+ rte_memzone_reserve_aligned
+ rte_memzone_reserve_bounded
+ rte_memzone_walk
+ rte_vlog
+ rte_realloc
+ rte_zmalloc
+ rte_zmalloc_socket
+
+ rte_mp_action_register
+ rte_mp_action_unregister
+ rte_mp_reply
+ rte_mp_sendmsg
+
+ rte_fbarray_attach
+ rte_fbarray_destroy
+ rte_fbarray_detach
+ rte_fbarray_dump_metadata
+ rte_fbarray_find_contig_free
+ rte_fbarray_find_contig_used
+ rte_fbarray_find_idx
+ rte_fbarray_find_next_free
+ rte_fbarray_find_next_n_free
+ rte_fbarray_find_next_n_used
+ rte_fbarray_find_next_used
+ rte_fbarray_get
+ rte_fbarray_init
+ rte_fbarray_is_used
+ rte_fbarray_set_free
+ rte_fbarray_set_used
+ rte_malloc_dump_heaps
+ rte_mem_alloc_validator_register
+ rte_mem_alloc_validator_unregister
+ rte_mem_check_dma_mask
+ rte_mem_event_callback_register
+ rte_mem_event_callback_unregister
+ rte_mem_iova2virt
+ rte_mem_virt2memseg
+ rte_mem_virt2memseg_list
+ rte_memseg_contig_walk
+ rte_memseg_list_walk
+ rte_memseg_walk
+ rte_mp_request_async
+ rte_mp_request_sync
+
+ rte_fbarray_find_prev_free
+ rte_fbarray_find_prev_n_free
+ rte_fbarray_find_prev_n_used
+ rte_fbarray_find_prev_used
+ rte_fbarray_find_rev_contig_free
+ rte_fbarray_find_rev_contig_used
+ rte_memseg_contig_walk_thread_unsafe
+ rte_memseg_list_walk_thread_unsafe
+ rte_memseg_walk_thread_unsafe
+
+ rte_malloc_heap_create
+ rte_malloc_heap_destroy
+ rte_malloc_heap_get_socket
+ rte_malloc_heap_memory_add
+ rte_malloc_heap_memory_attach
+ rte_malloc_heap_memory_detach
+ rte_malloc_heap_memory_remove
+ rte_malloc_heap_socket_is_external
+ rte_mem_check_dma_mask_thread_unsafe
+ rte_mem_set_dma_mask
+ rte_memseg_get_fd
+ rte_memseg_get_fd_offset
+ rte_memseg_get_fd_offset_thread_unsafe
+ rte_memseg_get_fd_thread_unsafe
+
+ rte_extmem_attach
+ rte_extmem_detach
+ rte_extmem_register
+ rte_extmem_unregister
+
+ rte_fbarray_find_biggest_free
+ rte_fbarray_find_biggest_used
+ rte_fbarray_find_rev_biggest_free
+ rte_fbarray_find_rev_biggest_used
+
+ rte_get_page_size
rte_mem_lock
rte_mem_map
rte_mem_unmap
- rte_vlog
@@ -93,6 +93,24 @@ eal_proc_type_detect(void)
return ptype;
}
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+ return rte_config.process_type;
+}
+
+int
+rte_eal_has_hugepages(void)
+{
+ return !internal_config.no_hugetlbfs;
+}
+
+enum rte_iova_mode
+rte_eal_iova_mode(void)
+{
+ return rte_config.iova_mode;
+}
+
/* display usage */
static void
eal_usage(const char *prgname)
@@ -328,6 +346,13 @@ rte_eal_init(int argc, char **argv)
if (fctret < 0)
exit(1);
+ /* Prevent creation of shared memory files. */
+ if (internal_config.no_shconf == 0) {
+ RTE_LOG(WARNING, EAL, "Multi-process support is requested, "
+ "but not available.\n");
+ internal_config.no_shconf = 1;
+ }
+
if (!internal_config.no_hugetlbfs && (eal_hugepage_info_init() < 0)) {
rte_eal_init_alert("Cannot get hugepage information");
rte_errno = EACCES;
@@ -345,6 +370,36 @@ rte_eal_init(int argc, char **argv)
return -1;
}
+ if (eal_mem_virt2iova_init() < 0) {
+ /* Non-fatal error if physical addresses are not required. */
+ RTE_LOG(WARNING, EAL, "Cannot access virt2phys driver, "
+ "PA will not be available\n");
+ }
+
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_memory_init() < 0) {
+ rte_eal_init_alert("Cannot init memory");
+ rte_errno = ENOMEM;
+ return -1;
+ }
+
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_tailqs_init() < 0) {
+ rte_eal_init_alert("Cannot init tail queues for objects");
+ rte_errno = EFAULT;
+ return -1;
+ }
+
eal_thread_init_master(rte_config.master_lcore);
RTE_LCORE_FOREACH_SLAVE(i) {
new file mode 100644
@@ -0,0 +1,418 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+#include <rte_errno.h>
+#include <rte_os.h>
+#include <rte_windows.h>
+
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_private.h"
+#include "eal_windows.h"
+
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+ /* Hugepages have no associated files in Windows. */
+ RTE_SET_USED(list_idx);
+ RTE_SET_USED(seg_idx);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+ /* Hugepages have no associated files in Windows. */
+ RTE_SET_USED(list_idx);
+ RTE_SET_USED(seg_idx);
+ RTE_SET_USED(offset);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+static int
+alloc_seg(struct rte_memseg *ms, void *requested_addr, int socket_id,
+ struct hugepage_info *hi)
+{
+ HANDLE current_process;
+ unsigned int numa_node;
+ size_t alloc_sz;
+ void *addr;
+ rte_iova_t iova = RTE_BAD_IOVA;
+ PSAPI_WORKING_SET_EX_INFORMATION info;
+ PSAPI_WORKING_SET_EX_BLOCK *page;
+
+ if (ms->len > 0) {
+ /* If a segment is already allocated as needed, return it. */
+ if ((ms->addr == requested_addr) &&
+ (ms->socket_id == socket_id) &&
+ (ms->hugepage_sz == hi->hugepage_sz)) {
+ return 0;
+ }
+
+ /* Bugcheck, should not happen. */
+ RTE_LOG(DEBUG, EAL, "Attempted to reallocate segment %p "
+ "(size %zu) on socket %d", ms->addr,
+ ms->len, ms->socket_id);
+ return -1;
+ }
+
+ current_process = GetCurrentProcess();
+ numa_node = eal_socket_numa_node(socket_id);
+ alloc_sz = hi->hugepage_sz;
+
+ if (requested_addr == NULL) {
+ /* Request a new chunk of memory from OS. */
+ addr = eal_mem_alloc_socket(alloc_sz, socket_id);
+ if (addr == NULL) {
+ RTE_LOG(DEBUG, EAL, "Cannot allocate %zu bytes "
+ "on socket %d\n", alloc_sz, socket_id);
+ return -1;
+ }
+ } else {
+ /* Requested address is already reserved, commit memory. */
+ addr = eal_mem_commit(requested_addr, alloc_sz, socket_id);
+ if (addr == NULL) {
+ RTE_LOG(DEBUG, EAL, "Cannot commit reserved memory %p "
+ "(size %zu) on socket %d\n",
+ requested_addr, alloc_sz, socket_id);
+ return -1;
+ }
+ }
+
+ /* Force OS to allocate a physical page and select a NUMA node.
+ * Hugepages are not pageable in Windows, so there's no race
+ * for physical address.
+ */
+ *(volatile int *)addr = *(volatile int *)addr;
+
+ /* Only try to obtain IOVA if it's available, so that applications
+ * that do not need IOVA can use this allocator.
+ */
+ if (rte_eal_using_phys_addrs()) {
+ iova = rte_mem_virt2iova(addr);
+ if (iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL,
+ "Cannot get IOVA of allocated segment\n");
+ goto error;
+ }
+ }
+
+ /* Only "Ex" function can handle hugepages. */
+ info.VirtualAddress = addr;
+ if (!QueryWorkingSetEx(current_process, &info, sizeof(info))) {
+ RTE_LOG_WIN32_ERR("QueryWorkingSetEx()");
+ goto error;
+ }
+
+ page = &info.VirtualAttributes;
+ if (!page->Valid || !page->LargePage) {
+ RTE_LOG(DEBUG, EAL, "Got regular page instead of a hugepage\n");
+ goto error;
+ }
+ if (page->Node != numa_node) {
+ RTE_LOG(DEBUG, EAL,
+ "NUMA node hint %u (socket %d) not respected, got %u\n",
+ numa_node, socket_id, page->Node);
+ goto error;
+ }
+
+ ms->addr = addr;
+ ms->hugepage_sz = hi->hugepage_sz;
+ ms->len = alloc_sz;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->iova = iova;
+ ms->socket_id = socket_id;
+
+ return 0;
+
+error:
+ /* Only jump here when `addr` and `alloc_sz` are valid. */
+ eal_mem_decommit(addr, alloc_sz);
+ return -1;
+}
+
+static int
+free_seg(struct rte_memseg *ms)
+{
+ if (eal_mem_decommit(ms->addr, ms->len))
+ return -1;
+
+ /* Must clear the segment, because alloc_seg() inspects it. */
+ memset(ms, 0, sizeof(*ms));
+ return 0;
+}
+
+struct alloc_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg **ms;
+ size_t page_sz;
+ unsigned int segs_allocated;
+ unsigned int n_segs;
+ int socket;
+ bool exact;
+};
+
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct alloc_walk_param *wa = arg;
+ struct rte_memseg_list *cur_msl;
+ size_t page_sz;
+ int cur_idx, start_idx, j;
+ unsigned int msl_idx, need, i;
+
+ if (msl->page_sz != wa->page_sz)
+ return 0;
+ if (msl->socket_id != wa->socket)
+ return 0;
+
+ page_sz = (size_t)msl->page_sz;
+
+ msl_idx = msl - mcfg->memsegs;
+ cur_msl = &mcfg->memsegs[msl_idx];
+
+ need = wa->n_segs;
+
+ /* try finding space in memseg list */
+ if (wa->exact) {
+ /* if we require exact number of pages in a list, find them */
+ cur_idx = rte_fbarray_find_next_n_free(
+ &cur_msl->memseg_arr, 0, need);
+ if (cur_idx < 0)
+ return 0;
+ start_idx = cur_idx;
+ } else {
+ int cur_len;
+
+ /* we don't require exact number of pages, so we're going to go
+ * for best-effort allocation. that means finding the biggest
+ * unused block, and going with that.
+ */
+ cur_idx = rte_fbarray_find_biggest_free(
+ &cur_msl->memseg_arr, 0);
+ if (cur_idx < 0)
+ return 0;
+ start_idx = cur_idx;
+ /* adjust the size to possibly be smaller than original
+ * request, but do not allow it to be bigger.
+ */
+ cur_len = rte_fbarray_find_contig_free(
+ &cur_msl->memseg_arr, cur_idx);
+ need = RTE_MIN(need, (unsigned int)cur_len);
+ }
+
+ for (i = 0; i < need; i++, cur_idx++) {
+ struct rte_memseg *cur;
+ void *map_addr;
+
+ cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+ map_addr = RTE_PTR_ADD(cur_msl->base_va, cur_idx * page_sz);
+
+ if (alloc_seg(cur, map_addr, wa->socket, wa->hi)) {
+ RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, "
+ "but only %i were allocated\n", need, i);
+
+ /* if exact number wasn't requested, stop */
+ if (!wa->exact)
+ goto out;
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr = &cur_msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ rte_fbarray_set_free(arr, j);
+
+ if (free_seg(tmp))
+ RTE_LOG(DEBUG, EAL, "Cannot free page\n");
+ }
+ /* clear the list */
+ if (wa->ms)
+ memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+ return -1;
+ }
+ if (wa->ms)
+ wa->ms[i] = cur;
+
+ rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+ }
+
+out:
+ wa->segs_allocated = i;
+ if (i > 0)
+ cur_msl->version++;
+
+ /* if we didn't allocate any segments, move on to the next list */
+ return i > 0;
+}
+
+struct free_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *found_msl;
+ struct free_walk_param *wa = arg;
+ uintptr_t start_addr, end_addr;
+ int msl_idx, seg_idx, ret;
+
+ start_addr = (uintptr_t) msl->base_va;
+ end_addr = start_addr + msl->len;
+
+ if ((uintptr_t)wa->ms->addr < start_addr ||
+ (uintptr_t)wa->ms->addr >= end_addr)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+
+ /* msl is const */
+ found_msl = &mcfg->memsegs[msl_idx];
+ found_msl->version++;
+
+ rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
+ ret = free_seg(wa->ms);
+
+ return (ret < 0) ? (-1) : 1;
+}
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs,
+ size_t page_sz, int socket, bool exact)
+{
+ unsigned int i;
+ int ret = -1;
+ struct alloc_walk_param wa;
+ struct hugepage_info *hi = NULL;
+
+ if (internal_config.legacy_mem) {
+ RTE_LOG(ERR, EAL, "dynamic allocation not supported in legacy mode\n");
+ return -ENOTSUP;
+ }
+
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+ if (page_sz == hpi->hugepage_sz) {
+ hi = hpi;
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "cannot find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ memset(&wa, 0, sizeof(wa));
+ wa.exact = exact;
+ wa.hi = hi;
+ wa.ms = ms;
+ wa.n_segs = n_segs;
+ wa.page_sz = page_sz;
+ wa.socket = socket;
+ wa.segs_allocated = 0;
+
+ /* memalloc is locked, so it's safe to use thread-unsafe version */
+ ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "cannot find suitable memseg_list\n");
+ ret = -1;
+ } else if (ret > 0) {
+ ret = (int)wa.segs_allocated;
+ }
+
+ return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+ struct rte_memseg *ms = NULL;
+ eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true);
+ return ms;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+ int seg, ret = 0;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (seg = 0; seg < n_segs; seg++) {
+ struct rte_memseg *cur = ms[seg];
+ struct hugepage_info *hi = NULL;
+ struct free_walk_param wa;
+ size_t i;
+ int walk_res;
+
+ /* if this page is marked as unfreeable, fail */
+ if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+ RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+ ret = -1;
+ continue;
+ }
+
+ memset(&wa, 0, sizeof(wa));
+
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ hi = &internal_config.hugepage_info[i];
+ if (cur->hugepage_sz == hi->hugepage_sz)
+ break;
+ }
+ if (i == RTE_DIM(internal_config.hugepage_info)) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ ret = -1;
+ continue;
+ }
+
+ wa.ms = cur;
+ wa.hi = hi;
+
+ /* memalloc is locked, so it's safe to use thread-unsafe version
+ */
+ walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
+ &wa);
+ if (walk_res == 1)
+ continue;
+ if (walk_res == 0)
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ ret = -1;
+ }
+ return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+ return eal_memalloc_free_seg_bulk(&ms, 1);
+}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ /* No multi-process support. */
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+eal_memalloc_init(void)
+{
+ /* No action required. */
+ return 0;
+}
@@ -1,11 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2010-2014 Intel Corporation (functions from Linux EAL)
+ * Copyright (c) 2020 Dmitry Kozlyuk (Windows specifics)
+ */
+
+#include <inttypes.h>
#include <io.h>
#include <rte_errno.h>
#include <rte_memory.h>
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_options.h"
#include "eal_private.h"
#include "eal_windows.h"
+#include <rte_virt2phys.h>
+
/* MinGW-w64 headers lack VirtualAlloc2() in some distributions.
* Provide a copy of definitions and code to load it dynamically.
* Note: definitions are copied verbatim from Microsoft documentation
@@ -120,6 +132,119 @@ eal_mem_win32api_init(void)
#endif /* no VirtualAlloc2() */
+static HANDLE virt2phys_device = INVALID_HANDLE_VALUE;
+
+int
+eal_mem_virt2iova_init(void)
+{
+ HDEVINFO list = INVALID_HANDLE_VALUE;
+ SP_DEVICE_INTERFACE_DATA ifdata;
+ SP_DEVICE_INTERFACE_DETAIL_DATA *detail = NULL;
+ DWORD detail_size;
+ int ret = -1;
+
+ list = SetupDiGetClassDevs(
+ &GUID_DEVINTERFACE_VIRT2PHYS, NULL, NULL,
+ DIGCF_DEVICEINTERFACE | DIGCF_PRESENT);
+ if (list == INVALID_HANDLE_VALUE) {
+ RTE_LOG_WIN32_ERR("SetupDiGetClassDevs()");
+ goto exit;
+ }
+
+ ifdata.cbSize = sizeof(ifdata);
+ if (!SetupDiEnumDeviceInterfaces(
+ list, NULL, &GUID_DEVINTERFACE_VIRT2PHYS, 0, &ifdata)) {
+ RTE_LOG_WIN32_ERR("SetupDiEnumDeviceInterfaces()");
+ goto exit;
+ }
+
+ if (!SetupDiGetDeviceInterfaceDetail(
+ list, &ifdata, NULL, 0, &detail_size, NULL)) {
+ if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ RTE_LOG_WIN32_ERR(
+ "SetupDiGetDeviceInterfaceDetail(probe)");
+ goto exit;
+ }
+ }
+
+ detail = malloc(detail_size);
+ if (detail == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate virt2phys "
+ "device interface detail data\n");
+ goto exit;
+ }
+
+ detail->cbSize = sizeof(*detail);
+ if (!SetupDiGetDeviceInterfaceDetail(
+ list, &ifdata, detail, detail_size, NULL, NULL)) {
+ RTE_LOG_WIN32_ERR("SetupDiGetDeviceInterfaceDetail(read)");
+ goto exit;
+ }
+
+ RTE_LOG(DEBUG, EAL, "Found virt2phys device: %s\n", detail->DevicePath);
+
+ virt2phys_device = CreateFile(
+ detail->DevicePath, 0, 0, NULL, OPEN_EXISTING, 0, NULL);
+ if (virt2phys_device == INVALID_HANDLE_VALUE) {
+ RTE_LOG_WIN32_ERR("CreateFile()");
+ goto exit;
+ }
+
+ /* Indicate success. */
+ ret = 0;
+
+exit:
+ if (detail != NULL)
+ free(detail);
+ if (list != INVALID_HANDLE_VALUE)
+ SetupDiDestroyDeviceInfoList(list);
+
+ return ret;
+}
+
+phys_addr_t
+rte_mem_virt2phy(const void *virt)
+{
+ LARGE_INTEGER phys;
+ DWORD bytes_returned;
+
+ if (virt2phys_device == INVALID_HANDLE_VALUE)
+ return RTE_BAD_PHYS_ADDR;
+
+ if (!DeviceIoControl(
+ virt2phys_device, IOCTL_VIRT2PHYS_TRANSLATE,
+ &virt, sizeof(virt), &phys, sizeof(phys),
+ &bytes_returned, NULL)) {
+ RTE_LOG_WIN32_ERR("DeviceIoControl(IOCTL_VIRT2PHYS_TRANSLATE)");
+ return RTE_BAD_PHYS_ADDR;
+ }
+
+ return phys.QuadPart;
+}
+
+/* Windows currently only supports IOVA as PA. */
+rte_iova_t
+rte_mem_virt2iova(const void *virt)
+{
+ phys_addr_t phys;
+
+ if (virt2phys_device == INVALID_HANDLE_VALUE)
+ return RTE_BAD_IOVA;
+
+ phys = rte_mem_virt2phy(virt);
+ if (phys == RTE_BAD_PHYS_ADDR)
+ return RTE_BAD_IOVA;
+
+ return (rte_iova_t)phys;
+}
+
+/* Always using physical addresses under Windows if they can be obtained. */
+int
+rte_eal_using_phys_addrs(void)
+{
+ return virt2phys_device != INVALID_HANDLE_VALUE;
+}
+
/* Approximate error mapping from VirtualAlloc2() to POSIX mmap(3). */
static void
set_errno_from_win32_alloc_error(DWORD code)
@@ -253,6 +378,10 @@ eal_mem_commit(void *requested_addr, size_t size, int socket_id)
int
eal_mem_decommit(void *addr, size_t size)
{
+ /* Decommit memory, which might be a part of a larger reserved region.
+ * The allocator commits hugepage-sized placeholders, so there is no need
+ * to coalesce placeholders back into a region; they can be reused as is.
+ */
if (!VirtualFree(addr, size, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER)) {
RTE_LOG_WIN32_ERR("VirtualFree(%p, %zu, ...)", addr, size);
return -1;
@@ -364,7 +493,7 @@ rte_mem_map(void *requested_addr, size_t size, enum rte_mem_prot prot,
return NULL;
}
- /* TODO: there is a race for the requested_addr between mem_free()
+ /* There is a race for the requested_addr between mem_free()
* and MapViewOfFileEx(). MapViewOfFile3() that can replace a reserved
* region with a mapping in a single operation, but it does not support
* private mappings.
@@ -414,6 +543,16 @@ rte_mem_unmap(void *virt, size_t size)
return 0;
}
+uint64_t
+eal_get_baseaddr(void)
+{
+ /* The Windows memory allocation strategy is undocumented.
+ * Returning 0 here effectively disables address guessing
+ * unless the user provides an address hint.
+ */
+ return 0;
+}
+
int
rte_get_page_size(void)
{
@@ -435,3 +574,568 @@ rte_mem_lock(const void *virt, size_t size)
return 0;
}
+
+static int
+memseg_list_alloc(struct rte_memseg_list *msl, uint64_t page_sz,
+ int n_segs, int socket_id, int type_msl_idx)
+{
+ return eal_alloc_memseg_list(
+ msl, page_sz, n_segs, socket_id, type_msl_idx, true);
+}
+
+static int
+memseg_list_reserve(struct rte_memseg_list *msl)
+{
+ return eal_reserve_memseg_list(msl, 0);
+}
+
+/*
+ * Remaining code in this file largely duplicates Linux EAL.
+ * Although Windows EAL supports only one hugepage size currently,
+ * code structure and comments are preserved so that changes may be
+ * easily ported until duplication is removed.
+ */
+
+static int
+memseg_primary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct memtype {
+ uint64_t page_sz;
+ int socket_id;
+ } *memtypes = NULL;
+ int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
+ struct rte_memseg_list *msl;
+ uint64_t max_mem, max_mem_per_type;
+ unsigned int max_seglists_per_type;
+ unsigned int n_memtypes, cur_type;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ /*
+ * figuring out amount of memory we're going to have is a long and very
+ * involved process. the basic element we're operating with is a memory
+ * type, defined as a combination of NUMA node ID and page size (so that
+ * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
+ *
+ * deciding amount of memory going towards each memory type is a
+ * balancing act between maximum segments per type, maximum memory per
+ * type, and number of detected NUMA nodes. the goal is to make sure
+ * each memory type gets at least one memseg list.
+ *
+ * the total amount of memory is limited by RTE_MAX_MEM_MB value.
+ *
+ * the total amount of memory per type is limited by either
+ * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+ * of detected NUMA nodes. additionally, maximum number of segments per
+ * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
+ * smaller page sizes, it can take hundreds of thousands of segments to
+ * reach the above specified per-type memory limits.
+ *
+ * additionally, each type may have multiple memseg lists associated
+ * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+ * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+ *
+ * the number of memseg lists per type is decided based on the above
+ * limits, and also taking number of detected NUMA nodes, to make sure
+ * that we don't run out of memseg lists before we populate all NUMA
+ * nodes with memory.
+ *
+ * we do this in three stages. first, we collect the number of types.
+ * then, we figure out memory constraints and populate the list of
+ * would-be memseg lists. then, we go ahead and allocate the memseg
+ * lists.
+ */
+
+ /* create space for mem types */
+ n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+ memtypes = calloc(n_memtypes, sizeof(*memtypes));
+ if (memtypes == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+ return -1;
+ }
+
+ /* populate mem types */
+ cur_type = 0;
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+ int socket_id = rte_socket_id_by_idx(i);
+
+ memtypes[cur_type].page_sz = hugepage_sz;
+ memtypes[cur_type].socket_id = socket_id;
+
+ RTE_LOG(DEBUG, EAL, "Detected memory type: "
+ "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+ socket_id, hugepage_sz);
+ }
+ }
+ /* number of memtypes could have been lower due to no NUMA support */
+ n_memtypes = cur_type;
+
+ /* set up limits for types */
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+ max_mem / n_memtypes);
+
+ /*
+ * limit maximum number of segment lists per type to ensure there's
+ * space for memseg lists for all NUMA nodes with all page sizes
+ */
+ max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
+
+ if (max_seglists_per_type == 0) {
+ RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+
+ /* go through all mem types and create segment lists */
+ msl_idx = 0;
+ for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+ unsigned int cur_seglist, n_seglists, n_segs;
+ unsigned int max_segs_per_type, max_segs_per_list;
+ struct memtype *type = &memtypes[cur_type];
+ uint64_t max_mem_per_list, pagesz;
+ int socket_id;
+
+ pagesz = type->page_sz;
+ socket_id = type->socket_id;
+
+ /*
+ * we need to create segment lists for this type. we must take
+ * into account the following things:
+ *
+ * 1. total amount of memory we can use for this memory type
+ * 2. total amount of memory per memseg list allowed
+ * 3. number of segments needed to fit the amount of memory
+ * 4. number of segments allowed per type
+ * 5. number of segments allowed per memseg list
+ * 6. number of memseg lists we are allowed to take up
+ */
+
+ /* calculate how much segments we will need in total */
+ max_segs_per_type = max_mem_per_type / pagesz;
+ /* limit number of segments to maximum allowed per type */
+ max_segs_per_type = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+ /* limit number of segments to maximum allowed per list */
+ max_segs_per_list = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+ /* calculate how much memory we can have per segment list */
+ max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+ (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+ /* calculate how many segments each segment list will have */
+ n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+ /* calculate how many segment lists we can have */
+ n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+ max_mem_per_type / max_mem_per_list);
+
+ /* limit number of segment lists according to our maximum */
+ n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
+
+ RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+ "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+ n_seglists, n_segs, socket_id, pagesz);
+
+ /* create all segment lists */
+ for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (memseg_list_alloc(msl, pagesz, n_segs,
+ socket_id, cur_seglist))
+ goto out;
+
+ if (memseg_list_reserve(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ goto out;
+ }
+ }
+ }
+ /* we're successful */
+ ret = 0;
+out:
+ free(memtypes);
+ return ret;
+}
+
+static int
+memseg_secondary_init(void)
+{
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+rte_eal_memseg_init(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return memseg_primary_init();
+ return memseg_secondary_init();
+}
+
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+ uint64_t size = 0;
+ unsigned int i;
+
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+ size += hpi->hugepage_sz * hpi->num_pages[socket];
+ }
+
+ return size;
+}
+
+static int
+calc_num_pages_per_socket(uint64_t *memory,
+ struct hugepage_info *hp_info,
+ struct hugepage_info *hp_used,
+ unsigned int num_hp_info)
+{
+ unsigned int socket, j, i = 0;
+ unsigned int requested, available;
+ int total_num_pages = 0;
+ uint64_t remaining_mem, cur_mem;
+ uint64_t total_mem = internal_config.memory;
+
+ if (num_hp_info == 0)
+ return -1;
+
+ /* if specific memory amounts per socket weren't requested */
+ if (internal_config.force_sockets == 0) {
+ size_t total_size;
+ int cpu_per_socket[RTE_MAX_NUMA_NODES];
+ size_t default_size;
+ unsigned int lcore_id;
+
+ /* Compute number of cores per socket */
+ memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
+ RTE_LCORE_FOREACH(lcore_id) {
+ cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
+ }
+
+ /*
+ * Automatically spread requested memory amongst detected
+ * sockets according to number of cores from cpu mask present
+ * on each socket.
+ */
+ total_size = internal_config.memory;
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+ socket++) {
+
+ /* Set memory amount per socket */
+ default_size = internal_config.memory *
+ cpu_per_socket[socket] / rte_lcore_count();
+
+ /* Limit to maximum available memory on socket */
+ default_size = RTE_MIN(
+ default_size, get_socket_mem_size(socket));
+
+ /* Update sizes */
+ memory[socket] = default_size;
+ total_size -= default_size;
+ }
+
+ /*
+ * If some memory is remaining, try to allocate it by getting
+ * all available memory from sockets, one after the other.
+ */
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+ socket++) {
+ /* take whatever is available */
+ default_size = RTE_MIN(
+ get_socket_mem_size(socket) - memory[socket],
+ total_size);
+
+ /* Update sizes */
+ memory[socket] += default_size;
+ total_size -= default_size;
+ }
+ }
+
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
+ socket++) {
+ /* skips if the memory on specific socket wasn't requested */
+ for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
+ strncpy(hp_used[i].hugedir, hp_info[i].hugedir,
+ sizeof(hp_used[i].hugedir));
+ hp_used[i].num_pages[socket] = RTE_MIN(
+ memory[socket] / hp_info[i].hugepage_sz,
+ hp_info[i].num_pages[socket]);
+
+ cur_mem = hp_used[i].num_pages[socket] *
+ hp_used[i].hugepage_sz;
+
+ memory[socket] -= cur_mem;
+ total_mem -= cur_mem;
+
+ total_num_pages += hp_used[i].num_pages[socket];
+
+ /* check if we have met all memory requests */
+ if (memory[socket] == 0)
+ break;
+
+ /* Check if we have any more pages left at this size,
+ * if so, move on to next size.
+ */
+ if (hp_used[i].num_pages[socket] ==
+ hp_info[i].num_pages[socket])
+ continue;
+
+ /* At this point we know that there are more pages
+ * available that are bigger than the memory we want,
+ * so lets see if we can get enough from other page
+ * sizes.
+ */
+ remaining_mem = 0;
+ for (j = i+1; j < num_hp_info; j++)
+ remaining_mem += hp_info[j].hugepage_sz *
+ hp_info[j].num_pages[socket];
+
+ /* Is there enough other memory?
+ * If not, allocate another page and quit.
+ */
+ if (remaining_mem < memory[socket]) {
+ cur_mem = RTE_MIN(
+ memory[socket], hp_info[i].hugepage_sz);
+ memory[socket] -= cur_mem;
+ total_mem -= cur_mem;
+ hp_used[i].num_pages[socket]++;
+ total_num_pages++;
+ break; /* we are done with this socket*/
+ }
+ }
+ /* if we didn't satisfy all memory requirements per socket */
+ if (memory[socket] > 0 &&
+ internal_config.socket_mem[socket] != 0) {
+ /* to prevent icc errors */
+ requested = (unsigned int)(
+ internal_config.socket_mem[socket] / 0x100000);
+ available = requested -
+ ((unsigned int)(memory[socket] / 0x100000));
+ RTE_LOG(ERR, EAL, "Not enough memory available on "
+ "socket %u! Requested: %uMB, available: %uMB\n",
+ socket, requested, available);
+ return -1;
+ }
+ }
+
+ /* if we didn't satisfy total memory requirements */
+ if (total_mem > 0) {
+ requested = (unsigned int) (internal_config.memory / 0x100000);
+ available = requested - (unsigned int) (total_mem / 0x100000);
+ RTE_LOG(ERR, EAL, "Not enough memory available! "
+ "Requested: %uMB, available: %uMB\n",
+ requested, available);
+ return -1;
+ }
+ return total_num_pages;
+}
+
+/* Limit is checked by validator itself, nothing left to analyze.*/
+static int
+limits_callback(int socket_id, size_t cur_limit, size_t new_len)
+{
+ RTE_SET_USED(socket_id);
+ RTE_SET_USED(cur_limit);
+ RTE_SET_USED(new_len);
+ return -1;
+}
+
+static int
+eal_hugepage_init(void)
+{
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+ int hp_sz_idx, socket_id;
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ /* also initialize used_hp hugepage sizes in used_hp */
+ struct hugepage_info *hpi;
+ hpi = &internal_config.hugepage_info[hp_sz_idx];
+ used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+ }
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; socket_id++)
+ memory[socket_id] = internal_config.socket_mem[socket_id];
+
+ /* calculate final number of pages */
+ if (calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes) < 0)
+ return -1;
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int)internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+ socket_id++) {
+ struct rte_memseg **pages;
+ struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+ unsigned int num_pages = hpi->num_pages[socket_id];
+ unsigned int num_pages_alloc;
+
+ if (num_pages == 0)
+ continue;
+
+ RTE_LOG(DEBUG, EAL,
+ "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
+ num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+ /* we may not be able to allocate all pages in one go,
+ * because we break up our memory map into multiple
+ * memseg lists. therefore, try allocating multiple
+ * times and see if we can get the desired number of
+ * pages from multiple allocations.
+ */
+
+ num_pages_alloc = 0;
+ do {
+ int i, cur_pages, needed;
+
+ needed = num_pages - num_pages_alloc;
+
+ pages = malloc(sizeof(*pages) * needed);
+
+ /* do not request exact number of pages */
+ cur_pages = eal_memalloc_alloc_seg_bulk(pages,
+ needed, hpi->hugepage_sz,
+ socket_id, false);
+ if (cur_pages <= 0) {
+ free(pages);
+ return -1;
+ }
+
+ /* mark preallocated pages as unfreeable */
+ for (i = 0; i < cur_pages; i++) {
+ struct rte_memseg *ms = pages[i];
+ ms->flags |=
+ RTE_MEMSEG_FLAG_DO_NOT_FREE;
+ }
+ free(pages);
+
+ num_pages_alloc += cur_pages;
+ } while (num_pages_alloc != num_pages);
+ }
+ }
+ /* if socket limits were specified, set them */
+ if (internal_config.force_socket_limits) {
+ unsigned int i;
+ for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+ uint64_t limit = internal_config.socket_limit[i];
+ if (limit == 0)
+ continue;
+ if (rte_mem_alloc_validator_register("socket-limit",
+ limits_callback, i, limit))
+ RTE_LOG(ERR, EAL, "Failed to register socket "
+ "limits validator callback\n");
+ }
+ }
+ return 0;
+}
+
+static int
+eal_nohuge_init(void)
+{
+ struct rte_mem_config *mcfg;
+ struct rte_memseg_list *msl;
+ int n_segs, cur_seg;
+ uint64_t page_sz;
+ void *addr;
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
+
+ mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ addr = eal_mem_alloc(internal_config.memory, 0);
+ if (addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate %zu bytes",
+ internal_config.memory);
+ return -1;
+ }
+
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+ msl->len = internal_config.memory;
+ msl->heap = 1;
+
+ /* populate memsegs. each memseg is one page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->socket_id = 0;
+ ms->len = page_sz;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+ }
+
+ if (mcfg->dma_maskbits &&
+ rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+ RTE_LOG(ERR, EAL,
+ "%s(): couldn't allocate memory due to IOVA "
+ "exceeding limits of current DMA mask.\n", __func__);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+rte_eal_hugepage_init(void)
+{
+ return internal_config.no_hugetlbfs ?
+ eal_nohuge_init() : eal_hugepage_init();
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
new file mode 100644
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+/**
+ * @file Multiprocess support stubs
+ *
+ * Stubs must log an error until implemented. If success is required
+ * for non-multiprocess operation, stub must log a warning and a comment
+ * must document what requires success emulation.
+ */
+
+#include <rte_eal.h>
+#include <rte_errno.h>
+
+#include "eal_private.h"
+#include "eal_windows.h"
+#include "malloc_mp.h"
+
+void
+rte_mp_channel_cleanup(void)
+{
+ EAL_LOG_NOT_IMPLEMENTED();
+}
+
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+ RTE_SET_USED(name);
+ RTE_SET_USED(action);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+void
+rte_mp_action_unregister(const char *name)
+{
+ RTE_SET_USED(name);
+ EAL_LOG_NOT_IMPLEMENTED();
+}
+
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+ RTE_SET_USED(msg);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ RTE_SET_USED(req);
+ RTE_SET_USED(reply);
+ RTE_SET_USED(ts);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
+ rte_mp_async_reply_t clb)
+{
+ RTE_SET_USED(req);
+ RTE_SET_USED(ts);
+ RTE_SET_USED(clb);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+ RTE_SET_USED(msg);
+ RTE_SET_USED(peer);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+register_mp_requests(void)
+{
+ /* Non-stub function succeeds if multi-process is not supported. */
+ EAL_LOG_STUB();
+ return 0;
+}
+
+int
+request_to_primary(struct malloc_mp_req *req)
+{
+ RTE_SET_USED(req);
+ EAL_LOG_NOT_IMPLEMENTED();
+ return -1;
+}
+
+int
+request_sync(void)
+{
+ /* The common memory allocator depends on this function succeeding. */
+ EAL_LOG_STUB();
+ return 0;
+}
@@ -9,8 +9,24 @@
* @file Facilities private to Windows EAL
*/
+#include <rte_errno.h>
#include <rte_windows.h>
+/**
+ * Log current function as not implemented and set rte_errno.
+ */
+#define EAL_LOG_NOT_IMPLEMENTED() \
+ do { \
+ RTE_LOG(DEBUG, EAL, "%s() is not implemented\n", __func__); \
+ rte_errno = ENOTSUP; \
+ } while (0)
+
+/**
+ * Log current function as a stub.
+ */
+#define EAL_LOG_STUB() \
+ RTE_LOG(DEBUG, EAL, "Windows: %s() is a stub\n", __func__)
+
/**
* Create a map of processors and cores on the system.
*/
@@ -36,6 +52,13 @@ int eal_thread_create(pthread_t *thread);
*/
unsigned int eal_socket_numa_node(unsigned int socket_id);
+/**
+ * Open virt2phys driver interface device.
+ *
+ * @return 0 on success, (-1) on failure.
+ */
+int eal_mem_virt2iova_init(void);
+
/**
* Locate Win32 memory management routines in system libraries.
*
@@ -5,5 +5,6 @@ includes += include_directories('.')
headers += files(
'rte_os.h',
+ 'rte_virt2phys.h',
'rte_windows.h',
)
@@ -36,6 +36,10 @@ extern "C" {
#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count)
+#define open _open
+#define close _close
+#define unlink _unlink
+
/* cpu_set macros implementation */
#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
new file mode 100644
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+/**
+ * @file virt2phys driver interface
+ */
+
+/**
+ * Driver device interface GUID {539c2135-793a-4926-afec-d3a1b61bbc8a}.
+ */
+DEFINE_GUID(GUID_DEVINTERFACE_VIRT2PHYS,
+ 0x539c2135, 0x793a, 0x4926,
+ 0xaf, 0xec, 0xd3, 0xa1, 0xb6, 0x1b, 0xbc, 0x8a);
+
+/**
+ * Driver device type for IO control codes.
+ */
+#define VIRT2PHYS_DEVTYPE 0x8000
+
+/**
+ * Translate a valid non-paged virtual address to a physical address.
+ *
+ * Note: a physical address of zero (0) is reported if the input address
+ * is paged out or not mapped. However, if the input is a valid mapping
+ * of I/O port 0x0000, the output is also zero. There is no way
+ * to distinguish between these cases by the return value alone.
+ *
+ * Input: a non-paged virtual address (PVOID).
+ *
+ * Output: the corresponding physical address (LARGE_INTEGER).
+ */
+#define IOCTL_VIRT2PHYS_TRANSLATE CTL_CODE( \
+ VIRT2PHYS_DEVTYPE, 0x800, METHOD_BUFFERED, FILE_ANY_ACCESS)
@@ -23,6 +23,8 @@
#include <basetsd.h>
#include <psapi.h>
+#include <setupapi.h>
+#include <winioctl.h>
/* Have GUIDs defined. */
#ifndef INITGUID
@@ -9,4 +9,7 @@
* as Microsoft libc does not contain unistd.h. This may be removed
* in future releases.
*/
+
+#include <io.h>
+
#endif /* _UNISTD_H_ */
@@ -8,7 +8,11 @@ sources += files(
'eal_debug.c',
'eal_hugepages.c',
'eal_lcore.c',
+ 'eal_memalloc.c',
'eal_memory.c',
+ 'eal_mp.c',
'eal_thread.c',
'getopt.c',
)
+
+dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)