diff mbox

[dpdk-dev,RFC,2/2] eal: memzone allocated by malloc

Message ID 1431103079-18096-3-git-send-email-sergio.gonzalez.monroy@intel.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Sergio Gonzalez Monroy May 8, 2015, 4:37 p.m. UTC
In the current memory hierarchy, memsegs are groups of physically contiguous
hugepages, memzone are slices of memsegs and malloc further slices memzones
into smaller memory chunks.

This patch modifies malloc so it slices/partitions memsegs instead of memzones.
Thus memzones would call malloc internally for memoy allocation while
maintaining its ABI. The only exception is the reserving a memzone with
len=0 is not supported anymore.

Signed-off-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
---
 lib/librte_eal/common/eal_common_memzone.c      | 233 ++----------------------
 lib/librte_eal/common/include/rte_malloc_heap.h |   4 +-
 lib/librte_eal/common/include/rte_memory.h      |   1 +
 lib/librte_eal/common/malloc_elem.c             |  60 ++++--
 lib/librte_eal/common/malloc_elem.h             |  14 +-
 lib/librte_eal/common/malloc_heap.c             | 188 +++++++++++++------
 lib/librte_eal/common/malloc_heap.h             |   4 +-
 lib/librte_eal/common/rte_malloc.c              |   7 +-
 8 files changed, 207 insertions(+), 304 deletions(-)
diff mbox

Patch

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 888f9e5..3dc8133 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -50,11 +50,10 @@ 
 #include <rte_string_fns.h>
 #include <rte_common.h>
 
+#include "malloc_heap.h"
+#include "malloc_elem.h"
 #include "eal_private.h"
 
-/* internal copy of free memory segments */
-static struct rte_memseg *free_memseg = NULL;
-
 static inline const struct rte_memzone *
 memzone_lookup_thread_unsafe(const char *name)
 {
@@ -88,53 +87,12 @@  rte_memzone_reserve(const char *name, size_t len, int socket_id,
 			len, socket_id, flags, RTE_CACHE_LINE_SIZE);
 }
 
-/*
- * Helper function for memzone_reserve_aligned_thread_unsafe().
- * Calculate address offset from the start of the segment.
- * Align offset in that way that it satisfy istart alignmnet and
- * buffer of the  requested length would not cross specified boundary.
- */
-static inline phys_addr_t
-align_phys_boundary(const struct rte_memseg *ms, size_t len, size_t align,
-	size_t bound)
-{
-	phys_addr_t addr_offset, bmask, end, start;
-	size_t step;
-
-	step = RTE_MAX(align, bound);
-	bmask = ~((phys_addr_t)bound - 1);
-
-	/* calculate offset to closest alignment */
-	start = RTE_ALIGN_CEIL(ms->phys_addr, align);
-	addr_offset = start - ms->phys_addr;
-
-	while (addr_offset + len < ms->len) {
-
-		/* check, do we meet boundary condition */
-		end = start + len - (len != 0);
-		if ((start & bmask) == (end & bmask))
-			break;
-
-		/* calculate next offset */
-		start = RTE_ALIGN_CEIL(start + 1, step);
-		addr_offset = start - ms->phys_addr;
-	}
-
-	return (addr_offset);
-}
-
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		int socket_id, unsigned flags, unsigned align, unsigned bound)
 {
 	struct rte_mem_config *mcfg;
-	unsigned i = 0;
-	int memseg_idx = -1;
-	uint64_t addr_offset, seg_offset = 0;
 	size_t requested_len;
-	size_t memseg_len = 0;
-	phys_addr_t memseg_physaddr;
-	void *memseg_addr;
 
 	/* get pointer to global configuration */
 	mcfg = rte_eal_get_configuration()->mem_config;
@@ -166,10 +124,10 @@  memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	if (align < RTE_CACHE_LINE_SIZE)
 		align = RTE_CACHE_LINE_SIZE;
 
-
-	/* align length on cache boundary. Check for overflow before doing so */
-	if (len > SIZE_MAX - RTE_CACHE_LINE_MASK) {
-		rte_errno = EINVAL; /* requested size too big */
+	/* align length on cache boundary. Check for overflow before doing so
+	 * FIXME need to update API doc regarding len value*/
+	if ((len > SIZE_MAX - RTE_CACHE_LINE_MASK) || (len == 0)){
+		rte_errno = EINVAL;
 		return NULL;
 	}
 
@@ -186,123 +144,29 @@  memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		return NULL;
 	}
 
-	/* find the smallest segment matching requirements */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		/* last segment */
-		if (free_memseg[i].addr == NULL)
-			break;
-
-		/* empty segment, skip it */
-		if (free_memseg[i].len == 0)
-			continue;
-
-		/* bad socket ID */
-		if (socket_id != SOCKET_ID_ANY &&
-		    free_memseg[i].socket_id != SOCKET_ID_ANY &&
-		    socket_id != free_memseg[i].socket_id)
-			continue;
-
-		/*
-		 * calculate offset to closest alignment that
-		 * meets boundary conditions.
-		 */
-		addr_offset = align_phys_boundary(free_memseg + i,
-			requested_len, align, bound);
-
-		/* check len */
-		if ((requested_len + addr_offset) > free_memseg[i].len)
-			continue;
-
-		/* check flags for hugepage sizes */
-		if ((flags & RTE_MEMZONE_2MB) &&
-				free_memseg[i].hugepage_sz == RTE_PGSIZE_1G)
-			continue;
-		if ((flags & RTE_MEMZONE_1GB) &&
-				free_memseg[i].hugepage_sz == RTE_PGSIZE_2M)
-			continue;
-		if ((flags & RTE_MEMZONE_16MB) &&
-				free_memseg[i].hugepage_sz == RTE_PGSIZE_16G)
-			continue;
-		if ((flags & RTE_MEMZONE_16GB) &&
-				free_memseg[i].hugepage_sz == RTE_PGSIZE_16M)
-			continue;
-
-		/* this segment is the best until now */
-		if (memseg_idx == -1) {
-			memseg_idx = i;
-			memseg_len = free_memseg[i].len;
-			seg_offset = addr_offset;
-		}
-		/* find the biggest contiguous zone */
-		else if (len == 0) {
-			if (free_memseg[i].len > memseg_len) {
-				memseg_idx = i;
-				memseg_len = free_memseg[i].len;
-				seg_offset = addr_offset;
-			}
-		}
-		/*
-		 * find the smallest (we already checked that current
-		 * zone length is > len
-		 */
-		else if (free_memseg[i].len + align < memseg_len ||
-				(free_memseg[i].len <= memseg_len + align &&
-				addr_offset < seg_offset)) {
-			memseg_idx = i;
-			memseg_len = free_memseg[i].len;
-			seg_offset = addr_offset;
-		}
-	}
 
-	/* no segment found */
-	if (memseg_idx == -1) {
-		/*
-		 * If RTE_MEMZONE_SIZE_HINT_ONLY flag is specified,
-		 * try allocating again without the size parameter otherwise -fail.
-		 */
-		if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY)  &&
-		    ((flags & RTE_MEMZONE_1GB) || (flags & RTE_MEMZONE_2MB)
-		|| (flags & RTE_MEMZONE_16MB) || (flags & RTE_MEMZONE_16GB)))
-			return memzone_reserve_aligned_thread_unsafe(name,
-				len, socket_id, 0, align, bound);
+	/* get socket heap */
 
+	/* allocate memory on heap */
+	void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket_id], NULL,
+			requested_len, flags, align, bound);
+	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
 
-	/* save aligned physical and virtual addresses */
-	memseg_physaddr = free_memseg[memseg_idx].phys_addr + seg_offset;
-	memseg_addr = RTE_PTR_ADD(free_memseg[memseg_idx].addr,
-			(uintptr_t) seg_offset);
-
-	/* if we are looking for a biggest memzone */
-	if (len == 0) {
-		if (bound == 0)
-			requested_len = memseg_len - seg_offset;
-		else
-			requested_len = RTE_ALIGN_CEIL(memseg_physaddr + 1,
-				bound) - memseg_physaddr;
-	}
-
-	/* set length to correct value */
-	len = (size_t)seg_offset + requested_len;
-
-	/* update our internal state */
-	free_memseg[memseg_idx].len -= len;
-	free_memseg[memseg_idx].phys_addr += len;
-	free_memseg[memseg_idx].addr =
-		(char *)free_memseg[memseg_idx].addr + len;
+	const struct malloc_elem *elem = malloc_elem_from_data(mz_addr);
 
 	/* fill the zone in config */
 	struct rte_memzone *mz = &mcfg->memzone[mcfg->memzone_idx++];
 	snprintf(mz->name, sizeof(mz->name), "%s", name);
-	mz->phys_addr = memseg_physaddr;
-	mz->addr = memseg_addr;
+	mz->phys_addr = rte_malloc_virt2phy(mz_addr);
+	mz->addr = mz_addr;
 	mz->len = requested_len;
-	mz->hugepage_sz = free_memseg[memseg_idx].hugepage_sz;
-	mz->socket_id = free_memseg[memseg_idx].socket_id;
+	mz->hugepage_sz = elem->ms->hugepage_sz;
+	mz->socket_id = elem->ms->socket_id;
 	mz->flags = 0;
-	mz->memseg_id = memseg_idx;
+	mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;
 
 	return mz;
 }
@@ -419,45 +283,6 @@  rte_memzone_dump(FILE *f)
 }
 
 /*
- * called by init: modify the free memseg list to have cache-aligned
- * addresses and cache-aligned lengths
- */
-static int
-memseg_sanitize(struct rte_memseg *memseg)
-{
-	unsigned phys_align;
-	unsigned virt_align;
-	unsigned off;
-
-	phys_align = memseg->phys_addr & RTE_CACHE_LINE_MASK;
-	virt_align = (unsigned long)memseg->addr & RTE_CACHE_LINE_MASK;
-
-	/*
-	 * sanity check: phys_addr and addr must have the same
-	 * alignment
-	 */
-	if (phys_align != virt_align)
-		return -1;
-
-	/* memseg is really too small, don't bother with it */
-	if (memseg->len < (2 * RTE_CACHE_LINE_SIZE)) {
-		memseg->len = 0;
-		return 0;
-	}
-
-	/* align start address */
-	off = (RTE_CACHE_LINE_SIZE - phys_align) & RTE_CACHE_LINE_MASK;
-	memseg->phys_addr += off;
-	memseg->addr = (char *)memseg->addr + off;
-	memseg->len -= off;
-
-	/* align end address */
-	memseg->len &= ~((uint64_t)RTE_CACHE_LINE_MASK);
-
-	return 0;
-}
-
-/*
  * Init the memzone subsystem
  */
 int
@@ -465,14 +290,10 @@  rte_eal_memzone_init(void)
 {
 	struct rte_mem_config *mcfg;
 	const struct rte_memseg *memseg;
-	unsigned i = 0;
 
 	/* get pointer to global configuration */
 	mcfg = rte_eal_get_configuration()->mem_config;
 
-	/* mirror the runtime memsegs from config */
-	free_memseg = mcfg->free_memseg;
-
 	/* secondary processes don't need to initialise anything */
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 		return 0;
@@ -485,26 +306,6 @@  rte_eal_memzone_init(void)
 
 	rte_rwlock_write_lock(&mcfg->mlock);
 
-	/* fill in uninitialized free_memsegs */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		if (memseg[i].addr == NULL)
-			break;
-		if (free_memseg[i].addr != NULL)
-			continue;
-		memcpy(&free_memseg[i], &memseg[i], sizeof(struct rte_memseg));
-	}
-
-	/* make all zones cache-aligned */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		if (free_memseg[i].addr == NULL)
-			break;
-		if (memseg_sanitize(&free_memseg[i]) < 0) {
-			RTE_LOG(ERR, EAL, "%s(): Sanity check failed\n", __func__);
-			rte_rwlock_write_unlock(&mcfg->mlock);
-			return -1;
-		}
-	}
-
 	/* delete all zones */
 	mcfg->memzone_idx = 0;
 	memset(mcfg->memzone, 0, sizeof(mcfg->memzone));
diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index 716216f..5333348 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -40,7 +40,7 @@ 
 #include <rte_memory.h>
 
 /* Number of free lists per heap, grouped by size. */
-#define RTE_HEAP_NUM_FREELISTS  5
+#define RTE_HEAP_NUM_FREELISTS  10
 
 /**
  * Structure to hold malloc heap
@@ -48,7 +48,7 @@ 
 struct malloc_heap {
 	rte_spinlock_t lock;
 	LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
-	unsigned mz_count;
+	unsigned ms_count;
 	unsigned alloc_count;
 	size_t total_size;
 } __rte_cache_aligned;
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 7f8103f..ab13d04 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -100,6 +100,7 @@  struct rte_memseg {
 	 /**< store segment MFNs */
 	uint64_t mfn[DOM0_NUM_MEMBLOCK];
 #endif
+	uint8_t used;               /**< already used by a heap */
 } __attribute__((__packed__));
 
 /**
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index a5e1248..5e95abb 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -37,7 +37,6 @@ 
 #include <sys/queue.h>
 
 #include <rte_memory.h>
-#include <rte_memzone.h>
 #include <rte_eal.h>
 #include <rte_launch.h>
 #include <rte_per_lcore.h>
@@ -56,10 +55,10 @@ 
  */
 void
 malloc_elem_init(struct malloc_elem *elem,
-		struct malloc_heap *heap, const struct rte_memzone *mz, size_t size)
+		struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
 {
 	elem->heap = heap;
-	elem->mz = mz;
+	elem->ms = ms;
 	elem->prev = NULL;
 	memset(&elem->free_list, 0, sizeof(elem->free_list));
 	elem->state = ELEM_FREE;
@@ -70,12 +69,12 @@  malloc_elem_init(struct malloc_elem *elem,
 }
 
 /*
- * initialise a dummy malloc_elem header for the end-of-memzone marker
+ * initialise a dummy malloc_elem header for the end-of-memseg marker
  */
 void
 malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
 {
-	malloc_elem_init(elem, prev->heap, prev->mz, 0);
+	malloc_elem_init(elem, prev->heap, prev->ms, 0);
 	elem->prev = prev;
 	elem->state = ELEM_BUSY; /* mark busy so its never merged */
 }
@@ -86,12 +85,24 @@  malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
  * fit, return NULL.
  */
 static void *
-elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align)
+elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
+		size_t bound)
 {
-	const uintptr_t end_pt = (uintptr_t)elem +
+	const size_t bmask = ~(bound - 1);
+	uintptr_t end_pt = (uintptr_t)elem +
 			elem->size - MALLOC_ELEM_TRAILER_LEN;
-	const uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
-	const uintptr_t new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+	uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
+	uintptr_t new_elem_start;
+
+	/* check boundary */
+	if ((new_data_start & bmask) != (end_pt & bmask)) {
+		end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
+		new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
+		if ((end_pt & bmask) != (new_data_start & bmask))
+			return NULL;
+	}
+
+	new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
 
 	/* if the new start point is before the exist start, it won't fit */
 	return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start;
@@ -102,9 +113,10 @@  elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align)
  * alignment request from the current element
  */
 int
-malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align)
+malloc_elem_can_hold(struct malloc_elem *elem, size_t size,	unsigned align,
+		size_t bound)
 {
-	return elem_start_pt(elem, size, align) != NULL;
+	return elem_start_pt(elem, size, align, bound) != NULL;
 }
 
 /*
@@ -118,7 +130,7 @@  split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
 	const unsigned old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
 	const unsigned new_elem_size = elem->size - old_elem_size;
 
-	malloc_elem_init(split_pt, elem->heap, elem->mz, new_elem_size);
+	malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
 	split_pt->prev = elem;
 	next_elem->prev = split_pt;
 	elem->size = old_elem_size;
@@ -190,12 +202,25 @@  elem_free_list_remove(struct malloc_elem *elem)
  * is not done here, as it's done there previously.
  */
 struct malloc_elem *
-malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align)
+malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
+		size_t bound)
 {
-	struct malloc_elem *new_elem = elem_start_pt(elem, size, align);
-	const unsigned old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem;
+	struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound);
+	const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem;
+	const size_t trailer_size = elem->size - old_elem_size - size;
+
+	elem_free_list_remove(elem);
 
-	if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE){
+	if (trailer_size > MALLOC_ELEM_OVERHEAD * 2 + MIN_DATA_SIZE) {
+		/* split it, too much free space after elem */
+		struct malloc_elem *new_free_elem =
+				RTE_PTR_ADD(new_elem, size + MALLOC_ELEM_OVERHEAD);
+
+		split_elem(elem, new_free_elem);
+		malloc_elem_free_list_insert(new_free_elem);
+	}
+
+	if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
 		/* don't split it, pad the element instead */
 		elem->state = ELEM_BUSY;
 		elem->pad = old_elem_size;
@@ -208,8 +233,6 @@  malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align)
 			new_elem->size = elem->size - elem->pad;
 			set_header(new_elem);
 		}
-		/* remove element from free list */
-		elem_free_list_remove(elem);
 
 		return new_elem;
 	}
@@ -219,7 +242,6 @@  malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align)
 	 * Re-insert original element, in case its new size makes it
 	 * belong on a different list.
 	 */
-	elem_free_list_remove(elem);
 	split_elem(elem, new_elem);
 	new_elem->state = ELEM_BUSY;
 	malloc_elem_free_list_insert(elem);
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 9790b1a..e05d2ea 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -47,9 +47,9 @@  enum elem_state {
 
 struct malloc_elem {
 	struct malloc_heap *heap;
-	struct malloc_elem *volatile prev;      /* points to prev elem in memzone */
+	struct malloc_elem *volatile prev;      /* points to prev elem in memseg */
 	LIST_ENTRY(malloc_elem) free_list;      /* list of free elements in heap */
-	const struct rte_memzone *mz;
+	const struct rte_memseg *ms;
 	volatile enum elem_state state;
 	uint32_t pad;
 	size_t size;
@@ -136,11 +136,11 @@  malloc_elem_from_data(const void *data)
 void
 malloc_elem_init(struct malloc_elem *elem,
 		struct malloc_heap *heap,
-		const struct rte_memzone *mz,
+		const struct rte_memseg *ms,
 		size_t size);
 
 /*
- * initialise a dummy malloc_elem header for the end-of-memzone marker
+ * initialise a dummy malloc_elem header for the end-of-memseg marker
  */
 void
 malloc_elem_mkend(struct malloc_elem *elem,
@@ -151,14 +151,16 @@  malloc_elem_mkend(struct malloc_elem *elem,
  * of the requested size and with the requested alignment
  */
 int
-malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align);
+malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
+		unsigned align, size_t bound);
 
 /*
  * reserve a block of data in an existing malloc_elem. If the malloc_elem
  * is much larger than the data block requested, we split the element in two.
  */
 struct malloc_elem *
-malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align);
+malloc_elem_alloc(struct malloc_elem *elem, size_t size,
+		unsigned align, size_t bound);
 
 /*
  * free a malloc_elem block by adding it to the free list. If the
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index defb903..b79e0e9 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -39,7 +39,6 @@ 
 #include <sys/queue.h>
 
 #include <rte_memory.h>
-#include <rte_memzone.h>
 #include <rte_eal.h>
 #include <rte_eal_memconfig.h>
 #include <rte_launch.h>
@@ -54,69 +53,136 @@ 
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
-/* since the memzone size starts with a digit, it will appear unquoted in
- * rte_config.h, so quote it so it can be passed to rte_str_to_size */
-#define MALLOC_MEMZONE_SIZE RTE_STR(RTE_MALLOC_MEMZONE_SIZE)
+static unsigned
+check_hugepage_sz(unsigned flags, size_t hugepage_sz)
+{
+	unsigned ret = 1;
 
-/*
- * returns the configuration setting for the memzone size as a size_t value
- */
-static inline size_t
-get_malloc_memzone_size(void)
+	if ((flags & RTE_MEMZONE_2MB) && hugepage_sz == RTE_PGSIZE_1G)
+		ret = 0;
+	if ((flags & RTE_MEMZONE_1GB) && hugepage_sz == RTE_PGSIZE_2M)
+		ret = 0;
+	if ((flags & RTE_MEMZONE_16MB) && hugepage_sz == RTE_PGSIZE_16G)
+		ret = 0;
+	if ((flags & RTE_MEMZONE_16GB) && hugepage_sz == RTE_PGSIZE_16M)
+		ret = 0;
+
+	return ret;
+}
+
+static struct rte_memseg*
+find_suitable_memseg(int socket_id, size_t size, unsigned flags,
+		size_t align, size_t bound)
 {
-	return rte_str_to_size(MALLOC_MEMZONE_SIZE);
+	struct rte_memseg *ms = rte_eal_get_configuration()->mem_config->memseg;
+	uintptr_t data_end, data_start;
+	size_t bmask = ~(bound - 1);
+	unsigned i;
+	int ms_idx = -1, alt_ms_idx = -1;
+	size_t ms_len = 0, alt_ms_len = 0;
+	size_t min_size;
+
+	min_size = size + align + MALLOC_ELEM_OVERHEAD * 2;
+
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		/* last segment */
+		if (ms[i].addr == NULL)
+			break;
+
+		/* in use */
+		if (ms[i].used)
+			continue;
+
+		/* bad socket ID */
+		if (socket_id != SOCKET_ID_ANY && ms[i].socket_id != SOCKET_ID_ANY &&
+		    socket_id != ms[i].socket_id)
+			continue;
+
+		/* check len */
+		if (min_size > ms[i].len)
+			continue;
+
+		/* check boundary */
+		data_end = (uintptr_t)ms[i].addr + ms[i].len -
+			MALLOC_ELEM_OVERHEAD - MALLOC_ELEM_TRAILER_LEN ;
+		data_end = RTE_ALIGN_FLOOR(data_end, RTE_CACHE_LINE_SIZE);
+		data_start = RTE_ALIGN_FLOOR((data_end - size), align);
+		if ((data_end & bmask) != (data_start & bmask)) {
+			/* check we have enough space before boudnary */
+			data_end = RTE_ALIGN_FLOOR(data_end, bound);
+			data_start = RTE_ALIGN_FLOOR((data_end - size), align);
+			if (((data_end & bmask) != (data_start & bmask)) ||
+					((uintptr_t)ms[i].addr > (data_start - MALLOC_ELEM_HEADER_LEN)))
+				continue;
+		}
+
+		/* at this point, we have a memseg */
+
+		/* keep best memseg found */
+		if ((alt_ms_idx == -1) ||
+			(ms[i].len < alt_ms_len)) {
+			alt_ms_idx = i;
+			alt_ms_len = ms[i].len;
+		}
+
+		/* check flags for hugepage sizes */
+		if (!check_hugepage_sz(flags, ms[i].hugepage_sz))
+			continue;
+
+		/* keep best memseg found with requested hugepage size */
+		if ((ms_idx == -1) ||
+			(ms[i].len < ms_len)) {
+			ms_idx = i;
+			ms_len = ms[i].len;
+		}
+	}
+
+	if ((ms_idx == -1) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
+		ms_idx = alt_ms_idx;
+
+	if (ms_idx == -1)
+		return NULL;
+
+	return &ms[ms_idx];
 }
 
+/* This function expects correct values:
+ * - size: >= RTE_CACHE_LINE_SIZE
+ * - align: power_of_two && >= RTE_CACHE_LINE_SIZE
+ * - bound: power_of_two && >= size
+ */
 /*
+ * find a suitable memory segment available to expand the heap
  * reserve an extra memory zone and make it available for use by a particular
  * heap. This reserves the zone and sets a dummy malloc_elem header at the end
  * to prevent overflow. The rest of the zone is added to free list as a single
  * large free block
  */
 static int
-malloc_heap_add_memzone(struct malloc_heap *heap, size_t size, unsigned align)
+malloc_heap_add_memseg(struct malloc_heap *heap, size_t size,
+		unsigned flags, size_t align, size_t bound)
 {
-	const unsigned mz_flags = 0;
-	const size_t block_size = get_malloc_memzone_size();
-	/* ensure the data we want to allocate will fit in the memzone */
-	const size_t min_size = size + align + MALLOC_ELEM_OVERHEAD * 2;
-	const struct rte_memzone *mz = NULL;
+	struct rte_memseg *ms = NULL;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-	unsigned numa_socket = heap - mcfg->malloc_heaps;
-
-	size_t mz_size = min_size;
-	if (mz_size < block_size)
-		mz_size = block_size;
-
-	char mz_name[RTE_MEMZONE_NAMESIZE];
-	snprintf(mz_name, sizeof(mz_name), "MALLOC_S%u_HEAP_%u",
-		     numa_socket, heap->mz_count++);
-
-	/* try getting a block. if we fail and we don't need as big a block
-	 * as given in the config, we can shrink our request and try again
-	 */
-	do {
-		mz = rte_memzone_reserve(mz_name, mz_size, numa_socket,
-					 mz_flags);
-		if (mz == NULL)
-			mz_size /= 2;
-	} while (mz == NULL && mz_size > min_size);
-	if (mz == NULL)
+	unsigned socket_id = heap - mcfg->malloc_heaps;
+
+	ms = find_suitable_memseg(socket_id, size, flags, align, bound);
+	if (ms == NULL)
 		return -1;
 
+	ms->used = 1;
 	/* allocate the memory block headers, one at end, one at start */
-	struct malloc_elem *start_elem = (struct malloc_elem *)mz->addr;
-	struct malloc_elem *end_elem = RTE_PTR_ADD(mz->addr,
-			mz_size - MALLOC_ELEM_OVERHEAD);
+	struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
+	struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr,
+			ms->len - MALLOC_ELEM_OVERHEAD);
 	end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE);
 
 	const unsigned elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem;
-	malloc_elem_init(start_elem, heap, mz, elem_size);
+	malloc_elem_init(start_elem, heap, ms, elem_size);
 	malloc_elem_mkend(end_elem, start_elem);
 	malloc_elem_free_list_insert(start_elem);
 
-	/* increase heap total size by size of new memzone */
-	heap->total_size+=mz_size - MALLOC_ELEM_OVERHEAD;
+	heap->total_size += ms->len - MALLOC_ELEM_OVERHEAD;
 	return 0;
 }
 
@@ -126,10 +192,11 @@  malloc_heap_add_memzone(struct malloc_heap *heap, size_t size, unsigned align)
  * Returns null on failure, or pointer to element on success.
  */
 static struct malloc_elem *
-find_suitable_element(struct malloc_heap *heap, size_t size, unsigned align)
+find_suitable_element(struct malloc_heap *heap, size_t size,
+		unsigned flags, size_t align, size_t bound)
 {
 	size_t idx;
-	struct malloc_elem *elem;
+	struct malloc_elem *elem, *alt_elem = NULL;
 
 	for (idx = malloc_elem_free_list_index(size);
 		idx < RTE_HEAP_NUM_FREELISTS; idx++)
@@ -137,40 +204,51 @@  find_suitable_element(struct malloc_heap *heap, size_t size, unsigned align)
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
-			if (malloc_elem_can_hold(elem, size, align))
-				return elem;
+			if (malloc_elem_can_hold(elem, size, align, bound)) {
+				if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+					return elem;
+				else
+					alt_elem = elem;
+			}
 		}
 	}
+
+	if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
+		return alt_elem;
+
 	return NULL;
 }
 
 /*
- * Main function called by malloc to allocate a block of memory from the
- * heap. It locks the free list, scans it, and adds a new memzone if the
- * scan fails. Once the new memzone is added, it re-scans and should return
+ * Main function to allocate a block of memory from the heap.
+ * It locks the free list, scans it, and adds a new memseg if the
+ * scan fails. Once the new memseg is added, it re-scans and should return
  * the new element after releasing the lock.
  */
 void *
 malloc_heap_alloc(struct malloc_heap *heap,
-		const char *type __attribute__((unused)), size_t size, unsigned align)
+		const char *type __attribute__((unused)), size_t size, unsigned flags,
+		size_t align, size_t bound)
 {
 	size = RTE_CACHE_LINE_ROUNDUP(size);
 	align = RTE_CACHE_LINE_ROUNDUP(align);
+
 	rte_spinlock_lock(&heap->lock);
-	struct malloc_elem *elem = find_suitable_element(heap, size, align);
+	struct malloc_elem *elem = find_suitable_element(heap, size, flags,
+			align, bound);
 	if (elem == NULL){
-		if ((malloc_heap_add_memzone(heap, size, align)) == 0)
-			elem = find_suitable_element(heap, size, align);
+		if ((malloc_heap_add_memseg(heap, size, flags, align, bound)) == 0)
+			elem = find_suitable_element(heap, size, flags, align, bound);
 	}
 
 	if (elem != NULL){
-		elem = malloc_elem_alloc(elem, size, align);
+		elem = malloc_elem_alloc(elem, size, align, bound);
 		/* increase heap's count of allocated elements */
 		heap->alloc_count++;
 	}
 	rte_spinlock_unlock(&heap->lock);
-	return elem == NULL ? NULL : (void *)(&elem[1]);
 
+	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
 /*
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index a47136d..4ba3353 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -53,8 +53,8 @@  malloc_get_numa_socket(void)
 }
 
 void *
-malloc_heap_alloc(struct malloc_heap *heap, const char *type,
-		size_t size, unsigned align);
+malloc_heap_alloc(struct malloc_heap *heap,	const char *type, size_t size,
+		unsigned flags, size_t align, size_t bound);
 
 int
 malloc_heap_get_stats(const struct malloc_heap *heap,
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index c313a57..54c2bd8 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -39,7 +39,6 @@ 
 
 #include <rte_memcpy.h>
 #include <rte_memory.h>
-#include <rte_memzone.h>
 #include <rte_eal.h>
 #include <rte_eal_memconfig.h>
 #include <rte_branch_prediction.h>
@@ -87,7 +86,7 @@  rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
 		return NULL;
 
 	ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
-				size, align == 0 ? 1 : align);
+				size, 0, align == 0 ? 1 : align, 0);
 	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
 		return ret;
 
@@ -98,7 +97,7 @@  rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
 			continue;
 
 		ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
-					size, align == 0 ? 1 : align);
+					size, 0, align == 0 ? 1 : align, 0);
 		if (ret != NULL)
 			return ret;
 	}
@@ -256,5 +255,5 @@  rte_malloc_virt2phy(const void *addr)
 	const struct malloc_elem *elem = malloc_elem_from_data(addr);
 	if (elem == NULL)
 		return 0;
-	return elem->mz->phys_addr + ((uintptr_t)addr - (uintptr_t)elem->mz->addr);
+	return elem->ms->phys_addr + ((uintptr_t)addr - (uintptr_t)elem->ms->addr);
 }