get:
Show a patch.

patch:
Update a patch (partial update; only the fields supplied in the request are changed).

put:
Update a patch (full update; the complete writable representation is replaced).
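
As a usage sketch (not part of the API response below), the operations above map onto plain HTTP calls against the URL shown on this page. The Python snippet below uses the third-party requests library; the token-based Authorization header and the assumption that fields such as "state" and "archived" are writable for project maintainers are illustrative guesses, not something this page confirms.

    import requests

    URL = "http://patches.dpdk.org/api/patches/37052/"

    # get: read the patch as JSON (no authentication required).
    resp = requests.get(URL, params={"format": "json"})
    resp.raise_for_status()
    patch = resp.json()
    print(patch["name"], patch["state"], patch["check"])

    # patch: partial update of selected fields.
    # Assumption: an API token is accepted via the Authorization header and
    # "state"/"archived" are writable when the caller maintains the project.
    headers = {"Authorization": "Token <your-api-token>"}
    resp = requests.patch(URL,
                          json={"state": "accepted", "archived": False},
                          headers=headers)
    print(resp.status_code, resp.json())

The full GET exchange for this patch follows.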

GET /api/patches/37052/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 37052,
    "url": "http://patches.dpdk.org/api/patches/37052/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/cdd2ebf717dd474a7043f06db7523dc45dd83021.1522797505.git.anatoly.burakov@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<cdd2ebf717dd474a7043f06db7523dc45dd83021.1522797505.git.anatoly.burakov@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/cdd2ebf717dd474a7043f06db7523dc45dd83021.1522797505.git.anatoly.burakov@intel.com",
    "date": "2018-04-03T23:22:01",
    "name": "[dpdk-dev,v3,49/68] eal: replace memseg with memseg lists",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "e5b64b5289294851578e75dc807eb9b84b64dff2",
    "submitter": {
        "id": 4,
        "url": "http://patches.dpdk.org/api/people/4/?format=api",
        "name": "Anatoly Burakov",
        "email": "anatoly.burakov@intel.com"
    },
    "delegate": null,
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/cdd2ebf717dd474a7043f06db7523dc45dd83021.1522797505.git.anatoly.burakov@intel.com/mbox/",
    "series": [],
    "comments": "http://patches.dpdk.org/api/patches/37052/comments/",
    "check": "fail",
    "checks": "http://patches.dpdk.org/api/patches/37052/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 34ECA1BADC;\n\tWed,  4 Apr 2018 01:24:26 +0200 (CEST)",
            "from mga01.intel.com (mga01.intel.com [192.55.52.88])\n\tby dpdk.org (Postfix) with ESMTP id AB4121B89D\n\tfor <dev@dpdk.org>; Wed,  4 Apr 2018 01:22:37 +0200 (CEST)",
            "from fmsmga003.fm.intel.com ([10.253.24.29])\n\tby fmsmga101.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384;\n\t03 Apr 2018 16:22:36 -0700",
            "from irvmail001.ir.intel.com ([163.33.26.43])\n\tby FMSMGA003.fm.intel.com with ESMTP; 03 Apr 2018 16:22:31 -0700",
            "from sivswdev01.ir.intel.com (sivswdev01.ir.intel.com\n\t[10.237.217.45])\n\tby irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id\n\tw33NMU2G013185; Wed, 4 Apr 2018 00:22:30 +0100",
            "from sivswdev01.ir.intel.com (localhost [127.0.0.1])\n\tby sivswdev01.ir.intel.com with ESMTP id w33NMU0m014943;\n\tWed, 4 Apr 2018 00:22:30 +0100",
            "(from aburakov@localhost)\n\tby sivswdev01.ir.intel.com with LOCAL id w33NMTud014939;\n\tWed, 4 Apr 2018 00:22:29 +0100"
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"5.48,403,1517904000\"; d=\"scan'208\";a=\"39161128\"",
        "From": "Anatoly Burakov <anatoly.burakov@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "Thomas Monjalon <thomas@monjalon.net>,\n\tBruce Richardson <bruce.richardson@intel.com>,\n\tNeil Horman <nhorman@tuxdriver.com>,\n\tJohn McNamara <john.mcnamara@intel.com>,\n\tMarko Kovacevic <marko.kovacevic@intel.com>,\n\tHemant Agrawal <hemant.agrawal@nxp.com>,\n\tShreyansh Jain <shreyansh.jain@nxp.com>,\n\tAkhil Goyal <akhil.goyal@nxp.com>, \n\tAdrien Mazarguil <adrien.mazarguil@6wind.com>,\n\tNelio Laranjeiro <nelio.laranjeiro@6wind.com>,\n\tYongseok Koh <yskoh@mellanox.com>,\n\tMaxime Coquelin <maxime.coquelin@redhat.com>,\n\tTiwei Bie <tiwei.bie@intel.com>, Olivier Matz <olivier.matz@6wind.com>,\n\tkeith.wiles@intel.com, jianfeng.tan@intel.com,\n\tandras.kovacs@ericsson.com, laszlo.vadkeri@ericsson.com,\n\tbenjamin.walker@intel.com, konstantin.ananyev@intel.com,\n\tkuralamudhan.ramakrishnan@intel.com, louise.m.daly@intel.com,\n\tpepperjo@japf.ch, jerin.jacob@caviumnetworks.com,\n\tgowrishankar.m@linux.vnet.ibm.com",
        "Date": "Wed,  4 Apr 2018 00:22:01 +0100",
        "Message-Id": "<cdd2ebf717dd474a7043f06db7523dc45dd83021.1522797505.git.anatoly.burakov@intel.com>",
        "X-Mailer": "git-send-email 1.7.0.7",
        "In-Reply-To": [
            "<cover.1522797505.git.anatoly.burakov@intel.com>",
            "<cover.1522797505.git.anatoly.burakov@intel.com>"
        ],
        "References": [
            "<cover.1522797505.git.anatoly.burakov@intel.com>",
            "<cover.1520428025.git.anatoly.burakov@intel.com>\n\t<cover.1522797505.git.anatoly.burakov@intel.com>"
        ],
        "Subject": "[dpdk-dev] [PATCH v3 49/68] eal: replace memseg with memseg lists",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Before, we were aggregating multiple pages into one memseg, so the\nnumber of memsegs was small. Now, each page gets its own memseg,\nso the list of memsegs is huge. To accommodate the new memseg list\nsize and to keep the under-the-hood workings sane, the memseg list\nis now not just a single list, but multiple lists. To be precise,\neach hugepage size available on the system gets one or more memseg\nlists, per socket.\n\nIn order to support dynamic memory allocation, we reserve all\nmemory in advance (unless we're in 32-bit legacy mode, in which\ncase we do not preallocate memory). As in, we do an anonymous\nmmap() of the entire maximum size of memory per hugepage size, per\nsocket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or\nRTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the\nsmaller one), split over multiple lists (which are limited to\neither RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST\nmegabytes per list, whichever is the smaller one). There is also\na global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly\nused for 32-bit targets to limit amounts of preallocated memory,\nbut can be used to place an upper limit on total amount of VA\nmemory that can be allocated by DPDK application.\n\nSo, for each hugepage size, we get (by default) up to 128G worth\nof memory, per socket, split into chunks of up to 32G in size.\nThe address space is claimed at the start, in eal_common_memory.c.\nThe actual page allocation code is in eal_memalloc.c (Linux-only),\nand largely consists of copied EAL memory init code.\n\nPages in the list are also indexed by address. That is, in order\nto figure out where the page belongs, one can simply look at base\naddress for a memseg list. Similarly, figuring out IOVA address\nof a memzone is a matter of finding the right memseg list, getting\noffset and dividing by page size to get the appropriate memseg.\n\nThis commit also removes rte_eal_dump_physmem_layout() call,\naccording to deprecation notice [1], and removes that deprecation\nnotice as well.\n\nOn 32-bit targets due to limited VA space, DPDK will no longer\nspread memory to different sockets like before. Instead, it will\n(by default) allocate all of the memory on socket where master\nlcore is. To override this behavior, --socket-mem must be used.\n\nThe rest of the changes are really ripple effects from the memseg\nchange - heap changes, compile fixes, and rewrites to support\nfbarray-backed memseg lists. Due to earlier switch to _walk()\nfunctions, most of the changes are simple fixes, however some\nof the _walk() calls were switched to memseg list walk, where\nit made sense to do so.\n\nAdditionally, we are also switching locks from flock() to fcntl().\nDown the line, we will be introducing single-file segments option,\nand we cannot use flock() locks to lock parts of the file. 
Therefore,\nwe will use fcntl() locks for legacy mem as well, in case someone is\nunfortunate enough to accidentally start legacy mem primary process\nalongside an already working non-legacy mem-based primary process.\n\n[1] http://dpdk.org/dev/patchwork/patch/34002/\n\nSigned-off-by: Anatoly Burakov <anatoly.burakov@intel.com>\n---\n\nNotes:\n    v3:\n    - New and improved legacy mode, without (too much) crazy hacks\n    - 32-bit support\n    - FreeBSD support\n    - Compile fixes for all platforms\n\n config/common_base                                |  15 +-\n config/defconfig_i686-native-linuxapp-gcc         |   3 +\n config/defconfig_i686-native-linuxapp-icc         |   3 +\n config/defconfig_x86_x32-native-linuxapp-gcc      |   3 +\n config/rte_config.h                               |   7 +-\n doc/guides/rel_notes/deprecation.rst              |   9 -\n drivers/bus/fslmc/fslmc_vfio.c                    |  10 +-\n drivers/bus/fslmc/portal/dpaa2_hw_pvt.h           |   2 +-\n drivers/bus/pci/linux/pci.c                       |   8 +-\n drivers/crypto/dpaa_sec/dpaa_sec.c                |   2 +-\n drivers/net/mlx4/mlx4_mr.c                        |   4 +-\n drivers/net/mlx5/mlx5.c                           |   3 +-\n drivers/net/mlx5/mlx5_mr.c                        |   4 +-\n drivers/net/virtio/virtio_user/vhost_kernel.c     |   4 +-\n lib/librte_eal/bsdapp/eal/eal.c                   |  12 +-\n lib/librte_eal/bsdapp/eal/eal_hugepage_info.c     |  17 +-\n lib/librte_eal/bsdapp/eal/eal_memory.c            | 207 ++++-\n lib/librte_eal/common/eal_common_memory.c         | 581 ++++++++++++--\n lib/librte_eal/common/eal_common_memzone.c        |  48 +-\n lib/librte_eal/common/eal_hugepages.h             |   1 -\n lib/librte_eal/common/eal_internal_cfg.h          |   2 +-\n lib/librte_eal/common/include/rte_eal_memconfig.h |  22 +-\n lib/librte_eal/common/include/rte_memory.h        |  56 +-\n lib/librte_eal/common/include/rte_memzone.h       |   1 -\n lib/librte_eal/common/malloc_elem.c               |  12 +-\n lib/librte_eal/common/malloc_elem.h               |   6 +-\n lib/librte_eal/common/malloc_heap.c               |  62 +-\n lib/librte_eal/common/rte_malloc.c                |  22 +-\n lib/librte_eal/linuxapp/eal/eal.c                 |  15 +-\n lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  25 +-\n lib/librte_eal/linuxapp/eal/eal_memory.c          | 913 +++++++++++++++-------\n lib/librte_eal/linuxapp/eal/eal_vfio.c            |   9 +-\n lib/librte_eal/rte_eal_version.map                |   3 +-\n lib/librte_mempool/rte_mempool.c                  |   9 +-\n test/test/test_malloc.c                           |  30 +-\n test/test/test_memory.c                           |  10 +-\n test/test/test_memzone.c                          |  12 +-\n 37 files changed, 1563 insertions(+), 589 deletions(-)",
    "diff": "diff --git a/config/common_base b/config/common_base\nindex 7abf7c6..0ca1a06 100644\n--- a/config/common_base\n+++ b/config/common_base\n@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64\n CONFIG_RTE_LIBRTE_EAL=y\n CONFIG_RTE_MAX_LCORE=128\n CONFIG_RTE_MAX_NUMA_NODES=8\n-CONFIG_RTE_MAX_MEMSEG=256\n+CONFIG_RTE_MAX_MEMSEG_LISTS=64\n+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages\n+# or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller\n+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192\n+CONFIG_RTE_MAX_MEM_MB_PER_LIST=32768\n+# a \"type\" is a combination of page size and NUMA node. total number of memseg\n+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split\n+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or\n+# RTE_MAX_MEM_MB_PER_TYPE megabytes of memory (split over multiple lists of\n+# RTE_MAX_MEM_MB_PER_LIST), whichever is smaller\n+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768\n+CONFIG_RTE_MAX_MEM_MB_PER_TYPE=131072\n+# global maximum usable amount of VA, in megabytes\n+CONFIG_RTE_MAX_MEM_MB=524288\n CONFIG_RTE_MAX_MEMZONE=2560\n CONFIG_RTE_MAX_TAILQ=32\n CONFIG_RTE_ENABLE_ASSERT=n\ndiff --git a/config/defconfig_i686-native-linuxapp-gcc b/config/defconfig_i686-native-linuxapp-gcc\nindex a42ba4f..1178fe3 100644\n--- a/config/defconfig_i686-native-linuxapp-gcc\n+++ b/config/defconfig_i686-native-linuxapp-gcc\n@@ -46,3 +46,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n\n # AVP PMD is not supported on 32-bit\n #\n CONFIG_RTE_LIBRTE_AVP_PMD=n\n+\n+# 32-bit doesn't break up memory in lists, but does have VA allocation limit\n+CONFIG_RTE_MAX_MEM_MB=2048\ndiff --git a/config/defconfig_i686-native-linuxapp-icc b/config/defconfig_i686-native-linuxapp-icc\nindex 144ba0a..f096e22 100644\n--- a/config/defconfig_i686-native-linuxapp-icc\n+++ b/config/defconfig_i686-native-linuxapp-icc\n@@ -51,3 +51,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n\n # AVP PMD is not supported on 32-bit\n #\n CONFIG_RTE_LIBRTE_AVP_PMD=n\n+\n+# 32-bit doesn't break up memory in lists, but does have VA allocation limit\n+CONFIG_RTE_MAX_MEM_MB=2048\ndiff --git a/config/defconfig_x86_x32-native-linuxapp-gcc b/config/defconfig_x86_x32-native-linuxapp-gcc\nindex b6206a5..57d000d 100644\n--- a/config/defconfig_x86_x32-native-linuxapp-gcc\n+++ b/config/defconfig_x86_x32-native-linuxapp-gcc\n@@ -26,3 +26,6 @@ CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n\n # AVP PMD is not supported on 32-bit\n #\n CONFIG_RTE_LIBRTE_AVP_PMD=n\n+\n+# 32-bit doesn't break up memory in lists, but does have VA allocation limit\n+CONFIG_RTE_MAX_MEM_MB=2048\ndiff --git a/config/rte_config.h b/config/rte_config.h\nindex 72c0aa2..e42be1c 100644\n--- a/config/rte_config.h\n+++ b/config/rte_config.h\n@@ -21,7 +21,12 @@\n /****** library defines ********/\n \n /* EAL defines */\n-#define RTE_MAX_MEMSEG 512\n+#define RTE_MAX_MEMSEG_LISTS 128\n+#define RTE_MAX_MEMSEG_PER_LIST 8192\n+#define RTE_MAX_MEM_MB_PER_LIST 32768\n+#define RTE_MAX_MEMSEG_PER_TYPE 32768\n+#define RTE_MAX_MEM_MB_PER_TYPE 65536\n+#define RTE_MAX_MEM_MB 524288\n #define RTE_MAX_MEMZONE 2560\n #define RTE_MAX_TAILQ 32\n #define RTE_LOG_LEVEL RTE_LOG_INFO\ndiff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst\nindex ec70b5f..c9f2703 100644\n--- a/doc/guides/rel_notes/deprecation.rst\n+++ b/doc/guides/rel_notes/deprecation.rst\n@@ -38,15 +38,6 @@ Deprecation Notices\n   success and failure, respectively.  
This will change to 1 and 0 for true and\n   false, respectively, to make use of the function more intuitive.\n \n-* eal: due to internal data layout reorganization, there will be changes to\n-  several structures and functions as a result of coming changes to support\n-  memory hotplug in v18.05.\n-  ``rte_eal_get_physmem_layout`` will be deprecated and removed in subsequent\n-  releases.\n-  ``rte_mem_config`` contents will change due to switch to memseg lists.\n-  ``rte_memzone`` member ``memseg_id`` will no longer serve any useful purpose\n-  and will be removed.\n-\n * eal: a new set of mbuf mempool ops name APIs for user, platform and best\n   mempool names have been defined in ``rte_mbuf`` in v18.02. The uses of\n   ``rte_eal_mbuf_default_mempool_ops`` shall be replaced by\ndiff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c\nindex ccdbeff..31831e3 100644\n--- a/drivers/bus/fslmc/fslmc_vfio.c\n+++ b/drivers/bus/fslmc/fslmc_vfio.c\n@@ -194,7 +194,8 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)\n }\n \n static int\n-fslmc_vfio_map(const struct rte_memseg *ms, void *arg)\n+fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tint *n_segs = arg;\n \tstruct fslmc_vfio_group *group;\n@@ -236,18 +237,11 @@ fslmc_vfio_map(const struct rte_memseg *ms, void *arg)\n \n int rte_fslmc_vfio_dmamap(void)\n {\n-\tconst struct rte_memseg *memseg;\n \tint i = 0;\n \n \tif (is_dma_done)\n \t\treturn 0;\n \n-\tmemseg = rte_eal_get_physmem_layout();\n-\tif (memseg == NULL) {\n-\t\tFSLMC_VFIO_LOG(ERR, \"Cannot get physical layout.\");\n-\t\treturn -ENODEV;\n-\t}\n-\n \tif (rte_memseg_walk(fslmc_vfio_map, &i) < 0)\n \t\treturn -1;\n \ndiff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h\nindex 45fd41e..72aae43 100644\n--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h\n+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h\n@@ -274,7 +274,7 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)\n \tif (dpaa2_virt_mode)\n \t\treturn vaddr;\n \n-\tmemseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr);\n+\tmemseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);\n \tif (memseg)\n \t\treturn memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);\n \treturn (size_t)NULL;\ndiff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c\nindex 6dda054..4630a80 100644\n--- a/drivers/bus/pci/linux/pci.c\n+++ b/drivers/bus/pci/linux/pci.c\n@@ -117,9 +117,10 @@ rte_pci_unmap_device(struct rte_pci_device *dev)\n }\n \n static int\n-find_max_end_va(const struct rte_memseg *ms, void *arg)\n+find_max_end_va(const struct rte_memseg_list *msl, void *arg)\n {\n-\tvoid *end_va = RTE_PTR_ADD(ms->addr, ms->len);\n+\tsize_t sz = msl->memseg_arr.len * msl->page_sz;\n+\tvoid *end_va = RTE_PTR_ADD(msl->base_va, sz);\n \tvoid **max_va = arg;\n \n \tif (*max_va < end_va)\n@@ -132,10 +133,11 @@ pci_find_max_end_va(void)\n {\n \tvoid *va = NULL;\n \n-\trte_memseg_walk(find_max_end_va, &va);\n+\trte_memseg_list_walk(find_max_end_va, &va);\n \treturn va;\n }\n \n+\n /* parse one line of the \"resource\" sysfs file (note that the 'line'\n  * string is modified)\n  */\ndiff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c\nindex a14e669..b685220 100644\n--- a/drivers/crypto/dpaa_sec/dpaa_sec.c\n+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c\n@@ -95,7 +95,7 @@ dpaa_mem_vtop(void *vaddr)\n {\n \tconst struct rte_memseg *ms;\n \n-\tms = rte_mem_virt2memseg(vaddr);\n+\tms 
= rte_mem_virt2memseg(vaddr, NULL);\n \tif (ms)\n \t\treturn ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);\n \treturn (size_t)NULL;\ndiff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c\nindex 47dd542..2ba609e 100644\n--- a/drivers/net/mlx4/mlx4_mr.c\n+++ b/drivers/net/mlx4/mlx4_mr.c\n@@ -142,10 +142,10 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)\n \t      (void *)mp, (void *)start, (void *)end,\n \t      (size_t)(end - start));\n \t/* Round start and end to page boundary if found in memory segments. */\n-\tms = rte_mem_virt2memseg((void *)start);\n+\tms = rte_mem_virt2memseg((void *)start, NULL);\n \tif (ms != NULL)\n \t\tstart = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);\n-\tms = rte_mem_virt2memseg((void *)end);\n+\tms = rte_mem_virt2memseg((void *)end, NULL);\n \tif (ms != NULL)\n \t\tend = RTE_ALIGN_CEIL(end, ms->hugepage_sz);\n \ndiff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c\nindex 1724b65..e228356 100644\n--- a/drivers/net/mlx5/mlx5.c\n+++ b/drivers/net/mlx5/mlx5.c\n@@ -478,7 +478,8 @@ static struct rte_pci_driver mlx5_driver;\n static void *uar_base;\n \n static int\n-find_lower_va_bound(const struct rte_memseg *ms, void *arg)\n+find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tvoid **addr = arg;\n \ndiff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c\nindex d8c04dc..6638185 100644\n--- a/drivers/net/mlx5/mlx5_mr.c\n+++ b/drivers/net/mlx5/mlx5_mr.c\n@@ -263,10 +263,10 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)\n \tmr->end = end;\n \n \t/* Round start and end to page boundary if found in memory segments. */\n-\tms = rte_mem_virt2memseg((void *)start);\n+\tms = rte_mem_virt2memseg((void *)start, NULL);\n \tif (ms != NULL)\n \t\tstart = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);\n-\tms = rte_mem_virt2memseg((void *)end);\n+\tms = rte_mem_virt2memseg((void *)end, NULL);\n \tif (ms != NULL)\n \t\tend = RTE_ALIGN_CEIL(end, ms->hugepage_sz);\n \ndiff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c\nindex 93d7efe..b244409 100644\n--- a/drivers/net/virtio/virtio_user/vhost_kernel.c\n+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c\n@@ -75,7 +75,8 @@ struct walk_arg {\n \tuint32_t region_nr;\n };\n static int\n-add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)\n+add_memory_region(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, size_t len, void *arg)\n {\n \tstruct walk_arg *wa = arg;\n \tstruct vhost_memory_region *mr;\n@@ -95,7 +96,6 @@ add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)\n \treturn 0;\n }\n \n-\n /* By default, vhost kernel module allows 64 regions, but DPDK allows\n  * 256 segments. 
As a relief, below function merges those virtually\n  * adjacent memsegs into one region.\ndiff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c\nindex f44b904..d009cf0 100644\n--- a/lib/librte_eal/bsdapp/eal/eal.c\n+++ b/lib/librte_eal/bsdapp/eal/eal.c\n@@ -64,8 +64,8 @@ static int mem_cfg_fd = -1;\n static struct flock wr_lock = {\n \t\t.l_type = F_WRLCK,\n \t\t.l_whence = SEEK_SET,\n-\t\t.l_start = offsetof(struct rte_mem_config, memseg),\n-\t\t.l_len = sizeof(early_mem_config.memseg),\n+\t\t.l_start = offsetof(struct rte_mem_config, memsegs),\n+\t\t.l_len = sizeof(early_mem_config.memsegs),\n };\n \n /* Address of global and public configuration */\n@@ -430,11 +430,11 @@ eal_parse_args(int argc, char **argv)\n }\n \n static int\n-check_socket(const struct rte_memseg *ms, void *arg)\n+check_socket(const struct rte_memseg_list *msl, void *arg)\n {\n \tint *socket_id = arg;\n \n-\tif (ms->socket_id == *socket_id)\n+\tif (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)\n \t\treturn 1;\n \n \treturn 0;\n@@ -447,10 +447,11 @@ eal_check_mem_on_local_socket(void)\n \n \tsocket_id = rte_lcore_to_socket_id(rte_config.master_lcore);\n \n-\tif (rte_memseg_walk(check_socket, &socket_id) == 0)\n+\tif (rte_memseg_list_walk(check_socket, &socket_id) == 0)\n \t\tRTE_LOG(WARNING, EAL, \"WARNING: Master core has no memory on local socket!\\n\");\n }\n \n+\n static int\n sync_func(__attribute__((unused)) void *arg)\n {\n@@ -561,7 +562,6 @@ rte_eal_init(int argc, char **argv)\n \trte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();\n \n \tif (internal_config.no_hugetlbfs == 0 &&\n-\t\t\tinternal_config.process_type != RTE_PROC_SECONDARY &&\n \t\t\teal_hugepage_info_init() < 0) {\n \t\trte_eal_init_alert(\"Cannot get hugepage information.\");\n \t\trte_errno = EACCES;\ndiff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c\nindex be2dbf0..ba44da0 100644\n--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c\n+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c\n@@ -47,12 +47,18 @@ eal_hugepage_info_init(void)\n \tstruct hugepage_info *hpi = &internal_config.hugepage_info[0];\n \tstruct hugepage_info *tmp_hpi;\n \n+\tinternal_config.num_hugepage_sizes = 1;\n+\n+\t/* nothing more to be done for secondary */\n+\tif (rte_eal_process_type() == RTE_PROC_SECONDARY)\n+\t\treturn 0;\n+\n \tsysctl_size = sizeof(num_buffers);\n \terror = sysctlbyname(\"hw.contigmem.num_buffers\", &num_buffers,\n \t\t\t&sysctl_size, NULL, 0);\n \n \tif (error != 0) {\n-\t\tRTE_LOG(ERR, EAL, \"could not read sysctl hw.contigmem.num_buffers\");\n+\t\tRTE_LOG(ERR, EAL, \"could not read sysctl hw.contigmem.num_buffers\\n\");\n \t\treturn -1;\n \t}\n \n@@ -61,7 +67,7 @@ eal_hugepage_info_init(void)\n \t\t\t&sysctl_size, NULL, 0);\n \n \tif (error != 0) {\n-\t\tRTE_LOG(ERR, EAL, \"could not read sysctl hw.contigmem.buffer_size\");\n+\t\tRTE_LOG(ERR, EAL, \"could not read sysctl hw.contigmem.buffer_size\\n\");\n \t\treturn -1;\n \t}\n \n@@ -81,22 +87,21 @@ eal_hugepage_info_init(void)\n \t\tRTE_LOG(INFO, EAL, \"Contigmem driver has %d buffers, each of size %dKB\\n\",\n \t\t\t\tnum_buffers, (int)(buffer_size>>10));\n \n-\tinternal_config.num_hugepage_sizes = 1;\n \thpi->hugedir = CONTIGMEM_DEV;\n \thpi->hugepage_sz = buffer_size;\n \thpi->num_pages[0] = num_buffers;\n \thpi->lock_descriptor = fd;\n \n \ttmp_hpi = create_shared_memory(eal_hugepage_info_path(),\n-\t\t\t\t\tsizeof(struct 
hugepage_info));\n+\t\t\tsizeof(internal_config.hugepage_info));\n \tif (tmp_hpi == NULL ) {\n \t\tRTE_LOG(ERR, EAL, \"Failed to create shared memory!\\n\");\n \t\treturn -1;\n \t}\n \n-\tmemcpy(tmp_hpi, hpi, sizeof(struct hugepage_info));\n+\tmemcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));\n \n-\tif ( munmap(tmp_hpi, sizeof(struct hugepage_info)) < 0) {\n+\tif (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {\n \t\tRTE_LOG(ERR, EAL, \"Failed to unmap shared memory!\\n\");\n \t\treturn -1;\n \t}\ndiff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c\nindex bdfb882..6692b3d 100644\n--- a/lib/librte_eal/bsdapp/eal/eal_memory.c\n+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c\n@@ -6,6 +6,8 @@\n #include <sys/types.h>\n #include <sys/sysctl.h>\n #include <inttypes.h>\n+#include <errno.h>\n+#include <string.h>\n #include <fcntl.h>\n \n #include <rte_eal.h>\n@@ -41,37 +43,135 @@ rte_eal_hugepage_init(void)\n \tstruct rte_mem_config *mcfg;\n \tuint64_t total_mem = 0;\n \tvoid *addr;\n-\tunsigned i, j, seg_idx = 0;\n+\tunsigned int i, j, seg_idx = 0;\n \n \t/* get pointer to global configuration */\n \tmcfg = rte_eal_get_configuration()->mem_config;\n \n \t/* for debug purposes, hugetlbfs can be disabled */\n \tif (internal_config.no_hugetlbfs) {\n-\t\taddr = malloc(internal_config.memory);\n-\t\tmcfg->memseg[0].iova = (rte_iova_t)(uintptr_t)addr;\n-\t\tmcfg->memseg[0].addr = addr;\n-\t\tmcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;\n-\t\tmcfg->memseg[0].len = internal_config.memory;\n-\t\tmcfg->memseg[0].socket_id = 0;\n+\t\tstruct rte_memseg_list *msl;\n+\t\tstruct rte_fbarray *arr;\n+\t\tstruct rte_memseg *ms;\n+\t\tuint64_t page_sz;\n+\t\tint n_segs, cur_seg;\n+\n+\t\t/* create a memseg list */\n+\t\tmsl = &mcfg->memsegs[0];\n+\n+\t\tpage_sz = RTE_PGSIZE_4K;\n+\t\tn_segs = internal_config.memory / page_sz;\n+\n+\t\tif (rte_fbarray_init(&msl->memseg_arr, \"nohugemem\", n_segs,\n+\t\t\t\tsizeof(struct rte_memseg))) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot allocate memseg list\\n\");\n+\t\t\treturn -1;\n+\t\t}\n+\n+\t\taddr = mmap(NULL, internal_config.memory,\n+\t\t\t\tPROT_READ | PROT_WRITE,\n+\t\t\t\tMAP_PRIVATE | MAP_ANONYMOUS, 0, 0);\n+\t\tif (addr == MAP_FAILED) {\n+\t\t\tRTE_LOG(ERR, EAL, \"%s: mmap() failed: %s\\n\", __func__,\n+\t\t\t\t\tstrerror(errno));\n+\t\t\treturn -1;\n+\t\t}\n+\t\tmsl->base_va = addr;\n+\t\tmsl->page_sz = page_sz;\n+\t\tmsl->socket_id = 0;\n+\n+\t\t/* populate memsegs. 
each memseg is 1 page long */\n+\t\tfor (cur_seg = 0; cur_seg < n_segs; cur_seg++) {\n+\t\t\tarr = &mcfg->memsegs[cur_seg].memseg_arr;\n+\n+\t\t\tms = rte_fbarray_get(arr, cur_seg);\n+\t\t\tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n+\t\t\t\tms->iova = (uintptr_t)addr;\n+\t\t\telse\n+\t\t\t\tms->iova = RTE_BAD_IOVA;\n+\t\t\tms->addr = addr;\n+\t\t\tms->hugepage_sz = page_sz;\n+\t\t\tms->len = page_sz;\n+\t\t\tms->socket_id = 0;\n+\n+\t\t\trte_fbarray_set_used(arr, cur_seg);\n+\n+\t\t\taddr = RTE_PTR_ADD(addr, page_sz);\n+\t\t}\n \t\treturn 0;\n \t}\n \n \t/* map all hugepages and sort them */\n \tfor (i = 0; i < internal_config.num_hugepage_sizes; i ++){\n \t\tstruct hugepage_info *hpi;\n+\t\tuint64_t page_sz, mem_needed;\n+\t\tunsigned int n_pages, max_pages;\n \n \t\thpi = &internal_config.hugepage_info[i];\n-\t\tfor (j = 0; j < hpi->num_pages[0]; j++) {\n+\t\tpage_sz = hpi->hugepage_sz;\n+\t\tmax_pages = hpi->num_pages[0];\n+\t\tmem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,\n+\t\t\t\tpage_sz);\n+\n+\t\tn_pages = RTE_MIN(mem_needed / page_sz, max_pages);\n+\n+\t\tfor (j = 0; j < n_pages; j++) {\n+\t\t\tstruct rte_memseg_list *msl;\n+\t\t\tstruct rte_fbarray *arr;\n \t\t\tstruct rte_memseg *seg;\n+\t\t\tint msl_idx, ms_idx;\n \t\t\trte_iova_t physaddr;\n \t\t\tint error;\n \t\t\tsize_t sysctl_size = sizeof(physaddr);\n \t\t\tchar physaddr_str[64];\n \n-\t\t\taddr = mmap(NULL, hpi->hugepage_sz, PROT_READ|PROT_WRITE,\n-\t\t\t\t    MAP_SHARED, hpi->lock_descriptor,\n-\t\t\t\t    j * EAL_PAGE_SIZE);\n+\t\t\tfor (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;\n+\t\t\t\t\tmsl_idx++) {\n+\t\t\t\tbool empty;\n+\t\t\t\tmsl = &mcfg->memsegs[msl_idx];\n+\t\t\t\tarr = &msl->memseg_arr;\n+\n+\t\t\t\tif (msl->page_sz != page_sz)\n+\t\t\t\t\tcontinue;\n+\n+\t\t\t\tempty = arr->count == 0;\n+\n+\t\t\t\t/* we need 1, plus hole if not empty */\n+\t\t\t\tms_idx = rte_fbarray_find_next_n_free(arr,\n+\t\t\t\t\t\t0, empty ? 1 : 0);\n+\n+\t\t\t\t/* memseg list is full? */\n+\t\t\t\tif (ms_idx < 0)\n+\t\t\t\t\tcontinue;\n+\n+\t\t\t\t/* leave some space between memsegs, they are\n+\t\t\t\t * not IOVA contiguous, so they shouldn't be VA\n+\t\t\t\t * contiguous either.\n+\t\t\t\t */\n+\t\t\t\tif (!empty)\n+\t\t\t\t\tms_idx++;\n+\n+\t\t\t\tbreak;\n+\t\t\t}\n+\t\t\tif (msl_idx == RTE_MAX_MEMSEG_LISTS) {\n+\t\t\t\tRTE_LOG(ERR, EAL, \"Could not find space for memseg. 
Please increase %s and/or %s in configuration.\\n\",\n+\t\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),\n+\t\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));\n+\t\t\t\treturn -1;\n+\t\t\t}\n+\t\t\tarr = &msl->memseg_arr;\n+\t\t\tseg = rte_fbarray_get(arr, ms_idx);\n+\n+\t\t\taddr = RTE_PTR_ADD(msl->base_va,\n+\t\t\t\t\t(size_t)msl->page_sz * ms_idx);\n+\n+\t\t\t/* address is already mapped in memseg list, so using\n+\t\t\t * MAP_FIXED here is safe.\n+\t\t\t */\n+\t\t\taddr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,\n+\t\t\t\t\tMAP_SHARED | MAP_FIXED,\n+\t\t\t\t\thpi->lock_descriptor,\n+\t\t\t\t\tj * EAL_PAGE_SIZE);\n \t\t\tif (addr == MAP_FAILED) {\n \t\t\t\tRTE_LOG(ERR, EAL, \"Failed to mmap buffer %u from %s\\n\",\n \t\t\t\t\t\tj, hpi->hugedir);\n@@ -88,33 +188,60 @@ rte_eal_hugepage_init(void)\n \t\t\t\treturn -1;\n \t\t\t}\n \n-\t\t\tseg = &mcfg->memseg[seg_idx++];\n \t\t\tseg->addr = addr;\n \t\t\tseg->iova = physaddr;\n-\t\t\tseg->hugepage_sz = hpi->hugepage_sz;\n-\t\t\tseg->len = hpi->hugepage_sz;\n+\t\t\tseg->hugepage_sz = page_sz;\n+\t\t\tseg->len = page_sz;\n \t\t\tseg->nchannel = mcfg->nchannel;\n \t\t\tseg->nrank = mcfg->nrank;\n \t\t\tseg->socket_id = 0;\n \n+\t\t\trte_fbarray_set_used(arr, ms_idx);\n+\n \t\t\tRTE_LOG(INFO, EAL, \"Mapped memory segment %u @ %p: physaddr:0x%\"\n \t\t\t\t\tPRIx64\", len %zu\\n\",\n-\t\t\t\t\tseg_idx, addr, physaddr, hpi->hugepage_sz);\n-\t\t\tif (total_mem >= internal_config.memory ||\n-\t\t\t\t\tseg_idx >= RTE_MAX_MEMSEG)\n-\t\t\t\tbreak;\n+\t\t\t\t\tseg_idx, addr, physaddr, page_sz);\n+\n+\t\t\ttotal_mem += seg->len;\n \t\t}\n+\t\tif (total_mem >= internal_config.memory)\n+\t\t\tbreak;\n+\t}\n+\tif (total_mem < internal_config.memory) {\n+\t\tRTE_LOG(ERR, EAL, \"Couldn't reserve requested memory, requested: %\" PRIu64 \"M available: %\" PRIu64 \"M\\n\",\n+\t\t\t\tinternal_config.memory >> 20, total_mem >> 20);\n+\t\treturn -1;\n \t}\n \treturn 0;\n }\n \n+struct attach_walk_args {\n+\tint fd_hugepage;\n+\tint seg_idx;\n+};\n+static int\n+attach_segment(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n+{\n+\tstruct attach_walk_args *wa = arg;\n+\tvoid *addr;\n+\n+\taddr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,\n+\t\t\tMAP_SHARED | MAP_FIXED, wa->fd_hugepage,\n+\t\t\twa->seg_idx * EAL_PAGE_SIZE);\n+\tif (addr == MAP_FAILED || addr != ms->addr)\n+\t\treturn -1;\n+\twa->seg_idx++;\n+\n+\treturn 0;\n+}\n+\n int\n rte_eal_hugepage_attach(void)\n {\n \tconst struct hugepage_info *hpi;\n \tint fd_hugepage_info, fd_hugepage = -1;\n-\tunsigned i = 0;\n-\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tunsigned int i;\n \n \t/* Obtain a file descriptor for hugepage_info */\n \tfd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);\n@@ -124,41 +251,43 @@ rte_eal_hugepage_attach(void)\n \t}\n \n \t/* Map the shared hugepage_info into the process address spaces */\n-\thpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE,\n-\t\t\tfd_hugepage_info, 0);\n+\thpi = mmap(NULL, sizeof(internal_config.hugepage_info),\n+\t\t\tPROT_READ, MAP_PRIVATE, fd_hugepage_info, 0);\n \tif (hpi == MAP_FAILED) {\n \t\tRTE_LOG(ERR, EAL, \"Could not mmap %s\\n\", eal_hugepage_info_path());\n \t\tgoto error;\n \t}\n \n-\t/* Obtain a file descriptor for contiguous memory */\n-\tfd_hugepage = open(hpi->hugedir, O_RDWR);\n-\tif (fd_hugepage < 0) {\n-\t\tRTE_LOG(ERR, EAL, \"Could not open %s\\n\", hpi->hugedir);\n-\t\tgoto error;\n-\t}\n+\tfor (i = 0; i < 
internal_config.num_hugepage_sizes; i++) {\n+\t\tconst struct hugepage_info *cur_hpi = &hpi[i];\n+\t\tstruct attach_walk_args wa;\n \n-\t/* Map the contiguous memory into each memory segment */\n-\tfor (i = 0; i < hpi->num_pages[0]; i++) {\n+\t\tmemset(&wa, 0, sizeof(wa));\n \n-\t\tvoid *addr;\n-\t\tstruct rte_memseg *seg = &mcfg->memseg[i];\n+\t\t/* Obtain a file descriptor for contiguous memory */\n+\t\tfd_hugepage = open(cur_hpi->hugedir, O_RDWR);\n+\t\tif (fd_hugepage < 0) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not open %s\\n\",\n+\t\t\t\t\tcur_hpi->hugedir);\n+\t\t\tgoto error;\n+\t\t}\n+\t\twa.fd_hugepage = fd_hugepage;\n+\t\twa.seg_idx = 0;\n \n-\t\taddr = mmap(seg->addr, hpi->hugepage_sz, PROT_READ|PROT_WRITE,\n-\t\t\t    MAP_SHARED|MAP_FIXED, fd_hugepage,\n-\t\t\t    i * EAL_PAGE_SIZE);\n-\t\tif (addr == MAP_FAILED || addr != seg->addr) {\n+\t\t/* Map the contiguous memory into each memory segment */\n+\t\tif (rte_memseg_walk(attach_segment, &wa) < 0) {\n \t\t\tRTE_LOG(ERR, EAL, \"Failed to mmap buffer %u from %s\\n\",\n-\t\t\t\ti, hpi->hugedir);\n+\t\t\t\twa.seg_idx, cur_hpi->hugedir);\n \t\t\tgoto error;\n \t\t}\n \n+\t\tclose(fd_hugepage);\n+\t\tfd_hugepage = -1;\n \t}\n \n \t/* hugepage_info is no longer required */\n-\tmunmap((void *)(uintptr_t)hpi, sizeof(struct hugepage_info));\n+\tmunmap((void *)(uintptr_t)hpi, sizeof(internal_config.hugepage_info));\n \tclose(fd_hugepage_info);\n-\tclose(fd_hugepage);\n \treturn 0;\n \n error:\ndiff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c\nindex fd78d2f..0a6d678 100644\n--- a/lib/librte_eal/common/eal_common_memory.c\n+++ b/lib/librte_eal/common/eal_common_memory.c\n@@ -13,6 +13,7 @@\n #include <sys/mman.h>\n #include <sys/queue.h>\n \n+#include <rte_fbarray.h>\n #include <rte_memory.h>\n #include <rte_eal.h>\n #include <rte_eal_memconfig.h>\n@@ -30,6 +31,8 @@\n  * which is a multiple of hugepage size.\n  */\n \n+#define MEMSEG_LIST_FMT \"memseg-%\" PRIu64 \"k-%i-%i\"\n+\n static uint64_t baseaddr_offset;\n static uint64_t system_page_sz;\n \n@@ -120,15 +123,393 @@ eal_get_virtual_area(void *requested_addr, size_t *size,\n \treturn aligned_addr;\n }\n \n-/*\n- * Return a pointer to a read-only table of struct rte_physmem_desc\n- * elements, containing the layout of all addressable physical\n- * memory. 
The last element of the table contains a NULL address.\n- */\n-const struct rte_memseg *\n-rte_eal_get_physmem_layout(void)\n+static uint64_t\n+get_mem_amount(uint64_t page_sz, uint64_t max_mem)\n+{\n+\tuint64_t area_sz, max_pages;\n+\n+\t/* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */\n+\tmax_pages = RTE_MAX_MEMSEG_PER_LIST;\n+\tmax_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);\n+\n+\tarea_sz = RTE_MIN(page_sz * max_pages, max_mem);\n+\n+\t/* make sure the list isn't smaller than the page size */\n+\tarea_sz = RTE_MAX(area_sz, page_sz);\n+\n+\treturn RTE_ALIGN(area_sz, page_sz);\n+}\n+\n+static int\n+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,\n+\t\tuint64_t max_mem, int socket_id, int type_msl_idx)\n+{\n+\tchar name[RTE_FBARRAY_NAME_LEN];\n+\tuint64_t mem_amount;\n+\tint max_segs;\n+\n+\tmem_amount = get_mem_amount(page_sz, max_mem);\n+\tmax_segs = mem_amount / page_sz;\n+\n+\tsnprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,\n+\t\t type_msl_idx);\n+\tif (rte_fbarray_init(&msl->memseg_arr, name, max_segs,\n+\t\t\tsizeof(struct rte_memseg))) {\n+\t\tRTE_LOG(ERR, EAL, \"Cannot allocate memseg list: %s\\n\",\n+\t\t\trte_strerror(rte_errno));\n+\t\treturn -1;\n+\t}\n+\n+\tmsl->page_sz = page_sz;\n+\tmsl->socket_id = socket_id;\n+\tmsl->base_va = NULL;\n+\n+\tRTE_LOG(DEBUG, EAL, \"Memseg list allocated: 0x%zxkB at socket %i\\n\",\n+\t\t\t(size_t)page_sz >> 10, socket_id);\n+\n+\treturn 0;\n+}\n+\n+static int\n+alloc_va_space(struct rte_memseg_list *msl)\n+{\n+\tuint64_t page_sz;\n+\tsize_t mem_sz;\n+\tvoid *addr;\n+\tint flags = 0;\n+\n+#ifdef RTE_ARCH_PPC_64\n+\tflags |= MAP_HUGETLB;\n+#endif\n+\n+\tpage_sz = msl->page_sz;\n+\tmem_sz = page_sz * msl->memseg_arr.len;\n+\n+\taddr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);\n+\tif (addr == NULL) {\n+\t\tif (rte_errno == EADDRNOTAVAIL)\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\\n\",\n+\t\t\t\t(unsigned long long)mem_sz, msl->base_va);\n+\t\telse\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot reserve memory\\n\");\n+\t\treturn -1;\n+\t}\n+\tmsl->base_va = addr;\n+\n+\treturn 0;\n+}\n+\n+static int __rte_unused\n+memseg_primary_init_32(void)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint active_sockets, hpi_idx, msl_idx = 0;\n+\tunsigned int socket_id, i;\n+\tstruct rte_memseg_list *msl;\n+\tuint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;\n+\tuint64_t max_mem;\n+\n+\t/* no-huge does not need this at all */\n+\tif (internal_config.no_hugetlbfs)\n+\t\treturn 0;\n+\n+\t/* this is a giant hack, but desperate times call for desperate\n+\t * measures. in legacy 32-bit mode, we cannot preallocate VA space,\n+\t * because having upwards of 2 gigabytes of VA space already mapped will\n+\t * interfere with our ability to map and sort hugepages.\n+\t *\n+\t * therefore, in legacy 32-bit mode, we will be initializing memseg\n+\t * lists much later - in eal_memory.c, right after we unmap all the\n+\t * unneeded pages. this will not affect secondary processes, as those\n+\t * should be able to mmap the space without (too many) problems.\n+\t */\n+\tif (internal_config.legacy_mem)\n+\t\treturn 0;\n+\n+\t/* 32-bit mode is a very special case. 
we cannot know in advance where\n+\t * the user will want to allocate their memory, so we have to do some\n+\t * heuristics.\n+\t */\n+\tactive_sockets = 0;\n+\ttotal_requested_mem = 0;\n+\tif (internal_config.force_sockets)\n+\t\tfor (i = 0; i < rte_socket_count(); i++) {\n+\t\t\tuint64_t mem;\n+\n+\t\t\tsocket_id = rte_socket_id_by_idx(i);\n+\t\t\tmem = internal_config.socket_mem[socket_id];\n+\n+\t\t\tif (mem == 0)\n+\t\t\t\tcontinue;\n+\n+\t\t\tactive_sockets++;\n+\t\t\ttotal_requested_mem += mem;\n+\t\t}\n+\telse\n+\t\ttotal_requested_mem = internal_config.memory;\n+\n+\tmax_mem = (uint64_t) RTE_MAX_MEM_MB_PER_TYPE << 20;\n+\tif (total_requested_mem > max_mem) {\n+\t\tRTE_LOG(ERR, EAL, \"Invalid parameters: 32-bit process can at most use %uM of memory\\n\",\n+\t\t\t\t(unsigned int)(max_mem >> 20));\n+\t\treturn -1;\n+\t}\n+\ttotal_extra_mem = max_mem - total_requested_mem;\n+\textra_mem_per_socket = active_sockets == 0 ? total_extra_mem :\n+\t\t\ttotal_extra_mem / active_sockets;\n+\n+\t/* the allocation logic is a little bit convoluted, but here's how it\n+\t * works, in a nutshell:\n+\t *  - if user hasn't specified on which sockets to allocate memory via\n+\t *    --socket-mem, we allocate all of our memory on master core socket.\n+\t *  - if user has specified sockets to allocate memory on, there may be\n+\t *    some \"unused\" memory left (e.g. if user has specified --socket-mem\n+\t *    such that not all memory adds up to 2 gigabytes), so add it to all\n+\t *    sockets that are in use equally.\n+\t *\n+\t * page sizes are sorted by size in descending order, so we can safely\n+\t * assume that we dispense with bigger page sizes first.\n+\t */\n+\n+\t/* create memseg lists */\n+\tfor (i = 0; i < rte_socket_count(); i++) {\n+\t\tint hp_sizes = (int) internal_config.num_hugepage_sizes;\n+\t\tuint64_t max_socket_mem, cur_socket_mem;\n+\t\tunsigned int master_lcore_socket;\n+\t\tstruct rte_config *cfg = rte_eal_get_configuration();\n+\t\tbool skip;\n+\n+\t\tsocket_id = rte_socket_id_by_idx(i);\n+\n+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES\n+\t\tif (socket_id > 0)\n+\t\t\tbreak;\n+#endif\n+\n+\t\t/* if we didn't specifically request memory on this socket */\n+\t\tskip = active_sockets != 0 &&\n+\t\t\t\tinternal_config.socket_mem[socket_id] == 0;\n+\t\t/* ...or if we didn't specifically request memory on *any*\n+\t\t * socket, and this is not master lcore\n+\t\t */\n+\t\tmaster_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);\n+\t\tskip |= active_sockets == 0 && socket_id != master_lcore_socket;\n+\n+\t\tif (skip) {\n+\t\t\tRTE_LOG(DEBUG, EAL, \"Will not preallocate memory on socket %u\\n\",\n+\t\t\t\t\tsocket_id);\n+\t\t\tcontinue;\n+\t\t}\n+\n+\t\t/* max amount of memory on this socket */\n+\t\tmax_socket_mem = (active_sockets != 0 ?\n+\t\t\t\t\tinternal_config.socket_mem[socket_id] :\n+\t\t\t\t\tinternal_config.memory) +\n+\t\t\t\t\textra_mem_per_socket;\n+\t\tcur_socket_mem = 0;\n+\n+\t\tfor (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {\n+\t\t\tuint64_t max_pagesz_mem, cur_pagesz_mem = 0;\n+\t\t\tuint64_t hugepage_sz;\n+\t\t\tstruct hugepage_info *hpi;\n+\t\t\tint type_msl_idx, max_segs, total_segs = 0;\n+\n+\t\t\thpi = &internal_config.hugepage_info[hpi_idx];\n+\t\t\thugepage_sz = hpi->hugepage_sz;\n+\n+\t\t\tmax_segs = RTE_MAX_MEMSEG_PER_TYPE;\n+\t\t\tmax_pagesz_mem = max_socket_mem - cur_socket_mem;\n+\n+\t\t\t/* make it multiple of page size */\n+\t\t\tmax_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,\n+\t\t\t\t\thugepage_sz);\n+\n+\t\t\tRTE_LOG(DEBUG, EAL, \"Attempting 
to preallocate %\" PRIu64 \"M on socket %i\\n\",\n+\t\t\t\t\tmax_pagesz_mem >> 20, socket_id);\n+\n+\t\t\ttype_msl_idx = 0;\n+\t\t\twhile (cur_pagesz_mem < max_pagesz_mem &&\n+\t\t\t\t\ttotal_segs < max_segs) {\n+\t\t\t\tif (msl_idx >= RTE_MAX_MEMSEG_LISTS) {\n+\t\t\t\t\tRTE_LOG(ERR, EAL,\n+\t\t\t\t\t\t\"No more space in memseg lists, please increase %s\\n\",\n+\t\t\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));\n+\t\t\t\t\treturn -1;\n+\t\t\t\t}\n+\n+\t\t\t\tmsl = &mcfg->memsegs[msl_idx++];\n+\n+\t\t\t\tif (alloc_memseg_list(msl, hugepage_sz,\n+\t\t\t\t\t\tmax_pagesz_mem, socket_id,\n+\t\t\t\t\t\ttype_msl_idx))\n+\t\t\t\t\treturn -1;\n+\n+\t\t\t\ttotal_segs += msl->memseg_arr.len;\n+\t\t\t\tcur_pagesz_mem = total_segs * hugepage_sz;\n+\t\t\t\ttype_msl_idx++;\n+\n+\t\t\t\tif (alloc_va_space(msl)) {\n+\t\t\t\t\tRTE_LOG(ERR, EAL, \"Cannot allocate VA space for memseg list\\n\");\n+\t\t\t\t\treturn -1;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tcur_socket_mem += cur_pagesz_mem;\n+\t\t}\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static int __rte_unused\n+memseg_primary_init(void)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint i, socket_id, hpi_idx, msl_idx = 0;\n+\tstruct rte_memseg_list *msl;\n+\tuint64_t max_mem, total_mem;\n+\n+\t/* no-huge does not need this at all */\n+\tif (internal_config.no_hugetlbfs)\n+\t\treturn 0;\n+\n+\tmax_mem = (uint64_t)RTE_MAX_MEM_MB << 20;\n+\ttotal_mem = 0;\n+\n+\t/* create memseg lists */\n+\tfor (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;\n+\t\t\thpi_idx++) {\n+\t\tstruct hugepage_info *hpi;\n+\t\tuint64_t hugepage_sz;\n+\n+\t\thpi = &internal_config.hugepage_info[hpi_idx];\n+\t\thugepage_sz = hpi->hugepage_sz;\n+\n+\t\tfor (i = 0; i < (int) rte_socket_count(); i++) {\n+\t\t\tuint64_t max_type_mem, total_type_mem = 0;\n+\t\t\tint type_msl_idx, max_segs, total_segs = 0;\n+\n+\t\t\tsocket_id = rte_socket_id_by_idx(i);\n+\n+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES\n+\t\t\tif (socket_id > 0)\n+\t\t\t\tbreak;\n+#endif\n+\n+\t\t\tmax_type_mem = RTE_MIN(max_mem - total_mem,\n+\t\t\t\t(uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);\n+\t\t\tmax_segs = RTE_MAX_MEMSEG_PER_TYPE;\n+\n+\t\t\ttype_msl_idx = 0;\n+\t\t\twhile (total_type_mem < max_type_mem &&\n+\t\t\t\t\ttotal_segs < max_segs) {\n+\t\t\t\tuint64_t cur_max_mem;\n+\t\t\t\tif (msl_idx >= RTE_MAX_MEMSEG_LISTS) {\n+\t\t\t\t\tRTE_LOG(ERR, EAL,\n+\t\t\t\t\t\t\"No more space in memseg lists, please increase %s\\n\",\n+\t\t\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));\n+\t\t\t\t\treturn -1;\n+\t\t\t\t}\n+\n+\t\t\t\tmsl = &mcfg->memsegs[msl_idx++];\n+\n+\t\t\t\tcur_max_mem = max_type_mem - total_type_mem;\n+\t\t\t\tif (alloc_memseg_list(msl, hugepage_sz,\n+\t\t\t\t\t\tcur_max_mem, socket_id,\n+\t\t\t\t\t\ttype_msl_idx))\n+\t\t\t\t\treturn -1;\n+\n+\t\t\t\ttotal_segs += msl->memseg_arr.len;\n+\t\t\t\ttotal_type_mem = total_segs * hugepage_sz;\n+\t\t\t\ttype_msl_idx++;\n+\n+\t\t\t\tif (alloc_va_space(msl)) {\n+\t\t\t\t\tRTE_LOG(ERR, EAL, \"Cannot allocate VA space for memseg list\\n\");\n+\t\t\t\t\treturn -1;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\ttotal_mem += total_type_mem;\n+\t\t}\n+\t}\n+\treturn 0;\n+}\n+\n+static int\n+memseg_secondary_init(void)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint msl_idx = 0;\n+\tstruct rte_memseg_list *msl;\n+\n+\tfor (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {\n+\n+\t\tmsl = &mcfg->memsegs[msl_idx];\n+\n+\t\t/* skip empty memseg lists */\n+\t\tif (msl->memseg_arr.len == 0)\n+\t\t\tcontinue;\n+\n+\t\tif 
(rte_fbarray_attach(&msl->memseg_arr)) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot attach to primary process memseg lists\\n\");\n+\t\t\treturn -1;\n+\t\t}\n+\n+\t\t/* preallocate VA space */\n+\t\tif (alloc_va_space(msl)) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot preallocate VA space for hugepage memory\\n\");\n+\t\t\treturn -1;\n+\t\t}\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static struct rte_memseg *\n+virt2memseg(const void *addr, const struct rte_memseg_list *msl)\n {\n-\treturn rte_eal_get_configuration()->mem_config->memseg;\n+\tconst struct rte_fbarray *arr;\n+\tvoid *start, *end;\n+\tint ms_idx;\n+\n+\t/* a memseg list was specified, check if it's the right one */\n+\tstart = msl->base_va;\n+\tend = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);\n+\n+\tif (addr < start || addr >= end)\n+\t\treturn NULL;\n+\n+\t/* now, calculate index */\n+\tarr = &msl->memseg_arr;\n+\tms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;\n+\treturn rte_fbarray_get(arr, ms_idx);\n+}\n+\n+static struct rte_memseg_list *\n+virt2memseg_list(const void *addr)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tstruct rte_memseg_list *msl;\n+\tint msl_idx;\n+\n+\tfor (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {\n+\t\tvoid *start, *end;\n+\t\tmsl = &mcfg->memsegs[msl_idx];\n+\n+\t\tstart = msl->base_va;\n+\t\tend = RTE_PTR_ADD(start,\n+\t\t\t\t(size_t)msl->page_sz * msl->memseg_arr.len);\n+\t\tif (addr >= start && addr < end)\n+\t\t\tbreak;\n+\t}\n+\t/* if we didn't find our memseg list */\n+\tif (msl_idx == RTE_MAX_MEMSEG_LISTS)\n+\t\treturn NULL;\n+\treturn msl;\n+}\n+\n+__rte_experimental struct rte_memseg_list *\n+rte_mem_virt2memseg_list(const void *addr)\n+{\n+\treturn virt2memseg_list(addr);\n }\n \n struct virtiova {\n@@ -136,7 +517,8 @@ struct virtiova {\n \tvoid *virt;\n };\n static int\n-find_virt(const struct rte_memseg *ms, void *arg)\n+find_virt(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tstruct virtiova *vi = arg;\n \tif (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {\n@@ -161,49 +543,19 @@ rte_mem_iova2virt(rte_iova_t iova)\n \treturn vi.virt;\n }\n \n-struct virtms {\n-\tconst void *virt;\n-\tstruct rte_memseg *ms;\n-};\n-static int\n-find_memseg(const struct rte_memseg *ms, void *arg)\n-{\n-\tstruct virtms *vm = arg;\n-\n-\tif (arg >= ms->addr && arg < RTE_PTR_ADD(ms->addr, ms->len)) {\n-\t\tstruct rte_memseg *memseg, *found_ms;\n-\t\tint idx;\n-\n-\t\tmemseg = rte_eal_get_configuration()->mem_config->memseg;\n-\t\tidx = ms - memseg;\n-\t\tfound_ms = &memseg[idx];\n-\n-\t\tvm->ms = found_ms;\n-\t\treturn 1;\n-\t}\n-\treturn 0;\n-}\n-\n __rte_experimental struct rte_memseg *\n-rte_mem_virt2memseg(const void *addr)\n+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)\n {\n-\tstruct virtms vm;\n-\n-\tmemset(&vm, 0, sizeof(vm));\n-\n-\tvm.virt = addr;\n-\n-\trte_memseg_walk(find_memseg, &vm);\n-\n-\treturn vm.ms;\n+\treturn virt2memseg(addr, msl != NULL ? 
msl :\n+\t\t\trte_mem_virt2memseg_list(addr));\n }\n \n static int\n-physmem_size(const struct rte_memseg *ms, void *arg)\n+physmem_size(const struct rte_memseg_list *msl, void *arg)\n {\n \tuint64_t *total_len = arg;\n \n-\t*total_len += ms->len;\n+\t*total_len += msl->memseg_arr.count * msl->page_sz;\n \n \treturn 0;\n }\n@@ -214,32 +566,39 @@ rte_eal_get_physmem_size(void)\n {\n \tuint64_t total_len = 0;\n \n-\trte_memseg_walk(physmem_size, &total_len);\n+\trte_memseg_list_walk(physmem_size, &total_len);\n \n \treturn total_len;\n }\n \n static int\n-dump_memseg(const struct rte_memseg *ms, void *arg)\n+dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,\n+\t\tvoid *arg)\n {\n \tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n-\tint i = ms - mcfg->memseg;\n+\tint msl_idx, ms_idx;\n \tFILE *f = arg;\n \n-\tif (i < 0 || i >= RTE_MAX_MEMSEG)\n+\tmsl_idx = msl - mcfg->memsegs;\n+\tif (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)\n \t\treturn -1;\n \n-\tfprintf(f, \"Segment %u: IOVA:0x%\"PRIx64\", len:%zu, \"\n+\tms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);\n+\tif (ms_idx < 0)\n+\t\treturn -1;\n+\n+\tfprintf(f, \"Segment %i-%i: IOVA:0x%\"PRIx64\", len:%zu, \"\n \t\t\t\"virt:%p, socket_id:%\"PRId32\", \"\n \t\t\t\"hugepage_sz:%\"PRIu64\", nchannel:%\"PRIx32\", \"\n-\t\t\t\"nrank:%\"PRIx32\"\\n\", i,\n-\t\t\tmcfg->memseg[i].iova,\n-\t\t\tmcfg->memseg[i].len,\n-\t\t\tmcfg->memseg[i].addr,\n-\t\t\tmcfg->memseg[i].socket_id,\n-\t\t\tmcfg->memseg[i].hugepage_sz,\n-\t\t\tmcfg->memseg[i].nchannel,\n-\t\t\tmcfg->memseg[i].nrank);\n+\t\t\t\"nrank:%\"PRIx32\"\\n\",\n+\t\t\tmsl_idx, ms_idx,\n+\t\t\tms->iova,\n+\t\t\tms->len,\n+\t\t\tms->addr,\n+\t\t\tms->socket_id,\n+\t\t\tms->hugepage_sz,\n+\t\t\tms->nchannel,\n+\t\t\tms->nrank);\n \n \treturn 0;\n }\n@@ -289,55 +648,89 @@ rte_mem_lock_page(const void *virt)\n }\n \n int __rte_experimental\n-rte_memseg_walk(rte_memseg_walk_t func, void *arg)\n+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)\n {\n \tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n-\tint i, ret;\n+\tint i, ms_idx, ret = 0;\n \n-\tfor (i = 0; i < RTE_MAX_MEMSEG; i++) {\n-\t\tconst struct rte_memseg *ms = &mcfg->memseg[i];\n+\tfor (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {\n+\t\tstruct rte_memseg_list *msl = &mcfg->memsegs[i];\n+\t\tconst struct rte_memseg *ms;\n+\t\tstruct rte_fbarray *arr;\n \n-\t\tif (ms->addr == NULL)\n+\t\tif (msl->memseg_arr.count == 0)\n \t\t\tcontinue;\n \n-\t\tret = func(ms, arg);\n-\t\tif (ret < 0)\n-\t\t\treturn -1;\n-\t\tif (ret > 0)\n-\t\t\treturn 1;\n+\t\tarr = &msl->memseg_arr;\n+\n+\t\tms_idx = rte_fbarray_find_next_used(arr, 0);\n+\t\twhile (ms_idx >= 0) {\n+\t\t\tint n_segs;\n+\t\t\tsize_t len;\n+\n+\t\t\tms = rte_fbarray_get(arr, ms_idx);\n+\n+\t\t\t/* find how many more segments there are, starting with\n+\t\t\t * this one.\n+\t\t\t */\n+\t\t\tn_segs = rte_fbarray_find_contig_used(arr, ms_idx);\n+\t\t\tlen = n_segs * msl->page_sz;\n+\n+\t\t\tret = func(msl, ms, len, arg);\n+\t\t\tif (ret < 0)\n+\t\t\t\treturn -1;\n+\t\t\telse if (ret > 0)\n+\t\t\t\treturn 1;\n+\t\t\tms_idx = rte_fbarray_find_next_used(arr,\n+\t\t\t\t\tms_idx + n_segs);\n+\t\t}\n \t}\n \treturn 0;\n }\n \n int __rte_experimental\n-rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)\n+rte_memseg_walk(rte_memseg_walk_t func, void *arg)\n {\n \tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n-\tint i, j, ret;\n+\tint i, ms_idx, ret = 0;\n \n-\tfor (i = 0; i 
< RTE_MAX_MEMSEG; i++) {\n-\t\tconst struct rte_memseg *ms = &mcfg->memseg[i];\n-\t\tsize_t total_len;\n-\t\tvoid *end_addr;\n+\tfor (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {\n+\t\tstruct rte_memseg_list *msl = &mcfg->memsegs[i];\n+\t\tconst struct rte_memseg *ms;\n+\t\tstruct rte_fbarray *arr;\n \n-\t\tif (ms->addr == NULL)\n+\t\tif (msl->memseg_arr.count == 0)\n \t\t\tcontinue;\n \n-\t\tend_addr = RTE_PTR_ADD(ms->addr, ms->len);\n+\t\tarr = &msl->memseg_arr;\n+\n+\t\tms_idx = rte_fbarray_find_next_used(arr, 0);\n+\t\twhile (ms_idx >= 0) {\n+\t\t\tms = rte_fbarray_get(arr, ms_idx);\n+\t\t\tret = func(msl, ms, arg);\n+\t\t\tif (ret < 0)\n+\t\t\t\treturn -1;\n+\t\t\telse if (ret > 0)\n+\t\t\t\treturn 1;\n+\t\t\tms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);\n+\t\t}\n+\t}\n+\treturn 0;\n+}\n \n-\t\t/* check how many more segments are contiguous to this one */\n-\t\tfor (j = i + 1; j < RTE_MAX_MEMSEG; j++) {\n-\t\t\tconst struct rte_memseg *next = &mcfg->memseg[j];\n+int __rte_experimental\n+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint i, ret = 0;\n \n-\t\t\tif (next->addr != end_addr)\n-\t\t\t\tbreak;\n+\tfor (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {\n+\t\tstruct rte_memseg_list *msl = &mcfg->memsegs[i];\n \n-\t\t\tend_addr = RTE_PTR_ADD(next->addr, next->len);\n-\t\t\ti++;\n-\t\t}\n-\t\ttotal_len = RTE_PTR_DIFF(end_addr, ms->addr);\n+\t\tif (msl->base_va == NULL)\n+\t\t\tcontinue;\n \n-\t\tret = func(ms, total_len, arg);\n+\t\tret = func(msl, arg);\n \t\tif (ret < 0)\n \t\t\treturn -1;\n \t\tif (ret > 0)\n@@ -350,9 +743,25 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)\n int\n rte_eal_memory_init(void)\n {\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint retval;\n \tRTE_LOG(DEBUG, EAL, \"Setting up physically contiguous memory...\\n\");\n \n-\tconst int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?\n+\tif (!mcfg)\n+\t\treturn -1;\n+\n+\tretval = rte_eal_process_type() == RTE_PROC_PRIMARY ?\n+#ifndef RTE_ARCH_64\n+\t\t\tmemseg_primary_init_32() :\n+#else\n+\t\t\tmemseg_primary_init() :\n+#endif\n+\t\t\tmemseg_secondary_init();\n+\n+\tif (retval < 0)\n+\t\treturn -1;\n+\n+\tretval = rte_eal_process_type() == RTE_PROC_PRIMARY ?\n \t\t\trte_eal_hugepage_init() :\n \t\t\trte_eal_hugepage_attach();\n \tif (retval < 0)\ndiff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c\nindex 88f401f..529b36f 100644\n--- a/lib/librte_eal/common/eal_common_memzone.c\n+++ b/lib/librte_eal/common/eal_common_memzone.c\n@@ -234,10 +234,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,\n \tmz->iova = rte_malloc_virt2iova(mz_addr);\n \tmz->addr = mz_addr;\n \tmz->len = (requested_len == 0 ? 
elem->size : requested_len);\n-\tmz->hugepage_sz = elem->ms->hugepage_sz;\n-\tmz->socket_id = elem->ms->socket_id;\n+\tmz->hugepage_sz = elem->msl->page_sz;\n+\tmz->socket_id = elem->msl->socket_id;\n \tmz->flags = 0;\n-\tmz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;\n \n \treturn mz;\n }\n@@ -399,20 +398,50 @@ static void\n dump_memzone(const struct rte_memzone *mz, void *arg)\n {\n \tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tstruct rte_memseg_list *msl = NULL;\n+\tvoid *cur_addr, *mz_end;\n+\tstruct rte_memseg *ms;\n+\tint mz_idx, ms_idx;\n+\tsize_t page_sz;\n \tFILE *f = arg;\n-\tint mz_idx;\n \n \tmz_idx = mz - mcfg->memzone;\n \n-\tfprintf(f, \"Zone %u: name:<%s>, IO:0x%\"PRIx64\", len:0x%zx, virt:%p, \"\n+\tfprintf(f, \"Zone %u: name:<%s>, len:0x%zx, virt:%p, \"\n \t\t\t\t\"socket_id:%\"PRId32\", flags:%\"PRIx32\"\\n\",\n \t\t\tmz_idx,\n \t\t\tmz->name,\n-\t\t\tmz->iova,\n \t\t\tmz->len,\n \t\t\tmz->addr,\n \t\t\tmz->socket_id,\n \t\t\tmz->flags);\n+\n+\t/* go through each page occupied by this memzone */\n+\tmsl = rte_mem_virt2memseg_list(mz->addr);\n+\tif (!msl) {\n+\t\tRTE_LOG(DEBUG, EAL, \"Skipping bad memzone\\n\");\n+\t\treturn;\n+\t}\n+\tpage_sz = (size_t)mz->hugepage_sz;\n+\tcur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);\n+\tmz_end = RTE_PTR_ADD(cur_addr, mz->len);\n+\n+\tfprintf(f, \"physical segments used:\\n\");\n+\tms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;\n+\tms = rte_fbarray_get(&msl->memseg_arr, ms_idx);\n+\n+\tdo {\n+\t\tfprintf(f, \"  addr: %p iova: 0x%\" PRIx64 \" \"\n+\t\t\t\t\"len: 0x%zx \"\n+\t\t\t\t\"pagesz: 0x%zx\\n\",\n+\t\t\tcur_addr, ms->iova, ms->len, page_sz);\n+\n+\t\t/* advance VA to next page */\n+\t\tcur_addr = RTE_PTR_ADD(cur_addr, page_sz);\n+\n+\t\t/* memzones occupy contiguous segments */\n+\t\t++ms;\n+\t} while (cur_addr < mz_end);\n }\n \n /* Dump all reserved memory zones on console */\n@@ -429,7 +458,6 @@ int\n rte_eal_memzone_init(void)\n {\n \tstruct rte_mem_config *mcfg;\n-\tconst struct rte_memseg *memseg;\n \n \t/* get pointer to global configuration */\n \tmcfg = rte_eal_get_configuration()->mem_config;\n@@ -438,12 +466,6 @@ rte_eal_memzone_init(void)\n \tif (rte_eal_process_type() == RTE_PROC_SECONDARY)\n \t\treturn 0;\n \n-\tmemseg = rte_eal_get_physmem_layout();\n-\tif (memseg == NULL) {\n-\t\tRTE_LOG(ERR, EAL, \"%s(): Cannot get physical layout\\n\", __func__);\n-\t\treturn -1;\n-\t}\n-\n \trte_rwlock_write_lock(&mcfg->mlock);\n \n \t/* delete all zones */\ndiff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h\nindex 1d519bb..ad1b0b6 100644\n--- a/lib/librte_eal/common/eal_hugepages.h\n+++ b/lib/librte_eal/common/eal_hugepages.h\n@@ -22,7 +22,6 @@ struct hugepage_file {\n \tsize_t size;        /**< the page size */\n \tint socket_id;      /**< NUMA socket ID */\n \tint file_id;        /**< the '%d' in HUGEFILE_FMT */\n-\tint memseg_id;      /**< the memory segment to which page belongs */\n \tchar filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */\n };\n \ndiff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h\nindex fda087b..5cf7102 100644\n--- a/lib/librte_eal/common/eal_internal_cfg.h\n+++ b/lib/librte_eal/common/eal_internal_cfg.h\n@@ -23,7 +23,7 @@ struct hugepage_info {\n \tuint64_t hugepage_sz;   /**< size of a huge page */\n \tconst char *hugedir;    /**< dir where hugetlbfs is mounted */\n \tuint32_t num_pages[RTE_MAX_NUMA_NODES];\n-\t\t\t\t/**< 
number of hugepages of that size on each socket */\n+\t/**< number of hugepages of that size on each socket */\n \tint lock_descriptor;    /**< file descriptor for hugepage dir */\n };\n \ndiff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h\nindex 29fa0b6..b745e18 100644\n--- a/lib/librte_eal/common/include/rte_eal_memconfig.h\n+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h\n@@ -12,12 +12,30 @@\n #include <rte_malloc_heap.h>\n #include <rte_rwlock.h>\n #include <rte_pause.h>\n+#include <rte_fbarray.h>\n \n #ifdef __cplusplus\n extern \"C\" {\n #endif\n \n /**\n+ * memseg list is a special case as we need to store a bunch of other data\n+ * together with the array itself.\n+ */\n+struct rte_memseg_list {\n+\tRTE_STD_C11\n+\tunion {\n+\t\tvoid *base_va;\n+\t\t/**< Base virtual address for this memseg list. */\n+\t\tuint64_t addr_64;\n+\t\t/**< Makes sure addr is always 64-bits */\n+\t};\n+\tint socket_id; /**< Socket ID for all memsegs in this list. */\n+\tuint64_t page_sz; /**< Page size for all memsegs in this list. */\n+\tstruct rte_fbarray memseg_arr;\n+};\n+\n+/**\n  * the structure for the memory configuration for the RTE.\n  * Used by the rte_config structure. It is separated out, as for multi-process\n  * support, the memory details should be shared across instances\n@@ -43,9 +61,11 @@ struct rte_mem_config {\n \tuint32_t memzone_cnt; /**< Number of allocated memzones */\n \n \t/* memory segments and zones */\n-\tstruct rte_memseg memseg[RTE_MAX_MEMSEG];    /**< Physmem descriptors. */\n \tstruct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */\n \n+\tstruct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];\n+\t/**< list of dynamic arrays holding memsegs */\n+\n \tstruct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */\n \n \t/* Heaps of Malloc per socket */\ndiff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h\nindex b3d7e61..55383c4 100644\n--- a/lib/librte_eal/common/include/rte_memory.h\n+++ b/lib/librte_eal/common/include/rte_memory.h\n@@ -23,6 +23,9 @@ extern \"C\" {\n #include <rte_compat.h>\n #include <rte_config.h>\n \n+/* forward declaration for pointers */\n+struct rte_memseg_list;\n+\n __extension__\n enum rte_page_sizes {\n \tRTE_PGSIZE_4K    = 1ULL << 12,\n@@ -151,7 +154,18 @@ rte_mem_iova2virt(rte_iova_t iova);\n  *   Memseg pointer on success, or NULL on error.\n  */\n __rte_experimental struct rte_memseg *\n-rte_mem_virt2memseg(const void *virt);\n+rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl);\n+\n+/**\n+ * Get memseg list corresponding to virtual memory address.\n+ *\n+ * @param virt\n+ *   The virtual address.\n+ * @return\n+ *   Memseg list to which this virtual address belongs to.\n+ */\n+__rte_experimental struct rte_memseg_list *\n+rte_mem_virt2memseg_list(const void *virt);\n \n /**\n  * Memseg walk function prototype.\n@@ -160,7 +174,8 @@ rte_mem_virt2memseg(const void *virt);\n  * Returning 1 will stop the walk\n  * Returning -1 will stop the walk and report error\n  */\n-typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);\n+typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl,\n+\t\tconst struct rte_memseg *ms, void *arg);\n \n /**\n  * Memseg contig walk function prototype. 
This will trigger a callback on every\n@@ -171,8 +186,19 @@ typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);\n  * Returning 1 will stop the walk\n  * Returning -1 will stop the walk and report error\n  */\n-typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms,\n-\t\tsize_t len, void *arg);\n+typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl,\n+\t\tconst struct rte_memseg *ms, size_t len, void *arg);\n+\n+/**\n+ * Memseg list walk function prototype. This will trigger a callback on every\n+ * allocated memseg list.\n+ *\n+ * Returning 0 will continue walk\n+ * Returning 1 will stop the walk\n+ * Returning -1 will stop the walk and report error\n+ */\n+typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,\n+\t\tvoid *arg);\n \n /**\n  * Walk list of all memsegs.\n@@ -205,21 +231,19 @@ int __rte_experimental\n rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);\n \n /**\n- * Get the layout of the available physical memory.\n- *\n- * It can be useful for an application to have the full physical\n- * memory layout to decide the size of a memory zone to reserve. This\n- * table is stored in rte_config (see rte_eal_get_configuration()).\n+ * Walk each allocated memseg list.\n  *\n+ * @param func\n+ *   Iterator function\n+ * @param arg\n+ *   Argument passed to iterator\n  * @return\n- *  - On success, return a pointer to a read-only table of struct\n- *    rte_physmem_desc elements, containing the layout of all\n- *    addressable physical memory. The last element of the table\n- *    contains a NULL address.\n- *  - On error, return NULL. This should not happen since it is a fatal\n- *    error that will probably cause the entire system to panic.\n+ *   0 if walked over the entire list\n+ *   1 if stopped by the user\n+ *   -1 if user function reported error\n  */\n-const struct rte_memseg *rte_eal_get_physmem_layout(void);\n+int __rte_experimental\n+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg);\n \n /**\n  * Dump the physical memory layout to a file.\ndiff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h\nindex ef3a4dd..6d4bdf1 100644\n--- a/lib/librte_eal/common/include/rte_memzone.h\n+++ b/lib/librte_eal/common/include/rte_memzone.h\n@@ -67,7 +67,6 @@ struct rte_memzone {\n \tint32_t socket_id;                /**< NUMA socket ID. */\n \n \tuint32_t flags;                   /**< Characteristics of this memzone. */\n-\tuint32_t memseg_id;               /**< Memseg it belongs. 
*/\n } __attribute__((__packed__));\n \n /**\ndiff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c\nindex 87695b9..685aac4 100644\n--- a/lib/librte_eal/common/malloc_elem.c\n+++ b/lib/librte_eal/common/malloc_elem.c\n@@ -27,11 +27,11 @@\n  * Initialize a general malloc_elem header structure\n  */\n void\n-malloc_elem_init(struct malloc_elem *elem,\n-\t\tstruct malloc_heap *heap, const struct rte_memseg *ms, size_t size)\n+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,\n+\t\tstruct rte_memseg_list *msl, size_t size)\n {\n \telem->heap = heap;\n-\telem->ms = ms;\n+\telem->msl = msl;\n \telem->prev = NULL;\n \telem->next = NULL;\n \tmemset(&elem->free_list, 0, sizeof(elem->free_list));\n@@ -100,7 +100,7 @@ malloc_elem_insert(struct malloc_elem *elem)\n  * so we just check the page addresses.\n  */\n static bool\n-elem_check_phys_contig(const struct rte_memseg *ms __rte_unused,\n+elem_check_phys_contig(const struct rte_memseg_list *msl __rte_unused,\n \t\tvoid *start, size_t size)\n {\n \trte_iova_t cur, expected;\n@@ -191,7 +191,7 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,\n \t\t\t * couldn't fit all data into one physically contiguous\n \t\t\t * block, try again with lower addresses.\n \t\t\t */\n-\t\t\tif (!elem_check_phys_contig(elem->ms,\n+\t\t\tif (!elem_check_phys_contig(elem->msl,\n \t\t\t\t\t(void *)new_data_start,\n \t\t\t\t\tnew_data_size)) {\n \t\t\t\telem_size -= align;\n@@ -225,7 +225,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)\n \tconst size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;\n \tconst size_t new_elem_size = elem->size - old_elem_size;\n \n-\tmalloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);\n+\tmalloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);\n \tsplit_pt->prev = elem;\n \tsplit_pt->next = next_elem;\n \tif (next_elem)\ndiff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h\nindex 34bd268..620dd44 100644\n--- a/lib/librte_eal/common/malloc_elem.h\n+++ b/lib/librte_eal/common/malloc_elem.h\n@@ -7,7 +7,7 @@\n \n #include <stdbool.h>\n \n-#include <rte_memory.h>\n+#include <rte_eal_memconfig.h>\n \n /* dummy definition of struct so we can use pointers to it in malloc_elem struct */\n struct malloc_heap;\n@@ -26,7 +26,7 @@ struct malloc_elem {\n \t/**< points to next elem in memseg */\n \tLIST_ENTRY(malloc_elem) free_list;\n \t/**< list of free elements in heap */\n-\tconst struct rte_memseg *ms;\n+\tstruct rte_memseg_list *msl;\n \tvolatile enum elem_state state;\n \tuint32_t pad;\n \tsize_t size;\n@@ -113,7 +113,7 @@ malloc_elem_from_data(const void *data)\n void\n malloc_elem_init(struct malloc_elem *elem,\n \t\tstruct malloc_heap *heap,\n-\t\tconst struct rte_memseg *ms,\n+\t\tstruct rte_memseg_list *msl,\n \t\tsize_t size);\n \n void\ndiff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c\nindex 79914fc..0ef2c45 100644\n--- a/lib/librte_eal/common/malloc_heap.c\n+++ b/lib/librte_eal/common/malloc_heap.c\n@@ -21,6 +21,7 @@\n #include <rte_memcpy.h>\n #include <rte_atomic.h>\n \n+#include \"eal_internal_cfg.h\"\n #include \"malloc_elem.h\"\n #include \"malloc_heap.h\"\n \n@@ -62,36 +63,49 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)\n }\n \n /*\n- * Expand the heap with a memseg.\n- * This reserves the zone and sets a dummy malloc_elem header at the end\n- * to prevent overflow. 
The rest of the zone is added to free list as a single\n- * large free block\n+ * Expand the heap with a memory area.\n  */\n+static struct malloc_elem *\n+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,\n+\t\tvoid *start, size_t len)\n+{\n+\tstruct malloc_elem *elem = start;\n+\n+\tmalloc_elem_init(elem, heap, msl, len);\n+\n+\tmalloc_elem_insert(elem);\n+\n+\telem = malloc_elem_join_adjacent_free(elem);\n+\n+\tmalloc_elem_free_list_insert(elem);\n+\n+\theap->total_size += len;\n+\n+\treturn elem;\n+}\n+\n static int\n-malloc_heap_add_memseg(const struct rte_memseg *ms, void *arg __rte_unused)\n+malloc_add_seg(const struct rte_memseg_list *msl,\n+\t\tconst struct rte_memseg *ms, size_t len, void *arg __rte_unused)\n {\n \tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n-\tstruct malloc_elem *start_elem;\n-\tstruct rte_memseg *found_ms;\n+\tstruct rte_memseg_list *found_msl;\n \tstruct malloc_heap *heap;\n-\tsize_t elem_size;\n-\tint ms_idx;\n-\n-\theap = &mcfg->malloc_heaps[ms->socket_id];\n+\tint msl_idx;\n \n-\t/* ms is const, so find it */\n-\tms_idx = ms - mcfg->memseg;\n-\tfound_ms = &mcfg->memseg[ms_idx];\n+\theap = &mcfg->malloc_heaps[msl->socket_id];\n \n-\tstart_elem = (struct malloc_elem *)found_ms->addr;\n-\telem_size = ms->len - MALLOC_ELEM_OVERHEAD;\n+\t/* msl is const, so find it */\n+\tmsl_idx = msl - mcfg->memsegs;\n+\tfound_msl = &mcfg->memsegs[msl_idx];\n \n-\tmalloc_elem_init(start_elem, heap, found_ms, elem_size);\n-\tmalloc_elem_insert(start_elem);\n-\tmalloc_elem_free_list_insert(start_elem);\n+\tif (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)\n+\t\treturn -1;\n \n-\theap->total_size += elem_size;\n+\tmalloc_heap_add_memory(heap, found_msl, ms->addr, len);\n \n+\tRTE_LOG(DEBUG, EAL, \"Added %zuM to heap on socket %i\\n\", len >> 20,\n+\t\t\tmsl->socket_id);\n \treturn 0;\n }\n \n@@ -114,7 +128,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,\n \t\t\t\t!!elem; elem = LIST_NEXT(elem, free_list)) {\n \t\t\tif (malloc_elem_can_hold(elem, size, align, bound,\n \t\t\t\t\tcontig)) {\n-\t\t\t\tif (check_hugepage_sz(flags, elem->ms->hugepage_sz))\n+\t\t\t\tif (check_hugepage_sz(flags,\n+\t\t\t\t\t\telem->msl->page_sz))\n \t\t\t\t\treturn elem;\n \t\t\t\tif (alt_elem == NULL)\n \t\t\t\t\talt_elem = elem;\n@@ -263,7 +278,6 @@ rte_eal_malloc_heap_init(void)\n \tif (mcfg == NULL)\n \t\treturn -1;\n \n-\trte_memseg_walk(malloc_heap_add_memseg, NULL);\n-\n-\treturn 0;\n+\t/* add all IOVA-contiguous areas to the heap */\n+\treturn rte_memseg_contig_walk(malloc_add_seg, NULL);\n }\ndiff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c\nindex 436818a..c6d3e57 100644\n--- a/lib/librte_eal/common/rte_malloc.c\n+++ b/lib/librte_eal/common/rte_malloc.c\n@@ -242,17 +242,21 @@ rte_malloc_set_limit(__rte_unused const char *type,\n rte_iova_t\n rte_malloc_virt2iova(const void *addr)\n {\n-\trte_iova_t iova;\n-\tconst struct malloc_elem *elem = malloc_elem_from_data(addr);\n+\tconst struct rte_memseg *ms;\n+\tstruct malloc_elem *elem = malloc_elem_from_data(addr);\n+\n \tif (elem == NULL)\n \t\treturn RTE_BAD_IOVA;\n-\tif (elem->ms->iova == RTE_BAD_IOVA)\n-\t\treturn RTE_BAD_IOVA;\n \n \tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n-\t\tiova = (uintptr_t)addr;\n-\telse\n-\t\tiova = elem->ms->iova +\n-\t\t\tRTE_PTR_DIFF(addr, elem->ms->addr);\n-\treturn iova;\n+\t\treturn (uintptr_t) addr;\n+\n+\tms = rte_mem_virt2memseg(addr, elem->msl);\n+\tif (ms == NULL)\n+\t\treturn RTE_BAD_IOVA;\n+\n+\tif 
(ms->iova == RTE_BAD_IOVA)\n+\t\treturn RTE_BAD_IOVA;\n+\n+\treturn ms->iova + RTE_PTR_DIFF(addr, ms->addr);\n }\ndiff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c\nindex b34e57a..ffcbd71 100644\n--- a/lib/librte_eal/linuxapp/eal/eal.c\n+++ b/lib/librte_eal/linuxapp/eal/eal.c\n@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;\n static struct flock wr_lock = {\n \t\t.l_type = F_WRLCK,\n \t\t.l_whence = SEEK_SET,\n-\t\t.l_start = offsetof(struct rte_mem_config, memseg),\n-\t\t.l_len = sizeof(early_mem_config.memseg),\n+\t\t.l_start = offsetof(struct rte_mem_config, memsegs),\n+\t\t.l_len = sizeof(early_mem_config.memsegs),\n };\n \n /* Address of global and public configuration */\n@@ -640,11 +640,14 @@ eal_parse_args(int argc, char **argv)\n }\n \n static int\n-check_mem(const struct rte_memseg *ms, void *arg)\n+check_socket(const struct rte_memseg_list *msl, void *arg)\n {\n-\tint *socket = arg;\n+\tint *socket_id = arg;\n \n-\treturn ms->socket_id == *socket;\n+\tif (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)\n+\t\treturn 1;\n+\n+\treturn 0;\n }\n \n static void\n@@ -654,7 +657,7 @@ eal_check_mem_on_local_socket(void)\n \n \tsocket_id = rte_lcore_to_socket_id(rte_config.master_lcore);\n \n-\tif (rte_memseg_walk(check_mem, &socket_id) == 0)\n+\tif (rte_memseg_list_walk(check_socket, &socket_id) == 0)\n \t\tRTE_LOG(WARNING, EAL, \"WARNING: Master core has no memory on local socket!\\n\");\n }\n \ndiff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c\nindex 8bbf771..afebd42 100644\n--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c\n+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c\n@@ -15,6 +15,7 @@\n #include <unistd.h>\n #include <errno.h>\n #include <sys/queue.h>\n+#include <sys/stat.h>\n \n #include <rte_memory.h>\n #include <rte_eal.h>\n@@ -160,6 +161,18 @@ get_hugepage_dir(uint64_t hugepage_sz)\n }\n \n /*\n+ * uses fstat to report the size of a file on disk\n+ */\n+static off_t\n+get_file_size(int fd)\n+{\n+\tstruct stat st;\n+\tif (fstat(fd, &st) < 0)\n+\t\treturn 0;\n+\treturn st.st_size;\n+}\n+\n+/*\n  * Clear the hugepage directory of whatever hugepage files\n  * there are. 
Checks if the file is locked (i.e.\n  * if it's in use by another DPDK process).\n@@ -189,6 +202,8 @@ clear_hugedir(const char * hugedir)\n \t}\n \n \twhile(dirent != NULL){\n+\t\tstruct flock lck = {0};\n+\n \t\t/* skip files that don't match the hugepage pattern */\n \t\tif (fnmatch(filter, dirent->d_name, 0) > 0) {\n \t\t\tdirent = readdir(dir);\n@@ -205,11 +220,17 @@ clear_hugedir(const char * hugedir)\n \t\t}\n \n \t\t/* non-blocking lock */\n-\t\tlck_result = flock(fd, LOCK_EX | LOCK_NB);\n+\t\tlck.l_type = F_RDLCK;\n+\t\tlck.l_whence = SEEK_SET;\n+\t\tlck.l_start = 0;\n+\t\tlck.l_len = get_file_size(fd);\n+\n+\t\tlck_result = fcntl(fd, F_SETLK, &lck);\n \n \t\t/* if lock succeeds, unlock and remove the file */\n \t\tif (lck_result != -1) {\n-\t\t\tflock(fd, LOCK_UN);\n+\t\t\tlck.l_type = F_UNLCK;\n+\t\t\tfcntl(fd, F_SETLK, &lck);\n \t\t\tunlinkat(dir_fd, dirent->d_name, 0);\n \t\t}\n \t\tclose (fd);\ndiff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c\nindex 1d3defe..d38fb68 100644\n--- a/lib/librte_eal/linuxapp/eal/eal_memory.c\n+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c\n@@ -253,13 +253,12 @@ void numa_error(char *where)\n  */\n static unsigned\n map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n-\t\t  uint64_t *essential_memory __rte_unused, int orig)\n+\t\t  uint64_t *essential_memory __rte_unused)\n {\n \tint fd;\n \tunsigned i;\n \tvoid *virtaddr;\n-\tvoid *vma_addr = NULL;\n-\tsize_t vma_len = 0;\n+\tstruct flock lck = {0};\n #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES\n \tint node_id = -1;\n \tint essential_prev = 0;\n@@ -274,7 +273,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \t\thave_numa = false;\n \t}\n \n-\tif (orig && have_numa) {\n+\tif (have_numa) {\n \t\tRTE_LOG(DEBUG, EAL, \"Trying to obtain current memory policy.\\n\");\n \t\tif (get_mempolicy(&oldpolicy, oldmask->maskp,\n \t\t\t\t  oldmask->size + 1, 0, 0) < 0) {\n@@ -290,6 +289,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n #endif\n \n \tfor (i = 0; i < hpi->num_pages[0]; i++) {\n+\t\tstruct hugepage_file *hf = &hugepg_tbl[i];\n \t\tuint64_t hugepage_sz = hpi->hugepage_sz;\n \n #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES\n@@ -324,66 +324,14 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \t\t}\n #endif\n \n-\t\tif (orig) {\n-\t\t\thugepg_tbl[i].file_id = i;\n-\t\t\thugepg_tbl[i].size = hugepage_sz;\n-\t\t\teal_get_hugefile_path(hugepg_tbl[i].filepath,\n-\t\t\t\t\tsizeof(hugepg_tbl[i].filepath), hpi->hugedir,\n-\t\t\t\t\thugepg_tbl[i].file_id);\n-\t\t\thugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\\0';\n-\t\t}\n-#ifndef RTE_ARCH_64\n-\t\t/* for 32-bit systems, don't remap 1G and 16G pages, just reuse\n-\t\t * original map address as final map address.\n-\t\t */\n-\t\telse if ((hugepage_sz == RTE_PGSIZE_1G)\n-\t\t\t|| (hugepage_sz == RTE_PGSIZE_16G)) {\n-\t\t\thugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;\n-\t\t\thugepg_tbl[i].orig_va = NULL;\n-\t\t\tcontinue;\n-\t\t}\n-#endif\n-\t\telse if (vma_len == 0) {\n-\t\t\tunsigned j, num_pages;\n-\n-\t\t\t/* reserve a virtual area for next contiguous\n-\t\t\t * physical block: count the number of\n-\t\t\t * contiguous physical pages. 
*/\n-\t\t\tfor (j = i+1; j < hpi->num_pages[0] ; j++) {\n-#ifdef RTE_ARCH_PPC_64\n-\t\t\t\t/* The physical addresses are sorted in\n-\t\t\t\t * descending order on PPC64 */\n-\t\t\t\tif (hugepg_tbl[j].physaddr !=\n-\t\t\t\t    hugepg_tbl[j-1].physaddr - hugepage_sz)\n-\t\t\t\t\tbreak;\n-#else\n-\t\t\t\tif (hugepg_tbl[j].physaddr !=\n-\t\t\t\t    hugepg_tbl[j-1].physaddr + hugepage_sz)\n-\t\t\t\t\tbreak;\n-#endif\n-\t\t\t}\n-\t\t\tnum_pages = j - i;\n-\t\t\tvma_len = num_pages * hugepage_sz;\n-\n-\t\t\t/* get the biggest virtual memory area up to\n-\t\t\t * vma_len. If it fails, vma_addr is NULL, so\n-\t\t\t * let the kernel provide the address. */\n-\t\t\tvma_addr = eal_get_virtual_area(NULL, &vma_len,\n-\t\t\t\t\thpi->hugepage_sz,\n-\t\t\t\t\tEAL_VIRTUAL_AREA_ALLOW_SHRINK |\n-\t\t\t\t\tEAL_VIRTUAL_AREA_UNMAP,\n-#ifdef RTE_ARCH_PPC_64\n-\t\t\t\t\tMAP_HUGETLB\n-#else\n-\t\t\t\t\t0\n-#endif\n-\t\t\t\t\t);\n-\t\t\tif (vma_addr == NULL)\n-\t\t\t\tvma_len = hugepage_sz;\n-\t\t}\n+\t\thf->file_id = i;\n+\t\thf->size = hugepage_sz;\n+\t\teal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),\n+\t\t\t\thpi->hugedir, hf->file_id);\n+\t\thf->filepath[sizeof(hf->filepath) - 1] = '\\0';\n \n \t\t/* try to create hugepage file */\n-\t\tfd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600);\n+\t\tfd = open(hf->filepath, O_CREAT | O_RDWR, 0600);\n \t\tif (fd < 0) {\n \t\t\tRTE_LOG(DEBUG, EAL, \"%s(): open failed: %s\\n\", __func__,\n \t\t\t\t\tstrerror(errno));\n@@ -391,8 +339,11 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \t\t}\n \n \t\t/* map the segment, and populate page tables,\n-\t\t * the kernel fills this segment with zeros */\n-\t\tvirtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,\n+\t\t * the kernel fills this segment with zeros. we don't care where\n+\t\t * this gets mapped - we already have contiguous memory areas\n+\t\t * ready for us to map into.\n+\t\t */\n+\t\tvirtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,\n \t\t\t\tMAP_SHARED | MAP_POPULATE, fd, 0);\n \t\tif (virtaddr == MAP_FAILED) {\n \t\t\tRTE_LOG(DEBUG, EAL, \"%s(): mmap failed: %s\\n\", __func__,\n@@ -401,44 +352,38 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \t\t\tgoto out;\n \t\t}\n \n-\t\tif (orig) {\n-\t\t\thugepg_tbl[i].orig_va = virtaddr;\n-\t\t}\n-\t\telse {\n-\t\t\t/* rewrite physical addresses in IOVA as VA mode */\n-\t\t\tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n-\t\t\t\thugepg_tbl[i].physaddr = (uintptr_t)virtaddr;\n-\t\t\thugepg_tbl[i].final_va = virtaddr;\n-\t\t}\n+\t\thf->orig_va = virtaddr;\n \n-\t\tif (orig) {\n-\t\t\t/* In linux, hugetlb limitations, like cgroup, are\n-\t\t\t * enforced at fault time instead of mmap(), even\n-\t\t\t * with the option of MAP_POPULATE. Kernel will send\n-\t\t\t * a SIGBUS signal. To avoid to be killed, save stack\n-\t\t\t * environment here, if SIGBUS happens, we can jump\n-\t\t\t * back here.\n-\t\t\t */\n-\t\t\tif (huge_wrap_sigsetjmp()) {\n-\t\t\t\tRTE_LOG(DEBUG, EAL, \"SIGBUS: Cannot mmap more \"\n-\t\t\t\t\t\"hugepages of size %u MB\\n\",\n-\t\t\t\t\t(unsigned)(hugepage_sz / 0x100000));\n-\t\t\t\tmunmap(virtaddr, hugepage_sz);\n-\t\t\t\tclose(fd);\n-\t\t\t\tunlink(hugepg_tbl[i].filepath);\n+\t\t/* In linux, hugetlb limitations, like cgroup, are\n+\t\t * enforced at fault time instead of mmap(), even\n+\t\t * with the option of MAP_POPULATE. Kernel will send\n+\t\t * a SIGBUS signal. 
To avoid to be killed, save stack\n+\t\t * environment here, if SIGBUS happens, we can jump\n+\t\t * back here.\n+\t\t */\n+\t\tif (huge_wrap_sigsetjmp()) {\n+\t\t\tRTE_LOG(DEBUG, EAL, \"SIGBUS: Cannot mmap more \"\n+\t\t\t\t\"hugepages of size %u MB\\n\",\n+\t\t\t\t(unsigned int)(hugepage_sz / 0x100000));\n+\t\t\tmunmap(virtaddr, hugepage_sz);\n+\t\t\tclose(fd);\n+\t\t\tunlink(hugepg_tbl[i].filepath);\n #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES\n-\t\t\t\tif (maxnode)\n-\t\t\t\t\tessential_memory[node_id] =\n-\t\t\t\t\t\tessential_prev;\n+\t\t\tif (maxnode)\n+\t\t\t\tessential_memory[node_id] =\n+\t\t\t\t\tessential_prev;\n #endif\n-\t\t\t\tgoto out;\n-\t\t\t}\n-\t\t\t*(int *)virtaddr = 0;\n+\t\t\tgoto out;\n \t\t}\n+\t\t*(int *)virtaddr = 0;\n \n \n-\t\t/* set shared flock on the file. */\n-\t\tif (flock(fd, LOCK_SH | LOCK_NB) == -1) {\n+\t\t/* set shared lock on the file. */\n+\t\tlck.l_type = F_RDLCK;\n+\t\tlck.l_whence = SEEK_SET;\n+\t\tlck.l_start = 0;\n+\t\tlck.l_len = hugepage_sz;\n+\t\tif (fcntl(fd, F_SETLK, &lck) == -1) {\n \t\t\tRTE_LOG(DEBUG, EAL, \"%s(): Locking file failed:%s \\n\",\n \t\t\t\t__func__, strerror(errno));\n \t\t\tclose(fd);\n@@ -446,9 +391,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \t\t}\n \n \t\tclose(fd);\n-\n-\t\tvma_addr = (char *)vma_addr + hugepage_sz;\n-\t\tvma_len -= hugepage_sz;\n \t}\n \n out:\n@@ -470,20 +412,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,\n \treturn i;\n }\n \n-/* Unmap all hugepages from original mapping */\n-static int\n-unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)\n-{\n-        unsigned i;\n-        for (i = 0; i < hpi->num_pages[0]; i++) {\n-                if (hugepg_tbl[i].orig_va) {\n-                        munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);\n-                        hugepg_tbl[i].orig_va = NULL;\n-                }\n-        }\n-        return 0;\n-}\n-\n /*\n  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge\n  * page.\n@@ -623,7 +551,7 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,\n \tint src_pos, dst_pos = 0;\n \n \tfor (src_pos = 0; src_pos < src_size; src_pos++) {\n-\t\tif (src[src_pos].final_va != NULL) {\n+\t\tif (src[src_pos].orig_va != NULL) {\n \t\t\t/* error on overflow attempt */\n \t\t\tif (dst_pos == dest_size)\n \t\t\t\treturn -1;\n@@ -694,9 +622,10 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,\n \t\t\t\t\t\tunmap_len = hp->size;\n \n \t\t\t\t\t\t/* get start addr and len of the remaining segment */\n-\t\t\t\t\t\tmunmap(hp->final_va, (size_t) unmap_len);\n+\t\t\t\t\t\tmunmap(hp->orig_va,\n+\t\t\t\t\t\t\t(size_t)unmap_len);\n \n-\t\t\t\t\t\thp->final_va = NULL;\n+\t\t\t\t\t\thp->orig_va = NULL;\n \t\t\t\t\t\tif (unlink(hp->filepath) == -1) {\n \t\t\t\t\t\t\tRTE_LOG(ERR, EAL, \"%s(): Removing %s failed: %s\\n\",\n \t\t\t\t\t\t\t\t\t__func__, hp->filepath, strerror(errno));\n@@ -715,6 +644,413 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,\n \treturn 0;\n }\n \n+static int\n+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tstruct rte_memseg_list *msl;\n+\tstruct rte_fbarray *arr;\n+\tint cur_page, seg_len;\n+\tunsigned int msl_idx;\n+\tint ms_idx;\n+\tuint64_t page_sz;\n+\tsize_t memseg_len;\n+\tint socket_id;\n+\n+\tpage_sz = hugepages[seg_start].size;\n+\tsocket_id = 
hugepages[seg_start].socket_id;\n+\tseg_len = seg_end - seg_start;\n+\n+\tRTE_LOG(DEBUG, EAL, \"Attempting to map %\" PRIu64 \"M on socket %i\\n\",\n+\t\t\t(seg_len * page_sz) >> 20ULL, socket_id);\n+\n+\t/* find free space in memseg lists */\n+\tfor (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {\n+\t\tbool empty;\n+\t\tmsl = &mcfg->memsegs[msl_idx];\n+\t\tarr = &msl->memseg_arr;\n+\n+\t\tif (msl->page_sz != page_sz)\n+\t\t\tcontinue;\n+\t\tif (msl->socket_id != socket_id)\n+\t\t\tcontinue;\n+\n+\t\t/* leave space for a hole if array is not empty */\n+\t\tempty = arr->count == 0;\n+\t\tms_idx = rte_fbarray_find_next_n_free(arr, 0,\n+\t\t\t\tseg_len + (empty ? 0 : 1));\n+\n+\t\t/* memseg list is full? */\n+\t\tif (ms_idx < 0)\n+\t\t\tcontinue;\n+\n+\t\t/* leave some space between memsegs, they are not IOVA\n+\t\t * contiguous, so they shouldn't be VA contiguous either.\n+\t\t */\n+\t\tif (!empty)\n+\t\t\tms_idx++;\n+\t\tbreak;\n+\t}\n+\tif (msl_idx == RTE_MAX_MEMSEG_LISTS) {\n+\t\tRTE_LOG(ERR, EAL, \"Could not find space for memseg. Please increase %s and/or %s in configuration.\\n\",\n+\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),\n+\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));\n+\t\treturn -1;\n+\t}\n+\n+#ifdef RTE_ARCH_PPC64\n+\t/* for PPC64 we go through the list backwards */\n+\tfor (cur_page = seg_end - 1; cur_page >= seg_start;\n+\t\t\tcur_page--, ms_idx++) {\n+#else\n+\tfor (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {\n+#endif\n+\t\tstruct hugepage_file *hfile = &hugepages[cur_page];\n+\t\tstruct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);\n+\t\tstruct flock lck;\n+\t\tvoid *addr;\n+\t\tint fd;\n+\n+\t\tfd = open(hfile->filepath, O_RDWR);\n+\t\tif (fd < 0) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not open '%s': %s\\n\",\n+\t\t\t\t\thfile->filepath, strerror(errno));\n+\t\t\treturn -1;\n+\t\t}\n+\t\t/* set shared lock on the file. 
*/\n+\t\tlck.l_type = F_RDLCK;\n+\t\tlck.l_whence = SEEK_SET;\n+\t\tlck.l_start = 0;\n+\t\tlck.l_len = page_sz;\n+\t\tif (fcntl(fd, F_SETLK, &lck) == -1) {\n+\t\t\tRTE_LOG(DEBUG, EAL, \"Could not lock '%s': %s\\n\",\n+\t\t\t\t\thfile->filepath, strerror(errno));\n+\t\t\tclose(fd);\n+\t\t\treturn -1;\n+\t\t}\n+\t\tmemseg_len = (size_t)page_sz;\n+\t\taddr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);\n+\n+\t\t/* we know this address is already mmapped by memseg list, so\n+\t\t * using MAP_FIXED here is safe\n+\t\t */\n+\t\taddr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,\n+\t\t\t\tMAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);\n+\t\tif (addr == MAP_FAILED) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Couldn't remap '%s': %s\\n\",\n+\t\t\t\t\thfile->filepath, strerror(errno));\n+\t\t\tclose(fd);\n+\t\t\treturn -1;\n+\t\t}\n+\n+\t\t/* we have a new address, so unmap previous one */\n+#ifndef RTE_ARCH_64\n+\t\t/* in 32-bit legacy mode, we have already unmapped the page */\n+\t\tif (!internal_config.legacy_mem)\n+\t\t\tmunmap(hfile->orig_va, page_sz);\n+#else\n+\t\tmunmap(hfile->orig_va, page_sz);\n+#endif\n+\n+\t\thfile->orig_va = NULL;\n+\t\thfile->final_va = addr;\n+\n+\t\t/* rewrite physical addresses in IOVA as VA mode */\n+\t\tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n+\t\t\thfile->physaddr = (uintptr_t)addr;\n+\n+\t\t/* set up memseg data */\n+\t\tms->addr = addr;\n+\t\tms->hugepage_sz = page_sz;\n+\t\tms->len = memseg_len;\n+\t\tms->iova = hfile->physaddr;\n+\t\tms->socket_id = hfile->socket_id;\n+\t\tms->nchannel = rte_memory_get_nchannel();\n+\t\tms->nrank = rte_memory_get_nrank();\n+\n+\t\trte_fbarray_set_used(arr, ms_idx);\n+\n+\t\tclose(fd);\n+\t}\n+\tRTE_LOG(DEBUG, EAL, \"Allocated %\" PRIu64 \"M on socket %i\\n\",\n+\t\t\t(seg_len * page_sz) >> 20, socket_id);\n+\treturn 0;\n+}\n+\n+#define MEMSEG_LIST_FMT \"memseg-%\" PRIu64 \"k-%i-%i\"\n+static int\n+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,\n+\t\tint n_segs, int socket_id, int type_msl_idx)\n+{\n+\tchar name[RTE_FBARRAY_NAME_LEN];\n+\n+\tsnprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,\n+\t\t type_msl_idx);\n+\tif (rte_fbarray_init(&msl->memseg_arr, name, n_segs,\n+\t\t\tsizeof(struct rte_memseg))) {\n+\t\tRTE_LOG(ERR, EAL, \"Cannot allocate memseg list: %s\\n\",\n+\t\t\trte_strerror(rte_errno));\n+\t\treturn -1;\n+\t}\n+\n+\tmsl->page_sz = page_sz;\n+\tmsl->socket_id = socket_id;\n+\tmsl->base_va = NULL;\n+\n+\tRTE_LOG(DEBUG, EAL, \"Memseg list allocated: 0x%zxkB at socket %i\\n\",\n+\t\t\t(size_t)page_sz >> 10, socket_id);\n+\n+\treturn 0;\n+}\n+\n+static int\n+alloc_va_space(struct rte_memseg_list *msl)\n+{\n+\tuint64_t page_sz;\n+\tsize_t mem_sz;\n+\tvoid *addr;\n+\tint flags = 0;\n+\n+#ifdef RTE_ARCH_PPC_64\n+\tflags |= MAP_HUGETLB;\n+#endif\n+\n+\tpage_sz = msl->page_sz;\n+\tmem_sz = page_sz * msl->memseg_arr.len;\n+\n+\taddr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);\n+\tif (addr == NULL) {\n+\t\tif (rte_errno == EADDRNOTAVAIL)\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\\n\",\n+\t\t\t\t(unsigned long long)mem_sz, msl->base_va);\n+\t\telse\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot reserve memory\\n\");\n+\t\treturn -1;\n+\t}\n+\tmsl->base_va = addr;\n+\n+\treturn 0;\n+}\n+\n+/*\n+ * Our VA space is not preallocated yet, so preallocate it here. 
We need to know\n+ * how many segments there are in order to map all pages into one address space,\n+ * and leave appropriate holes between segments so that rte_malloc does not\n+ * concatenate them into one big segment.\n+ *\n+ * we also need to unmap original pages to free up address space.\n+ */\n+static int __rte_unused\n+prealloc_segments(struct hugepage_file *hugepages, int n_pages)\n+{\n+\tstruct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n+\tint cur_page, seg_start_page, end_seg, new_memseg;\n+\tunsigned int hpi_idx, socket, i;\n+\tint n_contig_segs, n_segs;\n+\tint msl_idx;\n+\n+\t/* before we preallocate segments, we need to free up our VA space.\n+\t * we're not removing files, and we already have information about\n+\t * PA-contiguousness, so it is safe to unmap everything.\n+\t */\n+\tfor (cur_page = 0; cur_page < n_pages; cur_page++) {\n+\t\tstruct hugepage_file *hpi = &hugepages[cur_page];\n+\t\tmunmap(hpi->orig_va, hpi->size);\n+\t\thpi->orig_va = NULL;\n+\t}\n+\n+\t/* we cannot know how many page sizes and sockets we have discovered, so\n+\t * loop over all of them\n+\t */\n+\tfor (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;\n+\t\t\thpi_idx++) {\n+\t\tuint64_t page_sz =\n+\t\t\tinternal_config.hugepage_info[hpi_idx].hugepage_sz;\n+\n+\t\tfor (i = 0; i < rte_socket_count(); i++) {\n+\t\t\tstruct rte_memseg_list *msl;\n+\n+\t\t\tsocket = rte_socket_id_by_idx(i);\n+\t\t\tn_contig_segs = 0;\n+\t\t\tn_segs = 0;\n+\t\t\tseg_start_page = -1;\n+\n+\t\t\tfor (cur_page = 0; cur_page < n_pages; cur_page++) {\n+\t\t\t\tstruct hugepage_file *prev, *cur;\n+\t\t\t\tint prev_seg_start_page = -1;\n+\n+\t\t\t\tcur = &hugepages[cur_page];\n+\t\t\t\tprev = cur_page == 0 ? NULL :\n+\t\t\t\t\t\t&hugepages[cur_page - 1];\n+\n+\t\t\t\tnew_memseg = 0;\n+\t\t\t\tend_seg = 0;\n+\n+\t\t\t\tif (cur->size == 0)\n+\t\t\t\t\tend_seg = 1;\n+\t\t\t\telse if (cur->socket_id != (int) socket)\n+\t\t\t\t\tend_seg = 1;\n+\t\t\t\telse if (cur->size != page_sz)\n+\t\t\t\t\tend_seg = 1;\n+\t\t\t\telse if (cur_page == 0)\n+\t\t\t\t\tnew_memseg = 1;\n+#ifdef RTE_ARCH_PPC_64\n+\t\t\t\t/* On PPC64 architecture, the mmap always start\n+\t\t\t\t * from higher address to lower address. 
Here,\n+\t\t\t\t * physical addresses are in descending order.\n+\t\t\t\t */\n+\t\t\t\telse if ((prev->physaddr - cur->physaddr) !=\n+\t\t\t\t\t\tcur->size)\n+\t\t\t\t\tnew_memseg = 1;\n+#else\n+\t\t\t\telse if ((cur->physaddr - prev->physaddr) !=\n+\t\t\t\t\t\tcur->size)\n+\t\t\t\t\tnew_memseg = 1;\n+#endif\n+\t\t\t\tif (new_memseg) {\n+\t\t\t\t\t/* if we're already inside a segment,\n+\t\t\t\t\t * new segment means end of current one\n+\t\t\t\t\t */\n+\t\t\t\t\tif (seg_start_page != -1) {\n+\t\t\t\t\t\tend_seg = 1;\n+\t\t\t\t\t\tprev_seg_start_page =\n+\t\t\t\t\t\t\t\tseg_start_page;\n+\t\t\t\t\t}\n+\t\t\t\t\tseg_start_page = cur_page;\n+\t\t\t\t}\n+\n+\t\t\t\tif (end_seg) {\n+\t\t\t\t\tif (prev_seg_start_page != -1) {\n+\t\t\t\t\t\t/* we've found a new segment */\n+\t\t\t\t\t\tn_contig_segs++;\n+\t\t\t\t\t\tn_segs += cur_page -\n+\t\t\t\t\t\t\tprev_seg_start_page;\n+\t\t\t\t\t} else if (seg_start_page != -1) {\n+\t\t\t\t\t\t/* we didn't find new segment,\n+\t\t\t\t\t\t * but did end current one\n+\t\t\t\t\t\t */\n+\t\t\t\t\t\tn_contig_segs++;\n+\t\t\t\t\t\tn_segs += cur_page -\n+\t\t\t\t\t\t\t\tseg_start_page;\n+\t\t\t\t\t\tseg_start_page = -1;\n+\t\t\t\t\t\tcontinue;\n+\t\t\t\t\t} else {\n+\t\t\t\t\t\t/* we're skipping this page */\n+\t\t\t\t\t\tcontinue;\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t\t/* segment continues */\n+\t\t\t}\n+\t\t\t/* check if we missed last segment */\n+\t\t\tif (seg_start_page != -1) {\n+\t\t\t\tn_contig_segs++;\n+\t\t\t\tn_segs += cur_page - seg_start_page;\n+\t\t\t}\n+\n+\t\t\t/* if no segments were found, do not preallocate */\n+\t\t\tif (n_segs == 0)\n+\t\t\t\tcontinue;\n+\n+\t\t\t/* we now have total number of pages that we will\n+\t\t\t * allocate for this segment list. add separator pages\n+\t\t\t * to the total count, and preallocate VA space.\n+\t\t\t */\n+\t\t\tn_segs += n_contig_segs - 1;\n+\n+\t\t\t/* now, preallocate VA space for these segments */\n+\n+\t\t\t/* first, find suitable memseg list for this */\n+\t\t\tfor (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;\n+\t\t\t\t\tmsl_idx++) {\n+\t\t\t\tmsl = &mcfg->memsegs[msl_idx];\n+\n+\t\t\t\tif (msl->base_va != NULL)\n+\t\t\t\t\tcontinue;\n+\t\t\t\tbreak;\n+\t\t\t}\n+\t\t\tif (msl_idx == RTE_MAX_MEMSEG_LISTS) {\n+\t\t\t\tRTE_LOG(ERR, EAL, \"Not enough space in memseg lists, please increase %s\\n\",\n+\t\t\t\t\tRTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));\n+\t\t\t\treturn -1;\n+\t\t\t}\n+\n+\t\t\t/* now, allocate fbarray itself */\n+\t\t\tif (alloc_memseg_list(msl, page_sz, n_segs, socket,\n+\t\t\t\t\t\tmsl_idx) < 0)\n+\t\t\t\treturn -1;\n+\n+\t\t\t/* finally, allocate VA space */\n+\t\t\tif (alloc_va_space(msl) < 0)\n+\t\t\t\treturn -1;\n+\t\t}\n+\t}\n+\treturn 0;\n+}\n+\n+/*\n+ * We cannot reallocate memseg lists on the fly because PPC64 stores pages\n+ * backwards, therefore we have to process the entire memseg first before\n+ * remapping it into memseg list VA space.\n+ */\n+static int\n+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)\n+{\n+\tint cur_page, seg_start_page, new_memseg, ret;\n+\n+\tseg_start_page = 0;\n+\tfor (cur_page = 0; cur_page < n_pages; cur_page++) {\n+\t\tstruct hugepage_file *prev, *cur;\n+\n+\t\tnew_memseg = 0;\n+\n+\t\tcur = &hugepages[cur_page];\n+\t\tprev = cur_page == 0 ? 
NULL : &hugepages[cur_page - 1];\n+\n+\t\t/* if size is zero, no more pages left */\n+\t\tif (cur->size == 0)\n+\t\t\tbreak;\n+\n+\t\tif (cur_page == 0)\n+\t\t\tnew_memseg = 1;\n+\t\telse if (cur->socket_id != prev->socket_id)\n+\t\t\tnew_memseg = 1;\n+\t\telse if (cur->size != prev->size)\n+\t\t\tnew_memseg = 1;\n+#ifdef RTE_ARCH_PPC_64\n+\t\t/* On PPC64 architecture, the mmap always start from higher\n+\t\t * address to lower address. Here, physical addresses are in\n+\t\t * descending order.\n+\t\t */\n+\t\telse if ((prev->physaddr - cur->physaddr) != cur->size)\n+\t\t\tnew_memseg = 1;\n+#else\n+\t\telse if ((cur->physaddr - prev->physaddr) != cur->size)\n+\t\t\tnew_memseg = 1;\n+#endif\n+\n+\t\tif (new_memseg) {\n+\t\t\t/* if this isn't the first time, remap segment */\n+\t\t\tif (cur_page != 0) {\n+\t\t\t\tret = remap_segment(hugepages, seg_start_page,\n+\t\t\t\t\t\tcur_page);\n+\t\t\t\tif (ret != 0)\n+\t\t\t\t\treturn -1;\n+\t\t\t}\n+\t\t\t/* remember where we started */\n+\t\t\tseg_start_page = cur_page;\n+\t\t}\n+\t\t/* continuation of previous memseg */\n+\t}\n+\t/* we were stopped, but we didn't remap the last segment, do it now */\n+\tif (cur_page != 0) {\n+\t\tret = remap_segment(hugepages, seg_start_page,\n+\t\t\t\tcur_page);\n+\t\tif (ret != 0)\n+\t\t\treturn -1;\n+\t}\n+\treturn 0;\n+}\n+\n static inline uint64_t\n get_socket_mem_size(int socket)\n {\n@@ -753,8 +1089,10 @@ calc_num_pages_per_socket(uint64_t * memory,\n \n \t/* if specific memory amounts per socket weren't requested */\n \tif (internal_config.force_sockets == 0) {\n+\t\tsize_t total_size;\n+#ifdef RTE_ARCH_64\n \t\tint cpu_per_socket[RTE_MAX_NUMA_NODES];\n-\t\tsize_t default_size, total_size;\n+\t\tsize_t default_size;\n \t\tunsigned lcore_id;\n \n \t\t/* Compute number of cores per socket */\n@@ -772,7 +1110,7 @@ calc_num_pages_per_socket(uint64_t * memory,\n \n \t\t\t/* Set memory amount per socket */\n \t\t\tdefault_size = (internal_config.memory * cpu_per_socket[socket])\n-\t\t\t                / rte_lcore_count();\n+\t\t\t\t\t/ rte_lcore_count();\n \n \t\t\t/* Limit to maximum available memory on socket */\n \t\t\tdefault_size = RTE_MIN(default_size, get_socket_mem_size(socket));\n@@ -789,12 +1127,33 @@ calc_num_pages_per_socket(uint64_t * memory,\n \t\tfor (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {\n \t\t\t/* take whatever is available */\n \t\t\tdefault_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],\n-\t\t\t                       total_size);\n+\t\t\t\t\t       total_size);\n \n \t\t\t/* Update sizes */\n \t\t\tmemory[socket] += default_size;\n \t\t\ttotal_size -= default_size;\n \t\t}\n+#else\n+\t\t/* in 32-bit mode, allocate all of the memory only on master\n+\t\t * lcore socket\n+\t\t */\n+\t\ttotal_size = internal_config.memory;\n+\t\tfor (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;\n+\t\t\t\tsocket++) {\n+\t\t\tstruct rte_config *cfg = rte_eal_get_configuration();\n+\t\t\tunsigned int master_lcore_socket;\n+\n+\t\t\tmaster_lcore_socket =\n+\t\t\t\trte_lcore_to_socket_id(cfg->master_lcore);\n+\n+\t\t\tif (master_lcore_socket != socket)\n+\t\t\t\tcontinue;\n+\n+\t\t\t/* Update sizes */\n+\t\t\tmemory[socket] = total_size;\n+\t\t\tbreak;\n+\t\t}\n+#endif\n \t}\n \n \tfor (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {\n@@ -842,7 +1201,8 @@ calc_num_pages_per_socket(uint64_t * memory,\n \t\t\t}\n \t\t}\n \t\t/* if we didn't satisfy all memory requirements per socket */\n-\t\tif (memory[socket] > 0) {\n+\t\tif 
(memory[socket] > 0 &&\n+\t\t\t\tinternal_config.socket_mem[socket] != 0) {\n \t\t\t/* to prevent icc errors */\n \t\t\trequested = (unsigned) (internal_config.socket_mem[socket] /\n \t\t\t\t\t0x100000);\n@@ -928,11 +1288,13 @@ eal_legacy_hugepage_init(void)\n \tstruct rte_mem_config *mcfg;\n \tstruct hugepage_file *hugepage = NULL, *tmp_hp = NULL;\n \tstruct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];\n+\tstruct rte_fbarray *arr;\n+\tstruct rte_memseg *ms;\n \n \tuint64_t memory[RTE_MAX_NUMA_NODES];\n \n \tunsigned hp_offset;\n-\tint i, j, new_memseg;\n+\tint i, j;\n \tint nr_hugefiles, nr_hugepages = 0;\n \tvoid *addr;\n \n@@ -945,6 +1307,25 @@ eal_legacy_hugepage_init(void)\n \n \t/* hugetlbfs can be disabled */\n \tif (internal_config.no_hugetlbfs) {\n+\t\tstruct rte_memseg_list *msl;\n+\t\tuint64_t page_sz;\n+\t\tint n_segs, cur_seg;\n+\n+\t\t/* nohuge mode is legacy mode */\n+\t\tinternal_config.legacy_mem = 1;\n+\n+\t\t/* create a memseg list */\n+\t\tmsl = &mcfg->memsegs[0];\n+\n+\t\tpage_sz = RTE_PGSIZE_4K;\n+\t\tn_segs = internal_config.memory / page_sz;\n+\n+\t\tif (rte_fbarray_init(&msl->memseg_arr, \"nohugemem\", n_segs,\n+\t\t\t\t     sizeof(struct rte_memseg))) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Cannot allocate memseg list\\n\");\n+\t\t\treturn -1;\n+\t\t}\n+\n \t\taddr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,\n \t\t\t\tMAP_PRIVATE | MAP_ANONYMOUS, 0, 0);\n \t\tif (addr == MAP_FAILED) {\n@@ -952,14 +1333,27 @@ eal_legacy_hugepage_init(void)\n \t\t\t\t\tstrerror(errno));\n \t\t\treturn -1;\n \t\t}\n-\t\tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n-\t\t\tmcfg->memseg[0].iova = (uintptr_t)addr;\n-\t\telse\n-\t\t\tmcfg->memseg[0].iova = RTE_BAD_IOVA;\n-\t\tmcfg->memseg[0].addr = addr;\n-\t\tmcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;\n-\t\tmcfg->memseg[0].len = internal_config.memory;\n-\t\tmcfg->memseg[0].socket_id = 0;\n+\t\tmsl->base_va = addr;\n+\t\tmsl->page_sz = page_sz;\n+\t\tmsl->socket_id = 0;\n+\n+\t\t/* populate memsegs. 
each memseg is one page long */\n+\t\tfor (cur_seg = 0; cur_seg < n_segs; cur_seg++) {\n+\t\t\tarr = &mcfg->memsegs[cur_seg].memseg_arr;\n+\n+\t\t\tms = rte_fbarray_get(arr, cur_seg);\n+\t\t\tif (rte_eal_iova_mode() == RTE_IOVA_VA)\n+\t\t\t\tms->iova = (uintptr_t)addr;\n+\t\t\telse\n+\t\t\t\tms->iova = RTE_BAD_IOVA;\n+\t\t\tms->addr = addr;\n+\t\t\tms->hugepage_sz = page_sz;\n+\t\t\tms->socket_id = 0;\n+\n+\t\t\trte_fbarray_set_used(arr, cur_seg);\n+\n+\t\t\taddr = RTE_PTR_ADD(addr, (size_t)page_sz);\n+\t\t}\n \t\treturn 0;\n \t}\n \n@@ -992,7 +1386,6 @@ eal_legacy_hugepage_init(void)\n \tfor (i = 0; i < RTE_MAX_NUMA_NODES; i++)\n \t\tmemory[i] = internal_config.socket_mem[i];\n \n-\n \t/* map all hugepages and sort them */\n \tfor (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){\n \t\tunsigned pages_old, pages_new;\n@@ -1010,8 +1403,7 @@ eal_legacy_hugepage_init(void)\n \n \t\t/* map all hugepages available */\n \t\tpages_old = hpi->num_pages[0];\n-\t\tpages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,\n-\t\t\t\t\t      memory, 1);\n+\t\tpages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);\n \t\tif (pages_new < pages_old) {\n \t\t\tRTE_LOG(DEBUG, EAL,\n \t\t\t\t\"%d not %d hugepages of size %u MB allocated\\n\",\n@@ -1054,18 +1446,6 @@ eal_legacy_hugepage_init(void)\n \t\tqsort(&tmp_hp[hp_offset], hpi->num_pages[0],\n \t\t      sizeof(struct hugepage_file), cmp_physaddr);\n \n-\t\t/* remap all hugepages */\n-\t\tif (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=\n-\t\t    hpi->num_pages[0]) {\n-\t\t\tRTE_LOG(ERR, EAL, \"Failed to remap %u MB pages\\n\",\n-\t\t\t\t\t(unsigned)(hpi->hugepage_sz / 0x100000));\n-\t\t\tgoto fail;\n-\t\t}\n-\n-\t\t/* unmap original mappings */\n-\t\tif (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)\n-\t\t\tgoto fail;\n-\n \t\t/* we have processed a num of hugepages of this size, so inc offset */\n \t\thp_offset += hpi->num_pages[0];\n \t}\n@@ -1148,7 +1528,7 @@ eal_legacy_hugepage_init(void)\n \n \t/*\n \t * copy stuff from malloc'd hugepage* to the actual shared memory.\n-\t * this procedure only copies those hugepages that have final_va\n+\t * this procedure only copies those hugepages that have orig_va\n \t * not NULL. 
has overflow protection.\n \t */\n \tif (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,\n@@ -1157,6 +1537,23 @@ eal_legacy_hugepage_init(void)\n \t\tgoto fail;\n \t}\n \n+#ifndef RTE_ARCH_64\n+\t/* for legacy 32-bit mode, we did not preallocate VA space, so do it */\n+\tif (internal_config.legacy_mem &&\n+\t\t\tprealloc_segments(hugepage, nr_hugefiles)) {\n+\t\tRTE_LOG(ERR, EAL, \"Could not preallocate VA space for hugepages\\n\");\n+\t\tgoto fail;\n+\t}\n+#endif\n+\n+\t/* remap all pages we do need into memseg list VA space, so that those\n+\t * pages become first-class citizens in DPDK memory subsystem\n+\t */\n+\tif (remap_needed_hugepages(hugepage, nr_hugefiles)) {\n+\t\tRTE_LOG(ERR, EAL, \"Couldn't remap hugepage files into memseg lists\\n\");\n+\t\tgoto fail;\n+\t}\n+\n \t/* free the hugepage backing files */\n \tif (internal_config.hugepage_unlink &&\n \t\tunlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {\n@@ -1168,75 +1565,30 @@ eal_legacy_hugepage_init(void)\n \tfree(tmp_hp);\n \ttmp_hp = NULL;\n \n-\t/* first memseg index shall be 0 after incrementing it below */\n-\tj = -1;\n-\tfor (i = 0; i < nr_hugefiles; i++) {\n-\t\tnew_memseg = 0;\n-\n-\t\t/* if this is a new section, create a new memseg */\n-\t\tif (i == 0)\n-\t\t\tnew_memseg = 1;\n-\t\telse if (hugepage[i].socket_id != hugepage[i-1].socket_id)\n-\t\t\tnew_memseg = 1;\n-\t\telse if (hugepage[i].size != hugepage[i-1].size)\n-\t\t\tnew_memseg = 1;\n-\n-#ifdef RTE_ARCH_PPC_64\n-\t\t/* On PPC64 architecture, the mmap always start from higher\n-\t\t * virtual address to lower address. Here, both the physical\n-\t\t * address and virtual address are in descending order */\n-\t\telse if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=\n-\t\t    hugepage[i].size)\n-\t\t\tnew_memseg = 1;\n-\t\telse if (((unsigned long)hugepage[i-1].final_va -\n-\t\t    (unsigned long)hugepage[i].final_va) != hugepage[i].size)\n-\t\t\tnew_memseg = 1;\n-#else\n-\t\telse if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=\n-\t\t    hugepage[i].size)\n-\t\t\tnew_memseg = 1;\n-\t\telse if (((unsigned long)hugepage[i].final_va -\n-\t\t    (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)\n-\t\t\tnew_memseg = 1;\n-#endif\n+\tmunmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));\n \n-\t\tif (new_memseg) {\n-\t\t\tj += 1;\n-\t\t\tif (j == RTE_MAX_MEMSEG)\n-\t\t\t\tbreak;\n+\t/* we're not going to allocate more pages, so release VA space for\n+\t * unused memseg lists\n+\t */\n+\tfor (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {\n+\t\tstruct rte_memseg_list *msl = &mcfg->memsegs[i];\n+\t\tsize_t mem_sz;\n \n-\t\t\tmcfg->memseg[j].iova = hugepage[i].physaddr;\n-\t\t\tmcfg->memseg[j].addr = hugepage[i].final_va;\n-\t\t\tmcfg->memseg[j].len = hugepage[i].size;\n-\t\t\tmcfg->memseg[j].socket_id = hugepage[i].socket_id;\n-\t\t\tmcfg->memseg[j].hugepage_sz = hugepage[i].size;\n-\t\t}\n-\t\t/* continuation of previous memseg */\n-\t\telse {\n-#ifdef RTE_ARCH_PPC_64\n-\t\t/* Use the phy and virt address of the last page as segment\n-\t\t * address for IBM Power architecture */\n-\t\t\tmcfg->memseg[j].iova = hugepage[i].physaddr;\n-\t\t\tmcfg->memseg[j].addr = hugepage[i].final_va;\n-#endif\n-\t\t\tmcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;\n-\t\t}\n-\t\thugepage[i].memseg_id = j;\n-\t}\n+\t\t/* skip inactive lists */\n+\t\tif (msl->base_va == NULL)\n+\t\t\tcontinue;\n+\t\t/* skip lists where there is at least one page allocated */\n+\t\tif (msl->memseg_arr.count > 0)\n+\t\t\tcontinue;\n+\t\t/* this is an 
unused list, deallocate it */\n+\t\tmem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;\n+\t\tmunmap(msl->base_va, mem_sz);\n+\t\tmsl->base_va = NULL;\n \n-\tif (i < nr_hugefiles) {\n-\t\tRTE_LOG(ERR, EAL, \"Can only reserve %d pages \"\n-\t\t\t\"from %d requested\\n\"\n-\t\t\t\"Current %s=%d is not enough\\n\"\n-\t\t\t\"Please either increase it or request less amount \"\n-\t\t\t\"of memory.\\n\",\n-\t\t\ti, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),\n-\t\t\tRTE_MAX_MEMSEG);\n-\t\tgoto fail;\n+\t\t/* destroy backing fbarray */\n+\t\trte_fbarray_destroy(&msl->memseg_arr);\n \t}\n \n-\tmunmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));\n-\n \treturn 0;\n \n fail:\n@@ -1269,11 +1621,10 @@ getFileSize(int fd)\n static int\n eal_legacy_hugepage_attach(void)\n {\n-\tconst struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;\n \tstruct hugepage_file *hp = NULL;\n-\tunsigned num_hp = 0;\n-\tunsigned i, s = 0; /* s used to track the segment number */\n-\tunsigned max_seg = RTE_MAX_MEMSEG;\n+\tunsigned int num_hp = 0;\n+\tunsigned int i = 0;\n+\tunsigned int cur_seg;\n \toff_t size = 0;\n \tint fd, fd_hugepage = -1;\n \n@@ -1292,50 +1643,6 @@ eal_legacy_hugepage_attach(void)\n \t\tgoto error;\n \t}\n \n-\t/* map all segments into memory to make sure we get the addrs */\n-\tfor (s = 0; s < RTE_MAX_MEMSEG; ++s) {\n-\t\tvoid *base_addr;\n-\t\tsize_t mmap_sz;\n-\t\tint mmap_flags = 0;\n-\n-\t\t/*\n-\t\t * the first memory segment with len==0 is the one that\n-\t\t * follows the last valid segment.\n-\t\t */\n-\t\tif (mcfg->memseg[s].len == 0)\n-\t\t\tbreak;\n-\n-\t\t/* get identical addresses as the primary process.\n-\t\t */\n-#ifdef RTE_ARCH_PPC_64\n-\t\tmmap_flags |= MAP_HUGETLB;\n-#endif\n-\t\tmmap_sz = mcfg->memseg[s].len;\n-\t\tbase_addr = eal_get_virtual_area(mcfg->memseg[s].addr,\n-\t\t\t\t&mmap_sz, mcfg->memseg[s].hugepage_sz, 0,\n-\t\t\t\tmmap_flags);\n-\t\tif (base_addr == NULL) {\n-\t\t\tmax_seg = s;\n-\t\t\tif (rte_errno == EADDRNOTAVAIL) {\n-\t\t\t\tRTE_LOG(ERR, EAL, \"Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\\n\",\n-\t\t\t\t\t(unsigned long long)mcfg->memseg[s].len,\n-\t\t\t\t\tmcfg->memseg[s].addr);\n-\t\t\t} else {\n-\t\t\t\tRTE_LOG(ERR, EAL, \"Could not mmap %llu bytes at [%p]: '%s'\\n\",\n-\t\t\t\t\t(unsigned long long)mcfg->memseg[s].len,\n-\t\t\t\t\tmcfg->memseg[s].addr,\n-\t\t\t\t\trte_strerror(rte_errno));\n-\t\t\t}\n-\t\t\tif (aslr_enabled() > 0) {\n-\t\t\t\tRTE_LOG(ERR, EAL, \"It is recommended to \"\n-\t\t\t\t\t\"disable ASLR in the kernel \"\n-\t\t\t\t\t\"and retry running both primary \"\n-\t\t\t\t\t\"and secondary processes\\n\");\n-\t\t\t}\n-\t\t\tgoto error;\n-\t\t}\n-\t}\n-\n \tsize = getFileSize(fd_hugepage);\n \thp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);\n \tif (hp == MAP_FAILED) {\n@@ -1346,46 +1653,49 @@ eal_legacy_hugepage_attach(void)\n \tnum_hp = size / sizeof(struct hugepage_file);\n \tRTE_LOG(DEBUG, EAL, \"Analysing %u files\\n\", num_hp);\n \n-\ts = 0;\n-\twhile (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){\n-\t\tvoid *addr, *base_addr;\n-\t\tuintptr_t offset = 0;\n-\t\tsize_t mapping_size;\n-\t\t/*\n-\t\t * free previously mapped memory so we can map the\n-\t\t * hugepages into the space\n-\t\t */\n-\t\tbase_addr = mcfg->memseg[s].addr;\n-\t\tmunmap(base_addr, mcfg->memseg[s].len);\n-\n-\t\t/* find the hugepages for this segment and map them\n-\t\t * we don't need to worry about order, as the server sorted the\n-\t\t * entries before it did the second mmap of them 
*/\n-\t\tfor (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){\n-\t\t\tif (hp[i].memseg_id == (int)s){\n-\t\t\t\tfd = open(hp[i].filepath, O_RDWR);\n-\t\t\t\tif (fd < 0) {\n-\t\t\t\t\tRTE_LOG(ERR, EAL, \"Could not open %s\\n\",\n-\t\t\t\t\t\thp[i].filepath);\n-\t\t\t\t\tgoto error;\n-\t\t\t\t}\n-\t\t\t\tmapping_size = hp[i].size;\n-\t\t\t\taddr = mmap(RTE_PTR_ADD(base_addr, offset),\n-\t\t\t\t\t\tmapping_size, PROT_READ | PROT_WRITE,\n-\t\t\t\t\t\tMAP_SHARED, fd, 0);\n-\t\t\t\tclose(fd); /* close file both on success and on failure */\n-\t\t\t\tif (addr == MAP_FAILED ||\n-\t\t\t\t\t\taddr != RTE_PTR_ADD(base_addr, offset)) {\n-\t\t\t\t\tRTE_LOG(ERR, EAL, \"Could not mmap %s\\n\",\n-\t\t\t\t\t\thp[i].filepath);\n-\t\t\t\t\tgoto error;\n-\t\t\t\t}\n-\t\t\t\toffset+=mapping_size;\n-\t\t\t}\n+\t/* map all segments into memory to make sure we get the addrs. the\n+\t * segments themselves are already in memseg list (which is shared and\n+\t * has its VA space already preallocated), so we just need to map\n+\t * everything into correct addresses.\n+\t */\n+\tfor (i = 0; i < num_hp; i++) {\n+\t\tstruct hugepage_file *hf = &hp[i];\n+\t\tsize_t map_sz = hf->size;\n+\t\tvoid *map_addr = hf->final_va;\n+\t\tstruct flock lck;\n+\n+\t\t/* if size is zero, no more pages left */\n+\t\tif (map_sz == 0)\n+\t\t\tbreak;\n+\n+\t\tfd = open(hf->filepath, O_RDWR);\n+\t\tif (fd < 0) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not open %s: %s\\n\",\n+\t\t\t\thf->filepath, strerror(errno));\n+\t\t\tgoto error;\n \t\t}\n-\t\tRTE_LOG(DEBUG, EAL, \"Mapped segment %u of size 0x%llx\\n\", s,\n-\t\t\t\t(unsigned long long)mcfg->memseg[s].len);\n-\t\ts++;\n+\n+\t\tmap_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,\n+\t\t\t\tMAP_SHARED | MAP_FIXED, fd, 0);\n+\t\tif (map_addr == MAP_FAILED) {\n+\t\t\tRTE_LOG(ERR, EAL, \"Could not map %s: %s\\n\",\n+\t\t\t\thf->filepath, strerror(errno));\n+\t\t\tgoto error;\n+\t\t}\n+\n+\t\t/* set shared lock on the file. 
*/\n+\t\tlck.l_type = F_RDLCK;\n+\t\tlck.l_whence = SEEK_SET;\n+\t\tlck.l_start = 0;\n+\t\tlck.l_len = map_sz;\n+\t\tif (fcntl(fd, F_SETLK, &lck) == -1) {\n+\t\t\tRTE_LOG(DEBUG, EAL, \"%s(): Locking file failed: %s\\n\",\n+\t\t\t\t__func__, strerror(errno));\n+\t\t\tclose(fd);\n+\t\t\tgoto error;\n+\t\t}\n+\n+\t\tclose(fd);\n \t}\n \t/* unmap the hugepage config file, since we are done using it */\n \tmunmap(hp, size);\n@@ -1393,8 +1703,15 @@ eal_legacy_hugepage_attach(void)\n \treturn 0;\n \n error:\n-\tfor (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)\n-\t\tmunmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);\n+\t/* map all segments into memory to make sure we get the addrs */\n+\tcur_seg = 0;\n+\tfor (cur_seg = 0; cur_seg < i; cur_seg++) {\n+\t\tstruct hugepage_file *hf = &hp[i];\n+\t\tsize_t map_sz = hf->size;\n+\t\tvoid *map_addr = hf->final_va;\n+\n+\t\tmunmap(map_addr, map_sz);\n+\t}\n \tif (hp != NULL && hp != MAP_FAILED)\n \t\tmunmap(hp, size);\n \tif (fd_hugepage >= 0)\ndiff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c\nindex f6fe93e..2c27063 100644\n--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c\n+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c\n@@ -686,7 +686,8 @@ vfio_get_group_no(const char *sysfs_base,\n }\n \n static int\n-type1_map(const struct rte_memseg *ms, void *arg)\n+type1_map(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tint *vfio_container_fd = arg;\n \n@@ -799,7 +800,8 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,\n }\n \n static int\n-vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg)\n+vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tint *vfio_container_fd = arg;\n \n@@ -812,7 +814,8 @@ struct spapr_walk_param {\n \tuint64_t hugepage_sz;\n };\n static int\n-vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg)\n+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg)\n {\n \tstruct spapr_walk_param *param = arg;\n \tuint64_t max = ms->iova + ms->len;\ndiff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map\nindex 3a12112..df5802d 100644\n--- a/lib/librte_eal/rte_eal_version.map\n+++ b/lib/librte_eal/rte_eal_version.map\n@@ -25,7 +25,6 @@ DPDK_2.0 {\n \trte_eal_devargs_type_count;\n \trte_eal_get_configuration;\n \trte_eal_get_lcore_state;\n-\trte_eal_get_physmem_layout;\n \trte_eal_get_physmem_size;\n \trte_eal_has_hugepages;\n \trte_eal_hpet_init;\n@@ -241,7 +240,9 @@ EXPERIMENTAL {\n \trte_malloc_dump_heaps;\n \trte_mem_iova2virt;\n \trte_mem_virt2memseg;\n+\trte_mem_virt2memseg_list;\n \trte_memseg_contig_walk;\n+\trte_memseg_list_walk;\n \trte_memseg_walk;\n \trte_memzone_reserve_contig;\n \trte_memzone_reserve_aligned_contig;\ndiff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c\nindex bb33c3a..38fb1ba 100644\n--- a/lib/librte_mempool/rte_mempool.c\n+++ b/lib/librte_mempool/rte_mempool.c\n@@ -100,12 +100,12 @@ static unsigned optimize_object_size(unsigned obj_size)\n }\n \n static int\n-find_min_pagesz(const struct rte_memseg *ms, void *arg)\n+find_min_pagesz(const struct rte_memseg_list *msl, void *arg)\n {\n \tsize_t *min = arg;\n \n-\tif (ms->hugepage_sz < *min)\n-\t\t*min = ms->hugepage_sz;\n+\tif (msl->page_sz < *min)\n+\t\t*min = msl->page_sz;\n \n \treturn 0;\n }\n@@ -115,11 +115,12 @@ 
get_min_page_size(void)\n {\n \tsize_t min_pagesz = SIZE_MAX;\n \n-\trte_memseg_walk(find_min_pagesz, &min_pagesz);\n+\trte_memseg_list_walk(find_min_pagesz, &min_pagesz);\n \n \treturn min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;\n }\n \n+\n static void\n mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)\n {\ndiff --git a/test/test/test_malloc.c b/test/test/test_malloc.c\nindex 578ad04..805bf04 100644\n--- a/test/test/test_malloc.c\n+++ b/test/test/test_malloc.c\n@@ -12,6 +12,7 @@\n \n #include <rte_common.h>\n #include <rte_memory.h>\n+#include <rte_eal_memconfig.h>\n #include <rte_per_lcore.h>\n #include <rte_launch.h>\n #include <rte_eal.h>\n@@ -706,36 +707,20 @@ test_malloc_bad_params(void)\n }\n \n static int\n-check_socket_mem(const struct rte_memseg *ms, void *arg)\n+check_socket_mem(const struct rte_memseg_list *msl, void *arg)\n {\n \tint32_t *socket = arg;\n \n-\treturn *socket == ms->socket_id;\n+\treturn *socket == msl->socket_id;\n }\n \n /* Check if memory is available on a specific socket */\n static int\n is_mem_on_socket(int32_t socket)\n {\n-\treturn rte_memseg_walk(check_socket_mem, &socket);\n+\treturn rte_memseg_list_walk(check_socket_mem, &socket);\n }\n \n-struct walk_param {\n-\tvoid *addr;\n-\tint32_t socket;\n-};\n-static int\n-find_socket(const struct rte_memseg *ms, void *arg)\n-{\n-\tstruct walk_param *param = arg;\n-\n-\tif (param->addr >= ms->addr &&\n-\t\t\tparam->addr < RTE_PTR_ADD(ms->addr, ms->len)) {\n-\t\tparam->socket = ms->socket_id;\n-\t\treturn 1;\n-\t}\n-\treturn 0;\n-}\n \n /*\n  * Find what socket a memory address is on. Only works for addresses within\n@@ -744,10 +729,9 @@ find_socket(const struct rte_memseg *ms, void *arg)\n static int32_t\n addr_to_socket(void * addr)\n {\n-\tstruct walk_param param = {.addr = addr, .socket = 0};\n-\tif (rte_memseg_walk(find_socket, &param) > 0)\n-\t\treturn param.socket;\n-\treturn -1;\n+\tconst struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);\n+\treturn ms == NULL ? 
-1 : ms->socket_id;\n+\n }\n \n /* Test using rte_[c|m|zm]alloc_socket() on a specific socket */\ndiff --git a/test/test/test_memory.c b/test/test/test_memory.c\nindex c9b287c..b96bca7 100644\n--- a/test/test/test_memory.c\n+++ b/test/test/test_memory.c\n@@ -5,8 +5,11 @@\n #include <stdio.h>\n #include <stdint.h>\n \n+#include <rte_eal.h>\n+#include <rte_eal_memconfig.h>\n #include <rte_memory.h>\n #include <rte_common.h>\n+#include <rte_memzone.h>\n \n #include \"test.h\"\n \n@@ -23,12 +26,13 @@\n  */\n \n static int\n-check_mem(const struct rte_memseg *ms, void *arg __rte_unused)\n+check_mem(const struct rte_memseg_list *msl __rte_unused,\n+\t\tconst struct rte_memseg *ms, void *arg __rte_unused)\n {\n \tvolatile uint8_t *mem = (volatile uint8_t *) ms->addr;\n-\tsize_t i;\n+\tsize_t i, max = ms->len;\n \n-\tfor (i = 0; i < ms->len; i++, mem++)\n+\tfor (i = 0; i < max; i++, mem++)\n \t\t*mem;\n \treturn 0;\n }\ndiff --git a/test/test/test_memzone.c b/test/test/test_memzone.c\nindex cbf0cfa..0046f04 100644\n--- a/test/test/test_memzone.c\n+++ b/test/test/test_memzone.c\n@@ -111,17 +111,17 @@ struct walk_arg {\n \tint hugepage_16GB_avail;\n };\n static int\n-find_available_pagesz(const struct rte_memseg *ms, void *arg)\n+find_available_pagesz(const struct rte_memseg_list *msl, void *arg)\n {\n \tstruct walk_arg *wa = arg;\n \n-\tif (ms->hugepage_sz == RTE_PGSIZE_2M)\n+\tif (msl->page_sz == RTE_PGSIZE_2M)\n \t\twa->hugepage_2MB_avail = 1;\n-\tif (ms->hugepage_sz == RTE_PGSIZE_1G)\n+\tif (msl->page_sz == RTE_PGSIZE_1G)\n \t\twa->hugepage_1GB_avail = 1;\n-\tif (ms->hugepage_sz == RTE_PGSIZE_16M)\n+\tif (msl->page_sz == RTE_PGSIZE_16M)\n \t\twa->hugepage_16MB_avail = 1;\n-\tif (ms->hugepage_sz == RTE_PGSIZE_16G)\n+\tif (msl->page_sz == RTE_PGSIZE_16G)\n \t\twa->hugepage_16GB_avail = 1;\n \n \treturn 0;\n@@ -138,7 +138,7 @@ test_memzone_reserve_flags(void)\n \n \tmemset(&wa, 0, sizeof(wa));\n \n-\trte_memseg_walk(find_available_pagesz, &wa);\n+\trte_memseg_list_walk(find_available_pagesz, &wa);\n \n \thugepage_2MB_avail = wa.hugepage_2MB_avail;\n \thugepage_1GB_avail = wa.hugepage_1GB_avail;\n",
    "prefixes": [
        "dpdk-dev",
        "v3",
        "49/68"
    ]
}