@@ -8,6 +8,7 @@ LIB = librte_pmd_mlx5_vdpa.a
# Sources.
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
# Basic CFLAGS.
CFLAGS += -O3
@@ -15,6 +16,7 @@ CFLAGS += -std=c11 -Wall -Wextra
CFLAGS += -g
CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5
CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5_vdpa
+CFLAGS += -I$(RTE_SDK)/lib/librte_sched
CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5
CFLAGS += -D_BSD_SOURCE
CFLAGS += -D_DEFAULT_SOURCE
@@ -22,7 +24,7 @@ CFLAGS += -D_XOPEN_SOURCE=600
CFLAGS += $(WERROR_FLAGS)
CFLAGS += -Wno-strict-prototypes
LDLIBS += -lrte_common_mlx5
-LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched
# A few warnings cannot be avoided in external headers.
CFLAGS += -Wno-error=cast-qual
@@ -9,9 +9,10 @@ endif
fmt_name = 'mlx5_vdpa'
allow_experimental_apis = true
-deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal']
+deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
sources = files(
'mlx5_vdpa.c',
+ 'mlx5_vdpa_mem.c',
)
cflags_options = [
'-std=c11',
@@ -7,7 +7,6 @@
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_bus_pci.h>
-#include <rte_vdpa.h>
#include <mlx5_glue.h>
#include <mlx5_common.h>
@@ -15,16 +14,9 @@
#include <mlx5_prm.h>
#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
-struct mlx5_vdpa_priv {
- TAILQ_ENTRY(mlx5_vdpa_priv) next;
- int id; /* vDPA device id. */
- struct ibv_context *ctx; /* Device context. */
- struct rte_vdpa_dev_addr dev_addr;
- struct mlx5_hca_vdpa_attr caps;
-};
-
#ifndef VIRTIO_F_ORDER_PLATFORM
#define VIRTIO_F_ORDER_PLATFORM 36
#endif
@@ -236,6 +228,7 @@ struct mlx5_vdpa_priv {
rte_errno = rte_errno ? rte_errno : EINVAL;
goto error;
}
+ SLIST_INIT(&priv->mr_list);
pthread_mutex_lock(&priv_list_lock);
TAILQ_INSERT_TAIL(&priv_list, priv, next);
pthread_mutex_unlock(&priv_list_lock);
new file mode 100644
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VDPA_H_
+#define RTE_PMD_MLX5_VDPA_H_
+
+#include <sys/queue.h>
+
+#include <rte_vdpa.h>
+#include <rte_vhost.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+
+struct mlx5_vdpa_query_mr {
+ SLIST_ENTRY(mlx5_vdpa_query_mr) next;
+	void *addr; /* Host virtual start address of the region. */
+	uint64_t length; /* Length of the region in bytes. */
+	struct mlx5dv_devx_umem *umem; /* Umem, NULL for the indirect mkey. */
+	struct mlx5_devx_obj *mkey; /* Direct or indirect mkey object. */
+	int is_indirect; /* Set when this entry holds the indirect mkey. */
+};
+
+struct mlx5_vdpa_priv {
+ TAILQ_ENTRY(mlx5_vdpa_priv) next;
+ int id; /* vDPA device id. */
+ int vid; /* vhost device id. */
+ struct ibv_context *ctx; /* Device context. */
+ struct rte_vdpa_dev_addr dev_addr;
+ struct mlx5_hca_vdpa_attr caps;
+ uint32_t pdn; /* Protection Domain number. */
+	struct ibv_pd *pd; /* Protection Domain. */
+	uint32_t gpa_mkey_index; /* Indirect mkey index covering guest memory. */
+	struct ibv_mr *null_mr; /* Null MR mapping the holes between regions. */
+	struct rte_vhost_memory *vmem; /* Guest memory layout from vhost. */
+	SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list; /* Registered MRs. */
+};
+
+/**
+ * Release all the prepared memory regions and all their related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ */
+void mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Register all the memory regions of the virtio device to the HW and allocate
+ * all their related resources.
+ *
+ * @param[in] priv
+ * The vdpa driver private structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
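+
+/*
+ * Typical usage (a sketch inferred from this file, not a contract):
+ * mlx5_vdpa_mem_register() is called once priv->vid refers to a
+ * configured vhost device; it cleans up after itself on failure, and a
+ * successful registration is undone by mlx5_vdpa_mem_dereg() on close.
+ */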
+
+#endif /* RTE_PMD_MLX5_VDPA_H_ */
new file mode 100644
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <stdlib.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_common.h>
+#include <rte_sched_common.h>
+
+#include <mlx5_prm.h>
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+static int
+mlx5_vdpa_pd_prepare(struct mlx5_vdpa_priv *priv)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (priv->pd)
+ return 0;
+ priv->pd = mlx5_glue->alloc_pd(priv->ctx);
+ if (priv->pd == NULL) {
+ DRV_LOG(ERR, "Failed to allocate PD.");
+ return errno ? -errno : -ENOMEM;
+ }
+ struct mlx5dv_obj obj;
+ struct mlx5dv_pd pd_info;
+ int ret = 0;
+
+ obj.pd.in = priv->pd;
+ obj.pd.out = &pd_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+ if (ret) {
+ DRV_LOG(ERR, "Fail to get PD object info.");
+ mlx5_glue->dealloc_pd(priv->pd);
+ priv->pd = NULL;
+ return -errno;
+ }
+ priv->pdn = pd_info.pdn;
+ return 0;
+#else
+ (void)priv;
+ DRV_LOG(ERR, "Cannot get pdn - no DV support.");
+ return -ENOTSUP;
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+}
+
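+/*
+ * Safe to call on a partially registered state: every teardown step is
+ * guarded, which is what the error path of mlx5_vdpa_mem_register()
+ * relies on.
+ */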
+void
+mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
+{
+ struct mlx5_vdpa_query_mr *entry;
+ struct mlx5_vdpa_query_mr *next;
+
+ entry = SLIST_FIRST(&priv->mr_list);
+ while (entry) {
+ next = SLIST_NEXT(entry, next);
+ claim_zero(mlx5_devx_cmd_destroy(entry->mkey));
+ if (!entry->is_indirect)
+ claim_zero(mlx5_glue->devx_umem_dereg(entry->umem));
+ SLIST_REMOVE(&priv->mr_list, entry, mlx5_vdpa_query_mr, next);
+ rte_free(entry);
+ entry = next;
+ }
+ SLIST_INIT(&priv->mr_list);
+ if (priv->null_mr) {
+ claim_zero(mlx5_glue->dereg_mr(priv->null_mr));
+ priv->null_mr = NULL;
+ }
+ if (priv->pd) {
+ claim_zero(mlx5_glue->dealloc_pd(priv->pd));
+ priv->pd = NULL;
+ }
+ if (priv->vmem) {
+ free(priv->vmem);
+ priv->vmem = NULL;
+ }
+}
+
+static int
+mlx5_vdpa_regions_addr_cmp(const void *a, const void *b)
+{
+ const struct rte_vhost_mem_region *region_a = a;
+ const struct rte_vhost_mem_region *region_b = b;
+
+ if (region_a->guest_phys_addr < region_b->guest_phys_addr)
+ return -1;
+ if (region_a->guest_phys_addr > region_b->guest_phys_addr)
+ return 1;
+ return 0;
+}
+
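+/* Number of maximum-size KLM entries required to cover sz bytes. */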
+#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \
+ MLX5_MAX_KLM_BYTE_COUNT)
+
+/*
+ * Allocate and sort the region list and choose the indirect mkey mode:
+ *   1. Calculate the GCD, the guest memory size and the number of
+ *      indirect mkey entries per mode.
+ *   2. Cap the GCD at the maximum allowed size (2G) and round it down
+ *      to a power of 2.
+ *   3. Decide the indirect mkey mode according to the next rules:
+ *     a. If both the KLM_FBS entries number and the KLM entries number
+ *        are bigger than the maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES)
+ *        - error.
+ *     b. KLM mode if the KLM_FBS entries number is bigger than the
+ *        maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES).
+ *     c. KLM mode if the GCD is smaller than the minimum allowed (4K).
+ *     d. KLM mode if the total size of the KLM entries fits in one
+ *        cache line while the total size of the KLM_FBS entries does
+ *        not.
+ *     e. Otherwise, KLM_FBS mode.
+ */
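+
+/*
+ * Worked example (a sketch, assuming sizeof(struct mlx5_klm) == 16 and
+ * a 64B cache line): regions of 1G and 512M separated by a 256M hole
+ * give GCD = 256M. KLM needs 3 entries (48B - one cache line) while
+ * KLM_FBS needs mem_size / GCD = 1.75G / 256M = 7 entries (112B - more
+ * than one cache line), so rule d above selects KLM mode.
+ */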
+static struct rte_vhost_memory *
+mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size,
+ uint64_t *gcd, uint32_t *entries_num)
+{
+ struct rte_vhost_memory *mem;
+ uint64_t size;
+ uint64_t klm_entries_num = 0;
+ uint64_t klm_fbs_entries_num;
+ uint32_t i;
+ int ret = rte_vhost_get_mem_table(vid, &mem);
+
+ if (ret < 0) {
+ DRV_LOG(ERR, "Failed to get VM memory layout vid =%d.", vid);
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]),
+ mlx5_vdpa_regions_addr_cmp);
+ *mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) +
+ (mem->regions[(mem->nregions - 1)].size) -
+ (mem->regions[0].guest_phys_addr);
+ *gcd = 0;
+ for (i = 0; i < mem->nregions; ++i) {
+ DRV_LOG(INFO, "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64
+ ", size 0x%" PRIx64 ".", i,
+ mem->regions[i].host_user_addr,
+ mem->regions[i].guest_phys_addr, mem->regions[i].size);
+ if (i > 0) {
+			/* Handle the hole before this region. */
+ size = mem->regions[i].guest_phys_addr -
+ (mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size);
+ *gcd = rte_get_gcd(*gcd, size);
+ klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+ }
+ size = mem->regions[i].size;
+ *gcd = rte_get_gcd(*gcd, size);
+ klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+ }
+ if (*gcd > MLX5_MAX_KLM_BYTE_COUNT)
+ *gcd = rte_get_gcd(*gcd, MLX5_MAX_KLM_BYTE_COUNT);
+ if (!RTE_IS_POWER_OF_2(*gcd)) {
+ uint64_t candidate_gcd = rte_align64prevpow2(*gcd);
+
+ while (candidate_gcd > 1 && (*gcd % candidate_gcd))
+ candidate_gcd /= 2;
+ DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted "
+ "GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd);
+ *gcd = candidate_gcd;
+ }
+ klm_fbs_entries_num = *mem_size / *gcd;
+	if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE ||
+	    klm_fbs_entries_num > MLX5_DEVX_MAX_KLM_ENTRIES ||
+	    (klm_entries_num * sizeof(struct mlx5_klm) <=
+	     RTE_CACHE_LINE_SIZE &&
+	     klm_fbs_entries_num * sizeof(struct mlx5_klm) >
+	     RTE_CACHE_LINE_SIZE)) {
+ *mode = MLX5_MKC_ACCESS_MODE_KLM;
+ *entries_num = klm_entries_num;
+ DRV_LOG(INFO, "Indirect mkey mode is KLM.");
+ } else {
+ *mode = MLX5_MKC_ACCESS_MODE_KLM_FBS;
+ *entries_num = klm_fbs_entries_num;
+ DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size.");
+ }
+ DRV_LOG(DEBUG, "Memory registration information: nregions = %u, "
+ "mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64
+ ", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%"
+ PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num,
+ klm_entries_num);
+	if (*entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) {
+		DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is "
+			"too fragmented.", vid);
+		rte_errno = EINVAL;
+		free(mem);
+		return NULL;
+	}
+ return mem;
+}
+
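+/* Clamp a region or hole size to the maximum size of one KLM entry. */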
+#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
+ MLX5_MAX_KLM_BYTE_COUNT : (sz))
+
+/*
+ * The target here is to group all the physical memory regions of the
+ * virtio device in one indirect mkey.
+ * For KLM Fixed Buffer Size mode (the HW finds the translation entry
+ * in one read according to the guest physical address):
+ *	All its sub-direct mkeys must be of the same size, hence each of
+ *	them should be of the GCD size of all the virtio memory regions
+ *	and of the holes between them.
+ * For KLM mode (each entry may have a different size, so the HW must
+ * iterate over the entries):
+ *	Each virtio memory region and each hole between them gets one
+ *	entry; it is only needed to stay within the maximum allowed
+ *	entry size (2G) by splitting entries whose associated memory
+ *	regions are bigger than 2G.
+ * It means that each virtio memory region may be mapped to more than
+ * one direct mkey in both modes.
+ * All the holes of invalid memory between the virtio memory regions
+ * are mapped to the null memory region for security.
+ */
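+
+/*
+ * klm_array layout sketch for two regions separated by one hole, in the
+ * order the entries are built below: direct-mkey entries covering
+ * region 0, null_mr lkey entries covering the hole, then direct-mkey
+ * entries covering region 1. The indirect mkey then spans mem_size
+ * bytes starting at regions[0].guest_phys_addr.
+ */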
+int
+mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
+{
+ struct mlx5_devx_mkey_attr mkey_attr;
+ struct mlx5_vdpa_query_mr *entry = NULL;
+ struct rte_vhost_mem_region *reg = NULL;
+ uint8_t mode;
+ uint32_t entries_num = 0;
+ uint32_t i;
+ uint64_t gcd;
+ uint64_t klm_size;
+ uint64_t mem_size;
+ uint64_t k;
+ int klm_index = 0;
+ int ret;
+ struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
+ (priv->vid, &mode, &mem_size, &gcd, &entries_num);
+ struct mlx5_klm klm_array[entries_num];
+
+ if (!mem)
+ return -rte_errno;
+ priv->vmem = mem;
+ ret = mlx5_vdpa_pd_prepare(priv);
+ if (ret)
+ goto error;
+ priv->null_mr = mlx5_glue->alloc_null_mr(priv->pd);
+ if (!priv->null_mr) {
+ DRV_LOG(ERR, "Failed to allocate null MR.");
+ ret = -errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "Dump fill Mkey = %u.", priv->null_mr->lkey);
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+ if (!entry) {
+ ret = -ENOMEM;
+ DRV_LOG(ERR, "Failed to allocate mem entry memory.");
+ goto error;
+ }
+ entry->umem = mlx5_glue->devx_umem_reg(priv->ctx,
+ (void *)(uintptr_t)reg->host_user_addr,
+ reg->size, IBV_ACCESS_LOCAL_WRITE);
+ if (!entry->umem) {
+ DRV_LOG(ERR, "Failed to register Umem by Devx.");
+ ret = -errno;
+ goto error;
+ }
+ mkey_attr.addr = (uintptr_t)(reg->guest_phys_addr);
+ mkey_attr.size = reg->size;
+ mkey_attr.umem_id = entry->umem->umem_id;
+ mkey_attr.pd = priv->pdn;
+ mkey_attr.pg_access = 1;
+ mkey_attr.klm_array = NULL;
+ mkey_attr.klm_num = 0;
+ entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+ if (!entry->mkey) {
+ DRV_LOG(ERR, "Failed to create direct Mkey.");
+ ret = -rte_errno;
+ goto error;
+ }
+ entry->addr = (void *)(uintptr_t)(reg->host_user_addr);
+ entry->length = reg->size;
+ entry->is_indirect = 0;
+ if (i > 0) {
+ uint64_t sadd;
+ uint64_t empty_region_sz = reg->guest_phys_addr -
+ (mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size);
+
+ if (empty_region_sz > 0) {
+ sadd = mem->regions[i - 1].guest_phys_addr +
+ mem->regions[i - 1].size;
+ klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+ KLM_SIZE_MAX_ALIGN(empty_region_sz) : gcd;
+ for (k = 0; k < empty_region_sz;
+ k += klm_size) {
+ klm_array[klm_index].byte_count =
+ k + klm_size > empty_region_sz ?
+ empty_region_sz - k : klm_size;
+ klm_array[klm_index].mkey =
+ priv->null_mr->lkey;
+ klm_array[klm_index].address = sadd + k;
+ klm_index++;
+ }
+ }
+ }
+ klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+ KLM_SIZE_MAX_ALIGN(reg->size) : gcd;
+ for (k = 0; k < reg->size; k += klm_size) {
+ klm_array[klm_index].byte_count = k + klm_size >
+ reg->size ? reg->size - k : klm_size;
+ klm_array[klm_index].mkey = entry->mkey->id;
+ klm_array[klm_index].address = reg->guest_phys_addr + k;
+ klm_index++;
+ }
+ SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+ }
+ mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
+ mkey_attr.size = mem_size;
+ mkey_attr.pd = priv->pdn;
+ mkey_attr.umem_id = 0;
+ /* Must be zero for KLM mode. */
+ mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ?
+ rte_log2_u64(gcd) : 0;
+ mkey_attr.pg_access = 0;
+ mkey_attr.klm_array = klm_array;
+ mkey_attr.klm_num = klm_index;
+ entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+ if (!entry) {
+ DRV_LOG(ERR, "Failed to allocate memory for indirect entry.");
+ ret = -ENOMEM;
+ goto error;
+ }
+ entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+ if (!entry->mkey) {
+ DRV_LOG(ERR, "Failed to create indirect Mkey.");
+ ret = -rte_errno;
+ goto error;
+ }
+ entry->is_indirect = 1;
+ SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+ priv->gpa_mkey_index = entry->mkey->id;
+ return 0;
+error:
+ if (entry) {
+ if (entry->mkey)
+ mlx5_devx_cmd_destroy(entry->mkey);
+ if (entry->umem)
+ mlx5_glue->devx_umem_dereg(entry->umem);
+ rte_free(entry);
+ }
+ mlx5_vdpa_mem_dereg(priv);
+ rte_errno = -ret;
+ return ret;
+}