[v2,10/17] net/mana: implement memory registration

Message ID 1657067328-18374-11-git-send-email-longli@linuxonhyperv.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series Introduce Microsoft Azure Network Adatper (MANA) PMD |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Long Li July 6, 2022, 12:28 a.m. UTC
  From: Long Li <longli@microsoft.com>

MANA hardware has iommu built-in, that provides hardware safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two level memory registration
cache mechanisum for each queue and for each port.

Signed-off-by: Long Li <longli@microsoft.com>
---

Change log:
v2:
Change all header file functions to start with mana_.
Use spinlock in place of rwlock to memory cache access.
Remove unused header files.

 drivers/net/mana/mana.c      |  20 +++
 drivers/net/mana/mana.h      |  39 +++++
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/mp.c        |  85 +++++++++
 drivers/net/mana/mr.c        | 324 +++++++++++++++++++++++++++++++++++
 5 files changed, 469 insertions(+)
 create mode 100644 drivers/net/mana/mr.c
  

Patch

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 95ef322c95..24741197c9 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@  mana_dev_close(struct rte_eth_dev *dev)
 	struct mana_priv *priv = dev->data->dev_private;
 	int ret;
 
+	mana_remove_all_mr(priv);
+
 	ret = mana_intr_uninstall(priv);
 	if (ret)
 		return ret;
@@ -317,6 +319,13 @@  static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		goto fail;
 	}
 
+	ret = mana_mr_btree_init(&txq->mr_btree,
+				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init TXQ MR btree");
+		goto fail;
+	}
+
 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
 		queue_idx, nb_desc, socket_id, txq->desc_ring);
 
@@ -338,6 +347,8 @@  static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
 	struct mana_txq *txq = dev->data->tx_queues[qid];
 
+	mana_mr_btree_free(&txq->mr_btree);
+
 	rte_free(txq->desc_ring);
 	rte_free(txq);
 }
@@ -374,6 +385,13 @@  static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
 		goto fail;
 	}
 
+	ret = mana_mr_btree_init(&rxq->mr_btree,
+				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init RXQ MR btree");
+		goto fail;
+	}
+
 	rxq->num_desc = nb_desc;
 
 	rxq->priv = priv;
@@ -393,6 +411,8 @@  static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
 
+	mana_mr_btree_free(&rxq->mr_btree);
+
 	rte_free(rxq->desc_ring);
 	rte_free(rxq);
 }
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 33f68b3d1b..9e15b43275 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -50,6 +50,22 @@  struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE	256
 #define MAX_SEND_BUFFERS_PER_QUEUE	256
 
+struct mana_mr_cache {
+	uint32_t	lkey;
+	uintptr_t	addr;
+	size_t		len;
+	void		*verb_obj;
+};
+
+#define MANA_MR_BTREE_CACHE_N	512
+struct mana_mr_btree {
+	uint16_t	len;	/* Used entries */
+	uint16_t	size;	/* Total entries */
+	int		overflow;
+	int		socket;
+	struct mana_mr_cache *table;
+};
+
 struct mana_process_priv {
 	void *db_page;
 };
@@ -82,6 +98,8 @@  struct mana_priv {
 	int max_recv_sge;
 	int max_mr;
 	uint64_t max_mr_size;
+	struct mana_mr_btree mr_btree;
+	rte_spinlock_t	mr_btree_lock;
 };
 
 struct mana_txq_desc {
@@ -131,6 +149,7 @@  struct mana_txq {
 	uint32_t desc_ring_head, desc_ring_tail;
 
 	struct mana_stats stats;
+	struct mana_mr_btree mr_btree;
 	unsigned int socket;
 };
 
@@ -153,6 +172,7 @@  struct mana_rxq {
 	struct mana_gdma_queue gdma_cq;
 
 	struct mana_stats stats;
+	struct mana_mr_btree mr_btree;
 
 	unsigned int socket;
 };
@@ -176,6 +196,24 @@  uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
+				       struct mana_priv *priv,
+				       struct rte_mbuf *mbuf);
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+		    struct rte_mempool *pool);
+void mana_remove_all_mr(struct mana_priv *priv);
+void mana_del_pmd_mr(struct mana_mr_cache *mr);
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
+			   struct rte_mempool_memhdr *memhdr, unsigned int idx);
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+					   uint16_t *idx,
+					   uintptr_t addr, size_t len);
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry);
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket);
+void mana_mr_btree_free(struct mana_mr_btree *bt);
+
 /** Request timeout for IPC. */
 #define MANA_MP_REQ_TIMEOUT_SEC 5
 
@@ -204,6 +242,7 @@  int mana_mp_init_secondary(void);
 void mana_mp_uninit_primary(void);
 void mana_mp_uninit_secondary(void);
 int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 0eb5ff30ee..59b18923df 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@  deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
 	'mana.c',
+	'mr.c',
 	'mp.c',
 )
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index d7580e8a28..f4f78d2787 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -12,6 +12,52 @@ 
 
 extern struct mana_shared_data *mana_shared_data;
 
+static int mana_mp_mr_create(struct mana_priv *priv, uintptr_t addr,
+			     uint32_t len)
+{
+	struct ibv_mr *ibv_mr;
+	int ret;
+	struct mana_mr_cache *mr;
+
+	ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)addr, len,
+			    IBV_ACCESS_LOCAL_WRITE);
+
+	if (!ibv_mr)
+		return -errno;
+
+	DRV_LOG(DEBUG, "MR (2nd) lkey %u addr %p len %zu",
+		ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+	mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+	if (!mr) {
+		DRV_LOG(ERR, "(2nd) Failed to allocate MR");
+		ret = -ENOMEM;
+		goto fail_alloc;
+	}
+	mr->lkey = ibv_mr->lkey;
+	mr->addr = (uintptr_t)ibv_mr->addr;
+	mr->len = ibv_mr->length;
+	mr->verb_obj = ibv_mr;
+
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+	if (ret) {
+		DRV_LOG(ERR, "(2nd) Failed to add to global MR btree");
+		goto fail_btree;
+	}
+
+	return 0;
+
+fail_btree:
+	rte_free(mr);
+
+fail_alloc:
+	ibv_dereg_mr(ibv_mr);
+
+	return ret;
+}
+
 static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
 			int port_id)
 {
@@ -47,6 +93,12 @@  static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
 	mp_init_msg(&mp_res, param->type, param->port_id);
 
 	switch (param->type) {
+	case MANA_MP_REQ_CREATE_MR:
+		ret = mana_mp_mr_create(priv, param->addr, param->len);
+		res->result = ret;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
 	case MANA_MP_REQ_VERBS_CMD_FD:
 		mp_res.num_fds = 1;
 		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
@@ -189,6 +241,39 @@  int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
 	return ret;
 }
 
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *req = (struct mana_mp_param *)mp_req.param;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+
+	mp_init_msg(&mp_req, MANA_MP_REQ_CREATE_MR, priv->port_id);
+	req->addr = addr;
+	req->len = len;
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "Port %u request to primary failed",
+			req->port_id);
+		return ret;
+	}
+
+	if (mp_rep.nb_received != 1)
+		return -EPROTO;
+
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mana_mp_param *)mp_res->param;
+	ret = res->result;
+
+	free(mp_rep.msgs);
+
+	return ret;
+}
+
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 {
 	struct rte_mp_msg mp_req = { 0 };
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
new file mode 100644
index 0000000000..9f4f0fdc06
--- /dev/null
+++ b/drivers/net/mana/mr.c
@@ -0,0 +1,324 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+
+#include "mana.h"
+
+struct mana_range {
+	uintptr_t	start;
+	uintptr_t	end;
+	uint32_t	len;
+};
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+			   struct rte_mempool_memhdr *memhdr, unsigned int idx)
+{
+	struct mana_range *ranges = opaque;
+	struct mana_range *range = &ranges[idx];
+	uint64_t page_size = rte_mem_page_size();
+
+	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
+	range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
+				    page_size);
+	range->len = range->end - range->start;
+}
+
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+		    struct rte_mempool *pool)
+{
+	struct ibv_mr *ibv_mr;
+	struct mana_range ranges[pool->nb_mem_chunks];
+	uint32_t i;
+	struct mana_mr_cache *mr;
+	int ret;
+
+	rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
+
+	for (i = 0; i < pool->nb_mem_chunks; i++) {
+		if (ranges[i].len > priv->max_mr_size) {
+			DRV_LOG(ERR, "memory chunk size %u exceeding max MR\n",
+				ranges[i].len);
+			return -ENOMEM;
+		}
+
+		DRV_LOG(DEBUG,
+			"registering memory chunk start 0x%" PRIx64 " len %u",
+			ranges[i].start, ranges[i].len);
+
+		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+			/* Send a message to the primary to do MR */
+			ret = mana_mp_req_mr_create(priv, ranges[i].start,
+						    ranges[i].len);
+			if (ret) {
+				DRV_LOG(ERR,
+					"MR failed start 0x%" PRIx64 " len %u",
+					ranges[i].start, ranges[i].len);
+				return ret;
+			}
+			continue;
+		}
+
+		ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
+				    ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
+		if (ibv_mr) {
+			DRV_LOG(DEBUG, "MR lkey %u addr %p len %" PRIu64,
+				ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+			mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+			mr->lkey = ibv_mr->lkey;
+			mr->addr = (uintptr_t)ibv_mr->addr;
+			mr->len = ibv_mr->length;
+			mr->verb_obj = ibv_mr;
+
+			rte_spinlock_lock(&priv->mr_btree_lock);
+			ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+			rte_spinlock_unlock(&priv->mr_btree_lock);
+			if (ret) {
+				ibv_dereg_mr(ibv_mr);
+				DRV_LOG(ERR, "Failed to add to global MR btree");
+				return ret;
+			}
+
+			ret = mana_mr_btree_insert(local_tree, mr);
+			if (ret) {
+				/* Don't need to clean up MR as it's already
+				 * in the global tree
+				 */
+				DRV_LOG(ERR, "Failed to add to local MR btree");
+				return ret;
+			}
+		} else {
+			DRV_LOG(ERR, "MR failed at 0x%" PRIx64 " len %u",
+				ranges[i].start, ranges[i].len);
+			return -errno;
+		}
+	}
+	return 0;
+}
+
+void mana_del_pmd_mr(struct mana_mr_cache *mr)
+{
+	int ret;
+	struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;
+
+	ret = ibv_dereg_mr(ibv_mr);
+	if (ret)
+		DRV_LOG(ERR, "dereg MR failed ret %d", ret);
+}
+
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_mr_btree,
+				       struct mana_priv *priv,
+				       struct rte_mbuf *mbuf)
+{
+	struct rte_mempool *pool = mbuf->pool;
+	int ret, second_try = 0;
+	struct mana_mr_cache *mr;
+	uint16_t idx;
+
+	DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
+		mbuf->buf_addr, mbuf->buf_len);
+
+try_again:
+	/* First try to find the MR in local queue tree */
+	mr = mana_mr_btree_lookup(local_mr_btree, &idx,
+				  (uintptr_t)mbuf->buf_addr, mbuf->buf_len);
+	if (mr) {
+		DRV_LOG(DEBUG,
+			"Local mr lkey %u addr 0x%" PRIx64 " len %" PRIu64,
+			mr->lkey, mr->addr, mr->len);
+		return mr;
+	}
+
+	/* If not found, try to find the MR in global tree */
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	mr = mana_mr_btree_lookup(&priv->mr_btree, &idx,
+				  (uintptr_t)mbuf->buf_addr,
+				  mbuf->buf_len);
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+
+	/* If found in the global tree, add it to the local tree */
+	if (mr) {
+		ret = mana_mr_btree_insert(local_mr_btree, mr);
+		if (ret) {
+			DRV_LOG(DEBUG, "Failed to add MR to local tree.");
+			return NULL;
+		}
+
+		DRV_LOG(DEBUG,
+			"Added local MR key %u addr 0x%" PRIx64 " len %" PRIu64,
+			mr->lkey, mr->addr, mr->len);
+		return mr;
+	}
+
+	if (second_try) {
+		DRV_LOG(ERR, "Internal error second try failed");
+		return NULL;
+	}
+
+	ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
+			ret, mbuf->buf_addr, mbuf->buf_len);
+		return NULL;
+	}
+
+	second_try = 1;
+	goto try_again;
+}
+
+void mana_remove_all_mr(struct mana_priv *priv)
+{
+	struct mana_mr_btree *bt = &priv->mr_btree;
+	struct mana_mr_cache *mr;
+	struct ibv_mr *ibv_mr;
+	uint16_t i;
+
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	/* Start with index 1 as the 1st entry is always NULL */
+	for (i = 1; i < bt->len; i++) {
+		mr = &bt->table[i];
+		ibv_mr = mr->verb_obj;
+		ibv_dereg_mr(ibv_mr);
+	}
+	bt->len = 1;
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+}
+
+static int mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
+{
+	void *mem;
+
+	mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
+				 0, bt->socket);
+	if (!mem) {
+		DRV_LOG(ERR, "Failed to expand btree size %d", n);
+		return -1;
+	}
+
+	DRV_LOG(ERR, "Expanded btree to size %d", n);
+	bt->table = mem;
+	bt->size = n;
+
+	return 0;
+}
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+					   uint16_t *idx,
+					   uintptr_t addr, size_t len)
+{
+	struct mana_mr_cache *table;
+	uint16_t n;
+	uint16_t base = 0;
+	int ret;
+
+	n = bt->len;
+
+	/* Try to double the cache if it's full */
+	if (n == bt->size) {
+		ret = mana_mr_btree_expand(bt, bt->size << 1);
+		if (ret)
+			return NULL;
+	}
+
+	table = bt->table;
+
+	/* Do binary search on addr */
+	do {
+		uint16_t delta = n >> 1;
+
+		if (addr < table[base + delta].addr) {
+			n = delta;
+		} else {
+			base += delta;
+			n -= delta;
+		}
+	} while (n > 1);
+
+	*idx = base;
+
+	if (addr + len <= table[base].addr + table[base].len)
+		return &table[base];
+
+	DRV_LOG(DEBUG,
+		"addr 0x%" PRIx64 " len %zu idx %u sum 0x%" PRIx64 " not found",
+		addr, len, *idx, addr + len);
+
+	return NULL;
+}
+
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
+{
+	memset(bt, 0, sizeof(*bt));
+	bt->table = rte_calloc_socket("MANA B-tree table",
+				      n,
+				      sizeof(struct mana_mr_cache),
+				      0, socket);
+	if (!bt->table) {
+		DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
+			n, socket);
+		return -ENOMEM;
+	}
+
+	bt->socket = socket;
+	bt->size = n;
+
+	/* First entry must be NULL for binary search to work */
+	bt->table[0] = (struct mana_mr_cache) {
+		.lkey = UINT32_MAX,
+	};
+	bt->len = 1;
+
+	DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
+		bt->table, n, bt->len);
+
+	return 0;
+}
+
+void mana_mr_btree_free(struct mana_mr_btree *bt)
+{
+	rte_free(bt->table);
+	memset(bt, 0, sizeof(*bt));
+}
+
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
+{
+	struct mana_mr_cache *table;
+	uint16_t idx = 0;
+	uint16_t shift;
+
+	if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) {
+		DRV_LOG(DEBUG, "Addr 0x%" PRIx64 " len %zu exists in btree",
+			entry->addr, entry->len);
+		return 0;
+	}
+
+	if (bt->len >= bt->size) {
+		bt->overflow = 1;
+		return -1;
+	}
+
+	table = bt->table;
+
+	idx++;
+	shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
+	if (shift) {
+		DRV_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
+			shift, idx, idx + 1);
+		memmove(&table[idx + 1], &table[idx], shift);
+	}
+
+	table[idx] = *entry;
+	bt->len++;
+
+	DRV_LOG(DEBUG,
+		"Inserted MR b-tree table %p idx %d addr 0x%" PRIx64 " len %zu",
+		table, idx, entry->addr, entry->len);
+
+	return 0;
+}