[v2] net/mlx5: share Memory Regions for multiport device

Message ID 1556095470-11407-1-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series [v2] net/mlx5: share Memory Regions for multiport device |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS

Commit Message

Slava Ovsiienko April 24, 2019, 8:44 a.m. UTC
  The multiport Infiniband device support was introduced [1].
All active ports belonging to the same Infiniband device use the single
shared Infiniband context of that device and share the resources:
  - QPs are created within shared context
  - Verbs flows are also created with specifying port index
  - DV/DR resources
  - Protection Domain
  - Event Handlers

This patchset adds support for sharing Memory Regions between
ports created on top of a multiport Infiniband device.
The datapath of mlx5 uses a layered cache subsystem for
allocating/releasing Memory Regions; only the lowest layer (L3)
is shared, for performance reasons.

[1] http://patches.dpdk.org/cover/51800/

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
v2:
  - indentation issues
  - comments cleanup

v1:
  http://patches.dpdk.org/patch/52723/

 drivers/net/mlx5/mlx5.c     |  40 +++++++----
 drivers/net/mlx5/mlx5.h     |  15 ++--
 drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
 drivers/net/mlx5/mlx5_mr.h  |   5 +-
 drivers/net/mlx5/mlx5_txq.c |   2 +-
 5 files changed, 121 insertions(+), 105 deletions(-)
  

Comments

Yongseok Koh April 24, 2019, 8:52 a.m. UTC | #1
> On Apr 24, 2019, at 1:44 AM, Viacheslav Ovsiienko <viacheslavo@mellanox.com> wrote:
> 
> The multiport Infiniband device support was introduced [1].
> All active ports, belonging to the same Infiniband device use the single
> shared Infiniband context of that device and share the resources:
>  - QPs are created within shared context
>  - Verbs flows are also created with specifying port index
>  - DV/DR resources
>  - Protection Domain
>  - Event Handlers
> 
> This patchset adds support for Memory Regions sharing between
> ports, created on the base of multiport Infiniband device.
> The datapath of mlx5 uses the layered cache subsystem for
> allocating/releasing Memory Regions, only the lowest layer L3
> is subject to share due to performance issues.
> 
> [1] http://patches.dpdk.org/cover/51800/
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
Acked-by: Yongseok Koh <yskoh@mellanox.com>

> v2:
>  - indentation issues
>  - comments cleanup
> 
> v1:
>  http://patches.dpdk.org/patch/52723/
> 
> drivers/net/mlx5/mlx5.c     |  40 +++++++----
> drivers/net/mlx5/mlx5.h     |  15 ++--
> drivers/net/mlx5/mlx5_mr.c  | 164 ++++++++++++++++++++++----------------------
> drivers/net/mlx5/mlx5_mr.h  |   5 +-
> drivers/net/mlx5/mlx5_txq.c |   2 +-
> 5 files changed, 121 insertions(+), 105 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 1bb58b1..b563e0f 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -147,6 +147,7 @@ struct mlx5_dev_spawn_data {
> 	struct mlx5_switch_info info; /**< Switch information. */
> 	struct ibv_device *ibv_dev; /**< Associated IB device. */
> 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> +	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
> };
> 
> static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
> @@ -225,6 +226,7 @@ struct mlx5_dev_spawn_data {
> 		sizeof(sh->ibdev_name));
> 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> 		sizeof(sh->ibdev_path));
> +	sh->pci_dev = spawn->pci_dev;
> 	pthread_mutex_init(&sh->intr_mutex, NULL);
> 	/*
> 	 * Setting port_id to max unallowed value means
> @@ -239,6 +241,22 @@ struct mlx5_dev_spawn_data {
> 		err = ENOMEM;
> 		goto error;
> 	}
> +	/*
> +	 * Once the device is added to the list of memory event
> +	 * callback, its global MR cache table cannot be expanded
> +	 * on the fly because of deadlock. If it overflows, lookup
> +	 * should be done by searching MR list linearly, which is slow.
> +	 *
> +	 * At this point the device is not added to the memory
> +	 * event list yet, context is just being created.
> +	 */
> +	err = mlx5_mr_btree_init(&sh->mr.cache,
> +				 MLX5_MR_BTREE_CACHE_N * 2,
> +				 sh->pci_dev->device.numa_node);
> +	if (err) {
> +		err = rte_errno;
> +		goto error;
> +	}
> 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> exit:
> 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
> @@ -286,6 +304,8 @@ struct mlx5_dev_spawn_data {
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	if (--sh->refcnt)
> 		goto exit;
> +	/* Release created Memory Regions. */
> +	mlx5_mr_release(sh);
> 	LIST_REMOVE(sh, next);
> 	/*
> 	 *  Ensure there is no async event handler installed.
> @@ -651,7 +671,10 @@ struct mlx5_dev_spawn_data {
> 	}
> 	mlx5_proc_priv_uninit(dev);
> 	mlx5_mprq_free_mp(dev);
> -	mlx5_mr_release(dev);
> +	/* Remove from memory callback device list. */
> +	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> +	LIST_REMOVE(priv, mem_event_cb);
> +	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	assert(priv->sh);
> 	mlx5_free_shared_dr(priv);
> 	if (priv->rss_conf.rss_key != NULL)
> @@ -1548,19 +1571,6 @@ struct mlx5_dev_spawn_data {
> 		goto error;
> 	}
> 	priv->config.flow_prio = err;
> -	/*
> -	 * Once the device is added to the list of memory event
> -	 * callback, its global MR cache table cannot be expanded
> -	 * on the fly because of deadlock. If it overflows, lookup
> -	 * should be done by searching MR list linearly, which is slow.
> -	 */
> -	err = mlx5_mr_btree_init(&priv->mr.cache,
> -				 MLX5_MR_BTREE_CACHE_N * 2,
> -				 eth_dev->device->numa_node);
> -	if (err) {
> -		err = rte_errno;
> -		goto error;
> -	}
> 	/* Add device to memory callback list. */
> 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
> @@ -1757,6 +1767,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = i;
> 			list[ns].ibv_dev = ibv_match[0];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = mlx5_nl_ifindex
> 					(nl_rdma, list[ns].ibv_dev->name, i);
> 			if (!list[ns].ifindex) {
> @@ -1823,6 +1834,7 @@ struct mlx5_dev_spawn_data {
> 			list[ns].ibv_port = 1;
> 			list[ns].ibv_dev = ibv_match[i];
> 			list[ns].eth_dev = NULL;
> +			list[ns].pci_dev = pci_dev;
> 			list[ns].ifindex = 0;
> 			if (nl_rdma >= 0)
> 				list[ns].ifindex = mlx5_nl_ifindex
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 0a6d7f1..2575732 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -275,6 +275,14 @@ struct mlx5_ibv_shared {
> 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
> 	struct ibv_device_attr_ex device_attr; /* Device properties. */
> +	struct rte_pci_device *pci_dev; /* Backend PCI device. */
> +	struct {
> +		uint32_t dev_gen; /* Generation number to flush local caches. */
> +		rte_rwlock_t rwlock; /* MR Lock. */
> +		struct mlx5_mr_btree cache; /* Global MR cache table. */
> +		struct mlx5_mr_list mr_list; /* Registered MR list. */
> +		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> +	} mr;
> 	/* Shared DV/DR flow data section. */
> 	pthread_mutex_t dv_mutex; /* DV context mutex. */
> 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> @@ -347,13 +355,6 @@ struct mlx5_priv {
> 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
> 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
> 	/* Flow counters. */
> -	struct {
> -		uint32_t dev_gen; /* Generation number to flush local caches. */
> -		rte_rwlock_t rwlock; /* MR Lock. */
> -		struct mlx5_mr_btree cache; /* Global MR cache table. */
> -		struct mlx5_mr_list mr_list; /* Registered MR list. */
> -		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
> -	} mr;
> 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
> 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
> 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
> diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
> index e4957a4..a7a63b1 100644
> --- a/drivers/net/mlx5/mlx5_mr.c
> +++ b/drivers/net/mlx5/mlx5_mr.c
> @@ -36,7 +36,7 @@ struct mr_update_mp_data {
> 
> /**
>  * Expand B-tree table to a given size. Can't be called with holding
> - * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
> + * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
>  *
>  * @param bt
>  *   Pointer to B-tree structure.
> @@ -350,7 +350,7 @@ struct mr_update_mp_data {
> 		n = mr_find_next_chunk(mr, &entry, n);
> 		if (!entry.end)
> 			break;
> -		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
> +		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
> 			/*
> 			 * Overflowed, but the global table cannot be expanded
> 			 * because of deadlock.
> @@ -382,7 +382,7 @@ struct mr_update_mp_data {
> 	struct mlx5_mr *mr;
> 
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> 		if (mr->ms_n == 0)
> @@ -420,6 +420,7 @@ struct mr_update_mp_data {
> 	      uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	uint16_t idx;
> 	uint32_t lkey = UINT32_MAX;
> 	struct mlx5_mr *mr;
> @@ -430,10 +431,10 @@ struct mr_update_mp_data {
> 	 * has to be searched by traversing the original MR list instead, which
> 	 * is very slow path. Otherwise, the global cache is all inclusive.
> 	 */
> -	if (!unlikely(priv->mr.cache.overflow)) {
> -		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	if (!unlikely(sh->mr.cache.overflow)) {
> +		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 		if (lkey != UINT32_MAX)
> -			*entry = (*priv->mr.cache.table)[idx];
> +			*entry = (*sh->mr.cache.table)[idx];
> 	} else {
> 		/* Falling back to the slowest path. */
> 		mr = mr_lookup_dev_list(dev, entry, addr);
> @@ -468,13 +469,12 @@ struct mr_update_mp_data {
> /**
>  * Release resources of detached MR having no online entry.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> static void
> -mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
> +mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
> 
> @@ -484,11 +484,11 @@ struct mr_update_mp_data {
> 	 * MR can't be freed with holding the lock because rte_free() could call
> 	 * memory free callback function. This will be a deadlock situation.
> 	 */
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach the whole free list and release it after unlocking. */
> -	free_list = priv->mr.mr_free_list;
> -	LIST_INIT(&priv->mr.mr_free_list);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	free_list = sh->mr.mr_free_list;
> +	LIST_INIT(&sh->mr.mr_free_list);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Release resources. */
> 	mr_next = LIST_FIRST(&free_list);
> 	while (mr_next != NULL) {
> @@ -548,12 +548,12 @@ struct mr_update_mp_data {
> 		      dev->data->port_id, (void *)addr);
> 		return UINT32_MAX;
> 	}
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
> 	/* Fill in output data. */
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
> 	DEBUG("port %u MR CREATED by primary process for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
> 	      dev->data->port_id, (void *)addr,
> @@ -582,6 +582,7 @@ struct mr_update_mp_data {
> 		       uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_dev_config *config = &priv->config;
> 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> 	const struct rte_memseg_list *msl;
> @@ -602,12 +603,12 @@ struct mr_update_mp_data {
> 		dev->data->port_id, (void *)addr);
> 	/*
> 	 * Release detached MRs if any. This can't be called with holding either
> -	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
> +	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
> 	 * been detached by the memory free event but it couldn't be released
> 	 * inside the callback due to deadlock. As a result, releasing resources
> 	 * is quite opportunistic.
> 	 */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> 	/*
> 	 * If enabled, find out a contiguous virtual address chunk in use, to
> 	 * which the given address belongs, in order to register maximum range.
> @@ -710,7 +711,7 @@ struct mr_update_mp_data {
> 		goto alloc_resources;
> 	}
> 	assert(data.msl == data_re.msl);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/*
> 	 * Check the address is really missing. If other thread already created
> 	 * one or it is not found due to overflow, abort and return.
> @@ -721,10 +722,10 @@ struct mr_update_mp_data {
> 		 * low-on-memory. Then, this entry will have to be searched
> 		 * here again.
> 		 */
> -		mr_btree_insert(&priv->mr.cache, entry);
> +		mr_btree_insert(&sh->mr.cache, entry);
> 		DEBUG("port %u found MR for %p on final lookup, abort",
> 		      dev->data->port_id, (void *)addr);
> -		rte_rwlock_write_unlock(&priv->mr.rwlock);
> +		rte_rwlock_write_unlock(&sh->mr.rwlock);
> 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 		/*
> 		 * Must be unlocked before calling rte_free() because
> @@ -769,7 +770,7 @@ struct mr_update_mp_data {
> 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
> 	 * through mlx5_alloc_verbs_buf().
> 	 */
> -	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
> +	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
> 				       IBV_ACCESS_LOCAL_WRITE);
> 	if (mr->ibv_mr == NULL) {
> 		DEBUG("port %u fail to create a verbs MR for address (%p)",
> @@ -779,7 +780,7 @@ struct mr_update_mp_data {
> 	}
> 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
> 	assert(mr->ibv_mr->length == len);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	DEBUG("port %u MR CREATED (%p) for %p:\n"
> 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
> 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
> @@ -792,11 +793,11 @@ struct mr_update_mp_data {
> 	mr_lookup_dev(dev, entry, addr);
> 	/* Lookup can't fail. */
> 	assert(entry->lkey != UINT32_MAX);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> 	return entry->lkey;
> err_mrlock:
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> err_memlock:
> 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> err_nolock:
> @@ -854,14 +855,15 @@ struct mr_update_mp_data {
> mr_rebuild_dev_cache(struct rte_eth_dev *dev)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr *mr;
> 
> 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
> 	/* Flush cache to rebuild. */
> -	priv->mr.cache.len = 1;
> -	priv->mr.cache.overflow = 0;
> +	sh->mr.cache.len = 1;
> +	sh->mr.cache.overflow = 0;
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
> 		if (mr_insert_dev_cache(dev, mr) < 0)
> 			return;
> }
> @@ -888,6 +890,7 @@ struct mr_update_mp_data {
> mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	const struct rte_memseg_list *msl;
> 	struct mlx5_mr *mr;
> 	int ms_n;
> @@ -901,7 +904,7 @@ struct mr_update_mp_data {
> 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
> 	assert(len == RTE_ALIGN(len, msl->page_sz));
> 	ms_n = len / msl->page_sz;
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Clear bits of freed memsegs from MR. */
> 	for (i = 0; i < ms_n; ++i) {
> 		const struct rte_memseg *ms;
> @@ -928,7 +931,7 @@ struct mr_update_mp_data {
> 		rte_bitmap_clear(mr->ms_bmp, pos);
> 		if (--mr->ms_n == 0) {
> 			LIST_REMOVE(mr, mr);
> -			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 			DEBUG("port %u remove MR(%p) from list",
> 			      dev->data->port_id, (void *)mr);
> 		}
> @@ -949,12 +952,12 @@ struct mr_update_mp_data {
> 		 * generation below) will be guaranteed to be seen by other core
> 		 * before the core sees the newly allocated memory.
> 		 */
> -		++priv->mr.dev_gen;
> +		++sh->mr.dev_gen;
> 		DEBUG("broadcasting local cache flush, gen=%d",
> -		      priv->mr.dev_gen);
> +		      sh->mr.dev_gen);
> 		rte_smp_wmb();
> 	}
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> }
> 
> /**
> @@ -1013,6 +1016,7 @@ struct mr_update_mp_data {
> 		   struct mlx5_mr_cache *entry, uintptr_t addr)
> {
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
> 	uint16_t idx;
> 	uint32_t lkey;
> @@ -1021,12 +1025,12 @@ struct mr_update_mp_data {
> 	if (unlikely(bt->len == bt->size))
> 		mr_btree_expand(bt, bt->size << 1);
> 	/* Look up in the global cache. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> -	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> +	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
> 	if (lkey != UINT32_MAX) {
> 		/* Found. */
> -		*entry = (*priv->mr.cache.table)[idx];
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		*entry = (*sh->mr.cache.table)[idx];
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		/*
> 		 * Update local cache. Even if it fails, return the found entry
> 		 * to update top-half cache. Next time, this entry will be found
> @@ -1035,7 +1039,7 @@ struct mr_update_mp_data {
> 		mr_btree_insert(bt, entry);
> 		return lkey;
> 	}
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	/* First time to see the address? Create a new MR. */
> 	lkey = mlx5_mr_create(dev, entry, addr);
> 	/*
> @@ -1261,6 +1265,7 @@ struct mr_update_mp_data {
> 	struct mr_update_mp_data *data = opaque;
> 	struct rte_eth_dev *dev = data->dev;
> 	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_ibv_shared *sh = priv->sh;
> 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
> 	struct mlx5_mr *mr = NULL;
> 	uintptr_t addr = (uintptr_t)memhdr->addr;
> @@ -1270,9 +1275,9 @@ struct mr_update_mp_data {
> 
> 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> 	/* If already registered, it should return. */
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	lkey = mr_lookup_dev(dev, &entry, addr);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	if (lkey != UINT32_MAX)
> 		return;
> 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
> @@ -1286,11 +1291,11 @@ struct mr_update_mp_data {
> 		data->ret = -1;
> 		return;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Insert to the local cache table */
> 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
> }
> @@ -1339,6 +1344,7 @@ struct mr_update_mp_data {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_mr *mr;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 
> 	dev = pci_dev_to_eth_dev(pdev);
> 	if (!dev) {
> @@ -1355,11 +1361,12 @@ struct mr_update_mp_data {
> 		rte_errno = EINVAL;
> 		return -1;
> 	}
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> -	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
> +	sh = priv->sh;
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> +	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
> 	/* Insert to the global cache table. */
> 	mr_insert_dev_cache(dev, mr);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1384,6 +1391,7 @@ struct mr_update_mp_data {
> {
> 	struct rte_eth_dev *dev;
> 	struct mlx5_priv *priv;
> +	struct mlx5_ibv_shared *sh;
> 	struct mlx5_mr *mr;
> 	struct mlx5_mr_cache entry;
> 
> @@ -1395,10 +1403,11 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	priv = dev->data->dev_private;
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	sh = priv->sh;
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
> 	if (!mr) {
> -		rte_rwlock_read_unlock(&priv->mr.rwlock);
> +		rte_rwlock_read_unlock(&sh->mr.rwlock);
> 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
> 				 "to PCI device %p", (uintptr_t)addr,
> 				 (void *)pdev);
> @@ -1406,7 +1415,7 @@ struct mr_update_mp_data {
> 		return -1;
> 	}
> 	LIST_REMOVE(mr, mr);
> -	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
> 	      (void *)mr);
> 	mr_rebuild_dev_cache(dev);
> @@ -1419,11 +1428,10 @@ struct mr_update_mp_data {
> 	 * generation below) will be guaranteed to be seen by other core
> 	 * before the core sees the newly allocated memory.
> 	 */
> -	++priv->mr.dev_gen;
> -	DEBUG("broadcasting local cache flush, gen=%d",
> -			priv->mr.dev_gen);
> +	++sh->mr.dev_gen;
> +	DEBUG("broadcasting local cache flush, gen=%d",	sh->mr.dev_gen);
> 	rte_smp_wmb();
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> 	return 0;
> }
> 
> @@ -1544,25 +1552,24 @@ struct mr_update_mp_data {
> /**
>  * Dump all the created MRs and the global cache entries.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
> +mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
> {
> #ifndef NDEBUG
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr;
> 	int mr_n = 0;
> 	int chunk_n = 0;
> 
> -	rte_rwlock_read_lock(&priv->mr.rwlock);
> +	rte_rwlock_read_lock(&sh->mr.rwlock);
> 	/* Iterate all the existing MRs. */
> -	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
> +	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
> 		unsigned int n;
> 
> -		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> -		      dev->data->port_id, mr_n++,
> +		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
> +		      sh->ibdev_name, mr_n++,
> 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
> 		      mr->ms_n, mr->ms_bmp_n);
> 		if (mr->ms_n == 0)
> @@ -1577,45 +1584,40 @@ struct mr_update_mp_data {
> 			      chunk_n++, ret.start, ret.end);
> 		}
> 	}
> -	DEBUG("port %u dumping global cache", dev->data->port_id);
> -	mlx5_mr_btree_dump(&priv->mr.cache);
> -	rte_rwlock_read_unlock(&priv->mr.rwlock);
> +	DEBUG("device %s dumping global cache", sh->ibdev_name);
> +	mlx5_mr_btree_dump(&sh->mr.cache);
> +	rte_rwlock_read_unlock(&sh->mr.rwlock);
> #endif
> }
> 
> /**
> - * Release all the created MRs and resources. Remove device from memory callback
> + * Release all the created MRs and resources for shared device context.
>  * list.
>  *
> - * @param dev
> - *   Pointer to Ethernet device.
> + * @param sh
> + *   Pointer to Ethernet device shared context.
>  */
> void
> -mlx5_mr_release(struct rte_eth_dev *dev)
> +mlx5_mr_release(struct mlx5_ibv_shared *sh)
> {
> -	struct mlx5_priv *priv = dev->data->dev_private;
> 	struct mlx5_mr *mr_next;
> 
> -	/* Remove from memory callback device list. */
> -	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
> -	LIST_REMOVE(priv, mem_event_cb);
> -	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
> -		mlx5_mr_dump_dev(dev);
> -	rte_rwlock_write_lock(&priv->mr.rwlock);
> +		mlx5_mr_dump_dev(sh);
> +	rte_rwlock_write_lock(&sh->mr.rwlock);
> 	/* Detach from MR list and move to free list. */
> -	mr_next = LIST_FIRST(&priv->mr.mr_list);
> +	mr_next = LIST_FIRST(&sh->mr.mr_list);
> 	while (mr_next != NULL) {
> 		struct mlx5_mr *mr = mr_next;
> 
> 		mr_next = LIST_NEXT(mr, mr);
> 		LIST_REMOVE(mr, mr);
> -		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
> +		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
> 	}
> -	LIST_INIT(&priv->mr.mr_list);
> +	LIST_INIT(&sh->mr.mr_list);
> 	/* Free global cache. */
> -	mlx5_mr_btree_free(&priv->mr.cache);
> -	rte_rwlock_write_unlock(&priv->mr.rwlock);
> +	mlx5_mr_btree_free(&sh->mr.cache);
> +	rte_rwlock_write_unlock(&sh->mr.rwlock);
> 	/* Free all remaining MRs. */
> -	mlx5_mr_garbage_collect(dev);
> +	mlx5_mr_garbage_collect(sh);
> }
> diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
> index 786f6a3..89e89b7 100644
> --- a/drivers/net/mlx5/mlx5_mr.h
> +++ b/drivers/net/mlx5/mlx5_mr.h
> @@ -62,6 +62,7 @@ struct mlx5_mr_ctrl {
> 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
> } __rte_packed;
> 
> +struct mlx5_ibv_shared;
> extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
> extern rte_rwlock_t mlx5_mem_event_rwlock;
> 
> @@ -76,11 +77,11 @@ void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
> 			  size_t len, void *arg);
> int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
> 		      struct rte_mempool *mp);
> -void mlx5_mr_release(struct rte_eth_dev *dev);
> +void mlx5_mr_release(struct mlx5_ibv_shared *sh);
> 
> /* Debug purpose functions. */
> void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
> -void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
> +void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
> 
> /**
>  * Look up LKey from given lookup table by linear search. Firstly look up the
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index dbe074f..4d55fd4 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -814,7 +814,7 @@ struct mlx5_txq_ctrl *
> 		goto error;
> 	}
> 	/* Save pointer of global generation number to check memory event. */
> -	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
> +	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
> 	assert(desc > MLX5_TX_COMP_THRESH);
> 	tmpl->txq.offloads = conf->offloads |
> 			     dev->data->dev_conf.txmode.offloads;
> -- 
> 1.8.3.1
>
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1bb58b1..b563e0f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -147,6 +147,7 @@  struct mlx5_dev_spawn_data {
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
+	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
 };
 
 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
@@ -225,6 +226,7 @@  struct mlx5_dev_spawn_data {
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
+	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -239,6 +241,22 @@  struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->mr.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->pci_dev->device.numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
@@ -286,6 +304,8 @@  struct mlx5_dev_spawn_data {
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	if (--sh->refcnt)
 		goto exit;
+	/* Release created Memory Regions. */
+	mlx5_mr_release(sh);
 	LIST_REMOVE(sh, next);
 	/*
 	 *  Ensure there is no async event handler installed.
@@ -651,7 +671,10 @@  struct mlx5_dev_spawn_data {
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	mlx5_mr_release(dev);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(priv, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	assert(priv->sh);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
@@ -1548,19 +1571,6 @@  struct mlx5_dev_spawn_data {
 		goto error;
 	}
 	priv->config.flow_prio = err;
-	/*
-	 * Once the device is added to the list of memory event
-	 * callback, its global MR cache table cannot be expanded
-	 * on the fly because of deadlock. If it overflows, lookup
-	 * should be done by searching MR list linearly, which is slow.
-	 */
-	err = mlx5_mr_btree_init(&priv->mr.cache,
-				 MLX5_MR_BTREE_CACHE_N * 2,
-				 eth_dev->device->numa_node);
-	if (err) {
-		err = rte_errno;
-		goto error;
-	}
 	/* Add device to memory callback list. */
 	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
 	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
@@ -1757,6 +1767,7 @@  struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = i;
 			list[ns].ibv_dev = ibv_match[0];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = mlx5_nl_ifindex
 					(nl_rdma, list[ns].ibv_dev->name, i);
 			if (!list[ns].ifindex) {
@@ -1823,6 +1834,7 @@  struct mlx5_dev_spawn_data {
 			list[ns].ibv_port = 1;
 			list[ns].ibv_dev = ibv_match[i];
 			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
 			list[ns].ifindex = 0;
 			if (nl_rdma >= 0)
 				list[ns].ifindex = mlx5_nl_ifindex
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0a6d7f1..2575732 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -275,6 +275,14 @@  struct mlx5_ibv_shared {
 	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_pci_device *pci_dev; /* Backend PCI device. */
+	struct {
+		uint32_t dev_gen; /* Generation number to flush local caches. */
+		rte_rwlock_t rwlock; /* MR Lock. */
+		struct mlx5_mr_btree cache; /* Global MR cache table. */
+		struct mlx5_mr_list mr_list; /* Registered MR list. */
+		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
+	} mr;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
@@ -347,13 +355,6 @@  struct mlx5_priv {
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
 	/* Flow counters. */
-	struct {
-		uint32_t dev_gen; /* Generation number to flush local caches. */
-		rte_rwlock_t rwlock; /* MR Lock. */
-		struct mlx5_mr_btree cache; /* Global MR cache table. */
-		struct mlx5_mr_list mr_list; /* Registered MR list. */
-		struct mlx5_mr_list mr_free_list; /* Freed MR list. */
-	} mr;
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index e4957a4..a7a63b1 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -36,7 +36,7 @@  struct mr_update_mp_data {
 
 /**
  * Expand B-tree table to a given size. Can't be called with holding
- * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ * memory_hotplug_lock or sh->mr.rwlock due to rte_realloc().
  *
  * @param bt
  *   Pointer to B-tree structure.
@@ -350,7 +350,7 @@  struct mr_update_mp_data {
 		n = mr_find_next_chunk(mr, &entry, n);
 		if (!entry.end)
 			break;
-		if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+		if (mr_btree_insert(&priv->sh->mr.cache, &entry) < 0) {
 			/*
 			 * Overflowed, but the global table cannot be expanded
 			 * because of deadlock.
@@ -382,7 +382,7 @@  struct mr_update_mp_data {
 	struct mlx5_mr *mr;
 
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &priv->sh->mr.mr_list, mr) {
 		unsigned int n;
 
 		if (mr->ms_n == 0)
@@ -420,6 +420,7 @@  struct mr_update_mp_data {
 	      uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	uint16_t idx;
 	uint32_t lkey = UINT32_MAX;
 	struct mlx5_mr *mr;
@@ -430,10 +431,10 @@  struct mr_update_mp_data {
 	 * has to be searched by traversing the original MR list instead, which
 	 * is very slow path. Otherwise, the global cache is all inclusive.
 	 */
-	if (!unlikely(priv->mr.cache.overflow)) {
-		lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	if (!unlikely(sh->mr.cache.overflow)) {
+		lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 		if (lkey != UINT32_MAX)
-			*entry = (*priv->mr.cache.table)[idx];
+			*entry = (*sh->mr.cache.table)[idx];
 	} else {
 		/* Falling back to the slowest path. */
 		mr = mr_lookup_dev_list(dev, entry, addr);
@@ -468,13 +469,12 @@  struct mr_update_mp_data {
 /**
  * Release resources of detached MR having no online entry.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 static void
-mlx5_mr_garbage_collect(struct rte_eth_dev *dev)
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
@@ -484,11 +484,11 @@  struct mr_update_mp_data {
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
 	 */
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach the whole free list and release it after unlocking. */
-	free_list = priv->mr.mr_free_list;
-	LIST_INIT(&priv->mr.mr_free_list);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	free_list = sh->mr.mr_free_list;
+	LIST_INIT(&sh->mr.mr_free_list);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Release resources. */
 	mr_next = LIST_FIRST(&free_list);
 	while (mr_next != NULL) {
@@ -548,12 +548,12 @@  struct mr_update_mp_data {
 		      dev->data->port_id, (void *)addr);
 		return UINT32_MAX;
 	}
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&priv->sh->mr.rwlock);
 	/* Fill in output data. */
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
 	DEBUG("port %u MR CREATED by primary process for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 	      dev->data->port_id, (void *)addr,
@@ -582,6 +582,7 @@  struct mr_update_mp_data {
 		       uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_dev_config *config = &priv->config;
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct rte_memseg_list *msl;
@@ -602,12 +603,12 @@  struct mr_update_mp_data {
 		dev->data->port_id, (void *)addr);
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
-	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+	 * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
 	 * been detached by the memory free event but it couldn't be released
 	 * inside the callback due to deadlock. As a result, releasing resources
 	 * is quite opportunistic.
 	 */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 	/*
 	 * If enabled, find out a contiguous virtual address chunk in use, to
 	 * which the given address belongs, in order to register maximum range.
@@ -710,7 +711,7 @@  struct mr_update_mp_data {
 		goto alloc_resources;
 	}
 	assert(data.msl == data_re.msl);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/*
 	 * Check the address is really missing. If other thread already created
 	 * one or it is not found due to overflow, abort and return.
@@ -721,10 +722,10 @@  struct mr_update_mp_data {
 		 * low-on-memory. Then, this entry will have to be searched
 		 * here again.
 		 */
-		mr_btree_insert(&priv->mr.cache, entry);
+		mr_btree_insert(&sh->mr.cache, entry);
 		DEBUG("port %u found MR for %p on final lookup, abort",
 		      dev->data->port_id, (void *)addr);
-		rte_rwlock_write_unlock(&priv->mr.rwlock);
+		rte_rwlock_write_unlock(&sh->mr.rwlock);
 		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 		/*
 		 * Must be unlocked before calling rte_free() because
@@ -769,7 +770,7 @@  struct mr_update_mp_data {
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	mr->ibv_mr = mlx5_glue->reg_mr(priv->sh->pd, (void *)data.start, len,
+	mr->ibv_mr = mlx5_glue->reg_mr(sh->pd, (void *)data.start, len,
 				       IBV_ACCESS_LOCAL_WRITE);
 	if (mr->ibv_mr == NULL) {
 		DEBUG("port %u fail to create a verbs MR for address (%p)",
@@ -779,7 +780,7 @@  struct mr_update_mp_data {
 	}
 	assert((uintptr_t)mr->ibv_mr->addr == data.start);
 	assert(mr->ibv_mr->length == len);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	DEBUG("port %u MR CREATED (%p) for %p:\n"
 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
@@ -792,11 +793,11 @@  struct mr_update_mp_data {
 	mr_lookup_dev(dev, entry, addr);
 	/* Lookup can't fail. */
 	assert(entry->lkey != UINT32_MAX);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 	return entry->lkey;
 err_mrlock:
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 err_memlock:
 	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
 err_nolock:
@@ -854,14 +855,15 @@  struct mr_update_mp_data {
 mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr *mr;
 
 	DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id);
 	/* Flush cache to rebuild. */
-	priv->mr.cache.len = 1;
-	priv->mr.cache.overflow = 0;
+	sh->mr.cache.len = 1;
+	sh->mr.cache.overflow = 0;
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr)
 		if (mr_insert_dev_cache(dev, mr) < 0)
 			return;
 }
@@ -888,6 +890,7 @@  struct mr_update_mp_data {
 mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	const struct rte_memseg_list *msl;
 	struct mlx5_mr *mr;
 	int ms_n;
@@ -901,7 +904,7 @@  struct mr_update_mp_data {
 	assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 	assert(len == RTE_ALIGN(len, msl->page_sz));
 	ms_n = len / msl->page_sz;
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Clear bits of freed memsegs from MR. */
 	for (i = 0; i < ms_n; ++i) {
 		const struct rte_memseg *ms;
@@ -928,7 +931,7 @@  struct mr_update_mp_data {
 		rte_bitmap_clear(mr->ms_bmp, pos);
 		if (--mr->ms_n == 0) {
 			LIST_REMOVE(mr, mr);
-			LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+			LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 			DEBUG("port %u remove MR(%p) from list",
 			      dev->data->port_id, (void *)mr);
 		}
@@ -949,12 +952,12 @@  struct mr_update_mp_data {
 		 * generation below) will be guaranteed to be seen by other core
 		 * before the core sees the newly allocated memory.
 		 */
-		++priv->mr.dev_gen;
+		++sh->mr.dev_gen;
 		DEBUG("broadcasting local cache flush, gen=%d",
-		      priv->mr.dev_gen);
+		      sh->mr.dev_gen);
 		rte_smp_wmb();
 	}
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 }
 
 /**
@@ -1013,6 +1016,7 @@  struct mr_update_mp_data {
 		   struct mlx5_mr_cache *entry, uintptr_t addr)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
 	uint16_t idx;
 	uint32_t lkey;
@@ -1021,12 +1025,12 @@  struct mr_update_mp_data {
 	if (unlikely(bt->len == bt->size))
 		mr_btree_expand(bt, bt->size << 1);
 	/* Look up in the global cache. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
-	lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
+	lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
 	if (lkey != UINT32_MAX) {
 		/* Found. */
-		*entry = (*priv->mr.cache.table)[idx];
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		*entry = (*sh->mr.cache.table)[idx];
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		/*
 		 * Update local cache. Even if it fails, return the found entry
 		 * to update top-half cache. Next time, this entry will be found
@@ -1035,7 +1039,7 @@  struct mr_update_mp_data {
 		mr_btree_insert(bt, entry);
 		return lkey;
 	}
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	/* First time to see the address? Create a new MR. */
 	lkey = mlx5_mr_create(dev, entry, addr);
 	/*
@@ -1261,6 +1265,7 @@  struct mr_update_mp_data {
 	struct mr_update_mp_data *data = opaque;
 	struct rte_eth_dev *dev = data->dev;
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_ibv_shared *sh = priv->sh;
 	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
 	struct mlx5_mr *mr = NULL;
 	uintptr_t addr = (uintptr_t)memhdr->addr;
@@ -1270,9 +1275,9 @@  struct mr_update_mp_data {
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	if (lkey != UINT32_MAX)
 		return;
 	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
@@ -1286,11 +1291,11 @@  struct mr_update_mp_data {
 		data->ret = -1;
 		return;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Insert to the local cache table */
 	mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr);
 }
@@ -1339,6 +1344,7 @@  struct mr_update_mp_data {
 	struct rte_eth_dev *dev;
 	struct mlx5_mr *mr;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 
 	dev = pci_dev_to_eth_dev(pdev);
 	if (!dev) {
@@ -1355,11 +1361,12 @@  struct mr_update_mp_data {
 		rte_errno = EINVAL;
 		return -1;
 	}
-	rte_rwlock_write_lock(&priv->mr.rwlock);
-	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	sh = priv->sh;
+	rte_rwlock_write_lock(&sh->mr.rwlock);
+	LIST_INSERT_HEAD(&sh->mr.mr_list, mr, mr);
 	/* Insert to the global cache table. */
 	mr_insert_dev_cache(dev, mr);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1384,6 +1391,7 @@  struct mr_update_mp_data {
 {
 	struct rte_eth_dev *dev;
 	struct mlx5_priv *priv;
+	struct mlx5_ibv_shared *sh;
 	struct mlx5_mr *mr;
 	struct mlx5_mr_cache entry;
 
@@ -1395,10 +1403,11 @@  struct mr_update_mp_data {
 		return -1;
 	}
 	priv = dev->data->dev_private;
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	sh = priv->sh;
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
 	if (!mr) {
-		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		rte_rwlock_read_unlock(&sh->mr.rwlock);
 		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
 				 "to PCI device %p", (uintptr_t)addr,
 				 (void *)pdev);
@@ -1406,7 +1415,7 @@  struct mr_update_mp_data {
 		return -1;
 	}
 	LIST_REMOVE(mr, mr);
-	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
 	      (void *)mr);
 	mr_rebuild_dev_cache(dev);
@@ -1419,11 +1428,10 @@  struct mr_update_mp_data {
 	 * generation below) will be guaranteed to be seen by other core
 	 * before the core sees the newly allocated memory.
 	 */
-	++priv->mr.dev_gen;
-	DEBUG("broadcasting local cache flush, gen=%d",
-			priv->mr.dev_gen);
+	++sh->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d",	sh->mr.dev_gen);
 	rte_smp_wmb();
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 	return 0;
 }
 
@@ -1544,25 +1552,24 @@  struct mr_update_mp_data {
 /**
  * Dump all the created MRs and the global cache entries.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused)
+mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh __rte_unused)
 {
 #ifndef NDEBUG
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr;
 	int mr_n = 0;
 	int chunk_n = 0;
 
-	rte_rwlock_read_lock(&priv->mr.rwlock);
+	rte_rwlock_read_lock(&sh->mr.rwlock);
 	/* Iterate all the existing MRs. */
-	LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+	LIST_FOREACH(mr, &sh->mr.mr_list, mr) {
 		unsigned int n;
 
-		DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
-		      dev->data->port_id, mr_n++,
+		DEBUG("device %s MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+		      sh->ibdev_name, mr_n++,
 		      rte_cpu_to_be_32(mr->ibv_mr->lkey),
 		      mr->ms_n, mr->ms_bmp_n);
 		if (mr->ms_n == 0)
@@ -1577,45 +1584,40 @@  struct mr_update_mp_data {
 			      chunk_n++, ret.start, ret.end);
 		}
 	}
-	DEBUG("port %u dumping global cache", dev->data->port_id);
-	mlx5_mr_btree_dump(&priv->mr.cache);
-	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	DEBUG("device %s dumping global cache", sh->ibdev_name);
+	mlx5_mr_btree_dump(&sh->mr.cache);
+	rte_rwlock_read_unlock(&sh->mr.rwlock);
 #endif
 }
 
 /**
- * Release all the created MRs and resources. Remove device from memory callback
+ * Release all MRs and resources of the shared device context and empty its MR
  * list.
  *
- * @param dev
- *   Pointer to Ethernet device.
+ * @param sh
+ *   Pointer to Ethernet device shared context.
  */
 void
-mlx5_mr_release(struct rte_eth_dev *dev)
+mlx5_mr_release(struct mlx5_ibv_shared *sh)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_mr *mr_next;
 
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG)
-		mlx5_mr_dump_dev(dev);
-	rte_rwlock_write_lock(&priv->mr.rwlock);
+		mlx5_mr_dump_dev(sh);
+	rte_rwlock_write_lock(&sh->mr.rwlock);
 	/* Detach from MR list and move to free list. */
-	mr_next = LIST_FIRST(&priv->mr.mr_list);
+	mr_next = LIST_FIRST(&sh->mr.mr_list);
 	while (mr_next != NULL) {
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
 		LIST_REMOVE(mr, mr);
-		LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+		LIST_INSERT_HEAD(&sh->mr.mr_free_list, mr, mr);
 	}
-	LIST_INIT(&priv->mr.mr_list);
+	LIST_INIT(&sh->mr.mr_list);
 	/* Free global cache. */
-	mlx5_mr_btree_free(&priv->mr.cache);
-	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	mlx5_mr_btree_free(&sh->mr.cache);
+	rte_rwlock_write_unlock(&sh->mr.rwlock);
 	/* Free all remaining MRs. */
-	mlx5_mr_garbage_collect(dev);
+	mlx5_mr_garbage_collect(sh);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h
index 786f6a3..89e89b7 100644
--- a/drivers/net/mlx5/mlx5_mr.h
+++ b/drivers/net/mlx5/mlx5_mr.h
@@ -62,6 +62,7 @@  struct mlx5_mr_ctrl {
 	struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */
 } __rte_packed;
 
+struct mlx5_ibv_shared;
 extern struct mlx5_dev_list  mlx5_mem_event_cb_list;
 extern rte_rwlock_t mlx5_mem_event_rwlock;
 
@@ -76,11 +77,11 @@  void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 			  size_t len, void *arg);
 int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
 		      struct rte_mempool *mp);
-void mlx5_mr_release(struct rte_eth_dev *dev);
+void mlx5_mr_release(struct mlx5_ibv_shared *sh);
 
 /* Debug purpose functions. */
 void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt);
-void mlx5_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx5_mr_dump_dev(struct mlx5_ibv_shared *sh);
 
 /**
  * Look up LKey from given lookup table by linear search. Firstly look up the
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbe074f..4d55fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -814,7 +814,7 @@  struct mlx5_txq_ctrl *
 		goto error;
 	}
 	/* Save pointer of global generation number to check memory event. */
-	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
+	tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->sh->mr.dev_gen;
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl->txq.offloads = conf->offloads |
 			     dev->data->dev_conf.txmode.offloads;