[v2,2/3] net/mlx5: remove device register remap

Message ID 20190401212225.26380-3-yskoh@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series net/mlx: remove device register remap |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Yongseok Koh April 1, 2019, 9:22 p.m. UTC
  UAR (User Access Region) registers will be stored in a process-local table
and a process accesses a register in a table entry with index. Alloc/free
of table entry is managed by a global bitmap.

When there's a need to store a UAR register such as Tx BlueFlame register
for doorbell, an index should be allocated by mlx5_uar_alloc_index() and
address of the allocated table entry must be acquired by
mlx5_uar_get_addr_ptr() so that the table can be expanded if overflowed.
The local UAR register table doesn't cover all the indexes in the bitmap.
This will be expanded if more indexes are allocated than the current size
of the table.

For example, the BlueFlame register for Tx doorbell has to be remapped on
each secondary process. On initialization, primary process allocates an
index for the UAR register table and stores the register address in the
indexed entry of its own table when configuring a Tx queue. The index is
stored in the shared memory(txq->bfreg_idx) and visiable to secondary
processes. As secondary processes know the index, each process stores
remapped register in the same indexed entry of its local UAR register
table.

On the datapath of each process, the register can be referenced simply by
MLX5_UAR_REG(idx) which accesses its local UAR register table by the index.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx5/mlx5.c         | 262 +++++++++++++++++++++++++++-------------
 drivers/net/mlx5/mlx5.h         |  19 ++-
 drivers/net/mlx5/mlx5_rxtx.h    |   8 +-
 drivers/net/mlx5/mlx5_trigger.c |   2 +-
 drivers/net/mlx5/mlx5_txq.c     |  96 +++++++--------
 5 files changed, 242 insertions(+), 145 deletions(-)
  

Comments

Shahaf Shuler April 2, 2019, 6:50 a.m. UTC | #1
Hi Koh,

See my comments below, same comments apply for mlx4 patch. 

Tuesday, April 2, 2019 12:22 AM, Yongseok Koh:
> Subject: [dpdk-dev] [PATCH v2 2/3] net/mlx5: remove device register remap
> 
> UAR (User Access Region) registers will be stored in a process-local table and
> a process accesses a register in a table entry with index. Alloc/free of table
> entry is managed by a global bitmap.
> 
> When there's a need to store a UAR register such as Tx BlueFlame register
> for doorbell, an index should be allocated by mlx5_uar_alloc_index() and
> address of the allocated table entry must be acquired by
> mlx5_uar_get_addr_ptr() so that the table can be expanded if overflowed.
> The local UAR register table doesn't cover all the indexes in the bitmap.
> This will be expanded if more indexes are allocated than the current size of
> the table.
> 
> For example, the BlueFlame register for Tx doorbell has to be remapped on
> each secondary process. On initialization, primary process allocates an index
> for the UAR register table and stores the register address in the indexed
> entry of its own table when configuring a Tx queue. The index is stored in the
> shared memory(txq->bfreg_idx) and visiable to secondary processes. As
> secondary processes know the index, each process stores remapped register
> in the same indexed entry of its local UAR register table.
> 
> On the datapath of each process, the register can be referenced simply by
> MLX5_UAR_REG(idx) which accesses its local UAR register table by the index.
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c         | 262 +++++++++++++++++++++++++++--------
> -----
>  drivers/net/mlx5/mlx5.h         |  19 ++-
>  drivers/net/mlx5/mlx5_rxtx.h    |   8 +-
>  drivers/net/mlx5/mlx5_trigger.c |   2 +-
>  drivers/net/mlx5/mlx5_txq.c     |  96 +++++++--------
>  5 files changed, 242 insertions(+), 145 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 40445056f5..103841b2bc 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -25,6 +25,7 @@
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> 
> +#include <rte_bitmap.h>
>  #include <rte_malloc.h>
>  #include <rte_ethdev_driver.h>
>  #include <rte_ethdev_pci.h>
> @@ -131,7 +132,7 @@ struct mlx5_shared_data *mlx5_shared_data;  static
> rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
> 
>  /* Process local data for secondary processes. */ -static struct
> mlx5_local_data mlx5_local_data;
> +struct mlx5_local_data mlx5_local_data;
> 
>  /** Driver-specific log messages type. */  int mlx5_logtype; @@ -810,130
> +811,225 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs
> *devargs)
> 
>  static struct rte_pci_driver mlx5_driver;
> 
> +
> +/**
> + * Expand the local UAR register table.
> + *
> + * @param size
> + *   Size of the table to be expanded
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
>  static int
> -find_lower_va_bound(const struct rte_memseg_list *msl,
> -		const struct rte_memseg *ms, void *arg)
> +uar_expand_table(uint32_t size)

This function needs to be protected w/ mutex. Since it can be called by multiple control threads on different eth devices. 

>  {
> -	void **addr = arg;
> +	struct mlx5_local_data *ld = &mlx5_local_data;
> +	void *mem;
> +	size_t tbl_sz = ld->uar_table_sz;
> 
> -	if (msl->external)
> +	if (size <= tbl_sz)
>  		return 0;
> -	if (*addr == NULL)
> -		*addr = ms->addr;
> -	else
> -		*addr = RTE_MIN(*addr, ms->addr);
> -
> +	tbl_sz = RTE_ALIGN_CEIL(size, RTE_BITMAP_SLAB_BIT_SIZE);
> +	mem = rte_realloc(ld->uar_table, tbl_sz * sizeof(void *),
> +			  RTE_CACHE_LINE_SIZE);
> +	if (!mem) {
> +		rte_errno = ENOMEM;
> +		DRV_LOG(ERR, "failed to expand uar table");
> +		return -rte_errno;
> +	}
> +	DRV_LOG(DEBUG, "UAR reg. table is expanded to %zu", tbl_sz);
> +	ld->uar_table = mem;
> +	ld->uar_table_sz = tbl_sz;
>  	return 0;
>  }
> 
>  /**
> - * Reserve UAR address space for primary process.
> + * Return the pointer of the indexed slot in the local UAR register table.
>   *
> - * Process local resource is used by both primary and secondary to avoid
> - * duplicate reservation. The space has to be available on both primary and
> - * secondary process, TXQ UAR maps to this area using fixed mmap w/o
> double
> - * check.
> + * The indexed slot must be allocated by mlx5_uar_alloc_index() in
> + advance. And
> + * the table will be expanded if overflowed.
> + *
> + * @param idx
> + *   Index of the table.
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   Pointer of table entry on success, NULL otherwise and rte_errno is set.
>   */
> -static int
> -mlx5_uar_init_primary(void)
> +void **
> +mlx5_uar_get_addr_ptr(uint32_t idx)

Wondering if we can possibly have coherency issue here.
Suppose we have 2 eth devices. One is doing datapath, one is at configuration stage. 
The one on configuration stage may trigger the expand
The one on datapath may read from the UAR table.

> +{
> +	struct mlx5_local_data *ld = &mlx5_local_data;
> +	int ret;
> +
> +	assert(idx < MLX5_UAR_TABLE_SIZE_MAX);
> +	if (idx >= ld->uar_table_sz) {
> +		ret = uar_expand_table(idx + 1);
> +		if (ret)
> +			return NULL;
> +	}
> +	return &(*ld->uar_table)[idx];
> +}
> +
> +/**
> + * Allocate a slot of UAR register table.
> + *
> + * Allocation is done by scanning the global bitmap. The global
> +spinlock should
> + * be held.
> + *
> + * @return
> + *   Index of a free slot on success, a negative errno value otherwise and
> + *   rte_errno is set.
> + */
> +uint32_t
> +mlx5_uar_alloc_index(void)
>  {
>  	struct mlx5_shared_data *sd = mlx5_shared_data;
> -	void *addr = (void *)0;
> +	uint32_t idx = 0;
> +	uint64_t slab = 0;
> +	int ret;
> 
> -	if (sd->uar_base)
> -		return 0;
> -	/* find out lower bound of hugepage segments */
> -	rte_memseg_walk(find_lower_va_bound, &addr);
> -	/* keep distance to hugepages to minimize potential conflicts. */
> -	addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET +
> MLX5_UAR_SIZE));
> -	/* anonymous mmap, no real memory consumption. */
> -	addr = mmap(addr, MLX5_UAR_SIZE,
> -		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> -	if (addr == MAP_FAILED) {
> -		DRV_LOG(ERR,
> -			"Failed to reserve UAR address space, please"
> -			" adjust MLX5_UAR_SIZE or try --base-virtaddr");
> -		rte_errno = ENOMEM;
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	rte_spinlock_lock(&sd->lock);
> +	__rte_bitmap_scan_init(sd->uar_bmp);
> +	ret = rte_bitmap_scan(sd->uar_bmp, &idx, &slab);
> +	if (unlikely(!ret)) {
> +		/*
> +		 * This cannot happen unless there are unreasonably large
> number
> +		 * of queues and ports.
> +		 */
> +		rte_errno = ENOSPC;
> +		rte_spinlock_unlock(&sd->lock);
>  		return -rte_errno;
>  	}
> -	/* Accept either same addr or a new addr returned from mmap if
> target
> -	 * range occupied.
> -	 */
> -	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
> -	sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
> -	return 0;
> +	idx += __builtin_ctzll(slab);
> +	/* Mark the slot is occupied. */
> +	rte_bitmap_clear(sd->uar_bmp, idx);
> +	rte_spinlock_unlock(&sd->lock);
> +	DRV_LOG(DEBUG, "index %d is allocated in UAR reg. table", idx);
> +	return idx;
>  }
> 
>  /**
> - * Unmap UAR address space reserved for primary process.
> + * Free a slot of UAR register table.
>   */
> -static void
> -mlx5_uar_uninit_primary(void)
> +void
> +mlx5_uar_free_index(uint32_t idx)
>  {
>  	struct mlx5_shared_data *sd = mlx5_shared_data;
> 
> -	if (!sd->uar_base)
> -		return;
> -	munmap(sd->uar_base, MLX5_UAR_SIZE);
> -	sd->uar_base = NULL;
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	assert(idx < MLX5_UAR_TABLE_SIZE_MAX);
> +	rte_spinlock_lock(&sd->lock);
> +	/* Mark the slot is empty. */
> +	rte_bitmap_set(sd->uar_bmp, idx);
> +	rte_spinlock_unlock(&sd->lock);
> +	DRV_LOG(DEBUG, "index %d is freed in UAR reg. table", idx);
>  }
> 
>  /**
> - * Reserve UAR address space for secondary process, align with primary
> process.
> + * Initialize UAR register table bitmap.
> + *
> + * UAR registers will be stored in a process-local table and the table
> + is
> + * managed by a global bitmap. When there's a need to store a UAR
> + register, an
> + * index should be allocated by mlx5_uar_alloc_index() and address of
> + the
> + * allocated table entry must be acquired by mlx5_uar_get_addr_ptr() so
> + that the
> + * table can be expanded if overflowed.
> + *
> + * The local UAR register table doesn't cover all the indexes in the bitmap.
> + * This will be expanded if more indexes are allocated than the current
> + size of
> + * the table.
> + *
> + * Secondary process should have reference of the index and store
> + remapped
> + * register at the same index in its local UAR register table.
> + *
> + * On the datapath of each process, the register can be referenced
> + simply by
> + * MLX5_UAR_REG(idx).
>   *
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  static int
> -mlx5_uar_init_secondary(void)
> +uar_init_primary(void)
>  {
>  	struct mlx5_shared_data *sd = mlx5_shared_data;
> -	struct mlx5_local_data *ld = &mlx5_local_data;
> -	void *addr;
> +	struct rte_bitmap *bmp;
> +	void *bmp_mem;
> +	uint32_t bmp_size;
> +	unsigned int i;
> 
> -	if (ld->uar_base) { /* Already reserved. */
> -		assert(sd->uar_base == ld->uar_base);
> -		return 0;
> -	}
> -	assert(sd->uar_base);
> -	/* anonymous mmap, no real memory consumption. */
> -	addr = mmap(sd->uar_base, MLX5_UAR_SIZE,
> -		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> -	if (addr == MAP_FAILED) {
> -		DRV_LOG(ERR, "UAR mmap failed: %p size: %llu",
> -			sd->uar_base, MLX5_UAR_SIZE);
> -		rte_errno = ENXIO;
> +	bmp_size =
> rte_bitmap_get_memory_footprint(MLX5_UAR_TABLE_SIZE_MAX);
> +	bmp_mem = rte_zmalloc("uar_table", bmp_size,
> RTE_CACHE_LINE_SIZE);
> +	if (!bmp_mem) {
> +		rte_errno = ENOMEM;
> +		DRV_LOG(ERR, "failed to allocate memory for uar table");
>  		return -rte_errno;
>  	}
> -	if (sd->uar_base != addr) {
> -		DRV_LOG(ERR,
> -			"UAR address %p size %llu occupied, please"
> -			" adjust MLX5_UAR_OFFSET or try EAL parameter"
> -			" --base-virtaddr",
> -			sd->uar_base, MLX5_UAR_SIZE);
> -		rte_errno = ENXIO;
> -		return -rte_errno;
> +	bmp = rte_bitmap_init(MLX5_UAR_TABLE_SIZE_MAX, bmp_mem,
> bmp_size);
> +	/* Set the entire bitmap as 1 means vacant and 0 means empty. */
> +	for (i = 0; i < bmp->array2_size; ++i)
> +		rte_bitmap_set_slab(bmp, i * RTE_BITMAP_SLAB_BIT_SIZE, -
> 1);
> +	sd->uar_bmp = bmp;
> +	return 0;
> +}
> +
> +/**
> + * Un-initialize UAR register resources.
> + *
> + * The global bitmap and the register table of primary process are freed.
> + */
> +static void
> +uar_uninit_primary(void)
> +{
> +	struct mlx5_shared_data *sd = mlx5_shared_data;
> +	struct mlx5_local_data *ld = &mlx5_local_data;
> +
> +	if (sd->uar_bmp) {
> +		rte_bitmap_free(sd->uar_bmp);
> +		rte_free(sd->uar_bmp);
> +		sd->uar_bmp = NULL;
> +	}
> +	/* Free primary's table. */
> +	if (ld->uar_table) {
> +		rte_free(ld->uar_table);
> +		ld->uar_table = NULL;
> +		ld->uar_table_sz = 0;
>  	}
> -	ld->uar_base = addr;
> -	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
> +}
> +
> +/**
> + * Initialize UAR register resources for secondary process.
> + *
> + * Allocate the local UAR register table. Initially, the number of
> +entries is
> + * same as the size of a bitmap slab.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +uar_init_secondary(void)
> +{
> +	/* Prepare at least a bitmap slab. */
> +	uar_expand_table(RTE_BITMAP_SLAB_BIT_SIZE);
>  	return 0;
>  }
> 
>  /**
> - * Unmap UAR address space reserved for secondary process.
> + * Un-initialize UAR register resources for secondary process.
> + *
> + * The local UAR register table is freed.
>   */
>  static void
> -mlx5_uar_uninit_secondary(void)
> +uar_uninit_secondary(void)
>  {
>  	struct mlx5_local_data *ld = &mlx5_local_data;
> 
> -	if (!ld->uar_base)
> -		return;
> -	munmap(ld->uar_base, MLX5_UAR_SIZE);
> -	ld->uar_base = NULL;
> +	/* Free process-local table. */
> +	if (ld->uar_table) {
> +		rte_free(ld->uar_table);
> +		ld->uar_table = NULL;
> +		ld->uar_table_sz = 0;
> +	}
>  }
> 
>  /**
> @@ -967,7 +1063,7 @@ mlx5_init_once(void)
> 
> 	rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
>  						mlx5_mr_mem_event_cb,
> NULL);
>  		mlx5_mp_init_primary();
> -		ret = mlx5_uar_init_primary();
> +		ret = uar_init_primary();
>  		if (ret)
>  			goto error;
>  		sd->init_done = true;
> @@ -976,7 +1072,7 @@ mlx5_init_once(void)
>  		if (ld->init_done)
>  			break;
>  		mlx5_mp_init_secondary();
> -		ret = mlx5_uar_init_secondary();
> +		ret = uar_init_secondary();
>  		if (ret)
>  			goto error;
>  		++sd->secondary_cnt;
> @@ -990,12 +1086,12 @@ mlx5_init_once(void)
>  error:
>  	switch (rte_eal_process_type()) {
>  	case RTE_PROC_PRIMARY:
> -		mlx5_uar_uninit_primary();
> +		uar_uninit_primary();
>  		mlx5_mp_uninit_primary();
> 
> 	rte_mem_event_callback_unregister("MLX5_MEM_EVENT_CB",
> NULL);
>  		break;
>  	case RTE_PROC_SECONDARY:
> -		mlx5_uar_uninit_secondary();
> +		uar_uninit_secondary();
>  		mlx5_mp_uninit_secondary();
>  		break;
>  	default:
> @@ -1099,7 +1195,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		if (err < 0)
>  			return NULL;
>  		/* Remap UAR for Tx queues. */
> -		err = mlx5_tx_uar_remap(eth_dev, err);
> +		err = mlx5_txq_uar_init(eth_dev, err);
>  		if (err)
>  			return NULL;
>  		/*
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 8ce8361a85..f77517bee0 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -97,8 +97,8 @@ struct mlx5_shared_data {
>  	/* Global spinlock for primary and secondary processes. */
>  	int init_done; /* Whether primary has done initialization. */
>  	unsigned int secondary_cnt; /* Number of secondary processes
> init'd. */
> -	void *uar_base;
> -	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping.
> */
> +	struct rte_bitmap *uar_bmp;
> +	/* Bitmap to keep track of BlueFlame register table. */
>  	struct mlx5_dev_list mem_event_cb_list;
>  	rte_rwlock_t mem_event_rwlock;
>  };
> @@ -106,11 +106,19 @@ struct mlx5_shared_data {
>  /* Per-process data structure, not visible to other processes. */  struct
> mlx5_local_data {
>  	int init_done; /* Whether a secondary has done initialization. */
> -	void *uar_base;
> -	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping.
> */
> +	void *(*uar_table)[];
> +	/* Table of BlueFlame registers for each process. */
> +	size_t uar_table_sz;
> +	/* Size of BlueFlame register table. */
>  };
> 
>  extern struct mlx5_shared_data *mlx5_shared_data;
> +extern struct mlx5_local_data mlx5_local_data;
> +
> +/* The maximum size of BlueFlame register table. */ #define
> +MLX5_UAR_TABLE_SIZE_MAX (RTE_MAX_ETHPORTS *
> RTE_MAX_QUEUES_PER_PORT)
> +
> +#define MLX5_UAR_REG(idx) ((*mlx5_local_data.uar_table)[(idx)])

Same concern - what if the uar table is expanded due to other device in configuration stage?

> 
>  struct mlx5_counter_ctrl {
>  	/* Name of the counter. */
> @@ -331,6 +339,9 @@ struct mlx5_priv {
>  /* mlx5.c */
> 
>  int mlx5_getenv_int(const char *);
> +void **mlx5_uar_get_addr_ptr(uint32_t idx); uint32_t
> +mlx5_uar_alloc_index(void); void mlx5_uar_free_index(uint32_t idx);
> 
>  /* mlx5_ethdev.c */
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index ced9945888..b32c1d6e0f 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -203,7 +203,7 @@ struct mlx5_txq_data {
>  	volatile void *wqes; /* Work queue (use volatile to write into). */
>  	volatile uint32_t *qp_db; /* Work queue doorbell. */
>  	volatile uint32_t *cq_db; /* Completion queue doorbell. */
> -	volatile void *bf_reg; /* Blueflame register remapped. */
> +	uint32_t bfreg_idx; /* Blueflame register index. */
>  	struct rte_mbuf *(*elts)[]; /* TX elements. */
>  	struct mlx5_txq_stats stats; /* TX queue counters. */  #ifndef
> RTE_ARCH_64 @@ -232,7 +232,7 @@ struct mlx5_txq_ctrl {
>  	struct mlx5_priv *priv; /* Back pointer to private data. */
>  	struct mlx5_txq_data txq; /* Data path structure. */
>  	off_t uar_mmap_offset; /* UAR mmap offset for non-primary
> process. */
> -	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
> +	void *bf_reg; /* BlueFlame register from Verbs. */
>  	uint16_t idx; /* Queue index. */
>  };
> 
> @@ -303,7 +303,7 @@ uint64_t mlx5_get_rx_queue_offloads(struct
> rte_eth_dev *dev);  int mlx5_tx_queue_setup(struct rte_eth_dev *dev,
> uint16_t idx, uint16_t desc,
>  			unsigned int socket, const struct rte_eth_txconf
> *conf);  void mlx5_tx_queue_release(void *dpdk_txq); -int
> mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd);
> +int mlx5_txq_uar_init(struct rte_eth_dev *dev, int fd);
>  struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t
> idx);  struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev,
> uint16_t idx);  int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv); @@
> -706,7 +706,7 @@ static __rte_always_inline void
> mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct
> mlx5_wqe *wqe,
>  		       int cond)
>  {
> -	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
> +	uint64_t *dst = MLX5_UAR_REG(txq->bfreg_idx);
>  	volatile uint64_t *src = ((volatile uint64_t *)wqe);
> 
>  	rte_cio_wmb();
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index 5b73f0ff03..d7f27702e8 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -58,7 +58,7 @@ mlx5_txq_start(struct rte_eth_dev *dev)
>  			goto error;
>  		}
>  	}
> -	ret = mlx5_tx_uar_remap(dev, priv->sh->ctx->cmd_fd);
> +	ret = mlx5_txq_uar_init(dev, priv->sh->ctx->cmd_fd);
>  	if (ret) {
>  		/* Adjust index for rollback. */
>  		i = priv->txqs_n - 1;
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 1b3d89f2f6..d8e0bda371 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -231,9 +231,13 @@ mlx5_tx_queue_release(void *dpdk_txq)
> 
> 
>  /**
> - * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
> - * Both primary and secondary process do mmap to make UAR address
> - * aligned.
> + * Initialize UAR register access for Tx.
> + *
> + * For both primary and secondary, initialize UAR locks for atomic access.
> + *
> + * For secondary, remap BlueFlame registers for secondary process.
> + Remapped
> + * address is stored at the same indexed entry of the local UAR
> + register table
> + * as primary process.
>   *
>   * @param[in] dev
>   *   Pointer to Ethernet device.
> @@ -244,75 +248,52 @@ mlx5_tx_queue_release(void *dpdk_txq)
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd)
> +mlx5_txq_uar_init(struct rte_eth_dev *dev, int fd)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> -	unsigned int i, j;
> -	uintptr_t pages[priv->txqs_n];
> -	unsigned int pages_n = 0;
> -	uintptr_t uar_va;
> -	uintptr_t off;
> -	void *addr;
> -	void *ret;
>  	struct mlx5_txq_data *txq;
>  	struct mlx5_txq_ctrl *txq_ctrl;
> -	int already_mapped;
> +	void *addr;
> +	void **addr_ptr;
> +	uintptr_t uar_va;
> +	uintptr_t offset;
>  	size_t page_size = sysconf(_SC_PAGESIZE);
> +	unsigned int i;
>  #ifndef RTE_ARCH_64
>  	unsigned int lock_idx;
>  #endif
> 
> -	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
> -	/*
> -	 * As rdma-core, UARs are mapped in size of OS page size.
> -	 * Use aligned address to avoid duplicate mmap.
> -	 * Ref to libmlx5 function: mlx5_init_context()
> -	 */
>  	for (i = 0; i != priv->txqs_n; ++i) {
>  		if (!(*priv->txqs)[i])
>  			continue;
>  		txq = (*priv->txqs)[i];
>  		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
>  		assert(txq_ctrl->idx == (uint16_t)i);
> -		/* UAR addr form verbs used to find dup and offset in page.
> */
> -		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
> -		off = uar_va & (page_size - 1); /* offset in page. */
> -		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page
> addr. */
> -		already_mapped = 0;
> -		for (j = 0; j != pages_n; ++j) {
> -			if (pages[j] == uar_va) {
> -				already_mapped = 1;
> -				break;
> -			}
> -		}
> -		/* new address in reserved UAR address space. */
> -		addr = RTE_PTR_ADD(mlx5_shared_data->uar_base,
> -				   uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1));
> -		if (!already_mapped) {
> -			pages[pages_n++] = uar_va;
> -			/* fixed mmap to specified address in reserved
> -			 * address space.
> +		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +			/*
> +			 * As rdma-core, UARs are mapped in size of OS page
> +			 * size. Ref to libmlx5 function: mlx5_init_context()
>  			 */
> -			ret = mmap(addr, page_size,
> -				   PROT_WRITE, MAP_FIXED | MAP_SHARED,
> fd,
> -				   txq_ctrl->uar_mmap_offset);
> -			if (ret != addr) {
> -				/* fixed mmap have to return same address
> */
> +			uar_va = (uintptr_t)txq_ctrl->bf_reg;
> +			offset = uar_va & (page_size - 1); /* Offset in page.
> */
> +			addr = mmap(NULL, page_size, PROT_WRITE,
> MAP_SHARED, fd,
> +				    txq_ctrl->uar_mmap_offset);
> +			if (addr == MAP_FAILED) {
>  				DRV_LOG(ERR,
> -					"port %u call to mmap failed on UAR"
> -					" for txq %u",
> +					"port %u mmap failed for BF reg."
> +					" of txq %u",
>  					dev->data->port_id, txq_ctrl->idx);
>  				rte_errno = ENXIO;
>  				return -rte_errno;
>  			}
> +			addr = RTE_PTR_ADD(addr, offset);
> +			addr_ptr = mlx5_uar_get_addr_ptr(txq->bfreg_idx);
> +			if (!addr_ptr)
> +				return -rte_errno;
> +			*addr_ptr = addr;
>  		}
> -		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save
> once */
> -			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr,
> off);
> -		else
> -			assert(txq_ctrl->txq.bf_reg ==
> -			       RTE_PTR_ADD((void *)addr, off));
>  #ifndef RTE_ARCH_64
> -		/* Assign a UAR lock according to UAR page number */
> +		/* Assign an UAR lock according to UAR page number */
>  		lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
>  			   MLX5_UAR_PAGE_NUM_MASK;
>  		txq->uar_lock = &priv->uar_lock[lock_idx]; @@ -372,6 +353,7
> @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
>  	struct mlx5dv_obj obj;
>  	const int desc = 1 << txq_data->elts_n;
>  	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
> +	void **addr_ptr;
>  	int ret = 0;
> 
>  	assert(txq_data);
> @@ -507,7 +489,17 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	txq_data->wqes = qp.sq.buf;
>  	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
>  	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
> -	txq_ctrl->bf_reg_orig = qp.bf.reg;
> +	/* Allocate a new index in UAR table. */
> +	ret = mlx5_uar_alloc_index();
> +	if (ret < 0)
> +		goto error;
> +	txq_data->bfreg_idx = ret;
> +	txq_ctrl->bf_reg = qp.bf.reg;
> +	/* Store the BlueFlame register address in the local table. */
> +	addr_ptr = mlx5_uar_get_addr_ptr(txq_data->bfreg_idx);
> +	if (!addr_ptr)
> +		goto error;
> +	*addr_ptr = txq_ctrl->bf_reg;
>  	txq_data->cq_db = cq_info.dbrec;
>  	txq_data->cqes =
>  		(volatile struct mlx5_cqe (*)[])
> @@ -589,6 +581,7 @@ mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv)
> {
>  	assert(txq_ibv);
>  	if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) {
> +		mlx5_uar_free_index(txq_ibv->txq_ctrl->txq.bfreg_idx);
>  		claim_zero(mlx5_glue->destroy_qp(txq_ibv->qp));
>  		claim_zero(mlx5_glue->destroy_cq(txq_ibv->cq));
>  		LIST_REMOVE(txq_ibv, next);
> @@ -837,15 +830,12 @@ mlx5_txq_release(struct rte_eth_dev *dev,
> uint16_t idx)  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_txq_ctrl *txq;
> -	size_t page_size = sysconf(_SC_PAGESIZE);
> 
>  	if (!(*priv->txqs)[idx])
>  		return 0;
>  	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
>  	if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv))
>  		txq->ibv = NULL;
> -	munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg,
> page_size),
> -	       page_size);
>  	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
>  		txq_free_elts(txq);
>  		mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh);
> --
> 2.11.0
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 40445056f5..103841b2bc 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -25,6 +25,7 @@ 
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_bitmap.h>
 #include <rte_malloc.h>
 #include <rte_ethdev_driver.h>
 #include <rte_ethdev_pci.h>
@@ -131,7 +132,7 @@  struct mlx5_shared_data *mlx5_shared_data;
 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
 
 /* Process local data for secondary processes. */
-static struct mlx5_local_data mlx5_local_data;
+struct mlx5_local_data mlx5_local_data;
 
 /** Driver-specific log messages type. */
 int mlx5_logtype;
@@ -810,130 +811,225 @@  mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 
 static struct rte_pci_driver mlx5_driver;
 
+
+/**
+ * Expand the local UAR register table.
+ *
+ * @param size
+ *   Size of the table to be expanded
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
 static int
-find_lower_va_bound(const struct rte_memseg_list *msl,
-		const struct rte_memseg *ms, void *arg)
+uar_expand_table(uint32_t size)
 {
-	void **addr = arg;
+	struct mlx5_local_data *ld = &mlx5_local_data;
+	void *mem;
+	size_t tbl_sz = ld->uar_table_sz;
 
-	if (msl->external)
+	if (size <= tbl_sz)
 		return 0;
-	if (*addr == NULL)
-		*addr = ms->addr;
-	else
-		*addr = RTE_MIN(*addr, ms->addr);
-
+	tbl_sz = RTE_ALIGN_CEIL(size, RTE_BITMAP_SLAB_BIT_SIZE);
+	mem = rte_realloc(ld->uar_table, tbl_sz * sizeof(void *),
+			  RTE_CACHE_LINE_SIZE);
+	if (!mem) {
+		rte_errno = ENOMEM;
+		DRV_LOG(ERR, "failed to expand uar table");
+		return -rte_errno;
+	}
+	DRV_LOG(DEBUG, "UAR reg. table is expanded to %zu", tbl_sz);
+	ld->uar_table = mem;
+	ld->uar_table_sz = tbl_sz;
 	return 0;
 }
 
 /**
- * Reserve UAR address space for primary process.
+ * Return the pointer of the indexed slot in the local UAR register table.
  *
- * Process local resource is used by both primary and secondary to avoid
- * duplicate reservation. The space has to be available on both primary and
- * secondary process, TXQ UAR maps to this area using fixed mmap w/o double
- * check.
+ * The indexed slot must be allocated by mlx5_uar_alloc_index() in advance. And
+ * the table will be expanded if overflowed.
+ *
+ * @param idx
+ *   Index of the table.
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   Pointer of table entry on success, NULL otherwise and rte_errno is set.
  */
-static int
-mlx5_uar_init_primary(void)
+void **
+mlx5_uar_get_addr_ptr(uint32_t idx)
+{
+	struct mlx5_local_data *ld = &mlx5_local_data;
+	int ret;
+
+	assert(idx < MLX5_UAR_TABLE_SIZE_MAX);
+	if (idx >= ld->uar_table_sz) {
+		ret = uar_expand_table(idx + 1);
+		if (ret)
+			return NULL;
+	}
+	return &(*ld->uar_table)[idx];
+}
+
+/**
+ * Allocate a slot of UAR register table.
+ *
+ * Allocation is done by scanning the global bitmap. The global spinlock should
+ * be held.
+ *
+ * @return
+ *   Index of a free slot on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+uint32_t
+mlx5_uar_alloc_index(void)
 {
 	struct mlx5_shared_data *sd = mlx5_shared_data;
-	void *addr = (void *)0;
+	uint32_t idx = 0;
+	uint64_t slab = 0;
+	int ret;
 
-	if (sd->uar_base)
-		return 0;
-	/* find out lower bound of hugepage segments */
-	rte_memseg_walk(find_lower_va_bound, &addr);
-	/* keep distance to hugepages to minimize potential conflicts. */
-	addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE));
-	/* anonymous mmap, no real memory consumption. */
-	addr = mmap(addr, MLX5_UAR_SIZE,
-		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED) {
-		DRV_LOG(ERR,
-			"Failed to reserve UAR address space, please"
-			" adjust MLX5_UAR_SIZE or try --base-virtaddr");
-		rte_errno = ENOMEM;
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	rte_spinlock_lock(&sd->lock);
+	__rte_bitmap_scan_init(sd->uar_bmp);
+	ret = rte_bitmap_scan(sd->uar_bmp, &idx, &slab);
+	if (unlikely(!ret)) {
+		/*
+		 * This cannot happen unless there are unreasonably large number
+		 * of queues and ports.
+		 */
+		rte_errno = ENOSPC;
+		rte_spinlock_unlock(&sd->lock);
 		return -rte_errno;
 	}
-	/* Accept either same addr or a new addr returned from mmap if target
-	 * range occupied.
-	 */
-	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
-	sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
-	return 0;
+	idx += __builtin_ctzll(slab);
+	/* Mark the slot is occupied. */
+	rte_bitmap_clear(sd->uar_bmp, idx);
+	rte_spinlock_unlock(&sd->lock);
+	DRV_LOG(DEBUG, "index %d is allocated in UAR reg. table", idx);
+	return idx;
 }
 
 /**
- * Unmap UAR address space reserved for primary process.
+ * Free a slot of UAR register table.
  */
-static void
-mlx5_uar_uninit_primary(void)
+void
+mlx5_uar_free_index(uint32_t idx)
 {
 	struct mlx5_shared_data *sd = mlx5_shared_data;
 
-	if (!sd->uar_base)
-		return;
-	munmap(sd->uar_base, MLX5_UAR_SIZE);
-	sd->uar_base = NULL;
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	assert(idx < MLX5_UAR_TABLE_SIZE_MAX);
+	rte_spinlock_lock(&sd->lock);
+	/* Mark the slot is empty. */
+	rte_bitmap_set(sd->uar_bmp, idx);
+	rte_spinlock_unlock(&sd->lock);
+	DRV_LOG(DEBUG, "index %d is freed in UAR reg. table", idx);
 }
 
 /**
- * Reserve UAR address space for secondary process, align with primary process.
+ * Initialize UAR register table bitmap.
+ *
+ * UAR registers will be stored in a process-local table and the table is
+ * managed by a global bitmap. When there's a need to store a UAR register, an
+ * index should be allocated by mlx5_uar_alloc_index() and address of the
+ * allocated table entry must be acquired by mlx5_uar_get_addr_ptr() so that the
+ * table can be expanded if overflowed.
+ *
+ * The local UAR register table doesn't cover all the indexes in the bitmap.
+ * This will be expanded if more indexes are allocated than the current size of
+ * the table.
+ *
+ * Secondary process should have reference of the index and store remapped
+ * register at the same index in its local UAR register table.
+ *
+ * On the datapath of each process, the register can be referenced simply by
+ * MLX5_UAR_REG(idx).
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_uar_init_secondary(void)
+uar_init_primary(void)
 {
 	struct mlx5_shared_data *sd = mlx5_shared_data;
-	struct mlx5_local_data *ld = &mlx5_local_data;
-	void *addr;
+	struct rte_bitmap *bmp;
+	void *bmp_mem;
+	uint32_t bmp_size;
+	unsigned int i;
 
-	if (ld->uar_base) { /* Already reserved. */
-		assert(sd->uar_base == ld->uar_base);
-		return 0;
-	}
-	assert(sd->uar_base);
-	/* anonymous mmap, no real memory consumption. */
-	addr = mmap(sd->uar_base, MLX5_UAR_SIZE,
-		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED) {
-		DRV_LOG(ERR, "UAR mmap failed: %p size: %llu",
-			sd->uar_base, MLX5_UAR_SIZE);
-		rte_errno = ENXIO;
+	bmp_size = rte_bitmap_get_memory_footprint(MLX5_UAR_TABLE_SIZE_MAX);
+	bmp_mem = rte_zmalloc("uar_table", bmp_size, RTE_CACHE_LINE_SIZE);
+	if (!bmp_mem) {
+		rte_errno = ENOMEM;
+		DRV_LOG(ERR, "failed to allocate memory for uar table");
 		return -rte_errno;
 	}
-	if (sd->uar_base != addr) {
-		DRV_LOG(ERR,
-			"UAR address %p size %llu occupied, please"
-			" adjust MLX5_UAR_OFFSET or try EAL parameter"
-			" --base-virtaddr",
-			sd->uar_base, MLX5_UAR_SIZE);
-		rte_errno = ENXIO;
-		return -rte_errno;
+	bmp = rte_bitmap_init(MLX5_UAR_TABLE_SIZE_MAX, bmp_mem, bmp_size);
+	/* Set the entire bitmap as 1 means vacant and 0 means empty. */
+	for (i = 0; i < bmp->array2_size; ++i)
+		rte_bitmap_set_slab(bmp, i * RTE_BITMAP_SLAB_BIT_SIZE, -1);
+	sd->uar_bmp = bmp;
+	return 0;
+}
+
+/**
+ * Un-initialize UAR register resources.
+ *
+ * The global bitmap and the register table of primary process are freed.
+ */
+static void
+uar_uninit_primary(void)
+{
+	struct mlx5_shared_data *sd = mlx5_shared_data;
+	struct mlx5_local_data *ld = &mlx5_local_data;
+
+	if (sd->uar_bmp) {
+		rte_bitmap_free(sd->uar_bmp);
+		rte_free(sd->uar_bmp);
+		sd->uar_bmp = NULL;
+	}
+	/* Free primary's table. */
+	if (ld->uar_table) {
+		rte_free(ld->uar_table);
+		ld->uar_table = NULL;
+		ld->uar_table_sz = 0;
 	}
-	ld->uar_base = addr;
-	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
+}
+
+/**
+ * Initialize UAR register resources for secondary process.
+ *
+ * Allocate the local UAR register table. Initially, the number of entries is
+ * same as the size of a bitmap slab.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+uar_init_secondary(void)
+{
+	/* Prepare at least a bitmap slab. */
+	uar_expand_table(RTE_BITMAP_SLAB_BIT_SIZE);
 	return 0;
 }
 
 /**
- * Unmap UAR address space reserved for secondary process.
+ * Un-initialize UAR register resources for secondary process.
+ *
+ * The local UAR register table is freed.
  */
 static void
-mlx5_uar_uninit_secondary(void)
+uar_uninit_secondary(void)
 {
 	struct mlx5_local_data *ld = &mlx5_local_data;
 
-	if (!ld->uar_base)
-		return;
-	munmap(ld->uar_base, MLX5_UAR_SIZE);
-	ld->uar_base = NULL;
+	/* Free process-local table. */
+	if (ld->uar_table) {
+		rte_free(ld->uar_table);
+		ld->uar_table = NULL;
+		ld->uar_table_sz = 0;
+	}
 }
 
 /**
@@ -967,7 +1063,7 @@  mlx5_init_once(void)
 		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
 						mlx5_mr_mem_event_cb, NULL);
 		mlx5_mp_init_primary();
-		ret = mlx5_uar_init_primary();
+		ret = uar_init_primary();
 		if (ret)
 			goto error;
 		sd->init_done = true;
@@ -976,7 +1072,7 @@  mlx5_init_once(void)
 		if (ld->init_done)
 			break;
 		mlx5_mp_init_secondary();
-		ret = mlx5_uar_init_secondary();
+		ret = uar_init_secondary();
 		if (ret)
 			goto error;
 		++sd->secondary_cnt;
@@ -990,12 +1086,12 @@  mlx5_init_once(void)
 error:
 	switch (rte_eal_process_type()) {
 	case RTE_PROC_PRIMARY:
-		mlx5_uar_uninit_primary();
+		uar_uninit_primary();
 		mlx5_mp_uninit_primary();
 		rte_mem_event_callback_unregister("MLX5_MEM_EVENT_CB", NULL);
 		break;
 	case RTE_PROC_SECONDARY:
-		mlx5_uar_uninit_secondary();
+		uar_uninit_secondary();
 		mlx5_mp_uninit_secondary();
 		break;
 	default:
@@ -1099,7 +1195,7 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		if (err < 0)
 			return NULL;
 		/* Remap UAR for Tx queues. */
-		err = mlx5_tx_uar_remap(eth_dev, err);
+		err = mlx5_txq_uar_init(eth_dev, err);
 		if (err)
 			return NULL;
 		/*
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8ce8361a85..f77517bee0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -97,8 +97,8 @@  struct mlx5_shared_data {
 	/* Global spinlock for primary and secondary processes. */
 	int init_done; /* Whether primary has done initialization. */
 	unsigned int secondary_cnt; /* Number of secondary processes init'd. */
-	void *uar_base;
-	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+	struct rte_bitmap *uar_bmp;
+	/* Bitmap to keep track of BlueFlame register table. */
 	struct mlx5_dev_list mem_event_cb_list;
 	rte_rwlock_t mem_event_rwlock;
 };
@@ -106,11 +106,19 @@  struct mlx5_shared_data {
 /* Per-process data structure, not visible to other processes. */
 struct mlx5_local_data {
 	int init_done; /* Whether a secondary has done initialization. */
-	void *uar_base;
-	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+	void *(*uar_table)[];
+	/* Table of BlueFlame registers for each process. */
+	size_t uar_table_sz;
+	/* Size of BlueFlame register table. */
 };
 
 extern struct mlx5_shared_data *mlx5_shared_data;
+extern struct mlx5_local_data mlx5_local_data;
+
+/* The maximum size of BlueFlame register table. */
+#define MLX5_UAR_TABLE_SIZE_MAX (RTE_MAX_ETHPORTS * RTE_MAX_QUEUES_PER_PORT)
+
+#define MLX5_UAR_REG(idx) ((*mlx5_local_data.uar_table)[(idx)])
 
 struct mlx5_counter_ctrl {
 	/* Name of the counter. */
@@ -331,6 +339,9 @@  struct mlx5_priv {
 /* mlx5.c */
 
 int mlx5_getenv_int(const char *);
+void **mlx5_uar_get_addr_ptr(uint32_t idx);
+uint32_t mlx5_uar_alloc_index(void);
+void mlx5_uar_free_index(uint32_t idx);
 
 /* mlx5_ethdev.c */
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index ced9945888..b32c1d6e0f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -203,7 +203,7 @@  struct mlx5_txq_data {
 	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	volatile void *bf_reg; /* Blueflame register remapped. */
+	uint32_t bfreg_idx; /* Blueflame register index. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 #ifndef RTE_ARCH_64
@@ -232,7 +232,7 @@  struct mlx5_txq_ctrl {
 	struct mlx5_priv *priv; /* Back pointer to private data. */
 	struct mlx5_txq_data txq; /* Data path structure. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
-	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
+	void *bf_reg; /* BlueFlame register from Verbs. */
 	uint16_t idx; /* Queue index. */
 };
 
@@ -303,7 +303,7 @@  uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev);
 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			unsigned int socket, const struct rte_eth_txconf *conf);
 void mlx5_tx_queue_release(void *dpdk_txq);
-int mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd);
+int mlx5_txq_uar_init(struct rte_eth_dev *dev, int fd);
 struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx);
 struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv);
@@ -706,7 +706,7 @@  static __rte_always_inline void
 mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
 		       int cond)
 {
-	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
+	uint64_t *dst = MLX5_UAR_REG(txq->bfreg_idx);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
 
 	rte_cio_wmb();
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 5b73f0ff03..d7f27702e8 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -58,7 +58,7 @@  mlx5_txq_start(struct rte_eth_dev *dev)
 			goto error;
 		}
 	}
-	ret = mlx5_tx_uar_remap(dev, priv->sh->ctx->cmd_fd);
+	ret = mlx5_txq_uar_init(dev, priv->sh->ctx->cmd_fd);
 	if (ret) {
 		/* Adjust index for rollback. */
 		i = priv->txqs_n - 1;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 1b3d89f2f6..d8e0bda371 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -231,9 +231,13 @@  mlx5_tx_queue_release(void *dpdk_txq)
 
 
 /**
- * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
- * Both primary and secondary process do mmap to make UAR address
- * aligned.
+ * Initialize UAR register access for Tx.
+ *
+ * For both primary and secondary, initialize UAR locks for atomic access.
+ *
+ * For secondary, remap BlueFlame registers for secondary process. Remapped
+ * address is stored at the same indexed entry of the local UAR register table
+ * as primary process.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -244,75 +248,52 @@  mlx5_tx_queue_release(void *dpdk_txq)
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd)
+mlx5_txq_uar_init(struct rte_eth_dev *dev, int fd)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	unsigned int i, j;
-	uintptr_t pages[priv->txqs_n];
-	unsigned int pages_n = 0;
-	uintptr_t uar_va;
-	uintptr_t off;
-	void *addr;
-	void *ret;
 	struct mlx5_txq_data *txq;
 	struct mlx5_txq_ctrl *txq_ctrl;
-	int already_mapped;
+	void *addr;
+	void **addr_ptr;
+	uintptr_t uar_va;
+	uintptr_t offset;
 	size_t page_size = sysconf(_SC_PAGESIZE);
+	unsigned int i;
 #ifndef RTE_ARCH_64
 	unsigned int lock_idx;
 #endif
 
-	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
-	/*
-	 * As rdma-core, UARs are mapped in size of OS page size.
-	 * Use aligned address to avoid duplicate mmap.
-	 * Ref to libmlx5 function: mlx5_init_context()
-	 */
 	for (i = 0; i != priv->txqs_n; ++i) {
 		if (!(*priv->txqs)[i])
 			continue;
 		txq = (*priv->txqs)[i];
 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
 		assert(txq_ctrl->idx == (uint16_t)i);
-		/* UAR addr form verbs used to find dup and offset in page. */
-		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
-		off = uar_va & (page_size - 1); /* offset in page. */
-		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
-		already_mapped = 0;
-		for (j = 0; j != pages_n; ++j) {
-			if (pages[j] == uar_va) {
-				already_mapped = 1;
-				break;
-			}
-		}
-		/* new address in reserved UAR address space. */
-		addr = RTE_PTR_ADD(mlx5_shared_data->uar_base,
-				   uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1));
-		if (!already_mapped) {
-			pages[pages_n++] = uar_va;
-			/* fixed mmap to specified address in reserved
-			 * address space.
+		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+			/*
+			 * As rdma-core, UARs are mapped in size of OS page
+			 * size. Ref to libmlx5 function: mlx5_init_context()
 			 */
-			ret = mmap(addr, page_size,
-				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
-				   txq_ctrl->uar_mmap_offset);
-			if (ret != addr) {
-				/* fixed mmap have to return same address */
+			uar_va = (uintptr_t)txq_ctrl->bf_reg;
+			offset = uar_va & (page_size - 1); /* Offset in page. */
+			addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
+				    txq_ctrl->uar_mmap_offset);
+			if (addr == MAP_FAILED) {
 				DRV_LOG(ERR,
-					"port %u call to mmap failed on UAR"
-					" for txq %u",
+					"port %u mmap failed for BF reg."
+					" of txq %u",
 					dev->data->port_id, txq_ctrl->idx);
 				rte_errno = ENXIO;
 				return -rte_errno;
 			}
+			addr = RTE_PTR_ADD(addr, offset);
+			addr_ptr = mlx5_uar_get_addr_ptr(txq->bfreg_idx);
+			if (!addr_ptr)
+				return -rte_errno;
+			*addr_ptr = addr;
 		}
-		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */
-			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr, off);
-		else
-			assert(txq_ctrl->txq.bf_reg ==
-			       RTE_PTR_ADD((void *)addr, off));
 #ifndef RTE_ARCH_64
-		/* Assign a UAR lock according to UAR page number */
+		/* Assign an UAR lock according to UAR page number */
 		lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
 			   MLX5_UAR_PAGE_NUM_MASK;
 		txq->uar_lock = &priv->uar_lock[lock_idx];
@@ -372,6 +353,7 @@  mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 	struct mlx5dv_obj obj;
 	const int desc = 1 << txq_data->elts_n;
 	eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev);
+	void **addr_ptr;
 	int ret = 0;
 
 	assert(txq_data);
@@ -507,7 +489,17 @@  mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
-	txq_ctrl->bf_reg_orig = qp.bf.reg;
+	/* Allocate a new index in UAR table. */
+	ret = mlx5_uar_alloc_index();
+	if (ret < 0)
+		goto error;
+	txq_data->bfreg_idx = ret;
+	txq_ctrl->bf_reg = qp.bf.reg;
+	/* Store the BlueFlame register address in the local table. */
+	addr_ptr = mlx5_uar_get_addr_ptr(txq_data->bfreg_idx);
+	if (!addr_ptr)
+		goto error;
+	*addr_ptr = txq_ctrl->bf_reg;
 	txq_data->cq_db = cq_info.dbrec;
 	txq_data->cqes =
 		(volatile struct mlx5_cqe (*)[])
@@ -589,6 +581,7 @@  mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv)
 {
 	assert(txq_ibv);
 	if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) {
+		mlx5_uar_free_index(txq_ibv->txq_ctrl->txq.bfreg_idx);
 		claim_zero(mlx5_glue->destroy_qp(txq_ibv->qp));
 		claim_zero(mlx5_glue->destroy_cq(txq_ibv->cq));
 		LIST_REMOVE(txq_ibv, next);
@@ -837,15 +830,12 @@  mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_txq_ctrl *txq;
-	size_t page_size = sysconf(_SC_PAGESIZE);
 
 	if (!(*priv->txqs)[idx])
 		return 0;
 	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
 	if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv))
 		txq->ibv = NULL;
-	munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg, page_size),
-	       page_size);
 	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
 		txq_free_elts(txq);
 		mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh);