[v3,3/4] net/mlx5: remove device register remap

Message ID 20190405013357.14503-4-yskoh@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series net/mlx: remove device register remap |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Yongseok Koh April 5, 2019, 1:33 a.m. UTC
  UAR (User Access Region) register does not need to be remapped for primary
process but it should be remapped only for secondary process. UAR register
table is in the process private structure in rte_eth_devices[],
	(struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private

The actual UAR table follows the data structure and the table is used for
both Tx and Rx.

For Tx, BlueFlame in UAR is used to ring the doorbell. MLX5_TX_BFREG(txq)
is defined to get a register for the txq. Processes access its own private
data to acquire the register from the UAR table.

For Rx, the doorbell in UAR is required in arming CQ event. However, it is
a known issue that the register isn't remapped for secondary process.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx5/mlx5.c         | 198 +++++-----------------------------------
 drivers/net/mlx5/mlx5.h         |  15 ++-
 drivers/net/mlx5/mlx5_ethdev.c  |  17 ++++
 drivers/net/mlx5/mlx5_rxtx.h    |  11 ++-
 drivers/net/mlx5/mlx5_trigger.c |   6 --
 drivers/net/mlx5/mlx5_txq.c     | 180 ++++++++++++++++++++++--------------
 6 files changed, 168 insertions(+), 259 deletions(-)
  

Comments

Shahaf Shuler April 8, 2019, 5:48 a.m. UTC | #1
Hi Koh,

See small comments below. Same for mlx4 patch.


Friday, April 5, 2019 4:34 AM, Yongseok Koh:
> Subject: [dpdk-dev] [PATCH v3 3/4] net/mlx5: remove device register remap
> 
> UAR (User Access Region) register does not need to be remapped for
> primary process but it should be remapped only for secondary process. UAR
> register table is in the process private structure in rte_eth_devices[],
> 	(struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private
> 
> The actual UAR table follows the data structure and the table is used for both
> Tx and Rx.
> 
> For Tx, BlueFlame in UAR is used to ring the doorbell. MLX5_TX_BFREG(txq)
> is defined to get a register for the txq. Processes access its own private data
> to acquire the register from the UAR table.
> 
> For Rx, the doorbell in UAR is required in arming CQ event. However, it is a
> known issue that the register isn't remapped for secondary process.
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c         | 198 +++++-----------------------------------
>  drivers/net/mlx5/mlx5.h         |  15 ++-
>  drivers/net/mlx5/mlx5_ethdev.c  |  17 ++++
>  drivers/net/mlx5/mlx5_rxtx.h    |  11 ++-
>  drivers/net/mlx5/mlx5_trigger.c |   6 --
>  drivers/net/mlx5/mlx5_txq.c     | 180 ++++++++++++++++++++++-------------
> -

[...]

>  /**
> @@ -1182,12 +1010,32 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  	}
>  	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +		struct mlx5_proc_priv *ppriv;
> +		size_t ppriv_size;
> +
>  		eth_dev = rte_eth_dev_attach_secondary(name);
>  		if (eth_dev == NULL) {
>  			DRV_LOG(ERR, "can not attach rte ethdev");
>  			rte_errno = ENOMEM;
>  			return NULL;
>  		}
> +		priv = eth_dev->data->dev_private;
> +		/*
> +		 * UAR register table follows the process private structure.
> +		 * BlueFlame registers for Tx queues come first and registers
> +		 * for Rx queues follows.
> +		 */
> +		ppriv_size = sizeof(struct mlx5_proc_priv) +
> +			     (priv->rxqs_n + priv->txqs_n) * sizeof(void *);

Why you add also the rxqs_n? why not only the txqs? 

> +		ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
> +					  RTE_CACHE_LINE_SIZE,
> +					  dpdk_dev->numa_node);
> +		if (!ppriv) {
> +			rte_errno = ENOMEM;
> +			return NULL;
> +		}
> +		ppriv->uar_table_sz = ppriv_size;
> +		eth_dev->process_private = ppriv;
>  		eth_dev->device = dpdk_dev;
>  		eth_dev->dev_ops = &mlx5_dev_sec_ops;
>  		/* Receive command fd from primary process */ @@ -1195,7
> +1043,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		if (err < 0)
>  			return NULL;
>  		/* Remap UAR for Tx queues. */
> -		err = mlx5_tx_uar_remap(eth_dev, err);
> +		err = mlx5_tx_uar_init_secondary(eth_dev, err);
>  		if (err)
>  			return NULL;
>  		/*
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 699c8fcf6d..1ac4ad71b1 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -97,8 +97,6 @@ struct mlx5_shared_data {
>  	/* Global spinlock for primary and secondary processes. */
>  	int init_done; /* Whether primary has done initialization. */
>  	unsigned int secondary_cnt; /* Number of secondary processes
> init'd. */
> -	void *uar_base;
> -	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping.
> */
>  	struct mlx5_dev_list mem_event_cb_list;
>  	rte_rwlock_t mem_event_rwlock;
>  };
> @@ -106,8 +104,6 @@ struct mlx5_shared_data {
>  /* Per-process data structure, not visible to other processes. */  struct
> mlx5_local_data {
>  	int init_done; /* Whether a secondary has done initialization. */
> -	void *uar_base;
> -	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping.
> */
>  };
> 
>  extern struct mlx5_shared_data *mlx5_shared_data; @@ -282,6 +278,17
> @@ struct mlx5_ibv_shared {
>  	struct mlx5_ibv_shared_port port[]; /* per device port data array. */
> };
> 
> +/* Per-process private structure. */
> +struct mlx5_proc_priv {
> +	size_t uar_table_sz;
> +	/* Size of UAR register table. */
> +	void *uar_table[];
> +	/* Table of UAR registers for each process. */ };
> +
> +#define MLX5_PROC_PRIV(port_id) \
> +	((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
> +
>  struct mlx5_priv {
>  	LIST_ENTRY(mlx5_priv) mem_event_cb;
>  	/**< Called by memory event callback. */ diff --git
> a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index
> 9ae9dddd3c..42297f11c9 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -382,6 +382,8 @@ int
>  mlx5_dev_configure(struct rte_eth_dev *dev)  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_proc_priv *ppriv;
> +	size_t ppriv_size;
>  	unsigned int rxqs_n = dev->data->nb_rx_queues;
>  	unsigned int txqs_n = dev->data->nb_tx_queues;
>  	unsigned int i;
> @@ -450,6 +452,21 @@ mlx5_dev_configure(struct rte_eth_dev *dev)
>  		if (++j == rxqs_n)
>  			j = 0;
>  	}
> +	/*
> +	 * UAR register table follows the process private structure. BlueFlame
> +	 * registers for Tx queues come first and registers for Rx queues
> +	 * follows.
> +	 */
> +	ppriv_size = sizeof(struct mlx5_proc_priv) +
> +		     (priv->rxqs_n + priv->txqs_n) * sizeof(void *);

Ditto. 

> +	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
> +				  RTE_CACHE_LINE_SIZE, dev->device-
> >numa_node);
> +	if (!ppriv) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	ppriv->uar_table_sz = ppriv_size;
> +	dev->process_private = ppriv;
>  	return 0;
>  }
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 7b58063ceb..5d49892429 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -201,8 +201,8 @@ struct mlx5_txq_data {
>  	volatile void *wqes; /* Work queue (use volatile to write into). */
>  	volatile uint32_t *qp_db; /* Work queue doorbell. */
>  	volatile uint32_t *cq_db; /* Completion queue doorbell. */
> -	volatile void *bf_reg; /* Blueflame register remapped. */
>  	struct rte_mbuf *(*elts)[]; /* TX elements. */
> +	uint16_t port_id; /* Port ID of device. */
>  	uint16_t idx; /* Queue index. */
>  	struct mlx5_txq_stats stats; /* TX queue counters. */  #ifndef
> RTE_ARCH_64 @@ -231,9 +231,12 @@ struct mlx5_txq_ctrl {
>  	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
>  	struct mlx5_priv *priv; /* Back pointer to private data. */
>  	off_t uar_mmap_offset; /* UAR mmap offset for non-primary
> process. */
> -	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
> +	void *bf_reg; /* BlueFlame register from Verbs. */

I guess you keep this one in order to get the VA offset for the secondary mapping, right? Because otherwise we can take the bf_reg from the UAR table on the process private.

If so, better to rename it to uar_page_offset (or other name you like) in order to avoid fields duplication. 

>  };
> 
> +#define MLX5_TX_BFREG(txq) \
> +		(MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx])
> +
>  /* mlx5_rxq.c */
> 
>  extern uint8_t rss_hash_default_key[];
> @@ -301,7 +304,7 @@ uint64_t mlx5_get_rx_queue_offloads(struct
> rte_eth_dev *dev);  int mlx5_tx_queue_setup(struct rte_eth_dev *dev,
> uint16_t idx, uint16_t desc,
>  			unsigned int socket, const struct rte_eth_txconf
> *conf);  void mlx5_tx_queue_release(void *dpdk_txq); -int
> mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd);
> +int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
>  struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t
> idx);  struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev,
> uint16_t idx);  int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv); @@
> -704,7 +707,7 @@ static __rte_always_inline void
> mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct
> mlx5_wqe *wqe,
>  		       int cond)
>  {
> -	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
> +	uint64_t *dst = MLX5_TX_BFREG(txq);

I guess no perf penalty due to this change right?
Would you consider to prefetch the addr before the db logic just to be on the safe side?

>  	volatile uint64_t *src = ((volatile uint64_t *)wqe);
> 
>  	rte_cio_wmb();
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index 7c1e5594d6..b7fde35758 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -58,12 +58,6 @@ mlx5_txq_start(struct rte_eth_dev *dev)
>  			goto error;
>  		}
>  	}
> -	ret = mlx5_tx_uar_remap(dev, priv->sh->ctx->cmd_fd);
> -	if (ret) {
> -		/* Adjust index for rollback. */
> -		i = priv->txqs_n - 1;
> -		goto error;
> -	}
>  	return 0;
>  error:
>  	ret = rte_errno; /* Save rte_errno before cleanup. */ diff --git
> a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 4bd08cb035..5fb1761955 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -229,13 +229,99 @@ mlx5_tx_queue_release(void *dpdk_txq)
>  		}
>  }
> 
> +/**
> + * Initialize Tx UAR registers for primary process.
> + *
> + * @param txq_ctrl
> + *   Pointer to Tx queue control structure.
> + */
> +static void
> +txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl) {
> +	struct mlx5_priv *priv = txq_ctrl->priv;
> +	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
> +
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	assert(ppriv);
> +	ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg; #ifndef
> +RTE_ARCH_64
> +	struct mlx5_priv *priv = txq_ctrl->priv;
> +	struct mlx5_txq_data *txq = &txq_ctrl->txq;
> +	unsigned int lock_idx;
> +	/* Assign an UAR lock according to UAR page number */
> +	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
> +		   MLX5_UAR_PAGE_NUM_MASK;
> +	txq->uar_lock = &priv->uar_lock[lock_idx]; #endif }
> 
>  /**
> - * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
> - * Both primary and secondary process do mmap to make UAR address
> - * aligned.
> + * Remap UAR register of a Tx queue for secondary process.
>   *
> - * @param[in] dev
> + * Remapped address is stored at the table in the process private
> +structure of
> + * the device, indexed by queue index.
> + *
> + * @param txq_ctrl
> + *   Pointer to Tx queue control structure.
> + * @param fd
> + *   Verbs file descriptor to map UAR pages.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd) {
> +	struct mlx5_priv *priv = txq_ctrl->priv;
> +	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
> +	struct mlx5_txq_data *txq = &txq_ctrl->txq;
> +	void *addr;
> +	uintptr_t uar_va;
> +	uintptr_t offset;
> +	const size_t page_size = sysconf(_SC_PAGESIZE);
> +
> +	assert(ppriv);
> +	/*
> +	 * As rdma-core, UARs are mapped in size of OS page
> +	 * size. Ref to libmlx5 function: mlx5_init_context()
> +	 */
> +	uar_va = (uintptr_t)txq_ctrl->bf_reg;
> +	offset = uar_va & (page_size - 1); /* Offset in page. */
> +	addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
> +			txq_ctrl->uar_mmap_offset);
> +	if (addr == MAP_FAILED) {
> +		DRV_LOG(ERR,
> +			"port %u mmap failed for BF reg of txq %u",
> +			txq->port_id, txq->idx);
> +		rte_errno = ENXIO;
> +		return -rte_errno;
> +	}
> +	addr = RTE_PTR_ADD(addr, offset);
> +	ppriv->uar_table[txq->idx] = addr;
> +	return 0;
> +}
> +
> +/**
> + * Unmap UAR register of a Tx queue for secondary process.
> + *
> + * @param txq_ctrl
> + *   Pointer to Tx queue control structure.
> + */
> +static void
> +txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl) {
> +	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl-
> >priv));
> +	const size_t page_size = sysconf(_SC_PAGESIZE);
> +	void *addr;
> +
> +	addr = ppriv->uar_table[txq_ctrl->txq.idx];
> +	munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size); }
> +
> +/**
> + * Initialize Tx UAR registers for secondary process.
> + *
> + * @param dev
>   *   Pointer to Ethernet device.
>   * @param fd
>   *   Verbs file descriptor to map UAR pages.
> @@ -244,81 +330,36 @@ mlx5_tx_queue_release(void *dpdk_txq)
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd)
> +mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd)
>  {
>  	struct mlx5_priv *priv = dev->data->dev_private;
> -	unsigned int i, j;
> -	uintptr_t pages[priv->txqs_n];
> -	unsigned int pages_n = 0;
> -	uintptr_t uar_va;
> -	uintptr_t off;
> -	void *addr;
> -	void *ret;
>  	struct mlx5_txq_data *txq;
>  	struct mlx5_txq_ctrl *txq_ctrl;
> -	int already_mapped;
> -	size_t page_size = sysconf(_SC_PAGESIZE);
> -#ifndef RTE_ARCH_64
> -	unsigned int lock_idx;
> -#endif
> +	unsigned int i;
> +	int ret;
> 
> -	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
> -	/*
> -	 * As rdma-core, UARs are mapped in size of OS page size.
> -	 * Use aligned address to avoid duplicate mmap.
> -	 * Ref to libmlx5 function: mlx5_init_context()
> -	 */
> +	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
>  	for (i = 0; i != priv->txqs_n; ++i) {
>  		if (!(*priv->txqs)[i])
>  			continue;
>  		txq = (*priv->txqs)[i];
>  		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
>  		assert(txq->idx == (uint16_t)i);
> -		/* UAR addr form verbs used to find dup and offset in page.
> */
> -		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
> -		off = uar_va & (page_size - 1); /* offset in page. */
> -		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page
> addr. */
> -		already_mapped = 0;
> -		for (j = 0; j != pages_n; ++j) {
> -			if (pages[j] == uar_va) {
> -				already_mapped = 1;
> -				break;
> -			}
> -		}
> -		/* new address in reserved UAR address space. */
> -		addr = RTE_PTR_ADD(mlx5_shared_data->uar_base,
> -				   uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1));
> -		if (!already_mapped) {
> -			pages[pages_n++] = uar_va;
> -			/* fixed mmap to specified address in reserved
> -			 * address space.
> -			 */
> -			ret = mmap(addr, page_size,
> -				   PROT_WRITE, MAP_FIXED | MAP_SHARED,
> fd,
> -				   txq_ctrl->uar_mmap_offset);
> -			if (ret != addr) {
> -				/* fixed mmap have to return same address
> */
> -				DRV_LOG(ERR,
> -					"port %u call to mmap failed on UAR"
> -					" for txq %u",
> -					dev->data->port_id, txq->idx);
> -				rte_errno = ENXIO;
> -				return -rte_errno;
> -			}
> -		}
> -		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save
> once */
> -			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr,
> off);
> -		else
> -			assert(txq_ctrl->txq.bf_reg ==
> -			       RTE_PTR_ADD((void *)addr, off));
> -#ifndef RTE_ARCH_64
> -		/* Assign a UAR lock according to UAR page number */
> -		lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
> -			   MLX5_UAR_PAGE_NUM_MASK;
> -		txq->uar_lock = &priv->uar_lock[lock_idx];
> -#endif
> +		ret = txq_uar_init_secondary(txq_ctrl, fd);
> +		if (ret)
> +			goto error;
>  	}
>  	return 0;
> +error:
> +	/* Rollback. */
> +	do {
> +		if (!(*priv->txqs)[i])
> +			continue;
> +		txq = (*priv->txqs)[i];
> +		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
> +		txq_uar_uninit_secondary(txq_ctrl);
> +	} while (i--);
> +	return -rte_errno;
>  }
> 
>  /**
> @@ -507,7 +548,6 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	txq_data->wqes = qp.sq.buf;
>  	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
>  	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
> -	txq_ctrl->bf_reg_orig = qp.bf.reg;
>  	txq_data->cq_db = cq_info.dbrec;
>  	txq_data->cqes =
>  		(volatile struct mlx5_cqe (*)[])
> @@ -521,6 +561,8 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	txq_ibv->qp = tmpl.qp;
>  	txq_ibv->cq = tmpl.cq;
>  	rte_atomic32_inc(&txq_ibv->refcnt);
> +	txq_ctrl->bf_reg = qp.bf.reg;
> +	txq_uar_init(txq_ctrl);
>  	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
>  		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
>  		DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%lx", @@ -
> 778,6 +820,7 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
> uint16_t desc,
>  	tmpl->priv = priv;
>  	tmpl->socket = socket;
>  	tmpl->txq.elts_n = log2above(desc);
> +	tmpl->txq.port_id = dev->data->port_id;
>  	tmpl->txq.idx = idx;
>  	txq_set_params(tmpl);
>  	DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d", @@ -
> 836,15 +879,12 @@ mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
> {
>  	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_txq_ctrl *txq;
> -	size_t page_size = sysconf(_SC_PAGESIZE);
> 
>  	if (!(*priv->txqs)[idx])
>  		return 0;
>  	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
>  	if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv))
>  		txq->ibv = NULL;
> -	munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg,
> page_size),
> -	       page_size);
>  	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
>  		txq_free_elts(txq);
>  		mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh);
> --
> 2.11.0
  
Yongseok Koh April 9, 2019, 7:36 p.m. UTC | #2
> On Apr 7, 2019, at 10:48 PM, Shahaf Shuler <shahafs@mellanox.com> wrote:
> 
> Hi Koh,
> 
> See small comments below. Same for mlx4 patch.
> 
> 
> Friday, April 5, 2019 4:34 AM, Yongseok Koh:
>> Subject: [dpdk-dev] [PATCH v3 3/4] net/mlx5: remove device register remap
>> 
>> UAR (User Access Region) register does not need to be remapped for
>> primary process but it should be remapped only for secondary process. UAR
>> register table is in the process private structure in rte_eth_devices[],
>> 	(struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private
>> 
>> The actual UAR table follows the data structure and the table is used for both
>> Tx and Rx.
>> 
>> For Tx, BlueFlame in UAR is used to ring the doorbell. MLX5_TX_BFREG(txq)
>> is defined to get a register for the txq. Processes access its own private data
>> to acquire the register from the UAR table.
>> 
>> For Rx, the doorbell in UAR is required in arming CQ event. However, it is a
>> known issue that the register isn't remapped for secondary process.
>> 
>> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
>> ---
>> drivers/net/mlx5/mlx5.c         | 198 +++++-----------------------------------
>> drivers/net/mlx5/mlx5.h         |  15 ++-
>> drivers/net/mlx5/mlx5_ethdev.c  |  17 ++++
>> drivers/net/mlx5/mlx5_rxtx.h    |  11 ++-
>> drivers/net/mlx5/mlx5_trigger.c |   6 --
>> drivers/net/mlx5/mlx5_txq.c     | 180 ++++++++++++++++++++++-------------
>> -
> 
> [...]
> 
>> /**
>> @@ -1182,12 +1010,32 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>> 	}
>> 	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>> 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>> +		struct mlx5_proc_priv *ppriv;
>> +		size_t ppriv_size;
>> +
>> 		eth_dev = rte_eth_dev_attach_secondary(name);
>> 		if (eth_dev == NULL) {
>> 			DRV_LOG(ERR, "can not attach rte ethdev");
>> 			rte_errno = ENOMEM;
>> 			return NULL;
>> 		}
>> +		priv = eth_dev->data->dev_private;
>> +		/*
>> +		 * UAR register table follows the process private structure.
>> +		 * BlueFlame registers for Tx queues come first and registers
>> +		 * for Rx queues follows.
>> +		 */
>> +		ppriv_size = sizeof(struct mlx5_proc_priv) +
>> +			     (priv->rxqs_n + priv->txqs_n) * sizeof(void *);
> 
> Why you add also the rxqs_n? why not only the txqs?

I wanted to make a room for the registers for arming Rx CQ, which will be fixed
soon. But, I think it would be better to avoid confusion in this patch. Will remove.

[...]
>> +	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
>> +				  RTE_CACHE_LINE_SIZE, dev->device-
>>> numa_node);
>> +	if (!ppriv) {
>> +		rte_errno = ENOMEM;
>> +		return -rte_errno;
>> +	}
>> +	ppriv->uar_table_sz = ppriv_size;
>> +	dev->process_private = ppriv;
>> 	return 0;
>> }
>> 
>> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
>> index 7b58063ceb..5d49892429 100644
>> --- a/drivers/net/mlx5/mlx5_rxtx.h
>> +++ b/drivers/net/mlx5/mlx5_rxtx.h
>> @@ -201,8 +201,8 @@ struct mlx5_txq_data {
>> 	volatile void *wqes; /* Work queue (use volatile to write into). */
>> 	volatile uint32_t *qp_db; /* Work queue doorbell. */
>> 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
>> -	volatile void *bf_reg; /* Blueflame register remapped. */
>> 	struct rte_mbuf *(*elts)[]; /* TX elements. */
>> +	uint16_t port_id; /* Port ID of device. */
>> 	uint16_t idx; /* Queue index. */
>> 	struct mlx5_txq_stats stats; /* TX queue counters. */  #ifndef
>> RTE_ARCH_64 @@ -231,9 +231,12 @@ struct mlx5_txq_ctrl {
>> 	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
>> 	struct mlx5_priv *priv; /* Back pointer to private data. */
>> 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary
>> process. */
>> -	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
>> +	void *bf_reg; /* BlueFlame register from Verbs. */
> 
> I guess you keep this one in order to get the VA offset for the secondary mapping, right? Because otherwise we can take the bf_reg from the UAR table on the process private.
> 
> If so, better to rename it to uar_page_offset (or other name you like) in order to avoid fields duplication. 

It doesn't have the offset (offset can be calculated when remapping).
It is the original BF register address gotten from QP creation.

>> };
>> 
>> +#define MLX5_TX_BFREG(txq) \
>> +		(MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx])
>> +
>> /* mlx5_rxq.c */
>> 
>> extern uint8_t rss_hash_default_key[];
>> @@ -301,7 +304,7 @@ uint64_t mlx5_get_rx_queue_offloads(struct
>> rte_eth_dev *dev);  int mlx5_tx_queue_setup(struct rte_eth_dev *dev,
>> uint16_t idx, uint16_t desc,
>> 			unsigned int socket, const struct rte_eth_txconf
>> *conf);  void mlx5_tx_queue_release(void *dpdk_txq); -int
>> mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd);
>> +int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
>> struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t
>> idx);  struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev,
>> uint16_t idx);  int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv); @@
>> -704,7 +707,7 @@ static __rte_always_inline void
>> mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct
>> mlx5_wqe *wqe,
>> 		       int cond)
>> {
>> -	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
>> +	uint64_t *dst = MLX5_TX_BFREG(txq);
> 
> I guess no perf penalty due to this change right?

I've confirmed no perf drop on x86 and ARM (BlueField).
It might have been smaller than the jitter even if any.

> Would you consider to prefetch the addr before the db logic just to be on the safe side?

As it is not a random/sequential access but accessing the same cacheline
repeatedly, I wouldn't have a prefetch here. Sometimes, prefetch have a little cost.

Will send out a new version just in case you agree and want to merge it.
If you still want to change something, please comment on the new one.


Thanks,
Yongseok
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f571ba2e97..c28a66fa07 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -449,30 +449,6 @@  mlx5_init_shared_data(void)
 }
 
 /**
- * Uninitialize shared data between primary and secondary process.
- *
- * The pointer of secondary process is dereferenced and primary process frees
- * the memzone.
- */
-static void
-mlx5_uninit_shared_data(void)
-{
-	const struct rte_memzone *mz;
-
-	rte_spinlock_lock(&mlx5_shared_data_lock);
-	if (mlx5_shared_data) {
-		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
-			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
-			rte_memzone_free(mz);
-		} else {
-			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
-		}
-		mlx5_shared_data = NULL;
-	}
-	rte_spinlock_unlock(&mlx5_shared_data_lock);
-}
-
-/**
  * Retrieve integer value from environment variable.
  *
  * @param[in] name
@@ -589,6 +565,8 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 		priv->txqs_n = 0;
 		priv->txqs = NULL;
 	}
+	rte_free(dev->process_private);
+	dev->process_private = NULL;
 	mlx5_mprq_free_mp(dev);
 	mlx5_mr_release(dev);
 	assert(priv->sh);
@@ -906,132 +884,6 @@  mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 
 static struct rte_pci_driver mlx5_driver;
 
-static int
-find_lower_va_bound(const struct rte_memseg_list *msl,
-		const struct rte_memseg *ms, void *arg)
-{
-	void **addr = arg;
-
-	if (msl->external)
-		return 0;
-	if (*addr == NULL)
-		*addr = ms->addr;
-	else
-		*addr = RTE_MIN(*addr, ms->addr);
-
-	return 0;
-}
-
-/**
- * Reserve UAR address space for primary process.
- *
- * Process local resource is used by both primary and secondary to avoid
- * duplicate reservation. The space has to be available on both primary and
- * secondary process, TXQ UAR maps to this area using fixed mmap w/o double
- * check.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_uar_init_primary(void)
-{
-	struct mlx5_shared_data *sd = mlx5_shared_data;
-	void *addr = (void *)0;
-
-	if (sd->uar_base)
-		return 0;
-	/* find out lower bound of hugepage segments */
-	rte_memseg_walk(find_lower_va_bound, &addr);
-	/* keep distance to hugepages to minimize potential conflicts. */
-	addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE));
-	/* anonymous mmap, no real memory consumption. */
-	addr = mmap(addr, MLX5_UAR_SIZE,
-		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED) {
-		DRV_LOG(ERR,
-			"Failed to reserve UAR address space, please"
-			" adjust MLX5_UAR_SIZE or try --base-virtaddr");
-		rte_errno = ENOMEM;
-		return -rte_errno;
-	}
-	/* Accept either same addr or a new addr returned from mmap if target
-	 * range occupied.
-	 */
-	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
-	sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
-	return 0;
-}
-
-/**
- * Unmap UAR address space reserved for primary process.
- */
-static void
-mlx5_uar_uninit_primary(void)
-{
-	struct mlx5_shared_data *sd = mlx5_shared_data;
-
-	if (!sd->uar_base)
-		return;
-	munmap(sd->uar_base, MLX5_UAR_SIZE);
-	sd->uar_base = NULL;
-}
-
-/**
- * Reserve UAR address space for secondary process, align with primary process.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_uar_init_secondary(void)
-{
-	struct mlx5_shared_data *sd = mlx5_shared_data;
-	struct mlx5_local_data *ld = &mlx5_local_data;
-	void *addr;
-
-	if (ld->uar_base) { /* Already reserved. */
-		assert(sd->uar_base == ld->uar_base);
-		return 0;
-	}
-	assert(sd->uar_base);
-	/* anonymous mmap, no real memory consumption. */
-	addr = mmap(sd->uar_base, MLX5_UAR_SIZE,
-		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED) {
-		DRV_LOG(ERR, "UAR mmap failed: %p size: %llu",
-			sd->uar_base, MLX5_UAR_SIZE);
-		rte_errno = ENXIO;
-		return -rte_errno;
-	}
-	if (sd->uar_base != addr) {
-		DRV_LOG(ERR,
-			"UAR address %p size %llu occupied, please"
-			" adjust MLX5_UAR_OFFSET or try EAL parameter"
-			" --base-virtaddr",
-			sd->uar_base, MLX5_UAR_SIZE);
-		rte_errno = ENXIO;
-		return -rte_errno;
-	}
-	ld->uar_base = addr;
-	DRV_LOG(INFO, "Reserved UAR address space: %p", addr);
-	return 0;
-}
-
-/**
- * Unmap UAR address space reserved for secondary process.
- */
-static void
-mlx5_uar_uninit_secondary(void)
-{
-	struct mlx5_local_data *ld = &mlx5_local_data;
-
-	if (!ld->uar_base)
-		return;
-	munmap(ld->uar_base, MLX5_UAR_SIZE);
-	ld->uar_base = NULL;
-}
-
 /**
  * PMD global initialization.
  *
@@ -1047,7 +899,6 @@  mlx5_init_once(void)
 {
 	struct mlx5_shared_data *sd;
 	struct mlx5_local_data *ld = &mlx5_local_data;
-	int ret;
 
 	if (mlx5_init_shared_data())
 		return -rte_errno;
@@ -1063,18 +914,12 @@  mlx5_init_once(void)
 		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
 						mlx5_mr_mem_event_cb, NULL);
 		mlx5_mp_init_primary();
-		ret = mlx5_uar_init_primary();
-		if (ret)
-			goto error;
 		sd->init_done = true;
 		break;
 	case RTE_PROC_SECONDARY:
 		if (ld->init_done)
 			break;
 		mlx5_mp_init_secondary();
-		ret = mlx5_uar_init_secondary();
-		if (ret)
-			goto error;
 		++sd->secondary_cnt;
 		ld->init_done = true;
 		break;
@@ -1083,23 +928,6 @@  mlx5_init_once(void)
 	}
 	rte_spinlock_unlock(&sd->lock);
 	return 0;
-error:
-	switch (rte_eal_process_type()) {
-	case RTE_PROC_PRIMARY:
-		mlx5_uar_uninit_primary();
-		mlx5_mp_uninit_primary();
-		rte_mem_event_callback_unregister("MLX5_MEM_EVENT_CB", NULL);
-		break;
-	case RTE_PROC_SECONDARY:
-		mlx5_uar_uninit_secondary();
-		mlx5_mp_uninit_secondary();
-		break;
-	default:
-		break;
-	}
-	rte_spinlock_unlock(&sd->lock);
-	mlx5_uninit_shared_data();
-	return -rte_errno;
 }
 
 /**
@@ -1182,12 +1010,32 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct mlx5_proc_priv *ppriv;
+		size_t ppriv_size;
+
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
 			DRV_LOG(ERR, "can not attach rte ethdev");
 			rte_errno = ENOMEM;
 			return NULL;
 		}
+		priv = eth_dev->data->dev_private;
+		/*
+		 * UAR register table follows the process private structure.
+		 * BlueFlame registers for Tx queues come first and registers
+		 * for Rx queues follows.
+		 */
+		ppriv_size = sizeof(struct mlx5_proc_priv) +
+			     (priv->rxqs_n + priv->txqs_n) * sizeof(void *);
+		ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
+					  RTE_CACHE_LINE_SIZE,
+					  dpdk_dev->numa_node);
+		if (!ppriv) {
+			rte_errno = ENOMEM;
+			return NULL;
+		}
+		ppriv->uar_table_sz = ppriv_size;
+		eth_dev->process_private = ppriv;
 		eth_dev->device = dpdk_dev;
 		eth_dev->dev_ops = &mlx5_dev_sec_ops;
 		/* Receive command fd from primary process */
@@ -1195,7 +1043,7 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		if (err < 0)
 			return NULL;
 		/* Remap UAR for Tx queues. */
-		err = mlx5_tx_uar_remap(eth_dev, err);
+		err = mlx5_tx_uar_init_secondary(eth_dev, err);
 		if (err)
 			return NULL;
 		/*
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 699c8fcf6d..1ac4ad71b1 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -97,8 +97,6 @@  struct mlx5_shared_data {
 	/* Global spinlock for primary and secondary processes. */
 	int init_done; /* Whether primary has done initialization. */
 	unsigned int secondary_cnt; /* Number of secondary processes init'd. */
-	void *uar_base;
-	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
 	struct mlx5_dev_list mem_event_cb_list;
 	rte_rwlock_t mem_event_rwlock;
 };
@@ -106,8 +104,6 @@  struct mlx5_shared_data {
 /* Per-process data structure, not visible to other processes. */
 struct mlx5_local_data {
 	int init_done; /* Whether a secondary has done initialization. */
-	void *uar_base;
-	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
 };
 
 extern struct mlx5_shared_data *mlx5_shared_data;
@@ -282,6 +278,17 @@  struct mlx5_ibv_shared {
 	struct mlx5_ibv_shared_port port[]; /* per device port data array. */
 };
 
+/* Per-process private structure. */
+struct mlx5_proc_priv {
+	size_t uar_table_sz;
+	/* Size of UAR register table. */
+	void *uar_table[];
+	/* Table of UAR registers for each process. */
+};
+
+#define MLX5_PROC_PRIV(port_id) \
+	((struct mlx5_proc_priv *)rte_eth_devices[port_id].process_private)
+
 struct mlx5_priv {
 	LIST_ENTRY(mlx5_priv) mem_event_cb;
 	/**< Called by memory event callback. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 9ae9dddd3c..42297f11c9 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -382,6 +382,8 @@  int
 mlx5_dev_configure(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_proc_priv *ppriv;
+	size_t ppriv_size;
 	unsigned int rxqs_n = dev->data->nb_rx_queues;
 	unsigned int txqs_n = dev->data->nb_tx_queues;
 	unsigned int i;
@@ -450,6 +452,21 @@  mlx5_dev_configure(struct rte_eth_dev *dev)
 		if (++j == rxqs_n)
 			j = 0;
 	}
+	/*
+	 * UAR register table follows the process private structure. BlueFlame
+	 * registers for Tx queues come first and registers for Rx queues
+	 * follows.
+	 */
+	ppriv_size = sizeof(struct mlx5_proc_priv) +
+		     (priv->rxqs_n + priv->txqs_n) * sizeof(void *);
+	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
+				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+	if (!ppriv) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	ppriv->uar_table_sz = ppriv_size;
+	dev->process_private = ppriv;
 	return 0;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 7b58063ceb..5d49892429 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -201,8 +201,8 @@  struct mlx5_txq_data {
 	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	volatile void *bf_reg; /* Blueflame register remapped. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
+	uint16_t port_id; /* Port ID of device. */
 	uint16_t idx; /* Queue index. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 #ifndef RTE_ARCH_64
@@ -231,9 +231,12 @@  struct mlx5_txq_ctrl {
 	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
 	struct mlx5_priv *priv; /* Back pointer to private data. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
-	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
+	void *bf_reg; /* BlueFlame register from Verbs. */
 };
 
+#define MLX5_TX_BFREG(txq) \
+		(MLX5_PROC_PRIV((txq)->port_id)->uar_table[(txq)->idx])
+
 /* mlx5_rxq.c */
 
 extern uint8_t rss_hash_default_key[];
@@ -301,7 +304,7 @@  uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev);
 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			unsigned int socket, const struct rte_eth_txconf *conf);
 void mlx5_tx_queue_release(void *dpdk_txq);
-int mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd);
+int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
 struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx);
 struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv);
@@ -704,7 +707,7 @@  static __rte_always_inline void
 mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
 		       int cond)
 {
-	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
+	uint64_t *dst = MLX5_TX_BFREG(txq);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
 
 	rte_cio_wmb();
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 7c1e5594d6..b7fde35758 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -58,12 +58,6 @@  mlx5_txq_start(struct rte_eth_dev *dev)
 			goto error;
 		}
 	}
-	ret = mlx5_tx_uar_remap(dev, priv->sh->ctx->cmd_fd);
-	if (ret) {
-		/* Adjust index for rollback. */
-		i = priv->txqs_n - 1;
-		goto error;
-	}
 	return 0;
 error:
 	ret = rte_errno; /* Save rte_errno before cleanup. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4bd08cb035..5fb1761955 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -229,13 +229,99 @@  mlx5_tx_queue_release(void *dpdk_txq)
 		}
 }
 
+/**
+ * Initialize Tx UAR registers for primary process.
+ *
+ * @param txq_ctrl
+ *   Pointer to Tx queue control structure.
+ */
+static void
+txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl)
+{
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
+
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	assert(ppriv);
+	ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
+#ifndef RTE_ARCH_64
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_txq_data *txq = &txq_ctrl->txq;
+	unsigned int lock_idx;
+	/* Assign an UAR lock according to UAR page number */
+	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
+		   MLX5_UAR_PAGE_NUM_MASK;
+	txq->uar_lock = &priv->uar_lock[lock_idx];
+#endif
+}
 
 /**
- * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
- * Both primary and secondary process do mmap to make UAR address
- * aligned.
+ * Remap UAR register of a Tx queue for secondary process.
  *
- * @param[in] dev
+ * Remapped address is stored at the table in the process private structure of
+ * the device, indexed by queue index.
+ *
+ * @param txq_ctrl
+ *   Pointer to Tx queue control structure.
+ * @param fd
+ *   Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+txq_uar_init_secondary(struct mlx5_txq_ctrl *txq_ctrl, int fd)
+{
+	struct mlx5_priv *priv = txq_ctrl->priv;
+	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
+	struct mlx5_txq_data *txq = &txq_ctrl->txq;
+	void *addr;
+	uintptr_t uar_va;
+	uintptr_t offset;
+	const size_t page_size = sysconf(_SC_PAGESIZE);
+
+	assert(ppriv);
+	/*
+	 * As rdma-core, UARs are mapped in size of OS page
+	 * size. Ref to libmlx5 function: mlx5_init_context()
+	 */
+	uar_va = (uintptr_t)txq_ctrl->bf_reg;
+	offset = uar_va & (page_size - 1); /* Offset in page. */
+	addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
+			txq_ctrl->uar_mmap_offset);
+	if (addr == MAP_FAILED) {
+		DRV_LOG(ERR,
+			"port %u mmap failed for BF reg of txq %u",
+			txq->port_id, txq->idx);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	addr = RTE_PTR_ADD(addr, offset);
+	ppriv->uar_table[txq->idx] = addr;
+	return 0;
+}
+
+/**
+ * Unmap UAR register of a Tx queue for secondary process.
+ *
+ * @param txq_ctrl
+ *   Pointer to Tx queue control structure.
+ */
+static void
+txq_uar_uninit_secondary(struct mlx5_txq_ctrl *txq_ctrl)
+{
+	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(txq_ctrl->priv));
+	const size_t page_size = sysconf(_SC_PAGESIZE);
+	void *addr;
+
+	addr = ppriv->uar_table[txq_ctrl->txq.idx];
+	munmap(RTE_PTR_ALIGN_FLOOR(addr, page_size), page_size);
+}
+
+/**
+ * Initialize Tx UAR registers for secondary process.
+ *
+ * @param dev
  *   Pointer to Ethernet device.
  * @param fd
  *   Verbs file descriptor to map UAR pages.
@@ -244,81 +330,36 @@  mlx5_tx_queue_release(void *dpdk_txq)
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd)
+mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	unsigned int i, j;
-	uintptr_t pages[priv->txqs_n];
-	unsigned int pages_n = 0;
-	uintptr_t uar_va;
-	uintptr_t off;
-	void *addr;
-	void *ret;
 	struct mlx5_txq_data *txq;
 	struct mlx5_txq_ctrl *txq_ctrl;
-	int already_mapped;
-	size_t page_size = sysconf(_SC_PAGESIZE);
-#ifndef RTE_ARCH_64
-	unsigned int lock_idx;
-#endif
+	unsigned int i;
+	int ret;
 
-	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
-	/*
-	 * As rdma-core, UARs are mapped in size of OS page size.
-	 * Use aligned address to avoid duplicate mmap.
-	 * Ref to libmlx5 function: mlx5_init_context()
-	 */
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
 	for (i = 0; i != priv->txqs_n; ++i) {
 		if (!(*priv->txqs)[i])
 			continue;
 		txq = (*priv->txqs)[i];
 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
 		assert(txq->idx == (uint16_t)i);
-		/* UAR addr form verbs used to find dup and offset in page. */
-		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
-		off = uar_va & (page_size - 1); /* offset in page. */
-		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
-		already_mapped = 0;
-		for (j = 0; j != pages_n; ++j) {
-			if (pages[j] == uar_va) {
-				already_mapped = 1;
-				break;
-			}
-		}
-		/* new address in reserved UAR address space. */
-		addr = RTE_PTR_ADD(mlx5_shared_data->uar_base,
-				   uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1));
-		if (!already_mapped) {
-			pages[pages_n++] = uar_va;
-			/* fixed mmap to specified address in reserved
-			 * address space.
-			 */
-			ret = mmap(addr, page_size,
-				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
-				   txq_ctrl->uar_mmap_offset);
-			if (ret != addr) {
-				/* fixed mmap have to return same address */
-				DRV_LOG(ERR,
-					"port %u call to mmap failed on UAR"
-					" for txq %u",
-					dev->data->port_id, txq->idx);
-				rte_errno = ENXIO;
-				return -rte_errno;
-			}
-		}
-		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */
-			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr, off);
-		else
-			assert(txq_ctrl->txq.bf_reg ==
-			       RTE_PTR_ADD((void *)addr, off));
-#ifndef RTE_ARCH_64
-		/* Assign a UAR lock according to UAR page number */
-		lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
-			   MLX5_UAR_PAGE_NUM_MASK;
-		txq->uar_lock = &priv->uar_lock[lock_idx];
-#endif
+		ret = txq_uar_init_secondary(txq_ctrl, fd);
+		if (ret)
+			goto error;
 	}
 	return 0;
+error:
+	/* Rollback. */
+	do {
+		if (!(*priv->txqs)[i])
+			continue;
+		txq = (*priv->txqs)[i];
+		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+		txq_uar_uninit_secondary(txq_ctrl);
+	} while (i--);
+	return -rte_errno;
 }
 
 /**
@@ -507,7 +548,6 @@  mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
-	txq_ctrl->bf_reg_orig = qp.bf.reg;
 	txq_data->cq_db = cq_info.dbrec;
 	txq_data->cqes =
 		(volatile struct mlx5_cqe (*)[])
@@ -521,6 +561,8 @@  mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 	txq_ibv->qp = tmpl.qp;
 	txq_ibv->cq = tmpl.cq;
 	rte_atomic32_inc(&txq_ibv->refcnt);
+	txq_ctrl->bf_reg = qp.bf.reg;
+	txq_uar_init(txq_ctrl);
 	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
 		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
 		DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%lx",
@@ -778,6 +820,7 @@  mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->priv = priv;
 	tmpl->socket = socket;
 	tmpl->txq.elts_n = log2above(desc);
+	tmpl->txq.port_id = dev->data->port_id;
 	tmpl->txq.idx = idx;
 	txq_set_params(tmpl);
 	DRV_LOG(DEBUG, "port %u device_attr.max_qp_wr is %d",
@@ -836,15 +879,12 @@  mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_txq_ctrl *txq;
-	size_t page_size = sysconf(_SC_PAGESIZE);
 
 	if (!(*priv->txqs)[idx])
 		return 0;
 	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
 	if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv))
 		txq->ibv = NULL;
-	munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg, page_size),
-	       page_size);
 	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
 		txq_free_elts(txq);
 		mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh);