net/mlx5: fix transmit doorbell register write barrier

Message ID 1568789674-11264-1-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Changes Requested, archived
Delegated to: Raslan Darawsheh
Headers
Series net/mlx5: fix transmit doorbell register write barrier |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-dpdk_compile success Compile Testing PASS
ci/iol-dpdk_compile_spdk success Compile Testing PASS
ci/iol-dpdk_compile_ovs success Compile Testing PASS
ci/intel-Performance success Performance Testing PASS
ci/mellanox-Performance fail Performance Testing issues
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko Sept. 18, 2019, 6:54 a.m. UTC
  The rdma core library can map doorbell register in two ways,
depending on the environment variable "MLX5_SHUT_UP_BF":

  - as regular cached memory, the variable is either missing or
    set to zero. This type of mapping may cause the significant
    doorbell register writing latency and requires explicit
    memory write barrier to mitigate this issue and prevent
    write combining.

  - as non-cached memory, the variable is present and set to
    not "0" value. This type of mapping may cause performance
    impact under heavy loading conditions but the explicit write
    memory barrier is not required and it may improve core
    performance.

This patch checks the mapping type and provides the memory
barrier after writing to tx doorbell register if it is needed.
The mapping type is extracted directly from the uar_mmap_offset
field in the queue properties.

Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template")

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/Makefile    |  5 +++++
 drivers/net/mlx5/meson.build |  2 ++
 drivers/net/mlx5/mlx5_defs.h |  8 ++++++++
 drivers/net/mlx5/mlx5_rxtx.c | 17 ++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |  1 +
 drivers/net/mlx5/mlx5_txq.c  | 17 ++++++++++++++++-
 6 files changed, 48 insertions(+), 2 deletions(-)
  

Comments

Matan Azrad Sept. 22, 2019, 6:58 a.m. UTC | #1
Hi Slava

Questions inline.

From: Viacheslav Ovsiienko
> The rdma core library can map doorbell register in two ways, depending on
> the environment variable "MLX5_SHUT_UP_BF":
> 
>   - as regular cached memory, the variable is either missing or
>     set to zero. This type of mapping may cause the significant
>     doorbell register writing latency and requires explicit
>     memory write barrier to mitigate this issue and prevent
>     write combining.
> 
>   - as non-cached memory, the variable is present and set to
>     not "0" value. This type of mapping may cause performance
>     impact under heavy loading conditions but the explicit write
>     memory barrier is not required and it may improve core
>     performance.
> 
> This patch checks the mapping type and provides the memory barrier after
> writing to tx doorbell register if it is needed.
> The mapping type is extracted directly from the uar_mmap_offset field in
> the queue properties.
> 
> Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template")
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/Makefile    |  5 +++++
>  drivers/net/mlx5/meson.build |  2 ++
>  drivers/net/mlx5/mlx5_defs.h |  8 ++++++++  drivers/net/mlx5/mlx5_rxtx.c
> | 17 ++++++++++++++++-  drivers/net/mlx5/mlx5_rxtx.h |  1 +
> drivers/net/mlx5/mlx5_txq.c  | 17 ++++++++++++++++-
>  6 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index
> d89a7b5..7100fa3 100644
> --- a/drivers/net/mlx5/Makefile
> +++ b/drivers/net/mlx5/Makefile
> @@ -189,6 +189,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-
> config-h.sh
>  		func mlx5dv_dr_action_create_dest_devx_tir \
>  		$(AUTOCONF_OUTPUT)
>  	$Q sh -- '$<' '$@' \
> +		HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD \
> +		infiniband/mlx5dv.h \
> +		enum MLX5_MMAP_GET_NC_PAGES_CMD \
> +		$(AUTOCONF_OUTPUT)
> +	$Q sh -- '$<' '$@' \
>  		HAVE_ETHTOOL_LINK_MODE_25G \
>  		/usr/include/linux/ethtool.h \
>  		enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \ diff --
> git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build index
> fb764fa..e56e018 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -128,6 +128,8 @@ if build
>  		'mlx5dv_devx_obj_query_async' ],
>  		[ 'HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR',
> 'infiniband/mlx5dv.h',
>  		'mlx5dv_dr_action_create_dest_devx_tir' ],
> +		[ 'HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD',
> 'infiniband/mlx5dv.h',
> +		'MLX5_MMAP_GET_NC_PAGES_CMD' ],
>  		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
>  		'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ],
>  		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h', diff --
> git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index
> d7440fd..03a8086 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -115,6 +115,14 @@
>  #define MLX5_UAR_PAGE_NUM_MAX 64
>  #define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) -
> 1)
> 
> +/* Fields of memory mapping type in offset parameter of mmap() */
> +#define MLX5_UAR_MMAP_CMD_SHIFT 8 #define
> MLX5_UAR_MMAP_CMD_MASK 0xff
> +
> +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD #define
> +MLX5_MMAP_GET_NC_PAGES_CMD 3 #endif
> +
>  /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
> #define MLX5_MPRQ_STRIDE_NUM_N 6U
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index f540977..fa3aa15 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -4730,8 +4730,23 @@ enum mlx5_txcmp_code {
>  	 * to improve latencies. The pure software related data treatment
>  	 * can be completed after doorbell. Tx CQEs for this SQ are
>  	 * processed in this thread only by the polling.
> +	 *
> +	 * The rdma core library can map doorbell register in two ways,
> +	 * depending on the environment variable "MLX5_SHUT_UP_BF":
> +	 *
> +	 * - as regular cached memory, the variable is either missing or
> +	 *   set to zero. This type of mapping may cause the significant
> +	 *   doorbell register writing latency and requires explicit
> +	 *   memory write barrier to mitigate this issue and prevent
> +	 *   write combining.
> +	 *
> +	 * - as non-cached memory, the variable is present and set to
> +	 *   not "0" value. This type of mapping may cause performance
> +	 *   impact under heavy loading conditions but the explicit write
> +	 *   memory barrier is not required and it may improve core
> +	 *   performance.
>  	 */
> -	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
> +	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc);
>  	/* Not all of the mbufs may be stored into elts yet. */
>  	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent -
> loc.pkts_copy;
>  	if (!MLX5_TXOFF_CONFIG(INLINE) && part) { diff --git
> a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index
> 4f73d91..ae3a763 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -277,6 +277,7 @@ struct mlx5_txq_data {
>  	/* When set TX offload for tunneled packets are supported. */
>  	uint16_t swp_en:1; /* Whether SW parser is enabled. */
>  	uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
> +	uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
>  	uint16_t inlen_send; /* Ordinary send data inline size. */
>  	uint16_t inlen_empw; /* eMPW max packet size to inline. */
>  	uint16_t inlen_mode; /* Minimal data length to inline. */ diff --git
> a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 2b7d6c0..7ec2ef3 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -18,6 +18,7 @@
>  #pragma GCC diagnostic ignored "-Wpedantic"
>  #endif
>  #include <infiniband/verbs.h>
> +#include <infiniband/mlx5dv.h>
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> @@ -241,14 +242,21 @@
>  {
>  	struct mlx5_priv *priv = txq_ctrl->priv;
>  	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
> +	const size_t page_size = sysconf(_SC_PAGESIZE);
> +	unsigned int cmd;
>  #ifndef RTE_ARCH_64
>  	unsigned int lock_idx;
> -	const size_t page_size = sysconf(_SC_PAGESIZE);
>  #endif
> 
>  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>  	assert(ppriv);
>  	ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
> +	/* Check the doorbell register mapping type. */
> +	cmd = txq_ctrl->uar_mmap_offset / page_size;
> +	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> +	cmd &= MLX5_UAR_MMAP_CMD_MASK;
> +	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> +		txq_ctrl->txq.db_nc = 1;

Are you sure we can't retrieve the value in compile time?

>  #ifndef RTE_ARCH_64
>  	/* Assign an UAR lock according to UAR page number */
>  	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & @@ -281,6
> +289,7 @@
>  	uintptr_t uar_va;
>  	uintptr_t offset;
>  	const size_t page_size = sysconf(_SC_PAGESIZE);
> +	unsigned int cmd;
> 
>  	assert(ppriv);
>  	/*
> @@ -300,6 +309,12 @@
>  	}
>  	addr = RTE_PTR_ADD(addr, offset);
>  	ppriv->uar_table[txq->idx] = addr;
> +	/* Check the doorbell register mapping type. */
> +	cmd = txq_ctrl->uar_mmap_offset / page_size;
> +	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> +	cmd &= MLX5_UAR_MMAP_CMD_MASK;
> +	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> +		txq_ctrl->txq.db_nc = 1;

Looks like double code.
Maybe it is better to get the value by a MACRO?


>  	return 0;
>  }
> 
> --
> 1.8.3.1
  
Slava Ovsiienko Sept. 22, 2019, 10:13 a.m. UTC | #2
> -----Original Message-----
> From: Matan Azrad <matan@mellanox.com>
> Sent: Sunday, September 22, 2019 9:59
> To: Slava Ovsiienko <viacheslavo@mellanox.com>; dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@mellanox.com>
> Subject: RE: [PATCH] net/mlx5: fix transmit doorbell register write barrier
> 
> Hi Slava
> 
> Questions inline.
> 

[.. skipped ...]


> > +	/* Check the doorbell register mapping type. */
> > +	cmd = txq_ctrl->uar_mmap_offset / page_size;
> > +	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> > +	cmd &= MLX5_UAR_MMAP_CMD_MASK;
> > +	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> > +		txq_ctrl->txq.db_nc = 1;
> 
> Are you sure we can't retrieve the value in compile time?
It is deduced by rdma_core library in runtime,
grounding on the MLX5_SHUT_UP_BF environment variable.
This variable is checked by rdma library in ibv device open calls
(both verbs/dv).

> 
> >  #ifndef RTE_ARCH_64
> >  	/* Assign an UAR lock according to UAR page number */
> >  	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & @@ -281,6
> > +289,7 @@
> >  	uintptr_t uar_va;
> >  	uintptr_t offset;
> >  	const size_t page_size = sysconf(_SC_PAGESIZE);
> > +	unsigned int cmd;
> >
> >  	assert(ppriv);
> >  	/*
> > @@ -300,6 +309,12 @@
> >  	}
> >  	addr = RTE_PTR_ADD(addr, offset);
> >  	ppriv->uar_table[txq->idx] = addr;
> > +	/* Check the doorbell register mapping type. */
> > +	cmd = txq_ctrl->uar_mmap_offset / page_size;
> > +	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> > +	cmd &= MLX5_UAR_MMAP_CMD_MASK;
> > +	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> > +		txq_ctrl->txq.db_nc = 1;
> 
> Looks like double code.
> Maybe it is better to get the value by a MACRO?
Yes, it would be better.

WBR, Slava
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index d89a7b5..7100fa3 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -189,6 +189,11 @@  mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		func mlx5dv_dr_action_create_dest_devx_tir \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD \
+		infiniband/mlx5dv.h \
+		enum MLX5_MMAP_GET_NC_PAGES_CMD \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_ETHTOOL_LINK_MODE_25G \
 		/usr/include/linux/ethtool.h \
 		enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index fb764fa..e56e018 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -128,6 +128,8 @@  if build
 		'mlx5dv_devx_obj_query_async' ],
 		[ 'HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR', 'infiniband/mlx5dv.h',
 		'mlx5dv_dr_action_create_dest_devx_tir' ],
+		[ 'HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD', 'infiniband/mlx5dv.h',
+		'MLX5_MMAP_GET_NC_PAGES_CMD' ],
 		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
 		'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ],
 		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h',
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index d7440fd..03a8086 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -115,6 +115,14 @@ 
 #define MLX5_UAR_PAGE_NUM_MAX 64
 #define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1)
 
+/* Fields of memory mapping type in offset parameter of mmap() */
+#define MLX5_UAR_MMAP_CMD_SHIFT 8
+#define MLX5_UAR_MMAP_CMD_MASK 0xff
+
+#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD
+#define MLX5_MMAP_GET_NC_PAGES_CMD 3
+#endif
+
 /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
 #define MLX5_MPRQ_STRIDE_NUM_N 6U
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f540977..fa3aa15 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -4730,8 +4730,23 @@  enum mlx5_txcmp_code {
 	 * to improve latencies. The pure software related data treatment
 	 * can be completed after doorbell. Tx CQEs for this SQ are
 	 * processed in this thread only by the polling.
+	 *
+	 * The rdma core library can map doorbell register in two ways,
+	 * depending on the environment variable "MLX5_SHUT_UP_BF":
+	 *
+	 * - as regular cached memory, the variable is either missing or
+	 *   set to zero. This type of mapping may cause the significant
+	 *   doorbell register writing latency and requires explicit
+	 *   memory write barrier to mitigate this issue and prevent
+	 *   write combining.
+	 *
+	 * - as non-cached memory, the variable is present and set to
+	 *   not "0" value. This type of mapping may cause performance
+	 *   impact under heavy loading conditions but the explicit write
+	 *   memory barrier is not required and it may improve core
+	 *   performance.
 	 */
-	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
+	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc);
 	/* Not all of the mbufs may be stored into elts yet. */
 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 4f73d91..ae3a763 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -277,6 +277,7 @@  struct mlx5_txq_data {
 	/* When set TX offload for tunneled packets are supported. */
 	uint16_t swp_en:1; /* Whether SW parser is enabled. */
 	uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
+	uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
 	uint16_t inlen_send; /* Ordinary send data inline size. */
 	uint16_t inlen_empw; /* eMPW max packet size to inline. */
 	uint16_t inlen_mode; /* Minimal data length to inline. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 2b7d6c0..7ec2ef3 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -18,6 +18,7 @@ 
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
@@ -241,14 +242,21 @@ 
 {
 	struct mlx5_priv *priv = txq_ctrl->priv;
 	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
+	const size_t page_size = sysconf(_SC_PAGESIZE);
+	unsigned int cmd;
 #ifndef RTE_ARCH_64
 	unsigned int lock_idx;
-	const size_t page_size = sysconf(_SC_PAGESIZE);
 #endif
 
 	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	assert(ppriv);
 	ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
+	/* Check the doorbell register mapping type. */
+	cmd = txq_ctrl->uar_mmap_offset / page_size;
+	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
+	cmd &= MLX5_UAR_MMAP_CMD_MASK;
+	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
+		txq_ctrl->txq.db_nc = 1;
 #ifndef RTE_ARCH_64
 	/* Assign an UAR lock according to UAR page number */
 	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
@@ -281,6 +289,7 @@ 
 	uintptr_t uar_va;
 	uintptr_t offset;
 	const size_t page_size = sysconf(_SC_PAGESIZE);
+	unsigned int cmd;
 
 	assert(ppriv);
 	/*
@@ -300,6 +309,12 @@ 
 	}
 	addr = RTE_PTR_ADD(addr, offset);
 	ppriv->uar_table[txq->idx] = addr;
+	/* Check the doorbell register mapping type. */
+	cmd = txq_ctrl->uar_mmap_offset / page_size;
+	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
+	cmd &= MLX5_UAR_MMAP_CMD_MASK;
+	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
+		txq_ctrl->txq.db_nc = 1;
 	return 0;
 }