net/mlx5: fix read device clock in real time mode
Checks
Commit Message
Since ConnectX-6DX the real time timestamp mode is supported.
The rte_eth_read_clock() routine queries current timestamp
value from the PMD.
The mlx5 PMD has special infrastructure to schedule packet
sending in real time mode which can be engaged with tx_pp devarg.
This infrastructure provides timestamp reading from the special
queue CQEs directly from host memory in user space, without
involving kernel calls.
The ConnectX-7 NIC has the hardware capability to schedule packet
sending without the special infrastructure, so the tx_pp devarg can be
omitted. If no tx_pp devarg is specified, the mlx5 PMD uses kernel
calls to query the current timestamp value. The kernel can be completely
unaware of the engaged real time mode; also, the kernel might use its
internal queue CQEs to get timestamps, which is neither precise nor
reliable. Inconsistent values might be returned, causing send
scheduling malfunction.
The HCA PCI BAR allows reading the real time directly from hardware.
This patch maps the PCI resource into the process address space on demand
and allows reading the real time timestamp values from the NIC
directly.
Fixes: b94d93ca73803 ("net/mlx5: support reading device clock")
Cc: stable@dpdk.org
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/mlx5_common.h | 1 +
drivers/common/mlx5/mlx5_prm.h | 5 +-
drivers/common/mlx5/version.map | 1 +
drivers/net/mlx5/linux/mlx5_ethdev_os.c | 68 +++++++++++++++++++++++
drivers/net/mlx5/mlx5.c | 6 +-
drivers/net/mlx5/mlx5.h | 4 ++
drivers/net/mlx5/mlx5_txpp.c | 15 ++++-
drivers/net/mlx5/windows/mlx5_ethdev_os.c | 30 ++++++++++
8 files changed, 127 insertions(+), 3 deletions(-)
Comments
Hi,
> -----Original Message-----
> From: Slava Ovsiienko <viacheslavo@nvidia.com>
> Sent: Tuesday, January 3, 2023 1:12 PM
> To: dev@dpdk.org
> Cc: Matan Azrad <matan@nvidia.com>; Raslan Darawsheh
> <rasland@nvidia.com>; Ori Kam <orika@nvidia.com>; stable@dpdk.org
> Subject: [PATCH] net/mlx5: fix read device clock in real time mode
>
> Since ConnectX-6DX the real time timestamp mode is supported.
> The rte_eth_read_clock() routine queries current timestamp value from the
> PMD.
>
> The mlx5 PMD has special infrastructure to schedule packet sending in real
> time mode which can be engaged with tx_pp devarg.
> This infrastructure provides the timestamp reading from the special queue
> CQEs directly from the host memory in user space, without involving kernel
> calls.
>
> The ConnectX-7 NIC has hardware capability to schedule packet sending
> without special infrastructure and tx_pp devarg can be omitted. If there is no
> tx_pp devarg specified the mlx5 uses kernel calls to query current timestamp
> value. The kernel can be completely unaware about engaged real time
> mode, also kernel might use its internal queue CQEs to get timestamps, that
> is neither precise nor reliable, inconsistent values might be returned, causing
> send scheduling malfunction.
>
> The HCA PCI BAR provides the real time direct reading from hardware.
> This patch maps PCI resource to the process address space on demand and
> allows reading the real time timestamp values from the NIC directly.
>
> Fixes: b94d93ca73803 ("net/mlx5: support reading device clock")
> Cc: stable@dpdk.org
>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
Patch applied to next-net-mlx,
Kindest regards,
Raslan Darawsheh
@@ -221,6 +221,7 @@ check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
* - 0 on success.
* - Negative value and rte_errno is set otherwise.
*/
+__rte_internal
int mlx5_dev_to_pci_str(const struct rte_device *dev, char *addr, size_t size);
/*
@@ -3040,6 +3040,7 @@ struct mlx5_ifc_health_buffer_bits {
u8 ext_synd[0x10];
};
+/* HCA PCI BAR resource structure. */
struct mlx5_ifc_initial_seg_bits {
u8 fw_rev_minor[0x10];
u8 fw_rev_major[0x10];
@@ -3067,7 +3068,9 @@ struct mlx5_ifc_initial_seg_bits {
u8 clear_int[0x1];
u8 health_syndrome[0x8];
u8 health_counter[0x18];
- u8 reserved_8[0x17fc0];
+ u8 reserved_8[0x160];
+ u8 real_time[0x40];
+ u8 reserved_9[0x17e20];
};
struct mlx5_ifc_create_cq_out_bits {
@@ -14,6 +14,7 @@ INTERNAL {
mlx5_dev_is_pci;
mlx5_dev_is_vf_pci;
+ mlx5_dev_to_pci_str;
mlx5_dev_mempool_unregister;
mlx5_dev_mempool_subscribe;
@@ -28,6 +28,7 @@
#include <bus_pci_driver.h>
#include <rte_mbuf.h>
#include <rte_common.h>
+#include <rte_eal_paging.h>
#include <rte_interrupts.h>
#include <rte_malloc.h>
#include <rte_string_fns.h>
@@ -1776,3 +1777,70 @@ int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
mlx5_free(sset_info);
return ret;
}
+
+/**
+ * Unmaps HCA PCI BAR from the current process address space, if mapped.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure (process_private may be NULL).
+ */
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
+{
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+
+ if (ppriv && ppriv->hca_bar) {
+ rte_mem_unmap(ppriv->hca_bar, MLX5_ST_SZ_BYTES(initial_seg));
+ ppriv->hca_bar = NULL;
+ }
+}
+
+/**
+ * Maps HCA PCI BAR (resource0) read-only to the current process address
+ * space and stores the pointer in the process private structure, allowing
+ * the internal and real time counters to be read directly from the HW.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, hca_bar in the process structure points to the mapped area.
+ * Negative errno value otherwise, rte_errno is set and hca_bar is not written.
+ */
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
+{
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+ char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
+ void *base, *expected = NULL;
+ int fd, ret;
+
+ if (!ppriv) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ if (ppriv->hca_bar)
+ return 0;
+ ret = mlx5_dev_to_pci_str(dev->device, pci_addr, sizeof(pci_addr));
+ if (ret < 0)
+ return -rte_errno;
+ /* Open PCI device resource 0 - HCA initial segment. */
+ MKSTR(name, "/sys/bus/pci/devices/%s/resource0", pci_addr);
+ fd = open(name, O_RDWR | O_SYNC);
+ if (fd == -1) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ base = rte_mem_map(NULL, MLX5_ST_SZ_BYTES(initial_seg),
+ RTE_PROT_READ, RTE_MAP_SHARED, fd, 0);
+ close(fd);
+ if (!base) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ /* Publish the mapping; drop ours if hca_bar was already set concurrently. */
+ if (!__atomic_compare_exchange_n(&ppriv->hca_bar, &expected,
+ base, false,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+ rte_mem_unmap(base, MLX5_ST_SZ_BYTES(initial_seg));
+ return 0;
+}
+
@@ -1977,8 +1977,12 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
- if (!dev->process_private)
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+
+ if (!ppriv)
return;
+ if (ppriv->hca_bar)
+ mlx5_txpp_unmap_hca_bar(dev);
mlx5_free(dev->process_private);
dev->process_private = NULL;
}
@@ -1463,6 +1463,8 @@ struct mlx5_dev_ctx_shared {
* Caution, secondary process may rebuild the struct during port start.
*/
struct mlx5_proc_priv {
+ void *hca_bar;
+ /* Mapped HCA PCI BAR area. */
size_t uar_table_sz;
/* Size of UAR register table. */
struct mlx5_uar_data uar_table[];
@@ -2163,6 +2165,8 @@ int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev,
struct rte_eth_xstat_name *xstats_names,
unsigned int n, unsigned int n_used);
void mlx5_txpp_interrupt_handler(void *cb_arg);
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev);
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev);
/* mlx5_rxtx.c */
@@ -969,6 +969,8 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
{
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_dev_ctx_shared *sh = priv->sh;
+ struct mlx5_proc_priv *ppriv;
+ uint64_t ts;
int ret;
if (sh->txpp.refcnt) {
@@ -979,7 +981,6 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
rte_int128_t u128;
struct mlx5_cqe_ts cts;
} to;
- uint64_t ts;
mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
if (to.cts.op_own >> 4) {
@@ -994,6 +995,18 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
*timestamp = ts;
return 0;
}
+ /* Check and try to map HCA PCI BAR to allow reading real time. */
+ ppriv = dev->process_private;
+ if (ppriv && !ppriv->hca_bar &&
+ sh->dev_cap.rt_timestamp && mlx5_dev_is_pci(dev->device))
+ mlx5_txpp_map_hca_bar(dev);
+ /* Check if we can read timestamp directly from hardware. */
+ if (ppriv && ppriv->hca_bar) {
+ ts = MLX5_GET64(initial_seg, ppriv->hca_bar, real_time);
+ ts = mlx5_txpp_convert_rx_ts(sh, ts);
+ *timestamp = ts;
+ return 0;
+ }
/* Not supported in isolated mode - kernel does not see the CQEs. */
if (priv->isolated || rte_eal_process_type() != RTE_PROC_PRIMARY)
return -ENOTSUP;
@@ -416,3 +416,33 @@ int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
RTE_SET_USED(dev);
return -ENOTSUP;
}
+
+/**
+ * Stub: HCA PCI BAR mapping is not supported here, nothing to unmap.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure (unused).
+ */
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
+{
+ RTE_SET_USED(dev);
+}
+
+/**
+ * Maps HCA PCI BAR to the current process address space.
+ * Stub: the mapping is not implemented here, so the call always
+ * fails and the process private structure is not touched.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure (unused).
+ *
+ * @return
+ * Never 0 in this implementation.
+ * Always -ENOTSUP, rte_errno is set to ENOTSUP.
+ */
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
+{
+ RTE_SET_USED(dev);
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+}