[v3,09/17] net/mlx5: introduce clock queue service routine
diff mbox series

Message ID 1594887800-6563-10-git-send-email-viacheslavo@mellanox.com
State Accepted
Delegated to: Raslan Darawsheh
Headers show
Series
  • net/mlx5: introduce accurate packet Tx scheduling
Related show

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Slava Ovsiienko July 16, 2020, 8:23 a.m. UTC
The service routine is invoked periodically on Rearm Queue
completion interrupts, typically once every few milliseconds
(1-16), to track clock jitter and wander in a robust fashion.
It performs the following:

- fetches the completed CQEs for Rearm Queue
- restarts Rearm Queue on errors
- pushes new requests to Rearm Queue to make it
  continuously running and pushing cross-channel requests
  to Clock Queue
- reads and caches the Clock Queue CQE to be used in datapath
- gathers statistics to estimate clock jitter and wander
- gathers Clock Queue errors statistics

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx5/mlx5.h      |  16 ++
 drivers/net/mlx5/mlx5_defs.h |   1 +
 drivers/net/mlx5/mlx5_rxtx.h |  20 +++
 drivers/net/mlx5/mlx5_txpp.c | 338 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 375 insertions(+)

Patch
diff mbox series

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e8a7b10..bb2c096 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -555,6 +555,12 @@  struct mlx5_txpp_wq {
 	volatile uint32_t *sq_dbrec;
 };
 
+/* Tx packet pacing internal timestamp. */
+struct mlx5_txpp_ts {
+	rte_atomic64_t ci_ts;
+	rte_atomic64_t ts;
+};
+
 /* Tx packet pacing structure. */
 struct mlx5_dev_txpp {
 	pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
@@ -570,6 +576,15 @@  struct mlx5_dev_txpp {
 	struct mlx5_txpp_wq rearm_queue; /* Clock Queue. */
 	struct mlx5dv_pp *pp; /* Packet pacing context. */
 	uint16_t pp_id; /* Packet pacing context index. */
+	uint16_t ts_n; /* Number of captured timestamps. */
+	uint16_t ts_p; /* Pointer to statistics timestamp. */
+	struct mlx5_txpp_ts *tsa; /* Timestamps sliding window stats. */
+	struct mlx5_txpp_ts ts; /* Cached completion id/timestamp. */
+	uint32_t sync_lost:1; /* ci/timestamp synchronization lost. */
+	/* Statistics counters. */
+	rte_atomic32_t err_miss_int; /* Missed service interrupt. */
+	rte_atomic32_t err_rearm_queue; /* Rearm Queue errors. */
+	rte_atomic32_t err_clock_queue; /* Clock Queue errors. */
 };
 
 /*
@@ -993,5 +1008,6 @@  void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
 
 int mlx5_txpp_start(struct rte_eth_dev *dev);
 void mlx5_txpp_stop(struct rte_eth_dev *dev);
+void mlx5_txpp_interrupt_handler(void *cb_arg);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index a8626a4..7ed3e88 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -172,6 +172,7 @@ 
 #define MLX5_TXDB_HEURISTIC 2
 
 /* Tx accurate scheduling on timestamps parameters. */
+#define MLX5_TXPP_WAIT_INIT_TS 1000ul /* How long to wait timestamp. */
 #define MLX5_TXPP_CLKQ_SIZE 1
 #define MLX5_TXPP_REARM	((1UL << MLX5_WQ_INDEX_WIDTH) / 4)
 #define MLX5_TXPP_REARM_SQ_SIZE (((1UL << MLX5_CQ_INDEX_WIDTH) / \
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b797da..8a8d2b5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -30,6 +30,7 @@ 
 #include <rte_io.h>
 #include <rte_bus_pci.h>
 #include <rte_malloc.h>
+#include <rte_cycles.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_prm.h>
@@ -695,4 +696,23 @@  int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova,
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
+/**
+ * Convert timestamp from the Packet Pacing Clock Queue CQE
+ * timestamp format (HW format) to a linear counter.
+ *
+ * @param sh
+ *   Pointer to the device shared context. Might be needed
+ *   to convert according to current device configuration.
+ * @param ts
+ *   Timestamp from CQE to convert.
+ * @return
+ *   UTC in nanoseconds
+ */
+static __rte_always_inline uint64_t
+mlx5_txpp_convert_rx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t ts)
+{
+	RTE_SET_USED(sh);
+	return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index a0ee872..7ed5d2c 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -1,6 +1,9 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2020 Mellanox Technologies, Ltd
  */
+#include <fcntl.h>
+#include <stdint.h>
+
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_interrupts.h>
@@ -144,6 +147,33 @@ 
 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
 
 	mlx5_txpp_destroy_send_queue(wq);
+	if (sh->txpp.tsa) {
+		rte_free(sh->txpp.tsa);
+		sh->txpp.tsa = NULL;
+	}
+}
+
+static void
+mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared *sh, uint16_t ci)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	union {
+		uint32_t w32[2];
+		uint64_t w64;
+	} cs;
+
+	wq->sq_ci = ci + 1;
+	cs.w32[0] = rte_cpu_to_be_32(rte_be_to_cpu_32
+		   (wq->wqes[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
+	cs.w32[1] = wq->wqes[ci & (wq->sq_size - 1)].ctrl[1];
+	/* Update SQ doorbell record with new SQ ci. */
+	rte_compiler_barrier();
+	*wq->sq_dbrec = rte_cpu_to_be_32(wq->sq_ci);
+	/* Make sure the doorbell record is updated. */
+	rte_wmb();
+	/* Write to doorbell register to start processing. */
+	__mlx5_uar_write64_relaxed(cs.w64, sh->tx_uar->reg_addr, NULL);
+	rte_wmb();
 }
 
 static void
@@ -433,6 +463,16 @@ 
 	uint32_t umem_size, umem_dbrec;
 	int ret;
 
+	sh->txpp.tsa = rte_zmalloc_socket(__func__,
+					   MLX5_TXPP_REARM_SQ_SIZE *
+					   sizeof(struct mlx5_txpp_ts),
+					   0, sh->numa_node);
+	if (!sh->txpp.tsa) {
+		DRV_LOG(ERR, "Failed to allocate memory for CQ stats.");
+		return -ENOMEM;
+	}
+	sh->txpp.ts_p = 0;
+	sh->txpp.ts_n = 0;
 	/* Allocate memory buffer for CQEs and doorbell record. */
 	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
@@ -562,6 +602,299 @@ 
 	return ret;
 }
 
+/* Enable notification from the Rearm Queue CQ. */
+static inline void
+mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *aq = &sh->txpp.rearm_queue;
+	uint32_t arm_sn = aq->arm_sn << MLX5_CQ_SQN_OFFSET;
+	uint32_t db_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | aq->cq_ci;
+	uint64_t db_be = rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq->id);
+	uint32_t *addr = RTE_PTR_ADD(sh->tx_uar->base_addr, MLX5_CQ_DOORBELL);
+
+	rte_compiler_barrier();
+	aq->cq_dbrec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(db_hi);
+	rte_wmb();
+#ifdef RTE_ARCH_64
+	*(uint64_t *)addr = db_be;
+#else
+	*(uint32_t *)addr = db_be;
+	rte_io_wmb();
+	*((uint32_t *)addr + 1) = db_be >> 32;
+#endif
+	aq->arm_sn++;
+}
+
+static inline void
+mlx5_atomic_read_cqe(rte_int128_t *from, rte_int128_t *ts)
+{
+	/*
+	 * The only CQE of the Clock Queue is continuously
+	 * updated by hardware at the specified rate. We have to
+	 * read the timestamp and WQE completion index atomically.
+	 */
+#if defined(RTE_ARCH_PPC_64) || defined(RTE_ARCH_32)
+	rte_atomic64_t *cqe = (rte_atomic64_t *)from;
+
+	/* Power architecture does not support 16B compare-and-swap. */
+	for (;;) {
+		int64_t tm, op;
+		int64_t *ps;
+
+		rte_compiler_barrier();
+		tm = rte_atomic64_read(cqe + 0);
+		op = rte_atomic64_read(cqe + 1);
+		rte_compiler_barrier();
+		if (tm != rte_atomic64_read(cqe + 0))
+			continue;
+		if (op != rte_atomic64_read(cqe + 1))
+			continue;
+		ps = (int64_t *)ts;
+		ps[0] = tm;
+		ps[1] = op;
+		return;
+	}
+#else
+	rte_int128_t src;
+
+	memset(&src, 0, sizeof(src));
+	*ts = src;
+	/* if (*from == *ts) *from = *src else *ts = *from; */
+	rte_atomic128_cmp_exchange(from, ts, &src, 0,
+				   __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif
+}
+
+/* Stores timestamp in the cache structure to share data with datapath. */
+static inline void
+mlx5_txpp_cache_timestamp(struct mlx5_dev_ctx_shared *sh,
+			   uint64_t ts, uint64_t ci)
+{
+	ci = ci << (64 - MLX5_CQ_INDEX_WIDTH);
+	ci |= (ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
+	rte_compiler_barrier();
+	rte_atomic64_set(&sh->txpp.ts.ts, ts);
+	rte_atomic64_set(&sh->txpp.ts.ci_ts, ci);
+	rte_wmb();
+}
+
+/* Reads timestamp from Clock Queue CQE and stores in the cache. */
+static inline void
+mlx5_txpp_update_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
+	union {
+		rte_int128_t u128;
+		struct mlx5_cqe_ts cts;
+	} to;
+	uint64_t ts;
+	uint16_t ci;
+
+	static_assert(sizeof(struct mlx5_cqe_ts) == sizeof(rte_int128_t),
+		      "Wrong timestamp CQE part size");
+	mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
+	if (to.cts.op_own >> 4) {
+		DRV_LOG(DEBUG, "Clock Queue error sync lost.");
+		rte_atomic32_inc(&sh->txpp.err_clock_queue);
+		sh->txpp.sync_lost = 1;
+		return;
+	}
+	ci = rte_be_to_cpu_16(to.cts.wqe_counter);
+	ts = rte_be_to_cpu_64(to.cts.timestamp);
+	ts = mlx5_txpp_convert_rx_ts(sh, ts);
+	wq->cq_ci += (ci - wq->sq_ci) & UINT16_MAX;
+	wq->sq_ci = ci;
+	mlx5_txpp_cache_timestamp(sh, ts, wq->cq_ci);
+}
+
+/* Gather statistics for timestamp from Clock Queue CQE. */
+static inline void
+mlx5_txpp_gather_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	/* Check whether we have a valid timestamp. */
+	if (!sh->txpp.clock_queue.sq_ci && !sh->txpp.ts_n)
+		return;
+	MLX5_ASSERT(sh->txpp.ts_p < MLX5_TXPP_REARM_SQ_SIZE);
+	sh->txpp.tsa[sh->txpp.ts_p] = sh->txpp.ts;
+	if (++sh->txpp.ts_p >= MLX5_TXPP_REARM_SQ_SIZE)
+		sh->txpp.ts_p = 0;
+	if (sh->txpp.ts_n < MLX5_TXPP_REARM_SQ_SIZE)
+		++sh->txpp.ts_n;
+}
+
+/* Waits for the first completion on Clock Queue to init timestamp. */
+static inline void
+mlx5_txpp_init_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	uint32_t wait;
+
+	sh->txpp.ts_p = 0;
+	sh->txpp.ts_n = 0;
+	for (wait = 0; wait < MLX5_TXPP_WAIT_INIT_TS; wait++) {
+		struct timespec onems;
+
+		mlx5_txpp_update_timestamp(sh);
+		if (wq->sq_ci)
+			return;
+		/* Wait one millisecond and try again. */
+		onems.tv_sec = 0;
+		onems.tv_nsec = NS_PER_S / MS_PER_S;
+		nanosleep(&onems, 0);
+	}
+	DRV_LOG(ERR, "Unable to initialize timestamp.");
+	sh->txpp.sync_lost = 1;
+}
+
+#ifdef HAVE_IBV_DEVX_EVENT
+/* Handles Rearm Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	uint32_t cq_ci = wq->cq_ci;
+	bool error = false;
+	int ret;
+
+	do {
+		volatile struct mlx5_cqe *cqe;
+
+		cqe = &wq->cqes[cq_ci & (MLX5_TXPP_REARM_CQ_SIZE - 1)];
+		ret = check_cqe(cqe, MLX5_TXPP_REARM_CQ_SIZE, cq_ci);
+		switch (ret) {
+		case MLX5_CQE_STATUS_ERR:
+			error = true;
+			++cq_ci;
+			break;
+		case MLX5_CQE_STATUS_SW_OWN:
+			wq->sq_ci += 2;
+			++cq_ci;
+			break;
+		case MLX5_CQE_STATUS_HW_OWN:
+			break;
+		default:
+			MLX5_ASSERT(false);
+			break;
+		}
+	} while (ret != MLX5_CQE_STATUS_HW_OWN);
+	if (likely(cq_ci != wq->cq_ci)) {
+		/* Check whether we have missed interrupts. */
+		if (cq_ci - wq->cq_ci != 1) {
+			DRV_LOG(DEBUG, "Rearm Queue missed interrupt.");
+			rte_atomic32_inc(&sh->txpp.err_miss_int);
+			/* Check sync lost on wqe index. */
+			if (cq_ci - wq->cq_ci >=
+				(((1UL << MLX5_WQ_INDEX_WIDTH) /
+				  MLX5_TXPP_REARM) - 1))
+				error = 1;
+		}
+		/* Update doorbell record to notify hardware. */
+		rte_compiler_barrier();
+		*wq->cq_dbrec = rte_cpu_to_be_32(cq_ci);
+		rte_wmb();
+		wq->cq_ci = cq_ci;
+		/* Account Rearm Queue errors and flag the lost sync. */
+		if (error) {
+			DRV_LOG(DEBUG, "Rearm Queue error sync lost.");
+			rte_atomic32_inc(&sh->txpp.err_rearm_queue);
+			sh->txpp.sync_lost = 1;
+		}
+	}
+}
+
+/* Handles Clock Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	mlx5_txpp_update_timestamp(sh);
+	mlx5_txpp_gather_timestamp(sh);
+}
+#endif
+
+/* Invoked periodically on Rearm Queue completions. */
+void
+mlx5_txpp_interrupt_handler(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_EVENT
+	RTE_SET_USED(cb_arg);
+	return;
+#else
+	struct mlx5_dev_ctx_shared *sh = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* Process events in the loop. Only rearm completions are expected. */
+	while (mlx5_glue->devx_get_event
+				(sh->txpp.echan,
+				 &out.event_resp,
+				 sizeof(out.buf)) >=
+				 (ssize_t)sizeof(out.event_resp.cookie)) {
+		mlx5_txpp_handle_rearm_queue(sh);
+		mlx5_txpp_handle_clock_queue(sh);
+		mlx5_txpp_cq_arm(sh);
+		mlx5_txpp_doorbell_rearm_queue
+					(sh, sh->txpp.rearm_queue.sq_ci - 1);
+	}
+#endif /* HAVE_IBV_DEVX_EVENT */
+}
+
+static void
+mlx5_txpp_stop_service(struct mlx5_dev_ctx_shared *sh)
+{
+	if (!sh->txpp.intr_handle.fd)
+		return;
+	mlx5_intr_callback_unregister(&sh->txpp.intr_handle,
+				      mlx5_txpp_interrupt_handler, sh);
+	sh->txpp.intr_handle.fd = 0;
+}
+
+/* Attaches interrupt handler and fires first request to Rearm Queue. */
+static int
+mlx5_txpp_start_service(struct mlx5_dev_ctx_shared *sh)
+{
+	uint16_t event_nums[1] = {0};
+	int flags;
+	int ret;
+
+	/* Attach interrupt handler to process Rearm Queue completions. */
+	flags = fcntl(sh->txpp.echan->fd, F_GETFL);
+	ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change event channel FD.");
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	memset(&sh->txpp.intr_handle, 0, sizeof(sh->txpp.intr_handle));
+	sh->txpp.intr_handle.fd = sh->txpp.echan->fd;
+	sh->txpp.intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&sh->txpp.intr_handle,
+				       mlx5_txpp_interrupt_handler, sh)) {
+		sh->txpp.intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
+		return -rte_errno;
+	}
+	/* Subscribe CQ event to the event channel controlled by the driver. */
+	ret = mlx5_glue->devx_subscribe_devx_event(sh->txpp.echan,
+						   sh->txpp.rearm_queue.cq->obj,
+						   sizeof(event_nums),
+						   event_nums, 0);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe CQE event.");
+		rte_errno = errno;
+		return -errno;
+	}
+	/* Enable interrupts in the CQ. */
+	mlx5_txpp_cq_arm(sh);
+	/* Fire the first request on Rearm Queue. */
+	mlx5_txpp_doorbell_rearm_queue(sh, sh->txpp.rearm_queue.sq_size - 1);
+	mlx5_txpp_init_timestamp(sh);
+	return 0;
+}
+
 /*
  * The routine initializes the packet pacing infrastructure:
  * - allocates PP context
@@ -595,8 +928,12 @@ 
 	ret = mlx5_txpp_create_rearm_queue(sh);
 	if (ret)
 		goto exit;
+	ret = mlx5_txpp_start_service(sh);
+	if (ret)
+		goto exit;
 exit:
 	if (ret) {
+		mlx5_txpp_stop_service(sh);
 		mlx5_txpp_destroy_rearm_queue(sh);
 		mlx5_txpp_destroy_clock_queue(sh);
 		mlx5_txpp_free_pp_index(sh);
@@ -618,6 +955,7 @@ 
 static void
 mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
 {
+	mlx5_txpp_stop_service(sh);
 	mlx5_txpp_destroy_rearm_queue(sh);
 	mlx5_txpp_destroy_clock_queue(sh);
 	mlx5_txpp_free_pp_index(sh);