[v2,3/4] pcapng: modify timestamp calculation

Message ID 20231005230648.68244-4-stephen@networkplumber.org (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series dumpcap and pcapng fixes |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Stephen Hemminger Oct. 5, 2023, 11:06 p.m. UTC
  The computation of timestamp is best done in the part of
pcapng library that is in secondary process.
The secondary process is already doing a bunch of system
calls which makes it not performance sensitive.

Simplify the computation of nanoseconds from TSC to a two
step process which avoids numeric overflow issues. The previous
code was not thread safe as well.

Fixes: c882eb544842 ("pcapng: fix timestamp wrapping in output files")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/dumpcap/main.c      |  25 +++------
 app/test/test_pcapng.c  |   4 +-
 lib/graph/graph_pcap.c  |   2 +-
 lib/pcapng/rte_pcapng.c | 119 +++++++++++++++-------------------------
 lib/pcapng/rte_pcapng.h |  19 ++-----
 lib/pdump/rte_pdump.c   |   4 +-
 6 files changed, 61 insertions(+), 112 deletions(-)
  

Patch

diff --git a/app/dumpcap/main.c b/app/dumpcap/main.c
index 37754fd06f4f..764dac6c37c0 100644
--- a/app/dumpcap/main.c
+++ b/app/dumpcap/main.c
@@ -66,13 +66,13 @@  static bool print_stats;
 
 /* capture limit options */
 static struct {
-	uint64_t  duration;	/* nanoseconds */
+	time_t  duration;	/* seconds */
 	unsigned long packets;  /* number of packets in file */
 	size_t size;		/* file size (bytes) */
 } stop;
 
 /* Running state */
-static uint64_t start_time, end_time;
+static time_t start_time;
 static uint64_t packets_received;
 static size_t file_size;
 
@@ -197,7 +197,7 @@  static void auto_stop(char *opt)
 		if (*value == '\0' || *endp != '\0' || interval <= 0)
 			rte_exit(EXIT_FAILURE,
 				 "Invalid duration \"%s\"\n", value);
-		stop.duration = NSEC_PER_SEC * interval;
+		stop.duration = interval;
 	} else if (strcmp(opt, "filesize") == 0) {
 		stop.size = get_uint(value, "filesize", 0) * 1024;
 	} else if (strcmp(opt, "packets") == 0) {
@@ -511,15 +511,6 @@  static void statistics_loop(void)
 	}
 }
 
-/* Return the time since 1/1/1970 in nanoseconds */
-static uint64_t create_timestamp(void)
-{
-	struct timespec now;
-
-	clock_gettime(CLOCK_MONOTONIC, &now);
-	return rte_timespec_to_ns(&now);
-}
-
 static void
 cleanup_pdump_resources(void)
 {
@@ -589,9 +580,8 @@  report_packet_stats(dumpcap_out_t out)
 		ifdrop = pdump_stats.nombuf + pdump_stats.ringfull;
 
 		if (use_pcapng)
-			rte_pcapng_write_stats(out.pcapng, intf->port, NULL,
-					       start_time, end_time,
-					       ifrecv, ifdrop);
+			rte_pcapng_write_stats(out.pcapng, intf->port,
+					       ifrecv, ifdrop, NULL);
 
 		if (ifrecv == 0)
 			percent = 0;
@@ -983,7 +973,7 @@  int main(int argc, char **argv)
 	mp = create_mempool();
 	out = create_output();
 
-	start_time = create_timestamp();
+	start_time = time(NULL);
 	enable_pdump(r, mp);
 
 	if (!quiet) {
@@ -1005,11 +995,10 @@  int main(int argc, char **argv)
 			break;
 
 		if (stop.duration != 0 &&
-		    create_timestamp() - start_time > stop.duration)
+		    time(NULL) - start_time > stop.duration)
 			break;
 	}
 
-	end_time = create_timestamp();
 	disable_primary_monitor();
 
 	if (rte_eal_primary_proc_alive(NULL))
diff --git a/app/test/test_pcapng.c b/app/test/test_pcapng.c
index b8429a02f160..55aa2cf93666 100644
--- a/app/test/test_pcapng.c
+++ b/app/test/test_pcapng.c
@@ -173,8 +173,8 @@  test_write_stats(void)
 	ssize_t len;
 
 	/* write a statistics block */
-	len = rte_pcapng_write_stats(pcapng, port_id,
-				     NULL, 0, 0,
+	len = rte_pcapng_write_stats(pcapng, port_id, NULL,
+				     0, 0, 0,
 				     NUM_PACKETS, 0);
 	if (len <= 0) {
 		fprintf(stderr, "Write of statistics failed\n");
diff --git a/lib/graph/graph_pcap.c b/lib/graph/graph_pcap.c
index db722c375fa7..89525f1220ca 100644
--- a/lib/graph/graph_pcap.c
+++ b/lib/graph/graph_pcap.c
@@ -214,7 +214,7 @@  graph_pcap_dispatch(struct rte_graph *graph,
 		mbuf = (struct rte_mbuf *)objs[i];
 
 		mc = rte_pcapng_copy(mbuf->port, 0, mbuf, pkt_mp, mbuf->pkt_len,
-				     rte_get_tsc_cycles(), 0, buffer);
+				     0, buffer);
 		if (mc == NULL)
 			break;
 
diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index 3c91fc77644a..13fd2b97fb80 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -36,22 +36,14 @@ 
 /* Format of the capture file handle */
 struct rte_pcapng {
 	int  outfd;		/* output file */
-
 	unsigned int ports;	/* number of interfaces added */
+	uint64_t offset_ns;	/* ns since 1/1/1970 when initialized */
+	uint64_t tsc_base;	/* TSC when started */
 
 	/* DPDK port id to interface index in file */
 	uint32_t port_index[RTE_MAX_ETHPORTS];
 };
 
-/* For converting TSC cycles to PCAPNG ns format */
-static struct pcapng_time {
-	uint64_t ns;
-	uint64_t cycles;
-	uint64_t tsc_hz;
-	struct rte_reciprocal_u64 tsc_hz_inverse;
-} pcapng_time;
-
-
 #ifdef RTE_EXEC_ENV_WINDOWS
 /*
  * Windows does not have writev() call.
@@ -102,56 +94,21 @@  static ssize_t writev(int fd, const struct iovec *iov, int iovcnt)
 #define if_indextoname(ifindex, ifname) NULL
 #endif
 
-static inline void
-pcapng_init(void)
+/* Convert from TSC (CPU cycles) to nanoseconds */
+static uint64_t
+pcapng_timestamp(const rte_pcapng_t *self, uint64_t cycles)
 {
-	struct timespec ts;
+	uint64_t delta, rem, secs, ns;
+	const uint64_t hz = rte_get_tsc_hz();
 
-	pcapng_time.cycles = rte_get_tsc_cycles();
-	clock_gettime(CLOCK_REALTIME, &ts);
-	pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2;
-	pcapng_time.ns = rte_timespec_to_ns(&ts);
-
-	pcapng_time.tsc_hz = rte_get_tsc_hz();
-	pcapng_time.tsc_hz_inverse = rte_reciprocal_value_u64(pcapng_time.tsc_hz);
-}
+	delta = cycles - self->tsc_base;
 
-/* PCAPNG timestamps are in nanoseconds */
-static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
-{
-	uint64_t delta, secs;
-
-	if (!pcapng_time.tsc_hz)
-		pcapng_init();
-
-	/* In essence the calculation is:
-	 *   delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / rte_get_tsc_hz()
-	 * but this overflows within 4 to 8 seconds depending on TSC frequency.
-	 * Instead, if delta >= pcapng_time.tsc_hz:
-	 *   Increase pcapng_time.ns and pcapng_time.cycles by the number of
-	 *   whole seconds in delta and reduce delta accordingly.
-	 * delta will therefore always lie in the interval [0, pcapng_time.tsc_hz),
-	 * which will not overflow when multiplied by NSEC_PER_SEC provided the
-	 * TSC frequency < approx 18.4GHz.
-	 *
-	 * Currently all TSCs operate below 5GHz.
-	 */
-	delta = cycles - pcapng_time.cycles;
-	if (unlikely(delta >= pcapng_time.tsc_hz)) {
-		if (likely(delta < pcapng_time.tsc_hz * 2)) {
-			delta -= pcapng_time.tsc_hz;
-			pcapng_time.cycles += pcapng_time.tsc_hz;
-			pcapng_time.ns += NSEC_PER_SEC;
-		} else {
-			secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse);
-			delta -= secs * pcapng_time.tsc_hz;
-			pcapng_time.cycles += secs * pcapng_time.tsc_hz;
-			pcapng_time.ns += secs * NSEC_PER_SEC;
-		}
-	}
+	/* Avoid numeric wraparound by computing seconds first */
+	secs = delta / hz;
+	rem = delta % hz;
+	ns = (rem * NS_PER_S) / hz;
 
-	return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC,
-							  &pcapng_time.tsc_hz_inverse);
+	return secs * NS_PER_S + ns + self->offset_ns;
 }
 
 /* length of option including padding */
@@ -368,15 +325,15 @@  rte_pcapng_add_interface(rte_pcapng_t *self, uint16_t port,
  */
 ssize_t
 rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
-		       const char *comment,
-		       uint64_t start_time, uint64_t end_time,
-		       uint64_t ifrecv, uint64_t ifdrop)
+		       uint64_t ifrecv, uint64_t ifdrop,
+		       const char *comment)
 {
 	struct pcapng_statistics *hdr;
 	struct pcapng_option *opt;
+	uint64_t start_time = self->offset_ns;
+	uint64_t sample_time;
 	uint32_t optlen, len;
 	uint8_t *buf;
-	uint64_t ns;
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
@@ -386,10 +343,10 @@  rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
 		optlen += pcapng_optlen(sizeof(ifrecv));
 	if (ifdrop != UINT64_MAX)
 		optlen += pcapng_optlen(sizeof(ifdrop));
+
 	if (start_time != 0)
 		optlen += pcapng_optlen(sizeof(start_time));
-	if (end_time != 0)
-		optlen += pcapng_optlen(sizeof(end_time));
+
 	if (comment)
 		optlen += pcapng_optlen(strlen(comment));
 	if (optlen != 0)
@@ -409,9 +366,6 @@  rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
 	if (start_time != 0)
 		opt = pcapng_add_option(opt, PCAPNG_ISB_STARTTIME,
 					 &start_time, sizeof(start_time));
-	if (end_time != 0)
-		opt = pcapng_add_option(opt, PCAPNG_ISB_ENDTIME,
-					 &end_time, sizeof(end_time));
 	if (ifrecv != UINT64_MAX)
 		opt = pcapng_add_option(opt, PCAPNG_ISB_IFRECV,
 				&ifrecv, sizeof(ifrecv));
@@ -425,9 +379,9 @@  rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
 	hdr->block_length = len;
 	hdr->interface_id = self->port_index[port_id];
 
-	ns = pcapng_tsc_to_ns(rte_get_tsc_cycles());
-	hdr->timestamp_hi = ns >> 32;
-	hdr->timestamp_lo = (uint32_t)ns;
+	sample_time = pcapng_timestamp(self, rte_get_tsc_cycles());
+	hdr->timestamp_hi = sample_time >> 32;
+	hdr->timestamp_lo = (uint32_t)sample_time;
 
 	/* clone block_length after option */
 	memcpy(opt, &len, sizeof(uint32_t));
@@ -520,23 +474,21 @@  struct rte_mbuf *
 rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 		const struct rte_mbuf *md,
 		struct rte_mempool *mp,
-		uint32_t length, uint64_t cycles,
+		uint32_t length,
 		enum rte_pcapng_direction direction,
 		const char *comment)
 {
 	struct pcapng_enhance_packet_block *epb;
 	uint32_t orig_len, data_len, padding, flags;
 	struct pcapng_option *opt;
+	uint64_t timestamp;
 	uint16_t optlen;
 	struct rte_mbuf *mc;
-	uint64_t ns;
 	bool rss_hash;
 
 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, NULL);
 #endif
-	ns = pcapng_tsc_to_ns(cycles);
-
 	orig_len = rte_pktmbuf_pkt_len(md);
 
 	/* Take snapshot of the data */
@@ -641,8 +593,10 @@  rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 	/* Interface index is filled in later during write */
 	mc->port = port_id;
 
-	epb->timestamp_hi = ns >> 32;
-	epb->timestamp_lo = (uint32_t)ns;
+	/* Put timestamp in cycles here - adjust in packet write */
+	timestamp = rte_get_tsc_cycles();
+	epb->timestamp_hi = timestamp >> 32;
+	epb->timestamp_lo = (uint32_t)timestamp;
 	epb->capture_length = data_len;
 	epb->original_length = orig_len;
 
@@ -668,6 +622,7 @@  rte_pcapng_write_packets(rte_pcapng_t *self,
 	for (i = 0; i < nb_pkts; i++) {
 		struct rte_mbuf *m = pkts[i];
 		struct pcapng_enhance_packet_block *epb;
+		uint64_t cycles, timestamp;
 
 		/* sanity check that is really a pcapng mbuf */
 		epb = rte_pktmbuf_mtod(m, struct pcapng_enhance_packet_block *);
@@ -684,6 +639,13 @@  rte_pcapng_write_packets(rte_pcapng_t *self,
 			return -1;
 		}
 
+		/* adjust timestamp recorded in packet */
+		cycles = (uint64_t)epb->timestamp_hi << 32;
+		cycles += epb->timestamp_lo;
+		timestamp = pcapng_timestamp(self, cycles);
+		epb->timestamp_hi = timestamp >> 32;
+		epb->timestamp_lo = (uint32_t)timestamp;
+
 		/*
 		 * Handle case of highly fragmented and large burst size
 		 * Note: this assumes that max segments per mbuf < IOV_MAX
@@ -725,6 +687,8 @@  rte_pcapng_fdopen(int fd,
 {
 	unsigned int i;
 	rte_pcapng_t *self;
+	struct timespec ts;
+	uint64_t cycles;
 
 	self = malloc(sizeof(*self));
 	if (!self) {
@@ -734,6 +698,13 @@  rte_pcapng_fdopen(int fd,
 
 	self->outfd = fd;
 	self->ports = 0;
+
+	/* record start time in ns since 1/1/1970 */
+	cycles = rte_get_tsc_cycles();
+	clock_gettime(CLOCK_REALTIME, &ts);
+	self->tsc_base = (cycles + rte_get_tsc_cycles()) / 2;
+	self->offset_ns = rte_timespec_to_ns(&ts);
+
 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
 		self->port_index[i] = UINT32_MAX;
 
diff --git a/lib/pcapng/rte_pcapng.h b/lib/pcapng/rte_pcapng.h
index d93cc9f73ad5..c40795c721de 100644
--- a/lib/pcapng/rte_pcapng.h
+++ b/lib/pcapng/rte_pcapng.h
@@ -121,8 +121,6 @@  enum rte_pcapng_direction {
  * @param length
  *   The upper limit on bytes to copy.  Passing UINT32_MAX
  *   means all data (after offset).
- * @param timestamp
- *   The timestamp in TSC cycles.
  * @param direction
  *   The direction of the packer: receive, transmit or unknown.
  * @param comment
@@ -136,7 +134,7 @@  __rte_experimental
 struct rte_mbuf *
 rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 		const struct rte_mbuf *m, struct rte_mempool *mp,
-		uint32_t length, uint64_t timestamp,
+		uint32_t length,
 		enum rte_pcapng_direction direction, const char *comment);
 
 
@@ -188,29 +186,22 @@  rte_pcapng_write_packets(rte_pcapng_t *self,
  *  The handle to the packet capture file
  * @param port
  *  The Ethernet port to report stats on.
- * @param comment
- *   Optional comment to add to statistics.
- * @param start_time
- *  The time when packet capture was started in nanoseconds.
- *  Optional: can be zero if not known.
- * @param end_time
- *  The time when packet capture was stopped in nanoseconds.
- *  Optional: can be zero if not finished;
  * @param ifrecv
  *  The number of packets received by capture.
  *  Optional: use UINT64_MAX if not known.
  * @param ifdrop
  *  The number of packets missed by the capture process.
  *  Optional: use UINT64_MAX if not known.
+ * @param comment
+ *  Optional comment to add to statistics.
  * @return
  *  number of bytes written to file, -1 on failure to write file
  */
 __rte_experimental
 ssize_t
 rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port,
-		       const char *comment,
-		       uint64_t start_time, uint64_t end_time,
-		       uint64_t ifrecv, uint64_t ifdrop);
+		       uint64_t ifrecv, uint64_t ifdrop,
+		       const char *comment);
 
 #ifdef __cplusplus
 }
diff --git a/lib/pdump/rte_pdump.c b/lib/pdump/rte_pdump.c
index a70085bd0211..903f92839b8e 100644
--- a/lib/pdump/rte_pdump.c
+++ b/lib/pdump/rte_pdump.c
@@ -90,7 +90,6 @@  pdump_copy(uint16_t port_id, uint16_t queue,
 	int ring_enq;
 	uint16_t d_pkts = 0;
 	struct rte_mbuf *dup_bufs[nb_pkts];
-	uint64_t ts;
 	struct rte_ring *ring;
 	struct rte_mempool *mp;
 	struct rte_mbuf *p;
@@ -99,7 +98,6 @@  pdump_copy(uint16_t port_id, uint16_t queue,
 	if (cbs->filter)
 		rte_bpf_exec_burst(cbs->filter, (void **)pkts, rcs, nb_pkts);
 
-	ts = rte_get_tsc_cycles();
 	ring = cbs->ring;
 	mp = cbs->mp;
 	for (i = 0; i < nb_pkts; i++) {
@@ -122,7 +120,7 @@  pdump_copy(uint16_t port_id, uint16_t queue,
 		if (cbs->ver == V2)
 			p = rte_pcapng_copy(port_id, queue,
 					    pkts[i], mp, cbs->snaplen,
-					    ts, direction, NULL);
+					    direction, NULL);
 		else
 			p = rte_pktmbuf_copy(pkts[i], mp, 0, cbs->snaplen);