[dpdk-dev,25/33] app/testeventdev: perf queue: add worker functions

Message ID 20170528195854.6064-26-jerin.jacob@caviumnetworks.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Jerin Jacob May 28, 2017, 7:58 p.m. UTC
  Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 app/test-eventdev/test_perf_common.h |  60 +++++++++++++++
 app/test-eventdev/test_perf_queue.c  | 137 +++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+)
  

Comments

Eads, Gage June 1, 2017, 9:04 p.m. UTC | #1
>  diff --git a/app/test-eventdev/test_perf_common.h b/app/test-
>  eventdev/test_perf_common.h
>  index f8246953a..9888e5078 100644
>  --- a/app/test-eventdev/test_perf_common.h
>  +++ b/app/test-eventdev/test_perf_common.h
>  @@ -86,6 +86,66 @@ struct perf_elt {
>   	uint64_t timestamp;
>   } __rte_cache_aligned;
>  
>  +#define BURST_SIZE 16
>  +
>  +#define PERF_WORKER_INIT\
>  +	struct worker_data *w  = arg;\
>  +	struct test_perf *t = w->t;\
>  +	struct evt_options *opt = t->opt;\
>  +	const uint8_t dev = w->dev_id;\
>  +	const uint8_t port = w->port_id;\
>  +	uint8_t *const sched_type_list = &t->sched_type_list[0];\
>  +	struct rte_mempool *const pool = t->pool;\
>  +	const uint8_t nb_stages = t->opt->nb_stages;\
>  +	const uint8_t laststage = nb_stages - 1;\
>  +	uint8_t cnt = 0;\
>  +	void *bufs[16] __rte_cache_aligned;\
>  +	int const sz = RTE_DIM(bufs);\
>  +	if (opt->verbose_level > 1)\
>  +		printf("%s(): lcore %d dev_id %d port=%d\n", __func__,\
>  +				rte_lcore_id(), dev, port)
>  +
>  +static inline __attribute__((always_inline)) int
>  +perf_process_last_stage(struct rte_mempool *const pool,
>  +		struct rte_event *const ev, struct worker_data *const w,
>  +		void *bufs[], int const buf_sz, uint8_t count) {
>  +	bufs[count++] = ev->event_ptr;
>  +	w->processed_pkts++;
>  +	rte_smp_wmb();
>  +
>  +	if (unlikely(count == buf_sz)) {
>  +		count = 0;
>  +		rte_mempool_put_bulk(pool, bufs, buf_sz);
>  +	}
>  +	return count;
>  +}
>  +
>  +static inline __attribute__((always_inline)) uint8_t
>  +perf_process_last_stage_latency(struct rte_mempool *const pool,
>  +		struct rte_event *const ev, struct worker_data *const w,
>  +		void *bufs[], int const buf_sz, uint8_t count) {
>  +	uint64_t latency;
>  +	struct perf_elt *const m = ev->event_ptr;
>  +
>  +	bufs[count++] = ev->event_ptr;
>  +	w->processed_pkts++;
>  +
>  +	if (unlikely(count == buf_sz)) {
>  +		count = 0;
>  +		latency = rte_get_timer_cycles() - m->timestamp;
>  +		rte_mempool_put_bulk(pool, bufs, buf_sz);
>  +	} else {
>  +		latency = rte_get_timer_cycles() - m->timestamp;
>  +	}
>  +
>  +	w->latency += latency;
>  +	rte_smp_wmb();
>  +	return count;
>  +}

What purpose does the store barrier serve in these two functions?
  
Jerin Jacob June 2, 2017, 12:21 p.m. UTC | #2
> What purpose does the store barrier serve in these two functions?

The master core (i.e. a non-worker core) periodically reads w->latency and
w->processed_pkts from all workers, so the barrier is there to publish the
workers' counter updates to that reader.
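
Below is a minimal sketch (not part of the patch) of the read side that the
reply describes: a master-core helper that periodically sums the per-worker
counters. The poll_worker_stats() name and the flat workers[] array are
hypothetical; only the processed_pkts and latency fields of struct
worker_data come from the patch.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Hypothetical master-core helper, called periodically while the test runs. */
static void
poll_worker_stats(struct worker_data *workers, int nb_workers)
{
	uint64_t pkts = 0;
	uint64_t cycles = 0;
	int i;

	for (i = 0; i < nb_workers; i++) {
		/* Plain loads; the rte_smp_wmb() on the worker side is what
		 * publishes the counter stores for a polling reader like this.
		 */
		pkts += workers[i].processed_pkts;
		cycles += workers[i].latency;
	}

	printf("processed %" PRIu64 " pkts, accumulated latency %" PRIu64 " cycles\n",
			pkts, cycles);
}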
  

Patch

diff --git a/app/test-eventdev/test_perf_common.h b/app/test-eventdev/test_perf_common.h
index f8246953a..9888e5078 100644
--- a/app/test-eventdev/test_perf_common.h
+++ b/app/test-eventdev/test_perf_common.h
@@ -86,6 +86,66 @@  struct perf_elt {
 	uint64_t timestamp;
 } __rte_cache_aligned;
 
+#define BURST_SIZE 16
+
+#define PERF_WORKER_INIT\
+	struct worker_data *w  = arg;\
+	struct test_perf *t = w->t;\
+	struct evt_options *opt = t->opt;\
+	const uint8_t dev = w->dev_id;\
+	const uint8_t port = w->port_id;\
+	uint8_t *const sched_type_list = &t->sched_type_list[0];\
+	struct rte_mempool *const pool = t->pool;\
+	const uint8_t nb_stages = t->opt->nb_stages;\
+	const uint8_t laststage = nb_stages - 1;\
+	uint8_t cnt = 0;\
+	void *bufs[16] __rte_cache_aligned;\
+	int const sz = RTE_DIM(bufs);\
+	if (opt->verbose_level > 1)\
+		printf("%s(): lcore %d dev_id %d port=%d\n", __func__,\
+				rte_lcore_id(), dev, port)
+
+static inline __attribute__((always_inline)) int
+perf_process_last_stage(struct rte_mempool *const pool,
+		struct rte_event *const ev, struct worker_data *const w,
+		void *bufs[], int const buf_sz, uint8_t count)
+{
+	bufs[count++] = ev->event_ptr;
+	w->processed_pkts++;
+	rte_smp_wmb();
+
+	if (unlikely(count == buf_sz)) {
+		count = 0;
+		rte_mempool_put_bulk(pool, bufs, buf_sz);
+	}
+	return count;
+}
+
+static inline __attribute__((always_inline)) uint8_t
+perf_process_last_stage_latency(struct rte_mempool *const pool,
+		struct rte_event *const ev, struct worker_data *const w,
+		void *bufs[], int const buf_sz, uint8_t count)
+{
+	uint64_t latency;
+	struct perf_elt *const m = ev->event_ptr;
+
+	bufs[count++] = ev->event_ptr;
+	w->processed_pkts++;
+
+	if (unlikely(count == buf_sz)) {
+		count = 0;
+		latency = rte_get_timer_cycles() - m->timestamp;
+		rte_mempool_put_bulk(pool, bufs, buf_sz);
+	} else {
+		latency = rte_get_timer_cycles() - m->timestamp;
+	}
+
+	w->latency += latency;
+	rte_smp_wmb();
+	return count;
+}
+
+
 static inline int
 perf_nb_event_ports(struct evt_options *opt)
 {
diff --git a/app/test-eventdev/test_perf_queue.c b/app/test-eventdev/test_perf_queue.c
index 352240c7b..811f7f78d 100644
--- a/app/test-eventdev/test_perf_queue.c
+++ b/app/test-eventdev/test_perf_queue.c
@@ -41,6 +41,142 @@  perf_queue_nb_event_queues(struct evt_options *opt)
 	return evt_nr_active_lcores(opt->plcores) * opt->nb_stages;
 }
 
+static inline __attribute__((always_inline)) void
+mark_fwd_latency(struct rte_event *const ev,
+		const uint8_t nb_stages)
+{
+	if (unlikely((ev->queue_id % nb_stages) == 0)) {
+		struct perf_elt *const m = ev->event_ptr;
+
+		m->timestamp = rte_get_timer_cycles();
+	}
+}
+
+static inline __attribute__((always_inline)) void
+fwd_event(struct rte_event *const ev, uint8_t *const sched_type_list,
+		const uint8_t nb_stages)
+{
+	ev->queue_id++;
+	ev->sched_type = sched_type_list[ev->queue_id % nb_stages];
+	ev->op = RTE_EVENT_OP_FORWARD;
+	ev->event_type = RTE_EVENT_TYPE_CPU;
+}
+
+static int
+perf_queue_worker(void *arg, const int enable_fwd_latency)
+{
+	PERF_WORKER_INIT;
+	struct rte_event ev;
+
+	while (t->done == false) {
+		uint16_t event = rte_event_dequeue_burst(dev, port, &ev, 1, 0);
+
+		if (!event) {
+			rte_pause();
+			continue;
+		}
+		if (enable_fwd_latency)
+		/* first q in pipeline, mark timestamp to compute fwd latency */
+			mark_fwd_latency(&ev, nb_stages);
+
+		/* last stage in pipeline */
+		if (unlikely((ev.queue_id % nb_stages) == laststage)) {
+			if (enable_fwd_latency)
+				cnt = perf_process_last_stage_latency(pool,
+					&ev, w, bufs, sz, cnt);
+			else
+				cnt = perf_process_last_stage(pool,
+					&ev, w, bufs, sz, cnt);
+		} else {
+			fwd_event(&ev, sched_type_list, nb_stages);
+			while (rte_event_enqueue_burst(dev, port, &ev, 1) != 1)
+				rte_pause();
+		}
+	}
+	return 0;
+}
+
+static int
+perf_queue_worker_burst(void *arg, const int enable_fwd_latency)
+{
+	PERF_WORKER_INIT;
+	uint16_t i;
+	/* +1 to avoid prefetch out of array check */
+	struct rte_event ev[BURST_SIZE + 1];
+
+	while (t->done == false) {
+		uint16_t const nb_rx = rte_event_dequeue_burst(dev, port, ev,
+				BURST_SIZE, 0);
+
+		if (!nb_rx) {
+			rte_pause();
+			continue;
+		}
+
+		for (i = 0; i < nb_rx; i++) {
+			if (enable_fwd_latency) {
+				rte_prefetch0(ev[i+1].event_ptr);
+				/* first queue in pipeline.
+				 * mark time stamp to compute fwd latency
+				 */
+				mark_fwd_latency(&ev[i], nb_stages);
+			}
+			/* last stage in pipeline */
+			if (unlikely((ev[i].queue_id % nb_stages) ==
+						 laststage)) {
+				if (enable_fwd_latency)
+					cnt = perf_process_last_stage_latency(
+						pool, &ev[i], w, bufs, sz, cnt);
+				else
+					cnt = perf_process_last_stage(pool,
+						&ev[i], w, bufs, sz, cnt);
+
+				ev[i].op = RTE_EVENT_OP_RELEASE;
+			} else {
+				fwd_event(&ev[i], sched_type_list, nb_stages);
+			}
+		}
+
+		uint16_t enq;
+
+		enq = rte_event_enqueue_burst(dev, port, ev, nb_rx);
+		while (enq < nb_rx) {
+			enq += rte_event_enqueue_burst(dev, port,
+							ev + enq, nb_rx - enq);
+		}
+	}
+	return 0;
+}
+
+static int
+worker_wrapper(void *arg)
+{
+	struct worker_data *w  = arg;
+	struct evt_options *opt = w->t->opt;
+
+	/* FIXME: probe through device capability */
+	const int burst = 1;
+	const int fwd_latency = opt->fwd_latency;
+
+	/* allow compiler to optimize */
+	if (!burst && !fwd_latency)
+		return perf_queue_worker(arg, 0);
+	else if (!burst && fwd_latency)
+		return perf_queue_worker(arg, 1);
+	else if (burst && !fwd_latency)
+		return perf_queue_worker_burst(arg, 0);
+	else if (burst && fwd_latency)
+		return perf_queue_worker_burst(arg, 1);
+
+	rte_panic("invalid worker\n");
+}
+
+static int
+perf_queue_launch_lcores(struct evt_test *test, struct evt_options *opt)
+{
+	return perf_launch_lcores(test, opt, worker_wrapper);
+}
+
 static int
 perf_queue_eventdev_setup(struct evt_test *test, struct evt_options *opt)
 {
@@ -143,6 +279,7 @@  static const struct evt_test_ops perf_queue =  {
 	.test_setup         = perf_test_setup,
 	.mempool_setup      = perf_mempool_setup,
 	.eventdev_setup     = perf_queue_eventdev_setup,
+	.launch_lcores      = perf_queue_launch_lcores,
 	.eventdev_destroy   = perf_eventdev_destroy,
 	.mempool_destroy    = perf_mempool_destroy,
 	.test_result        = perf_test_result,