[v1] event/sw: performance improvements

Message ID 20200908105211.10066-1-radu.nicolau@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Series: [v1] event/sw: performance improvements

Checks

Context Check Description
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-testing success Testing PASS
ci/travis-robot success Travis build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Radu Nicolau Sept. 8, 2020, 10:52 a.m. UTC
  Add minimum burst throughout the scheduler pipeline and a flush counter.
Replace ring API calls with local single threaded implementation
where possible.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/event/sw/sw_evdev.h           | 11 +++-
 drivers/event/sw/sw_evdev_scheduler.c | 83 +++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 13 deletions(-)
  

Comments

Van Haaren, Harry Sept. 23, 2020, 11:13 a.m. UTC | #1
> -----Original Message-----
> From: Nicolau, Radu <radu.nicolau@intel.com>
> Sent: Tuesday, September 8, 2020 11:52 AM
> To: dev@dpdk.org
> Cc: jerinj@marvell.com; Van Haaren, Harry <harry.van.haaren@intel.com>;
> Nicolau, Radu <radu.nicolau@intel.com>
> Subject: [PATCH v1] event/sw: performance improvements
> 
> Add minimum burst throughout the scheduler pipeline and a flush counter.
> Replace ring API calls with local single threaded implementation
> where possible.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>

Thanks for the patch, a few comments inline.

> ---
>  drivers/event/sw/sw_evdev.h           | 11 +++-
>  drivers/event/sw/sw_evdev_scheduler.c | 83 +++++++++++++++++++++++----
>  2 files changed, 81 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
> index 7c77b2495..95e51065f 100644
> --- a/drivers/event/sw/sw_evdev.h
> +++ b/drivers/event/sw/sw_evdev.h
> @@ -29,7 +29,13 @@
>  /* report dequeue burst sizes in buckets */
>  #define SW_DEQ_STAT_BUCKET_SHIFT 2
>  /* how many packets pulled from port by sched */
> -#define SCHED_DEQUEUE_BURST_SIZE 32
> +#define SCHED_DEQUEUE_BURST_SIZE 64
> +
> +#define SCHED_MIN_BURST_SIZE 8
> +#define SCHED_NO_ENQ_CYCLE_FLUSH 256
> +/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
> +#define SCHED_REFILL_ONCE_PER_CALL 1

Is it possible to make the above #define a runtime option?
Eg, --vdev event_sw,refill_iter=1

That would allow packaged versions of DPDK to be usable in both modes.
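
For illustration, a rough sketch of how such a devarg could be parsed with
rte_kvargs in the PMD probe path is below. The "refill_iter" key comes from
the example above; the handler, the parse helper and the refill_once flag
are only placeholders for the idea, not actual driver code.

#include <stdlib.h>
#include <rte_common.h>
#include <rte_kvargs.h>

#define REFILL_ONCE_ARG "refill_iter"

/* hypothetical kvargs handler: store the devarg value as a boolean flag */
static int
assign_refill_once(const char *key __rte_unused, const char *value,
		void *opaque)
{
	int *refill_once = opaque;

	*refill_once = atoi(value) != 0;
	return 0;
}

/* hypothetical helper called from the vdev probe with the devargs string */
static int
parse_refill_once(const char *params, int *refill_once)
{
	static const char * const args[] = { REFILL_ONCE_ARG, NULL };
	struct rte_kvargs *kvlist;
	int ret;

	if (params == NULL || params[0] == '\0')
		return 0; /* no devargs given, keep the default */

	kvlist = rte_kvargs_parse(params, args);
	if (kvlist == NULL)
		return 0;

	ret = rte_kvargs_process(kvlist, REFILL_ONCE_ARG,
			assign_refill_once, refill_once);
	rte_kvargs_free(kvlist);
	return ret;
}

The scheduler would then test the resulting flag instead of the
SCHED_REFILL_ONCE_PER_CALL compile-time constant.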

> +
> 
>  #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our history list
> */
>  #define NUM_SAMPLES 64 /* how many data points use for average stats */
> @@ -214,6 +220,9 @@ struct sw_evdev {
>  	uint32_t xstats_count_mode_port;
>  	uint32_t xstats_count_mode_queue;
> 
> +	uint16_t sched_flush_count;
> +	uint16_t sched_min_burst;
> +
>  	/* Contains all ports - load balanced and directed */
>  	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
> 
> diff --git a/drivers/event/sw/sw_evdev_scheduler.c
> b/drivers/event/sw/sw_evdev_scheduler.c
> index cff747da8..ca6d1caff 100644
> --- a/drivers/event/sw/sw_evdev_scheduler.c
> +++ b/drivers/event/sw/sw_evdev_scheduler.c
> @@ -26,6 +26,29 @@
>  /* use cheap bit mixing, we only need to lose a few bits */
>  #define SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
> 
> +
> +/* single object enq and deq for non MT ring */
> +static __rte_always_inline void
> +sw_nonmt_ring_dequeue(struct rte_ring *r, void **obj)
> +{
> +	if ((r->prod.tail - r->cons.tail) < 1)
> +		return;
> +	void **ring = (void **)&r[1];
> +	*obj = ring[r->cons.tail & r->mask];
> +	r->cons.tail++;
> +}
> +static __rte_always_inline int
> +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj)
> +{
> +	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
> +		return 0;
> +	void **ring = (void **)&r[1];
> +	ring[r->prod.tail & r->mask] = obj;
> +	r->prod.tail++;
> +	return 1;
> +}
> +
> +
>  static inline uint32_t
>  sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
>  		uint32_t iq_num, unsigned int count)
> @@ -146,9 +169,9 @@ sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct
> sw_qid * const qid,
>  				cq_idx = 0;
>  			cq = qid->cq_map[cq_idx++];
> 
> -		} while (rte_event_ring_free_count(
> -				sw->ports[cq].cq_worker_ring) == 0 ||
> -				sw->ports[cq].inflights == SW_PORT_HIST_LIST);
> +		} while (sw->ports[cq].inflights == SW_PORT_HIST_LIST ||
> +				rte_event_ring_free_count(
> +					sw->ports[cq].cq_worker_ring) == 0);
> 
>  		struct sw_port *p = &sw->ports[cq];
>  		if (sw->cq_ring_space[cq] == 0 ||
> @@ -164,7 +187,7 @@ sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct
> sw_qid * const qid,
>  		p->hist_list[head].qid = qid_id;
> 
>  		if (keep_order)
> -			rte_ring_sc_dequeue(qid->reorder_buffer_freelist,
> +			sw_nonmt_ring_dequeue(qid->reorder_buffer_freelist,
>  					(void *)&p->hist_list[head].rob_entry);
> 
>  		sw->ports[cq].cq_buf[sw->ports[cq].cq_buf_count++] = *qe;
> @@ -229,7 +252,7 @@ sw_schedule_qid_to_cq(struct sw_evdev *sw)
>  		uint32_t pkts_done = 0;
>  		uint32_t count = iq_count(&qid->iq[iq_num]);
> 
> -		if (count > 0) {
> +		if (count >= sw->sched_min_burst) {
>  			if (type == SW_SCHED_TYPE_DIRECT)
>  				pkts_done += sw_schedule_dir_to_cq(sw, qid,
>  						iq_num, count);
> @@ -267,7 +290,7 @@ sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int
> qid_end)
> 
>  	for (; qid_start < qid_end; qid_start++) {
>  		struct sw_qid *qid = &sw->qids[qid_start];
> -		int i, num_entries_in_use;
> +		unsigned int i, num_entries_in_use;
> 
>  		if (qid->type != RTE_SCHED_TYPE_ORDERED)
>  			continue;
> @@ -275,6 +298,9 @@ sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int
> qid_end)
>  		num_entries_in_use = rte_ring_free_count(
>  					qid->reorder_buffer_freelist);
> 
> +		if (num_entries_in_use < sw->sched_min_burst)
> +			num_entries_in_use = 0;
> +
>  		for (i = 0; i < num_entries_in_use; i++) {
>  			struct reorder_buffer_entry *entry;
>  			int j;
> @@ -320,7 +346,7 @@ sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int
> qid_end)
>  			if (!entry->ready) {
>  				entry->fragment_index = 0;
> 
> -				rte_ring_sp_enqueue(
> +				sw_nonmt_ring_enqueue(
>  						qid->reorder_buffer_freelist,
>  						entry);
> 
> @@ -349,9 +375,11 @@ __pull_port_lb(struct sw_evdev *sw, uint32_t port_id, int
> allow_reorder)
>  	uint32_t pkts_iter = 0;
>  	struct sw_port *port = &sw->ports[port_id];
> 
> +#if !SCHED_REFILL_ONCE_PER_CALL
>  	/* If shadow ring has 0 pkts, pull from worker ring */
>  	if (port->pp_buf_count == 0)
>  		sw_refill_pp_buf(sw, port);
> +#endif

As per above comment, this #if would become a runtime check.
Similar for the below #if comments.
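
In other words, something along these lines, where sw->refill_once_per_call
is a placeholder for a field filled in from such a devarg (illustration
only, not existing code):

	/* hypothetical runtime form of the compile-time #if: only refill
	 * here when the once-per-call refill mode is disabled
	 */
	if (!sw->refill_once_per_call && port->pp_buf_count == 0)
		sw_refill_pp_buf(sw, port);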


>  	while (port->pp_buf_count) {
>  		const struct rte_event *qe = &port->pp_buf[port->pp_buf_start];
> @@ -467,9 +495,11 @@ sw_schedule_pull_port_dir(struct sw_evdev *sw, uint32_t
> port_id)
>  	uint32_t pkts_iter = 0;
>  	struct sw_port *port = &sw->ports[port_id];
> 
> +#if !SCHED_REFILL_ONCE_PER_CALL
>  	/* If shadow ring has 0 pkts, pull from worker ring */
>  	if (port->pp_buf_count == 0)
>  		sw_refill_pp_buf(sw, port);
> +#endif
> 
>  	while (port->pp_buf_count) {
>  		const struct rte_event *qe = &port->pp_buf[port->pp_buf_start];
> @@ -557,12 +587,41 @@ sw_event_schedule(struct rte_eventdev *dev)
>  	/* push all the internal buffered QEs in port->cq_ring to the
>  	 * worker cores: aka, do the ring transfers batched.
>  	 */
> +	int no_enq = 1;
>  	for (i = 0; i < sw->port_count; i++) {
> -		struct rte_event_ring *worker = sw->ports[i].cq_worker_ring;
> -		rte_event_ring_enqueue_burst(worker, sw->ports[i].cq_buf,
> -				sw->ports[i].cq_buf_count,
> -				&sw->cq_ring_space[i]);
> -		sw->ports[i].cq_buf_count = 0;
> +		struct sw_port *port = &sw->ports[i];
> +		struct rte_event_ring *worker = port->cq_worker_ring;
> +
> +#if SCHED_REFILL_ONCE_PER_CALL
> +		/* If shadow ring has 0 pkts, pull from worker ring */
> +		if (port->pp_buf_count == 0)
> +			sw_refill_pp_buf(sw, port);
> +#endif
> +
> +		if (port->cq_buf_count >= sw->sched_min_burst) {
> +			rte_event_ring_enqueue_burst(worker,
> +					port->cq_buf,
> +					port->cq_buf_count,
> +					&sw->cq_ring_space[i]);
> +			port->cq_buf_count = 0;
> +			no_enq = 0;
> +		} else {
> +			sw->cq_ring_space[i] =
> +					rte_event_ring_free_count(worker) -
> +					port->cq_buf_count;
> +		}
> +	}
> +
> +	if (no_enq) {
> +		if (unlikely(sw->sched_flush_count >
> SCHED_NO_ENQ_CYCLE_FLUSH))
> +			sw->sched_min_burst = 1;
> +		else
> +			sw->sched_flush_count++;
> +	} else {
> +		if (sw->sched_flush_count)
> +			sw->sched_flush_count--;
> +		else
> +			sw->sched_min_burst = SCHED_MIN_BURST_SIZE;
>  	}
> 
>  }
> --
> 2.17.1
  
Honnappa Nagarahalli Sept. 23, 2020, 11:10 p.m. UTC | #2
<snip>

> >
> > Add minimum burst throughout the scheduler pipeline and a flush counter.
> > Replace ring API calls with local single threaded implementation where
> > possible.
> >
> > Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> 
> Thanks for the patch, a few comments inline.
> 
> > ---
> >  drivers/event/sw/sw_evdev.h           | 11 +++-
> >  drivers/event/sw/sw_evdev_scheduler.c | 83
> > +++++++++++++++++++++++----
> >  2 files changed, 81 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
> > index 7c77b2495..95e51065f 100644
> > --- a/drivers/event/sw/sw_evdev.h
> > +++ b/drivers/event/sw/sw_evdev.h
> > @@ -29,7 +29,13 @@
> >  /* report dequeue burst sizes in buckets */  #define
> > SW_DEQ_STAT_BUCKET_SHIFT 2
> >  /* how many packets pulled from port by sched */ -#define
> > SCHED_DEQUEUE_BURST_SIZE 32
> > +#define SCHED_DEQUEUE_BURST_SIZE 64
> > +
> > +#define SCHED_MIN_BURST_SIZE 8
> > +#define SCHED_NO_ENQ_CYCLE_FLUSH 256
> > +/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
> > +#define SCHED_REFILL_ONCE_PER_CALL 1
> 
> Is it possible to make the above #define a runtime option?
> Eg, --vdev event_sw,refill_iter=1
> 
> That would allow packaged versions of DPDK to be usable in both modes.
> 
> > +
> >
> >  #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our
> > history list */  #define NUM_SAMPLES 64 /* how many data points use
> > for average stats */ @@ -214,6 +220,9 @@ struct sw_evdev {
> >  	uint32_t xstats_count_mode_port;
> >  	uint32_t xstats_count_mode_queue;
> >
> > +	uint16_t sched_flush_count;
> > +	uint16_t sched_min_burst;
> > +
> >  	/* Contains all ports - load balanced and directed */
> >  	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
> >
> > diff --git a/drivers/event/sw/sw_evdev_scheduler.c
> > b/drivers/event/sw/sw_evdev_scheduler.c
> > index cff747da8..ca6d1caff 100644
> > --- a/drivers/event/sw/sw_evdev_scheduler.c
> > +++ b/drivers/event/sw/sw_evdev_scheduler.c
> > @@ -26,6 +26,29 @@
> >  /* use cheap bit mixing, we only need to lose a few bits */  #define
> > SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
> >
> > +
> > +/* single object enq and deq for non MT ring */ static
> > +__rte_always_inline void sw_nonmt_ring_dequeue(struct rte_ring *r,
> > +void **obj) {
> > +	if ((r->prod.tail - r->cons.tail) < 1)
> > +		return;
> > +	void **ring = (void **)&r[1];
> > +	*obj = ring[r->cons.tail & r->mask];
> > +	r->cons.tail++;
> > +}
> > +static __rte_always_inline int
> > +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj) {
> > +	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
> > +		return 0;
> > +	void **ring = (void **)&r[1];
> > +	ring[r->prod.tail & r->mask] = obj;
> > +	r->prod.tail++;
> > +	return 1;
> > +
Why not make these APIs part of the rte_ring library? You could further optimize them by keeping the indices on the same cacheline.

> > +
> > +
> >  static inline uint32_t
> >  sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
> >  		uint32_t iq_num, unsigned int count)
> > @@ -146,9 +169,9 @@ sw_schedule_parallel_to_cq(struct sw_evdev *sw,
> struct
> > sw_qid * const qid,
> >  				cq_idx = 0;
> >  			cq = qid->cq_map[cq_idx++];
> >
> > -		} while (rte_event_ring_free_count(
> > -				sw->ports[cq].cq_worker_ring) == 0 ||
> > -				sw->ports[cq].inflights ==
> SW_PORT_HIST_LIST);
> > +		} while (sw->ports[cq].inflights == SW_PORT_HIST_LIST ||
> > +				rte_event_ring_free_count(
> > +					sw->ports[cq].cq_worker_ring) == 0);
> >
> >  		struct sw_port *p = &sw->ports[cq];
> >  		if (sw->cq_ring_space[cq] == 0 ||
> > @@ -164,7 +187,7 @@ sw_schedule_parallel_to_cq(struct sw_evdev *sw,
> struct
> > sw_qid * const qid,
> >  		p->hist_list[head].qid = qid_id;
> >
> >  		if (keep_order)
> > -			rte_ring_sc_dequeue(qid->reorder_buffer_freelist,
> > +			sw_nonmt_ring_dequeue(qid->reorder_buffer_freelist,
> >  					(void *)&p->hist_list[head].rob_entry);
> >
> >  		sw->ports[cq].cq_buf[sw->ports[cq].cq_buf_count++] = *qe;
> > @@ -229,7 +252,7 @@ sw_schedule_qid_to_cq(struct sw_evdev *sw)
> >  		uint32_t pkts_done = 0;
> >  		uint32_t count = iq_count(&qid->iq[iq_num]);
> >
> > -		if (count > 0) {
> > +		if (count >= sw->sched_min_burst) {
> >  			if (type == SW_SCHED_TYPE_DIRECT)
> >  				pkts_done += sw_schedule_dir_to_cq(sw, qid,
> >  						iq_num, count);
> > @@ -267,7 +290,7 @@ sw_schedule_reorder(struct sw_evdev *sw, int
> qid_start, int
> > qid_end)
> >
> >  	for (; qid_start < qid_end; qid_start++) {
> >  		struct sw_qid *qid = &sw->qids[qid_start];
> > -		int i, num_entries_in_use;
> > +		unsigned int i, num_entries_in_use;
> >
> >  		if (qid->type != RTE_SCHED_TYPE_ORDERED)
> >  			continue;
> > @@ -275,6 +298,9 @@ sw_schedule_reorder(struct sw_evdev *sw, int
> qid_start, int
> > qid_end)
> >  		num_entries_in_use = rte_ring_free_count(
> >  					qid->reorder_buffer_freelist);
> >
> > +		if (num_entries_in_use < sw->sched_min_burst)
> > +			num_entries_in_use = 0;
> > +
> >  		for (i = 0; i < num_entries_in_use; i++) {
> >  			struct reorder_buffer_entry *entry;
> >  			int j;
> > @@ -320,7 +346,7 @@ sw_schedule_reorder(struct sw_evdev *sw, int
> qid_start, int
> > qid_end)
> >  			if (!entry->ready) {
> >  				entry->fragment_index = 0;
> >
> > -				rte_ring_sp_enqueue(
> > +				sw_nonmt_ring_enqueue(
> >  						qid->reorder_buffer_freelist,
> >  						entry);
> >
> > @@ -349,9 +375,11 @@ __pull_port_lb(struct sw_evdev *sw, uint32_t port_id,
> int
> > allow_reorder)
> >  	uint32_t pkts_iter = 0;
> >  	struct sw_port *port = &sw->ports[port_id];
> >
> > +#if !SCHED_REFILL_ONCE_PER_CALL
> >  	/* If shadow ring has 0 pkts, pull from worker ring */
> >  	if (port->pp_buf_count == 0)
> >  		sw_refill_pp_buf(sw, port);
> > +#endif
> 
> As per above comment, this #if would become a runtime check.
> Similar for the below #if comments.
> 
> 
> >  	while (port->pp_buf_count) {
> >  		const struct rte_event *qe = &port->pp_buf[port-
> >pp_buf_start];
> > @@ -467,9 +495,11 @@ sw_schedule_pull_port_dir(struct sw_evdev *sw,
> uint32_t
> > port_id)
> >  	uint32_t pkts_iter = 0;
> >  	struct sw_port *port = &sw->ports[port_id];
> >
> > +#if !SCHED_REFILL_ONCE_PER_CALL
> >  	/* If shadow ring has 0 pkts, pull from worker ring */
> >  	if (port->pp_buf_count == 0)
> >  		sw_refill_pp_buf(sw, port);
> > +#endif
> >
> >  	while (port->pp_buf_count) {
> >  		const struct rte_event *qe = &port->pp_buf[port-
> >pp_buf_start];
> > @@ -557,12 +587,41 @@ sw_event_schedule(struct rte_eventdev *dev)
> >  	/* push all the internal buffered QEs in port->cq_ring to the
> >  	 * worker cores: aka, do the ring transfers batched.
> >  	 */
> > +	int no_enq = 1;
> >  	for (i = 0; i < sw->port_count; i++) {
> > -		struct rte_event_ring *worker = sw->ports[i].cq_worker_ring;
> > -		rte_event_ring_enqueue_burst(worker, sw->ports[i].cq_buf,
> > -				sw->ports[i].cq_buf_count,
> > -				&sw->cq_ring_space[i]);
> > -		sw->ports[i].cq_buf_count = 0;
> > +		struct sw_port *port = &sw->ports[i];
> > +		struct rte_event_ring *worker = port->cq_worker_ring;
> > +
> > +#if SCHED_REFILL_ONCE_PER_CALL
> > +		/* If shadow ring has 0 pkts, pull from worker ring */
> > +		if (port->pp_buf_count == 0)
> > +			sw_refill_pp_buf(sw, port);
> > +#endif
> > +
> > +		if (port->cq_buf_count >= sw->sched_min_burst) {
> > +			rte_event_ring_enqueue_burst(worker,
> > +					port->cq_buf,
> > +					port->cq_buf_count,
> > +					&sw->cq_ring_space[i]);
> > +			port->cq_buf_count = 0;
> > +			no_enq = 0;
> > +		} else {
> > +			sw->cq_ring_space[i] =
> > +					rte_event_ring_free_count(worker) -
> > +					port->cq_buf_count;
> > +		}
> > +	}
> > +
> > +	if (no_enq) {
> > +		if (unlikely(sw->sched_flush_count >
> > SCHED_NO_ENQ_CYCLE_FLUSH))
> > +			sw->sched_min_burst = 1;
> > +		else
> > +			sw->sched_flush_count++;
> > +	} else {
> > +		if (sw->sched_flush_count)
> > +			sw->sched_flush_count--;
> > +		else
> > +			sw->sched_min_burst = SCHED_MIN_BURST_SIZE;
> >  	}
> >
> >  }
> > --
> > 2.17.1
  
Radu Nicolau Sept. 24, 2020, 3:27 p.m. UTC | #3
On 9/24/2020 12:10 AM, Honnappa Nagarahalli wrote:

> <snip>
>>> Add minimum burst throughout the scheduler pipeline and a flush counter.
>>> Replace ring API calls with local single threaded implementation where
>>> possible.
>>>
>>> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
>>
>> Thanks for the patch, a few comments inline.
>>
<snip>
>>> +/* single object enq and deq for non MT ring */
>>> +static __rte_always_inline void
>>> +sw_nonmt_ring_dequeue(struct rte_ring *r, void **obj)
>>> +{
>>> +	if ((r->prod.tail - r->cons.tail) < 1)
>>> +		return;
>>> +	void **ring = (void **)&r[1];
>>> +	*obj = ring[r->cons.tail & r->mask];
>>> +	r->cons.tail++;
>>> +}
>>> +static __rte_always_inline int
>>> +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj)
>>> +{
>>> +	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
>>> +		return 0;
>>> +	void **ring = (void **)&r[1];
>>> +	ring[r->prod.tail & r->mask] = obj;
>>> +	r->prod.tail++;
>>> +	return 1;
>>> +}
>>> +
> Why not make these APIs part of the rte_ring library? You could further optimize them by keeping the indices on the same cacheline.

I'm not sure there is any need for non thread-safe rings outside this particular case.
  
Honnappa Nagarahalli Sept. 24, 2020, 5:38 p.m. UTC | #4
<snip>
>> Why not make these APIs part of the rte_ring library? You could further optimize them by keeping the indices on the same cacheline.
> I'm not sure there is any need for non thread-safe rings outside this particular case.

[Honnappa] I think if we add the APIs, we will find the use cases.

But, more than that, I understand that rte_ring structure is exposed to the application. The reason for doing that is the inline functions that rte_ring provides. IMO, we should still maintain modularity and should not use the internals of the rte_ring structure outside of the library.
  
Ananyev, Konstantin Sept. 24, 2020, 6:02 p.m. UTC | #5
<snip>
>>> Why not make these APIs part of the rte_ring library? You could further optimize them by keeping the indices on the same cacheline.
>> I'm not sure there is any need for non thread-safe rings outside this particular case.
> [Honnappa] I think if we add the APIs, we will find the use cases.
> But, more than that, I understand that rte_ring structure is exposed to the application. The reason for doing that is the inline functions that rte_ring provides. IMO, we should still maintain modularity and should not use the internals of the rte_ring structure outside of the library.

+1 to that.

BTW, is there any real perf benefit from such micro-optimisation?
  
Bruce Richardson Sept. 25, 2020, 10:28 a.m. UTC | #6
On Thu, Sep 24, 2020 at 06:02:48PM +0000, Ananyev, Konstantin wrote:
> 
> 
> 
> <snip>
> 
> Add minimum burst throughout the scheduler pipeline and a flush counter.
> Replace ring API calls with local single threaded implementation where
> possible.
> 
> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> 
> Thanks for the patch, a few comments inline.
> 
> ---
>  drivers/event/sw/sw_evdev.h           | 11 +++-
>  drivers/event/sw/sw_evdev_scheduler.c | 83
> +++++++++++++++++++++++----
>  2 files changed, 81 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
> index 7c77b2495..95e51065f 100644
> --- a/drivers/event/sw/sw_evdev.h
> +++ b/drivers/event/sw/sw_evdev.h
> @@ -29,7 +29,13 @@
>  /* report dequeue burst sizes in buckets */  #define
> SW_DEQ_STAT_BUCKET_SHIFT 2
>  /* how many packets pulled from port by sched */ -#define
> SCHED_DEQUEUE_BURST_SIZE 32
> +#define SCHED_DEQUEUE_BURST_SIZE 64
> +
> +#define SCHED_MIN_BURST_SIZE 8
> +#define SCHED_NO_ENQ_CYCLE_FLUSH 256
> +/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
> +#define SCHED_REFILL_ONCE_PER_CALL 1
> 
> Is it possible to make the above #define a runtime option?
> Eg, --vdev event_sw,refill_iter=1
> 
> That would allow packaged versions of DPDK to be usable in both modes.
> 
> +
> 
>  #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our
> history list */  #define NUM_SAMPLES 64 /* how many data points use
> for average stats */ @@ -214,6 +220,9 @@ struct sw_evdev {
>      uint32_t xstats_count_mode_port;
>      uint32_t xstats_count_mode_queue;
> 
> +    uint16_t sched_flush_count;
> +    uint16_t sched_min_burst;
> +
>      /* Contains all ports - load balanced and directed */
>      struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
> 
> diff --git a/drivers/event/sw/sw_evdev_scheduler.c
> b/drivers/event/sw/sw_evdev_scheduler.c
> index cff747da8..ca6d1caff 100644
> --- a/drivers/event/sw/sw_evdev_scheduler.c
> +++ b/drivers/event/sw/sw_evdev_scheduler.c
> @@ -26,6 +26,29 @@
>  /* use cheap bit mixing, we only need to lose a few bits */  #define
> SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
> 
> +
> +/* single object enq and deq for non MT ring */ static
> +__rte_always_inline void sw_nonmt_ring_dequeue(struct rte_ring *r,
> +void **obj) {
> +    if ((r->prod.tail - r->cons.tail) < 1)
> +            return;
> +    void **ring = (void **)&r[1];
> +    *obj = ring[r->cons.tail & r->mask];
> +    r->cons.tail++;
> +}
> +static __rte_always_inline int
> +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj) {
> +    if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
> +            return 0;
> +    void **ring = (void **)&r[1];
> +    ring[r->prod.tail & r->mask] = obj;
> +    r->prod.tail++;
> +    return 1;
> +
> Why not make these APIs part of the rte_ring library? You could further optimize them by keeping the indices on the same cacheline.
> I'm not sure there is any need for non thread-safe rings outside this particular case.
> [Honnappa] I think if we add the APIs, we will find the use cases.
> But, more than that, I understand that rte_ring structure is exposed to the application. The reason for doing that is the inline functions that rte_ring provides. IMO, we should still maintain modularity and should not use the internals of the rte_ring structure outside of the library.
> 
> +1 to that.
> 
> BTW, is there any real perf benefit from such micor-optimisation?

I'd tend to view these as use-case specific, and I'm not sure we should
clutter up the ring library with yet more functions, especially since they
can't be mixed with the existing enqueue/dequeue functions, since they
don't use the head pointers.
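
To spell out the mixing problem with a small sketch (illustration only, not
proposed code; it assumes the helpers from the patch and <rte_ring.h>): the
regular enqueue path derives free space from r->prod.head, which the non-MT
helper never advances.

static void
mixing_would_break(struct rte_ring *r, void *a, void *b)
{
	/* advances r->prod.tail only, r->prod.head is left behind */
	sw_nonmt_ring_enqueue(r, a);

	/* computes free space from the now-stale r->prod.head, so it can
	 * hand out the slot that the call above already filled
	 */
	rte_ring_enqueue(r, b);
}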
  
Honnappa Nagarahalli Sept. 28, 2020, 4:02 p.m. UTC | #7
<snip>
> > Add minimum burst throughout the scheduler pipeline and a flush counter.
> > Replace ring API calls with local single threaded implementation where
> > possible.
> >
> > Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> >
> > Thanks for the patch, a few comments inline.
> >
> > ---
> >  drivers/event/sw/sw_evdev.h           | 11 +++-
> > drivers/event/sw/sw_evdev_scheduler.c | 83
> > +++++++++++++++++++++++----
> >  2 files changed, 81 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
> > index 7c77b2495..95e51065f 100644
> > --- a/drivers/event/sw/sw_evdev.h
> > +++ b/drivers/event/sw/sw_evdev.h
> > @@ -29,7 +29,13 @@
> >  /* report dequeue burst sizes in buckets */  #define
> > SW_DEQ_STAT_BUCKET_SHIFT 2
> >  /* how many packets pulled from port by sched */ -#define
> > SCHED_DEQUEUE_BURST_SIZE 32
> > +#define SCHED_DEQUEUE_BURST_SIZE 64
> > +
> > +#define SCHED_MIN_BURST_SIZE 8
> > +#define SCHED_NO_ENQ_CYCLE_FLUSH 256
> > +/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
> > +#define SCHED_REFILL_ONCE_PER_CALL 1
> >
> > Is it possible to make the above #define a runtime option?
> > Eg, --vdev event_sw,refill_iter=1
> >
> > That would allow packaged versions of DPDK to be usable in both modes.
> >
> > +
> >
> >  #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our
> > history list */  #define NUM_SAMPLES 64 /* how many data points use
> > for average stats */ @@ -214,6 +220,9 @@ struct sw_evdev {
> > 	uint32_t xstats_count_mode_port;
> > 	uint32_t xstats_count_mode_queue;
> >
> > +	uint16_t sched_flush_count;
> > +	uint16_t sched_min_burst;
> > +
> > 	/* Contains all ports - load balanced and directed */
> > 	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
> >
> > diff --git a/drivers/event/sw/sw_evdev_scheduler.c
> > b/drivers/event/sw/sw_evdev_scheduler.c
> > index cff747da8..ca6d1caff 100644
> > --- a/drivers/event/sw/sw_evdev_scheduler.c
> > +++ b/drivers/event/sw/sw_evdev_scheduler.c
> > @@ -26,6 +26,29 @@
> >  /* use cheap bit mixing, we only need to lose a few bits */  #define
> > SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
> >
> > +
> > +/* single object enq and deq for non MT ring */ static
> > +__rte_always_inline void sw_nonmt_ring_dequeue(struct rte_ring *r,
> > +void **obj) {
> > +	if ((r->prod.tail - r->cons.tail) < 1)
> > +		return;
> > +	void **ring = (void **)&r[1];
> > +	*obj = ring[r->cons.tail & r->mask];
> > +	r->cons.tail++;
> > +}
> > +static __rte_always_inline int
> > +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj) {
> > +	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
> > +		return 0;
> > +	void **ring = (void **)&r[1];
> > +	ring[r->prod.tail & r->mask] = obj;
> > +	r->prod.tail++;
> > +	return 1;
> > +
> > Why not make these APIs part of the rte_ring library? You could further
> optimize them by keeping the indices on the same cacheline.
> > I'm not sure there is any need for non thread-safe rings outside this
> particular case.
> > [Honnappa] I think if we add the APIs, we will find the use cases.
> > But, more than that, I understand that rte_ring structure is exposed to the
> application. The reason for doing that is the inline functions that rte_ring
> provides. IMO, we should still maintain modularity and should not use the
> internals of the rte_ring structure outside of the library.
> >
> > +1 to that.
> >
> > BTW, is there any real perf benefit from such micor-optimisation?
> 
> I'd tend to view these as use-case specific, and I'm not sure we should clutter
> up the ring library with yet more functions, especially since they can't be
> mixed with the existing enqueue/dequeue functions, since they don't use
> the head pointers.
IMO, the ring library is pretty organized with the recent addition of HTS/RTS modes. This can be one of the modes and should allow us to use the existing functions (though additional functions are required as well).
The other concern I have is, this implementation can be further optimized by using a single cache line for the pointers. It uses 2 cache lines just because of the layout of the rte_ring structure.
There was a question earlier about the performance improvements of this patch? Are there any % performance improvements that can be shared?
It is also possible to change the above functions to use the head/tail pointers from producer or the consumer cache line alone to check for perf differences.
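
As a sketch of that experiment (illustration only, not tested, and the rest
of the driver would have to stop reading this ring through the normal API):
reuse only the producer cache line of the existing rte_ring, with prod.head
acting as the consumer index and prod.tail as the producer index, so both
indices sit on one cache line.

static __rte_always_inline int
sw_nonmt_ring_enqueue_1cl(struct rte_ring *r, void *obj)
{
	/* prod.head is reused as the consumer index in this variant */
	if ((r->capacity + r->prod.head - r->prod.tail) < 1)
		return 0;
	void **ring = (void **)&r[1];
	ring[r->prod.tail & r->mask] = obj;
	r->prod.tail++;
	return 1;
}

static __rte_always_inline void
sw_nonmt_ring_dequeue_1cl(struct rte_ring *r, void **obj)
{
	if ((r->prod.tail - r->prod.head) < 1)
		return;
	void **ring = (void **)&r[1];
	*obj = ring[r->prod.head & r->mask];
	r->prod.head++;
}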
  
Radu Nicolau Sept. 29, 2020, 9:02 a.m. UTC | #8
On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
> <snip>
>>> Add minimum burst throughout the scheduler pipeline and a flush counter.
>>> Replace ring API calls with local single threaded implementation where
>>> possible.
>>>
>>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
>>>
>>> Thanks for the patch, a few comments inline.
>>>
>>> ---
>>>  drivers/event/sw/sw_evdev.h           | 11 +++-
>>> drivers/event/sw/sw_evdev_scheduler.c | 83
>>> +++++++++++++++++++++++----
>>>  2 files changed, 81 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
>>> index 7c77b2495..95e51065f 100644
>>> --- a/drivers/event/sw/sw_evdev.h
>>> +++ b/drivers/event/sw/sw_evdev.h
>>> @@ -29,7 +29,13 @@
>>>  /* report dequeue burst sizes in buckets */  #define
>>> SW_DEQ_STAT_BUCKET_SHIFT 2
>>>  /* how many packets pulled from port by sched */ -#define
>>> SCHED_DEQUEUE_BURST_SIZE 32
>>> +#define SCHED_DEQUEUE_BURST_SIZE 64
>>> +
>>> +#define SCHED_MIN_BURST_SIZE 8
>>> +#define SCHED_NO_ENQ_CYCLE_FLUSH 256
>>> +/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
>>> +#define SCHED_REFILL_ONCE_PER_CALL 1
>>>
>>> Is it possible to make the above #define a runtime option?
>>> Eg, --vdev event_sw,refill_iter=1
>>>
>>> That would allow packaged versions of DPDK to be usable in both modes.
>>>
>>> +
>>>
>>>  #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our
>>> history list */  #define NUM_SAMPLES 64 /* how many data points use
>>> for average stats */ @@ -214,6 +220,9 @@ struct sw_evdev {
>>> 	uint32_t xstats_count_mode_port;
>>> 	uint32_t xstats_count_mode_queue;
>>>
>>> +	uint16_t sched_flush_count;
>>> +	uint16_t sched_min_burst;
>>> +
>>> 	/* Contains all ports - load balanced and directed */
>>> 	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
>>>
>>> diff --git a/drivers/event/sw/sw_evdev_scheduler.c
>>> b/drivers/event/sw/sw_evdev_scheduler.c
>>> index cff747da8..ca6d1caff 100644
>>> --- a/drivers/event/sw/sw_evdev_scheduler.c
>>> +++ b/drivers/event/sw/sw_evdev_scheduler.c
>>> @@ -26,6 +26,29 @@
>>>  /* use cheap bit mixing, we only need to lose a few bits */  #define
>>> SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
>>>
>>> +
>>> +/* single object enq and deq for non MT ring */ static
>>> +__rte_always_inline void sw_nonmt_ring_dequeue(struct rte_ring *r,
>>> +void **obj) {
>>> +	if ((r->prod.tail - r->cons.tail) < 1)
>>> +		return;
>>> +	void **ring = (void **)&r[1];
>>> +	*obj = ring[r->cons.tail & r->mask];
>>> +	r->cons.tail++;
>>> +}
>>> +static __rte_always_inline int
>>> +sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj) {
>>> +	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
>>> +		return 0;
>>> +	void **ring = (void **)&r[1];
>>> +	ring[r->prod.tail & r->mask] = obj;
>>> +	r->prod.tail++;
>>> +	return 1;
>>> +
>>> Why not make these APIs part of the rte_ring library? You could further
>> optimize them by keeping the indices on the same cacheline.
>>> I'm not sure there is any need for non thread-safe rings outside this
>> particular case.
>>> [Honnappa] I think if we add the APIs, we will find the use cases.
>>> But, more than that, I understand that rte_ring structure is exposed to the
>> application. The reason for doing that is the inline functions that rte_ring
>> provides. IMO, we should still maintain modularity and should not use the
>> internals of the rte_ring structure outside of the library.
>>> +1 to that.
>>>
>>> BTW, is there any real perf benefit from such micor-optimisation?
>> I'd tend to view these as use-case specific, and I'm not sure we should clutter
>> up the ring library with yet more functions, especially since they can't be
>> mixed with the existing enqueue/dequeue functions, since they don't use
>> the head pointers.
> IMO, the ring library is pretty organized with the recent addition of HTS/RTS modes. This can be one of the modes and should allow us to use the existing functions (though additional functions are required as well).
> The other concern I have is, this implementation can be further optimized by using a single cache line for the pointers. It uses 2 cache lines just because of the layout of the rte_ring structure.
> There was a question earlier about the performance improvements of this patch? Are there any % performance improvements that can be shared?
> It is also possible to change the above functions to use the head/tail pointers from producer or the consumer cache line alone to check for perf differences.

I don't have a % for the final improvement for this change alone, but 
there was some improvement in the memory overhead measurable during 
development, which very likely resulted in the whole optimization having 
more headroom.

I agree that this may be further optimized, maybe by having a local 
implementation of a ring-like container instead.
  
Jerin Jacob Oct. 5, 2020, 4:35 p.m. UTC | #9
On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
>
>
> On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
> > <snip>
> >>> Add minimum burst throughout the scheduler pipeline and a flush counter.
> >>> Replace ring API calls with local single threaded implementation where
> >>> possible.
> >>>
> >>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> >>>
> >>> Thanks for the patch, a few comments inline.
> >>>

> >>> Why not make these APIs part of the rte_ring library? You could further
> >> optimize them by keeping the indices on the same cacheline.
> >>> I'm not sure there is any need for non thread-safe rings outside this
> >> particular case.
> >>> [Honnappa] I think if we add the APIs, we will find the use cases.
> >>> But, more than that, I understand that rte_ring structure is exposed to the
> >> application. The reason for doing that is the inline functions that rte_ring
> >> provides. IMO, we should still maintain modularity and should not use the
> >> internals of the rte_ring structure outside of the library.
> >>> +1 to that.
> >>>
> >>> BTW, is there any real perf benefit from such micor-optimisation?
> >> I'd tend to view these as use-case specific, and I'm not sure we should clutter
> >> up the ring library with yet more functions, especially since they can't be
> >> mixed with the existing enqueue/dequeue functions, since they don't use
> >> the head pointers.
> > IMO, the ring library is pretty organized with the recent addition of HTS/RTS modes. This can be one of the modes and should allow us to use the existing functions (though additional functions are required as well).
> > The other concern I have is, this implementation can be further optimized by using a single cache line for the pointers. It uses 2 cache lines just because of the layout of the rte_ring structure.
> > There was a question earlier about the performance improvements of this patch? Are there any % performance improvements that can be shared?
> > It is also possible to change the above functions to use the head/tail pointers from producer or the consumer cache line alone to check for perf differences.
>
> I don't have a % for the final improvement for this change alone, but
> there was some improvement in the memory overhead measurable during
> development, which very likely resulted in the whole optimization having
> more headroom.
>
> I agree that this may be further optimized, maybe by having a local
> implementation of a ring-like container instead.

Have we decided on the next steps for this patch? Is the plan to
supersede this patch and have a different one in the rte_ring subsystem?


>
  
Van Haaren, Harry Oct. 6, 2020, 7:59 a.m. UTC | #10
> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Monday, October 5, 2020 5:35 PM
> To: Nicolau, Radu <radu.nicolau@intel.com>
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> 
> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> >
> >
> > On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
> > > <snip>
> > >>> Add minimum burst throughout the scheduler pipeline and a flush counter.
> > >>> Replace ring API calls with local single threaded implementation where
> > >>> possible.
> > >>>
> > >>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> > >>>
> > >>> Thanks for the patch, a few comments inline.
> > >>>
> 
> > >>> Why not make these APIs part of the rte_ring library? You could further
> > >> optimize them by keeping the indices on the same cacheline.
> > >>> I'm not sure there is any need for non thread-safe rings outside this
> > >> particular case.
> > >>> [Honnappa] I think if we add the APIs, we will find the use cases.
> > >>> But, more than that, I understand that rte_ring structure is exposed to the
> > >> application. The reason for doing that is the inline functions that rte_ring
> > >> provides. IMO, we should still maintain modularity and should not use the
> > >> internals of the rte_ring structure outside of the library.
> > >>> +1 to that.
> > >>>
> > >>> BTW, is there any real perf benefit from such micor-optimisation?
> > >> I'd tend to view these as use-case specific, and I'm not sure we should clutter
> > >> up the ring library with yet more functions, especially since they can't be
> > >> mixed with the existing enqueue/dequeue functions, since they don't use
> > >> the head pointers.
> > > IMO, the ring library is pretty organized with the recent addition of HTS/RTS
> modes. This can be one of the modes and should allow us to use the existing
> functions (though additional functions are required as well).
> > > The other concern I have is, this implementation can be further optimized by
> using a single cache line for the pointers. It uses 2 cache lines just because of the
> layout of the rte_ring structure.
> > > There was a question earlier about the performance improvements of this
> patch? Are there any % performance improvements that can be shared?
> > > It is also possible to change the above functions to use the head/tail pointers
> from producer or the consumer cache line alone to check for perf differences.
> >
> > I don't have a % for the final improvement for this change alone, but
> > there was some improvement in the memory overhead measurable during
> > development, which very likely resulted in the whole optimization having
> > more headroom.
> >
> > I agree that this may be further optimized, maybe by having a local
> > implementation of a ring-like container instead.
> 
> Have we decided on the next steps for this patch? Is the plan to
> supersede this patch and have different
> one in rte_ring subsystem,

My preference is to merge this version of the patch;
1) The ring helper functions are stripped down to the SW PMD usage, and are not valid for general use.
2) Adding static inline APIs in an LTS release without extensive testing doesn't seem a good idea.

If Honnappa is OK with the above solution for 20.11, we can see about moving the rings part of the
code to rte_ring library location in 21.02, and give ourselves some time to settle the usage/API before
the next LTS.

Regards, -Harry
  
Ananyev, Konstantin Oct. 6, 2020, 10:13 a.m. UTC | #11
> 
> > -----Original Message-----
> > From: Jerin Jacob <jerinjacobk@gmail.com>
> > Sent: Monday, October 5, 2020 5:35 PM
> > To: Nicolau, Radu <radu.nicolau@intel.com>
> > Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; Van Haaren, Harry
> > <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> > <nd@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> >
> > On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > >
> > >
> > > On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
> > > > <snip>
> > > >>> Add minimum burst throughout the scheduler pipeline and a flush counter.
> > > >>> Replace ring API calls with local single threaded implementation where
> > > >>> possible.
> > > >>>
> > > >>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> > > >>>
> > > >>> Thanks for the patch, a few comments inline.
> > > >>>
> >
> > > >>> Why not make these APIs part of the rte_ring library? You could further
> > > >> optimize them by keeping the indices on the same cacheline.
> > > >>> I'm not sure there is any need for non thread-safe rings outside this
> > > >> particular case.
> > > >>> [Honnappa] I think if we add the APIs, we will find the use cases.
> > > >>> But, more than that, I understand that rte_ring structure is exposed to the
> > > >> application. The reason for doing that is the inline functions that rte_ring
> > > >> provides. IMO, we should still maintain modularity and should not use the
> > > >> internals of the rte_ring structure outside of the library.
> > > >>> +1 to that.
> > > >>>
> > > >>> BTW, is there any real perf benefit from such micor-optimisation?
> > > >> I'd tend to view these as use-case specific, and I'm not sure we should clutter
> > > >> up the ring library with yet more functions, especially since they can't be
> > > >> mixed with the existing enqueue/dequeue functions, since they don't use
> > > >> the head pointers.
> > > > IMO, the ring library is pretty organized with the recent addition of HTS/RTS
> > modes. This can be one of the modes and should allow us to use the existing
> > functions (though additional functions are required as well).
> > > > The other concern I have is, this implementation can be further optimized by
> > using a single cache line for the pointers. It uses 2 cache lines just because of the
> > layout of the rte_ring structure.
> > > > There was a question earlier about the performance improvements of this
> > patch? Are there any % performance improvements that can be shared?
> > > > It is also possible to change the above functions to use the head/tail pointers
> > from producer or the consumer cache line alone to check for perf differences.
> > >
> > > I don't have a % for the final improvement for this change alone, but
> > > there was some improvement in the memory overhead measurable during
> > > development, which very likely resulted in the whole optimization having
> > > more headroom.
> > >
> > > I agree that this may be further optimized, maybe by having a local
> > > implementation of a ring-like container instead.
> >
> > Have we decided on the next steps for this patch? Is the plan to
> > supersede this patch and have different
> > one in rte_ring subsystem,
> 
> My preference is to merge this version of the patch;
> 1) The ring helper functions are stripped to the SW PMD usage, and not valid to use in the general.
> 2) Adding static inline APIs in an LTS without extensive doesn't seem a good idea.
> 
> If Honnappa is OK with the above solution for 20.11, we can see about moving the rings part of the
> code to rte_ring library location in 21.02, and give ourselves some time to settle the usage/API before
> the next LTS.
> 

As ring library maintainer I share Honnappa's concern that another library does not use the public ring API,
but instead accesses ring internals directly. Obviously such a coding practice is not welcomed,
as it makes it harder to maintain/extend the ring library in future.
About 2) - these new APIs can (/should) be marked as experimental anyway.
As another thing - it is still unclear what performance gain we are talking about here.
Is it really worth it compared to just using SP/SC?
  
Radu Nicolau Oct. 7, 2020, 10:44 a.m. UTC | #12
On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
>>> -----Original Message-----
>>> From: Jerin Jacob <jerinjacobk@gmail.com>
>>> Sent: Monday, October 5, 2020 5:35 PM
>>> To: Nicolau, Radu <radu.nicolau@intel.com>
>>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
>>> <bruce.richardson@intel.com>; Ananyev, Konstantin
>>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
>>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
>>> <nd@arm.com>
>>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
>>>
>>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
>>>>
>>>> On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
>>>>> <snip>
>>>>>>> Add minimum burst throughout the scheduler pipeline and a flush counter.
>>>>>>> Replace ring API calls with local single threaded implementation where
>>>>>>> possible.
>>>>>>>
>>>>>>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
>>>>>>>
>>>>>>> Thanks for the patch, a few comments inline.
>>>>>>>
>>>>>>> Why not make these APIs part of the rte_ring library? You could further
>>>>>> optimize them by keeping the indices on the same cacheline.
>>>>>>> I'm not sure there is any need for non thread-safe rings outside this
>>>>>> particular case.
>>>>>>> [Honnappa] I think if we add the APIs, we will find the use cases.
>>>>>>> But, more than that, I understand that rte_ring structure is exposed to the
>>>>>> application. The reason for doing that is the inline functions that rte_ring
>>>>>> provides. IMO, we should still maintain modularity and should not use the
>>>>>> internals of the rte_ring structure outside of the library.
>>>>>>> +1 to that.
>>>>>>>
>>>>>>> BTW, is there any real perf benefit from such micor-optimisation?
>>>>>> I'd tend to view these as use-case specific, and I'm not sure we should clutter
>>>>>> up the ring library with yet more functions, especially since they can't be
>>>>>> mixed with the existing enqueue/dequeue functions, since they don't use
>>>>>> the head pointers.
>>>>> IMO, the ring library is pretty organized with the recent addition of HTS/RTS
>>> modes. This can be one of the modes and should allow us to use the existing
>>> functions (though additional functions are required as well).
>>>>> The other concern I have is, this implementation can be further optimized by
>>> using a single cache line for the pointers. It uses 2 cache lines just because of the
>>> layout of the rte_ring structure.
>>>>> There was a question earlier about the performance improvements of this
>>> patch? Are there any % performance improvements that can be shared?
>>>>> It is also possible to change the above functions to use the head/tail pointers
>>> from producer or the consumer cache line alone to check for perf differences.
>>>> I don't have a % for the final improvement for this change alone, but
>>>> there was some improvement in the memory overhead measurable during
>>>> development, which very likely resulted in the whole optimization having
>>>> more headroom.
>>>>
>>>> I agree that this may be further optimized, maybe by having a local
>>>> implementation of a ring-like container instead.
>>> Have we decided on the next steps for this patch? Is the plan to
>>> supersede this patch and have different
>>> one in rte_ring subsystem,
>> My preference is to merge this version of the patch;
>> 1) The ring helper functions are stripped to the SW PMD usage, and not valid to use in the general.
>> 2) Adding static inline APIs in an LTS without extensive doesn't seem a good idea.
>>
>> If Honnappa is OK with the above solution for 20.11, we can see about moving the rings part of the
>> code to rte_ring library location in 21.02, and give ourselves some time to settle the usage/API before
>> the next LTS.
>>
> As ring library maintainer I share Honnappa concern that another library not uses public ring API,
> but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> as it makes harder to maintain/extend ring library in future.
> About 2) - these new API can(/shoud) be marked an experimental anyway.
> As another thing - it is still unclear what a performance gain we are talking about here.
> Is it really worth it comparing to just using SP/SC?

The change itself came after I analyzed the memory bound sections of the
code, and I just did a quick test: I got about 3.5% improvement in
throughput; maybe not so much, but significant for such a small change,
and depending on the use case it may be more.

As for the implementation itself, I would favour having a custom ring-like
container in the PMD code; this would solve the issue of using rte_ring
internals while still allowing for full optimisation. If this is
acceptable, I will follow up by tomorrow.
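
For reference, a minimal sketch of what such a local single-threaded
ring-like container could look like (names and layout are illustrative
only, this is not the follow-up implementation): both indices and the mask
live on one cache line, with the object storage immediately after the
header.

#include <stdint.h>
#include <rte_common.h>
#include <rte_memory.h>

/* PMD-private, single-threaded ring-like container */
struct sw_st_ring {
	uint32_t prod;       /* producer index */
	uint32_t cons;       /* consumer index */
	uint32_t mask;       /* size - 1, size is a power of two */
	uint32_t capacity;
	void *elems[];       /* object storage follows the header */
} __rte_cache_aligned;

static __rte_always_inline int
sw_st_ring_enqueue(struct sw_st_ring *r, void *obj)
{
	if ((r->capacity + r->cons - r->prod) < 1)
		return 0;	/* full */
	r->elems[r->prod++ & r->mask] = obj;
	return 1;
}

static __rte_always_inline int
sw_st_ring_dequeue(struct sw_st_ring *r, void **obj)
{
	if (r->prod == r->cons)
		return 0;	/* empty */
	*obj = r->elems[r->cons++ & r->mask];
	return 1;
}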
  
Ananyev, Konstantin Oct. 7, 2020, 10:52 a.m. UTC | #13
> On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
> >>> -----Original Message-----
> >>> From: Jerin Jacob <jerinjacobk@gmail.com>
> >>> Sent: Monday, October 5, 2020 5:35 PM
> >>> To: Nicolau, Radu <radu.nicolau@intel.com>
> >>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> >>> <bruce.richardson@intel.com>; Ananyev, Konstantin
> >>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> >>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> >>> <nd@arm.com>
> >>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> >>>
> >>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> >>>>
> >>>> On 9/28/2020 5:02 PM, Honnappa Nagarahalli wrote:
> >>>>> <snip>
> >>>>>>> Add minimum burst throughout the scheduler pipeline and a flush counter.
> >>>>>>> Replace ring API calls with local single threaded implementation where
> >>>>>>> possible.
> >>>>>>>
> >>>>>>> Signed-off-by: Radu Nicolau mailto:radu.nicolau@intel.com
> >>>>>>>
> >>>>>>> Thanks for the patch, a few comments inline.
> >>>>>>>
> >>>>>>> Why not make these APIs part of the rte_ring library? You could further
> >>>>>> optimize them by keeping the indices on the same cacheline.
> >>>>>>> I'm not sure there is any need for non thread-safe rings outside this
> >>>>>> particular case.
> >>>>>>> [Honnappa] I think if we add the APIs, we will find the use cases.
> >>>>>>> But, more than that, I understand that rte_ring structure is exposed to the
> >>>>>> application. The reason for doing that is the inline functions that rte_ring
> >>>>>> provides. IMO, we should still maintain modularity and should not use the
> >>>>>> internals of the rte_ring structure outside of the library.
> >>>>>>> +1 to that.
> >>>>>>>
> >>>>>>> BTW, is there any real perf benefit from such micor-optimisation?
> >>>>>> I'd tend to view these as use-case specific, and I'm not sure we should clutter
> >>>>>> up the ring library with yet more functions, especially since they can't be
> >>>>>> mixed with the existing enqueue/dequeue functions, since they don't use
> >>>>>> the head pointers.
> >>>>> IMO, the ring library is pretty organized with the recent addition of HTS/RTS
> >>> modes. This can be one of the modes and should allow us to use the existing
> >>> functions (though additional functions are required as well).
> >>>>> The other concern I have is, this implementation can be further optimized by
> >>> using a single cache line for the pointers. It uses 2 cache lines just because of the
> >>> layout of the rte_ring structure.
> >>>>> There was a question earlier about the performance improvements of this
> >>> patch? Are there any % performance improvements that can be shared?
> >>>>> It is also possible to change the above functions to use the head/tail pointers
> >>> from producer or the consumer cache line alone to check for perf differences.
> >>>> I don't have a % for the final improvement for this change alone, but
> >>>> there was some improvement in the memory overhead measurable during
> >>>> development, which very likely resulted in the whole optimization having
> >>>> more headroom.
> >>>>
> >>>> I agree that this may be further optimized, maybe by having a local
> >>>> implementation of a ring-like container instead.
> >>> Have we decided on the next steps for this patch? Is the plan to
> >>> supersede this patch and have different
> >>> one in rte_ring subsystem,
> >> My preference is to merge this version of the patch;
> >> 1) The ring helper functions are stripped to the SW PMD usage, and not valid to use in the general.
> >> 2) Adding static inline APIs in an LTS without extensive doesn't seem a good idea.
> >>
> >> If Honnappa is OK with the above solution for 20.11, we can see about moving the rings part of the
> >> code to rte_ring library location in 21.02, and give ourselves some time to settle the usage/API before
> >> the next LTS.
> >>
> > As ring library maintainer I share Honnappa concern that another library not uses public ring API,
> > but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> > as it makes harder to maintain/extend ring library in future.
> > About 2) - these new API can(/shoud) be marked an experimental anyway.
> > As another thing - it is still unclear what a performance gain we are talking about here.
> > Is it really worth it comparing to just using SP/SC?
> 
> The change itself came after I analyzed the memory bound sections of the
> code, and I just did a quick test, I got about 3.5% improvement in
> throughput,  maybe not so much but significant for such a small change,
> and depending on the usecase it may be more.
> 
> As for the implementation itself, I would favour having a custom ring
> like container in the PMD code, this will solve the issue of using
> rte_ring internals while still allow for full optimisation. If this is
> acceptable, I will follow up by tomorrow.

Sounds ok to me.
Thanks
Konstantin
  
Jerin Jacob Oct. 13, 2020, 7:11 p.m. UTC | #14
On Wed, Oct 7, 2020 at 4:22 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
>
> > On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
> > >>> -----Original Message-----
> > >>> From: Jerin Jacob <jerinjacobk@gmail.com>
> > >>> Sent: Monday, October 5, 2020 5:35 PM
> > >>> To: Nicolau, Radu <radu.nicolau@intel.com>
> > >>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> > >>> <bruce.richardson@intel.com>; Ananyev, Konstantin
> > >>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> > >>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> > >>> <nd@arm.com>
> > >>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> > >>>
> > >>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > >>>>
a concern that another library not uses public ring API,
> > > but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> > > as it makes harder to maintain/extend ring library in future.
> > > About 2) - these new API can(/shoud) be marked an experimental anyway.
> > > As another thing - it is still unclear what a performance gain we are talking about here.
> > > Is it really worth it comparing to just using SP/SC?
> >
> > The change itself came after I analyzed the memory bound sections of the
> > code, and I just did a quick test, I got about 3.5% improvement in
> > throughput,  maybe not so much but significant for such a small change,
> > and depending on the usecase it may be more.
> >
> > As for the implementation itself, I would favour having a custom ring
> > like container in the PMD code, this will solve the issue of using
> > rte_ring internals while still allow for full optimisation. If this is
> > acceptable, I will follow up by tomorrow.
>
> Sounds ok to me.

Nicolau Radu,

Could you supersede this patch if the plan is to send a new
version based on a custom ring?

> Thanks
> Konstantin
>
  
Radu Nicolau Oct. 14, 2020, 8:32 a.m. UTC | #15
On 10/13/2020 8:11 PM, Jerin Jacob wrote:
> On Wed, Oct 7, 2020 at 4:22 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
>>> On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
>>>>>> -----Original Message-----
>>>>>> From: Jerin Jacob <jerinjacobk@gmail.com>
>>>>>> Sent: Monday, October 5, 2020 5:35 PM
>>>>>> To: Nicolau, Radu <radu.nicolau@intel.com>
>>>>>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
>>>>>> <bruce.richardson@intel.com>; Ananyev, Konstantin
>>>>>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
>>>>>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
>>>>>> <nd@arm.com>
>>>>>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
>>>>>>
>>>>>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> a concern that another library not uses public ring API,
>>>> but instead accesses ring internals directly. Obviously such coding practice is not welcomed
>>>> as it makes harder to maintain/extend ring library in future.
>>>> About 2) - these new API can(/shoud) be marked an experimental anyway.
>>>> As another thing - it is still unclear what a performance gain we are talking about here.
>>>> Is it really worth it comparing to just using SP/SC?
>>> The change itself came after I analyzed the memory bound sections of the
>>> code, and I just did a quick test, I got about 3.5% improvement in
>>> throughput,  maybe not so much but significant for such a small change,
>>> and depending on the usecase it may be more.
>>>
>>> As for the implementation itself, I would favour having a custom ring
>>> like container in the PMD code, this will solve the issue of using
>>> rte_ring internals while still allow for full optimisation. If this is
>>> acceptable, I will follow up by tomorrow.
>> Sounds ok to me.
> Nicolau Radu,
>
> Could you supersede this patch, if the plan is to send it to a new
> version based on a custom ring?
The v3 (https://patchwork.dpdk.org/patch/79879/) sent last week 
implements the custom ring and does not use the rte_ring internals. v1 
and v2 are superseded.
  
Jerin Jacob Oct. 14, 2020, 10:09 a.m. UTC | #16
On Wed, Oct 14, 2020 at 2:02 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
>
>
> On 10/13/2020 8:11 PM, Jerin Jacob wrote:
> > On Wed, Oct 7, 2020 at 4:22 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> >>> On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
> >>>>>> -----Original Message-----
> >>>>>> From: Jerin Jacob <jerinjacobk@gmail.com>
> >>>>>> Sent: Monday, October 5, 2020 5:35 PM
> >>>>>> To: Nicolau, Radu <radu.nicolau@intel.com>
> >>>>>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> >>>>>> <bruce.richardson@intel.com>; Ananyev, Konstantin
> >>>>>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> >>>>>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> >>>>>> <nd@arm.com>
> >>>>>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> >>>>>>
> >>>>>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > a concern that another library not uses public ring API,
> >>>> but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> >>>> as it makes harder to maintain/extend ring library in future.
> >>>> About 2) - these new API can(/shoud) be marked an experimental anyway.
> >>>> As another thing - it is still unclear what a performance gain we are talking about here.
> >>>> Is it really worth it comparing to just using SP/SC?
> >>> The change itself came after I analyzed the memory bound sections of the
> >>> code, and I just did a quick test, I got about 3.5% improvement in
> >>> throughput,  maybe not so much but significant for such a small change,
> >>> and depending on the usecase it may be more.
> >>>
> >>> As for the implementation itself, I would favour having a custom ring
> >>> like container in the PMD code, this will solve the issue of using
> >>> rte_ring internals while still allow for full optimisation. If this is
> >>> acceptable, I will follow up by tomorrow.
> >> Sounds ok to me.
> > Nicolau Radu,
> >
> > Could you supersede this patch, if the plan is to send it to a new
> > version based on a custom ring?
> The v3 (https://patchwork.dpdk.org/patch/79879/) sent last week
> implements the custom ring and does not use the rte_ring internals. v1
> and v2 are superseded.

Ok. Looks good to me. @Honnappa Nagarahalli @Ananyev, Konstantin,
I will merge this patch if there are no more objections to v3.
  
Ananyev, Konstantin Oct. 14, 2020, 10:21 a.m. UTC | #17
> 
> On Wed, Oct 14, 2020 at 2:02 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> >
> >
> > On 10/13/2020 8:11 PM, Jerin Jacob wrote:
> > > On Wed, Oct 7, 2020 at 4:22 PM Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com> wrote:
> > >>> On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
> > >>>>>> -----Original Message-----
> > >>>>>> From: Jerin Jacob <jerinjacobk@gmail.com>
> > >>>>>> Sent: Monday, October 5, 2020 5:35 PM
> > >>>>>> To: Nicolau, Radu <radu.nicolau@intel.com>
> > >>>>>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> > >>>>>> <bruce.richardson@intel.com>; Ananyev, Konstantin
> > >>>>>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> > >>>>>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> > >>>>>> <nd@arm.com>
> > >>>>>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> > >>>>>>
> > >>>>>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > > a concern that another library not uses public ring API,
> > >>>> but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> > >>>> as it makes harder to maintain/extend ring library in future.
> > >>>> About 2) - these new API can(/shoud) be marked an experimental anyway.
> > >>>> As another thing - it is still unclear what a performance gain we are talking about here.
> > >>>> Is it really worth it comparing to just using SP/SC?
> > >>> The change itself came after I analyzed the memory bound sections of the
> > >>> code, and I just did a quick test, I got about 3.5% improvement in
> > >>> throughput,  maybe not so much but significant for such a small change,
> > >>> and depending on the usecase it may be more.
> > >>>
> > >>> As for the implementation itself, I would favour having a custom ring
> > >>> like container in the PMD code, this will solve the issue of using
> > >>> rte_ring internals while still allow for full optimisation. If this is
> > >>> acceptable, I will follow up by tomorrow.
> > >> Sounds ok to me.
> > > Nicolau Radu,
> > >
> > > Could you supersede this patch, if the plan is to send it to a new
> > > version based on a custom ring?
> > The v3 (https://patchwork.dpdk.org/patch/79879/) sent last week
> > implements the custom ring and does not use the rte_ring internals. v1
> > and v2 are superseded.
> 
> Ok. Looks good to me. @Honnappa Nagarahalli @Ananyev, Konstantin  ,
> I will merge this patch if there are no more objections for v3.

No objections from me.
  
Jerin Jacob Oct. 14, 2020, 6:27 p.m. UTC | #18
On Wed, Oct 14, 2020 at 3:51 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
>
>
>
> >
> > On Wed, Oct 14, 2020 at 2:02 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > >
> > >
> > > On 10/13/2020 8:11 PM, Jerin Jacob wrote:
> > > > On Wed, Oct 7, 2020 at 4:22 PM Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com> wrote:
> > > >>> On 10/6/2020 11:13 AM, Ananyev, Konstantin wrote:
> > > >>>>>> -----Original Message-----
> > > >>>>>> From: Jerin Jacob <jerinjacobk@gmail.com>
> > > >>>>>> Sent: Monday, October 5, 2020 5:35 PM
> > > >>>>>> To: Nicolau, Radu <radu.nicolau@intel.com>
> > > >>>>>> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Richardson, Bruce
> > > >>>>>> <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > >>>>>> <konstantin.ananyev@intel.com>; Van Haaren, Harry
> > > >>>>>> <harry.van.haaren@intel.com>; dev@dpdk.org; jerinj@marvell.com; nd
> > > >>>>>> <nd@arm.com>
> > > >>>>>> Subject: Re: [dpdk-dev] [PATCH v1] event/sw: performance improvements
> > > >>>>>>
> > > >>>>>> On Tue, Sep 29, 2020 at 2:32 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
> > > > a concern that another library not uses public ring API,
> > > >>>> but instead accesses ring internals directly. Obviously such coding practice is not welcomed
> > > >>>> as it makes harder to maintain/extend ring library in future.
> > > >>>> About 2) - these new API can(/shoud) be marked an experimental anyway.
> > > >>>> As another thing - it is still unclear what a performance gain we are talking about here.
> > > >>>> Is it really worth it comparing to just using SP/SC?
> > > >>> The change itself came after I analyzed the memory bound sections of the
> > > >>> code, and I just did a quick test, I got about 3.5% improvement in
> > > >>> throughput,  maybe not so much but significant for such a small change,
> > > >>> and depending on the usecase it may be more.
> > > >>>
> > > >>> As for the implementation itself, I would favour having a custom ring
> > > >>> like container in the PMD code, this will solve the issue of using
> > > >>> rte_ring internals while still allow for full optimisation. If this is
> > > >>> acceptable, I will follow up by tomorrow.
> > > >> Sounds ok to me.
> > > > Nicolau Radu,
> > > >
> > > > Could you supersede this patch, if the plan is to send it to a new
> > > > version based on a custom ring?
> > > The v3 (https://patchwork.dpdk.org/patch/79879/) sent last week
> > > implements the custom ring and does not use the rte_ring internals. v1
> > > and v2 are superseded.
> >
> > Ok. Looks good to me. @Honnappa Nagarahalli @Ananyev, Konstantin  ,
> > I will merge this patch if there are no more objections for v3.
>
> No objections from me.


Applied to dpdk-next-eventdev/for-main. Thanks.


>
  

Patch

diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
index 7c77b2495..95e51065f 100644
--- a/drivers/event/sw/sw_evdev.h
+++ b/drivers/event/sw/sw_evdev.h
@@ -29,7 +29,13 @@ 
 /* report dequeue burst sizes in buckets */
 #define SW_DEQ_STAT_BUCKET_SHIFT 2
 /* how many packets pulled from port by sched */
-#define SCHED_DEQUEUE_BURST_SIZE 32
+#define SCHED_DEQUEUE_BURST_SIZE 64
+
+#define SCHED_MIN_BURST_SIZE 8
+#define SCHED_NO_ENQ_CYCLE_FLUSH 256
+/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1*/
+#define SCHED_REFILL_ONCE_PER_CALL 1
+
 
 #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our history list */
 #define NUM_SAMPLES 64 /* how many data points use for average stats */
@@ -214,6 +220,9 @@  struct sw_evdev {
 	uint32_t xstats_count_mode_port;
 	uint32_t xstats_count_mode_queue;
 
+	uint16_t sched_flush_count;
+	uint16_t sched_min_burst;
+
 	/* Contains all ports - load balanced and directed */
 	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
 
diff --git a/drivers/event/sw/sw_evdev_scheduler.c b/drivers/event/sw/sw_evdev_scheduler.c
index cff747da8..ca6d1caff 100644
--- a/drivers/event/sw/sw_evdev_scheduler.c
+++ b/drivers/event/sw/sw_evdev_scheduler.c
@@ -26,6 +26,29 @@ 
 /* use cheap bit mixing, we only need to lose a few bits */
 #define SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
 
+
+/* single object enq and deq for non MT ring */
+static __rte_always_inline void
+sw_nonmt_ring_dequeue(struct rte_ring *r, void **obj)
+{
+	if ((r->prod.tail - r->cons.tail) < 1)
+		return;
+	void **ring = (void **)&r[1];
+	*obj = ring[r->cons.tail & r->mask];
+	r->cons.tail++;
+}
+static __rte_always_inline int
+sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj)
+{
+	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
+		return 0;
+	void **ring = (void **)&r[1];
+	ring[r->prod.tail & r->mask] = obj;
+	r->prod.tail++;
+	return 1;
+}
+
+
 static inline uint32_t
 sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
 		uint32_t iq_num, unsigned int count)
@@ -146,9 +169,9 @@  sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
 				cq_idx = 0;
 			cq = qid->cq_map[cq_idx++];
 
-		} while (rte_event_ring_free_count(
-				sw->ports[cq].cq_worker_ring) == 0 ||
-				sw->ports[cq].inflights == SW_PORT_HIST_LIST);
+		} while (sw->ports[cq].inflights == SW_PORT_HIST_LIST ||
+				rte_event_ring_free_count(
+					sw->ports[cq].cq_worker_ring) == 0);
 
 		struct sw_port *p = &sw->ports[cq];
 		if (sw->cq_ring_space[cq] == 0 ||
@@ -164,7 +187,7 @@  sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
 		p->hist_list[head].qid = qid_id;
 
 		if (keep_order)
-			rte_ring_sc_dequeue(qid->reorder_buffer_freelist,
+			sw_nonmt_ring_dequeue(qid->reorder_buffer_freelist,
 					(void *)&p->hist_list[head].rob_entry);
 
 		sw->ports[cq].cq_buf[sw->ports[cq].cq_buf_count++] = *qe;
@@ -229,7 +252,7 @@  sw_schedule_qid_to_cq(struct sw_evdev *sw)
 		uint32_t pkts_done = 0;
 		uint32_t count = iq_count(&qid->iq[iq_num]);
 
-		if (count > 0) {
+		if (count >= sw->sched_min_burst) {
 			if (type == SW_SCHED_TYPE_DIRECT)
 				pkts_done += sw_schedule_dir_to_cq(sw, qid,
 						iq_num, count);
@@ -267,7 +290,7 @@  sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int qid_end)
 
 	for (; qid_start < qid_end; qid_start++) {
 		struct sw_qid *qid = &sw->qids[qid_start];
-		int i, num_entries_in_use;
+		unsigned int i, num_entries_in_use;
 
 		if (qid->type != RTE_SCHED_TYPE_ORDERED)
 			continue;
@@ -275,6 +298,9 @@  sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int qid_end)
 		num_entries_in_use = rte_ring_free_count(
 					qid->reorder_buffer_freelist);
 
+		if (num_entries_in_use < sw->sched_min_burst)
+			num_entries_in_use = 0;
+
 		for (i = 0; i < num_entries_in_use; i++) {
 			struct reorder_buffer_entry *entry;
 			int j;
@@ -320,7 +346,7 @@  sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int qid_end)
 			if (!entry->ready) {
 				entry->fragment_index = 0;
 
-				rte_ring_sp_enqueue(
+				sw_nonmt_ring_enqueue(
 						qid->reorder_buffer_freelist,
 						entry);
 
@@ -349,9 +375,11 @@  __pull_port_lb(struct sw_evdev *sw, uint32_t port_id, int allow_reorder)
 	uint32_t pkts_iter = 0;
 	struct sw_port *port = &sw->ports[port_id];
 
+#if !SCHED_REFILL_ONCE_PER_CALL
 	/* If shadow ring has 0 pkts, pull from worker ring */
 	if (port->pp_buf_count == 0)
 		sw_refill_pp_buf(sw, port);
+#endif
 
 	while (port->pp_buf_count) {
 		const struct rte_event *qe = &port->pp_buf[port->pp_buf_start];
@@ -467,9 +495,11 @@  sw_schedule_pull_port_dir(struct sw_evdev *sw, uint32_t port_id)
 	uint32_t pkts_iter = 0;
 	struct sw_port *port = &sw->ports[port_id];
 
+#if !SCHED_REFILL_ONCE_PER_CALL
 	/* If shadow ring has 0 pkts, pull from worker ring */
 	if (port->pp_buf_count == 0)
 		sw_refill_pp_buf(sw, port);
+#endif
 
 	while (port->pp_buf_count) {
 		const struct rte_event *qe = &port->pp_buf[port->pp_buf_start];
@@ -557,12 +587,41 @@  sw_event_schedule(struct rte_eventdev *dev)
 	/* push all the internal buffered QEs in port->cq_ring to the
 	 * worker cores: aka, do the ring transfers batched.
 	 */
+	int no_enq = 1;
 	for (i = 0; i < sw->port_count; i++) {
-		struct rte_event_ring *worker = sw->ports[i].cq_worker_ring;
-		rte_event_ring_enqueue_burst(worker, sw->ports[i].cq_buf,
-				sw->ports[i].cq_buf_count,
-				&sw->cq_ring_space[i]);
-		sw->ports[i].cq_buf_count = 0;
+		struct sw_port *port = &sw->ports[i];
+		struct rte_event_ring *worker = port->cq_worker_ring;
+
+#if SCHED_REFILL_ONCE_PER_CALL
+		/* If shadow ring has 0 pkts, pull from worker ring */
+		if (port->pp_buf_count == 0)
+			sw_refill_pp_buf(sw, port);
+#endif
+
+		if (port->cq_buf_count >= sw->sched_min_burst) {
+			rte_event_ring_enqueue_burst(worker,
+					port->cq_buf,
+					port->cq_buf_count,
+					&sw->cq_ring_space[i]);
+			port->cq_buf_count = 0;
+			no_enq = 0;
+		} else {
+			sw->cq_ring_space[i] =
+					rte_event_ring_free_count(worker) -
+					port->cq_buf_count;
+		}
+	}
+
+	if (no_enq) {
+		if (unlikely(sw->sched_flush_count > SCHED_NO_ENQ_CYCLE_FLUSH))
+			sw->sched_min_burst = 1;
+		else
+			sw->sched_flush_count++;
+	} else {
+		if (sw->sched_flush_count)
+			sw->sched_flush_count--;
+		else
+			sw->sched_min_burst = SCHED_MIN_BURST_SIZE;
 	}
 
 }
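
To make the intent of the last hunk easier to follow, here is the same
min-burst/flush hysteresis restated as a stand-alone helper. This is an
illustrative sketch only, not part of the patch; the constants are the ones
defined in sw_evdev.h above.

#include <stdint.h>

#define SCHED_MIN_BURST_SIZE 8
#define SCHED_NO_ENQ_CYCLE_FLUSH 256

/* While traffic is flowing, the scheduler only moves batches of at least
 * SCHED_MIN_BURST_SIZE events. Once SCHED_NO_ENQ_CYCLE_FLUSH consecutive
 * scheduler calls have made no enqueue at all, the minimum burst drops to 1
 * so that any straggler events buffered in the pipeline are flushed.
 */
static void
update_min_burst(uint16_t *flush_count, uint16_t *min_burst, int no_enq)
{
	if (no_enq) {
		if (*flush_count > SCHED_NO_ENQ_CYCLE_FLUSH)
			*min_burst = 1;            /* idle: flush leftovers */
		else
			(*flush_count)++;          /* count idle iterations */
	} else {
		if (*flush_count)
			(*flush_count)--;          /* traffic resumed: back off */
		else
			*min_burst = SCHED_MIN_BURST_SIZE; /* restore batching */
	}
}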