[dpdk-dev,v7,04/17] lib: add new burst oriented distributor structs

Message ID 1487647073-129064-5-git-send-email-david.hunt@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Hunt, David Feb. 21, 2017, 3:17 a.m. UTC
  Signed-off-by: David Hunt <david.hunt@intel.com>
---
 lib/librte_distributor/rte_distributor_private.h | 61 ++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
  

Comments

Bruce Richardson Feb. 24, 2017, 2:08 p.m. UTC | #1
On Tue, Feb 21, 2017 at 03:17:40AM +0000, David Hunt wrote:
> Signed-off-by: David Hunt <david.hunt@intel.com>
> ---
>  lib/librte_distributor/rte_distributor_private.h | 61 ++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> diff --git a/lib/librte_distributor/rte_distributor_private.h b/lib/librte_distributor/rte_distributor_private.h
> index 2d85b9b..c8e0f98 100644
> --- a/lib/librte_distributor/rte_distributor_private.h
> +++ b/lib/librte_distributor/rte_distributor_private.h
> @@ -129,6 +129,67 @@ struct rte_distributor_v20 {
>  	struct rte_distributor_returned_pkts returns;
>  };
>  
> +/* All different signature compare functions */
> +enum rte_distributor_match_function {
> +	RTE_DIST_MATCH_SCALAR = 0,
> +	RTE_DIST_MATCH_VECTOR,
> +	RTE_DIST_NUM_MATCH_FNS
> +};
> +
> +/**
> + * Buffer structure used to pass the pointer data between cores. This is
> + * cache-line aligned, but padded out to two cache lines to improve
> + * performance and to prevent adjacent cache-line prefetches pulling in
> + * another worker's buffer (e.g. worker 1's buffer sitting on the cache
> + * line after worker 0's). Up to 8 mbufs can be passed at a time in one
> + * cache line; a separate cache line carries the returns in the burst API.
> + */
> +struct rte_distributor_buffer {
> +	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
> +			__rte_cache_aligned; /* <= outgoing to worker */
> +
> +	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */
> +
> +	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
> +			__rte_cache_aligned; /* <= incoming from worker */
> +
> +	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */
> +
> +	int count __rte_cache_aligned;       /* <= number of current mbufs */
> +};
> +
> +struct rte_distributor {
> +	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */
> +
> +	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the distributor. */
> +	unsigned int num_workers;             /**< Number of workers polling */
> +	unsigned int alg_type;                /**< Distribution algorithm type */
> +
> +	/**
> +	 * The first cache line in this array holds the tags in flight
> +	 * on the worker core; the second holds the backlog that will
> +	 * go to the worker core.
> +	 */
> +	uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2]
> +			__rte_cache_aligned;
> +
> +	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]
> +			__rte_cache_aligned;
> +
> +	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];
> +
> +	struct rte_distributor_returned_pkts returns;
> +
> +	enum rte_distributor_match_function dist_match_fn;
> +
> +	struct rte_distributor_v20 *d_v20;
> +};
> +
> +void
> +find_match_scalar(struct rte_distributor *d,
> +			uint16_t *data_ptr,
> +			uint16_t *output_ptr);
> +
>  #ifdef __cplusplus
>  }
>  #endif
The last patch claimed that this header file is for structs/definitions
common between the old and new implementations. These definitions look
to apply only to the new one, so do they belong in the .c file instead?

/Bruce
  
Bruce Richardson Feb. 24, 2017, 2:09 p.m. UTC | #2
On Tue, Feb 21, 2017 at 03:17:40AM +0000, David Hunt wrote:
> <snip>
> +struct rte_distributor_buffer {
> +	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
> +			__rte_cache_aligned; /* <= outgoing to worker */
> +
> +	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */
> +
> +	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
> +			__rte_cache_aligned; /* <= incoming from worker */
> +
> +	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */
> +
> +	int count __rte_cache_aligned;       /* <= number of current mbufs */
> +};

Rather than adding padding elements here, would it be better and clearer
just to align the values to 128B (or more strictly CACHE_LINE_SZ * 2)?

/Bruce
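
For reference, the alternative layout Bruce describes would drop the explicit
pad members and instead align each field to two cache lines. A minimal sketch,
using DPDK's __rte_aligned() macro and RTE_CACHE_LINE_SIZE (this is not the
code in the patch; see David's measurement further down):

	struct rte_distributor_buffer {
		/* each member starts on its own two-cache-line boundary,
		 * so the explicit pad1/pad2 fields become unnecessary */
		volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
				__rte_aligned(2 * RTE_CACHE_LINE_SIZE); /* to worker */

		volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
				__rte_aligned(2 * RTE_CACHE_LINE_SIZE); /* from worker */

		int count __rte_aligned(2 * RTE_CACHE_LINE_SIZE); /* current mbufs */
	};

Per David's reply below, this variant measured 10-15% slower than the padded
layout, so the pads stay.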
  
Hunt, David March 1, 2017, 9:57 a.m. UTC | #3
On 24/2/2017 2:08 PM, Bruce Richardson wrote:
> On Tue, Feb 21, 2017 at 03:17:40AM +0000, David Hunt wrote:
>> <snip>
> The last patch claimed that this header file is for structs/definitions
> common between the old and new implementations. These definitions look
> to apply only to the new one, so do they belong in the .c file instead?

The _v20 structs are used as a fallback in the new struct, so it's
probably best to keep them in a common private file.
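
As a rough sketch of that fallback (the RTE_DIST_ALG_SINGLE flag and the
rte_distributor_process_v20() name here are assumptions based on the rest of
this series, not definitions from this patch):

	/* sketch: the burst API hands off to the legacy implementation
	 * when the distributor was created with the single-packet alg */
	int
	rte_distributor_process(struct rte_distributor *d,
			struct rte_mbuf **mbufs, unsigned int num_mbufs)
	{
		if (d->alg_type == RTE_DIST_ALG_SINGLE)
			/* old code path, kept alive via the d_v20 pointer */
			return rte_distributor_process_v20(d->d_v20,
					mbufs, num_mbufs);

		/* ... burst-oriented processing ... */
		return 0;
	}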
  
Hunt, David March 1, 2017, 9:58 a.m. UTC | #4
On 24/2/2017 2:09 PM, Bruce Richardson wrote:
> On Tue, Feb 21, 2017 at 03:17:40AM +0000, David Hunt wrote:
>> <snip>
>> +struct rte_distributor_buffer {
>> +	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
>> +			__rte_cache_aligned; /* <= outgoing to worker */
>> +
>> +	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */
>> +
>> +	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
>> +			__rte_cache_aligned; /* <= incoming from worker */
>> +
>> +	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */
>> +
>> +	int count __rte_cache_aligned;       /* <= number of current mbufs */
>> +};
> Rather than adding padding elements here, would it be better and clearer
> just to align the values to 128B (or more strictly CACHE_LINE_SZ * 2)?
>
> /Bruce

I tried various combinations of __rte_aligned(128) and taking out the 
pads, but performance regressed by 10-15%. For the moment, I suggest 
leaving it as is.

Dave.
  

Patch

diff --git a/lib/librte_distributor/rte_distributor_private.h b/lib/librte_distributor/rte_distributor_private.h
index 2d85b9b..c8e0f98 100644
--- a/lib/librte_distributor/rte_distributor_private.h
+++ b/lib/librte_distributor/rte_distributor_private.h
@@ -129,6 +129,67 @@ struct rte_distributor_v20 {
 	struct rte_distributor_returned_pkts returns;
 };
 
+/* All different signature compare functions */
+enum rte_distributor_match_function {
+	RTE_DIST_MATCH_SCALAR = 0,
+	RTE_DIST_MATCH_VECTOR,
+	RTE_DIST_NUM_MATCH_FNS
+};
+
+/**
+ * Buffer structure used to pass the pointer data between cores. This is
+ * cache-line aligned, but padded out to two cache lines to improve
+ * performance and to prevent adjacent cache-line prefetches pulling in
+ * another worker's buffer (e.g. worker 1's buffer sitting on the cache
+ * line after worker 0's). Up to 8 mbufs can be passed at a time in one
+ * cache line; a separate cache line carries the returns in the burst API.
+ */
+struct rte_distributor_buffer {
+	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
+			__rte_cache_aligned; /* <= outgoing to worker */
+
+	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */
+
+	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
+			__rte_cache_aligned; /* <= incoming from worker */
+
+	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */
+
+	int count __rte_cache_aligned;       /* <= number of current mbufs */
+};
+
+struct rte_distributor {
+	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */
+
+	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the distributor. */
+	unsigned int num_workers;             /**< Number of workers polling */
+	unsigned int alg_type;                /**< Distribution algorithm type */
+
+	/**
+	 * The first cache line in this array holds the tags in flight
+	 * on the worker core; the second holds the backlog that will
+	 * go to the worker core.
+	 */
+	uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2]
+			__rte_cache_aligned;
+
+	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]
+			__rte_cache_aligned;
+
+	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];
+
+	struct rte_distributor_returned_pkts returns;
+
+	enum rte_distributor_match_function dist_match_fn;
+
+	struct rte_distributor_v20 *d_v20;
+};
+
+void
+find_match_scalar(struct rte_distributor *d,
+			uint16_t *data_ptr,
+			uint16_t *output_ptr);
+
 #ifdef __cplusplus
 }
 #endif
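
For context on how these additions fit together: dist_match_fn records which
tag-matching routine a distributor instance uses, and find_match_scalar() is
the portable implementation behind RTE_DIST_MATCH_SCALAR. A sketch of the
intended selection and dispatch, assuming a vectorised counterpart
find_match_vec() and CPU-flag selection as in later patches of this series:

	/* sketch: pick the matching implementation once, at create time */
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2))
		d->dist_match_fn = RTE_DIST_MATCH_VECTOR;
	else
		d->dist_match_fn = RTE_DIST_MATCH_SCALAR;

	/* sketch: dispatch in the processing loop; flow_tags holds the
	 * incoming packet tags, matches receives the match results */
	uint16_t flow_tags[RTE_DIST_BURST_SIZE];
	uint16_t matches[RTE_DIST_BURST_SIZE];

	switch (d->dist_match_fn) {
	case RTE_DIST_MATCH_VECTOR:
		find_match_vec(d, &flow_tags[0], &matches[0]);
		break;
	default:
		find_match_scalar(d, &flow_tags[0], &matches[0]);
		break;
	}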