[v4,1/8] test/distributor: fix deadlock with frozen worker

Message ID 20200925224209.12173-2-l.wojciechow@partner.samsung.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series fix distributor synchronization issues |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Lukasz Wojciechowski Sept. 25, 2020, 10:42 p.m. UTC
  The sanity test with worker shutdown delegates all bufs
to be processed by a single lcore worker, then it freezes
one of the lcore workers and continues to send more bufs.

A problem occurred if the frozen lcore was the same as the one
that was processing the mbufs. The lcore processing the mbufs
might be different every time the test is launched.
This is caused by keeping the value of the wkr static variable
in the rte_distributor_process function between running test cases.

The test always froze the lcore with id 0. This patch avoids
a possible collision by freezing the lcore with id zero_idx. The lcore
that receives the data updates zero_idx, so it is not frozen
itself.

To reproduce the issue fixed by this patch, please run
distributor_autotest command in test app several times in a row.

Fixes: c3eabff124e6 ("distributor: add unit tests")
Cc: bruce.richardson@intel.com
Cc: stable@dpdk.org

Signed-off-by: Lukasz Wojciechowski <l.wojciechow@partner.samsung.com>
Tested-by: David Hunt <david.hunt@intel.com>
---
 app/test/test_distributor.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)
  

Comments

Honnappa Nagarahalli Sept. 27, 2020, 11:34 p.m. UTC | #1
Hi Lukasz,
	Few comments inline

<snip>

> 
> The sanity test with worker shutdown delegates all bufs to be processed by a
> single lcore worker, then it freezes one of the lcore workers and continues to
> send more bufs.
The designated core to freeze (core with id == 0 in the existing code) gets out of the first while loop and gets into the 2nd while loop in the function ' handle_work_for_shutdown_test'.
In between these 2 while loops, it informs the distributor that it will  not accept any more packets by calling ' rte_distributor_return_pkt' (at least this API is supposed to do that). But, the distributor hangs waiting for the frozen core to start accepting packets. I think this is a problem with the distributor and not the test case.

> 
> Problem occurred if freezed lcore is the same as the one that is processing
> the mbufs. The lcore processing mbufs might be different every time test is
> launched.
> This is caused by keeping the value of wkr static variable in
> rte_distributor_process function between running test cases.
> 
> Test freezed always lcore with 0 id. The patch changes avoids possible
> collision by freezing lcore with zero_idx. The lcore that receives the data
> updates the zero_idx, so it is not freezed itself.
> 
> To reproduce the issue fixed by this patch, please run distributor_autotest
> command in test app several times in a row.
> 
> Fixes: c3eabff124e6 ("distributor: add unit tests")
> Cc: bruce.richardson@intel.com
> Cc: stable@dpdk.org
> 
> Signed-off-by: Lukasz Wojciechowski <l.wojciechow@partner.samsung.com>
> Tested-by: David Hunt <david.hunt@intel.com>
> ---
>  app/test/test_distributor.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c index
> ba1f81cf8..35b25463a 100644
> --- a/app/test/test_distributor.c
> +++ b/app/test/test_distributor.c
> @@ -28,6 +28,7 @@ struct worker_params worker_params;
>  static volatile int quit;      /**< general quit variable for all threads */
>  static volatile int zero_quit; /**< var for when we just want thr0 to quit*/
> static volatile unsigned worker_idx;
> +static volatile unsigned zero_idx;
> 
>  struct worker_stats {
>  	volatile unsigned handled_packets;
> @@ -346,27 +347,43 @@ handle_work_for_shutdown_test(void *arg)
>  	unsigned int total = 0;
>  	unsigned int i;
>  	unsigned int returned = 0;
> +	unsigned int zero_id = 0;
>  	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
>  			__ATOMIC_RELAXED);
> 
>  	num = rte_distributor_get_pkt(d, id, buf, buf, num);
> 
> +	zero_id = __atomic_load_n(&zero_idx, __ATOMIC_ACQUIRE);
> +	if (id == zero_id && num > 0) {
> +		zero_id = (zero_id + 1) %  __atomic_load_n(&worker_idx,
> +			__ATOMIC_ACQUIRE);
> +		__atomic_store_n(&zero_idx, zero_id, __ATOMIC_RELEASE);
> +	}
> +
>  	/* wait for quit single globally, or for worker zero, wait
>  	 * for zero_quit */
> -	while (!quit && !(id == 0 && zero_quit)) {
> +	while (!quit && !(id == zero_id && zero_quit)) {
>  		worker_stats[id].handled_packets += num;
>  		count += num;
>  		for (i = 0; i < num; i++)
>  			rte_pktmbuf_free(buf[i]);
>  		num = rte_distributor_get_pkt(d,
>  				id, buf, buf, num);
> +
> +		zero_id = __atomic_load_n(&zero_idx,
> __ATOMIC_ACQUIRE);
> +		if (id == zero_id && num > 0) {
> +			zero_id = (zero_id + 1) %
> __atomic_load_n(&worker_idx,
> +				__ATOMIC_ACQUIRE);
> +			__atomic_store_n(&zero_idx, zero_id,
> __ATOMIC_RELEASE);
> +		}
> +
>  		total += num;
>  	}
>  	worker_stats[id].handled_packets += num;
>  	count += num;
>  	returned = rte_distributor_return_pkt(d, id, buf, num);
> 
> -	if (id == 0) {
> +	if (id == zero_id) {
>  		/* for worker zero, allow it to restart to pick up last packet
>  		 * when all workers are shutting down.
>  		 */
> @@ -586,6 +603,7 @@ quit_workers(struct worker_params *wp, struct
> rte_mempool *p)
>  	rte_eal_mp_wait_lcore();
>  	quit = 0;
>  	worker_idx = 0;
> +	zero_idx = 0;
>  }
> 
>  static int
> --
> 2.17.1
  
Lukasz Wojciechowski Sept. 30, 2020, 8:22 p.m. UTC | #2
Hi Honnappa,

Thank you very much for your review
Reply inline below

W dniu 28.09.2020 o 01:34, Honnappa Nagarahalli pisze:
> Hi Lukasz,
> 	Few comments inline
>
> <snip>
>
>> The sanity test with worker shutdown delegates all bufs to be processed by a
>> single lcore worker, then it freezes one of the lcore workers and continues to
>> send more bufs.
> The designated core to freeze (core with id == 0 in the existing code) gets out of the first while loop and gets into the 2nd while loop in the function ' handle_work_for_shutdown_test'.
> In between these 2 while loops, it informs the distributor that it will  not accept any more packets by calling ' rte_distributor_return_pkt' (at least this API is supposed to do that). But, the distributor hangs waiting for the frozen core to start accepting packets. I think this is a problem with the distributor and not the test case.
I agree.
I did some further investigation and you are correct. This is a 
distributor issue. The new burst model doesn't care at all if the worker 
has called rte_distributor_return_pkt(). If it doesn't find a worker 
with a matching tag, it will process packets to the worker without 
checking if it requested any packets.

The legacy single model used a different handshake value to indicate it 
does not want any more packets. The flag is reused for other purposes in 
burst model (marking valid return packets) and that's obviously wrong.

The tests however also need to be adjusted as they don't verify the 
request/return status of worker properly.

I hope I will be able to update the patches this or next week to fix it.

>
>> Problem occurred if freezed lcore is the same as the one that is processing
>> the mbufs. The lcore processing mbufs might be different every time test is
>> launched.
>> This is caused by keeping the value of wkr static variable in
>> rte_distributor_process function between running test cases.
>>
>> Test freezed always lcore with 0 id. The patch changes avoids possible
>> collision by freezing lcore with zero_idx. The lcore that receives the data
>> updates the zero_idx, so it is not freezed itself.
>>
>> To reproduce the issue fixed by this patch, please run distributor_autotest
>> command in test app several times in a row.
>>
>> Fixes: c3eabff124e6 ("distributor: add unit tests")
>> Cc: bruce.richardson@intel.com
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Lukasz Wojciechowski <l.wojciechow@partner.samsung.com>
>> Tested-by: David Hunt <david.hunt@intel.com>
>> ---
>>   app/test/test_distributor.c | 22 ++++++++++++++++++++--
>>   1 file changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c index
>> ba1f81cf8..35b25463a 100644
>> --- a/app/test/test_distributor.c
>> +++ b/app/test/test_distributor.c
>> @@ -28,6 +28,7 @@ struct worker_params worker_params;
>>   static volatile int quit;      /**< general quit variable for all threads */
>>   static volatile int zero_quit; /**< var for when we just want thr0 to quit*/
>> static volatile unsigned worker_idx;
>> +static volatile unsigned zero_idx;
>>
>>   struct worker_stats {
>>   	volatile unsigned handled_packets;
>> @@ -346,27 +347,43 @@ handle_work_for_shutdown_test(void *arg)
>>   	unsigned int total = 0;
>>   	unsigned int i;
>>   	unsigned int returned = 0;
>> +	unsigned int zero_id = 0;
>>   	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
>>   			__ATOMIC_RELAXED);
>>
>>   	num = rte_distributor_get_pkt(d, id, buf, buf, num);
>>
>> +	zero_id = __atomic_load_n(&zero_idx, __ATOMIC_ACQUIRE);
>> +	if (id == zero_id && num > 0) {
>> +		zero_id = (zero_id + 1) %  __atomic_load_n(&worker_idx,
>> +			__ATOMIC_ACQUIRE);
>> +		__atomic_store_n(&zero_idx, zero_id, __ATOMIC_RELEASE);
>> +	}
>> +
>>   	/* wait for quit single globally, or for worker zero, wait
>>   	 * for zero_quit */
>> -	while (!quit && !(id == 0 && zero_quit)) {
>> +	while (!quit && !(id == zero_id && zero_quit)) {
>>   		worker_stats[id].handled_packets += num;
>>   		count += num;
>>   		for (i = 0; i < num; i++)
>>   			rte_pktmbuf_free(buf[i]);
>>   		num = rte_distributor_get_pkt(d,
>>   				id, buf, buf, num);
>> +
>> +		zero_id = __atomic_load_n(&zero_idx,
>> __ATOMIC_ACQUIRE);
>> +		if (id == zero_id && num > 0) {
>> +			zero_id = (zero_id + 1) %
>> __atomic_load_n(&worker_idx,
>> +				__ATOMIC_ACQUIRE);
>> +			__atomic_store_n(&zero_idx, zero_id,
>> __ATOMIC_RELEASE);
>> +		}
>> +
>>   		total += num;
>>   	}
>>   	worker_stats[id].handled_packets += num;
>>   	count += num;
>>   	returned = rte_distributor_return_pkt(d, id, buf, num);
>>
>> -	if (id == 0) {
>> +	if (id == zero_id) {
>>   		/* for worker zero, allow it to restart to pick up last packet
>>   		 * when all workers are shutting down.
>>   		 */
>> @@ -586,6 +603,7 @@ quit_workers(struct worker_params *wp, struct
>> rte_mempool *p)
>>   	rte_eal_mp_wait_lcore();
>>   	quit = 0;
>>   	worker_idx = 0;
>> +	zero_idx = 0;
>>   }
>>
>>   static int
>> --
>> 2.17.1
  

Patch

diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c
index ba1f81cf8..35b25463a 100644
--- a/app/test/test_distributor.c
+++ b/app/test/test_distributor.c
@@ -28,6 +28,7 @@  struct worker_params worker_params;
 static volatile int quit;      /**< general quit variable for all threads */
 static volatile int zero_quit; /**< var for when we just want thr0 to quit*/
 static volatile unsigned worker_idx;
+static volatile unsigned zero_idx;
 
 struct worker_stats {
 	volatile unsigned handled_packets;
@@ -346,27 +347,43 @@  handle_work_for_shutdown_test(void *arg)
 	unsigned int total = 0;
 	unsigned int i;
 	unsigned int returned = 0;
+	unsigned int zero_id = 0;
 	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
 			__ATOMIC_RELAXED);
 
 	num = rte_distributor_get_pkt(d, id, buf, buf, num);
 
+	zero_id = __atomic_load_n(&zero_idx, __ATOMIC_ACQUIRE);
+	if (id == zero_id && num > 0) {
+		zero_id = (zero_id + 1) %  __atomic_load_n(&worker_idx,
+			__ATOMIC_ACQUIRE);
+		__atomic_store_n(&zero_idx, zero_id, __ATOMIC_RELEASE);
+	}
+
 	/* wait for quit single globally, or for worker zero, wait
 	 * for zero_quit */
-	while (!quit && !(id == 0 && zero_quit)) {
+	while (!quit && !(id == zero_id && zero_quit)) {
 		worker_stats[id].handled_packets += num;
 		count += num;
 		for (i = 0; i < num; i++)
 			rte_pktmbuf_free(buf[i]);
 		num = rte_distributor_get_pkt(d,
 				id, buf, buf, num);
+
+		zero_id = __atomic_load_n(&zero_idx, __ATOMIC_ACQUIRE);
+		if (id == zero_id && num > 0) {
+			zero_id = (zero_id + 1) %  __atomic_load_n(&worker_idx,
+				__ATOMIC_ACQUIRE);
+			__atomic_store_n(&zero_idx, zero_id, __ATOMIC_RELEASE);
+		}
+
 		total += num;
 	}
 	worker_stats[id].handled_packets += num;
 	count += num;
 	returned = rte_distributor_return_pkt(d, id, buf, num);
 
-	if (id == 0) {
+	if (id == zero_id) {
 		/* for worker zero, allow it to restart to pick up last packet
 		 * when all workers are shutting down.
 		 */
@@ -586,6 +603,7 @@  quit_workers(struct worker_params *wp, struct rte_mempool *p)
 	rte_eal_mp_wait_lcore();
 	quit = 0;
 	worker_idx = 0;
+	zero_idx = 0;
 }
 
 static int