net/iavf: fix slow memory allocation

Message ID 20221117065726.277672-1-kaisenx.you@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Qi Zhang
Series: net/iavf: fix slow memory allocation

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/github-robot: build success github build: passed
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Kaisen You Nov. 17, 2022, 6:57 a.m. UTC
  In some cases, DPDK does not allocate hugepage heap memory for
some sockets because of the user-supplied core list
(e.g. -l 40-79, SOCKET 0 has no memory).
When the interrupt thread runs on a core of such a socket,
every allocation/release executes a whole set of heap
allocation/release operations, resulting in poor performance.
Instead we call malloc() to get memory from the system's
heap space to fix this problem.

Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
Cc: stable@dpdk.org

Signed-off-by: Kaisen You <kaisenx.you@intel.com>
---
 drivers/net/iavf/iavf_vchnl.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
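
The gist of the change, as a minimal standalone sketch (struct and function
names are shortened from the driver's iavf_event_element code and are
illustrative, not the actual driver source): the event element is short-lived
and only touched by control-path threads, so it can come from the libc heap
instead of the DPDK hugepage heap.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct event_element {
	void *param;
	uint8_t param_data[];	/* copy of the callback parameter */
};

static struct event_element *
event_element_new(void *param, size_t param_size)
{
	/* libc malloc(): no per-socket DPDK heap work on every call */
	struct event_element *elem = malloc(sizeof(*elem) + param_size);

	if (elem == NULL)
		return NULL;
	elem->param = param;
	if (param != NULL && param_size != 0) {
		memcpy(elem->param_data, param, param_size);
		elem->param = elem->param_data;
	}
	return elem;
}

static void
event_element_free(struct event_element *elem)
{
	free(elem);	/* pairs with malloc(), replacing rte_free() */
}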
  

Comments

Jiale, SongX Nov. 18, 2022, 8:22 a.m. UTC | #1
> -----Original Message-----
> From: Kaisen You <kaisenx.you@intel.com>
> Sent: Thursday, November 17, 2022 2:57 PM
> To: dev@dpdk.org
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; You, KaisenX <kaisenx.you@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>
> Subject: [PATCH] net/iavf:fix slow memory allocation
> 
> In some cases, the DPDK does not allocate hugepage heap memory to some
> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0 has no
> memory).
> When the interrupt thread runs on the corresponding core of this socket,
> each allocation/release will execute a whole set of heap allocation/release
> operations,resulting in poor performance.
> Instead we call malloc() to get memory from the system's heap space to fix
> this problem.
> 
> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> ---
Tested-by: Song Jiale <songx.jiale@intel.com>
  
Kaisen You Dec. 7, 2022, 9:07 a.m. UTC | #2
Could you please help review? Thanks.

> -----Original Message-----
> From: You, KaisenX <kaisenx.you@intel.com>
> Sent: 2022年11月17日 14:57
> To: dev@dpdk.org
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; You, KaisenX <kaisenx.you@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>
> Subject: [PATCH] net/iavf:fix slow memory allocation
> 
> In some cases, the DPDK does not allocate hugepage heap memory to some
> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0 has no
> memory).
> When the interrupt thread runs on the corresponding core of this socket,
> each allocation/release will execute a whole set of heap allocation/release
> operations,resulting in poor performance.
> Instead we call malloc() to get memory from the system's heap space to fix
> this problem.
> 
> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> ---
>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c index
> f92daf97f2..a05791fe48 100644
> --- a/drivers/net/iavf/iavf_vchnl.c
> +++ b/drivers/net/iavf/iavf_vchnl.c
> @@ -36,7 +36,6 @@ struct iavf_event_element {
>  	struct rte_eth_dev *dev;
>  	enum rte_eth_event_type event;
>  	void *param;
> -	size_t param_alloc_size;
>  	uint8_t param_alloc_data[0];
>  };
> 
> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>  			TAILQ_REMOVE(&pending, pos, next);
>  			rte_eth_dev_callback_process(pos->dev, pos-
> >event, pos->param);
> -			rte_free(pos);
> +			free(pos);
>  		}
>  	}
> 
> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,  {
>  	struct iavf_event_handler *handler = &event_handler;
>  	char notify_byte;
> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem)
> + param_alloc_size, 0);
> +	struct iavf_event_element *elem = malloc(sizeof(*elem) +
> +param_alloc_size);
>  	if (!elem)
>  		return;
> 
>  	elem->dev = dev;
>  	elem->event = event;
>  	elem->param = param;
> -	elem->param_alloc_size = param_alloc_size;
>  	if (param && param_alloc_size) {
>  		rte_memcpy(elem->param_alloc_data, param,
> param_alloc_size);
>  		elem->param = elem->param_alloc_data; @@ -165,7 +163,7
> @@ iavf_dev_event_handler_fini(void)
>  	struct iavf_event_element *pos, *save_next;
>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>  		TAILQ_REMOVE(&handler->pending, pos, next);
> -		rte_free(pos);
> +		free(pos);
>  	}
>  }
> 
> --
> 2.34.1
  
Jingjing Wu Dec. 8, 2022, 8:46 a.m. UTC | #3
> -----Original Message-----
> From: You, KaisenX <kaisenx.you@intel.com>
> Sent: Thursday, November 17, 2022 2:57 PM
> To: dev@dpdk.org
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; You, KaisenX <kaisenx.you@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH] net/iavf:fix slow memory allocation
> 
> In some cases, the DPDK does not allocate hugepage heap memory to
> some sockets due to the user setting parameters
> (e.g. -l 40-79, SOCKET 0 has no memory).
> When the interrupt thread runs on the corresponding core of this
> socket, each allocation/release will execute a whole set of heap
> allocation/release operations,resulting in poor performance.
> Instead we call malloc() to get memory from the system's
> heap space to fix this problem.
> 
> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
Acked-by: Jingjing Wu <jingjing.wu@intel.com>
  
Ferruh Yigit Dec. 8, 2022, 3:04 p.m. UTC | #4
On 11/17/2022 6:57 AM, Kaisen You wrote:
> In some cases, the DPDK does not allocate hugepage heap memory to
> some sockets due to the user setting parameters
> (e.g. -l 40-79, SOCKET 0 has no memory).
> When the interrupt thread runs on the corresponding core of this
> socket, each allocation/release will execute a whole set of heap
> allocation/release operations,resulting in poor performance.
> Instead we call malloc() to get memory from the system's
> heap space to fix this problem.
> 

Hi Kaisen,

Using libc malloc can improve performance for this case, but I would
like to understand root cause of the problem.


As far as I can see, interrupt callbacks are run by the interrupt thread
("eal-intr-thread"),
and the interrupt thread is created by the 'rte_ctrl_thread_create()' API.

'rte_ctrl_thread_create()' comment mentions that "CPU affinity retrieved
at the time 'rte_eal_init()' was called,"

And 'rte_eal_init()' is run on main lcore, which is the first lcore in
the core list (unless otherwise defined with --main-lcore).

So, the interrupts should be running on a core that has hugepages
allocated for it, am I missing something here?




And what about using the 'rte_malloc_socket()' API (instead of rte_malloc),
which takes 'socket' as a parameter, and providing the socket that the device
is on as the parameter to this API? Is it possible to test this?
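
A minimal sketch of this suggestion (the helper name and wrapper are
assumptions for illustration; the two DPDK calls are existing APIs):

#include <rte_ethdev.h>
#include <rte_malloc.h>

/* Allocate from the heap of the socket the device is attached to,
 * rather than from the caller's socket. */
static void *
alloc_on_device_socket(struct rte_eth_dev *dev, size_t size)
{
	int socket = rte_eth_dev_socket_id(dev->data->port_id);

	/* rte_eth_dev_socket_id() returns -1 when the socket cannot be
	 * determined; rte_malloc_socket() then treats it as SOCKET_ID_ANY. */
	return rte_malloc_socket(NULL, size, 0, socket);
}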



> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> ---
>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
> index f92daf97f2..a05791fe48 100644
> --- a/drivers/net/iavf/iavf_vchnl.c
> +++ b/drivers/net/iavf/iavf_vchnl.c
> @@ -36,7 +36,6 @@ struct iavf_event_element {
>  	struct rte_eth_dev *dev;
>  	enum rte_eth_event_type event;
>  	void *param;
> -	size_t param_alloc_size;
>  	uint8_t param_alloc_data[0];
>  };
>  
> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>  			TAILQ_REMOVE(&pending, pos, next);
>  			rte_eth_dev_callback_process(pos->dev, pos->event, pos->param);
> -			rte_free(pos);
> +			free(pos);
>  		}
>  	}
>  
> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,
>  {
>  	struct iavf_event_handler *handler = &event_handler;
>  	char notify_byte;
> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem) + param_alloc_size, 0);
> +	struct iavf_event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
>  	if (!elem)
>  		return;
>  
>  	elem->dev = dev;
>  	elem->event = event;
>  	elem->param = param;
> -	elem->param_alloc_size = param_alloc_size;
>  	if (param && param_alloc_size) {
>  		rte_memcpy(elem->param_alloc_data, param, param_alloc_size);
>  		elem->param = elem->param_alloc_data;
> @@ -165,7 +163,7 @@ iavf_dev_event_handler_fini(void)
>  	struct iavf_event_element *pos, *save_next;
>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>  		TAILQ_REMOVE(&handler->pending, pos, next);
> -		rte_free(pos);
> +		free(pos);
>  	}
>  }
>
  
Kaisen You Dec. 13, 2022, 7:52 a.m. UTC | #5
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: 2022年12月8日 23:04
> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> Anatoly <anatoly.burakov@intel.com>; David Marchand
> <david.marchand@redhat.com>
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> Boccassi <bluca@debian.org>; Mcnamara, John
> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On 11/17/2022 6:57 AM, Kaisen You wrote:
> > In some cases, the DPDK does not allocate hugepage heap memory to
> some
> > sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0
> > has no memory).
> > When the interrupt thread runs on the corresponding core of this
> > socket, each allocation/release will execute a whole set of heap
> > allocation/release operations,resulting in poor performance.
> > Instead we call malloc() to get memory from the system's heap space to
> > fix this problem.
> >
> 
> Hi Kaisen,
> 
> Using libc malloc can improve performance for this case, but I would like to
> understand root cause of the problem.
> 
> 
> As far as I can see, interrupt callbacks are run by interrupt thread ("eal-intr-
> thread"), and interrupt thread created by 'rte_ctrl_thread_create()' API.
> 
> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity retrieved at
> the time 'rte_eal_init()' was called,"
> 
> And 'rte_eal_init()' is run on main lcore, which is the first lcore in the core list
> (unless otherwise defined with --main-lcore).
> 
> So, the interrupts should be running on a core that has hugepages allocated
> for it, am I missing something here?
> 
> 
Thanks for your comments. Let me try to explain the root cause here:
no memory pool is created for the socket of the CPU that eal_intr_thread
runs on. That results in frequent memory creation/destruction afterwards.

When testpmd starts, the parameter (e.g. -l 40-79) is set. Different OSes
have different topologies. Some OSes like SUSE only create a memory pool for
one CPU socket, while other systems create them for two. That is why the
problem occurs when using memory on different OSes.
> 
> 
> And what about using 'rte_malloc_socket()' API (instead of rte_malloc),
> which gets 'socket' as parameter, and provide the socket that devices is on as
> parameter to this API? Is it possible to test this?
> 
> 
As to the reason for not using rte_malloc_socket(): I thought rte_malloc_socket()
could solve the problem too, and the appropriate parameter would be the
socket_id of the memory pool created at DPDK initialization. Assuming that
the socket_id of the initially allocated memory is 1, first let eal_intr_thread
determine whether it is on that socket_id, then record this socket_id in
eal_intr_thread and pass it to iavf_event_thread. But there seems to be no way
to pass this parameter to the iavf_dev_event_post() function. That is why
rte_malloc_socket() is not used.

Let me know if there is anything else unclear.
> 
> > Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> > ---
> >  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
> >  1 file changed, 3 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/iavf/iavf_vchnl.c
> > b/drivers/net/iavf/iavf_vchnl.c index f92daf97f2..a05791fe48 100644
> > --- a/drivers/net/iavf/iavf_vchnl.c
> > +++ b/drivers/net/iavf/iavf_vchnl.c
> > @@ -36,7 +36,6 @@ struct iavf_event_element {
> >  	struct rte_eth_dev *dev;
> >  	enum rte_eth_event_type event;
> >  	void *param;
> > -	size_t param_alloc_size;
> >  	uint8_t param_alloc_data[0];
> >  };
> >
> > @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
> >  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
> >  			TAILQ_REMOVE(&pending, pos, next);
> >  			rte_eth_dev_callback_process(pos->dev, pos-
> >event, pos->param);
> > -			rte_free(pos);
> > +			free(pos);
> >  		}
> >  	}
> >
> > @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,  {
> >  	struct iavf_event_handler *handler = &event_handler;
> >  	char notify_byte;
> > -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem)
> + param_alloc_size, 0);
> > +	struct iavf_event_element *elem = malloc(sizeof(*elem) +
> > +param_alloc_size);
> >  	if (!elem)
> >  		return;
> >
> >  	elem->dev = dev;
> >  	elem->event = event;
> >  	elem->param = param;
> > -	elem->param_alloc_size = param_alloc_size;
> >  	if (param && param_alloc_size) {
> >  		rte_memcpy(elem->param_alloc_data, param,
> param_alloc_size);
> >  		elem->param = elem->param_alloc_data; @@ -165,7 +163,7
> @@
> > iavf_dev_event_handler_fini(void)
> >  	struct iavf_event_element *pos, *save_next;
> >  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
> >  		TAILQ_REMOVE(&handler->pending, pos, next);
> > -		rte_free(pos);
> > +		free(pos);
> >  	}
> >  }
> >
  
Ferruh Yigit Dec. 13, 2022, 9:35 a.m. UTC | #6
On 12/13/2022 7:52 AM, You, KaisenX wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>> Sent: 2022年12月8日 23:04
>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
>> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
>> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
>> Boccassi <bluca@debian.org>; Mcnamara, John
>> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>
>> On 11/17/2022 6:57 AM, Kaisen You wrote:
>>> In some cases, the DPDK does not allocate hugepage heap memory to
>> some
>>> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0
>>> has no memory).
>>> When the interrupt thread runs on the corresponding core of this
>>> socket, each allocation/release will execute a whole set of heap
>>> allocation/release operations,resulting in poor performance.
>>> Instead we call malloc() to get memory from the system's heap space to
>>> fix this problem.
>>>
>>
>> Hi Kaisen,
>>
>> Using libc malloc can improve performance for this case, but I would like to
>> understand root cause of the problem.
>>
>>
>> As far as I can see, interrupt callbacks are run by interrupt thread ("eal-intr-
>> thread"), and interrupt thread created by 'rte_ctrl_thread_create()' API.
>>
>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity retrieved at
>> the time 'rte_eal_init()' was called,"
>>
>> And 'rte_eal_init()' is run on main lcore, which is the first lcore in the core list
>> (unless otherwise defined with --main-lcore).
>>
>> So, the interrupts should be running on a core that has hugepages allocated
>> for it, am I missing something here?
>>
>>
> Thank for your comments.  Let me try to explain the root cause here:  
> eal_intr_thread the CPU in the corresponding slot does not create memory pool. 
> That results in frequent memory subsequently creating/destructing.
> 
> When testpmd started, the parameter (e.g. -l 40-79) is set.  Different OS 
> has different topology. Some OS like SUSE only creates memory pool for 
> one CPU slot, while other system creates for two. That is why the problem 
> occurs when using memories in different OS.


It is the testpmd application that decides which socket to allocate
memory from, right? This is nothing specific to the OS.

As far as I remember, testpmd logic is to allocate from the socket whose
cores are used (provided with the -l parameter), and to allocate from the
socket that the device is attached to.

So, in a dual socket system, if all used cores are in socket 1 and the
NIC is in socket 1, no memory is allocated for socket 0. This is to
optimize memory consumption.


Can you please confirm that the problem you are observing is because
interrupt handler is running on a CPU, which doesn't have memory
allocated for its socket?

In this case what I don't understand is why the interrupt is not running on
the main lcore, which should be the first core in the list; for the "-l 40-79"
sample it should be lcore 40.
For your case, does the interrupt handler run on core 0, or on an arbitrary
core? If so, can you please confirm whether providing the core list as
"-l 0,40-79" fixes the issue?


>>
>>
>> And what about using 'rte_malloc_socket()' API (instead of rte_malloc),
>> which gets 'socket' as parameter, and provide the socket that devices is on as
>> parameter to this API? Is it possible to test this?
>>
>>
> As to the reason for not using rte_malloc_socket. I thought rte_malloc_socket() 
> could solve the problem too. And the appropriate parameter should be the 
>> socket_id that created the memory pool for DPDK initialization. Assuming that
>> the socket_id of the initially allocated memory = 1, first let the eal_intr_thread
> determine if it is on the socket_id, then record this socket_id in the eal_intr_thread 
> and pass it to the iavf_event_thread.  But there seems no way to link this parameter 
> to the iavf_dev_event_post() function. That is why rte_malloc_socket is not used. 
> 

I was thinking the socket id of the device could be used, but that won't help
if the core that the interrupt handler runs on is in a different socket.
And I also don't know if there is a way to get the socket that the interrupt
thread is on; @David may help perhaps.

So the question is why the interrupt thread is not running on the main lcore.
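
For reference, one way to query the NUMA node the calling thread currently
runs on is via glibc/libnuma rather than DPDK (a sketch; note that
rte_socket_id() in a control thread may report SOCKET_ID_ANY instead of a
physical socket, and that the result below can change unless the thread is
pinned):

#define _GNU_SOURCE
#include <sched.h>
#include <numa.h>	/* link with -lnuma */

static int
current_thread_numa_node(void)
{
	int cpu = sched_getcpu();	/* CPU this thread runs on right now */

	return cpu < 0 ? -1 : numa_node_of_cpu(cpu);
}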

> Let me know if there is anything else unclear.
>>
>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
>>> Cc: stable@dpdk.org
>>>
>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
>>> ---
>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>>>  1 file changed, 3 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
>>> b/drivers/net/iavf/iavf_vchnl.c index f92daf97f2..a05791fe48 100644
>>> --- a/drivers/net/iavf/iavf_vchnl.c
>>> +++ b/drivers/net/iavf/iavf_vchnl.c
>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
>>>  	struct rte_eth_dev *dev;
>>>  	enum rte_eth_event_type event;
>>>  	void *param;
>>> -	size_t param_alloc_size;
>>>  	uint8_t param_alloc_data[0];
>>>  };
>>>
>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>>>  			TAILQ_REMOVE(&pending, pos, next);
>>>  			rte_eth_dev_callback_process(pos->dev, pos-
>>> event, pos->param);
>>> -			rte_free(pos);
>>> +			free(pos);
>>>  		}
>>>  	}
>>>
>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,  {
>>>  	struct iavf_event_handler *handler = &event_handler;
>>>  	char notify_byte;
>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem)
>> + param_alloc_size, 0);
>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) +
>>> +param_alloc_size);
>>>  	if (!elem)
>>>  		return;
>>>
>>>  	elem->dev = dev;
>>>  	elem->event = event;
>>>  	elem->param = param;
>>> -	elem->param_alloc_size = param_alloc_size;
>>>  	if (param && param_alloc_size) {
>>>  		rte_memcpy(elem->param_alloc_data, param,
>> param_alloc_size);
>>>  		elem->param = elem->param_alloc_data; @@ -165,7 +163,7
>> @@
>>> iavf_dev_event_handler_fini(void)
>>>  	struct iavf_event_element *pos, *save_next;
>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
>>> -		rte_free(pos);
>>> +		free(pos);
>>>  	}
>>>  }
>>>
>
  
Ferruh Yigit Dec. 13, 2022, 1:27 p.m. UTC | #7
On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
> On 12/13/2022 7:52 AM, You, KaisenX wrote:
>>
>>
>>> -----Original Message-----
>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>> Sent: 2022年12月8日 23:04
>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>>> <david.marchand@redhat.com>
>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
>>> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
>>> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
>>> Boccassi <bluca@debian.org>; Mcnamara, John
>>> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>>
>>> On 11/17/2022 6:57 AM, Kaisen You wrote:
>>>> In some cases, the DPDK does not allocate hugepage heap memory to
>>> some
>>>> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0
>>>> has no memory).
>>>> When the interrupt thread runs on the corresponding core of this
>>>> socket, each allocation/release will execute a whole set of heap
>>>> allocation/release operations,resulting in poor performance.
>>>> Instead we call malloc() to get memory from the system's heap space to
>>>> fix this problem.
>>>>
>>>
>>> Hi Kaisen,
>>>
>>> Using libc malloc can improve performance for this case, but I would like to
>>> understand root cause of the problem.
>>>
>>>
>>> As far as I can see, interrupt callbacks are run by interrupt thread ("eal-intr-
>>> thread"), and interrupt thread created by 'rte_ctrl_thread_create()' API.
>>>
>>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity retrieved at
>>> the time 'rte_eal_init()' was called,"
>>>
>>> And 'rte_eal_init()' is run on main lcore, which is the first lcore in the core list
>>> (unless otherwise defined with --main-lcore).
>>>
>>> So, the interrupts should be running on a core that has hugepages allocated
>>> for it, am I missing something here?
>>>
>>>
>> Thank for your comments.  Let me try to explain the root cause here:  
>> eal_intr_thread the CPU in the corresponding slot does not create memory pool. 
>> That results in frequent memory subsequently creating/destructing.
>>
>> When testpmd started, the parameter (e.g. -l 40-79) is set.  Different OS 
>> has different topology. Some OS like SUSE only creates memory pool for 
>> one CPU slot, while other system creates for two. That is why the problem 
>> occurs when using memories in different OS.
> 
> 
> It is testpmd application that decides from which socket to allocate
> memory from, right. This is nothing specific to OS.
> 
> As far as I remember, testpmd logic is too allocate from socket that its
> cores are used (provided with -l parameter), and allocate from socket
> that device is attached to.
> 
> So, in a dual socket system, if all used cores are in socket 1 and the
> NIC is in socket 1, no memory is allocated for socket 0. This is to
> optimize memory consumption.
> 
> 
> Can you please confirm that the problem you are observing is because
> interrupt handler is running on a CPU, which doesn't have memory
> allocated for its socket?
> 
> In this case what I don't understand is why interrupts is not running on
> main lcore, which should be first core in the list, for "-l 40-79"
> sample it should be lcore 40.
> For your case, is interrupt handler run on core 0? Or any arbitrary core?
> If so, can you please confirm when you provide core list as "-l 0,40-79"
> fixes the issue?
> 
> 
>>>
>>>
>>> And what about using 'rte_malloc_socket()' API (instead of rte_malloc),
>>> which gets 'socket' as parameter, and provide the socket that devices is on as
>>> parameter to this API? Is it possible to test this?
>>>
>>>
>> As to the reason for not using rte_malloc_socket. I thought rte_malloc_socket() 
>> could solve the problem too. And the appropriate parameter should be the 
>> socket_id that created the memory pool for DPDK initialization. Assuming that
>> the socket_id of the initially allocated memory = 1, first let the eal_intr_thread
>> determine if it is on the socket_id, then record this socket_id in the eal_intr_thread 
>> and pass it to the iavf_event_thread.  But there seems no way to link this parameter 
>> to the iavf_dev_event_post() function. That is why rte_malloc_socket is not used. 
>>
> 
> I was thinking socket id of device can be used, but that won't help if
> the core that interrupt handler runs is in different socket.
> And I also don't know if there is a way to get socket that interrupt
> thread is on. @David may help perhaps.
> 
> So question is why interrupt thread is not running on main lcore.
> 

OK, after some talk with David, what I was missing is that
'rte_ctrl_thread_create()' does NOT run on the main lcore; it can run on any
core except the data plane cores.

The driver's "iavf-event-thread" (iavf_dev_event_handle()) and the
interrupt thread (so the driver interrupt callback iavf_dev_event_post())
can run on any core, making it hard to manage.
And it seems it is not possible to control where the interrupt thread runs.

One option can be allocating hugepages for all sockets, but this
requires user involvement, and can't happen transparently.

The other option can be to control where "iavf-event-thread" runs,
like using 'rte_thread_create()' to create the thread and provide an attribute
to run it on the main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?

Can you please test the above option?
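
A sketch of this second option (the thread function body and names are
placeholders; the rte_thread_* calls are public DPDK APIs of this era, some
still marked experimental):

#include <rte_lcore.h>
#include <rte_thread.h>

static uint32_t
event_thread_fn(void *arg)
{
	(void)arg;
	/* ... drain the pending event list, as iavf_dev_event_handle()
	 * does ... */
	return 0;
}

static int
create_event_thread_on_main_lcore(rte_thread_t *tid)
{
	rte_thread_attr_t attr;
	/* The main lcore is guaranteed to sit on a socket that has
	 * hugepage memory allocated. */
	rte_cpuset_t cpuset = rte_lcore_cpuset(rte_get_main_lcore());

	rte_thread_attr_init(&attr);
	rte_thread_attr_set_affinity(&attr, &cpuset);
	return rte_thread_create(tid, &attr, event_thread_fn, NULL);
}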



>> Let me know if there is anything else unclear.
>>>
>>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
>>>> Cc: stable@dpdk.org
>>>>
>>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
>>>> ---
>>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>>>>  1 file changed, 3 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
>>>> b/drivers/net/iavf/iavf_vchnl.c index f92daf97f2..a05791fe48 100644
>>>> --- a/drivers/net/iavf/iavf_vchnl.c
>>>> +++ b/drivers/net/iavf/iavf_vchnl.c
>>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
>>>>  	struct rte_eth_dev *dev;
>>>>  	enum rte_eth_event_type event;
>>>>  	void *param;
>>>> -	size_t param_alloc_size;
>>>>  	uint8_t param_alloc_data[0];
>>>>  };
>>>>
>>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>>>>  			TAILQ_REMOVE(&pending, pos, next);
>>>>  			rte_eth_dev_callback_process(pos->dev, pos-
>>>> event, pos->param);
>>>> -			rte_free(pos);
>>>> +			free(pos);
>>>>  		}
>>>>  	}
>>>>
>>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,  {
>>>>  	struct iavf_event_handler *handler = &event_handler;
>>>>  	char notify_byte;
>>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem)
>>> + param_alloc_size, 0);
>>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) +
>>>> +param_alloc_size);
>>>>  	if (!elem)
>>>>  		return;
>>>>
>>>>  	elem->dev = dev;
>>>>  	elem->event = event;
>>>>  	elem->param = param;
>>>> -	elem->param_alloc_size = param_alloc_size;
>>>>  	if (param && param_alloc_size) {
>>>>  		rte_memcpy(elem->param_alloc_data, param,
>>> param_alloc_size);
>>>>  		elem->param = elem->param_alloc_data; @@ -165,7 +163,7
>>> @@
>>>> iavf_dev_event_handler_fini(void)
>>>>  	struct iavf_event_element *pos, *save_next;
>>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
>>>> -		rte_free(pos);
>>>> +		free(pos);
>>>>  	}
>>>>  }
>>>>
>>
>
  
Kaisen You Dec. 20, 2022, 6:52 a.m. UTC | #8
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: 2022年12月13日 21:28
> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> Anatoly <anatoly.burakov@intel.com>; David Marchand
> <david.marchand@redhat.com>
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> Boccassi <bluca@debian.org>; Mcnamara, John
> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
> > On 12/13/2022 7:52 AM, You, KaisenX wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >>> Sent: 2022年12月8日 23:04
> >>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >>> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >>> <david.marchand@redhat.com>
> >>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
> >>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
> >>> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>;
> Kevin
> >>> Traynor <ktraynor@redhat.com>
> >>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>>
> >>> On 11/17/2022 6:57 AM, Kaisen You wrote:
> >>>> In some cases, the DPDK does not allocate hugepage heap memory to
> >>> some
> >>>> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0
> >>>> has no memory).
> >>>> When the interrupt thread runs on the corresponding core of this
> >>>> socket, each allocation/release will execute a whole set of heap
> >>>> allocation/release operations,resulting in poor performance.
> >>>> Instead we call malloc() to get memory from the system's heap space
> >>>> to fix this problem.
> >>>>
> >>>
> >>> Hi Kaisen,
> >>>
> >>> Using libc malloc can improve performance for this case, but I would
> >>> like to understand root cause of the problem.
> >>>
> >>>
> >>> As far as I can see, interrupt callbacks are run by interrupt thread
> >>> ("eal-intr- thread"), and interrupt thread created by
> 'rte_ctrl_thread_create()' API.
> >>>
> >>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity
> >>> retrieved at the time 'rte_eal_init()' was called,"
> >>>
> >>> And 'rte_eal_init()' is run on main lcore, which is the first lcore
> >>> in the core list (unless otherwise defined with --main-lcore).
> >>>
> >>> So, the interrupts should be running on a core that has hugepages
> >>> allocated for it, am I missing something here?
> >>>
> >>>
> >> Thank for your comments.  Let me try to explain the root cause here:
> >> eal_intr_thread the CPU in the corresponding slot does not create
> memory pool.
> >> That results in frequent memory subsequently creating/destructing.
> >>
> >> When testpmd started, the parameter (e.g. -l 40-79) is set.
> >> Different OS has different topology. Some OS like SUSE only creates
> >> memory pool for one CPU slot, while other system creates for two.
> >> That is why the problem occurs when using memories in different OS.
> >
> >
> > It is testpmd application that decides from which socket to allocate
> > memory from, right. This is nothing specific to OS.
> >
> > As far as I remember, testpmd logic is too allocate from socket that
> > its cores are used (provided with -l parameter), and allocate from
> > socket that device is attached to.
> >
> > So, in a dual socket system, if all used cores are in socket 1 and the
> > NIC is in socket 1, no memory is allocated for socket 0. This is to
> > optimize memory consumption.
> >
> >
> > Can you please confirm that the problem you are observing is because
> > interrupt handler is running on a CPU, which doesn't have memory
> > allocated for its socket?
> >
> > In this case what I don't understand is why interrupts is not running
> > on main lcore, which should be first core in the list, for "-l 40-79"
> > sample it should be lcore 40.
> > For your case, is interrupt handler run on core 0? Or any arbitrary core?
> > If so, can you please confirm when you provide core list as "-l 0,40-79"
> > fixes the issue?
> >
First of all, sorry to reply to you so late.
I can confirm that the problem I observed is because the interrupt handler is
running on a CPU which doesn't have memory allocated for its socket.

In my case, the interrupt handler is running on core 0.
When I provide "-l 0,40-79" as a startup parameter, this issue is resolved.

I correct my previous statement that this problem only occurs on
the SUSE system: on any OS, this problem occurs as long as the range of
startup parameters is only on node1.

> >
> >>>
> >>>
> >>> And what about using 'rte_malloc_socket()' API (instead of
> >>> rte_malloc), which gets 'socket' as parameter, and provide the
> >>> socket that devices is on as parameter to this API? Is it possible to test
> this?
> >>>
> >>>
> >> As to the reason for not using rte_malloc_socket. I thought
> >> rte_malloc_socket() could solve the problem too. And the appropriate
> >> parameter should be the socket_id that created the memory pool for
> >> DPDK initialization. Assuming that> the socket_id of the initially
> >> allocated memory = 1, first let the
> > eal_intr_thread
> >> determine if it is on the socket_id, then record this socket_id in
> >> the eal_intr_thread and pass it to the iavf_event_thread.  But there
> >> seems no way to link this parameter to the iavf_dev_event_post()
> function. That is why rte_malloc_socket is not used.
> >>
> >
> > I was thinking socket id of device can be used, but that won't help if
> > the core that interrupt handler runs is in different socket.
> > And I also don't know if there is a way to get socket that interrupt
> > thread is on. @David may help perhaps.
> >
> > So question is why interrupt thread is not running on main lcore.
> >
> 
> OK after some talk with David, what I am missing is 'rte_ctrl_thread_create()'
> does NOT run on main lcore, it can run on any core except data plane cores.
> 
> Driver "iavf-event-thread" thread (iavf_dev_event_handle()) and interrupt
> thread (so driver interrupt callback iavf_dev_event_post()) can run on any
> core, making it hard to manage.
> And it seems it is not possible to control where interrupt thread to run.
> 
> One option can be allocating hugepages for all sockets, but this requires user
> involvement, and can't happen transparently.
> 
> Other option can be to control where "iavf-event-thread" run, like using
> 'rte_thread_create()' to create thread and provide attribute to run it on main
> lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
> 
> Can you please test above option?
> 
> 
The first option can solve this issue, but to borrow your earlier words,
"in a dual socket system, if all used cores are in socket 1 and the NIC is in socket 1,
no memory is allocated for socket 0. This is to optimize memory consumption."
I think it is unreasonable to give that up.

About the other option: in the rte_eal_intr_init() function, after the thread
is created, I set the thread affinity for eal-intr-thread, but it does not
solve this issue.
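
The attempted change is not shown in the thread; presumably it looked
something like this sketch (illustrative only). One possible reason it did
not help is that changing the affinity after thread creation does not update
the thread's per-lcore _socket_id, which the DPDK allocator consults:

#include <pthread.h>
#include <rte_lcore.h>

static int
pin_intr_thread_to_main_lcore(pthread_t intr_thread)
{
	rte_cpuset_t cpuset = rte_lcore_cpuset(rte_get_main_lcore());

	/* Moves the thread, but rte_socket_id() for it is unchanged. */
	return pthread_setaffinity_np(intr_thread, sizeof(cpuset), &cpuset);
}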
> 
> >> Let me know if there is anything else unclear.
> >>>
> >>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> >>>> Cc: stable@dpdk.org
> >>>>
> >>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> >>>> ---
> >>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
> >>>>  1 file changed, 3 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
> >>>> b/drivers/net/iavf/iavf_vchnl.c index f92daf97f2..a05791fe48 100644
> >>>> --- a/drivers/net/iavf/iavf_vchnl.c
> >>>> +++ b/drivers/net/iavf/iavf_vchnl.c
> >>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
> >>>>  	struct rte_eth_dev *dev;
> >>>>  	enum rte_eth_event_type event;
> >>>>  	void *param;
> >>>> -	size_t param_alloc_size;
> >>>>  	uint8_t param_alloc_data[0];
> >>>>  };
> >>>>
> >>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param
> __rte_unused)
> >>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
> >>>>  			TAILQ_REMOVE(&pending, pos, next);
> >>>>  			rte_eth_dev_callback_process(pos->dev, pos- event,
> pos->param);
> >>>> -			rte_free(pos);
> >>>> +			free(pos);
> >>>>  		}
> >>>>  	}
> >>>>
> >>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,
> {
> >>>>  	struct iavf_event_handler *handler = &event_handler;
> >>>>  	char notify_byte;
> >>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem)
> >>> + param_alloc_size, 0);
> >>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) +
> >>>> +param_alloc_size);
> >>>>  	if (!elem)
> >>>>  		return;
> >>>>
> >>>>  	elem->dev = dev;
> >>>>  	elem->event = event;
> >>>>  	elem->param = param;
> >>>> -	elem->param_alloc_size = param_alloc_size;
> >>>>  	if (param && param_alloc_size) {
> >>>>  		rte_memcpy(elem->param_alloc_data, param,
> >>> param_alloc_size);
> >>>>  		elem->param = elem->param_alloc_data; @@ -165,7 +163,7
> >>> @@
> >>>> iavf_dev_event_handler_fini(void)
> >>>>  	struct iavf_event_element *pos, *save_next;
> >>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
> >>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
> >>>> -		rte_free(pos);
> >>>> +		free(pos);
> >>>>  	}
> >>>>  }
> >>>>
> >>
> >
  
David Marchand Dec. 20, 2022, 9:33 a.m. UTC | #9
On Tue, Dec 20, 2022 at 7:52 AM You, KaisenX <kaisenx.you@intel.com> wrote:
> > >> As to the reason for not using rte_malloc_socket. I thought
> > >> rte_malloc_socket() could solve the problem too. And the appropriate
> > >> parameter should be the socket_id that created the memory pool for
> > >> DPDK initialization. Assuming that> the socket_id of the initially
> > >> allocated memory = 1, first let the
> > > eal_intr_thread
> > >> determine if it is on the socket_id, then record this socket_id in
> > >> the eal_intr_thread and pass it to the iavf_event_thread.  But there
> > >> seems no way to link this parameter to the iavf_dev_event_post()
> > function. That is why rte_malloc_socket is not used.
> > >>
> > >
> > > I was thinking socket id of device can be used, but that won't help if
> > > the core that interrupt handler runs is in different socket.
> > > And I also don't know if there is a way to get socket that interrupt
> > > thread is on. @David may help perhaps.
> > >
> > > So question is why interrupt thread is not running on main lcore.
> > >
> >
> > OK after some talk with David, what I am missing is 'rte_ctrl_thread_create()'
> > does NOT run on main lcore, it can run on any core except data plane cores.
> >
> > Driver "iavf-event-thread" thread (iavf_dev_event_handle()) and interrupt
> > thread (so driver interrupt callback iavf_dev_event_post()) can run on any
> > core, making it hard to manage.
> > And it seems it is not possible to control where interrupt thread to run.
> >
> > One option can be allocating hugepages for all sockets, but this requires user
> > involvement, and can't happen transparently.
> >
> > Other option can be to control where "iavf-event-thread" run, like using
> > 'rte_thread_create()' to create thread and provide attribute to run it on main
> > lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
> >
> > Can you please test above option?
> >
> >
> The first option can solve this issue. but to borrow from your previous saying,
> "in a dual socket system, if all used cores are in socket 1 and the NIC is in socket 1,
>  no memory is allocated for socket 0.  This is to optimize memory consumption."
> I think it's unreasonable to do so.
>
> About other option. In " rte_eal_intr_init" function, After the thread is created,
> I set the thread affinity for eal-intr-thread, but it does not solve this issue.

Jumping in this thread.

I tried to play a bit with an E810 NIC on a dual-NUMA system and I can't see
anything wrong for now.
Can you provide a simple and small reproducer of your issue?

Thanks.
  
Kaisen You Dec. 20, 2022, 10:11 a.m. UTC | #10
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: 2022年12月20日 17:34
> To: You, KaisenX <kaisenx.you@intel.com>
> Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On Tue, Dec 20, 2022 at 7:52 AM You, KaisenX <kaisenx.you@intel.com>
> wrote:
> > > >> As to the reason for not using rte_malloc_socket. I thought
> > > >> rte_malloc_socket() could solve the problem too. And the
> > > >> appropriate parameter should be the socket_id that created the
> > > >> memory pool for DPDK initialization. Assuming that> the socket_id
> > > >> of the initially allocated memory = 1, first let the
> > > > eal_intr_thread
> > > >> determine if it is on the socket_id, then record this socket_id
> > > >> in the eal_intr_thread and pass it to the iavf_event_thread.  But
> > > >> there seems no way to link this parameter to the
> > > >> iavf_dev_event_post()
> > > function. That is why rte_malloc_socket is not used.
> > > >>
> > > >
> > > > I was thinking socket id of device can be used, but that won't
> > > > help if the core that interrupt handler runs is in different socket.
> > > > And I also don't know if there is a way to get socket that
> > > > interrupt thread is on. @David may help perhaps.
> > > >
> > > > So question is why interrupt thread is not running on main lcore.
> > > >
> > >
> > > OK after some talk with David, what I am missing is
> 'rte_ctrl_thread_create()'
> > > does NOT run on main lcore, it can run on any core except data plane
> cores.
> > >
> > > Driver "iavf-event-thread" thread (iavf_dev_event_handle()) and
> > > interrupt thread (so driver interrupt callback
> > > iavf_dev_event_post()) can run on any core, making it hard to manage.
> > > And it seems it is not possible to control where interrupt thread to run.
> > >
> > > One option can be allocating hugepages for all sockets, but this
> > > requires user involvement, and can't happen transparently.
> > >
> > > Other option can be to control where "iavf-event-thread" run, like
> > > using 'rte_thread_create()' to create thread and provide attribute
> > > to run it on main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
> > >
> > > Can you please test above option?
> > >
> > >
> > The first option can solve this issue. but to borrow from your
> > previous saying, "in a dual socket system, if all used cores are in
> > socket 1 and the NIC is in socket 1,  no memory is allocated for socket 0.
> This is to optimize memory consumption."
> > I think it's unreasonable to do so.
> >
> > About other option. In " rte_eal_intr_init" function, After the thread
> > is created, I set the thread affinity for eal-intr-thread, but it does not solve
> this issue.
> 
> Jumping in this thread.
> 
> I tried to play a bit with a E810 nic on a dual numa and I can't see anything
> wrong for now.
> Can you provide a simple and small reproducer of your issue?
> 
> Thanks.
> 
This is my environment:
Enter "lscpu" on the command line:
NUMA:
	NUMA node(s): 2
	NUMA node0 CPU(s): 0-27,56-83
	NUMA node1 CPU(s): 28-55,84-111

List the steps to reproduce the issue:

1. create a VF and bind it to DPDK
echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_numvfs
./usertools/dpdk-devbind.py -b vfio-pci 0000:ca:01.0
2. launch testpmd
./x86_64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4 -a 0000:ca:01.0
--file-prefix=dpdk_525342_20221104042659 -- -i --rxq=256 --txq=256
--total-num-mbufs=500000

Parameter description:
 "-l 28-48": the core range after "-l" must be within "NUMA node1 CPU(s)"
 "0000:ca:01.0": the device is on node1
> --
> David Marchand
  
David Marchand Dec. 20, 2022, 10:33 a.m. UTC | #11
On Tue, Dec 20, 2022 at 11:12 AM You, KaisenX <kaisenx.you@intel.com> wrote:
> > I tried to play a bit with a E810 nic on a dual numa and I can't see anything
> > wrong for now.
> > Can you provide a simple and small reproducer of your issue?
> >
> > Thanks.
> >
> This is my environment:
> Enter "lscpu" on the command line:
> NUMA:
>         NUMA node(s): 2
>         NUMA node0 CPU(S) : 0-27,56-83
>         NUMA node1 CPU(S) : 28-55,84-111
>
> List the steps to reproduce the issue:
>
> 1. create vf and blind to dpdk
> echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_ numvfs
> ./usertools/dpdk-devbind. py -b vfio-pci 0000:ca:01.0
> 2. launch testpmd
> ./x86_ 64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4 -a 0000:ca:01.0
> --file-prefix=dpdk_ 525342_ 20221104042659 -- -i --rxq=256 --txq=256
> --total-num-mbufs=500000
>
> Parameter Description:
>  "-l 28-48":The range of parameter values after "-l" must be on "NUMA node1 CPU(S)"
>  "0000:ca:01.0":inset on node1

- Using 256 queues is not supported upstream...
iavf_dev_configure(): large VF is not supported

I would really love for Intel to stop building/testing features with
this out-of-tree driver...
We have lost, and still lose, so much time because of it.


- Back to your topic.
Can you try this simple hack:

diff --git a/lib/eal/common/eal_common_thread.c
b/lib/eal/common/eal_common_thread.c
index c5d8b4327d..92160c7fa6 100644
--- a/lib/eal/common/eal_common_thread.c
+++ b/lib/eal/common/eal_common_thread.c
@@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
        void *routine_arg = params->arg;

        __rte_thread_init(rte_lcore_id(), cpuset);
+       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
        params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
                cpuset);
        if (params->ret != 0) {
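
For context, a simplified sketch of the mechanism this hack relies on (not
the actual EAL source): when no explicit socket is given, the allocator
resolves the target heap from the caller's per-thread socket id, so forcing
it to SOCKET_ID_ANY in control threads lets allocations fall back to a
socket that actually has memory:

#include <rte_lcore.h>
#include <rte_malloc.h>

void *
sketch_alloc(size_t size)
{
	int socket = rte_socket_id();	/* reads RTE_PER_LCORE(_socket_id) */

	/* With SOCKET_ID_ANY the allocator may pick any heap that has
	 * memory, instead of the (possibly empty) local socket's heap. */
	return rte_malloc_socket(NULL, size, 0, socket);
}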
  
Kaisen You Dec. 21, 2022, 9:12 a.m. UTC | #12
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: 2022年12月20日 18:33
> To: You, KaisenX <kaisenx.you@intel.com>
> Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On Tue, Dec 20, 2022 at 11:12 AM You, KaisenX <kaisenx.you@intel.com>
> wrote:
> > > I tried to play a bit with a E810 nic on a dual numa and I can't see
> > > anything wrong for now.
> > > Can you provide a simple and small reproducer of your issue?
> > >
> > > Thanks.
> > >
> > This is my environment:
> > Enter "lscpu" on the command line:
> > NUMA:
> >         NUMA node(s): 2
> >         NUMA node0 CPU(S) : 0-27,56-83
> >         NUMA node1 CPU(S) : 28-55,84-111
> >
> > List the steps to reproduce the issue:
> >
> > 1. create vf and blind to dpdk
> > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_ numvfs
> > ./usertools/dpdk-devbind. py -b vfio-pci 0000:ca:01.0 2. launch
> > testpmd ./x86_ 64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4
> > -a 0000:ca:01.0 --file-prefix=dpdk_ 525342_ 20221104042659 -- -i
> > --rxq=256 --txq=256
> > --total-num-mbufs=500000
> >
> > Parameter Description:
> >  "-l 28-48":The range of parameter values after "-l" must be on "NUMA
> node1 CPU(S)"
> >  "0000:ca:01.0":inset on node1
> 
> - Using 256 queues is not supported upstream...
> iavf_dev_configure(): large VF is not supported
> 
> I would really love that Intel stops building/testing features with this out of
> tree driver........
> We have lost and still lose so much time because of it.
> 
> 
> - Back to your topic.
> Can you try this simple hack:
> 
> diff --git a/lib/eal/common/eal_common_thread.c
> b/lib/eal/common/eal_common_thread.c
> index c5d8b4327d..92160c7fa6 100644
> --- a/lib/eal/common/eal_common_thread.c
> +++ b/lib/eal/common/eal_common_thread.c
> @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
>         void *routine_arg = params->arg;
> 
>         __rte_thread_init(rte_lcore_id(), cpuset);
> +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
>         params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
>                 cpuset);
>         if (params->ret != 0) {
> 
Thanks for your advice.

But this issue still exists after I tried it.
> 
> --
> David Marchand
  
David Marchand Dec. 21, 2022, 10:50 a.m. UTC | #13
On Wed, Dec 21, 2022 at 10:12 AM You, KaisenX <kaisenx.you@intel.com> wrote:
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: 2022年12月20日 18:33
> > To: You, KaisenX <kaisenx.you@intel.com>
> > Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> > <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> > <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> > Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> > John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> > Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >
> > On Tue, Dec 20, 2022 at 11:12 AM You, KaisenX <kaisenx.you@intel.com>
> > wrote:
> > > > I tried to play a bit with a E810 nic on a dual numa and I can't see
> > > > anything wrong for now.
> > > > Can you provide a simple and small reproducer of your issue?
> > > >
> > > > Thanks.
> > > >
> > > This is my environment:
> > > Enter "lscpu" on the command line:
> > > NUMA:
> > >         NUMA node(s): 2
> > >         NUMA node0 CPU(S) : 0-27,56-83
> > >         NUMA node1 CPU(S) : 28-55,84-111
> > >
> > > List the steps to reproduce the issue:
> > >
> > > 1. create vf and blind to dpdk
> > > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_ numvfs
> > > ./usertools/dpdk-devbind. py -b vfio-pci 0000:ca:01.0 2. launch
> > > testpmd ./x86_ 64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4
> > > -a 0000:ca:01.0 --file-prefix=dpdk_ 525342_ 20221104042659 -- -i
> > > --rxq=256 --txq=256
> > > --total-num-mbufs=500000
> > >
> > > Parameter Description:
> > >  "-l 28-48":The range of parameter values after "-l" must be on "NUMA
> > node1 CPU(S)"
> > >  "0000:ca:01.0":inset on node1
> > - Back to your topic.
> > Can you try this simple hack:
> >
> > diff --git a/lib/eal/common/eal_common_thread.c
> > b/lib/eal/common/eal_common_thread.c
> > index c5d8b4327d..92160c7fa6 100644
> > --- a/lib/eal/common/eal_common_thread.c
> > +++ b/lib/eal/common/eal_common_thread.c
> > @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
> >         void *routine_arg = params->arg;
> >
> >         __rte_thread_init(rte_lcore_id(), cpuset);
> > +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
> >         params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
> >                 cpuset);
> >         if (params->ret != 0) {
> >
> Thanks for your advice.
>
> But this issue still exists after I tried.

Ok, I think I understand what is wrong... but I am still guessing as I
am not sure what your "issue" is.
Can you have a try with:
https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-1-david.marchand@redhat.com/

Thanks.
  
Ferruh Yigit Dec. 21, 2022, 1:48 p.m. UTC | #14
On 12/20/2022 6:52 AM, You, KaisenX wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>> Sent: 2022年12月13日 21:28
>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
>> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
>> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
>> Boccassi <bluca@debian.org>; Mcnamara, John
>> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>
>> On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
>>> On 12/13/2022 7:52 AM, You, KaisenX wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>>>> Sent: 2022年12月8日 23:04
>>>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>>>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>>>>> <david.marchand@redhat.com>
>>>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
>>>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
>>>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
>>>>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
>>>>> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>;
>> Kevin
>>>>> Traynor <ktraynor@redhat.com>
>>>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>>>>
>>>>> On 11/17/2022 6:57 AM, Kaisen You wrote:
>>>>>> In some cases, the DPDK does not allocate hugepage heap memory to
>>>>> some
>>>>>> sockets due to the user setting parameters (e.g. -l 40-79, SOCKET 0
>>>>>> has no memory).
>>>>>> When the interrupt thread runs on the corresponding core of this
>>>>>> socket, each allocation/release will execute a whole set of heap
>>>>>> allocation/release operations,resulting in poor performance.
>>>>>> Instead we call malloc() to get memory from the system's heap space
>>>>>> to fix this problem.
>>>>>>
>>>>>
>>>>> Hi Kaisen,
>>>>>
>>>>> Using libc malloc can improve performance for this case, but I would
>>>>> like to understand root cause of the problem.
>>>>>
>>>>>
>>>>> As far as I can see, interrupt callbacks are run by interrupt thread
>>>>> ("eal-intr- thread"), and interrupt thread created by
>> 'rte_ctrl_thread_create()' API.
>>>>>
>>>>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity
>>>>> retrieved at the time 'rte_eal_init()' was called,"
>>>>>
>>>>> And 'rte_eal_init()' is run on main lcore, which is the first lcore
>>>>> in the core list (unless otherwise defined with --main-lcore).
>>>>>
>>>>> So, the interrupts should be running on a core that has hugepages
>>>>> allocated for it, am I missing something here?
>>>>>
>>>>>
>>>> Thank for your comments.  Let me try to explain the root cause here:
>>>> eal_intr_thread the CPU in the corresponding slot does not create
>> memory pool.
>>>> That results in frequent memory subsequently creating/destructing.
>>>>
>>>> When testpmd started, the parameter (e.g. -l 40-79) is set.
>>>> Different OS has different topology. Some OS like SUSE only creates
>>>> memory pool for one CPU slot, while other system creates for two.
>>>> That is why the problem occurs when using memories in different OS.
>>>
>>>
>>> It is the testpmd application that decides which socket to allocate
>>> memory from, right? This is nothing specific to the OS.
>>>
>>> As far as I remember, testpmd's logic is to allocate from the sockets
>>> whose cores are used (provided with the -l parameter), and from the
>>> socket that the device is attached to.
>>>
>>> So, in a dual socket system, if all used cores are in socket 1 and the
>>> NIC is in socket 1, no memory is allocated for socket 0. This is to
>>> optimize memory consumption.
>>>
>>>
>>> Can you please confirm that the problem you are observing is because
>>> the interrupt handler is running on a CPU whose socket doesn't have
>>> memory allocated?
>>>
>>> In this case, what I don't understand is why interrupts are not running
>>> on the main lcore, which should be the first core in the list; for the
>>> "-l 40-79" sample it should be lcore 40.
>>> For your case, does the interrupt handler run on core 0, or on an
>>> arbitrary core? If so, can you please confirm whether providing the
>>> core list as "-l 0,40-79" fixes the issue?
>>>
> First of all, sorry to reply to you so late.
> I can confirm that the problem I observed is because the interrupt handler is
> running on a CPU whose socket doesn't have memory allocated.
> 
> In my case, the interrupt handler is running on core 0.
> When I tried providing "-l 0,40-79" as a startup parameter, the issue was resolved.
> 
> I would like to correct my previous statement that this problem only occurs on
> SUSE systems. On any OS, this problem occurs as long as the range of
> startup cores is only on node1.
> 
>>>
>>>>>
>>>>>
>>>>> And what about using the 'rte_malloc_socket()' API (instead of
>>>>> rte_malloc), which takes 'socket' as a parameter, and providing the
>>>>> socket that the device is on as the parameter to this API? Is it
>>>>> possible to test this?
>>>>>
>>>>>
>>>> As to the reason for not using rte_malloc_socket: I thought
>>>> rte_malloc_socket() could solve the problem too, and the appropriate
>>>> parameter should be the socket_id that created the memory pool for
>>>> DPDK initialization. Assuming that the socket_id of the initially
>>>> allocated memory = 1, first let the eal_intr_thread determine if it is
>>>> on that socket_id, then record this socket_id in the eal_intr_thread
>>>> and pass it to the iavf_event_thread. But there seems to be no way to
>>>> link this parameter to the iavf_dev_event_post() function. That is why
>>>> rte_malloc_socket is not used.
>>>>
>>>
>>> I was thinking the socket id of the device could be used, but that
>>> won't help if the core that the interrupt handler runs on is in a
>>> different socket.
>>> And I also don't know if there is a way to get the socket that the
>>> interrupt thread is on. @David may help perhaps.
>>>
>>> So the question is why the interrupt thread is not running on the main lcore.
>>>
>>
>> OK, after some talk with David, what I was missing is that
>> 'rte_ctrl_thread_create()' does NOT run on the main lcore; it can run on
>> any core except data plane cores.
>>
>> The driver "iavf-event-thread" thread (iavf_dev_event_handle()) and the
>> interrupt thread (so the driver interrupt callback iavf_dev_event_post())
>> can run on any core, making it hard to manage.
>> And it seems it is not possible to control where the interrupt thread runs.
>>
>> One option can be allocating hugepages for all sockets, but this requires
>> user involvement, and can't happen transparently.
>>
>> Another option can be to control where "iavf-event-thread" runs, like
>> using 'rte_thread_create()' to create the thread and providing an
>> attribute to run it on the main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
>>
>> Can you please test the above option?
>>
>>
> The first option can solve this issue, but to borrow your previous words,
> "in a dual socket system, if all used cores are in socket 1 and the NIC is in socket 1,
> no memory is allocated for socket 0. This is to optimize memory consumption."
> I think it's unreasonable to do so.
> 
> About the other option: in the "rte_eal_intr_init" function, after the thread is
> created, I set the thread affinity for eal-intr-thread, but it does not solve this issue.

Hi Kaisen,

There are two threads involved,

First one is interrupt thread, "eal-intr-thread", created by
'rte_eal_intr_init()'.

Second one is iavf event handler, "iavf-event-thread", created by
'iavf_dev_event_handler_init()'.

The first one is triggered by an interrupt and puts a message on a list;
the second one consumes from the list and processes the message.
So I assume the two threads being in different sockets, or memory being
allocated on a different socket than the one the cores run on, causes the
performance issue.

Did you test with the second thread, "iavf-event-thread", pinned to the
main core (by creating the thread using the 'rte_thread_create()' API)?
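
For reference, a minimal sketch of what I mean, assuming the rte_thread_attr_*
helpers available in a recent tree (exact names and signatures may differ per
release, so treat this as a sketch rather than final code):

    #include <rte_lcore.h>
    #include <rte_thread.h>

    /* Hypothetical adapter: rte_thread_func returns uint32_t, while
     * iavf_dev_event_handle() returns void *, so a thin wrapper is needed. */
    static uint32_t iavf_event_handle_wrapper(void *arg);

    static int
    iavf_event_thread_on_main_lcore(void)
    {
        rte_cpuset_t cpuset = rte_lcore_cpuset(rte_get_main_lcore());
        rte_thread_attr_t attr;
        rte_thread_t tid;

        rte_thread_attr_init(&attr);
        rte_thread_attr_set_affinity(&attr, &cpuset);
        /* The thread would then allocate from the main lcore's socket. */
        return rte_thread_create(&tid, &attr, iavf_event_handle_wrapper, NULL);
    }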


>>
>>>> Let me know if there is anything else unclear.
>>>>>
>>>>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
>>>>>> Cc: stable@dpdk.org
>>>>>>
>>>>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
>>>>>> ---
>>>>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>>>>>>  1 file changed, 3 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
>>>>>> b/drivers/net/iavf/iavf_vchnl.c
>>>>>> index f92daf97f2..a05791fe48 100644
>>>>>> --- a/drivers/net/iavf/iavf_vchnl.c
>>>>>> +++ b/drivers/net/iavf/iavf_vchnl.c
>>>>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
>>>>>>  	struct rte_eth_dev *dev;
>>>>>>  	enum rte_eth_event_type event;
>>>>>>  	void *param;
>>>>>> -	size_t param_alloc_size;
>>>>>>  	uint8_t param_alloc_data[0];
>>>>>>  };
>>>>>>
>>>>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>>>>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>>>>>>  			TAILQ_REMOVE(&pending, pos, next);
>>>>>>  			rte_eth_dev_callback_process(pos->dev, pos->event, pos->param);
>>>>>> -			rte_free(pos);
>>>>>> +			free(pos);
>>>>>>  		}
>>>>>>  	}
>>>>>>
>>>>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,
>>>>>>  {
>>>>>>  	struct iavf_event_handler *handler = &event_handler;
>>>>>>  	char notify_byte;
>>>>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem) + param_alloc_size, 0);
>>>>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
>>>>>>  	if (!elem)
>>>>>>  		return;
>>>>>>
>>>>>>  	elem->dev = dev;
>>>>>>  	elem->event = event;
>>>>>>  	elem->param = param;
>>>>>> -	elem->param_alloc_size = param_alloc_size;
>>>>>>  	if (param && param_alloc_size) {
>>>>>>  		rte_memcpy(elem->param_alloc_data, param, param_alloc_size);
>>>>>>  		elem->param = elem->param_alloc_data;
>>>>>> @@ -165,7 +163,7 @@ iavf_dev_event_handler_fini(void)
>>>>>>  	struct iavf_event_element *pos, *save_next;
>>>>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>>>>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
>>>>>> -		rte_free(pos);
>>>>>> +		free(pos);
>>>>>>  	}
>>>>>>  }
>>>>>>
>>>>
>>>
>
  
Kaisen You Dec. 22, 2022, 6:42 a.m. UTC | #15
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: December 21, 2022 18:50
> To: You, KaisenX <kaisenx.you@intel.com>
> Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On Wed, Dec 21, 2022 at 10:12 AM You, KaisenX <kaisenx.you@intel.com>
> wrote:
> > > -----Original Message-----
> > > From: David Marchand <david.marchand@redhat.com>
> > > Sent: December 20, 2022 18:33
> > > To: You, KaisenX <kaisenx.you@intel.com>
> > > Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov,
> > > Anatoly <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> > > <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Xing, Beilei
> > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> > > Boccassi <bluca@debian.org>; Mcnamara, John
> > > <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> > > Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> > >
> > > On Tue, Dec 20, 2022 at 11:12 AM You, KaisenX
> > > <kaisenx.you@intel.com>
> > > wrote:
> > > > > I tried to play a bit with an E810 nic on a dual numa and I
> > > > > can't see anything wrong for now.
> > > > > Can you provide a simple and small reproducer of your issue?
> > > > >
> > > > > Thanks.
> > > > >
> > > > This is my environment:
> > > > Enter "lscpu" on the command line:
> > > > NUMA:
> > > >         NUMA node(s): 2
> > > >         NUMA node0 CPU(S) : 0-27,56-83
> > > >         NUMA node1 CPU(S) : 28-55,84-111
> > > >
> > > > List the steps to reproduce the issue:
> > > >
> > > > 1. create vf and bind to dpdk
> > > > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_numvfs
> > > > ./usertools/dpdk-devbind.py -b vfio-pci 0000:ca:01.0
> > > > 2. launch testpmd
> > > > ./x86_64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4
> > > > -a 0000:ca:01.0 --file-prefix=dpdk_525342_20221104042659 -- -i
> > > > --rxq=256 --txq=256
> > > > --total-num-mbufs=500000
> > > >
> > > > Parameter Description:
> > > >  "-l 28-48": the range of parameter values after "-l" must be on
> > > > "NUMA node1 CPU(S)"
> > > >  "0000:ca:01.0": the device is on node1
> > > - Back to your topic.
> > > Can you try this simple hack:
> > >
> > > diff --git a/lib/eal/common/eal_common_thread.c
> > > b/lib/eal/common/eal_common_thread.c
> > > index c5d8b4327d..92160c7fa6 100644
> > > --- a/lib/eal/common/eal_common_thread.c
> > > +++ b/lib/eal/common/eal_common_thread.c
> > > @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
> > >         void *routine_arg = params->arg;
> > >
> > >         __rte_thread_init(rte_lcore_id(), cpuset);
> > > +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
> > >         params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
> > >                 cpuset);
> > >         if (params->ret != 0) {
> > >
> > Thanks for your advice.
> >
> > But this issue still exists after I tried.
> 
> Ok, I think I understand what is wrong... but I am still guessing as I am not
> sure what your "issue" is.
> Can you have a try with:
> https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-1-
> david.marchand@redhat.com/
> 
> Thanks.
> 
I think this issue is similar to the description in the patch you gave me.

when the DPDK application is started only on one numa node, Interrupt thread 
find memory on another numa node. This leads to a whole set of memory 
allocation/release operations every time when "rte_malloc" is called.
This is the root cause of this issue.

This issue can be solved after I tried.
Thanks for your advice.
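
(For anyone reproducing this, a tiny sketch to confirm which socket the
allocating thread reports; this helper is illustrative only, not part of
DPDK or the driver:)

    #include <stdio.h>
    #include <rte_lcore.h>

    /* Print the socket the calling thread allocates from by default.
     * A socket with no hugepage memory reserved hits the slow path. */
    static void
    dump_alloc_socket(const char *tag)
    {
        printf("%s: thread socket %d, main lcore socket %u\n",
               tag, (int)rte_socket_id(),
               rte_lcore_to_socket_id(rte_get_main_lcore()));
    }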
> 
> --
> David Marchand
  
Kaisen You Dec. 22, 2022, 7:23 a.m. UTC | #16
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: December 21, 2022 21:49
> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> Anatoly <anatoly.burakov@intel.com>; David Marchand
> <david.marchand@redhat.com>
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> Boccassi <bluca@debian.org>; Mcnamara, John
> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On 12/20/2022 6:52 AM, You, KaisenX wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >> Sent: December 13, 2022 21:28
> >> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >> <david.marchand@redhat.com>
> >> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> >> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>;
> >> Mcnamara, John <john.mcnamara@intel.com>; Kevin Traynor
> >> <ktraynor@redhat.com>
> >> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>
> >> On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
> >>> On 12/13/2022 7:52 AM, You, KaisenX wrote:
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >>>>> Sent: 2022年12月8日 23:04
> >>>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >>>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >>>>> <david.marchand@redhat.com>
> >>>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >>>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >>>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
> >>>>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
> >>>>> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>;
> >> Kevin
> >>>>> Traynor <ktraynor@redhat.com>
> >>>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>>>>
> >>>>> On 11/17/2022 6:57 AM, Kaisen You wrote:
> >>>>>> In some cases, the DPDK does not allocate hugepage heap memory
> >>>>>> to some sockets due to the user setting parameters (e.g. -l 40-79,
> >>>>>> SOCKET 0 has no memory).
> >>>>>> When the interrupt thread runs on the corresponding core of this
> >>>>>> socket, each allocation/release will execute a whole set of heap
> >>>>>> allocation/release operations, resulting in poor performance.
> >>>>>> Instead we call malloc() to get memory from the system's heap
> >>>>>> space to fix this problem.
> >>>>>>
> >>>>>
> >>>>> Hi Kaisen,
> >>>>>
> >>>>> Using libc malloc can improve performance for this case, but I
> >>>>> would like to understand root cause of the problem.
> >>>>>
> >>>>>
> >>>>> As far as I can see, interrupt callbacks are run by the interrupt
> >>>>> thread ("eal-intr-thread"), and the interrupt thread is created by
> >>>>> the 'rte_ctrl_thread_create()' API.
> >>>>>
> >>>>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity
> >>>>> retrieved at the time 'rte_eal_init()' was called,"
> >>>>>
> >>>>> And 'rte_eal_init()' is run on main lcore, which is the first
> >>>>> lcore in the core list (unless otherwise defined with --main-lcore).
> >>>>>
> >>>>> So, the interrupts should be running on a core that has hugepages
> >>>>> allocated for it, am I missing something here?
> >>>>>
> >>>>>
> >>>> Thanks for your comments. Let me try to explain the root cause here:
> >>>> the CPU socket that eal_intr_thread runs on does not have a memory
> >>>> pool created for it.
> >>>> That results in frequent memory creation/destruction afterwards.
> >>>>
> >>>> When testpmd is started, the parameter (e.g. -l 40-79) is set.
> >>>> Different OSes have different topologies. Some OSes like SUSE only
> >>>> create a memory pool for one CPU socket, while other systems create
> >>>> them for two.
> >>>> That is why the problem occurs when using memory on different OSes.
> >>>
> >>>
> >>> It is the testpmd application that decides which socket to allocate
> >>> memory from, right? This is nothing specific to the OS.
> >>>
> >>> As far as I remember, testpmd's logic is to allocate from the sockets
> >>> whose cores are used (provided with the -l parameter), and from the
> >>> socket that the device is attached to.
> >>>
> >>> So, in a dual socket system, if all used cores are in socket 1 and
> >>> the NIC is in socket 1, no memory is allocated for socket 0. This is
> >>> to optimize memory consumption.
> >>>
> >>>
> >>> Can you please confirm that the problem you are observing is because
> >>> the interrupt handler is running on a CPU whose socket doesn't have
> >>> memory allocated?
> >>>
> >>> In this case, what I don't understand is why interrupts are not
> >>> running on the main lcore, which should be the first core in the list;
> >>> for the "-l 40-79" sample it should be lcore 40.
> >>> For your case, does the interrupt handler run on core 0, or on an
> >>> arbitrary core? If so, can you please confirm whether providing the
> >>> core list as "-l 0,40-79" fixes the issue?
> >>>
> > First of all, sorry to reply to you so late.
> > I can confirm that the problem I observed is because the interrupt
> > handler is running on a CPU whose socket doesn't have memory allocated.
> >
> > In my case, the interrupt handler is running on core 0.
> > When I tried providing "-l 0,40-79" as a startup parameter, the issue
> > was resolved.
> >
> > I would like to correct my previous statement that this problem only
> > occurs on SUSE systems. On any OS, this problem occurs as long as the
> > range of startup cores is only on node1.
> >
> >>>
> >>>>>
> >>>>>
> >>>>> And what about using the 'rte_malloc_socket()' API (instead of
> >>>>> rte_malloc), which takes 'socket' as a parameter, and providing the
> >>>>> socket that the device is on as the parameter to this API? Is it
> >>>>> possible to test this?
> >>>>>
> >>>>>
> >>>> As to the reason for not using rte_malloc_socket: I thought
> >>>> rte_malloc_socket() could solve the problem too, and the
> >>>> appropriate parameter should be the socket_id that created the
> >>>> memory pool for DPDK initialization. Assuming that the socket_id
> >>>> of the initially allocated memory = 1, first let the eal_intr_thread
> >>>> determine if it is on that socket_id, then record this socket_id in
> >>>> the eal_intr_thread and pass it to the iavf_event_thread. But
> >>>> there seems to be no way to link this parameter to the
> >>>> iavf_dev_event_post() function. That is why rte_malloc_socket is not used.
> >>>>
> >>>
> >>> I was thinking the socket id of the device could be used, but that
> >>> won't help if the core that the interrupt handler runs on is in a
> >>> different socket.
> >>> And I also don't know if there is a way to get the socket that the
> >>> interrupt thread is on. @David may help perhaps.
> >>>
> >>> So the question is why the interrupt thread is not running on the main lcore.
> >>>
> >>
> >> OK, after some talk with David, what I was missing is that
> >> 'rte_ctrl_thread_create()' does NOT run on the main lcore; it can run
> >> on any core except data plane cores.
> >>
> >> The driver "iavf-event-thread" thread (iavf_dev_event_handle()) and the
> >> interrupt thread (so the driver interrupt callback iavf_dev_event_post())
> >> can run on any core, making it hard to manage.
> >> And it seems it is not possible to control where the interrupt thread runs.
> >>
> >> One option can be allocating hugepages for all sockets, but this
> >> requires user involvement, and can't happen transparently.
> >>
> >> Another option can be to control where "iavf-event-thread" runs, like
> >> using 'rte_thread_create()' to create the thread and providing an
> >> attribute to run it on the main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
> >>
> >> Can you please test the above option?
> >>
> >>
> > The first option can solve this issue, but to borrow your
> > previous words, "in a dual socket system, if all used cores are in
> > socket 1 and the NIC is in socket 1, no memory is allocated for socket 0.
> > This is to optimize memory consumption."
> > I think it's unreasonable to do so.
> >
> > About the other option: in the "rte_eal_intr_init" function, after the
> > thread is created, I set the thread affinity for eal-intr-thread, but it
> > does not solve this issue.
> 
> Hi Kaisen,
> 
> There are two threads involved,
> 
> First one is interrupt thread, "eal-intr-thread", created by 'rte_eal_intr_init()'.
> 
> Second one is iavf event handler, "iavf-event-thread", created by
> 'iavf_dev_event_handler_init()'.
> 
> The first one is triggered by an interrupt and puts a message on a list;
> the second one consumes from the list and processes the message.
> So I assume the two threads being in different sockets, or memory being
> allocated on a different socket than the one the cores run on, causes the
> performance issue.
> 
> Did you test with the second thread, "iavf-event-thread", pinned to the
> main core (by creating the thread using the 'rte_thread_create()' API)?
> 
> 
I tried using the 'rte_thread_create()' API to create the second thread,
but this issue still exists.

Because the malloc is executed by "eal_intr_thread", it has nothing
to do with "iavf_event_thread".

But I found a patch similar to my issue:
https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-1-david.marchand@redhat.com/
With that patch's modification, this issue can be solved.

> >>
> >>>> Let me know if there is anything else unclear.
> >>>>>
> >>>>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
> >>>>>> Cc: stable@dpdk.org
> >>>>>>
> >>>>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
> >>>>>> ---
> >>>>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
> >>>>>>  1 file changed, 3 insertions(+), 5 deletions(-)
> >>>>>>
>>>>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
>>>>>> b/drivers/net/iavf/iavf_vchnl.c
>>>>>> index f92daf97f2..a05791fe48 100644
>>>>>> --- a/drivers/net/iavf/iavf_vchnl.c
>>>>>> +++ b/drivers/net/iavf/iavf_vchnl.c
>>>>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
>>>>>>  	struct rte_eth_dev *dev;
>>>>>>  	enum rte_eth_event_type event;
>>>>>>  	void *param;
>>>>>> -	size_t param_alloc_size;
>>>>>>  	uint8_t param_alloc_data[0];
>>>>>>  };
>>>>>>
>>>>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>>>>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>>>>>>  			TAILQ_REMOVE(&pending, pos, next);
>>>>>>  			rte_eth_dev_callback_process(pos->dev, pos->event, pos->param);
>>>>>> -			rte_free(pos);
>>>>>> +			free(pos);
>>>>>>  		}
>>>>>>  	}
>>>>>>
>>>>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,
>>>>>>  {
>>>>>>  	struct iavf_event_handler *handler = &event_handler;
>>>>>>  	char notify_byte;
>>>>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem) + param_alloc_size, 0);
>>>>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
>>>>>>  	if (!elem)
>>>>>>  		return;
>>>>>>
>>>>>>  	elem->dev = dev;
>>>>>>  	elem->event = event;
>>>>>>  	elem->param = param;
>>>>>> -	elem->param_alloc_size = param_alloc_size;
>>>>>>  	if (param && param_alloc_size) {
>>>>>>  		rte_memcpy(elem->param_alloc_data, param, param_alloc_size);
>>>>>>  		elem->param = elem->param_alloc_data;
>>>>>> @@ -165,7 +163,7 @@ iavf_dev_event_handler_fini(void)
>>>>>>  	struct iavf_event_element *pos, *save_next;
>>>>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>>>>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
>>>>>> -		rte_free(pos);
>>>>>> +		free(pos);
>>>>>>  	}
>>>>>>  }
> >>>>>>
> >>>>
> >>>
> >
  
Ferruh Yigit Dec. 22, 2022, 12:06 p.m. UTC | #17
On 12/22/2022 7:23 AM, You, KaisenX wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>> Sent: December 21, 2022 21:49
>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>> <david.marchand@redhat.com>
>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
>> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing,
>> Beilei <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
>> Boccassi <bluca@debian.org>; Mcnamara, John
>> <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>
>> On 12/20/2022 6:52 AM, You, KaisenX wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> Sent: December 13, 2022 21:28
>>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>>>> <david.marchand@redhat.com>
>>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
>>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
>>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
>>>> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>;
>>>> Mcnamara, John <john.mcnamara@intel.com>; Kevin Traynor
>>>> <ktraynor@redhat.com>
>>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>>>
>>>> On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
>>>>> On 12/13/2022 7:52 AM, You, KaisenX wrote:
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>>>>>> Sent: December 8, 2022 23:04
>>>>>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
>>>>>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
>>>>>>> <david.marchand@redhat.com>
>>>>>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
>>>>>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
>>>>>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
>>>>>>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
>>>>>>> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>;
>>>> Kevin
>>>>>>> Traynor <ktraynor@redhat.com>
>>>>>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
>>>>>>>
>>>>>>> On 11/17/2022 6:57 AM, Kaisen You wrote:
>>>>>>>> In some cases, the DPDK does not allocate hugepage heap memory
>>>>>>>> to some sockets due to the user setting parameters (e.g. -l 40-79,
>>>>>>>> SOCKET 0 has no memory).
>>>>>>>> When the interrupt thread runs on the corresponding core of this
>>>>>>>> socket, each allocation/release will execute a whole set of heap
>>>>>>>> allocation/release operations, resulting in poor performance.
>>>>>>>> Instead we call malloc() to get memory from the system's heap
>>>>>>>> space to fix this problem.
>>>>>>>>
>>>>>>>
>>>>>>> Hi Kaisen,
>>>>>>>
>>>>>>> Using libc malloc can improve performance for this case, but I
>>>>>>> would like to understand root cause of the problem.
>>>>>>>
>>>>>>>
>>>>>>> As far as I can see, interrupt callbacks are run by the interrupt
>>>>>>> thread ("eal-intr-thread"), and the interrupt thread is created by
>>>>>>> the 'rte_ctrl_thread_create()' API.
>>>>>>>
>>>>>>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity
>>>>>>> retrieved at the time 'rte_eal_init()' was called,"
>>>>>>>
>>>>>>> And 'rte_eal_init()' is run on main lcore, which is the first
>>>>>>> lcore in the core list (unless otherwise defined with --main-lcore).
>>>>>>>
>>>>>>> So, the interrupts should be running on a core that has hugepages
>>>>>>> allocated for it, am I missing something here?
>>>>>>>
>>>>>>>
>>>>>> Thanks for your comments. Let me try to explain the root cause here:
>>>>>> the CPU socket that eal_intr_thread runs on does not have a memory
>>>>>> pool created for it.
>>>>>> That results in frequent memory creation/destruction afterwards.
>>>>>>
>>>>>> When testpmd is started, the parameter (e.g. -l 40-79) is set.
>>>>>> Different OSes have different topologies. Some OSes like SUSE only
>>>>>> create a memory pool for one CPU socket, while other systems create
>>>>>> them for two.
>>>>>> That is why the problem occurs when using memory on different OSes.
>>>>>
>>>>>
>>>>> It is the testpmd application that decides which socket to allocate
>>>>> memory from, right? This is nothing specific to the OS.
>>>>>
>>>>> As far as I remember, testpmd's logic is to allocate from the sockets
>>>>> whose cores are used (provided with the -l parameter), and from the
>>>>> socket that the device is attached to.
>>>>>
>>>>> So, in a dual socket system, if all used cores are in socket 1 and
>>>>> the NIC is in socket 1, no memory is allocated for socket 0. This is
>>>>> to optimize memory consumption.
>>>>>
>>>>>
>>>>> Can you please confirm that the problem you are observing is because
>>>>> the interrupt handler is running on a CPU whose socket doesn't have
>>>>> memory allocated?
>>>>>
>>>>> In this case, what I don't understand is why interrupts are not
>>>>> running on the main lcore, which should be the first core in the list;
>>>>> for the "-l 40-79" sample it should be lcore 40.
>>>>> For your case, does the interrupt handler run on core 0, or on an
>>>>> arbitrary core? If so, can you please confirm whether providing the
>>>>> core list as "-l 0,40-79" fixes the issue?
>>>>>
>>> First of all, sorry to reply to you so late.
>>> I can confirm that the problem I observed is because the interrupt
>>> handler is running on a CPU whose socket doesn't have memory allocated.
>>>
>>> In my case, the interrupt handler is running on core 0.
>>> When I tried providing "-l 0,40-79" as a startup parameter, the issue
>>> was resolved.
>>>
>>> I would like to correct my previous statement that this problem only
>>> occurs on SUSE systems. On any OS, this problem occurs as long as the
>>> range of startup cores is only on node1.
>>>
>>>>>
>>>>>>>
>>>>>>>
>>>>>>> And what about using the 'rte_malloc_socket()' API (instead of
>>>>>>> rte_malloc), which takes 'socket' as a parameter, and providing the
>>>>>>> socket that the device is on as the parameter to this API? Is it
>>>>>>> possible to test this?
>>>>>>>
>>>>>>>
>>>>>> As to the reason for not using rte_malloc_socket: I thought
>>>>>> rte_malloc_socket() could solve the problem too, and the
>>>>>> appropriate parameter should be the socket_id that created the
>>>>>> memory pool for DPDK initialization. Assuming that the socket_id
>>>>>> of the initially allocated memory = 1, first let the eal_intr_thread
>>>>>> determine if it is on that socket_id, then record this socket_id in
>>>>>> the eal_intr_thread and pass it to the iavf_event_thread. But
>>>>>> there seems to be no way to link this parameter to the
>>>>>> iavf_dev_event_post() function. That is why rte_malloc_socket is not used.
>>>>>>
>>>>>
>>>>> I was thinking the socket id of the device could be used, but that
>>>>> won't help if the core that the interrupt handler runs on is in a
>>>>> different socket.
>>>>> And I also don't know if there is a way to get the socket that the
>>>>> interrupt thread is on. @David may help perhaps.
>>>>>
>>>>> So the question is why the interrupt thread is not running on the main lcore.
>>>>>
>>>>
>>>> OK, after some talk with David, what I was missing is that
>>>> 'rte_ctrl_thread_create()' does NOT run on the main lcore; it can run
>>>> on any core except data plane cores.
>>>>
>>>> The driver "iavf-event-thread" thread (iavf_dev_event_handle()) and the
>>>> interrupt thread (so the driver interrupt callback iavf_dev_event_post())
>>>> can run on any core, making it hard to manage.
>>>> And it seems it is not possible to control where the interrupt thread runs.
>>>>
>>>> One option can be allocating hugepages for all sockets, but this
>>>> requires user involvement, and can't happen transparently.
>>>>
>>>> Another option can be to control where "iavf-event-thread" runs, like
>>>> using 'rte_thread_create()' to create the thread and providing an
>>>> attribute to run it on the main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
>>>>
>>>> Can you please test the above option?
>>>>
>>>>
>>> The first option can solve this issue, but to borrow your
>>> previous words, "in a dual socket system, if all used cores are in
>>> socket 1 and the NIC is in socket 1, no memory is allocated for socket 0.
>>> This is to optimize memory consumption."
>>> I think it's unreasonable to do so.
>>>
>>> About the other option: in the "rte_eal_intr_init" function, after the
>>> thread is created, I set the thread affinity for eal-intr-thread, but it
>>> does not solve this issue.
>>
>> Hi Kaisen,
>>
>> There are two threads involved,
>>
>> First one is interrupt thread, "eal-intr-thread", created by 'rte_eal_intr_init()'.
>>
>> Second one is iavf event handler, "iavf-event-thread", created by
>> 'iavf_dev_event_handler_init()'.
>>
>> The first one is triggered by an interrupt and puts a message on a list;
>> the second one consumes from the list and processes the message.
>> So I assume the two threads being in different sockets, or memory being
>> allocated on a different socket than the one the cores run on, causes the
>> performance issue.
>>
>> Did you test with the second thread, "iavf-event-thread", pinned to the
>> main core (by creating the thread using the 'rte_thread_create()' API)?
>>
>>
> I tried using the 'rte_thread_create()' API to create the second thread,
> but this issue still exists.
> 
> Because the malloc is executed by "eal_intr_thread", it has nothing
> to do with "iavf_event_thread".
> 

Since 'iavf_event_element' ('pos', which is allocated by malloc()) is accessed
and freed in "iavf-event-thread", it could be related. But if it doesn't fix
it, that is OK; thanks for testing.

> But I found a patch similar to my issue:
> https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-1-david.marchand@redhat.com/
> According to the patch modification, this issue can be solved.
> 

I guess that patch was inspired by this discussion, and if it fixes the
issue, I prefer that one as the generic solution.

>>>>
>>>>>> Let me know if there is anything else unclear.
>>>>>>>
>>>>>>>> Fixes: cb5c1b91f76f ("net/iavf: add thread for event callbacks")
>>>>>>>> Cc: stable@dpdk.org
>>>>>>>>
>>>>>>>> Signed-off-by: Kaisen You <kaisenx.you@intel.com>
>>>>>>>> ---
>>>>>>>>  drivers/net/iavf/iavf_vchnl.c | 8 +++-----
>>>>>>>>  1 file changed, 3 insertions(+), 5 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/net/iavf/iavf_vchnl.c
>>>>>>>> b/drivers/net/iavf/iavf_vchnl.c
>>>>>>>> index f92daf97f2..a05791fe48 100644
>>>>>>>> --- a/drivers/net/iavf/iavf_vchnl.c
>>>>>>>> +++ b/drivers/net/iavf/iavf_vchnl.c
>>>>>>>> @@ -36,7 +36,6 @@ struct iavf_event_element {
>>>>>>>>  	struct rte_eth_dev *dev;
>>>>>>>>  	enum rte_eth_event_type event;
>>>>>>>>  	void *param;
>>>>>>>> -	size_t param_alloc_size;
>>>>>>>>  	uint8_t param_alloc_data[0];
>>>>>>>>  };
>>>>>>>>
>>>>>>>> @@ -80,7 +79,7 @@ iavf_dev_event_handle(void *param __rte_unused)
>>>>>>>>  		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
>>>>>>>>  			TAILQ_REMOVE(&pending, pos, next);
>>>>>>>>  			rte_eth_dev_callback_process(pos->dev, pos->event, pos->param);
>>>>>>>> -			rte_free(pos);
>>>>>>>> +			free(pos);
>>>>>>>>  		}
>>>>>>>>  	}
>>>>>>>>
>>>>>>>> @@ -94,14 +93,13 @@ iavf_dev_event_post(struct rte_eth_dev *dev,
>>>>>>>>  {
>>>>>>>>  	struct iavf_event_handler *handler = &event_handler;
>>>>>>>>  	char notify_byte;
>>>>>>>> -	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem) + param_alloc_size, 0);
>>>>>>>> +	struct iavf_event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
>>>>>>>>  	if (!elem)
>>>>>>>>  		return;
>>>>>>>>
>>>>>>>>  	elem->dev = dev;
>>>>>>>>  	elem->event = event;
>>>>>>>>  	elem->param = param;
>>>>>>>> -	elem->param_alloc_size = param_alloc_size;
>>>>>>>>  	if (param && param_alloc_size) {
>>>>>>>>  		rte_memcpy(elem->param_alloc_data, param, param_alloc_size);
>>>>>>>>  		elem->param = elem->param_alloc_data;
>>>>>>>> @@ -165,7 +163,7 @@ iavf_dev_event_handler_fini(void)
>>>>>>>>  	struct iavf_event_element *pos, *save_next;
>>>>>>>>  	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
>>>>>>>>  		TAILQ_REMOVE(&handler->pending, pos, next);
>>>>>>>> -		rte_free(pos);
>>>>>>>> +		free(pos);
>>>>>>>>  	}
>>>>>>>>  }
>>>>>>>>
>>>>>>
>>>>>
>>>
>
  
Qi Zhang Dec. 26, 2022, 2:17 a.m. UTC | #18
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: Thursday, December 22, 2022 8:07 PM
> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; David Marchand
> <david.marchand@redhat.com>
> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>; Kevin
> Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> On 12/22/2022 7:23 AM, You, KaisenX wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >> Sent: December 21, 2022 21:49
> >> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >> <david.marchand@redhat.com>
> >> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> >> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>;
> >> Mcnamara, John <john.mcnamara@intel.com>; Kevin Traynor
> >> <ktraynor@redhat.com>
> >> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>
> >> On 12/20/2022 6:52 AM, You, KaisenX wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >>>> Sent: December 13, 2022 21:28
> >>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >>>> <david.marchand@redhat.com>
> >>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
> >>>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
> >>>> <bluca@debian.org>; Mcnamara, John <john.mcnamara@intel.com>;
> Kevin
> >>>> Traynor <ktraynor@redhat.com>
> >>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>>>
> >>>> On 12/13/2022 9:35 AM, Ferruh Yigit wrote:
> >>>>> On 12/13/2022 7:52 AM, You, KaisenX wrote:
> >>>>>>
> >>>>>>
> >>>>>>> -----Original Message-----
> >>>>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >>>>>>> Sent: December 8, 2022 23:04
> >>>>>>> To: You, KaisenX <kaisenx.you@intel.com>; dev@dpdk.org; Burakov,
> >>>>>>> Anatoly <anatoly.burakov@intel.com>; David Marchand
> >>>>>>> <david.marchand@redhat.com>
> >>>>>>> Cc: stable@dpdk.org; Yang, Qiming <qiming.yang@intel.com>; Zhou,
> >>>>>>> YidingX <yidingx.zhou@intel.com>; Wu, Jingjing
> >>>>>>> <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>;
> >>>>>>> Zhang, Qi Z <qi.z.zhang@intel.com>; Luca Boccassi
> >>>>>>> <bluca@debian.org>; Mcnamara, John
> <john.mcnamara@intel.com>;
> >>>> Kevin
> >>>>>>> Traynor <ktraynor@redhat.com>
> >>>>>>> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >>>>>>>
> >>>>>>> On 11/17/2022 6:57 AM, Kaisen You wrote:
> >>>>>>>> In some cases, the DPDK does not allocate hugepage heap memory
> >>>>>>>> to some sockets due to the user setting parameters (e.g. -l 40-79,
> >>>>>>>> SOCKET 0 has no memory).
> >>>>>>>> When the interrupt thread runs on the corresponding core of
> >>>>>>>> this socket, each allocation/release will execute a whole set
> >>>>>>>> of heap allocation/release operations, resulting in poor
> >>>>>>>> performance.
> >>>>>>>> Instead we call malloc() to get memory from the system's heap
> >>>>>>>> space to fix this problem.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Hi Kaisen,
> >>>>>>>
> >>>>>>> Using libc malloc can improve performance for this case, but I
> >>>>>>> would like to understand root cause of the problem.
> >>>>>>>
> >>>>>>>
> >>>>>>> As far as I can see, interrupt callbacks are run by the interrupt
> >>>>>>> thread ("eal-intr-thread"), and the interrupt thread is created by
> >>>>>>> the 'rte_ctrl_thread_create()' API.
> >>>>>>>
> >>>>>>> 'rte_ctrl_thread_create()' comment mentions that "CPU affinity
> >>>>>>> retrieved at the time 'rte_eal_init()' was called,"
> >>>>>>>
> >>>>>>> And 'rte_eal_init()' is run on main lcore, which is the first
> >>>>>>> lcore in the core list (unless otherwise defined with --main-lcore).
> >>>>>>>
> >>>>>>> So, the interrupts should be running on a core that has
> >>>>>>> hugepages allocated for it, am I missing something here?
> >>>>>>>
> >>>>>>>
> >>>>>> Thanks for your comments. Let me try to explain the root cause here:
> >>>>>> the CPU socket that eal_intr_thread runs on does not have a memory
> >>>>>> pool created for it.
> >>>>>> That results in frequent memory creation/destruction afterwards.
> >>>>>>
> >>>>>> When testpmd is started, the parameter (e.g. -l 40-79) is set.
> >>>>>> Different OSes have different topologies. Some OSes like SUSE only
> >>>>>> create a memory pool for one CPU socket, while other systems create
> >>>>>> them for two.
> >>>>>> That is why the problem occurs when using memory on different OSes.
> >>>>>
> >>>>>
> >>>>> It is the testpmd application that decides which socket to
> >>>>> allocate memory from, right? This is nothing specific to the OS.
> >>>>>
> >>>>> As far as I remember, testpmd's logic is to allocate from the sockets
> >>>>> whose cores are used (provided with the -l parameter), and from the
> >>>>> socket that the device is attached to.
> >>>>>
> >>>>> So, in a dual socket system, if all used cores are in socket 1 and
> >>>>> the NIC is in socket 1, no memory is allocated for socket 0. This
> >>>>> is to optimize memory consumption.
> >>>>>
> >>>>>
> >>>>> Can you please confirm that the problem you are observing is
> >>>>> because the interrupt handler is running on a CPU whose socket
> >>>>> doesn't have memory allocated?
> >>>>>
> >>>>> In this case, what I don't understand is why interrupts are not
> >>>>> running on the main lcore, which should be the first core in the
> >>>>> list; for the "-l 40-79" sample it should be lcore 40.
> >>>>> For your case, does the interrupt handler run on core 0, or on an
> >>>>> arbitrary core? If so, can you please confirm whether providing the
> >>>>> core list as "-l 0,40-79" fixes the issue?
> >>>>>
> >>> First of all, sorry to reply to you so late.
> >>> I can confirm that the problem I observed is because the interrupt
> >>> handler is running on a CPU whose socket doesn't have memory
> >>> allocated.
> >>>
> >>> In my case, the interrupt handler is running on core 0.
> >>> When I tried providing "-l 0,40-79" as a startup parameter, the issue
> >>> was resolved.
> >>>
> >>> I would like to correct my previous statement that this problem only
> >>> occurs on SUSE systems. On any OS, this problem occurs as long as
> >>> the range of startup cores is only on node1.
> >>>
> >>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> And what about using the 'rte_malloc_socket()' API (instead of
> >>>>>>> rte_malloc), which takes 'socket' as a parameter, and providing the
> >>>>>>> socket that the device is on as the parameter to this API? Is it
> >>>>>>> possible to test this?
> >>>>>>>
> >>>>>>>
> >>>>>> As to the reason for not using rte_malloc_socket: I thought
> >>>>>> rte_malloc_socket() could solve the problem too, and the
> >>>>>> appropriate parameter should be the socket_id that created the
> >>>>>> memory pool for DPDK initialization. Assuming that the socket_id
> >>>>>> of the initially allocated memory = 1, first let the eal_intr_thread
> >>>>>> determine if it is on that socket_id, then record this socket_id
> >>>>>> in the eal_intr_thread and pass it to the iavf_event_thread. But
> >>>>>> there seems to be no way to link this parameter to the
> >>>>>> iavf_dev_event_post() function. That is why rte_malloc_socket is not used.
> >>>>>>
> >>>>>
> >>>>> I was thinking the socket id of the device could be used, but that
> >>>>> won't help if the core that the interrupt handler runs on is in a
> >>>>> different socket.
> >>>>> And I also don't know if there is a way to get the socket that the
> >>>>> interrupt thread is on. @David may help perhaps.
> >>>>>
> >>>>> So the question is why the interrupt thread is not running on the main lcore.
> >>>>>
> >>>>
> >>>> OK, after some talk with David, what I was missing is that
> >>>> 'rte_ctrl_thread_create()' does NOT run on the main lcore; it can
> >>>> run on any core except data plane cores.
> >>>>
> >>>> The driver "iavf-event-thread" thread (iavf_dev_event_handle()) and
> >>>> the interrupt thread (so the driver interrupt callback
> >>>> iavf_dev_event_post()) can run on any core, making it hard to manage.
> >>>> And it seems it is not possible to control where the interrupt thread runs.
> >>>>
> >>>> One option can be allocating hugepages for all sockets, but this
> >>>> requires user involvement, and can't happen transparently.
> >>>>
> >>>> Another option can be to control where "iavf-event-thread" runs, like
> >>>> using 'rte_thread_create()' to create the thread and providing an
> >>>> attribute to run it on the main lcore (rte_lcore_cpuset(rte_get_main_lcore()))?
> >>>>
> >>>> Can you please test the above option?
> >>>>
> >>>>
> >>> The first option can solve this issue, but to borrow your
> >>> previous words, "in a dual socket system, if all used cores are in
> >>> socket 1 and the NIC is in socket 1, no memory is allocated for socket 0.
> >>> This is to optimize memory consumption."
> >>> I think it's unreasonable to do so.
> >>>
> >>> About the other option: in the "rte_eal_intr_init" function, after the
> >>> thread is created, I set the thread affinity for eal-intr-thread,
> >>> but it does not solve this issue.
> >>
> >> Hi Kaisen,
> >>
> >> There are two threads involved,
> >>
> >> First one is interrupt thread, "eal-intr-thread", created by
> 'rte_eal_intr_init()'.
> >>
> >> Second one is iavf event handler, "iavf-event-thread", created by
> >> 'iavf_dev_event_handler_init()'.
> >>
> >> The first one is triggered by an interrupt and puts a message on a
> >> list; the second one consumes from the list and processes the message.
> >> So I assume the two threads being in different sockets, or memory being
> >> allocated on a different socket than the one the cores run on, causes
> >> the performance issue.
> >>
> >> Did you test with the second thread, "iavf-event-thread", pinned to
> >> the main core (by creating the thread using the 'rte_thread_create()' API)?
> >>
> >>
> > I tried using the 'rte_thread_create()' API to create the second thread,
> > but this issue still exists.
> >
> > Because the malloc is executed by "eal_intr_thread", it has nothing to do
> > with "iavf_event_thread".
> >
> 
> Since 'iavf_event_element' ('pos', which is allocated by malloc()) is accessed and
> freed in "iavf-event-thread", it could be related. But if it doesn't fix it, that is OK;
> thanks for testing.
> 
> > But I found a patch similar to my issue:
> > https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-
> 1-
> > david.marchand@redhat.com/ According to the patch modification, this
> > issue can be solved.
> >
> 
> I guess that patch was inspired by this discussion, and if it fixes the issue, I
> prefer that one as the generic solution.

+1 

That looks like the feature that DPDK should have.
  
Kaisen You Dec. 27, 2022, 6:06 a.m. UTC | #19
> -----Original Message-----
> From: You, KaisenX <kaisenx.you@intel.com>
> Sent: December 22, 2022 14:43
> To: David Marchand <david.marchand@redhat.com>
> Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: RE: [PATCH] net/iavf:fix slow memory allocation
> 
> 
> 
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: December 21, 2022 18:50
> > To: You, KaisenX <kaisenx.you@intel.com>
> > Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov,
> > Anatoly <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> > <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Xing, Beilei
> > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> > Boccassi <bluca@debian.org>; Mcnamara, John
> <john.mcnamara@intel.com>;
> > Kevin Traynor <ktraynor@redhat.com>
> > Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> >
> > On Wed, Dec 21, 2022 at 10:12 AM You, KaisenX <kaisenx.you@intel.com>
> > wrote:
> > > > -----Original Message-----
> > > > From: David Marchand <david.marchand@redhat.com>
> > > > Sent: December 20, 2022 18:33
> > > > To: You, KaisenX <kaisenx.you@intel.com>
> > > > Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov,
> > > > Anatoly <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> > > > <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>;
> > > > Wu, Jingjing <jingjing.wu@intel.com>; Xing, Beilei
> > > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Luca
> > > > Boccassi <bluca@debian.org>; Mcnamara, John
> > > > <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> > > > Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> > > >
> > > > On Tue, Dec 20, 2022 at 11:12 AM You, KaisenX
> > > > <kaisenx.you@intel.com>
> > > > wrote:
> > > > > > I tried to play a bit with an E810 nic on a dual numa and I
> > > > > > can't see anything wrong for now.
> > > > > > Can you provide a simple and small reproducer of your issue?
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > This is my environment:
> > > > > Enter "lscpu" on the command line:
> > > > > NUMA:
> > > > >         NUMA node(s): 2
> > > > >         NUMA node0 CPU(S) : 0-27,56-83
> > > > >         NUMA node1 CPU(S) : 28-55,84-111
> > > > >
> > > > > List the steps to reproduce the issue:
> > > > >
> > > > > 1. create vf and bind to dpdk
> > > > > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_numvfs
> > > > > ./usertools/dpdk-devbind.py -b vfio-pci 0000:ca:01.0
> > > > > 2. launch testpmd
> > > > > ./x86_64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4
> > > > > -a 0000:ca:01.0 --file-prefix=dpdk_525342_20221104042659 -- -i
> > > > > --rxq=256 --txq=256
> > > > > --total-num-mbufs=500000
> > > > >
> > > > > Parameter Description:
> > > > >  "-l 28-48": the range of parameter values after "-l" must be on
> > > > > "NUMA node1 CPU(S)"
> > > > >  "0000:ca:01.0": the device is on node1
> > > > - Back to your topic.
> > > > Can you try this simple hack:
> > > >
> > > > diff --git a/lib/eal/common/eal_common_thread.c
> > > > b/lib/eal/common/eal_common_thread.c
> > > > index c5d8b4327d..92160c7fa6 100644
> > > > --- a/lib/eal/common/eal_common_thread.c
> > > > +++ b/lib/eal/common/eal_common_thread.c
> > > > @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
> > > >         void *routine_arg = params->arg;
> > > >
> > > >         __rte_thread_init(rte_lcore_id(), cpuset);
> > > > +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
> > > >         params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
> > > >                 cpuset);
> > > >         if (params->ret != 0) {
> > > >
> > > Thanks for your advice.
> > >
> > > But this issue still exists after I tried.
> >
> > Ok, I think I understand what is wrong... but I am still guessing as I
> > am not sure what your "issue" is.
> > Can you have a try with:
> > https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-
> 1-
> > david.marchand@redhat.com/
> >
> > Thanks.
> >
> I think this issue is similar to the description in the patch you gave me.
> 
> When the DPDK application is started only on one NUMA node, the interrupt
> thread finds memory on another NUMA node. This leads to a whole set of
> memory allocation/release operations every time "rte_malloc" is called.
> This is the root cause of this issue.
> 
> After trying your patch, I can confirm this issue is solved.
> Thanks for your advice.

After further testing in a different environment, we found the issue still
exists with your last patch. After troubleshooting, it is found that in the
"malloc_get_numa_socket()" API, if the return value of "rte_socket_id()"
is "SOCKET_ID_ANY" (-1), the API will return
"rte_lcore_to_socket_id(rte_get_main_lcore())";
otherwise, "malloc_get_numa_socket()" will directly return the return value
of rte_socket_id(), and in this case the issue cannot be solved.

And the return value of "rte_socket_id()" is modified by the solution you
suggested in your last email (RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;).
Therefore, I think merging your two suggestions together could completely solve this issue.

Can you please update your patch accordingly?
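
To make the combined direction concrete, here is a rough sketch of the two
pieces together (both pieces are quoted above; this is hedged, and the actual
series may end up looking different):

    /* 1) In ctrl_thread_init(): control threads keep no fixed NUMA binding. */
    RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;

    /* 2) In lib/eal/common/malloc_heap.c: fall back to the main lcore's
     *    socket when the calling thread reports SOCKET_ID_ANY. */
    static unsigned int
    malloc_get_numa_socket(void)
    {
        unsigned int socket_id = rte_socket_id();

        if (socket_id == (unsigned int)SOCKET_ID_ANY)
            return rte_lcore_to_socket_id(rte_get_main_lcore());

        return socket_id;
    }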
> >
> > --
> > David Marchand
  
David Marchand Jan. 10, 2023, 10:16 a.m. UTC | #20
Hello,

On Tue, Dec 27, 2022 at 7:06 AM You, KaisenX <kaisenx.you@intel.com> wrote:
> > > > > > > I tried to play a bit with an E810 nic on a dual numa and I
> > > > > > > can't see anything wrong for now.
> > > > > > > Can you provide a simple and small reproducer of your issue?
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > This is my environment:
> > > > > > Enter "lscpu" on the command line:
> > > > > > NUMA:
> > > > > >         NUMA node(s): 2
> > > > > >         NUMA node0 CPU(S) : 0-27,56-83
> > > > > >         NUMA node1 CPU(S) : 28-55,84-111
> > > > > >
> > > > > > List the steps to reproduce the issue:
> > > > > >
> > > > > > 1. create vf and bind to dpdk
> > > > > > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_numvfs
> > > > > > ./usertools/dpdk-devbind.py -b vfio-pci 0000:ca:01.0
> > > > > > 2. launch testpmd
> > > > > > ./x86_64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4
> > > > > > -a 0000:ca:01.0 --file-prefix=dpdk_525342_20221104042659 -- -i
> > > > > > --rxq=256 --txq=256
> > > > > > --total-num-mbufs=500000
> > > > > >
> > > > > > Parameter Description:
> > > > > >  "-l 28-48": the range of parameter values after "-l" must be on
> > > > > > "NUMA node1 CPU(S)"
> > > > > >  "0000:ca:01.0": the device is on node1
> > > > > - Back to your topic.
> > > > > Can you try this simple hack:
> > > > >
> > > > > diff --git a/lib/eal/common/eal_common_thread.c
> > > > > b/lib/eal/common/eal_common_thread.c
> > > > > index c5d8b4327d..92160c7fa6 100644
> > > > > --- a/lib/eal/common/eal_common_thread.c
> > > > > +++ b/lib/eal/common/eal_common_thread.c
> > > > > @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
> > > > >         void *routine_arg = params->arg;
> > > > >
> > > > >         __rte_thread_init(rte_lcore_id(), cpuset);
> > > > > +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
> > > > >         params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
> > > > >                 cpuset);
> > > > >         if (params->ret != 0) {
> > > > >
> > > > Thanks for your advice.
> > > >
> > > > But this issue still exists after I tried.
> > >
> > > Ok, I think I understand what is wrong... but I am still guessing as I
> > > am not sure what your "issue" is.
> > > Can you have a try with:
> > > https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-
> > 1-
> > > david.marchand@redhat.com/
> > >
> > > Thanks.
> > >
> > I think this issue is similar to the description in the patch you gave me.
> >
> > When the DPDK application is started only on one NUMA node, the interrupt
> > thread finds memory on another NUMA node. This leads to a whole set of
> > memory allocation/release operations every time "rte_malloc" is called.
> > This is the root cause of this issue.
> >
> > After trying your patch, I can confirm this issue is solved.
> > Thanks for your advice.
>
> After further testing in a different environment, we found the issue still
> exists with your last patch. After troubleshooting, it is found that in the
> "malloc_get_numa_socket()" API, if the return value of "rte_socket_id()"
> is "SOCKET_ID_ANY" (-1), the API will return
> "rte_lcore_to_socket_id(rte_get_main_lcore())";
> otherwise, "malloc_get_numa_socket()" will directly return the return value
> of rte_socket_id(), and in this case the issue cannot be solved.
>
> And the return value of "rte_socket_id()" is modified by the solution you
> suggested in your last email (RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;).
> Therefore, I think merging your two suggestions together could completely solve this issue.
>
> Can you please update your patch accordingly?

Please try the last revision and report back.
https://patchwork.dpdk.org/project/dpdk/list/?series=26362

Thanks.
  
Kaisen You Jan. 13, 2023, 6:24 a.m. UTC | #21
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: January 10, 2023 18:17
> To: You, KaisenX <kaisenx.you@intel.com>
> Cc: Ferruh Yigit <ferruh.yigit@amd.com>; dev@dpdk.org; Burakov, Anatoly
> <anatoly.burakov@intel.com>; stable@dpdk.org; Yang, Qiming
> <qiming.yang@intel.com>; Zhou, YidingX <yidingx.zhou@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Zhang,
> Qi Z <qi.z.zhang@intel.com>; Luca Boccassi <bluca@debian.org>; Mcnamara,
> John <john.mcnamara@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> Subject: Re: [PATCH] net/iavf:fix slow memory allocation
> 
> Hello,
> 
> On Tue, Dec 27, 2022 at 7:06 AM You, KaisenX <kaisenx.you@intel.com>
> wrote:
> > > > > > > > I tried to play a bit with an E810 nic on a dual numa and I
> > > > > > > > can't see anything wrong for now.
> > > > > > > > Can you provide a simple and small reproducer of your issue?
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > This is my environment:
> > > > > > > Enter "lscpu" on the command line:
> > > > > > > NUMA:
> > > > > > >         NUMA node(s): 2
> > > > > > >         NUMA node0 CPU(S) : 0-27,56-83
> > > > > > >         NUMA node1 CPU(S) : 28-55,84-111
> > > > > > >
> > > > > > > List the steps to reproduce the issue:
> > > > > > >
> > > > > > > 1. Create a VF and bind it to DPDK:
> > > > > > > echo 1 > /sys/bus/pci/devices/0000\:ca\:00.0/sriov_numvfs
> > > > > > > ./usertools/dpdk-devbind.py -b vfio-pci 0000:ca:01.0
> > > > > > > 2. Launch testpmd:
> > > > > > > ./x86_64-native-linuxapp-clang/app/dpdk-testpmd -l 28-48 -n 4 \
> > > > > > >   -a 0000:ca:01.0 --file-prefix=dpdk_525342_20221104042659 \
> > > > > > >   -- -i --rxq=256 --txq=256 --total-num-mbufs=500000
> > > > > > >
> > > > > > > Parameter Description:
> > > > > > > "-l 28-48": the core list after "-l" must lie entirely within
> > > > > > > "NUMA node1 CPU(S)"
> > > > > > > "0000:ca:01.0": the device is on NUMA node1
> > > > > > - Back to your topic.
> > > > > > Can you try this simple hack:
> > > > > >
> > > > > > diff --git a/lib/eal/common/eal_common_thread.c
> > > > > > b/lib/eal/common/eal_common_thread.c
> > > > > > index c5d8b4327d..92160c7fa6 100644
> > > > > > --- a/lib/eal/common/eal_common_thread.c
> > > > > > +++ b/lib/eal/common/eal_common_thread.c
> > > > > > @@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
> > > > > >         void *routine_arg = params->arg;
> > > > > >
> > > > > >         __rte_thread_init(rte_lcore_id(), cpuset);
> > > > > > +       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
> > > > > >         params->ret = pthread_setaffinity_np(pthread_self(),
> > > > > > sizeof(*cpuset),
> > > > > >                 cpuset);
> > > > > >         if (params->ret != 0) {
> > > > > >
> > > > > Thanks for your advice.
> > > > >
> > > > > But this issue still exists after I tried it.
> > > >
> > > > Ok, I think I understand what is wrong... but I am still guessing
> > > > as I am not sure what your "issue" is.
> > > > Can you have a try with:
> > > >
> > > > https://patchwork.dpdk.org/project/dpdk/patch/20221221104858.296530-1-david.marchand@redhat.com/
> > > >
> > > > Thanks.
> > > >
> > > I think this issue is similar to the description in the patch you gave me.
> > >
> > > When the DPDK application is started on only one NUMA node, the
> > > interrupt thread finds memory on another NUMA node. This leads to a
> > > whole set of memory allocation/release operations every time
> > > "rte_malloc" is called.
> > > This is the root cause of this issue.
> > >
> > > This issue is solved after I tried your patch.
> > > Thanks for your advice.
> >
> > After further testing in a different environment, we found the issue
> > still exists with your last patch. After troubleshooting, we found
> > that in the "malloc_get_numa_socket()" API, if the return value of
> > "rte_socket_id()"
> > is "SOCKET_ID_ANY" (-1), the API returns
> > "rte_lcore_to_socket_id(rte_get_main_lcore())"; otherwise, the
> > "malloc_get_numa_socket()" API directly returns the return value of
> > "rte_socket_id()", and in that case the issue is not solved.
> >
> > And the return value of "rte_socket_id()" is modified by the solution
> > you suggested in your last email (RTE_PER_LCORE(_socket_id) =
> > SOCKET_ID_ANY;).
> > Therefore, I think merging your two suggestions together would
> > completely solve this issue.
> >
> > Can you please update your patch accordingly?
> 
> Please try the last revision and report back.
> https://patchwork.dpdk.org/project/dpdk/list/?series=26362
> 
> Thanks.
> 
Still failed.

Reason:
/lib/eal/common/malloc_heap.c
malloc_get_numa_socket(void)
{
        const struct internal_config *conf = eal_get_internal_configuration();
        unsigned int socket_id = rte_socket_id();   // The return value of "rte_socket_id()" is 1
        unsigned int idx;

        if (socket_id != (unsigned int)SOCKET_ID_ANY)
                return socket_id;    // so it returns here
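
For reference, a sketch of the part of the function the excerpt above omits, as described earlier in this thread (a paraphrase, not the upstream source); the fallback to the main lcore's socket is only reachable when rte_socket_id() reports SOCKET_ID_ANY:

#include <rte_lcore.h>   /* rte_socket_id(), rte_lcore_to_socket_id() */
#include <rte_memory.h>  /* SOCKET_ID_ANY */

static unsigned int
sketch_get_numa_socket(void)
{
        unsigned int socket_id = rte_socket_id();

        /* In the failing case this is 1, so the function returns early
         * and the fallback below is never reached. */
        if (socket_id != (unsigned int)SOCKET_ID_ANY)
                return socket_id;

        /* Fall back to the main lcore's socket, as described above. */
        return rte_lcore_to_socket_id(rte_get_main_lcore());
}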

So this patch cannot completely solve the issue, because the interrupt
thread is already on socket 1 when it is initialized (rte_ctrl_thread_create).
I think the following modification should be added to this patch:

diff --git a/lib/eal/common/eal_common_thread.c
b/lib/eal/common/eal_common_thread.c
index c5d8b4327d..92160c7fa6 100644
--- a/lib/eal/common/eal_common_thread.c
+++ b/lib/eal/common/eal_common_thread.c
@@ -253,6 +253,7 @@ static void *ctrl_thread_init(void *arg)
        void *routine_arg = params->arg;

        __rte_thread_init(rte_lcore_id(), cpuset);
+       RTE_PER_LCORE(_socket_id) = SOCKET_ID_ANY;
        params->ret = pthread_setaffinity_np(pthread_self(), sizeof(*cpuset),
                cpuset);
        if (params->ret != 0) {
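
If it helps, here is a hypothetical standalone check (not part of the patch) of the combined behavior, assuming the EAL change above is applied; with it, a control thread should report SOCKET_ID_ANY (-1) instead of the socket it is scheduled on:

#include <pthread.h>
#include <stdio.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_malloc.h>

static void *
report_socket(void *arg)
{
        (void)arg;
        /* Expected to print -1 once the control thread resets its
         * per-thread socket id. */
        printf("control thread rte_socket_id() = %d\n", (int)rte_socket_id());

        void *p = rte_malloc(NULL, 64, 0); /* should no longer be slow */
        rte_free(p);
        return NULL;
}

int
main(int argc, char **argv)
{
        pthread_t tid;

        if (rte_eal_init(argc, argv) < 0)
                return 1;
        if (rte_ctrl_thread_create(&tid, "sock-check", NULL,
                        report_socket, NULL) == 0)
                pthread_join(tid, NULL);
        rte_eal_cleanup();
        return 0;
}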


> --
> David Marchand
  

Patch

diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index f92daf97f2..a05791fe48 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -36,7 +36,6 @@  struct iavf_event_element {
 	struct rte_eth_dev *dev;
 	enum rte_eth_event_type event;
 	void *param;
-	size_t param_alloc_size;
 	uint8_t param_alloc_data[0];
 };
 
@@ -80,7 +79,7 @@  iavf_dev_event_handle(void *param __rte_unused)
 		TAILQ_FOREACH_SAFE(pos, &pending, next, save_next) {
 			TAILQ_REMOVE(&pending, pos, next);
 			rte_eth_dev_callback_process(pos->dev, pos->event, pos->param);
-			rte_free(pos);
+			free(pos);
 		}
 	}
 
@@ -94,14 +93,13 @@  iavf_dev_event_post(struct rte_eth_dev *dev,
 {
 	struct iavf_event_handler *handler = &event_handler;
 	char notify_byte;
-	struct iavf_event_element *elem = rte_malloc(NULL, sizeof(*elem) + param_alloc_size, 0);
+	struct iavf_event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
 	if (!elem)
 		return;
 
 	elem->dev = dev;
 	elem->event = event;
 	elem->param = param;
-	elem->param_alloc_size = param_alloc_size;
 	if (param && param_alloc_size) {
 		rte_memcpy(elem->param_alloc_data, param, param_alloc_size);
 		elem->param = elem->param_alloc_data;
@@ -165,7 +163,7 @@  iavf_dev_event_handler_fini(void)
 	struct iavf_event_element *pos, *save_next;
 	TAILQ_FOREACH_SAFE(pos, &handler->pending, next, save_next) {
 		TAILQ_REMOVE(&handler->pending, pos, next);
-		rte_free(pos);
+		free(pos);
 	}
 }
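
For illustration, a standalone sketch of the allocation pattern the patch switches to, with simplified stand-ins for the driver's structures (this is not the driver code itself): malloc()/free() are served from the C library heap, so they do not depend on the calling thread's NUMA socket the way rte_malloc()/rte_free() do.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for struct iavf_event_element. */
struct event_element {
        int event;
        void *param;
        unsigned char param_alloc_data[]; /* flexible array member */
};

int
main(void)
{
        int payload = 42;
        size_t param_alloc_size = sizeof(payload);

        /* One allocation covers the element and the copied parameter,
         * mirroring iavf_dev_event_post() after the patch. */
        struct event_element *elem = malloc(sizeof(*elem) + param_alloc_size);
        if (elem == NULL)
                return 1;

        elem->event = 0;
        memcpy(elem->param_alloc_data, &payload, param_alloc_size);
        elem->param = elem->param_alloc_data;

        printf("param copied: %d\n", *(int *)elem->param);
        free(elem); /* matches the patch: free(), not rte_free() */
        return 0;
}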