diff mbox

[dpdk-dev] add free hugepage function

Message ID 1414551269-5820-1-git-send-email-haifeng.lin@huawei.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Linhaifeng Oct. 29, 2014, 2:54 a.m. UTC
maybe somebody want to free hugepages when application exit.
so add this function for application to release hugepages when exit.

Signed-off-by: linhaifeng <haifeng.lin@huawei.com>
---
 .../lib/librte_eal/common/include/rte_memory.h     | 11 +++++++++
 .../lib/librte_eal/linuxapp/eal/eal_memory.c       | 27 ++++++++++++++++++++++
 2 files changed, 38 insertions(+)

Comments

Michael Qiu Oct. 29, 2014, 3:27 a.m. UTC | #1
10/29/2014 10:55 AM, linhaifeng :
> maybe somebody want to free hugepages when application exit.
> so add this function for application to release hugepages when exit.
>
> Signed-off-by: linhaifeng <haifeng.lin@huawei.com>
> ---
>  .../lib/librte_eal/common/include/rte_memory.h     | 11 +++++++++
>  .../lib/librte_eal/linuxapp/eal/eal_memory.c       | 27 ++++++++++++++++++++++
>  2 files changed, 38 insertions(+)
>
> diff --git a/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h b/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
> index 4cf8ea9..7251b6b 100644
> --- a/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
> +++ b/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
> @@ -172,6 +172,17 @@ unsigned rte_memory_get_nchannel(void);
>   */
>  unsigned rte_memory_get_nrank(void);
>  
> +/**
> + * Free all the hugepages.For the application to call when exit.
> + *
> + * @param void
> + *
> + * @return
> + *       0: successfully
> + *       negative: error
> + */

here should be a non-return function, see comments below :)

> +int rte_eal_hugepage_free(void);
> +
>  #ifdef RTE_LIBRTE_XEN_DOM0
>  /**
>   * Return the physical address of elt, which is an element of the pool mp.
> diff --git a/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c b/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
> index f2454f4..1ae0e79 100644
> --- a/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -98,6 +98,13 @@
>  #include "eal_filesystem.h"
>  #include "eal_hugepages.h"
>  
> +struct hugepage_table {
> +	struct hugepage_file *hugepg_tbl;
> +	unsigned nr_hugefiles;
> +};
> +
> +static struct hugepage_table g_hugepage_table;
> +
>  /**
>   * @file
>   * Huge page mapping under linux
> @@ -1202,6 +1209,7 @@ rte_eal_hugepage_init(void)
>  						(unsigned)
>  							(used_hp[i].hugepage_sz / 0x100000),
>  						j);
> +				g_hugepage_table.nr_hugefiles += used_hp[i].num_pages[j];
>  			}
>  		}
>  	}
> @@ -1237,6 +1245,8 @@ rte_eal_hugepage_init(void)
>  		goto fail;
>  	}
>  
> +	g_hugepage_table.hugepg_tbl = hugepage;
> +
>  	/* free the temporary hugepage table */
>  	free(tmp_hp);
>  	tmp_hp = NULL;
> @@ -1487,6 +1497,23 @@ error:
>  	return -1;
>  }
>  
> +int
> +rte_eal_hugepage_free(void)
> +{
> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> +	unsigned i;
> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> +
> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
> +
> +	for (i = 0; i < nr_hugefiles; i++) {
> +		unlink(hugepg_tbl[i].filepath);
> +		hugepg_tbl[i].orig_va = NULL;
> +	}
> +
> +	return 0;

I just saw one return path with value '0', and no any other place 
return a negative value,  so it is better to  be designed as one
non-return function,

+void
+rte_eal_hugepage_free(void)
+{
+	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
+	unsigned i;
+	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
+
+	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
+
+	for (i = 0; i < nr_hugefiles; i++) {
+		unlink(hugepg_tbl[i].filepath);
+		hugepg_tbl[i].orig_va = NULL;
+	}
+}
+

Thanks,
Michael
> +}
> +
>  static int
>  rte_eal_memdevice_init(void)
>  {
Matthew Hall Oct. 29, 2014, 3:44 a.m. UTC | #2
On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
> I just saw one return path with value '0', and no any other place 
> return a negative value,  so it is better to  be designed as one
> non-return function,
> 
> +void
> +rte_eal_hugepage_free(void)
> +{
> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> +	unsigned i;
> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> +
> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
> +
> +	for (i = 0; i < nr_hugefiles; i++) {
> +		unlink(hugepg_tbl[i].filepath);
> +		hugepg_tbl[i].orig_va = NULL;
> +	}
> +}
> +
> 
> Thanks,
> Michael

Actually, I don't think that's quite right.

http://linux.die.net/man/2/unlink

"On success, zero is returned. On error, -1 is returned, and errno is set 
appropriately." So it should be returning an error, and logging a message for 
a file it cannot unlink or people will be surprised with weird failures.

It also had some minor typos / English in the comments but we can fix that too.

Matthew.
Linhaifeng Oct. 29, 2014, 5:14 a.m. UTC | #3
On 2014/10/29 11:44, Matthew Hall wrote:
> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
>> I just saw one return path with value '0', and no any other place 
>> return a negative value,  so it is better to  be designed as one
>> non-return function,
>>
>> +void
>> +rte_eal_hugepage_free(void)
>> +{
>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
>> +	unsigned i;
>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
>> +
>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
>> +
>> +	for (i = 0; i < nr_hugefiles; i++) {
>> +		unlink(hugepg_tbl[i].filepath);
>> +		hugepg_tbl[i].orig_va = NULL;
>> +	}
>> +}
>> +
>>
>> Thanks,
>> Michael
> 
> Actually, I don't think that's quite right.
> 
> http://linux.die.net/man/2/unlink
> 
> "On success, zero is returned. On error, -1 is returned, and errno is set 
> appropriately." So it should be returning an error, and logging a message for 
> a file it cannot unlink or people will be surprised with weird failures.
> 
> It also had some minor typos / English in the comments but we can fix that too.
> 
> Matthew.
> 
> 

Thank you Michael & Matthew

I will fix it.
:)
Michael Qiu Oct. 29, 2014, 5:26 a.m. UTC | #4
在 10/29/2014 11:46 AM, Matthew Hall 写道:
> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
>> I just saw one return path with value '0', and no any other place 
>> return a negative value,  so it is better to  be designed as one
>> non-return function,
>>
>> +void
>> +rte_eal_hugepage_free(void)
>> +{
>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
>> +	unsigned i;
>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
>> +
>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
>> +
>> +	for (i = 0; i < nr_hugefiles; i++) {
>> +		unlink(hugepg_tbl[i].filepath);
>> +		hugepg_tbl[i].orig_va = NULL;
>> +	}
>> +}
>> +
>>
>> Thanks,
>> Michael
> Actually, I don't think that's quite right.
>
> http://linux.die.net/man/2/unlink
>
> "On success, zero is returned. On error, -1 is returned, and errno is set 
> appropriately." So it should be returning an error, and logging a message for 
> a file it cannot unlink or people will be surprised with weird failures.

Really need one message for unlink failed, but I'm afraid that if it
make sense for return an error code when application exit.

Thanks
Michael
> It also had some minor typos / English in the comments but we can fix that too.
>
> Matthew.
>
Linhaifeng Oct. 29, 2014, 6:49 a.m. UTC | #5
On 2014/10/29 13:26, Qiu, Michael wrote:
> 在 10/29/2014 11:46 AM, Matthew Hall 写道:
>> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
>>> I just saw one return path with value '0', and no any other place 
>>> return a negative value,  so it is better to  be designed as one
>>> non-return function,
>>>
>>> +void
>>> +rte_eal_hugepage_free(void)
>>> +{
>>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
>>> +	unsigned i;
>>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
>>> +
>>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
>>> +
>>> +	for (i = 0; i < nr_hugefiles; i++) {
>>> +		unlink(hugepg_tbl[i].filepath);
>>> +		hugepg_tbl[i].orig_va = NULL;
>>> +	}
>>> +}
>>> +
>>>
>>> Thanks,
>>> Michael
>> Actually, I don't think that's quite right.
>>
>> http://linux.die.net/man/2/unlink
>>
>> "On success, zero is returned. On error, -1 is returned, and errno is set 
>> appropriately." So it should be returning an error, and logging a message for 
>> a file it cannot unlink or people will be surprised with weird failures.
> 
> Really need one message for unlink failed, but I'm afraid that if it
> make sense for return an error code when application exit.
> 
> Thanks
> Michael
>> It also had some minor typos / English in the comments but we can fix that too.
>>
>> Matthew.
>>
> 
> 
> 
Agree.May be it is not need to return error?
Bruce Richardson Oct. 29, 2014, 10:26 a.m. UTC | #6
On Wed, Oct 29, 2014 at 02:49:05PM +0800, Linhaifeng wrote:
> 
> 
> On 2014/10/29 13:26, Qiu, Michael wrote:
> > 在 10/29/2014 11:46 AM, Matthew Hall 写道:
> >> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
> >>> I just saw one return path with value '0', and no any other place 
> >>> return a negative value,  so it is better to  be designed as one
> >>> non-return function,
> >>>
> >>> +void
> >>> +rte_eal_hugepage_free(void)
> >>> +{
> >>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> >>> +	unsigned i;
> >>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> >>> +
> >>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
> >>> +
> >>> +	for (i = 0; i < nr_hugefiles; i++) {
> >>> +		unlink(hugepg_tbl[i].filepath);
> >>> +		hugepg_tbl[i].orig_va = NULL;
> >>> +	}
> >>> +}
> >>> +
> >>>
> >>> Thanks,
> >>> Michael
> >> Actually, I don't think that's quite right.
> >>
> >> http://linux.die.net/man/2/unlink
> >>
> >> "On success, zero is returned. On error, -1 is returned, and errno is set 
> >> appropriately." So it should be returning an error, and logging a message for 
> >> a file it cannot unlink or people will be surprised with weird failures.
> > 
> > Really need one message for unlink failed, but I'm afraid that if it
> > make sense for return an error code when application exit.
> > 
> > Thanks
> > Michael
> >> It also had some minor typos / English in the comments but we can fix that too.
> >>
> >> Matthew.
> >>
> > 
> > 
> > 
> Agree.May be it is not need to return error?
> -- 
> Regards,
> Haifeng
> 

May I throw out a few extra ideas.

The basic problem with DPDK and hugepages on exit, as you have already 
diagnosed, is not that the hugepages aren't released for re-use, its that 
the files inside the hugepage directory aren't unlinked. There are two 
considerations here that prevented us from coming up previously with a 
solution to this that we were fully happy with.
1. There is no automatic way (at least none that I'm aware of), to have the 
kernel automatically delete a file on exit. This means that any scheme to 
delete the files on application termination will have to be a voluntary one 
that won't happen if an app crashed or is forcibly terminated. That is why 
the EAL right now does the clean-up on the restart of the app instead.
2. The other consideration is multi-process. In a multi-process environment, 
it is unclear when exactly an app is finished - just because one process 
terminates does not mean that the whole app is finished with the hugepage 
files. If the files are deleted before a DPDK secondary process starts up, 
it won't be able to start as it won't be able to mmap the hugepage memory it 
needs.

Now, that being said, I think we could see about having automatic hugepage 
deletion controlled by an EAL parameter. That way, the parameter could be 
omitted in the multi-process case, but could be used for those who want to 
have a clean hugepage dir after app termination.

What I think we could do is, instead of having the app save the list of 
hugepages it uses as the app initializes, is to have the app delete the 
hugepage files as soon as it closes the filehandle for them. If my 
understanding is correct, the kernel will actually keep the hugepage memory 
around in the app as usual from that point onwards, and will only release it 
back [automatically] on app termination. New processes won't be able to mmap 
the file, but the running process will be unaffected. The best thing about 
this approach is that the hugepage file entries will be deleted whether or 
not the application terminates normally or crashes.

So, any thoughts? Would such a scheme work? I haven't prototyped it, yet, 
but I think it should work.

Regards,
/Bruce

PS: As well as multi-process not being able to use this scheme, I'd also 
note that this would prevent the dump_cfg utility app - or any other DPDK 
analysis app like it - from being used to inspect the memory area of the 
running process. That is another reason why I think the flag should not be 
set by default.
Neil Horman Oct. 29, 2014, 2:27 p.m. UTC | #7
On Wed, Oct 29, 2014 at 10:26:35AM +0000, Bruce Richardson wrote:
> On Wed, Oct 29, 2014 at 02:49:05PM +0800, Linhaifeng wrote:
> > 
> > 
> > On 2014/10/29 13:26, Qiu, Michael wrote:
> > > 在 10/29/2014 11:46 AM, Matthew Hall 写道:
> > >> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
> > >>> I just saw one return path with value '0', and no any other place 
> > >>> return a negative value,  so it is better to  be designed as one
> > >>> non-return function,
> > >>>
> > >>> +void
> > >>> +rte_eal_hugepage_free(void)
> > >>> +{
> > >>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> > >>> +	unsigned i;
> > >>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> > >>> +
> > >>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
> > >>> +
> > >>> +	for (i = 0; i < nr_hugefiles; i++) {
> > >>> +		unlink(hugepg_tbl[i].filepath);
> > >>> +		hugepg_tbl[i].orig_va = NULL;
> > >>> +	}
> > >>> +}
> > >>> +
> > >>>
> > >>> Thanks,
> > >>> Michael
> > >> Actually, I don't think that's quite right.
> > >>
> > >> http://linux.die.net/man/2/unlink
> > >>
> > >> "On success, zero is returned. On error, -1 is returned, and errno is set 
> > >> appropriately." So it should be returning an error, and logging a message for 
> > >> a file it cannot unlink or people will be surprised with weird failures.
> > > 
> > > Really need one message for unlink failed, but I'm afraid that if it
> > > make sense for return an error code when application exit.
> > > 
> > > Thanks
> > > Michael
> > >> It also had some minor typos / English in the comments but we can fix that too.
> > >>
> > >> Matthew.
> > >>
> > > 
> > > 
> > > 
> > Agree.May be it is not need to return error?
> > -- 
> > Regards,
> > Haifeng
> > 
> 
> May I throw out a few extra ideas.
> 
> The basic problem with DPDK and hugepages on exit, as you have already 
> diagnosed, is not that the hugepages aren't released for re-use, its that 
> the files inside the hugepage directory aren't unlinked. There are two 
I agree, this patch seems rather unbalanced.  Hugepages are initalized on
startup, not allocated explicitly, and so should not be freed on exit.  Instead,
they should be globally refcounted and (potentially per configuration) freed
when the last application to exit drops the refcount to zero.

> considerations here that prevented us from coming up previously with a 
> solution to this that we were fully happy with.
> 1. There is no automatic way (at least none that I'm aware of), to have the 
> kernel automatically delete a file on exit. This means that any scheme to 
> delete the files on application termination will have to be a voluntary one 
> that won't happen if an app crashed or is forcibly terminated. That is why 
> the EAL right now does the clean-up on the restart of the app instead.
Thats not true.  The common POSIX compliant method is to use the atexit()
function to register a function to be called when a process terminates.  It can
be used from rte_eth_hugepage_init so that a refcount is increased there and
decreased when the process exits.  The atexit registered function can also check
the refcount for zero and, dependent on configuration, unlink the hugepage
files.  That way the application can be completely unaware of a need to do
freeing/unlinking

> 2. The other consideration is multi-process. In a multi-process environment, 
> it is unclear when exactly an app is finished - just because one process 
> terminates does not mean that the whole app is finished with the hugepage 
> files. If the files are deleted before a DPDK secondary process starts up, 
> it won't be able to start as it won't be able to mmap the hugepage memory it 
> needs.
> 
Refcounting fixes that.  If every process takes a reference count on the
hugepage set, then you unlink it when the refcount hits zero.

> Now, that being said, I think we could see about having automatic hugepage 
> deletion controlled by an EAL parameter. That way, the parameter could be 
> omitted in the multi-process case, but could be used for those who want to 
> have a clean hugepage dir after app termination.
> 
> What I think we could do is, instead of having the app save the list of 
> hugepages it uses as the app initializes, is to have the app delete the 
> hugepage files as soon as it closes the filehandle for them. If my 
> understanding is correct, the kernel will actually keep the hugepage memory 
> around in the app as usual from that point onwards, and will only release it 
> back [automatically] on app termination. New processes won't be able to mmap 
> the file, but the running process will be unaffected. The best thing about 
> this approach is that the hugepage file entries will be deleted whether or 
> not the application terminates normally or crashes.
> 
That doesn't really work in a use case in which processes are created and
removed (i.e. if you have two processes, one exits, then another one is forked,
that third process won't find the hugepage file, because the second process will
have unlinked it on exit).

> So, any thoughts? Would such a scheme work? I haven't prototyped it, yet, 
> but I think it should work.
> 
> Regards,
> /Bruce
> 
> PS: As well as multi-process not being able to use this scheme, I'd also 
> note that this would prevent the dump_cfg utility app - or any other DPDK 
> analysis app like it - from being used to inspect the memory area of the 
> running process. That is another reason why I think the flag should not be 
> set by default.
> 
>
Ramia, Kannan Babu Oct. 29, 2014, 3:22 p.m. UTC | #8
The problem still remains if one of the process gets abruptly killed, then the corresponding refcount will not get decremented. There should be some scavenging function to be implemented to check the process liveliness.  

regards
Kannan Babu

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman

Sent: Wednesday, October 29, 2014 7:58 PM
To: Richardson, Bruce
Cc: dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH] add free hugepage function

On Wed, Oct 29, 2014 at 10:26:35AM +0000, Bruce Richardson wrote:
> On Wed, Oct 29, 2014 at 02:49:05PM +0800, Linhaifeng wrote:

> > 

> > 

> > On 2014/10/29 13:26, Qiu, Michael wrote:

> > > 在 10/29/2014 11:46 AM, Matthew Hall 写道:

> > >> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:

> > >>> I just saw one return path with value '0', and no any other 

> > >>> place return a negative value,  so it is better to  be designed 

> > >>> as one non-return function,

> > >>>

> > >>> +void

> > >>> +rte_eal_hugepage_free(void)

> > >>> +{

> > >>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;

> > >>> +	unsigned i;

> > >>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;

> > >>> +

> > >>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", 

> > >>> +nr_hugefiles);

> > >>> +

> > >>> +	for (i = 0; i < nr_hugefiles; i++) {

> > >>> +		unlink(hugepg_tbl[i].filepath);

> > >>> +		hugepg_tbl[i].orig_va = NULL;

> > >>> +	}

> > >>> +}

> > >>> +

> > >>>

> > >>> Thanks,

> > >>> Michael

> > >> Actually, I don't think that's quite right.

> > >>

> > >> http://linux.die.net/man/2/unlink

> > >>

> > >> "On success, zero is returned. On error, -1 is returned, and 

> > >> errno is set appropriately." So it should be returning an error, 

> > >> and logging a message for a file it cannot unlink or people will be surprised with weird failures.

> > > 

> > > Really need one message for unlink failed, but I'm afraid that if 

> > > it make sense for return an error code when application exit.

> > > 

> > > Thanks

> > > Michael

> > >> It also had some minor typos / English in the comments but we can fix that too.

> > >>

> > >> Matthew.

> > >>

> > > 

> > > 

> > > 

> > Agree.May be it is not need to return error?

> > --

> > Regards,

> > Haifeng

> > 

> 

> May I throw out a few extra ideas.

> 

> The basic problem with DPDK and hugepages on exit, as you have already 

> diagnosed, is not that the hugepages aren't released for re-use, its 

> that the files inside the hugepage directory aren't unlinked. There 

> are two

I agree, this patch seems rather unbalanced.  Hugepages are initalized on startup, not allocated explicitly, and so should not be freed on exit.  Instead, they should be globally refcounted and (potentially per configuration) freed when the last application to exit drops the refcount to zero.

> considerations here that prevented us from coming up previously with a 

> solution to this that we were fully happy with.

> 1. There is no automatic way (at least none that I'm aware of), to 

> have the kernel automatically delete a file on exit. This means that 

> any scheme to delete the files on application termination will have to 

> be a voluntary one that won't happen if an app crashed or is forcibly 

> terminated. That is why the EAL right now does the clean-up on the restart of the app instead.

Thats not true.  The common POSIX compliant method is to use the atexit() function to register a function to be called when a process terminates.  It can be used from rte_eth_hugepage_init so that a refcount is increased there and decreased when the process exits.  The atexit registered function can also check the refcount for zero and, dependent on configuration, unlink the hugepage files.  That way the application can be completely unaware of a need to do freeing/unlinking

> 2. The other consideration is multi-process. In a multi-process 

> environment, it is unclear when exactly an app is finished - just 

> because one process terminates does not mean that the whole app is 

> finished with the hugepage files. If the files are deleted before a 

> DPDK secondary process starts up, it won't be able to start as it 

> won't be able to mmap the hugepage memory it needs.

> 

Refcounting fixes that.  If every process takes a reference count on the hugepage set, then you unlink it when the refcount hits zero.

> Now, that being said, I think we could see about having automatic 

> hugepage deletion controlled by an EAL parameter. That way, the 

> parameter could be omitted in the multi-process case, but could be 

> used for those who want to have a clean hugepage dir after app termination.

> 

> What I think we could do is, instead of having the app save the list 

> of hugepages it uses as the app initializes, is to have the app delete 

> the hugepage files as soon as it closes the filehandle for them. If my 

> understanding is correct, the kernel will actually keep the hugepage 

> memory around in the app as usual from that point onwards, and will 

> only release it back [automatically] on app termination. New processes 

> won't be able to mmap the file, but the running process will be 

> unaffected. The best thing about this approach is that the hugepage 

> file entries will be deleted whether or not the application terminates normally or crashes.

> 

That doesn't really work in a use case in which processes are created and removed (i.e. if you have two processes, one exits, then another one is forked, that third process won't find the hugepage file, because the second process will have unlinked it on exit).

> So, any thoughts? Would such a scheme work? I haven't prototyped it, 

> yet, but I think it should work.

> 

> Regards,

> /Bruce

> 

> PS: As well as multi-process not being able to use this scheme, I'd 

> also note that this would prevent the dump_cfg utility app - or any 

> other DPDK analysis app like it - from being used to inspect the 

> memory area of the running process. That is another reason why I think 

> the flag should not be set by default.

> 

>
Neil Horman Oct. 29, 2014, 3:32 p.m. UTC | #9
On Wed, Oct 29, 2014 at 03:22:25PM +0000, Ramia, Kannan Babu wrote:
> The problem still remains if one of the process gets abruptly killed, then the corresponding refcount will not get decremented. There should be some scavenging function to be implemented to check the process liveliness.  
> 
Well, abnormal termination results in abnormal consequences.  You expect
garbage to get left behind of a program crashes, so I wouldn't really worry
about that too much.  If you really wanted to you can register chained handlers
for SIGSEGV/SIGBUS/etc to catch those conditions, but honestly, that seems like
overkill.  If a program that uses shared resources terminates abnormally, its
well understood that those shared resources may not get released properly, and
manual intervention is required to clean them up

Neil

> regards
> Kannan Babu
> 
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> Sent: Wednesday, October 29, 2014 7:58 PM
> To: Richardson, Bruce
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] add free hugepage function
> 
> On Wed, Oct 29, 2014 at 10:26:35AM +0000, Bruce Richardson wrote:
> > On Wed, Oct 29, 2014 at 02:49:05PM +0800, Linhaifeng wrote:
> > > 
> > > 
> > > On 2014/10/29 13:26, Qiu, Michael wrote:
> > > > 在 10/29/2014 11:46 AM, Matthew Hall 写道:
> > > >> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
> > > >>> I just saw one return path with value '0', and no any other 
> > > >>> place return a negative value,  so it is better to  be designed 
> > > >>> as one non-return function,
> > > >>>
> > > >>> +void
> > > >>> +rte_eal_hugepage_free(void)
> > > >>> +{
> > > >>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> > > >>> +	unsigned i;
> > > >>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> > > >>> +
> > > >>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", 
> > > >>> +nr_hugefiles);
> > > >>> +
> > > >>> +	for (i = 0; i < nr_hugefiles; i++) {
> > > >>> +		unlink(hugepg_tbl[i].filepath);
> > > >>> +		hugepg_tbl[i].orig_va = NULL;
> > > >>> +	}
> > > >>> +}
> > > >>> +
> > > >>>
> > > >>> Thanks,
> > > >>> Michael
> > > >> Actually, I don't think that's quite right.
> > > >>
> > > >> http://linux.die.net/man/2/unlink
> > > >>
> > > >> "On success, zero is returned. On error, -1 is returned, and 
> > > >> errno is set appropriately." So it should be returning an error, 
> > > >> and logging a message for a file it cannot unlink or people will be surprised with weird failures.
> > > > 
> > > > Really need one message for unlink failed, but I'm afraid that if 
> > > > it make sense for return an error code when application exit.
> > > > 
> > > > Thanks
> > > > Michael
> > > >> It also had some minor typos / English in the comments but we can fix that too.
> > > >>
> > > >> Matthew.
> > > >>
> > > > 
> > > > 
> > > > 
> > > Agree.May be it is not need to return error?
> > > --
> > > Regards,
> > > Haifeng
> > > 
> > 
> > May I throw out a few extra ideas.
> > 
> > The basic problem with DPDK and hugepages on exit, as you have already 
> > diagnosed, is not that the hugepages aren't released for re-use, its 
> > that the files inside the hugepage directory aren't unlinked. There 
> > are two
> I agree, this patch seems rather unbalanced.  Hugepages are initalized on startup, not allocated explicitly, and so should not be freed on exit.  Instead, they should be globally refcounted and (potentially per configuration) freed when the last application to exit drops the refcount to zero.
> 
> > considerations here that prevented us from coming up previously with a 
> > solution to this that we were fully happy with.
> > 1. There is no automatic way (at least none that I'm aware of), to 
> > have the kernel automatically delete a file on exit. This means that 
> > any scheme to delete the files on application termination will have to 
> > be a voluntary one that won't happen if an app crashed or is forcibly 
> > terminated. That is why the EAL right now does the clean-up on the restart of the app instead.
> Thats not true.  The common POSIX compliant method is to use the atexit() function to register a function to be called when a process terminates.  It can be used from rte_eth_hugepage_init so that a refcount is increased there and decreased when the process exits.  The atexit registered function can also check the refcount for zero and, dependent on configuration, unlink the hugepage files.  That way the application can be completely unaware of a need to do freeing/unlinking
> 
> > 2. The other consideration is multi-process. In a multi-process 
> > environment, it is unclear when exactly an app is finished - just 
> > because one process terminates does not mean that the whole app is 
> > finished with the hugepage files. If the files are deleted before a 
> > DPDK secondary process starts up, it won't be able to start as it 
> > won't be able to mmap the hugepage memory it needs.
> > 
> Refcounting fixes that.  If every process takes a reference count on the hugepage set, then you unlink it when the refcount hits zero.
> 
> > Now, that being said, I think we could see about having automatic 
> > hugepage deletion controlled by an EAL parameter. That way, the 
> > parameter could be omitted in the multi-process case, but could be 
> > used for those who want to have a clean hugepage dir after app termination.
> > 
> > What I think we could do is, instead of having the app save the list 
> > of hugepages it uses as the app initializes, is to have the app delete 
> > the hugepage files as soon as it closes the filehandle for them. If my 
> > understanding is correct, the kernel will actually keep the hugepage 
> > memory around in the app as usual from that point onwards, and will 
> > only release it back [automatically] on app termination. New processes 
> > won't be able to mmap the file, but the running process will be 
> > unaffected. The best thing about this approach is that the hugepage 
> > file entries will be deleted whether or not the application terminates normally or crashes.
> > 
> That doesn't really work in a use case in which processes are created and removed (i.e. if you have two processes, one exits, then another one is forked, that third process won't find the hugepage file, because the second process will have unlinked it on exit).
> 
> > So, any thoughts? Would such a scheme work? I haven't prototyped it, 
> > yet, but I think it should work.
> > 
> > Regards,
> > /Bruce
> > 
> > PS: As well as multi-process not being able to use this scheme, I'd 
> > also note that this would prevent the dump_cfg utility app - or any 
> > other DPDK analysis app like it - from being used to inspect the 
> > memory area of the running process. That is another reason why I think 
> > the flag should not be set by default.
> > 
> >
Bruce Richardson Oct. 29, 2014, 4:47 p.m. UTC | #10
On Wed, Oct 29, 2014 at 11:32:12AM -0400, Neil Horman wrote:
> On Wed, Oct 29, 2014 at 03:22:25PM +0000, Ramia, Kannan Babu wrote:
> > The problem still remains if one of the process gets abruptly killed, then the corresponding refcount will not get decremented. There should be some scavenging function to be implemented to check the process liveliness.  
> > 
> Well, abnormal termination results in abnormal consequences.  You expect
> garbage to get left behind of a program crashes, so I wouldn't really worry
> about that too much.  If you really wanted to you can register chained handlers
> for SIGSEGV/SIGBUS/etc to catch those conditions, but honestly, that seems like
> overkill.  If a program that uses shared resources terminates abnormally, its
> well understood that those shared resources may not get released properly, and
> manual intervention is required to clean them up
> 
> Neil

If we take this approach we can perhaps reuse a lot of what is already there.
The existing code run at startup uses locks as a form of reference counting to determine what hugepages to clean up - safe in the knowledge that any locks are auto-released by the kernel whatever way a process terminates. We could also set this code to also run from "atexit()", which means that in a multi-process scenario so long as the final process terminates correctly, the hugepage files will be cleaned up. If any other process terminates abruptly, the lock held by that will still be released, and so there will be no problem, and in the worst-case where the final (or perhaps only) process terminates abnormally, the existing cleanup code will continue to work as it does now by cleaning up at startup.

/Bruce


> 
> > regards
> > Kannan Babu
> > 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> > Sent: Wednesday, October 29, 2014 7:58 PM
> > To: Richardson, Bruce
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] add free hugepage function
> > 
> > On Wed, Oct 29, 2014 at 10:26:35AM +0000, Bruce Richardson wrote:
> > > On Wed, Oct 29, 2014 at 02:49:05PM +0800, Linhaifeng wrote:
> > > > 
> > > > 
> > > > On 2014/10/29 13:26, Qiu, Michael wrote:
> > > > > 在 10/29/2014 11:46 AM, Matthew Hall 写道:
> > > > >> On Wed, Oct 29, 2014 at 03:27:58AM +0000, Qiu, Michael wrote:
> > > > >>> I just saw one return path with value '0', and no any other 
> > > > >>> place return a negative value,  so it is better to  be designed 
> > > > >>> as one non-return function,
> > > > >>>
> > > > >>> +void
> > > > >>> +rte_eal_hugepage_free(void)
> > > > >>> +{
> > > > >>> +	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
> > > > >>> +	unsigned i;
> > > > >>> +	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
> > > > >>> +
> > > > >>> +	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", 
> > > > >>> +nr_hugefiles);
> > > > >>> +
> > > > >>> +	for (i = 0; i < nr_hugefiles; i++) {
> > > > >>> +		unlink(hugepg_tbl[i].filepath);
> > > > >>> +		hugepg_tbl[i].orig_va = NULL;
> > > > >>> +	}
> > > > >>> +}
> > > > >>> +
> > > > >>>
> > > > >>> Thanks,
> > > > >>> Michael
> > > > >> Actually, I don't think that's quite right.
> > > > >>
> > > > >> http://linux.die.net/man/2/unlink
> > > > >>
> > > > >> "On success, zero is returned. On error, -1 is returned, and 
> > > > >> errno is set appropriately." So it should be returning an error, 
> > > > >> and logging a message for a file it cannot unlink or people will be surprised with weird failures.
> > > > > 
> > > > > Really need one message for unlink failed, but I'm afraid that if 
> > > > > it make sense for return an error code when application exit.
> > > > > 
> > > > > Thanks
> > > > > Michael
> > > > >> It also had some minor typos / English in the comments but we can fix that too.
> > > > >>
> > > > >> Matthew.
> > > > >>
> > > > > 
> > > > > 
> > > > > 
> > > > Agree.May be it is not need to return error?
> > > > --
> > > > Regards,
> > > > Haifeng
> > > > 
> > > 
> > > May I throw out a few extra ideas.
> > > 
> > > The basic problem with DPDK and hugepages on exit, as you have already 
> > > diagnosed, is not that the hugepages aren't released for re-use, its 
> > > that the files inside the hugepage directory aren't unlinked. There 
> > > are two
> > I agree, this patch seems rather unbalanced.  Hugepages are initalized on startup, not allocated explicitly, and so should not be freed on exit.  Instead, they should be globally refcounted and (potentially per configuration) freed when the last application to exit drops the refcount to zero.
> > 
> > > considerations here that prevented us from coming up previously with a 
> > > solution to this that we were fully happy with.
> > > 1. There is no automatic way (at least none that I'm aware of), to 
> > > have the kernel automatically delete a file on exit. This means that 
> > > any scheme to delete the files on application termination will have to 
> > > be a voluntary one that won't happen if an app crashed or is forcibly 
> > > terminated. That is why the EAL right now does the clean-up on the restart of the app instead.
> > Thats not true.  The common POSIX compliant method is to use the atexit() function to register a function to be called when a process terminates.  It can be used from rte_eth_hugepage_init so that a refcount is increased there and decreased when the process exits.  The atexit registered function can also check the refcount for zero and, dependent on configuration, unlink the hugepage files.  That way the application can be completely unaware of a need to do freeing/unlinking
> > 
> > > 2. The other consideration is multi-process. In a multi-process 
> > > environment, it is unclear when exactly an app is finished - just 
> > > because one process terminates does not mean that the whole app is 
> > > finished with the hugepage files. If the files are deleted before a 
> > > DPDK secondary process starts up, it won't be able to start as it 
> > > won't be able to mmap the hugepage memory it needs.
> > > 
> > Refcounting fixes that.  If every process takes a reference count on the hugepage set, then you unlink it when the refcount hits zero.
> > 
> > > Now, that being said, I think we could see about having automatic 
> > > hugepage deletion controlled by an EAL parameter. That way, the 
> > > parameter could be omitted in the multi-process case, but could be 
> > > used for those who want to have a clean hugepage dir after app termination.
> > > 
> > > What I think we could do is, instead of having the app save the list 
> > > of hugepages it uses as the app initializes, is to have the app delete 
> > > the hugepage files as soon as it closes the filehandle for them. If my 
> > > understanding is correct, the kernel will actually keep the hugepage 
> > > memory around in the app as usual from that point onwards, and will 
> > > only release it back [automatically] on app termination. New processes 
> > > won't be able to mmap the file, but the running process will be 
> > > unaffected. The best thing about this approach is that the hugepage 
> > > file entries will be deleted whether or not the application terminates normally or crashes.
> > > 
> > That doesn't really work in a use case in which processes are created and removed (i.e. if you have two processes, one exits, then another one is forked, that third process won't find the hugepage file, because the second process will have unlinked it on exit).
> > 
> > > So, any thoughts? Would such a scheme work? I haven't prototyped it, 
> > > yet, but I think it should work.
> > > 
> > > Regards,
> > > /Bruce
> > > 
> > > PS: As well as multi-process not being able to use this scheme, I'd 
> > > also note that this would prevent the dump_cfg utility app - or any 
> > > other DPDK analysis app like it - from being used to inspect the 
> > > memory area of the running process. That is another reason why I think 
> > > the flag should not be set by default.
> > > 
> > >
Matthew Hall Oct. 30, 2014, 3:23 a.m. UTC | #11
On Wed, Oct 29, 2014 at 11:32:12AM -0400, Neil Horman wrote:
> > 
> Well, abnormal termination results in abnormal consequences.  You expect
> garbage to get left behind of a program crashes, so I wouldn't really worry
> about that too much.  If you really wanted to you can register chained handlers
> for SIGSEGV/SIGBUS/etc to catch those conditions, but honestly, that seems like
> overkill.  If a program that uses shared resources terminates abnormally, its
> well understood that those shared resources may not get released properly, and
> manual intervention is required to clean them up
> 
> Neil

Perhaps true. But also one of the top irritations with POSIX SHMEM. ;)
Neil Horman Oct. 30, 2014, 10:18 a.m. UTC | #12
On Wed, Oct 29, 2014 at 08:23:25PM -0700, Matthew Hall wrote:
> On Wed, Oct 29, 2014 at 11:32:12AM -0400, Neil Horman wrote:
> > > 
> > Well, abnormal termination results in abnormal consequences.  You expect
> > garbage to get left behind of a program crashes, so I wouldn't really worry
> > about that too much.  If you really wanted to you can register chained handlers
> > for SIGSEGV/SIGBUS/etc to catch those conditions, but honestly, that seems like
> > overkill.  If a program that uses shared resources terminates abnormally, its
> > well understood that those shared resources may not get released properly, and
> > manual intervention is required to clean them up
> > 
> > Neil
> 
> Perhaps true. But also one of the top irritations with POSIX SHMEM. ;)
> 
POSIX shared memory defines IPC_RMID for exactly that purpose.
Matthew Hall Oct. 30, 2014, 2:56 p.m. UTC | #13
Yes. I once had to write a cron job in Perl that monitored for unused regions and deleted them before the SHM subsystem ran out of memory. Possible, yes, but irritating.
diff mbox

Patch

diff --git a/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h b/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
index 4cf8ea9..7251b6b 100644
--- a/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
+++ b/dpdk/dpdk-1.7.0/lib/librte_eal/common/include/rte_memory.h
@@ -172,6 +172,17 @@  unsigned rte_memory_get_nchannel(void);
  */
 unsigned rte_memory_get_nrank(void);
 
+/**
+ * Free all the hugepages.For the application to call when exit.
+ *
+ * @param void
+ *
+ * @return
+ *       0: successfully
+ *       negative: error
+ */
+int rte_eal_hugepage_free(void);
+
 #ifdef RTE_LIBRTE_XEN_DOM0
 /**
  * Return the physical address of elt, which is an element of the pool mp.
diff --git a/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c b/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
index f2454f4..1ae0e79 100644
--- a/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/dpdk/dpdk-1.7.0/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -98,6 +98,13 @@ 
 #include "eal_filesystem.h"
 #include "eal_hugepages.h"
 
+struct hugepage_table {
+	struct hugepage_file *hugepg_tbl;
+	unsigned nr_hugefiles;
+};
+
+static struct hugepage_table g_hugepage_table;
+
 /**
  * @file
  * Huge page mapping under linux
@@ -1202,6 +1209,7 @@  rte_eal_hugepage_init(void)
 						(unsigned)
 							(used_hp[i].hugepage_sz / 0x100000),
 						j);
+				g_hugepage_table.nr_hugefiles += used_hp[i].num_pages[j];
 			}
 		}
 	}
@@ -1237,6 +1245,8 @@  rte_eal_hugepage_init(void)
 		goto fail;
 	}
 
+	g_hugepage_table.hugepg_tbl = hugepage;
+
 	/* free the temporary hugepage table */
 	free(tmp_hp);
 	tmp_hp = NULL;
@@ -1487,6 +1497,23 @@  error:
 	return -1;
 }
 
+int
+rte_eal_hugepage_free(void)
+{
+	struct hugepage_file *hugepg_tbl = g_hugepage_table.hugepg_tbl;
+	unsigned i;
+	unsigned nr_hugefiles = g_hugepage_table.nr_hugefiles;
+
+	RTE_LOG(INFO, EAL, "unlink %u hugepage files\n", nr_hugefiles);
+
+	for (i = 0; i < nr_hugefiles; i++) {
+		unlink(hugepg_tbl[i].filepath);
+		hugepg_tbl[i].orig_va = NULL;
+	}
+
+	return 0;
+}
+
 static int
 rte_eal_memdevice_init(void)
 {