[2/3] net/virtio-user: avoid parsing process mappings

Message ID 20180905042852.6212-3-tiwei.bie@intel.com (mailing list archive)
State Accepted, archived
Delegated to: Maxime Coquelin
Headers
Series Some fixes/improvements for virtio-user memory table |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues

Commit Message

Tiwei Bie Sept. 5, 2018, 4:28 a.m. UTC
  Recently some memory APIs were introduced to allow users to
get the file descriptor and offset for each memory segment.
We can leverage those APIs to get rid of the /proc magic on
memory table preparation in vhost-user backend.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/net/virtio/virtio_user/vhost_user.c | 211 +++++++++-----------
 1 file changed, 90 insertions(+), 121 deletions(-)
  

Comments

Anatoly Burakov Sept. 7, 2018, 9:39 a.m. UTC | #1
On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
> Recently some memory APIs were introduced to allow users to
> get the file descriptor and offset for each memory segment.
> We can leverage those APIs to get rid of the /proc magic on
> memory table preparation in vhost-user backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---

<snip>

> -	for (i = 0; i < num; ++i) {
> -		mr = &msg->payload.memory.regions[i];
> -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> -		mr->userspace_addr = huges[i].addr;
> -		mr->memory_size = huges[i].size;
> -		mr->mmap_offset = 0;
> -		fds[i] = open(huges[i].path, O_RDWR);
> +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
> +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
> +			ms, rte_errno);
> +		return -1;
> +	}
> +
> +	start_addr = (uint64_t)(uintptr_t)ms->addr;
> +	end_addr = start_addr + ms->len;
> +
> +	for (i = 0; i < wa->region_nr; i++) {

There has to be a better way than to run this loop on every segment. 
Maybe store last-used region, and only do a region look up if there's a 
mismatch? Generally, in single-file segments mode, you'll get lots of 
segments from one memseg list one after the other, so you will need to 
do a lookup once memseg list changes, instead of on each segment.

> +		if (wa->fds[i] != fd)
> +			continue;
> +
> +		mr = &wa->vm->regions[i];
> +
> +		if (mr->userspace_addr + mr->memory_size < end_addr)
> +			mr->memory_size = end_addr - mr->userspace_addr;
> +

<snip>

>   	int fds[VHOST_MEMORY_MAX_NREGIONS];
>   	int fd_num = 0;
> -	int i, len;
> +	int len;
>   	int vhostfd = dev->vhostfd;
>   
>   	RTE_SET_USED(m);
> @@ -364,10 +337,6 @@ vhost_user_sock(struct virtio_user_dev *dev,
>   		return -1;
>   	}
>   
> -	if (req == VHOST_USER_SET_MEM_TABLE)
> -		for (i = 0; i < fd_num; ++i)
> -			close(fds[i]);
> -

You're sharing fd's - presumably the other side of this is in a 
different process, so it's safe to close these fd's there?

>   	if (need_reply) {
>   		if (vhost_user_read(vhostfd, &msg) < 0) {
>   			PMD_DRV_LOG(ERR, "Received msg failed: %s",
>
  
Tiwei Bie Sept. 7, 2018, 11:35 a.m. UTC | #2
On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
> On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
> > Recently some memory APIs were introduced to allow users to
> > get the file descriptor and offset for each memory segment.
> > We can leverage those APIs to get rid of the /proc magic on
> > memory table preparation in vhost-user backend.
> > 
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > ---
> 
> <snip>
> 
> > -	for (i = 0; i < num; ++i) {
> > -		mr = &msg->payload.memory.regions[i];
> > -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> > -		mr->userspace_addr = huges[i].addr;
> > -		mr->memory_size = huges[i].size;
> > -		mr->mmap_offset = 0;
> > -		fds[i] = open(huges[i].path, O_RDWR);
> > +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
> > +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
> > +			ms, rte_errno);
> > +		return -1;
> > +	}
> > +
> > +	start_addr = (uint64_t)(uintptr_t)ms->addr;
> > +	end_addr = start_addr + ms->len;
> > +
> > +	for (i = 0; i < wa->region_nr; i++) {
> 
> There has to be a better way than to run this loop on every segment. Maybe
> store last-used region, and only do a region look up if there's a mismatch?
> Generally, in single-file segments mode, you'll get lots of segments from
> one memseg list one after the other, so you will need to do a lookup once
> memseg list changes, instead of on each segment.

We may have holes in one memseg list due to dynamic free.
Do you mean we just need to do rte_memseg_contig_walk()
and we can assume that fds of the contiguous memegs will
be the same?

> 
> > +		if (wa->fds[i] != fd)
> > +			continue;
> > +
> > +		mr = &wa->vm->regions[i];
> > +
> > +		if (mr->userspace_addr + mr->memory_size < end_addr)
> > +			mr->memory_size = end_addr - mr->userspace_addr;
> > +
> 
> <snip>
> 
> >   	int fds[VHOST_MEMORY_MAX_NREGIONS];
> >   	int fd_num = 0;
> > -	int i, len;
> > +	int len;
> >   	int vhostfd = dev->vhostfd;
> >   	RTE_SET_USED(m);
> > @@ -364,10 +337,6 @@ vhost_user_sock(struct virtio_user_dev *dev,
> >   		return -1;
> >   	}
> > -	if (req == VHOST_USER_SET_MEM_TABLE)
> > -		for (i = 0; i < fd_num; ++i)
> > -			close(fds[i]);
> > -
> 
> You're sharing fd's - presumably the other side of this is in a different
> process, so it's safe to close these fd's there?

Above code belongs to virtio-user, and it will close the
fds got from rte_memseg_get_fd_thread_unsafe().

Below is the code which will close these fds on the other
side (i.e. the vhost-user process):

https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L805
https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L97


> 
> >   	if (need_reply) {
> >   		if (vhost_user_read(vhostfd, &msg) < 0) {
> >   			PMD_DRV_LOG(ERR, "Received msg failed: %s",
> > 
> 
> 
> -- 
> Thanks,
> Anatoly
  
Anatoly Burakov Sept. 7, 2018, 12:21 p.m. UTC | #3
On 07-Sep-18 12:35 PM, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
>> On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
>>> Recently some memory APIs were introduced to allow users to
>>> get the file descriptor and offset for each memory segment.
>>> We can leverage those APIs to get rid of the /proc magic on
>>> memory table preparation in vhost-user backend.
>>>
>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>> ---
>>
>> <snip>
>>
>>> -	for (i = 0; i < num; ++i) {
>>> -		mr = &msg->payload.memory.regions[i];
>>> -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
>>> -		mr->userspace_addr = huges[i].addr;
>>> -		mr->memory_size = huges[i].size;
>>> -		mr->mmap_offset = 0;
>>> -		fds[i] = open(huges[i].path, O_RDWR);
>>> +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
>>> +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
>>> +			ms, rte_errno);
>>> +		return -1;
>>> +	}
>>> +
>>> +	start_addr = (uint64_t)(uintptr_t)ms->addr;
>>> +	end_addr = start_addr + ms->len;
>>> +
>>> +	for (i = 0; i < wa->region_nr; i++) {
>>
>> There has to be a better way than to run this loop on every segment. Maybe
>> store last-used region, and only do a region look up if there's a mismatch?
>> Generally, in single-file segments mode, you'll get lots of segments from
>> one memseg list one after the other, so you will need to do a lookup once
>> memseg list changes, instead of on each segment.
> 
> We may have holes in one memseg list due to dynamic free.
> Do you mean we just need to do rte_memseg_contig_walk()
> and we can assume that fds of the contiguous memegs will
> be the same?

No, i didn't mean that.

Whether or not you are in single-file segments mode, you still need to 
scan each segment. However, you lose your state when you exit this 
function, and thus have to look up the appropriate memory region (that 
matches your fd) again, over and over. It would be good if you could 
store last-used memory region somewhere, so that next time you come back 
into this function, if the memseg has the same fd, you will not have to 
look it up.

Something like this:

struct walk_arg {
	*last_used;
	<snip>
}

int walk_func() {
	<snip>
	cur_region = wa->last_used; // check if it matches
	if (cur->region->fd != fd) {
		// fd is different - we've changed the segment
		<snip>
		wa->last_used = cur_region
	}
}

So, cache last used region to not have to look it up again, because 
chances are, you won't have to. That way, you will still do region 
lookups, but only if you have to - not every time.

> 
>>
>>> +		if (wa->fds[i] != fd)
>>> +			continue;
>>> +
>>> +		mr = &wa->vm->regions[i];
>>> +
>>> +		if (mr->userspace_addr + mr->memory_size < end_addr)
>>> +			mr->memory_size = end_addr - mr->userspace_addr;
>>> +
>>
>> <snip>
>>
>>>    	int fds[VHOST_MEMORY_MAX_NREGIONS];
>>>    	int fd_num = 0;
>>> -	int i, len;
>>> +	int len;
>>>    	int vhostfd = dev->vhostfd;
>>>    	RTE_SET_USED(m);
>>> @@ -364,10 +337,6 @@ vhost_user_sock(struct virtio_user_dev *dev,
>>>    		return -1;
>>>    	}
>>> -	if (req == VHOST_USER_SET_MEM_TABLE)
>>> -		for (i = 0; i < fd_num; ++i)
>>> -			close(fds[i]);
>>> -
>>
>> You're sharing fd's - presumably the other side of this is in a different
>> process, so it's safe to close these fd's there?
> 
> Above code belongs to virtio-user, and it will close the
> fds got from rte_memseg_get_fd_thread_unsafe().
> 
> Below is the code which will close these fds on the other
> side (i.e. the vhost-user process):
> 
> https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L805
> https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L97

OK, so not a problem then.

> 
> 
>>
>>>    	if (need_reply) {
>>>    		if (vhost_user_read(vhostfd, &msg) < 0) {
>>>    			PMD_DRV_LOG(ERR, "Received msg failed: %s",
>>>
>>
>>
>> -- 
>> Thanks,
>> Anatoly
>
  
Tiwei Bie Sept. 10, 2018, 3:59 a.m. UTC | #4
On Fri, Sep 07, 2018 at 01:21:35PM +0100, Burakov, Anatoly wrote:
> On 07-Sep-18 12:35 PM, Tiwei Bie wrote:
> > On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
> > > On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
> > > > Recently some memory APIs were introduced to allow users to
> > > > get the file descriptor and offset for each memory segment.
> > > > We can leverage those APIs to get rid of the /proc magic on
> > > > memory table preparation in vhost-user backend.
> > > > 
> > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > ---
> > > 
> > > <snip>
> > > 
> > > > -	for (i = 0; i < num; ++i) {
> > > > -		mr = &msg->payload.memory.regions[i];
> > > > -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> > > > -		mr->userspace_addr = huges[i].addr;
> > > > -		mr->memory_size = huges[i].size;
> > > > -		mr->mmap_offset = 0;
> > > > -		fds[i] = open(huges[i].path, O_RDWR);
> > > > +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
> > > > +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
> > > > +			ms, rte_errno);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	start_addr = (uint64_t)(uintptr_t)ms->addr;
> > > > +	end_addr = start_addr + ms->len;
> > > > +
> > > > +	for (i = 0; i < wa->region_nr; i++) {
> > > 
> > > There has to be a better way than to run this loop on every segment. Maybe
> > > store last-used region, and only do a region look up if there's a mismatch?
> > > Generally, in single-file segments mode, you'll get lots of segments from
> > > one memseg list one after the other, so you will need to do a lookup once
> > > memseg list changes, instead of on each segment.
> > 
> > We may have holes in one memseg list due to dynamic free.
> > Do you mean we just need to do rte_memseg_contig_walk()
> > and we can assume that fds of the contiguous memegs will
> > be the same?
> 
> No, i didn't mean that.
> 
> Whether or not you are in single-file segments mode, you still need to scan
> each segment. However, you lose your state when you exit this function, and
> thus have to look up the appropriate memory region (that matches your fd)
> again, over and over. It would be good if you could store last-used memory
> region somewhere, so that next time you come back into this function, if the
> memseg has the same fd, you will not have to look it up.
> 
> Something like this:
> 
> struct walk_arg {
> 	*last_used;
> 	<snip>
> }
> 
> int walk_func() {
> 	<snip>
> 	cur_region = wa->last_used; // check if it matches
> 	if (cur->region->fd != fd) {
> 		// fd is different - we've changed the segment
> 		<snip>
> 		wa->last_used = cur_region
> 	}
> }

Thanks for the code. :)

> 
> So, cache last used region to not have to look it up again, because chances
> are, you won't have to. That way, you will still do region lookups, but only
> if you have to - not every time.

I can do it, but I'm not sure this optimization is really
necessary. Because this loop should be quite fast, as the
max number of regions permitted by vhost-user is quite
small. And actually we need to do that loop at least once
for each packet in vhost-user's dequeue and enqueue path,
i.e. the data path.


> 
> > 
> > > 
> > > > +		if (wa->fds[i] != fd)
> > > > +			continue;
> > > > +
> > > > +		mr = &wa->vm->regions[i];
> > > > +
> > > > +		if (mr->userspace_addr + mr->memory_size < end_addr)
> > > > +			mr->memory_size = end_addr - mr->userspace_addr;
> > > > +
> > > 
> > > <snip>
> > > 
> > > >    	int fds[VHOST_MEMORY_MAX_NREGIONS];
> > > >    	int fd_num = 0;
> > > > -	int i, len;
> > > > +	int len;
> > > >    	int vhostfd = dev->vhostfd;
> > > >    	RTE_SET_USED(m);
> > > > @@ -364,10 +337,6 @@ vhost_user_sock(struct virtio_user_dev *dev,
> > > >    		return -1;
> > > >    	}
> > > > -	if (req == VHOST_USER_SET_MEM_TABLE)
> > > > -		for (i = 0; i < fd_num; ++i)
> > > > -			close(fds[i]);
> > > > -
> > > 
> > > You're sharing fd's - presumably the other side of this is in a different
> > > process, so it's safe to close these fd's there?
> > 
> > Above code belongs to virtio-user, and it will close the
> > fds got from rte_memseg_get_fd_thread_unsafe().
> > 
> > Below is the code which will close these fds on the other
> > side (i.e. the vhost-user process):
> > 
> > https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L805
> > https://github.com/DPDK/dpdk/blob/3605968c2fa7/lib/librte_vhost/vhost_user.c#L97
> 
> OK, so not a problem then.
> 
> > 
> > 
> > > 
> > > >    	if (need_reply) {
> > > >    		if (vhost_user_read(vhostfd, &msg) < 0) {
> > > >    			PMD_DRV_LOG(ERR, "Received msg failed: %s",
> > > > 
> > > 
> > > 
> > > -- 
> > > Thanks,
> > > Anatoly
> > 
> 
> 
> -- 
> Thanks,
> Anatoly
  
Maxime Coquelin Sept. 11, 2018, 12:58 p.m. UTC | #5
On 09/05/2018 06:28 AM, Tiwei Bie wrote:
> Recently some memory APIs were introduced to allow users to
> get the file descriptor and offset for each memory segment.
> We can leverage those APIs to get rid of the /proc magic on
> memory table preparation in vhost-user backend.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/net/virtio/virtio_user/vhost_user.c | 211 +++++++++-----------
>   1 file changed, 90 insertions(+), 121 deletions(-)
> 

Nice to get rid off the /proc parsing!

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime
  
Anatoly Burakov Sept. 17, 2018, 10:17 a.m. UTC | #6
On 10-Sep-18 4:59 AM, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 01:21:35PM +0100, Burakov, Anatoly wrote:
>> On 07-Sep-18 12:35 PM, Tiwei Bie wrote:
>>> On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
>>>> On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
>>>>> Recently some memory APIs were introduced to allow users to
>>>>> get the file descriptor and offset for each memory segment.
>>>>> We can leverage those APIs to get rid of the /proc magic on
>>>>> memory table preparation in vhost-user backend.
>>>>>
>>>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>>> ---
>>>>
>>>> <snip>
>>>>
>>>>> -	for (i = 0; i < num; ++i) {
>>>>> -		mr = &msg->payload.memory.regions[i];
>>>>> -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
>>>>> -		mr->userspace_addr = huges[i].addr;
>>>>> -		mr->memory_size = huges[i].size;
>>>>> -		mr->mmap_offset = 0;
>>>>> -		fds[i] = open(huges[i].path, O_RDWR);
>>>>> +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
>>>>> +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
>>>>> +			ms, rte_errno);
>>>>> +		return -1;
>>>>> +	}
>>>>> +
>>>>> +	start_addr = (uint64_t)(uintptr_t)ms->addr;
>>>>> +	end_addr = start_addr + ms->len;
>>>>> +
>>>>> +	for (i = 0; i < wa->region_nr; i++) {
>>>>
>>>> There has to be a better way than to run this loop on every segment. Maybe
>>>> store last-used region, and only do a region look up if there's a mismatch?
>>>> Generally, in single-file segments mode, you'll get lots of segments from
>>>> one memseg list one after the other, so you will need to do a lookup once
>>>> memseg list changes, instead of on each segment.
>>>
>>> We may have holes in one memseg list due to dynamic free.
>>> Do you mean we just need to do rte_memseg_contig_walk()
>>> and we can assume that fds of the contiguous memegs will
>>> be the same?
>>
>> No, i didn't mean that.
>>
>> Whether or not you are in single-file segments mode, you still need to scan
>> each segment. However, you lose your state when you exit this function, and
>> thus have to look up the appropriate memory region (that matches your fd)
>> again, over and over. It would be good if you could store last-used memory
>> region somewhere, so that next time you come back into this function, if the
>> memseg has the same fd, you will not have to look it up.
>>
>> Something like this:
>>
>> struct walk_arg {
>> 	*last_used;
>> 	<snip>
>> }
>>
>> int walk_func() {
>> 	<snip>
>> 	cur_region = wa->last_used; // check if it matches
>> 	if (cur->region->fd != fd) {
>> 		// fd is different - we've changed the segment
>> 		<snip>
>> 		wa->last_used = cur_region
>> 	}
>> }
> 
> Thanks for the code. :)
> 
>>
>> So, cache last used region to not have to look it up again, because chances
>> are, you won't have to. That way, you will still do region lookups, but only
>> if you have to - not every time.
> 
> I can do it, but I'm not sure this optimization is really
> necessary. Because this loop should be quite fast, as the
> max number of regions permitted by vhost-user is quite
> small. And actually we need to do that loop at least once
> for each packet in vhost-user's dequeue and enqueue path,
> i.e. the data path.

The number of regions is small, but the number of segments may be in the 
thousands. Think worst case - 8K segments in the 16th region - with my 
code, you will execute only 16 iterations on first segment and use "last 
used" for the rest of the segments, while with your code, it'll be 8K 
times 16 :)

You'll have to clarify the "for each packet" part, not sure i follow.
  
Tiwei Bie Sept. 17, 2018, 11:57 a.m. UTC | #7
On Mon, Sep 17, 2018 at 11:17:42AM +0100, Burakov, Anatoly wrote:
> On 10-Sep-18 4:59 AM, Tiwei Bie wrote:
> > On Fri, Sep 07, 2018 at 01:21:35PM +0100, Burakov, Anatoly wrote:
> > > On 07-Sep-18 12:35 PM, Tiwei Bie wrote:
> > > > On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
> > > > > On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
> > > > > > Recently some memory APIs were introduced to allow users to
> > > > > > get the file descriptor and offset for each memory segment.
> > > > > > We can leverage those APIs to get rid of the /proc magic on
> > > > > > memory table preparation in vhost-user backend.
> > > > > > 
> > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > ---
> > > > > 
> > > > > <snip>
> > > > > 
> > > > > > -	for (i = 0; i < num; ++i) {
> > > > > > -		mr = &msg->payload.memory.regions[i];
> > > > > > -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> > > > > > -		mr->userspace_addr = huges[i].addr;
> > > > > > -		mr->memory_size = huges[i].size;
> > > > > > -		mr->mmap_offset = 0;
> > > > > > -		fds[i] = open(huges[i].path, O_RDWR);
> > > > > > +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
> > > > > > +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
> > > > > > +			ms, rte_errno);
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	start_addr = (uint64_t)(uintptr_t)ms->addr;
> > > > > > +	end_addr = start_addr + ms->len;
> > > > > > +
> > > > > > +	for (i = 0; i < wa->region_nr; i++) {
> > > > > 
> > > > > There has to be a better way than to run this loop on every segment. Maybe
> > > > > store last-used region, and only do a region look up if there's a mismatch?
> > > > > Generally, in single-file segments mode, you'll get lots of segments from
> > > > > one memseg list one after the other, so you will need to do a lookup once
> > > > > memseg list changes, instead of on each segment.
> > > > 
> > > > We may have holes in one memseg list due to dynamic free.
> > > > Do you mean we just need to do rte_memseg_contig_walk()
> > > > and we can assume that fds of the contiguous memegs will
> > > > be the same?
> > > 
> > > No, i didn't mean that.
> > > 
> > > Whether or not you are in single-file segments mode, you still need to scan
> > > each segment. However, you lose your state when you exit this function, and
> > > thus have to look up the appropriate memory region (that matches your fd)
> > > again, over and over. It would be good if you could store last-used memory
> > > region somewhere, so that next time you come back into this function, if the
> > > memseg has the same fd, you will not have to look it up.
> > > 
> > > Something like this:
> > > 
> > > struct walk_arg {
> > > 	*last_used;
> > > 	<snip>
> > > }
> > > 
> > > int walk_func() {
> > > 	<snip>
> > > 	cur_region = wa->last_used; // check if it matches
> > > 	if (cur->region->fd != fd) {
> > > 		// fd is different - we've changed the segment
> > > 		<snip>
> > > 		wa->last_used = cur_region
> > > 	}
> > > }
> > 
> > Thanks for the code. :)
> > 
> > > 
> > > So, cache last used region to not have to look it up again, because chances
> > > are, you won't have to. That way, you will still do region lookups, but only
> > > if you have to - not every time.
> > 
> > I can do it, but I'm not sure this optimization is really
> > necessary. Because this loop should be quite fast, as the
> > max number of regions permitted by vhost-user is quite
> > small. And actually we need to do that loop at least once
> > for each packet in vhost-user's dequeue and enqueue path,
> > i.e. the data path.
> 
> The number of regions is small, but the number of segments may be in the
> thousands. Think worst case - 8K segments in the 16th region

The number of regions permitted by vhost-user is 8.
And most likely, we just have two regions as the
single-file-segment mode is mandatory when using
2MB pages.

> - with my code,
> you will execute only 16 iterations on first segment and use "last used" for
> the rest of the segments,

We still need to do 8K iterations on the segments.

> while with your code, it'll be 8K times 16 :)

IMO, what we really need is a way to reduce "8K",
i.e. reduce the number of segments (which could
be thousands currently) we need to parse.

And the loop should be faster than the function
call to rte_memseg_get_fd_thread_unsafe() and
rte_memseg_get_fd_offset_thread_unsafe() (which
are also called for each segment).

> 
> You'll have to clarify the "for each packet" part, not sure i follow.

Take the vhost-PMD as an example, when doing Rx burst
and Tx burst, for each mbuf (i.e. packet), we need to
do that loop at least once.

> 
> -- 
> Thanks,
> Anatoly
  
Anatoly Burakov Sept. 17, 2018, 1:06 p.m. UTC | #8
On 17-Sep-18 12:57 PM, Tiwei Bie wrote:
> On Mon, Sep 17, 2018 at 11:17:42AM +0100, Burakov, Anatoly wrote:
>> On 10-Sep-18 4:59 AM, Tiwei Bie wrote:
>>> On Fri, Sep 07, 2018 at 01:21:35PM +0100, Burakov, Anatoly wrote:
>>>> On 07-Sep-18 12:35 PM, Tiwei Bie wrote:
>>>>> On Fri, Sep 07, 2018 at 10:39:16AM +0100, Burakov, Anatoly wrote:
>>>>>> On 05-Sep-18 5:28 AM, Tiwei Bie wrote:
>>>>>>> Recently some memory APIs were introduced to allow users to
>>>>>>> get the file descriptor and offset for each memory segment.
>>>>>>> We can leverage those APIs to get rid of the /proc magic on
>>>>>>> memory table preparation in vhost-user backend.
>>>>>>>
>>>>>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>>>>> ---
>>>>>>
>>>>>> <snip>
>>>>>>
>>>>>>> -	for (i = 0; i < num; ++i) {
>>>>>>> -		mr = &msg->payload.memory.regions[i];
>>>>>>> -		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
>>>>>>> -		mr->userspace_addr = huges[i].addr;
>>>>>>> -		mr->memory_size = huges[i].size;
>>>>>>> -		mr->mmap_offset = 0;
>>>>>>> -		fds[i] = open(huges[i].path, O_RDWR);
>>>>>>> +	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
>>>>>>> +		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
>>>>>>> +			ms, rte_errno);
>>>>>>> +		return -1;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	start_addr = (uint64_t)(uintptr_t)ms->addr;
>>>>>>> +	end_addr = start_addr + ms->len;
>>>>>>> +
>>>>>>> +	for (i = 0; i < wa->region_nr; i++) {
>>>>>>
>>>>>> There has to be a better way than to run this loop on every segment. Maybe
>>>>>> store last-used region, and only do a region look up if there's a mismatch?
>>>>>> Generally, in single-file segments mode, you'll get lots of segments from
>>>>>> one memseg list one after the other, so you will need to do a lookup once
>>>>>> memseg list changes, instead of on each segment.
>>>>>
>>>>> We may have holes in one memseg list due to dynamic free.
>>>>> Do you mean we just need to do rte_memseg_contig_walk()
>>>>> and we can assume that fds of the contiguous memegs will
>>>>> be the same?
>>>>
>>>> No, i didn't mean that.
>>>>
>>>> Whether or not you are in single-file segments mode, you still need to scan
>>>> each segment. However, you lose your state when you exit this function, and
>>>> thus have to look up the appropriate memory region (that matches your fd)
>>>> again, over and over. It would be good if you could store last-used memory
>>>> region somewhere, so that next time you come back into this function, if the
>>>> memseg has the same fd, you will not have to look it up.
>>>>
>>>> Something like this:
>>>>
>>>> struct walk_arg {
>>>> 	*last_used;
>>>> 	<snip>
>>>> }
>>>>
>>>> int walk_func() {
>>>> 	<snip>
>>>> 	cur_region = wa->last_used; // check if it matches
>>>> 	if (cur->region->fd != fd) {
>>>> 		// fd is different - we've changed the segment
>>>> 		<snip>
>>>> 		wa->last_used = cur_region
>>>> 	}
>>>> }
>>>
>>> Thanks for the code. :)
>>>
>>>>
>>>> So, cache last used region to not have to look it up again, because chances
>>>> are, you won't have to. That way, you will still do region lookups, but only
>>>> if you have to - not every time.
>>>
>>> I can do it, but I'm not sure this optimization is really
>>> necessary. Because this loop should be quite fast, as the
>>> max number of regions permitted by vhost-user is quite
>>> small. And actually we need to do that loop at least once
>>> for each packet in vhost-user's dequeue and enqueue path,
>>> i.e. the data path.
>>
>> The number of regions is small, but the number of segments may be in the
>> thousands. Think worst case - 8K segments in the 16th region
> 
> The number of regions permitted by vhost-user is 8.
> And most likely, we just have two regions as the
> single-file-segment mode is mandatory when using
> 2MB pages.



> 
>> - with my code,
>> you will execute only 16 iterations on first segment and use "last used" for
>> the rest of the segments,
> 
> We still need to do 8K iterations on the segments.

Yes, but not 8K times (up to) 8 regions. Anyway, it's your driver, up to 
you :)

> 
>> while with your code, it'll be 8K times 16 :)
> 
> IMO, what we really need is a way to reduce "8K",
> i.e. reduce the number of segments (which could
> be thousands currently) we need to parse.
> 
> And the loop should be faster than the function
> call to rte_memseg_get_fd_thread_unsafe() and
> rte_memseg_get_fd_offset_thread_unsafe() (which
> are also called for each segment).

Unfortunately, we cannot do that, because we cannot make any assumptions 
about underlying structure of fd's.

Technically, i could introduce an fd-centric walk function (i.e. so 
that, instead of per-memseg or per-contiguous area walk, you'd get a 
per-fd walk), but i really don't want to pollute the API with another 
walk function.

> 
>>
>> You'll have to clarify the "for each packet" part, not sure i follow.
> 
> Take the vhost-PMD as an example, when doing Rx burst
> and Tx burst, for each mbuf (i.e. packet), we need to
> do that loop at least once.

Ah, OK, i get you - if it's fast enough to use on the data path, it's 
fast enough for memory mapping code :) Fair enough.

> 
>>
>> -- 
>> Thanks,
>> Anatoly
>
  

Patch

diff --git a/drivers/net/virtio/virtio_user/vhost_user.c b/drivers/net/virtio/virtio_user/vhost_user.c
index ef6e43df8..8bd49610b 100644
--- a/drivers/net/virtio/virtio_user/vhost_user.c
+++ b/drivers/net/virtio/virtio_user/vhost_user.c
@@ -11,6 +11,9 @@ 
 #include <string.h>
 #include <errno.h>
 
+#include <rte_fbarray.h>
+#include <rte_eal_memconfig.h>
+
 #include "vhost.h"
 #include "virtio_user_dev.h"
 
@@ -121,133 +124,103 @@  vhost_user_read(int fd, struct vhost_user_msg *msg)
 	return -1;
 }
 
-struct hugepage_file_info {
-	uint64_t addr;            /**< virtual addr */
-	size_t   size;            /**< the file size */
-	char     path[PATH_MAX];  /**< path to backing file */
+struct walk_arg {
+	struct vhost_memory *vm;
+	int *fds;
+	int region_nr;
 };
 
-/* Two possible options:
- * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
- * array. This is simple but cannot be used in secondary process because
- * secondary process will close and munmap that file.
- * 2. Match HUGEFILE_FMT to find hugepage files directly.
- *
- * We choose option 2.
- */
 static int
-get_hugepage_file_info(struct hugepage_file_info huges[], int max)
+update_memory_region(const struct rte_memseg_list *msl __rte_unused,
+		const struct rte_memseg *ms, void *arg)
 {
-	int idx, k, exist;
-	FILE *f;
-	char buf[BUFSIZ], *tmp, *tail;
-	char *str_underline, *str_start;
-	int huge_index;
-	uint64_t v_start, v_end;
-	struct stat stats;
-
-	f = fopen("/proc/self/maps", "r");
-	if (!f) {
-		PMD_DRV_LOG(ERR, "cannot open /proc/self/maps");
-		return -1;
-	}
-
-	idx = 0;
-	while (fgets(buf, sizeof(buf), f) != NULL) {
-		if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
-			PMD_DRV_LOG(ERR, "Failed to parse address");
-			goto error;
-		}
-
-		tmp = strchr(buf, ' ') + 1; /** skip address */
-		tmp = strchr(tmp, ' ') + 1; /** skip perm */
-		tmp = strchr(tmp, ' ') + 1; /** skip offset */
-		tmp = strchr(tmp, ' ') + 1; /** skip dev */
-		tmp = strchr(tmp, ' ') + 1; /** skip inode */
-		while (*tmp == ' ')         /** skip spaces */
-			tmp++;
-		tail = strrchr(tmp, '\n');  /** remove newline if exists */
-		if (tail)
-			*tail = '\0';
-
-		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
-		 * which is defined in eal_filesystem.h
-		 */
-		str_underline = strrchr(tmp, '_');
-		if (!str_underline)
-			continue;
-
-		str_start = str_underline - strlen("map");
-		if (str_start < tmp)
-			continue;
-
-		if (sscanf(str_start, "map_%d", &huge_index) != 1)
-			continue;
-
-		/* skip duplicated file which is mapped to different regions */
-		for (k = 0, exist = -1; k < idx; ++k) {
-			if (!strcmp(huges[k].path, tmp)) {
-				exist = k;
-				break;
-			}
-		}
-		if (exist >= 0)
-			continue;
-
-		if (idx >= max) {
-			PMD_DRV_LOG(ERR, "Exceed maximum of %d", max);
-			goto error;
-		}
-
-		huges[idx].addr = v_start;
-		huges[idx].size = v_end - v_start; /* To be corrected later */
-		snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
-		idx++;
-	}
-
-	/* correct the size for files who have many regions */
-	for (k = 0; k < idx; ++k) {
-		if (stat(huges[k].path, &stats) < 0) {
-			PMD_DRV_LOG(ERR, "Failed to stat %s, %s\n",
-				    huges[k].path, strerror(errno));
-			continue;
-		}
-		huges[k].size = stats.st_size;
-		PMD_DRV_LOG(INFO, "file %s, size %zx\n",
-			    huges[k].path, huges[k].size);
-	}
-
-	fclose(f);
-	return idx;
-
-error:
-	fclose(f);
-	return -1;
-}
-
-static int
-prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
-{
-	int i, num;
-	struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS];
+	struct walk_arg *wa = arg;
 	struct vhost_memory_region *mr;
+	uint64_t start_addr, end_addr;
+	size_t offset;
+	int i, fd;
 
-	num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS);
-	if (num < 0) {
-		PMD_INIT_LOG(ERR, "Failed to prepare memory for vhost-user");
+	fd = rte_memseg_get_fd_thread_unsafe(ms);
+	if (fd < 0) {
+		PMD_DRV_LOG(ERR, "Failed to get fd, ms=%p rte_errno=%d",
+			ms, rte_errno);
 		return -1;
 	}
 
-	for (i = 0; i < num; ++i) {
-		mr = &msg->payload.memory.regions[i];
-		mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
-		mr->userspace_addr = huges[i].addr;
-		mr->memory_size = huges[i].size;
-		mr->mmap_offset = 0;
-		fds[i] = open(huges[i].path, O_RDWR);
+	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0) {
+		PMD_DRV_LOG(ERR, "Failed to get offset, ms=%p rte_errno=%d",
+			ms, rte_errno);
+		return -1;
+	}
+
+	start_addr = (uint64_t)(uintptr_t)ms->addr;
+	end_addr = start_addr + ms->len;
+
+	for (i = 0; i < wa->region_nr; i++) {
+		if (wa->fds[i] != fd)
+			continue;
+
+		mr = &wa->vm->regions[i];
+
+		if (mr->userspace_addr + mr->memory_size < end_addr)
+			mr->memory_size = end_addr - mr->userspace_addr;
+
+		if (mr->userspace_addr > start_addr) {
+			mr->userspace_addr = start_addr;
+			mr->guest_phys_addr = start_addr;
+		}
+
+		if (mr->mmap_offset > offset)
+			mr->mmap_offset = offset;
+
+		PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64
+			" addr=0x%" PRIx64 " len=%" PRIu64, i, fd,
+			mr->mmap_offset, mr->userspace_addr,
+			mr->memory_size);
+
+		return 0;
+	}
+
+	if (i >= VHOST_MEMORY_MAX_NREGIONS) {
+		PMD_DRV_LOG(ERR, "Too many memory regions");
+		return -1;
 	}
 
-	msg->payload.memory.nregions = num;
+	mr = &wa->vm->regions[i];
+	wa->fds[i] = fd;
+
+	mr->guest_phys_addr = start_addr;
+	mr->userspace_addr = start_addr;
+	mr->memory_size = ms->len;
+	mr->mmap_offset = offset;
+
+	PMD_DRV_LOG(DEBUG, "index=%d fd=%d offset=0x%" PRIx64
+		" addr=0x%" PRIx64 " len=%" PRIu64, i, fd,
+		mr->mmap_offset, mr->userspace_addr,
+		mr->memory_size);
+
+	wa->region_nr++;
+
+	return 0;
+}
+
+static int
+prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
+{
+	struct walk_arg wa;
+
+	wa.region_nr = 0;
+	wa.vm = &msg->payload.memory;
+	wa.fds = fds;
+
+	/*
+	 * The memory lock has already been taken by memory subsystem
+	 * or virtio_user_start_device().
+	 */
+	if (rte_memseg_walk_thread_unsafe(update_memory_region, &wa) < 0)
+		return -1;
+
+	msg->payload.memory.nregions = wa.region_nr;
 	msg->payload.memory.padding = 0;
 
 	return 0;
@@ -280,7 +253,7 @@  vhost_user_sock(struct virtio_user_dev *dev,
 	int need_reply = 0;
 	int fds[VHOST_MEMORY_MAX_NREGIONS];
 	int fd_num = 0;
-	int i, len;
+	int len;
 	int vhostfd = dev->vhostfd;
 
 	RTE_SET_USED(m);
@@ -364,10 +337,6 @@  vhost_user_sock(struct virtio_user_dev *dev,
 		return -1;
 	}
 
-	if (req == VHOST_USER_SET_MEM_TABLE)
-		for (i = 0; i < fd_num; ++i)
-			close(fds[i]);
-
 	if (need_reply) {
 		if (vhost_user_read(vhostfd, &msg) < 0) {
 			PMD_DRV_LOG(ERR, "Received msg failed: %s",