[dpdk-dev,2/6] vhost: get guest/host physical address mappings

Message ID 1471939839-29778-3-git-send-email-yuanhan.liu@linux.intel.com (mailing list archive)
State Superseded, archived
Delegated to: Yuanhan Liu
Headers

Commit Message

Yuanhan Liu Aug. 23, 2016, 8:10 a.m. UTC
  So that we can convert a guest physical address to host physical
address, which will be used in later Tx zero copy implementation.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 lib/librte_vhost/vhost.h      | 30 +++++++++++++++
 lib/librte_vhost/vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
  

Comments

Maxime Coquelin Aug. 23, 2016, 9:58 a.m. UTC | #1
On 08/23/2016 10:10 AM, Yuanhan Liu wrote:
> So that we can convert a guest physical address to host physical
> address, which will be used in later Tx zero copy implementation.
>
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
>  lib/librte_vhost/vhost.h      | 30 +++++++++++++++
>  lib/librte_vhost/vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 116 insertions(+)
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index df2107b..2d52987 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -114,6 +114,12 @@ struct vhost_virtqueue {
>   #define VIRTIO_F_VERSION_1 32
>  #endif
>
> +struct guest_page {
> +	uint64_t guest_phys_addr;
> +	uint64_t host_phys_addr;
> +	uint64_t size;
> +};
> +
>  /**
>   * Device structure contains all configuration information relating
>   * to the device.
> @@ -137,6 +143,10 @@ struct virtio_net {
>  	uint64_t		log_addr;
>  	struct ether_addr	mac;
>
> +	uint32_t		nr_guest_pages;
> +	uint32_t		max_guest_pages;
> +	struct guest_page       *guest_pages;
> +
>  } __rte_cache_aligned;
>
>  /**
> @@ -217,6 +227,26 @@ gpa_to_vva(struct virtio_net *dev, uint64_t gpa)
>  	return 0;
>  }
>
> +/* Convert guest physical address to host physical address */
> +static inline phys_addr_t __attribute__((always_inline))
> +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
> +{
> +	uint32_t i;
> +	struct guest_page *page;
> +
> +	for (i = 0; i < dev->nr_guest_pages; i++) {
> +		page = &dev->guest_pages[i];
> +
> +		if (gpa >= page->guest_phys_addr &&
> +		    gpa + size < page->guest_phys_addr + page->size) {
Shouldn't be '<=' here?

> +			return gpa - page->guest_phys_addr +
> +			       page->host_phys_addr;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  struct virtio_net_device_ops const *notify_ops;
>  struct virtio_net *get_device(int vid);
>
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index d2071fd..045d4f0 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -372,6 +372,81 @@ vhost_user_set_vring_base(struct virtio_net *dev,
>  	return 0;
>  }
>
> +static void
> +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> +		   uint64_t host_phys_addr, uint64_t size)
> +{
> +	struct guest_page *page;
> +
> +	if (dev->nr_guest_pages == dev->max_guest_pages) {
> +		dev->max_guest_pages *= 2;
> +		dev->guest_pages = realloc(dev->guest_pages,
> +					dev->max_guest_pages * sizeof(*page));

Maybe realloc return could be checked?

> +	}
> +
> +	page = &dev->guest_pages[dev->nr_guest_pages++];
> +	page->guest_phys_addr = guest_phys_addr;
> +	page->host_phys_addr  = host_phys_addr;
> +	page->size = size;
> +}
> +
> +static void
> +add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
> +		uint64_t page_size)
> +{
> +	uint64_t reg_size = reg->size;
> +	uint64_t host_user_addr  = reg->host_user_addr;
> +	uint64_t guest_phys_addr = reg->guest_phys_addr;
> +	uint64_t host_phys_addr;
> +	uint64_t size;
> +	uint32_t pre_read;
> +
> +	pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
> +	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> +	size = page_size - (guest_phys_addr & (page_size - 1));
> +	size = RTE_MIN(size, reg_size);
> +
> +	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
> +	host_user_addr  += size;
> +	guest_phys_addr += size;
> +	reg_size -= size;
> +
> +	while (reg_size > 0) {
> +		pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
> +		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> +		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
> +
> +		host_user_addr  += page_size;
> +		guest_phys_addr += page_size;
> +		reg_size -= page_size;
> +	}
> +
> +	/* FIXME */
> +	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
For my information, what is the purpose of pre_read?

> +}
> +
> +/* TODO: enable it only in debug mode? */
> +static void
> +dump_guest_pages(struct virtio_net *dev)
> +{
> +	uint32_t i;
> +	struct guest_page *page;
> +
> +	for (i = 0; i < dev->nr_guest_pages; i++) {
> +		page = &dev->guest_pages[i];
> +
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"guest physical page region %u\n"
> +			"\t guest_phys_addr: %" PRIx64 "\n"
> +			"\t host_phys_addr : %" PRIx64 "\n"
> +			"\t size           : %" PRIx64 "\n",
> +			i,
> +			page->guest_phys_addr,
> +			page->host_phys_addr,
> +			page->size);
> +	}
> +}
> +
>  static int
>  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
>  {
> @@ -396,6 +471,13 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
>  		dev->mem = NULL;
>  	}
>
> +	dev->nr_guest_pages = 0;
> +	if (!dev->guest_pages) {
> +		dev->max_guest_pages = 8;
> +		dev->guest_pages = malloc(dev->max_guest_pages *
> +						sizeof(struct guest_page));
> +	}
> +
>  	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
>  		sizeof(struct virtio_memory_region) * memory.nregions, 0);
>  	if (dev->mem == NULL) {
> @@ -447,6 +529,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
>  		reg->mmap_size = mmap_size;
>  		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
>
> +		add_guest_pages(dev, reg, alignment);
> +
>  		RTE_LOG(INFO, VHOST_CONFIG,
>  			"guest memory region %u, size: 0x%" PRIx64 "\n"
>  			"\t guest physical addr: 0x%" PRIx64 "\n"
> @@ -466,6 +550,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
>  			mmap_offset);
>  	}
>
> +	dump_guest_pages(dev);
> +
>  	return 0;
>
>  err_mmap:
>
  
Yuanhan Liu Aug. 23, 2016, 12:32 p.m. UTC | #2
On Tue, Aug 23, 2016 at 11:58:42AM +0200, Maxime Coquelin wrote:
> >
> >+/* Convert guest physical address to host physical address */
> >+static inline phys_addr_t __attribute__((always_inline))
> >+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
> >+{
> >+	uint32_t i;
> >+	struct guest_page *page;
> >+
> >+	for (i = 0; i < dev->nr_guest_pages; i++) {
> >+		page = &dev->guest_pages[i];
> >+
> >+		if (gpa >= page->guest_phys_addr &&
> >+		    gpa + size < page->guest_phys_addr + page->size) {
> Shouldn't be '<=' here?

Oops, you are right.

> >+			return gpa - page->guest_phys_addr +
> >+			       page->host_phys_addr;
> >+		}
> >+	}
> >+
> >+	return 0;
> >+}
> >+
> > struct virtio_net_device_ops const *notify_ops;
> > struct virtio_net *get_device(int vid);
> >
> >diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> >index d2071fd..045d4f0 100644
> >--- a/lib/librte_vhost/vhost_user.c
> >+++ b/lib/librte_vhost/vhost_user.c
> >@@ -372,6 +372,81 @@ vhost_user_set_vring_base(struct virtio_net *dev,
> > 	return 0;
> > }
> >
> >+static void
> >+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> >+		   uint64_t host_phys_addr, uint64_t size)
> >+{
> >+	struct guest_page *page;
> >+
> >+	if (dev->nr_guest_pages == dev->max_guest_pages) {
> >+		dev->max_guest_pages *= 2;
> >+		dev->guest_pages = realloc(dev->guest_pages,
> >+					dev->max_guest_pages * sizeof(*page));
> 
> Maybe realloc return could be checked?

Yes, I should have done that. Besides, I also forgot to free it at
somewhere. Will fix it.

> 
> >+	}
> >+
> >+	page = &dev->guest_pages[dev->nr_guest_pages++];
> >+	page->guest_phys_addr = guest_phys_addr;
> >+	page->host_phys_addr  = host_phys_addr;
> >+	page->size = size;
> >+}
> >+
> >+static void
> >+add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
> >+		uint64_t page_size)
> >+{
> >+	uint64_t reg_size = reg->size;
> >+	uint64_t host_user_addr  = reg->host_user_addr;
> >+	uint64_t guest_phys_addr = reg->guest_phys_addr;
> >+	uint64_t host_phys_addr;
> >+	uint64_t size;
> >+	uint32_t pre_read;
> >+
> >+	pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
> >+	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> >+	size = page_size - (guest_phys_addr & (page_size - 1));
> >+	size = RTE_MIN(size, reg_size);
> >+
> >+	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
> >+	host_user_addr  += size;
> >+	guest_phys_addr += size;
> >+	reg_size -= size;
> >+
> >+	while (reg_size > 0) {
> >+		pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
> >+		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> >+		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
> >+
> >+		host_user_addr  += page_size;
> >+		guest_phys_addr += page_size;
> >+		reg_size -= page_size;
> >+	}
> >+
> >+	/* FIXME */
> >+	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
> For my information, what is the purpose of pre_read?

Again, I put a FIXME here, but I forgot to add some explanation.

Here is the thing: the read will make sure the kernel populate the
corresponding PTE entry, so that rte_mem_virt2phy() will return proper
physical address, otherwise, invalid value is returned.

I can't simply do the read but do not actually reference/consume it.
Otherwise, the compiler will treat it as some noops and remove it.

An ugly RTE_LOG will make sure the read operation is not eliminated.
I'm seeking a more proper way to achieve that. Maybe I can add a new
field in virtio_net structure and store it there.

Or, do you have better ideas?

	--yliu
  
Maxime Coquelin Aug. 23, 2016, 1:25 p.m. UTC | #3
On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
>>> +
>>> > >+	/* FIXME */
>>> > >+	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
>> > For my information, what is the purpose of pre_read?
> Again, I put a FIXME here, but I forgot to add some explanation.
>
> Here is the thing: the read will make sure the kernel populate the
> corresponding PTE entry, so that rte_mem_virt2phy() will return proper
> physical address, otherwise, invalid value is returned.
>
> I can't simply do the read but do not actually reference/consume it.
> Otherwise, the compiler will treat it as some noops and remove it.
>
> An ugly RTE_LOG will make sure the read operation is not eliminated.
> I'm seeking a more proper way to achieve that. Maybe I can add a new
> field in virtio_net structure and store it there.
>
> Or, do you have better ideas?

This behavior is pretty twisted, no?
Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?

I have no better idea for now, but I will think about it.

Regards,
Maxime
  
Yuanhan Liu Aug. 23, 2016, 1:49 p.m. UTC | #4
On Tue, Aug 23, 2016 at 03:25:33PM +0200, Maxime Coquelin wrote:
> 
> 
> On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
> >>>+
> >>>> >+	/* FIXME */
> >>>> >+	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
> >>> For my information, what is the purpose of pre_read?
> >Again, I put a FIXME here, but I forgot to add some explanation.
> >
> >Here is the thing: the read will make sure the kernel populate the
> >corresponding PTE entry, so that rte_mem_virt2phy() will return proper
> >physical address, otherwise, invalid value is returned.
> >
> >I can't simply do the read but do not actually reference/consume it.
> >Otherwise, the compiler will treat it as some noops and remove it.
> >
> >An ugly RTE_LOG will make sure the read operation is not eliminated.
> >I'm seeking a more proper way to achieve that. Maybe I can add a new
> >field in virtio_net structure and store it there.
> >
> >Or, do you have better ideas?
> 
> This behavior is pretty twisted, no?

I have to say, yes, kind of.

> Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?

Not exactly. I think rte_mem_virt2phy() is more likely to fetch the
physical address of huge pages. And for those huge pages, EAL makes
sure they will be populated: it used to do a zero memset before to
achieve that. Since 5ce3ace1de45 ("eal: remove unnecessary hugepage
zero-filling"), it uses MAP_POPULATE option instead.

So, thank you that you just remind me of the MAP_POPULATE option.
I just had a quick try, it worked like a charm :)

	--yliu

> I have no better idea for now, but I will think about it.
> 
> Regards,
> Maxime
  
Maxime Coquelin Aug. 23, 2016, 2:05 p.m. UTC | #5
On 08/23/2016 03:49 PM, Yuanhan Liu wrote:
> On Tue, Aug 23, 2016 at 03:25:33PM +0200, Maxime Coquelin wrote:
>>
>>
>> On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
>>>>> +
>>>>>>> +	/* FIXME */
>>>>>>> +	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
>>>>> For my information, what is the purpose of pre_read?
>>> Again, I put a FIXME here, but I forgot to add some explanation.
>>>
>>> Here is the thing: the read will make sure the kernel populate the
>>> corresponding PTE entry, so that rte_mem_virt2phy() will return proper
>>> physical address, otherwise, invalid value is returned.
>>>
>>> I can't simply do the read but do not actually reference/consume it.
>>> Otherwise, the compiler will treat it as some noops and remove it.
>>>
>>> An ugly RTE_LOG will make sure the read operation is not eliminated.
>>> I'm seeking a more proper way to achieve that. Maybe I can add a new
>>> field in virtio_net structure and store it there.
>>>
>>> Or, do you have better ideas?
>>
>> This behavior is pretty twisted, no?
>
> I have to say, yes, kind of.
>
>> Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?
>
> Not exactly. I think rte_mem_virt2phy() is more likely to fetch the
> physical address of huge pages. And for those huge pages, EAL makes
> sure they will be populated: it used to do a zero memset before to
> achieve that. Since 5ce3ace1de45 ("eal: remove unnecessary hugepage
> zero-filling"), it uses MAP_POPULATE option instead.
>
> So, thank you that you just remind me of the MAP_POPULATE option.
> I just had a quick try, it worked like a charm :)

Excellent! :)
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index df2107b..2d52987 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -114,6 +114,12 @@  struct vhost_virtqueue {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
+struct guest_page {
+	uint64_t guest_phys_addr;
+	uint64_t host_phys_addr;
+	uint64_t size;
+};
+
 /**
  * Device structure contains all configuration information relating
  * to the device.
@@ -137,6 +143,10 @@  struct virtio_net {
 	uint64_t		log_addr;
 	struct ether_addr	mac;
 
+	uint32_t		nr_guest_pages;
+	uint32_t		max_guest_pages;
+	struct guest_page       *guest_pages;
+
 } __rte_cache_aligned;
 
 /**
@@ -217,6 +227,26 @@  gpa_to_vva(struct virtio_net *dev, uint64_t gpa)
 	return 0;
 }
 
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+	uint32_t i;
+	struct guest_page *page;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		page = &dev->guest_pages[i];
+
+		if (gpa >= page->guest_phys_addr &&
+		    gpa + size < page->guest_phys_addr + page->size) {
+			return gpa - page->guest_phys_addr +
+			       page->host_phys_addr;
+		}
+	}
+
+	return 0;
+}
+
 struct virtio_net_device_ops const *notify_ops;
 struct virtio_net *get_device(int vid);
 
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index d2071fd..045d4f0 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -372,6 +372,81 @@  vhost_user_set_vring_base(struct virtio_net *dev,
 	return 0;
 }
 
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+		   uint64_t host_phys_addr, uint64_t size)
+{
+	struct guest_page *page;
+
+	if (dev->nr_guest_pages == dev->max_guest_pages) {
+		dev->max_guest_pages *= 2;
+		dev->guest_pages = realloc(dev->guest_pages,
+					dev->max_guest_pages * sizeof(*page));
+	}
+
+	page = &dev->guest_pages[dev->nr_guest_pages++];
+	page->guest_phys_addr = guest_phys_addr;
+	page->host_phys_addr  = host_phys_addr;
+	page->size = size;
+}
+
+static void
+add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
+		uint64_t page_size)
+{
+	uint64_t reg_size = reg->size;
+	uint64_t host_user_addr  = reg->host_user_addr;
+	uint64_t guest_phys_addr = reg->guest_phys_addr;
+	uint64_t host_phys_addr;
+	uint64_t size;
+	uint32_t pre_read;
+
+	pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
+	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+	size = page_size - (guest_phys_addr & (page_size - 1));
+	size = RTE_MIN(size, reg_size);
+
+	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+	host_user_addr  += size;
+	guest_phys_addr += size;
+	reg_size -= size;
+
+	while (reg_size > 0) {
+		pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
+		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
+
+		host_user_addr  += page_size;
+		guest_phys_addr += page_size;
+		reg_size -= page_size;
+	}
+
+	/* FIXME */
+	RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
+}
+
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct guest_page *page;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		page = &dev->guest_pages[i];
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"guest physical page region %u\n"
+			"\t guest_phys_addr: %" PRIx64 "\n"
+			"\t host_phys_addr : %" PRIx64 "\n"
+			"\t size           : %" PRIx64 "\n",
+			i,
+			page->guest_phys_addr,
+			page->host_phys_addr,
+			page->size);
+	}
+}
+
 static int
 vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 {
@@ -396,6 +471,13 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 		dev->mem = NULL;
 	}
 
+	dev->nr_guest_pages = 0;
+	if (!dev->guest_pages) {
+		dev->max_guest_pages = 8;
+		dev->guest_pages = malloc(dev->max_guest_pages *
+						sizeof(struct guest_page));
+	}
+
 	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
 		sizeof(struct virtio_memory_region) * memory.nregions, 0);
 	if (dev->mem == NULL) {
@@ -447,6 +529,8 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 		reg->mmap_size = mmap_size;
 		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
 
+		add_guest_pages(dev, reg, alignment);
+
 		RTE_LOG(INFO, VHOST_CONFIG,
 			"guest memory region %u, size: 0x%" PRIx64 "\n"
 			"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -466,6 +550,8 @@  vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 			mmap_offset);
 	}
 
+	dump_guest_pages(dev);
+
 	return 0;
 
 err_mmap: