[dpdk-dev,2/6] vhost: get guest/host physical address mappings
Commit Message
So that we can convert a guest physical address to host physical
address, which will be used in later Tx zero copy implementation.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
lib/librte_vhost/vhost.h | 30 +++++++++++++++
lib/librte_vhost/vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 116 insertions(+)
Comments
On 08/23/2016 10:10 AM, Yuanhan Liu wrote:
> So that we can convert a guest physical address to host physical
> address, which will be used in later Tx zero copy implementation.
>
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
> lib/librte_vhost/vhost.h | 30 +++++++++++++++
> lib/librte_vhost/vhost_user.c | 86 +++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 116 insertions(+)
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index df2107b..2d52987 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -114,6 +114,12 @@ struct vhost_virtqueue {
> #define VIRTIO_F_VERSION_1 32
> #endif
>
> +struct guest_page {
> + uint64_t guest_phys_addr;
> + uint64_t host_phys_addr;
> + uint64_t size;
> +};
> +
> /**
> * Device structure contains all configuration information relating
> * to the device.
> @@ -137,6 +143,10 @@ struct virtio_net {
> uint64_t log_addr;
> struct ether_addr mac;
>
> + uint32_t nr_guest_pages;
> + uint32_t max_guest_pages;
> + struct guest_page *guest_pages;
> +
> } __rte_cache_aligned;
>
> /**
> @@ -217,6 +227,26 @@ gpa_to_vva(struct virtio_net *dev, uint64_t gpa)
> return 0;
> }
>
> +/* Convert guest physical address to host physical address */
> +static inline phys_addr_t __attribute__((always_inline))
> +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
> +{
> + uint32_t i;
> + struct guest_page *page;
> +
> + for (i = 0; i < dev->nr_guest_pages; i++) {
> + page = &dev->guest_pages[i];
> +
> + if (gpa >= page->guest_phys_addr &&
> + gpa + size < page->guest_phys_addr + page->size) {
Shouldn't be '<=' here?
> + return gpa - page->guest_phys_addr +
> + page->host_phys_addr;
> + }
> + }
> +
> + return 0;
> +}
> +
> struct virtio_net_device_ops const *notify_ops;
> struct virtio_net *get_device(int vid);
>
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index d2071fd..045d4f0 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -372,6 +372,81 @@ vhost_user_set_vring_base(struct virtio_net *dev,
> return 0;
> }
>
> +static void
> +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> + uint64_t host_phys_addr, uint64_t size)
> +{
> + struct guest_page *page;
> +
> + if (dev->nr_guest_pages == dev->max_guest_pages) {
> + dev->max_guest_pages *= 2;
> + dev->guest_pages = realloc(dev->guest_pages,
> + dev->max_guest_pages * sizeof(*page));
Maybe realloc return could be checked?
> + }
> +
> + page = &dev->guest_pages[dev->nr_guest_pages++];
> + page->guest_phys_addr = guest_phys_addr;
> + page->host_phys_addr = host_phys_addr;
> + page->size = size;
> +}
> +
> +static void
> +add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
> + uint64_t page_size)
> +{
> + uint64_t reg_size = reg->size;
> + uint64_t host_user_addr = reg->host_user_addr;
> + uint64_t guest_phys_addr = reg->guest_phys_addr;
> + uint64_t host_phys_addr;
> + uint64_t size;
> + uint32_t pre_read;
> +
> + pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
> + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> + size = page_size - (guest_phys_addr & (page_size - 1));
> + size = RTE_MIN(size, reg_size);
> +
> + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
> + host_user_addr += size;
> + guest_phys_addr += size;
> + reg_size -= size;
> +
> + while (reg_size > 0) {
> + pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
> + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
> +
> + host_user_addr += page_size;
> + guest_phys_addr += page_size;
> + reg_size -= page_size;
> + }
> +
> + /* FIXME */
> + RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
For my information, what is the purpose of pre_read?
> +}
> +
> +/* TODO: enable it only in debug mode? */
> +static void
> +dump_guest_pages(struct virtio_net *dev)
> +{
> + uint32_t i;
> + struct guest_page *page;
> +
> + for (i = 0; i < dev->nr_guest_pages; i++) {
> + page = &dev->guest_pages[i];
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "guest physical page region %u\n"
> + "\t guest_phys_addr: %" PRIx64 "\n"
> + "\t host_phys_addr : %" PRIx64 "\n"
> + "\t size : %" PRIx64 "\n",
> + i,
> + page->guest_phys_addr,
> + page->host_phys_addr,
> + page->size);
> + }
> +}
> +
> static int
> vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
> {
> @@ -396,6 +471,13 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
> dev->mem = NULL;
> }
>
> + dev->nr_guest_pages = 0;
> + if (!dev->guest_pages) {
> + dev->max_guest_pages = 8;
> + dev->guest_pages = malloc(dev->max_guest_pages *
> + sizeof(struct guest_page));
> + }
> +
> dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
> sizeof(struct virtio_memory_region) * memory.nregions, 0);
> if (dev->mem == NULL) {
> @@ -447,6 +529,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
> reg->mmap_size = mmap_size;
> reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
>
> + add_guest_pages(dev, reg, alignment);
> +
> RTE_LOG(INFO, VHOST_CONFIG,
> "guest memory region %u, size: 0x%" PRIx64 "\n"
> "\t guest physical addr: 0x%" PRIx64 "\n"
> @@ -466,6 +550,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
> mmap_offset);
> }
>
> + dump_guest_pages(dev);
> +
> return 0;
>
> err_mmap:
>
On Tue, Aug 23, 2016 at 11:58:42AM +0200, Maxime Coquelin wrote:
> >
> >+/* Convert guest physical address to host physical address */
> >+static inline phys_addr_t __attribute__((always_inline))
> >+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
> >+{
> >+ uint32_t i;
> >+ struct guest_page *page;
> >+
> >+ for (i = 0; i < dev->nr_guest_pages; i++) {
> >+ page = &dev->guest_pages[i];
> >+
> >+ if (gpa >= page->guest_phys_addr &&
> >+ gpa + size < page->guest_phys_addr + page->size) {
> Shouldn't be '<=' here?
Oops, you are right.
> >+ return gpa - page->guest_phys_addr +
> >+ page->host_phys_addr;
> >+ }
> >+ }
> >+
> >+ return 0;
> >+}
> >+
> > struct virtio_net_device_ops const *notify_ops;
> > struct virtio_net *get_device(int vid);
> >
> >diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> >index d2071fd..045d4f0 100644
> >--- a/lib/librte_vhost/vhost_user.c
> >+++ b/lib/librte_vhost/vhost_user.c
> >@@ -372,6 +372,81 @@ vhost_user_set_vring_base(struct virtio_net *dev,
> > return 0;
> > }
> >
> >+static void
> >+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
> >+ uint64_t host_phys_addr, uint64_t size)
> >+{
> >+ struct guest_page *page;
> >+
> >+ if (dev->nr_guest_pages == dev->max_guest_pages) {
> >+ dev->max_guest_pages *= 2;
> >+ dev->guest_pages = realloc(dev->guest_pages,
> >+ dev->max_guest_pages * sizeof(*page));
>
> Maybe realloc return could be checked?
Yes, I should have done that. Besides, I also forgot to free it at
somewhere. Will fix it.
>
> >+ }
> >+
> >+ page = &dev->guest_pages[dev->nr_guest_pages++];
> >+ page->guest_phys_addr = guest_phys_addr;
> >+ page->host_phys_addr = host_phys_addr;
> >+ page->size = size;
> >+}
> >+
> >+static void
> >+add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
> >+ uint64_t page_size)
> >+{
> >+ uint64_t reg_size = reg->size;
> >+ uint64_t host_user_addr = reg->host_user_addr;
> >+ uint64_t guest_phys_addr = reg->guest_phys_addr;
> >+ uint64_t host_phys_addr;
> >+ uint64_t size;
> >+ uint32_t pre_read;
> >+
> >+ pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
> >+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> >+ size = page_size - (guest_phys_addr & (page_size - 1));
> >+ size = RTE_MIN(size, reg_size);
> >+
> >+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
> >+ host_user_addr += size;
> >+ guest_phys_addr += size;
> >+ reg_size -= size;
> >+
> >+ while (reg_size > 0) {
> >+ pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
> >+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
> >+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
> >+
> >+ host_user_addr += page_size;
> >+ guest_phys_addr += page_size;
> >+ reg_size -= page_size;
> >+ }
> >+
> >+ /* FIXME */
> >+ RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
> For my information, what is the purpose of pre_read?
Again, I put a FIXME here, but I forgot to add some explanation.
Here is the thing: the read will make sure the kernel populate the
corresponding PTE entry, so that rte_mem_virt2phy() will return proper
physical address, otherwise, invalid value is returned.
I can't simply do the read but do not actually reference/consume it.
Otherwise, the compiler will treat it as some noops and remove it.
An ugly RTE_LOG will make sure the read operation is not eliminated.
I'm seeking a more proper way to achieve that. Maybe I can add a new
field in virtio_net structure and store it there.
Or, do you have better ideas?
--yliu
On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
>>> +
>>> > >+ /* FIXME */
>>> > >+ RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
>> > For my information, what is the purpose of pre_read?
> Again, I put a FIXME here, but I forgot to add some explanation.
>
> Here is the thing: the read will make sure the kernel populate the
> corresponding PTE entry, so that rte_mem_virt2phy() will return proper
> physical address, otherwise, invalid value is returned.
>
> I can't simply do the read but do not actually reference/consume it.
> Otherwise, the compiler will treat it as some noops and remove it.
>
> An ugly RTE_LOG will make sure the read operation is not eliminated.
> I'm seeking a more proper way to achieve that. Maybe I can add a new
> field in virtio_net structure and store it there.
>
> Or, do you have better ideas?
This behavior is pretty twisted, no?
Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?
I have no better idea for now, but I will think about it.
Regards,
Maxime
On Tue, Aug 23, 2016 at 03:25:33PM +0200, Maxime Coquelin wrote:
>
>
> On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
> >>>+
> >>>> >+ /* FIXME */
> >>>> >+ RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
> >>> For my information, what is the purpose of pre_read?
> >Again, I put a FIXME here, but I forgot to add some explanation.
> >
> >Here is the thing: the read will make sure the kernel populate the
> >corresponding PTE entry, so that rte_mem_virt2phy() will return proper
> >physical address, otherwise, invalid value is returned.
> >
> >I can't simply do the read but do not actually reference/consume it.
> >Otherwise, the compiler will treat it as some noops and remove it.
> >
> >An ugly RTE_LOG will make sure the read operation is not eliminated.
> >I'm seeking a more proper way to achieve that. Maybe I can add a new
> >field in virtio_net structure and store it there.
> >
> >Or, do you have better ideas?
>
> This behavior is pretty twisted, no?
I have to say, yes, kind of.
> Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?
Not exactly. I think rte_mem_virt2phy() is more likely to fetch the
physical address of huge pages. And for those huge pages, EAL makes
sure they will be populated: it used to do a zero memset before to
achieve that. Since 5ce3ace1de45 ("eal: remove unnecessary hugepage
zero-filling"), it uses MAP_POPULATE option instead.
So, thank you that you just remind me of the MAP_POPULATE option.
I just had a quick try, it worked like a charm :)
--yliu
> I have no better idea for now, but I will think about it.
>
> Regards,
> Maxime
On 08/23/2016 03:49 PM, Yuanhan Liu wrote:
> On Tue, Aug 23, 2016 at 03:25:33PM +0200, Maxime Coquelin wrote:
>>
>>
>> On 08/23/2016 02:32 PM, Yuanhan Liu wrote:
>>>>> +
>>>>>>> + /* FIXME */
>>>>>>> + RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
>>>>> For my information, what is the purpose of pre_read?
>>> Again, I put a FIXME here, but I forgot to add some explanation.
>>>
>>> Here is the thing: the read will make sure the kernel populate the
>>> corresponding PTE entry, so that rte_mem_virt2phy() will return proper
>>> physical address, otherwise, invalid value is returned.
>>>
>>> I can't simply do the read but do not actually reference/consume it.
>>> Otherwise, the compiler will treat it as some noops and remove it.
>>>
>>> An ugly RTE_LOG will make sure the read operation is not eliminated.
>>> I'm seeking a more proper way to achieve that. Maybe I can add a new
>>> field in virtio_net structure and store it there.
>>>
>>> Or, do you have better ideas?
>>
>> This behavior is pretty twisted, no?
>
> I have to say, yes, kind of.
>
>> Shouldn't be rte_mem_virt2phy() role to ensure returning a valid value?
>
> Not exactly. I think rte_mem_virt2phy() is more likely to fetch the
> physical address of huge pages. And for those huge pages, EAL makes
> sure they will be populated: it used to do a zero memset before to
> achieve that. Since 5ce3ace1de45 ("eal: remove unnecessary hugepage
> zero-filling"), it uses MAP_POPULATE option instead.
>
> So, thank you that you just remind me of the MAP_POPULATE option.
> I just had a quick try, it worked like a charm :)
Excellent! :)
@@ -114,6 +114,12 @@ struct vhost_virtqueue {
#define VIRTIO_F_VERSION_1 32
#endif
+struct guest_page {
+ uint64_t guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+};
+
/**
* Device structure contains all configuration information relating
* to the device.
@@ -137,6 +143,10 @@ struct virtio_net {
uint64_t log_addr;
struct ether_addr mac;
+ uint32_t nr_guest_pages;
+ uint32_t max_guest_pages;
+ struct guest_page *guest_pages;
+
} __rte_cache_aligned;
/**
@@ -217,6 +227,26 @@ gpa_to_vva(struct virtio_net *dev, uint64_t gpa)
return 0;
}
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ if (gpa >= page->guest_phys_addr &&
+ gpa + size < page->guest_phys_addr + page->size) {
+ return gpa - page->guest_phys_addr +
+ page->host_phys_addr;
+ }
+ }
+
+ return 0;
+}
+
struct virtio_net_device_ops const *notify_ops;
struct virtio_net *get_device(int vid);
@@ -372,6 +372,81 @@ vhost_user_set_vring_base(struct virtio_net *dev,
return 0;
}
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+ uint64_t host_phys_addr, uint64_t size)
+{
+ struct guest_page *page;
+
+ if (dev->nr_guest_pages == dev->max_guest_pages) {
+ dev->max_guest_pages *= 2;
+ dev->guest_pages = realloc(dev->guest_pages,
+ dev->max_guest_pages * sizeof(*page));
+ }
+
+ page = &dev->guest_pages[dev->nr_guest_pages++];
+ page->guest_phys_addr = guest_phys_addr;
+ page->host_phys_addr = host_phys_addr;
+ page->size = size;
+}
+
+static void
+add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
+ uint64_t page_size)
+{
+ uint64_t reg_size = reg->size;
+ uint64_t host_user_addr = reg->host_user_addr;
+ uint64_t guest_phys_addr = reg->guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+ uint32_t pre_read;
+
+ pre_read = *((uint32_t *)(uintptr_t)host_user_addr);
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+ size = page_size - (guest_phys_addr & (page_size - 1));
+ size = RTE_MIN(size, reg_size);
+
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+
+ while (reg_size > 0) {
+ pre_read += *((uint32_t *)(uintptr_t)host_user_addr);
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, page_size);
+
+ host_user_addr += page_size;
+ guest_phys_addr += page_size;
+ reg_size -= page_size;
+ }
+
+ /* FIXME */
+ RTE_LOG(INFO, VHOST_CONFIG, ":: %u ::\n", pre_read);
+}
+
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest physical page region %u\n"
+ "\t guest_phys_addr: %" PRIx64 "\n"
+ "\t host_phys_addr : %" PRIx64 "\n"
+ "\t size : %" PRIx64 "\n",
+ i,
+ page->guest_phys_addr,
+ page->host_phys_addr,
+ page->size);
+ }
+}
+
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
@@ -396,6 +471,13 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
dev->mem = NULL;
}
+ dev->nr_guest_pages = 0;
+ if (!dev->guest_pages) {
+ dev->max_guest_pages = 8;
+ dev->guest_pages = malloc(dev->max_guest_pages *
+ sizeof(struct guest_page));
+ }
+
dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
sizeof(struct virtio_memory_region) * memory.nregions, 0);
if (dev->mem == NULL) {
@@ -447,6 +529,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
reg->mmap_size = mmap_size;
reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
+ add_guest_pages(dev, reg, alignment);
+
RTE_LOG(INFO, VHOST_CONFIG,
"guest memory region %u, size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
@@ -466,6 +550,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
mmap_offset);
}
+ dump_guest_pages(dev);
+
return 0;
err_mmap: