bus/pci: check if 5-level paging is enabled when testing IOMMU address width
Checks
Commit Message
Kernel version 4.14 was released with support for 5-level paging.
When PML5 is enabled, user-space virtual addresses use up to 56 bits.
See the kernel's Documentation/x86/x86_64/mm.txt.
Signed-off-by: Drocula <quzeyao@gmail.com>
---
drivers/bus/pci/linux/pci.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
Comments
On 05-Aug-18 7:41 PM, Drocula wrote:
> The kernel version 4.14 released with the support of 5-level paging.
> When PML5 enabled, user-space virtual addresses uses up to 56 bits.
> see kernel's Documentation/x86/x86_64/mm.txt.
>
> Signed-off-by: Drocula <quzeyao@gmail.com>
> ---
> drivers/bus/pci/linux/pci.c | 27 +++++++++++++++++++++++++--
> 1 file changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
> index 004600f..8913d6d 100644
> --- a/drivers/bus/pci/linux/pci.c
> +++ b/drivers/bus/pci/linux/pci.c
> @@ -4,6 +4,7 @@
>
> #include <string.h>
> #include <dirent.h>
> +#include <sys/mman.h>
>
> #include <rte_log.h>
> #include <rte_bus.h>
> @@ -553,12 +554,34 @@
> }
>
> #if defined(RTE_ARCH_X86)
> +/*
> + * Try to detect whether the system uses 5-level page table.
> + */
> +static bool
> +system_uses_PML5(void)
> +{
> + void *page_4k, *mask = (void *)0xf0000000000000;
> + page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +
> + if (page_4k == (void *) -1)
> + return false;
Shouldn't this be MAP_FAILED?
> + munmap(page_4k, 4096);
> +
> + if ((unsigned long)page_4k & (unsigned long)mask)
> + return true;
> + return false;
> +}
> +
> static bool
> pci_one_device_iommu_support_va(struct rte_pci_device *dev)
> {
> #define VTD_CAP_MGAW_SHIFT 16
> #define VTD_CAP_MGAW_MASK (0x3fULL << VTD_CAP_MGAW_SHIFT)
> -#define X86_VA_WIDTH 47 /* From Documentation/x86/x86_64/mm.txt */
> +/* From Documentation/x86/x86_64/mm.txt */
> +#define X86_VA_WIDTH_PML4 47
> +#define X86_VA_WIDTH_PML5 56
> +
> struct rte_pci_addr *addr = &dev->addr;
> char filename[PATH_MAX];
> FILE *fp;
> @@ -589,7 +612,7 @@
> fclose(fp);
>
> mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
> - if (mgaw < X86_VA_WIDTH)
> + if (mgaw < (system_uses_PML5() ? X86_VA_WIDTH_PML5 : X86_VA_WIDTH_PML4))
This is perhaps nitpicking and a question of personal preference, but I
think storing this in a variable would be more readable than using a ternary
operator inside an if statement.
> return false;
>
> return true;
>
Thanks for the patch, there are some minor style/cleanups that
could be done.
> #if defined(RTE_ARCH_X86)
Isn't this going to apply to 64 bit only?
> +/*
> + * Try to detect whether the system uses 5-level page table.
> + */
> +static bool
> +system_uses_PML5(void)
> +{
> + void *page_4k, *mask = (void *)0xf0000000000000;
Magic constants expressed like this seem wrong. Why not use
shift to make it obvious.
Also, you are assuming a particular layout of memory on
Linux which might be problematic. Plus if there is already
some memory in use there, it won't work.
> + page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
Since you are probing maybe MAP_FIXED is what you want.
> +
> + if (page_4k == (void *) -1)
> + return false;
Use MAP_FAILED here.
> + munmap(page_4k, 4096);
> +
> + if ((unsigned long)page_4k & (unsigned long)mask)
> + return true;
> + return false;
Wouldn't this work the same for what you expect?
return page_4k == mask;
I.e., you expect the kernel to put the page where you want it.
Thanks, will refine in v2.
On Thu, Aug 9, 2018, 18:49 Burakov, Anatoly <anatoly.burakov@intel.com>
wrote:
> On 05-Aug-18 7:41 PM, Drocula wrote:
> > The kernel version 4.14 released with the support of 5-level paging.
> > When PML5 enabled, user-space virtual addresses uses up to 56 bits.
> > see kernel's Documentation/x86/x86_64/mm.txt.
> >
> > Signed-off-by: Drocula <quzeyao@gmail.com>
> > ---
> > drivers/bus/pci/linux/pci.c | 27 +++++++++++++++++++++++++--
> > 1 file changed, 25 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
> > index 004600f..8913d6d 100644
> > --- a/drivers/bus/pci/linux/pci.c
> > +++ b/drivers/bus/pci/linux/pci.c
> > @@ -4,6 +4,7 @@
> >
> > #include <string.h>
> > #include <dirent.h>
> > +#include <sys/mman.h>
> >
> > #include <rte_log.h>
> > #include <rte_bus.h>
> > @@ -553,12 +554,34 @@
> > }
> >
> > #if defined(RTE_ARCH_X86)
> > +/*
> > + * Try to detect whether the system uses 5-level page table.
> > + */
> > +static bool
> > +system_uses_PML5(void)
> > +{
> > + void *page_4k, *mask = (void *)0xf0000000000000;
> > + page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
> > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > +
> > + if (page_4k == (void *) -1)
> > + return false;
>
> Shouldn't this be MAP_FAILED?
>
> > + munmap(page_4k, 4096);
> > +
> > + if ((unsigned long)page_4k & (unsigned long)mask)
> > + return true;
> > + return false;
> > +}
> > +
> > static bool
> > pci_one_device_iommu_support_va(struct rte_pci_device *dev)
> > {
> > #define VTD_CAP_MGAW_SHIFT 16
> > #define VTD_CAP_MGAW_MASK (0x3fULL << VTD_CAP_MGAW_SHIFT)
> > -#define X86_VA_WIDTH 47 /* From Documentation/x86/x86_64/mm.txt */
> > +/* From Documentation/x86/x86_64/mm.txt */
> > +#define X86_VA_WIDTH_PML4 47
> > +#define X86_VA_WIDTH_PML5 56
> > +
> > struct rte_pci_addr *addr = &dev->addr;
> > char filename[PATH_MAX];
> > FILE *fp;
> > @@ -589,7 +612,7 @@
> > fclose(fp);
> >
> > mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) +
> 1;
> > - if (mgaw < X86_VA_WIDTH)
> > + if (mgaw < (system_uses_PML5() ? X86_VA_WIDTH_PML5 :
> X86_VA_WIDTH_PML4))
>
> This is perhaps nitpicking and a question of personal preferences, but i
> think storing this in a var would be more readable than doing ternary
> operator inside of an if statement.
>
> > return false;
> >
> > return true;
> >
>
>
> --
> Thanks,
> Anatoly
>
First, thanks for your suggestions.
When using the MAP_FIXED flag, mmap will return MAP_FAILED if
0xf0000000000000 is not available.
In this case, I want mmap to return an address near 0xf0000000000000.
I will submit v2.
On Fri, Aug 10, 2018, 01:03 Stephen Hemminger <stephen@networkplumber.org>
wrote:
> Thanks for the patch, there are some minor style/cleanups that
> could be done.
>
> > #if defined(RTE_ARCH_X86)
>
> Isn't this going to apply to 64 bit only?
>
> > +/*
> > + * Try to detect whether the system uses 5-level page table.
> > + */
> > +static bool
> > +system_uses_PML5(void)
> > +{
> > + void *page_4k, *mask = (void *)0xf0000000000000;
>
> Magic constants expressed like this seem wrong. Why not use
> shift to make it obvious.
>
> Also, you are assuming a particular layout of memory on
> Linux which might be problematic. Plus if there is already
> some memory in use there, it won't work.
>
> > + page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
> > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>
> Since you are probing maybe MAP_FIXED is what you want.
>
> > +
> > + if (page_4k == (void *) -1)
> > + return false;
> Use MMAP_FAILED here.
>
> > + munmap(page_4k, 4096);
> > +
> > + if ((unsigned long)page_4k & (unsigned long)mask)
> > + return true;
> > + return false;
>
> Wouldn't this work the same for what you expect?
> return page_4k == mask;
>
> I.e you expect kernel to put page where you want.
>
On 10-Aug-18 9:35 AM, Drocula wrote:
> First, thanks for your suggestions.
>
> When using the MAP_FIXED flag, mmap will return an MMAP_FAILED if
> 0xf0000000000000 is not available.
>
> In this case, I want mmap to return an address near 0xf0000000000000.
>
> I will submit v2.
How can we be sure there's nothing mapped at that address? I think the
original code was correct - try mapping around that address, and see if
we get *something* close to it with the right bits set. MAP_FIXED seems
dangerous to use without knowing that there's nothing there. Recent
kernels have added a safer version of MAP_FIXED, but obviously it won't
work on the majority of kernel versions we support.
>
> On Fri, Aug 10, 2018, 01:03 Stephen Hemminger <stephen@networkplumber.org>
> wrote:
>
>> Thanks for the patch, there are some minor style/cleanups that
>> could be done.
>>
>>> #if defined(RTE_ARCH_X86)
>>
>> Isn't this going to apply to 64 bit only?
>>
>>> +/*
>>> + * Try to detect whether the system uses 5-level page table.
>>> + */
>>> +static bool
>>> +system_uses_PML5(void)
>>> +{
>>> + void *page_4k, *mask = (void *)0xf0000000000000;
>>
>> Magic constants expressed like this seem wrong. Why not use
>> shift to make it obvious.
>>
>> Also, you are assuming a particular layout of memory on
>> Linux which might be problematic. Plus if there is already
>> some memory in use there, it won't work.
>>
>>> + page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
>>> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>>
>> Since you are probing maybe MAP_FIXED is what you want.
>>
>>> +
>>> + if (page_4k == (void *) -1)
>>> + return false;
>> Use MMAP_FAILED here.
>>
>>> + munmap(page_4k, 4096);
>>> +
>>> + if ((unsigned long)page_4k & (unsigned long)mask)
>>> + return true;
>>> + return false;
>>
>> Wouldn't this work the same for what you expect?
>> return page_4k == mask;
>>
>> I.e you expect kernel to put page where you want.
>>
>
This patch is no longer relevant since the current DPDK code
no longer depends on VA width. It should be rejected.
@@ -4,6 +4,7 @@
#include <string.h>
#include <dirent.h>
+#include <sys/mman.h>
#include <rte_log.h>
#include <rte_bus.h>
@@ -553,12 +554,34 @@
}
#if defined(RTE_ARCH_X86)
+/*
+ * Try to detect whether the system uses 5-level page table.
+ */
+static bool
+system_uses_PML5(void)
+{
+ void *page_4k, *mask = (void *)0xf0000000000000;
+ page_4k = mmap(mask, 4096, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (page_4k == (void *) -1)
+ return false;
+ munmap(page_4k, 4096);
+
+ if ((unsigned long)page_4k & (unsigned long)mask)
+ return true;
+ return false;
+}
+
static bool
pci_one_device_iommu_support_va(struct rte_pci_device *dev)
{
#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK (0x3fULL << VTD_CAP_MGAW_SHIFT)
-#define X86_VA_WIDTH 47 /* From Documentation/x86/x86_64/mm.txt */
+/* From Documentation/x86/x86_64/mm.txt */
+#define X86_VA_WIDTH_PML4 47
+#define X86_VA_WIDTH_PML5 56
+
struct rte_pci_addr *addr = &dev->addr;
char filename[PATH_MAX];
FILE *fp;
@@ -589,7 +612,7 @@
fclose(fp);
mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
- if (mgaw < X86_VA_WIDTH)
+ if (mgaw < (system_uses_PML5() ? X86_VA_WIDTH_PML5 : X86_VA_WIDTH_PML4))
return false;
return true;