[v6,4/4] kernel/linux/kni: add IOVA support in kni module

Message ID 20190625035700.2953-5-vattunuru@marvell.com
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series
  • add IOVA = VA support in KNI
Related show

Checks

Context Check Description
ci/Intel-compilation fail Compilation issues
ci/checkpatch success coding style OK

Commit Message

Vamsi Krishna Attunuru June 25, 2019, 3:57 a.m.
From: Kiran Kumar K <kirankumark@marvell.com>

Patch adds support for kernel module to work in IOVA = VA mode,
the idea is to get physical address from iova address using
iommu_iova_to_phys API and later use phys_to_virt API to
convert the physical address to kernel virtual address.

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with the kernel versions less
than 4.4.0 because of API compatibility issues.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 kernel/linux/kni/kni_dev.h  |  3 ++
 kernel/linux/kni/kni_misc.c | 62 ++++++++++++++++++++++++++++++------
 kernel/linux/kni/kni_net.c  | 76 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 119 insertions(+), 22 deletions(-)

Comments

Ferruh Yigit July 11, 2019, 4:30 p.m. | #1
On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> Patch adds support for kernel module to work in IOVA = VA mode,
> the idea is to get physical address from iova address using
> iommu_iova_to_phys API and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> When compared with IOVA = PA mode, there is no performance
> drop with this approach.
> 
> This approach does not work with the kernel versions less
> than 4.4.0 because of API compatibility issues.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

<...>

> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE

We have "kni/compat.h" to put the version checks, please use abstracted feature
checks only in the code.
From experience this goes ugly quickly with the addition to distro kernels and
their specific versioning, so better to hide these all from the source code.

And this version requirement needs to be documented in kni doc.

> +		(void)pci;
> +		pr_err("Kernel version is not supported\n");

Can you please include 'iova_mode' condition into the message log, because this
kernel version is supported if user wants to use via 'iova_mode == 0' condition.

> +		return -EINVAL;
> +#else
> +		pci = pci_get_device(dev_info.vendor_id,
> +				     dev_info.device_id, NULL);
> +		while (pci) {
> +			if ((pci->bus->number == dev_info.bus) &&
> +			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> +			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
> +				domain = iommu_get_domain_for_dev(&pci->dev);
> +				break;
> +			}
> +			pci = pci_get_device(dev_info.vendor_id,
> +					     dev_info.device_id, pci);
> +		}

What if 'pci' is NULL here?
In kni it is not required to provide a device at all.

<...>

> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>  			return;
>  
>  		for (i = 0; i < num_rx; i++) {
> -			kva = pa2kva(kni->pa[i]);
> +			if (likely(kni->iova_mode == 1))
> +				kva = iova2kva(kni, kni->pa[i]);
> +			else
> +				kva = pa2kva(kni->pa[i]);

To reduce the churn, what about updating the 'pa2kva()' and put the
"(kni->iova_mode == 1)" check there? Does it help? (not only 'pa2kva()' but its
friends also, and if it makes more sense agree to rename the functions)

And btw, why 'likely' case is "kni->iova_mode == 1"?
Stephen Hemminger July 11, 2019, 4:43 p.m. | #2
On Tue, 25 Jun 2019 09:27:00 +0530
<vattunuru@marvell.com> wrote:

> diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
> index 1fc5eeb..b70c827 100644
> --- a/kernel/linux/kni/kni_misc.c
> +++ b/kernel/linux/kni/kni_misc.c
> @@ -294,6 +294,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	struct rte_kni_device_info dev_info;
>  	struct net_device *net_dev = NULL;
>  	struct kni_dev *kni, *dev, *n;
> +	struct pci_dev *pci = NULL;
> +	struct iommu_domain *domain = NULL;

Please don't do unnecessary initailization. It defeats the purpose
of compiler and static checkers.

> +	phys_addr_t phys_addr;
>  
>  	pr_info("Creating kni...\n");
>  	/* Check the buffer size, to avoid warning */
> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> +		(void)pci;
> +		pr_err("Kernel version is not supported\n");
> +		return -EINVAL;
> +#else
> +		pci = pci_get_device(dev_info.vendor_id,
> +				     dev_info.device_id, NULL);
> +		while (pci) {
> +			if ((pci->bus->number == dev_info.bus) &&
> +			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> +			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
> +				domain = iommu_get_domain_for_dev(&pci->dev);
> +				break;
> +			}
> +			pci = pci_get_device(dev_info.vendor_id,
> +					     dev_info.device_id, pci);
> +		}
> +#endif

Why not move the variable pci inside the if() statement, then (void)pci
is unnecessary.
Ferruh Yigit July 12, 2019, 11:10 a.m. | #3
On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> 
> 
> 
> --------------------------------------------------------------------------------
> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> *Sent:* Thursday, July 11, 2019 10:00 PM
> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni
> module
>  
> External Email
> 
> ----------------------------------------------------------------------
> On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
>> From: Kiran Kumar K <kirankumark@marvell.com>
>> 
>> Patch adds support for kernel module to work in IOVA = VA mode,
>> the idea is to get physical address from iova address using
>> iommu_iova_to_phys API and later use phys_to_virt API to
>> convert the physical address to kernel virtual address.
>> 
>> When compared with IOVA = PA mode, there is no performance
>> drop with this approach.
>> 
>> This approach does not work with the kernel versions less
>> than 4.4.0 because of API compatibility issues.
>> 
>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> 
> <...>
> 
>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>  
>>        /* Translate user space info into kernel space info */
>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
>> -
>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
>> -     kni->sync_va = dev_info.sync_va;
>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>> +     if (dev_info.iova_mode) {
>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> 
> We have "kni/compat.h" to put the version checks, please use abstracted feature
> checks only in the code.
> From experience this goes ugly quickly with the addition to distro kernels and
> their specific versioning, so better to hide these all from the source code.
> 
> And this version requirement needs to be documented in kni doc.
> 
> ack
> 
>> +             (void)pci;
>> +             pr_err("Kernel version is not supported\n");
> 
> Can you please include 'iova_mode' condition into the message log, because this
> kernel version is supported if user wants to use via 'iova_mode == 0' condition.
> 
> ack
> 
>> +             return -EINVAL;
>> +#else
>> +             pci = pci_get_device(dev_info.vendor_id,
>> +                                  dev_info.device_id, NULL);
>> +             while (pci) {
>> +                     if ((pci->bus->number == dev_info.bus) &&
>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
>> +                         (PCI_FUNC(pci->devfn) == dev_info.function)) {
>> +                             domain = iommu_get_domain_for_dev(&pci->dev);
>> +                             break;
>> +                     }
>> +                     pci = pci_get_device(dev_info.vendor_id,
>> +                                          dev_info.device_id, pci);
>> +             }
> 
> What if 'pci' is NULL here?
> In kni it is not required to provide a device at all.
> 
> Ack, will add a NULL check.
> other point is not clear to me, device info is absolutely required at least
> for  IOVA=VA mode, since it requires to procure iommu domain details.

"device info is absolutely required" *only* for IOVA=VA mode, so user may skip
to provide it.

> Any thoughts or ways to address this without device.?

Return error if 'iova_mode' requested but device info not?

But you didn't replied to passing 'iova_mode' from application, I would like
hear what you are thinking about it..

> 
> <...>
> 
>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>                        return;
>>  
>>                for (i = 0; i < num_rx; i++) {
>> -                     kva = pa2kva(kni->pa[i]);
>> +                     if (likely(kni->iova_mode == 1))
>> +                             kva = iova2kva(kni, kni->pa[i]);
>> +                     else
>> +                             kva = pa2kva(kni->pa[i]);
> 
> To reduce the churn, what about updating the 'pa2kva()' and put the
> "(kni->iova_mode == 1)" check there? Does it help? (not only 'pa2kva()' but its
> friends also, and if it makes more sense agree to rename the functions)
> 
> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova address might
> crash, hence the if..else check is added.

I understand that part.
What I am suggestion is something like this:

kva = common_function(kni, kni->pa[i]);

---

common_function() {
	if (unlikely(kni->iova_mode == 1))
		return iova2kva(kni, kni->pa[i]);
	return pa2kva(kni->pa[i]);
}

To hide the check in the function and make code more readable

> 
> And btw, why 'likely' case is "kni->iova_mode == 1"?
> 
> no specific case other than branch predict, will remove this if it's really
> harmful to PA mode.
Vamsi Krishna Attunuru July 12, 2019, 4:29 p.m. | #4
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, July 12, 2019 4:40 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in
> kni module
> 
> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> >
> >
> >
> > ----------------------------------------------------------------------
> > ----------
> > *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> > *Sent:* Thursday, July 11, 2019 10:00 PM
> > *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> > *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> > Kokkilagadda
> > *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support
> > in kni module
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> >> From: Kiran Kumar K <kirankumark@marvell.com>
> >>
> >> Patch adds support for kernel module to work in IOVA = VA mode, the
> >> idea is to get physical address from iova address using
> >> iommu_iova_to_phys API and later use phys_to_virt API to convert the
> >> physical address to kernel virtual address.
> >>
> >> When compared with IOVA = PA mode, there is no performance drop
> with
> >> this approach.
> >>
> >> This approach does not work with the kernel versions less than 4.4.0
> >> because of API compatibility issues.
> >>
> >> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > <...>
> >
> >> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
> >>ioctl_num,
> >>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >>
> >>        /* Translate user space info into kernel space info */
> >> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
> >> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
> >> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> >> -     kni->free_q = phys_to_virt(dev_info.free_phys);
> >> -
> >> -     kni->req_q = phys_to_virt(dev_info.req_phys);
> >> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
> >> -     kni->sync_va = dev_info.sync_va;
> >> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> >> +     if (dev_info.iova_mode) {
> >> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> >
> > We have "kni/compat.h" to put the version checks, please use
> > abstracted feature checks only in the code.
> > From experience this goes ugly quickly with the addition to distro
> > kernels and their specific versioning, so better to hide these all from the
> source code.
> >
> > And this version requirement needs to be documented in kni doc.
> >
> > ack
> >
> >> +             (void)pci;
> >> +             pr_err("Kernel version is not supported\n");
> >
> > Can you please include 'iova_mode' condition into the message log,
> > because this kernel version is supported if user wants to use via
> 'iova_mode == 0' condition.
> >
> > ack
> >
> >> +             return -EINVAL;
> >> +#else
> >> +             pci = pci_get_device(dev_info.vendor_id,
> >> +                                  dev_info.device_id, NULL);
> >> +             while (pci) {
> >> +                     if ((pci->bus->number == dev_info.bus) &&
> >> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> >> +                         (PCI_FUNC(pci->devfn) ==
> >> +dev_info.function)) {
> >> +                             domain =
> >> +iommu_get_domain_for_dev(&pci->dev);
> >> +                             break;
> >> +                     }
> >> +                     pci = pci_get_device(dev_info.vendor_id,
> >> +                                          dev_info.device_id, pci);
> >> +             }
> >
> > What if 'pci' is NULL here?
> > In kni it is not required to provide a device at all.
> >
> > Ack, will add a NULL check.
> > other point is not clear to me, device info is absolutely required at
> > least for  IOVA=VA mode, since it requires to procure iommu domain
> details.
> 
> "device info is absolutely required" *only* for IOVA=VA mode, so user may
> skip to provide it.
> 
> > Any thoughts or ways to address this without device.?
> 
> Return error if 'iova_mode' requested but device info not?
> 
> But you didn't replied to passing 'iova_mode' from application, I would like
> hear what you are thinking about it..

One query regarding defining config for kni
Where this config comes, eal or kni sample app or KNI public API?

> 
> >
> > <...>
> >
> >> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
> >>                        return;
> >>
> >>                for (i = 0; i < num_rx; i++) {
> >> -                     kva = pa2kva(kni->pa[i]);
> >> +                     if (likely(kni->iova_mode == 1))
> >> +                             kva = iova2kva(kni, kni->pa[i]);
> >> +                     else
> >> +                             kva = pa2kva(kni->pa[i]);
> >
> > To reduce the churn, what about updating the 'pa2kva()' and put the
> > "(kni->iova_mode == 1)" check there? Does it help? (not only
> > 'pa2kva()' but its friends also, and if it makes more sense agree to
> > rename the functions)
> >
> > No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
> > address might crash, hence the if..else check is added.
> 
> I understand that part.
> What I am suggestion is something like this:
> 
> kva = common_function(kni, kni->pa[i]);
> 
> ---
> 
> common_function() {
> 	if (unlikely(kni->iova_mode == 1))
> 		return iova2kva(kni, kni->pa[i]);
> 	return pa2kva(kni->pa[i]);
> }
> 
> To hide the check in the function and make code more readable
> 
> >
> > And btw, why 'likely' case is "kni->iova_mode == 1"?
> >
> > no specific case other than branch predict, will remove this if it's
> > really harmful to PA mode.
Ferruh Yigit July 15, 2019, 11:26 a.m. | #5
On 7/12/2019 5:29 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Friday, July 12, 2019 4:40 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
>> Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in
>> kni module
>>
>> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
>>>
>>>
>>>
>>> ----------------------------------------------------------------------
>>> ----------
>>> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
>>> *Sent:* Thursday, July 11, 2019 10:00 PM
>>> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
>>> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
>>> Kokkilagadda
>>> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support
>>> in kni module
>>>
>>> External Email
>>>
>>> ----------------------------------------------------------------------
>>> On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
>>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>>
>>>> Patch adds support for kernel module to work in IOVA = VA mode, the
>>>> idea is to get physical address from iova address using
>>>> iommu_iova_to_phys API and later use phys_to_virt API to convert the
>>>> physical address to kernel virtual address.
>>>>
>>>> When compared with IOVA = PA mode, there is no performance drop
>> with
>>>> this approach.
>>>>
>>>> This approach does not work with the kernel versions less than 4.4.0
>>>> because of API compatibility issues.
>>>>
>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> <...>
>>>
>>>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
>>>> ioctl_num,
>>>>         strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>>>
>>>>         /* Translate user space info into kernel space info */
>>>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
>>>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
>>>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>>>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
>>>> -
>>>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
>>>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
>>>> -     kni->sync_va = dev_info.sync_va;
>>>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>>>> +     if (dev_info.iova_mode) {
>>>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
>>>
>>> We have "kni/compat.h" to put the version checks, please use
>>> abstracted feature checks only in the code.
>>> From experience this goes ugly quickly with the addition to distro
>>> kernels and their specific versioning, so better to hide these all from the
>> source code.
>>>
>>> And this version requirement needs to be documented in kni doc.
>>>
>>> ack
>>>
>>>> +             (void)pci;
>>>> +             pr_err("Kernel version is not supported\n");
>>>
>>> Can you please include 'iova_mode' condition into the message log,
>>> because this kernel version is supported if user wants to use via
>> 'iova_mode == 0' condition.
>>>
>>> ack
>>>
>>>> +             return -EINVAL;
>>>> +#else
>>>> +             pci = pci_get_device(dev_info.vendor_id,
>>>> +                                  dev_info.device_id, NULL);
>>>> +             while (pci) {
>>>> +                     if ((pci->bus->number == dev_info.bus) &&
>>>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
>>>> +                         (PCI_FUNC(pci->devfn) ==
>>>> +dev_info.function)) {
>>>> +                             domain =
>>>> +iommu_get_domain_for_dev(&pci->dev);
>>>> +                             break;
>>>> +                     }
>>>> +                     pci = pci_get_device(dev_info.vendor_id,
>>>> +                                          dev_info.device_id, pci);
>>>> +             }
>>>
>>> What if 'pci' is NULL here?
>>> In kni it is not required to provide a device at all.
>>>
>>> Ack, will add a NULL check.
>>> other point is not clear to me, device info is absolutely required at
>>> least for  IOVA=VA mode, since it requires to procure iommu domain
>> details.
>>
>> "device info is absolutely required" *only* for IOVA=VA mode, so user may
>> skip to provide it.
>>
>>> Any thoughts or ways to address this without device.?
>>
>> Return error if 'iova_mode' requested but device info not?
>>
>> But you didn't replied to passing 'iova_mode' from application, I would like
>> hear what you are thinking about it..
> 
> One query regarding defining config for kni
> Where this config comes, eal or kni sample app or KNI public API?

Config comes from the application, but the KNI public API has to validate if the
request can be justified and return error if can't be.
I think the KNI API check is required to be able to remove the check in the eal.

> 
>>
>>>
>>> <...>
>>>
>>>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>>>                         return;
>>>>
>>>>                 for (i = 0; i < num_rx; i++) {
>>>> -                     kva = pa2kva(kni->pa[i]);
>>>> +                     if (likely(kni->iova_mode == 1))
>>>> +                             kva = iova2kva(kni, kni->pa[i]);
>>>> +                     else
>>>> +                             kva = pa2kva(kni->pa[i]);
>>>
>>> To reduce the churn, what about updating the 'pa2kva()' and put the
>>> "(kni->iova_mode == 1)" check there? Does it help? (not only
>>> 'pa2kva()' but its friends also, and if it makes more sense agree to
>>> rename the functions)
>>>
>>> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
>>> address might crash, hence the if..else check is added.
>>
>> I understand that part.
>> What I am suggestion is something like this:
>>
>> kva = common_function(kni, kni->pa[i]);
>>
>> ---
>>
>> common_function() {
>> 	if (unlikely(kni->iova_mode == 1))
>> 		return iova2kva(kni, kni->pa[i]);
>> 	return pa2kva(kni->pa[i]);
>> }
>>
>> To hide the check in the function and make code more readable
>>
>>>
>>> And btw, why 'likely' case is "kni->iova_mode == 1"?
>>>
>>> no specific case other than branch predict, will remove this if it's
>>> really harmful to PA mode.
>
Vamsi Krishna Attunuru July 15, 2019, 1:06 p.m. | #6
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Monday, July 15, 2019 4:57 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni
> module
> 
> On 7/12/2019 5:29 PM, Vamsi Krishna Attunuru wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ferruh Yigit <ferruh.yigit@intel.com>
> >> Sent: Friday, July 12, 2019 4:40 PM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> >> Kokkilagadda <kirankumark@marvell.com>
> >> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA
> >> support in kni module
> >>
> >> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> >>>
> >>>
> >>>
> >>> --------------------------------------------------------------------
> >>> --
> >>> ----------
> >>> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> >>> *Sent:* Thursday, July 11, 2019 10:00 PM
> >>> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> >>> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> >>> Kokkilagadda
> >>> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA
> >>> support in kni module
> >>>
> >>> External Email
> >>>
> >>> --------------------------------------------------------------------
> >>> -- On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> >>>> From: Kiran Kumar K <kirankumark@marvell.com>
> >>>>
> >>>> Patch adds support for kernel module to work in IOVA = VA mode, the
> >>>> idea is to get physical address from iova address using
> >>>> iommu_iova_to_phys API and later use phys_to_virt API to convert
> >>>> the physical address to kernel virtual address.
> >>>>
> >>>> When compared with IOVA = PA mode, there is no performance drop
> >> with
> >>>> this approach.
> >>>>
> >>>> This approach does not work with the kernel versions less than
> >>>> 4.4.0 because of API compatibility issues.
> >>>>
> >>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> <...>
> >>>
> >>>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
> >>>> ioctl_num,
> >>>>         strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >>>>
> >>>>         /* Translate user space info into kernel space info */
> >>>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
> >>>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
> >>>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> >>>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
> >>>> -
> >>>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
> >>>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
> >>>> -     kni->sync_va = dev_info.sync_va;
> >>>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> >>>> +     if (dev_info.iova_mode) {
> >>>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> >>>
> >>> We have "kni/compat.h" to put the version checks, please use
> >>> abstracted feature checks only in the code.
> >>> From experience this goes ugly quickly with the addition to distro
> >>> kernels and their specific versioning, so better to hide these all
> >>> from the
> >> source code.
> >>>
> >>> And this version requirement needs to be documented in kni doc.
> >>>
> >>> ack
> >>>
> >>>> +             (void)pci;
> >>>> +             pr_err("Kernel version is not supported\n");
> >>>
> >>> Can you please include 'iova_mode' condition into the message log,
> >>> because this kernel version is supported if user wants to use via
> >> 'iova_mode == 0' condition.
> >>>
> >>> ack
> >>>
> >>>> +             return -EINVAL;
> >>>> +#else
> >>>> +             pci = pci_get_device(dev_info.vendor_id,
> >>>> +                                  dev_info.device_id, NULL);
> >>>> +             while (pci) {
> >>>> +                     if ((pci->bus->number == dev_info.bus) &&
> >>>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid)
> >>>> +&&
> >>>> +                         (PCI_FUNC(pci->devfn) ==
> >>>> +dev_info.function)) {
> >>>> +                             domain =
> >>>> +iommu_get_domain_for_dev(&pci->dev);
> >>>> +                             break;
> >>>> +                     }
> >>>> +                     pci = pci_get_device(dev_info.vendor_id,
> >>>> +                                          dev_info.device_id,
> >>>> +pci);
> >>>> +             }
> >>>
> >>> What if 'pci' is NULL here?
> >>> In kni it is not required to provide a device at all.
> >>>
> >>> Ack, will add a NULL check.
> >>> other point is not clear to me, device info is absolutely required
> >>> at least for  IOVA=VA mode, since it requires to procure iommu
> >>> domain
> >> details.
> >>
> >> "device info is absolutely required" *only* for IOVA=VA mode, so user
> >> may skip to provide it.
> >>
> >>> Any thoughts or ways to address this without device.?
> >>
> >> Return error if 'iova_mode' requested but device info not?
> >>
> >> But you didn't replied to passing 'iova_mode' from application, I
> >> would like hear what you are thinking about it..
> >
> > One query regarding defining config for kni Where this config comes,
> > eal or kni sample app or KNI public API?
> 
> Config comes from the application, but the KNI public API has to validate if the
> request can be justified and return error if can't be.
> I think the KNI API check is required to be able to remove the check in the eal.
> 

If eal is enabled in iova=VA mode, kni application can operate in that VA mode right.
I did not understand the application's use or requirement to configure/enforce PA mode
when eal is enabled with iova=VA mode.

How about using rte_eal_iova_mode() == VA check in kni application and kni lib to pass device
info(pci) and also for pool creation(with no page split flag), meaning all these patch changes will go
under that check.

Current eal check(is kni module inserted when eal mode is VA) also can be addressed by checking 
linux version, where if rte_eal_iova_mode() is  VA but kernel version < 4.4.0, 
PA mode can be enforced and all will be configured in PA mode.

With above approach, I think PA mode is still intact and there would not be any issues.

> >
> >>
> >>>
> >>> <...>
> >>>
> >>>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
> >>>>                         return;
> >>>>
> >>>>                 for (i = 0; i < num_rx; i++) {
> >>>> -                     kva = pa2kva(kni->pa[i]);
> >>>> +                     if (likely(kni->iova_mode == 1))
> >>>> +                             kva = iova2kva(kni, kni->pa[i]);
> >>>> +                     else
> >>>> +                             kva = pa2kva(kni->pa[i]);
> >>>
> >>> To reduce the churn, what about updating the 'pa2kva()' and put the
> >>> "(kni->iova_mode == 1)" check there? Does it help? (not only
> >>> 'pa2kva()' but its friends also, and if it makes more sense agree to
> >>> rename the functions)
> >>>
> >>> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
> >>> address might crash, hence the if..else check is added.
> >>
> >> I understand that part.
> >> What I am suggestion is something like this:
> >>
> >> kva = common_function(kni, kni->pa[i]);
> >>
> >> ---
> >>
> >> common_function() {
> >> 	if (unlikely(kni->iova_mode == 1))
> >> 		return iova2kva(kni, kni->pa[i]);
> >> 	return pa2kva(kni->pa[i]);
> >> }
> >>
> >> To hide the check in the function and make code more readable
> >>
> >>>
> >>> And btw, why 'likely' case is "kni->iova_mode == 1"?
> >>>
> >>> no specific case other than branch predict, will remove this if it's
> >>> really harmful to PA mode.
> >

Patch

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index d57bce6..6ad53c7 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -23,6 +23,7 @@ 
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -39,6 +40,8 @@  struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
 	struct net_device_stats stats;
 	int status;
 	uint16_t group_id;           /* Group ID of a group of KNI devices */
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 1fc5eeb..b70c827 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -294,6 +294,9 @@  kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -351,15 +354,56 @@  kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
+		(void)pci;
+		pr_err("Kernel version is not supported\n");
+		return -EINVAL;
+#else
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index ad83658..92d5991 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,22 @@  static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt((iommu_iova_to_phys(kni->domain,
+					(uintptr_t)m->buf_physaddr) +
+			     m->data_off));
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +202,10 @@  kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = iova2kva(kni, kni->pa[i]);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}
 
@@ -263,8 +282,13 @@  kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = iova2kva(kni, pkt_pa);
+			data_kva = iova2data_kva(kni, pkt_kva);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -335,9 +359,14 @@  kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -434,13 +463,21 @@  kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+
+			if (likely(kni->iova_mode == 1)) {
+				kva = iova2kva(kni, kni->pa[i]);
+				data_kva = iova2data_kva(kni, kva);
+				alloc_kva = iova2kva(kni, kni->alloc_pa[i]);
+				alloc_data_kva = iova2data_kva(kni, alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				data_kva = kva2data_kva(kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+			}
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -507,9 +544,16 @@  kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
+
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +589,14 @@  kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;
 
-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = iova2kva(kni,
+						       va2pa(kva->next, kva));
+					data_kva = iova2data_kva(kni, kva);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}