[dpdk-dev,v8,04/14] eal/pci: Consolidate pci address comparison APIs

Message ID 1424060073-23484-5-git-send-email-mukawa@igel.co.jp (mailing list archive)
State Superseded, archived
Headers

Commit Message

Tetsuya Mukawa Feb. 16, 2015, 4:14 a.m. UTC
This patch replaces pci_addr_comparison() and memcmp() of pci addresses by
eal_compare_pci_addr().

v8:
- Fix pci_scan_one() to update sysfs values.
  (Thanks to Qiu, Michael and Iremonger, Bernard)
v5:
- Fix pci_scan_one to handle pt_driver correctly.
v4:
- Fix calculation method of eal_compare_pci_addr().
- Add parameter checking.

Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
---
 lib/librte_eal/bsdapp/eal/eal_pci.c       | 29 ++++++++++++--------------
 lib/librte_eal/common/eal_common_pci.c    |  2 +-
 lib/librte_eal/common/include/rte_pci.h   | 34 +++++++++++++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_pci.c     | 30 +++++++++++++--------------
 lib/librte_eal/linuxapp/eal/eal_pci_uio.c |  2 +-
 5 files changed, 63 insertions(+), 34 deletions(-)
  

Comments

Iremonger, Bernard Feb. 16, 2015, 4:19 p.m. UTC | #1
> -----Original Message-----
> From: Tetsuya Mukawa [mailto:mukawa@igel.co.jp]
> Sent: Monday, February 16, 2015 4:14 AM
> To: dev@dpdk.org
> Cc: Qiu, Michael; Iremonger, Bernard; Tetsuya Mukawa
> Subject: [PATCH v8 04/14] eal/pci: Consolidate pci address comparison APIs
> 
> This patch replaces pci_addr_comparison() and memcmp() of pci addresses by
> eal_compare_pci_addr().
> 
> v8:
> - Fix pci_scan_one() to update sysfs values.
>   (Thanks to Qiu, Michael and Iremonger, Bernard)
> v5:
> - Fix pci_scan_one to handle pt_driver correctly.
> v4:
> - Fix calculation method of eal_compare_pci_addr().
> - Add parameter checking.
> 
> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>

Acked-by: Bernard Iremonger <bernard.iremonger@intel.com>
  
Thomas Monjalon Feb. 17, 2015, 12:44 a.m. UTC | #2
2015-02-16 13:14, Tetsuya Mukawa:
> This patch replaces pci_addr_comparison() and memcmp() of pci addresses by
> eal_compare_pci_addr().
> 
> v8:
> - Fix pci_scan_one() to update sysfs values.
>   (Thanks to Qiu, Michael and Iremonger, Bernard)
> v5:
> - Fix pci_scan_one to handle pt_driver correctly.
> v4:
> - Fix calculation method of eal_compare_pci_addr().
> - Add parameter checking.
> 
> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> ---
>  lib/librte_eal/bsdapp/eal/eal_pci.c       | 29 ++++++++++++--------------
>  lib/librte_eal/common/eal_common_pci.c    |  2 +-
>  lib/librte_eal/common/include/rte_pci.h   | 34 +++++++++++++++++++++++++++++++
>  lib/librte_eal/linuxapp/eal/eal_pci.c     | 30 +++++++++++++--------------
>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c |  2 +-
>  5 files changed, 63 insertions(+), 34 deletions(-)
> 
> diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
> index 74ecce7..7dbdccd 100644
> --- a/lib/librte_eal/bsdapp/eal/eal_pci.c
> +++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
> @@ -270,20 +270,6 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  	return (0);
>  }
>  
> -/* Compare two PCI device addresses. */
> -static int
> -pci_addr_comparison(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
> -{
> -	uint64_t dev_addr = (addr->domain << 24) + (addr->bus << 16) + (addr->devid << 8) + addr->function;
> -	uint64_t dev_addr2 = (addr2->domain << 24) + (addr2->bus << 16) + (addr2->devid << 8) + addr2->function;
> -
> -	if (dev_addr > dev_addr2)
> -		return 1;
> -	else
> -		return 0;
> -}
> -
> -
>  /* Scan one pci sysfs entry, and fill the devices list from it. */
>  static int
>  pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
>  	}
>  	else {
>  		struct rte_pci_device *dev2 = NULL;
> +		int ret;
>  
>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
> +			if (ret > 0)
>  				continue;
> -			else {
> +			else if (ret < 0) {
>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
>  				return 0;
> +			} else { /* already registered */
> +				/* update pt_driver */
> +				dev2->pt_driver = dev->pt_driver;
> +				dev2->max_vfs = dev->max_vfs;
> +				memmove(dev2->mem_resource,
> +					dev->mem_resource,
> +					sizeof(dev->mem_resource));
> +				free(dev);
> +				return 0;

Could you comment this "else part" please?

[...]
> --- a/lib/librte_eal/common/include/rte_pci.h
> +++ b/lib/librte_eal/common/include/rte_pci.h
> @@ -269,6 +269,40 @@ eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr)
>  }
>  #undef GET_PCIADDR_FIELD
>  
> +/* Compare two PCI device addresses. */
> +/**
> + * Utility function to compare two PCI device addresses.
> + *
> + * @param addr
> + *	The PCI Bus-Device-Function address to compare
> + * @param addr2
> + *	The PCI Bus-Device-Function address to compare
> + * @return
> + *	0 on equal PCI address.
> + *	Positive on addr is greater than addr2.
> + *	Negative on addr is less than addr2, or error.
> + */
> +static inline int
> +eal_compare_pci_addr(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)

I think that this function should be prefixed with rte_

[...]
>  		/* skip this element if it doesn't match our PCI address */
> -		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
> +		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))

Why memcmp is not sufficient to compare PCI addresses?
The only exception I see is endianness for natural sorting.

>  			continue;
>  
>  		for (i = 0; i != uio_res->nb_maps; i++) {
>
  
Tetsuya Mukawa Feb. 17, 2015, 6:14 a.m. UTC | #3
On 2015/02/17 9:44, Thomas Monjalon wrote:
> 2015-02-16 13:14, Tetsuya Mukawa:
>> This patch replaces pci_addr_comparison() and memcmp() of pci addresses by
>> eal_compare_pci_addr().
>>
>> v8:
>> - Fix pci_scan_one() to update sysfs values.
>>   (Thanks to Qiu, Michael and Iremonger, Bernard)
>> v5:
>> - Fix pci_scan_one to handle pt_driver correctly.
>> v4:
>> - Fix calculation method of eal_compare_pci_addr().
>> - Add parameter checking.
>>
>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>> ---
>>  lib/librte_eal/bsdapp/eal/eal_pci.c       | 29 ++++++++++++--------------
>>  lib/librte_eal/common/eal_common_pci.c    |  2 +-
>>  lib/librte_eal/common/include/rte_pci.h   | 34 +++++++++++++++++++++++++++++++
>>  lib/librte_eal/linuxapp/eal/eal_pci.c     | 30 +++++++++++++--------------
>>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c |  2 +-
>>  5 files changed, 63 insertions(+), 34 deletions(-)
>>
>> diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
>> index 74ecce7..7dbdccd 100644
>> --- a/lib/librte_eal/bsdapp/eal/eal_pci.c
>> +++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
>> @@ -270,20 +270,6 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>>  	return (0);
>>  }
>>  
>> -/* Compare two PCI device addresses. */
>> -static int
>> -pci_addr_comparison(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
>> -{
>> -	uint64_t dev_addr = (addr->domain << 24) + (addr->bus << 16) + (addr->devid << 8) + addr->function;
>> -	uint64_t dev_addr2 = (addr2->domain << 24) + (addr2->bus << 16) + (addr2->devid << 8) + addr2->function;
>> -
>> -	if (dev_addr > dev_addr2)
>> -		return 1;
>> -	else
>> -		return 0;
>> -}
>> -
>> -
>>  /* Scan one pci sysfs entry, and fill the devices list from it. */
>>  static int
>>  pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
>>  	}
>>  	else {
>>  		struct rte_pci_device *dev2 = NULL;
>> +		int ret;
>>  
>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
>> +			if (ret > 0)
>>  				continue;
>> -			else {
>> +			else if (ret < 0) {
>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
>>  				return 0;
>> +			} else { /* already registered */
>> +				/* update pt_driver */
>> +				dev2->pt_driver = dev->pt_driver;
>> +				dev2->max_vfs = dev->max_vfs;
>> +				memmove(dev2->mem_resource,
>> +					dev->mem_resource,
>> +					sizeof(dev->mem_resource));
>> +				free(dev);
>> +				return 0;
> Could you comment this "else part" please?

PCI device list is allocated when rte_eal_init() is called. At the time,
to fill pci device information, sysfs value is used.
If sysfs values written by kernel device driver will not be changed by
igb_uio, vfio or pci_uio_genereic, above code isn't needed.
But actually above values are changed or added by them.

Here is a step to cause issue.
1. Boot linux.
2. Start DPDK application without any physical NIC ports.
 - Here, some sysfs values are read, and store to pci device list.
3. igb_uio starts managing a port.
 - Here, some sysfs values are changed.
4. Add a NIC port to DPDK application using hotplug functions.
 - Here, we need to replace some values.

> [...]
>> --- a/lib/librte_eal/common/include/rte_pci.h
>> +++ b/lib/librte_eal/common/include/rte_pci.h
>> @@ -269,6 +269,40 @@ eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr)
>>  }
>>  #undef GET_PCIADDR_FIELD
>>  
>> +/* Compare two PCI device addresses. */
>> +/**
>> + * Utility function to compare two PCI device addresses.
>> + *
>> + * @param addr
>> + *	The PCI Bus-Device-Function address to compare
>> + * @param addr2
>> + *	The PCI Bus-Device-Function address to compare
>> + * @return
>> + *	0 on equal PCI address.
>> + *	Positive on addr is greater than addr2.
>> + *	Negative on addr is less than addr2, or error.
>> + */
>> +static inline int
>> +eal_compare_pci_addr(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
> I think that this function should be prefixed with rte_

Sure, I will.

> [...]
>>  		/* skip this element if it doesn't match our PCI address */
>> -		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
>> +		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))
> Why memcmp is not sufficient to compare PCI addresses?
> The only exception I see is endianness for natural sorting.

Here is the definition.

struct rte_pci_addr {
        uint16_t domain;                /**< Device domain */
        uint8_t bus;                    /**< Device bus */
        uint8_t devid;                  /**< Device ID */
        uint8_t function;               /**< Device function. */
};

But, sizeof(struct rte_pci_addr) will be 6.
If rte_pci_addr is allocated in a function without bzero, last 1 byte
may have some value.
As a result, memcmp may not work. To avoid such a case, I checked like
above.

>>  			continue;
>>  
>>  		for (i = 0; i != uio_res->nb_maps; i++) {
>>
  
Thomas Monjalon Feb. 18, 2015, 1:02 a.m. UTC | #4
2015-02-17 15:14, Tetsuya Mukawa:
> On 2015/02/17 9:44, Thomas Monjalon wrote:
> > 2015-02-16 13:14, Tetsuya Mukawa:
> >> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
> >>  	}
> >>  	else {
> >>  		struct rte_pci_device *dev2 = NULL;
> >> +		int ret;
> >>  
> >>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
> >> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
> >> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
> >> +			if (ret > 0)
> >>  				continue;
> >> -			else {
> >> +			else if (ret < 0) {
> >>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
> >>  				return 0;
> >> +			} else { /* already registered */
> >> +				/* update pt_driver */
> >> +				dev2->pt_driver = dev->pt_driver;
> >> +				dev2->max_vfs = dev->max_vfs;
> >> +				memmove(dev2->mem_resource,
> >> +					dev->mem_resource,
> >> +					sizeof(dev->mem_resource));
> >> +				free(dev);
> >> +				return 0;
> > Could you comment this "else part" please?
> 
> PCI device list is allocated when rte_eal_init() is called. At the time,
> to fill pci device information, sysfs value is used.
> If sysfs values written by kernel device driver will not be changed by
> igb_uio, vfio or pci_uio_genereic, above code isn't needed.
> But actually above values are changed or added by them.
> 
> Here is a step to cause issue.
> 1. Boot linux.
> 2. Start DPDK application without any physical NIC ports.
>  - Here, some sysfs values are read, and store to pci device list.
> 3. igb_uio starts managing a port.
>  - Here, some sysfs values are changed.
> 4. Add a NIC port to DPDK application using hotplug functions.
>  - Here, we need to replace some values.

I think that you are showing that something is wrongly designed in these
EAL structures. I suggest to try cleaning this mess instead of workarounding.

[...]
> >> -		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
> >> +		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))
> > Why memcmp is not sufficient to compare PCI addresses?
> > The only exception I see is endianness for natural sorting.
> 
> Here is the definition.
> 
> struct rte_pci_addr {
>         uint16_t domain;                /**< Device domain */
>         uint8_t bus;                    /**< Device bus */
>         uint8_t devid;                  /**< Device ID */
>         uint8_t function;               /**< Device function. */
> };
> 
> But, sizeof(struct rte_pci_addr) will be 6.
> If rte_pci_addr is allocated in a function without bzero, last 1 byte
> may have some value.
> As a result, memcmp may not work. To avoid such a case, I checked like
> above.

OK thanks. That's the kind of information which is valuable in a commit log.
  
Tetsuya Mukawa Feb. 18, 2015, 1:55 a.m. UTC | #5
On 2015/02/18 10:02, Thomas Monjalon wrote:
> 2015-02-17 15:14, Tetsuya Mukawa:
>> On 2015/02/17 9:44, Thomas Monjalon wrote:
>>> 2015-02-16 13:14, Tetsuya Mukawa:
>>>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
>>>>  	}
>>>>  	else {
>>>>  		struct rte_pci_device *dev2 = NULL;
>>>> +		int ret;
>>>>  
>>>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
>>>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
>>>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
>>>> +			if (ret > 0)
>>>>  				continue;
>>>> -			else {
>>>> +			else if (ret < 0) {
>>>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
>>>>  				return 0;
>>>> +			} else { /* already registered */
>>>> +				/* update pt_driver */
>>>> +				dev2->pt_driver = dev->pt_driver;
>>>> +				dev2->max_vfs = dev->max_vfs;
>>>> +				memmove(dev2->mem_resource,
>>>> +					dev->mem_resource,
>>>> +					sizeof(dev->mem_resource));
>>>> +				free(dev);
>>>> +				return 0;
>>> Could you comment this "else part" please?
>> PCI device list is allocated when rte_eal_init() is called. At the time,
>> to fill pci device information, sysfs value is used.
>> If sysfs values written by kernel device driver will not be changed by
>> igb_uio, vfio or pci_uio_genereic, above code isn't needed.
>> But actually above values are changed or added by them.
>>
>> Here is a step to cause issue.
>> 1. Boot linux.
>> 2. Start DPDK application without any physical NIC ports.
>>  - Here, some sysfs values are read, and store to pci device list.
>> 3. igb_uio starts managing a port.
>>  - Here, some sysfs values are changed.
>> 4. Add a NIC port to DPDK application using hotplug functions.
>>  - Here, we need to replace some values.
> I think that you are showing that something is wrongly designed in these
> EAL structures. I suggest to try cleaning this mess instead of workarounding.
>
> [...]
>>>> -		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
>>>> +		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))
>>> Why memcmp is not sufficient to compare PCI addresses?
>>> The only exception I see is endianness for natural sorting.
>> Here is the definition.
>>
>> struct rte_pci_addr {
>>         uint16_t domain;                /**< Device domain */
>>         uint8_t bus;                    /**< Device bus */
>>         uint8_t devid;                  /**< Device ID */
>>         uint8_t function;               /**< Device function. */
>> };
>>
>> But, sizeof(struct rte_pci_addr) will be 6.
>> If rte_pci_addr is allocated in a function without bzero, last 1 byte
>> may have some value.
>> As a result, memcmp may not work. To avoid such a case, I checked like
>> above.
> OK thanks. That's the kind of information which is valuable in a commit log.
>

Sure I will add it.

Thanks,
Tetsuya
  
Iremonger, Bernard Feb. 18, 2015, 10:26 a.m. UTC | #6
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa
> Sent: Wednesday, February 18, 2015 1:55 AM
> To: Thomas Monjalon
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v8 04/14] eal/pci: Consolidate pci address comparison APIs
> 
> On 2015/02/18 10:02, Thomas Monjalon wrote:
> > 2015-02-17 15:14, Tetsuya Mukawa:
> >> On 2015/02/17 9:44, Thomas Monjalon wrote:
> >>> 2015-02-16 13:14, Tetsuya Mukawa:
> >>>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
> >>>>  	}
> >>>>  	else {
> >>>>  		struct rte_pci_device *dev2 = NULL;
> >>>> +		int ret;
> >>>>
> >>>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
> >>>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
> >>>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
> >>>> +			if (ret > 0)
> >>>>  				continue;
> >>>> -			else {
> >>>> +			else if (ret < 0) {
> >>>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
> >>>>  				return 0;
> >>>> +			} else { /* already registered */
> >>>> +				/* update pt_driver */
> >>>> +				dev2->pt_driver = dev->pt_driver;
> >>>> +				dev2->max_vfs = dev->max_vfs;
> >>>> +				memmove(dev2->mem_resource,
> >>>> +					dev->mem_resource,
> >>>> +					sizeof(dev->mem_resource));
> >>>> +				free(dev);
> >>>> +				return 0;
> >>> Could you comment this "else part" please?
> >> PCI device list is allocated when rte_eal_init() is called. At the
> >> time, to fill pci device information, sysfs value is used.
> >> If sysfs values written by kernel device driver will not be changed
> >> by igb_uio, vfio or pci_uio_genereic, above code isn't needed.
> >> But actually above values are changed or added by them.
> >>
> >> Here is a step to cause issue.
> >> 1. Boot linux.
> >> 2. Start DPDK application without any physical NIC ports.
> >>  - Here, some sysfs values are read, and store to pci device list.
> >> 3. igb_uio starts managing a port.
> >>  - Here, some sysfs values are changed.
> >> 4. Add a NIC port to DPDK application using hotplug functions.
> >>  - Here, we need to replace some values.
> > I think that you are showing that something is wrongly designed in
> > these EAL structures. I suggest to try cleaning this mess instead of workarounding.

Hi Tetsuya, Thomas,
I think that redesigning the EAL structures is beyond the scope of this patchset and should be undertaken as a separate task.
I suspect there may be a problem in the original code when  a device which was using a kernel driver is bound to igb_uio.  The igb_uio  driver adds /sys/bus/pci/devices/0000\:05\:00.0/max_vfs.

Regards,

Bernard.

> >
> > [...]
> >>>> -		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
> >>>> +		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))
> >>> Why memcmp is not sufficient to compare PCI addresses?
> >>> The only exception I see is endianness for natural sorting.
> >> Here is the definition.
> >>
> >> struct rte_pci_addr {
> >>         uint16_t domain;                /**< Device domain */
> >>         uint8_t bus;                    /**< Device bus */
> >>         uint8_t devid;                  /**< Device ID */
> >>         uint8_t function;               /**< Device function. */
> >> };
> >>
> >> But, sizeof(struct rte_pci_addr) will be 6.
> >> If rte_pci_addr is allocated in a function without bzero, last 1 byte
> >> may have some value.
> >> As a result, memcmp may not work. To avoid such a case, I checked
> >> like above.
> > OK thanks. That's the kind of information which is valuable in a commit log.
> >
> 
> Sure I will add it.
> 
> Thanks,
> Tetsuya
  
Thomas Monjalon Feb. 18, 2015, 10:32 a.m. UTC | #7
Hi Bernard,

2015-02-18 10:26, Iremonger, Bernard:
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa
> > On 2015/02/18 10:02, Thomas Monjalon wrote:
> > > 2015-02-17 15:14, Tetsuya Mukawa:
> > >> On 2015/02/17 9:44, Thomas Monjalon wrote:
> > >>> 2015-02-16 13:14, Tetsuya Mukawa:
> > >>>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
> > >>>>  	}
> > >>>>  	else {
> > >>>>  		struct rte_pci_device *dev2 = NULL;
> > >>>> +		int ret;
> > >>>>
> > >>>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
> > >>>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
> > >>>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
> > >>>> +			if (ret > 0)
> > >>>>  				continue;
> > >>>> -			else {
> > >>>> +			else if (ret < 0) {
> > >>>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
> > >>>>  				return 0;
> > >>>> +			} else { /* already registered */
> > >>>> +				/* update pt_driver */
> > >>>> +				dev2->pt_driver = dev->pt_driver;
> > >>>> +				dev2->max_vfs = dev->max_vfs;
> > >>>> +				memmove(dev2->mem_resource,
> > >>>> +					dev->mem_resource,
> > >>>> +					sizeof(dev->mem_resource));
> > >>>> +				free(dev);
> > >>>> +				return 0;
> > >>> Could you comment this "else part" please?
> > >> PCI device list is allocated when rte_eal_init() is called. At the
> > >> time, to fill pci device information, sysfs value is used.
> > >> If sysfs values written by kernel device driver will not be changed
> > >> by igb_uio, vfio or pci_uio_genereic, above code isn't needed.
> > >> But actually above values are changed or added by them.
> > >>
> > >> Here is a step to cause issue.
> > >> 1. Boot linux.
> > >> 2. Start DPDK application without any physical NIC ports.
> > >>  - Here, some sysfs values are read, and store to pci device list.
> > >> 3. igb_uio starts managing a port.
> > >>  - Here, some sysfs values are changed.
> > >> 4. Add a NIC port to DPDK application using hotplug functions.
> > >>  - Here, we need to replace some values.
> > > 
> > > I think that you are showing that something is wrongly designed in
> > > these EAL structures. I suggest to try cleaning this mess instead of workarounding.
> 
> Hi Tetsuya, Thomas,
> I think that redesigning the EAL structures is beyond the scope of this patchset and should be undertaken as a separate task.

I strongly disagrees this opinion. We should never workaround design problems
and add more complex/weird code.
I think that this kind of consideration is the heart of some design problems we
have to face today. Please let's stop adding some code which just works without
thinking the whole design.

> I suspect there may be a problem in the original code when  a device which was using a kernel driver is bound to igb_uio.  The igb_uio  driver adds /sys/bus/pci/devices/0000\:05\:00.0/max_vfs.
  
Iremonger, Bernard Feb. 18, 2015, 11:39 a.m. UTC | #8
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Wednesday, February 18, 2015 10:33 AM
> To: Iremonger, Bernard
> Cc: Tetsuya Mukawa; dev@dpdk.org; ivan.boule@6wind.com
> Subject: Re: [dpdk-dev] [PATCH v8 04/14] eal/pci: Consolidate pci address comparison APIs
> 
> Hi Bernard,
> 
> 2015-02-18 10:26, Iremonger, Bernard:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa
> > > On 2015/02/18 10:02, Thomas Monjalon wrote:
> > > > 2015-02-17 15:14, Tetsuya Mukawa:
> > > >> On 2015/02/17 9:44, Thomas Monjalon wrote:
> > > >>> 2015-02-16 13:14, Tetsuya Mukawa:
> > > >>>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
> > > >>>>  	}
> > > >>>>  	else {
> > > >>>>  		struct rte_pci_device *dev2 = NULL;
> > > >>>> +		int ret;
> > > >>>>
> > > >>>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
> > > >>>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
> > > >>>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
> > > >>>> +			if (ret > 0)
> > > >>>>  				continue;
> > > >>>> -			else {
> > > >>>> +			else if (ret < 0) {
> > > >>>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
> > > >>>>  				return 0;
> > > >>>> +			} else { /* already registered */
> > > >>>> +				/* update pt_driver */
> > > >>>> +				dev2->pt_driver = dev->pt_driver;
> > > >>>> +				dev2->max_vfs = dev->max_vfs;
> > > >>>> +				memmove(dev2->mem_resource,
> > > >>>> +					dev->mem_resource,
> > > >>>> +					sizeof(dev->mem_resource));
> > > >>>> +				free(dev);
> > > >>>> +				return 0;
> > > >>> Could you comment this "else part" please?
> > > >> PCI device list is allocated when rte_eal_init() is called. At
> > > >> the time, to fill pci device information, sysfs value is used.
> > > >> If sysfs values written by kernel device driver will not be
> > > >> changed by igb_uio, vfio or pci_uio_genereic, above code isn't needed.
> > > >> But actually above values are changed or added by them.
> > > >>
> > > >> Here is a step to cause issue.
> > > >> 1. Boot linux.
> > > >> 2. Start DPDK application without any physical NIC ports.
> > > >>  - Here, some sysfs values are read, and store to pci device list.
> > > >> 3. igb_uio starts managing a port.
> > > >>  - Here, some sysfs values are changed.
> > > >> 4. Add a NIC port to DPDK application using hotplug functions.
> > > >>  - Here, we need to replace some values.
> > > >
> > > > I think that you are showing that something is wrongly designed in
> > > > these EAL structures. I suggest to try cleaning this mess instead of workarounding.
> >
> > Hi Tetsuya, Thomas,
> > I think that redesigning the EAL structures is beyond the scope of this patchset and should be
> undertaken as a separate task.
> 
> I strongly disagrees this opinion. We should never workaround design problems and add more
> complex/weird code.
> I think that this kind of consideration is the heart of some design problems we have to face today.
> Please let's stop adding some code which just works without thinking the whole design.
> 
> > I suspect there may be a problem in the original code when  a device which was using a kernel driver
> is bound to igb_uio.  The igb_uio  driver adds /sys/bus/pci/devices/0000\:05\:00.0/max_vfs.

Hi Tomas, Tetsuya,

In general, I agree that we should not workaround design problems.
In this case I don't think there is a problem with the rte_pci_device and pci_device_list structures.
The "already registered" device has been replaced. It would probably be cleaner to remove the "already registered" device from the list and then add the new device to the list rather than update the "already registered" device.

Regards,

Bernard.
  
Tetsuya Mukawa Feb. 18, 2015, 12:20 p.m. UTC | #9
On 2015/02/18 20:39, Iremonger, Bernard wrote:
>
>> -----Original Message-----
>> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
>> Sent: Wednesday, February 18, 2015 10:33 AM
>> To: Iremonger, Bernard
>> Cc: Tetsuya Mukawa; dev@dpdk.org; ivan.boule@6wind.com
>> Subject: Re: [dpdk-dev] [PATCH v8 04/14] eal/pci: Consolidate pci address comparison APIs
>>
>> Hi Bernard,
>>
>> 2015-02-18 10:26, Iremonger, Bernard:
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa
>>>> On 2015/02/18 10:02, Thomas Monjalon wrote:
>>>>> 2015-02-17 15:14, Tetsuya Mukawa:
>>>>>> On 2015/02/17 9:44, Thomas Monjalon wrote:
>>>>>>> 2015-02-16 13:14, Tetsuya Mukawa:
>>>>>>>> @@ -356,13 +342,24 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
>>>>>>>>  	}
>>>>>>>>  	else {
>>>>>>>>  		struct rte_pci_device *dev2 = NULL;
>>>>>>>> +		int ret;
>>>>>>>>
>>>>>>>>  		TAILQ_FOREACH(dev2, &pci_device_list, next) {
>>>>>>>> -			if (pci_addr_comparison(&dev->addr, &dev2->addr))
>>>>>>>> +			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
>>>>>>>> +			if (ret > 0)
>>>>>>>>  				continue;
>>>>>>>> -			else {
>>>>>>>> +			else if (ret < 0) {
>>>>>>>>  				TAILQ_INSERT_BEFORE(dev2, dev, next);
>>>>>>>>  				return 0;
>>>>>>>> +			} else { /* already registered */
>>>>>>>> +				/* update pt_driver */
>>>>>>>> +				dev2->pt_driver = dev->pt_driver;
>>>>>>>> +				dev2->max_vfs = dev->max_vfs;
>>>>>>>> +				memmove(dev2->mem_resource,
>>>>>>>> +					dev->mem_resource,
>>>>>>>> +					sizeof(dev->mem_resource));
>>>>>>>> +				free(dev);
>>>>>>>> +				return 0;
>>>>>>> Could you comment this "else part" please?
>>>>>> PCI device list is allocated when rte_eal_init() is called. At
>>>>>> the time, to fill pci device information, sysfs value is used.
>>>>>> If sysfs values written by kernel device driver will not be
>>>>>> changed by igb_uio, vfio or pci_uio_genereic, above code isn't needed.
>>>>>> But actually above values are changed or added by them.
>>>>>>
>>>>>> Here is a step to cause issue.
>>>>>> 1. Boot linux.
>>>>>> 2. Start DPDK application without any physical NIC ports.
>>>>>>  - Here, some sysfs values are read, and store to pci device list.
>>>>>> 3. igb_uio starts managing a port.
>>>>>>  - Here, some sysfs values are changed.
>>>>>> 4. Add a NIC port to DPDK application using hotplug functions.
>>>>>>  - Here, we need to replace some values.
>>>>> I think that you are showing that something is wrongly designed in
>>>>> these EAL structures. I suggest to try cleaning this mess instead of workarounding.
>>> Hi Tetsuya, Thomas,
>>> I think that redesigning the EAL structures is beyond the scope of this patchset and should be
>> undertaken as a separate task.
>>
>> I strongly disagrees this opinion. We should never workaround design problems and add more
>> complex/weird code.
>> I think that this kind of consideration is the heart of some design problems we have to face today.
>> Please let's stop adding some code which just works without thinking the whole design.
>>
>>> I suspect there may be a problem in the original code when  a device which was using a kernel driver
>> is bound to igb_uio.  The igb_uio  driver adds /sys/bus/pci/devices/0000\:05\:00.0/max_vfs.
> Hi Tomas, Tetsuya,
>
> In general, I agree that we should not workaround design problems.
> In this case I don't think there is a problem with the rte_pci_device and pci_device_list structures.

I agree with it.

> The "already registered" device has been replaced. It would probably be cleaner to remove the "already registered" device from the list and then add the new device to the list rather than update the "already registered" device.
>

I guess "replacing" will not work, because rte_pci_device structure is
also registered in rte_eth_dev structure.
If we remove and free the pci device, I guess something goes wrong in
ethdev library.
Just removing is one more option, but it means there is a working pci
device that is not registered in the pci_device_list. I guess it's weird.

I still think updating may be correct behavior.
The pci_device_list is used like below when rte_eal_init() is called.

1. When rte_eal_pci_init() is called, all pci devices are registered in
the pci_device_list.
2. When rte_eal_dev_init() is called, dev_driver_list is traversed, and
if a PCI device for a driver is found in the pci_device_list, init() of
the driver is called.

I guess it's not so strange design.
But this design assumes pci_device_list is latest while matching a
driver registered in dev_driver_list.

 Now, we have hotplug patch.
I guess we should do same thing.
Before matching, we should update the pci_device_list.

Thanks,
Tetsuya

> Regards,
>
> Bernard.
>
  

Patch

diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
index 74ecce7..7dbdccd 100644
--- a/lib/librte_eal/bsdapp/eal/eal_pci.c
+++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
@@ -270,20 +270,6 @@  pci_uio_map_resource(struct rte_pci_device *dev)
 	return (0);
 }
 
-/* Compare two PCI device addresses. */
-static int
-pci_addr_comparison(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
-{
-	uint64_t dev_addr = (addr->domain << 24) + (addr->bus << 16) + (addr->devid << 8) + addr->function;
-	uint64_t dev_addr2 = (addr2->domain << 24) + (addr2->bus << 16) + (addr2->devid << 8) + addr2->function;
-
-	if (dev_addr > dev_addr2)
-		return 1;
-	else
-		return 0;
-}
-
-
 /* Scan one pci sysfs entry, and fill the devices list from it. */
 static int
 pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
@@ -356,13 +342,24 @@  pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
 	}
 	else {
 		struct rte_pci_device *dev2 = NULL;
+		int ret;
 
 		TAILQ_FOREACH(dev2, &pci_device_list, next) {
-			if (pci_addr_comparison(&dev->addr, &dev2->addr))
+			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
+			if (ret > 0)
 				continue;
-			else {
+			else if (ret < 0) {
 				TAILQ_INSERT_BEFORE(dev2, dev, next);
 				return 0;
+			} else { /* already registered */
+				/* update pt_driver */
+				dev2->pt_driver = dev->pt_driver;
+				dev2->max_vfs = dev->max_vfs;
+				memmove(dev2->mem_resource,
+					dev->mem_resource,
+					sizeof(dev->mem_resource));
+				free(dev);
+				return 0;
 			}
 		}
 		TAILQ_INSERT_TAIL(&pci_device_list, dev, next);
diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c
index f3c7f71..a89f5c3 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -93,7 +93,7 @@  static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev)
 		if (devargs->type != RTE_DEVTYPE_BLACKLISTED_PCI &&
 			devargs->type != RTE_DEVTYPE_WHITELISTED_PCI)
 			continue;
-		if (!memcmp(&dev->addr, &devargs->pci.addr, sizeof(dev->addr)))
+		if (!eal_compare_pci_addr(&dev->addr, &devargs->pci.addr))
 			return devargs;
 	}
 	return NULL;
diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 7f2d699..4814cd7 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -269,6 +269,40 @@  eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr)
 }
 #undef GET_PCIADDR_FIELD
 
+/* Compare two PCI device addresses. */
+/**
+ * Utility function to compare two PCI device addresses.
+ *
+ * @param addr
+ *	The PCI Bus-Device-Function address to compare
+ * @param addr2
+ *	The PCI Bus-Device-Function address to compare
+ * @return
+ *	0 on equal PCI address.
+ *	Positive on addr is greater than addr2.
+ *	Negative on addr is less than addr2, or error.
+ */
+static inline int
+eal_compare_pci_addr(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
+{
+	uint64_t dev_addr, dev_addr2;
+
+	if ((addr == NULL) || (addr2 == NULL))
+		return -1;
+
+	dev_addr = (addr->domain << 24) | (addr->bus << 16) |
+				(addr->devid << 8) | addr->function;
+	dev_addr2 = (addr2->domain << 24) | (addr2->bus << 16) |
+				(addr2->devid << 8) | addr2->function;
+
+	if (dev_addr > dev_addr2)
+		return 1;
+	else if (dev_addr < dev_addr2)
+		return -1;
+	else
+		return 0;
+}
+
 /**
  * Probe the PCI bus for registered drivers.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 3c463b2..2d5f6a6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -41,6 +41,7 @@ 
 #include <rte_eal_memconfig.h>
 #include <rte_malloc.h>
 #include <rte_devargs.h>
+#include <rte_memcpy.h>
 
 #include "rte_pci_dev_ids.h"
 #include "eal_filesystem.h"
@@ -229,20 +230,6 @@  error:
 	return -1;
 }
 
-/* Compare two PCI device addresses. */
-static int
-pci_addr_comparison(struct rte_pci_addr *addr, struct rte_pci_addr *addr2)
-{
-	uint64_t dev_addr = (addr->domain << 24) + (addr->bus << 16) + (addr->devid << 8) + addr->function;
-	uint64_t dev_addr2 = (addr2->domain << 24) + (addr2->bus << 16) + (addr2->devid << 8) + addr2->function;
-
-	if (dev_addr > dev_addr2)
-		return 1;
-	else
-		return 0;
-}
-
-
 /* Scan one pci sysfs entry, and fill the devices list from it. */
 static int
 pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
@@ -360,13 +347,24 @@  pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
 	}
 	else {
 		struct rte_pci_device *dev2 = NULL;
+		int ret;
 
 		TAILQ_FOREACH(dev2, &pci_device_list, next) {
-			if (pci_addr_comparison(&dev->addr, &dev2->addr))
+			ret = eal_compare_pci_addr(&dev->addr, &dev2->addr);
+			if (ret > 0)
 				continue;
-			else {
+			else if (ret < 0) {
 				TAILQ_INSERT_BEFORE(dev2, dev, next);
 				return 0;
+			} else { /* already registered */
+				/* update pt_driver */
+				dev2->pt_driver = dev->pt_driver;
+				dev2->max_vfs = dev->max_vfs;
+				memmove(dev2->mem_resource,
+					dev->mem_resource,
+					sizeof(dev->mem_resource));
+				free(dev);
+				return 0;
 			}
 		}
 		TAILQ_INSERT_TAIL(&pci_device_list, dev, next);
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index e53f06b..1da3507 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -123,7 +123,7 @@  pci_uio_map_secondary(struct rte_pci_device *dev)
 	TAILQ_FOREACH(uio_res, pci_res_list, next) {
 
 		/* skip this element if it doesn't match our PCI address */
-		if (memcmp(&uio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+		if (eal_compare_pci_addr(&uio_res->pci_addr, &dev->addr))
 			continue;
 
 		for (i = 0; i != uio_res->nb_maps; i++) {