[v11,11/25] eal/dev: implement device iteration

Message ID d5ea19aaec29793e7e9570c5458475df1f60eab3.1531345404.git.gaetan.rivet@6wind.com (mailing list archive)
State Accepted, archived
Headers
Series Device querying |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Gaëtan Rivet July 11, 2018, 9:45 p.m. UTC
  Use the iteration hooks in the abstraction layers to perform the
requested filtering on the internal device lists.

Signed-off-by: Gaetan Rivet <gaetan.rivet@6wind.com>
---
 lib/librte_eal/common/eal_common_dev.c  | 168 ++++++++++++++++++++++++
 lib/librte_eal/common/include/rte_dev.h |  26 ++++
 lib/librte_eal/rte_eal_version.map      |   1 +
 3 files changed, 195 insertions(+)
  

Comments

Shreyansh Jain July 12, 2018, 10:58 a.m. UTC | #1
On Thursday 12 July 2018 03:15 AM, Gaetan Rivet wrote:
> Use the iteration hooks in the abstraction layers to perform the
> requested filtering on the internal device lists.
> 
> Signed-off-by: Gaetan Rivet <gaetan.rivet@6wind.com>
> ---
>   lib/librte_eal/common/eal_common_dev.c  | 168 ++++++++++++++++++++++++
>   lib/librte_eal/common/include/rte_dev.h |  26 ++++
>   lib/librte_eal/rte_eal_version.map      |   1 +
>   3 files changed, 195 insertions(+)
> 
> diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
> index 63e329bd8..b78845f02 100644
> --- a/lib/librte_eal/common/eal_common_dev.c
> +++ b/lib/librte_eal/common/eal_common_dev.c
> @@ -45,6 +45,28 @@ static struct dev_event_cb_list dev_event_cbs;
>   /* spinlock for device callbacks */
>   static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER;
>   
> +struct dev_next_ctx {
> +	struct rte_dev_iterator *it;
> +	const char *bus_str;
> +	const char *cls_str;
> +};
> +
> +#define CTX(it, bus_str, cls_str) \
> +	(&(const struct dev_next_ctx){ \
> +		.it = it, \
> +		.bus_str = bus_str, \
> +		.cls_str = cls_str, \
> +	})
> +
> +#define ITCTX(ptr) \
> +	(((struct dev_next_ctx *)(intptr_t)ptr)->it)
> +
> +#define BUSCTX(ptr) \
> +	(((struct dev_next_ctx *)(intptr_t)ptr)->bus_str)
> +
> +#define CLSCTX(ptr) \
> +	(((struct dev_next_ctx *)(intptr_t)ptr)->cls_str)
> +
>   static int cmp_detached_dev_name(const struct rte_device *dev,
>   	const void *_name)
>   {
> @@ -398,3 +420,149 @@ rte_dev_iterator_init(struct rte_dev_iterator *it,
>   get_out:
>   	return -rte_errno;
>   }
> +
> +static char *
> +dev_str_sane_copy(const char *str)
> +{
> +	size_t end;
> +	char *copy;
> +
> +	end = strcspn(str, ",/");
> +	if (str[end] == ',') {
> +		copy = strdup(&str[end + 1]);
> +	} else {
> +		/* '/' or '\0' */
> +		copy = strdup("");
> +	}

Though it doesn't change anything functionally, if you can separate 
blocks of if-else with new lines, it really makes it easier to read.
Like here...

> +	if (copy == NULL) {
> +		rte_errno = ENOMEM;
> +	} else {
> +		char *slash;
> +
> +		slash = strchr(copy, '/');
> +		if (slash != NULL)
> +			slash[0] = '\0';
> +	}
> +	return copy;
> +}
> +
> +static int
> +class_next_dev_cmp(const struct rte_class *cls,
> +		   const void *ctx)
> +{
> +	struct rte_dev_iterator *it;
> +	const char *cls_str = NULL;
> +	void *dev;
> +
> +	if (cls->dev_iterate == NULL)
> +		return 1;
> +	it = ITCTX(ctx);
> +	cls_str = CLSCTX(ctx);
> +	dev = it->class_device;
> +	/* it->cls_str != NULL means a class
> +	 * was specified in the devstr.
> +	 */
> +	if (it->cls_str != NULL && cls != it->cls)
> +		return 1;
> +	/* If an error occurred previously,
> +	 * no need to test further.
> +	 */
> +	if (rte_errno != 0)
> +		return -1;

I am guessing here by '..error occurred previously..' you mean 
sane_copy. If so, why wait until this point to return? Anyway the caller 
(rte_bus_find, probably) would only look for '0' or non-zero.

> +	dev = cls->dev_iterate(dev, cls_str, it);
> +	it->class_device = dev;
> +	return dev == NULL;
> +}
> +
> +static int
> +bus_next_dev_cmp(const struct rte_bus *bus,
> +		 const void *ctx)
> +{
> +	struct rte_device *dev = NULL;
> +	struct rte_class *cls = NULL;
> +	struct rte_dev_iterator *it;
> +	const char *bus_str = NULL;
> +
> +	if (bus->dev_iterate == NULL)
> +		return 1;
> +	it = ITCTX(ctx);
> +	bus_str = BUSCTX(ctx);
> +	dev = it->device;
> +	/* it->bus_str != NULL means a bus
> +	 * was specified in the devstr.
> +	 */
> +	if (it->bus_str != NULL && bus != it->bus)
> +		return 1;
> +	/* If an error occurred previously,
> +	 * no need to test further.
> +	 */
> +	if (rte_errno != 0)
> +		return -1;
> +	if (it->cls_str == NULL) {
> +		dev = bus->dev_iterate(dev, bus_str, it);
> +		goto end;

This is slightly confusing. If it->cls_str == NULL, you do 
bus->dev_iterate... but

> +	}
> +	/* cls_str != NULL */
> +	if (dev == NULL) {
> +next_dev_on_bus:
> +		dev = bus->dev_iterate(dev, bus_str, it);

When cls_str!=NULL, you still do bus->dev_iterate...
So, maybe they are OR case resulting in check of dev==NULL and return 
(as being done right now by jumping to out)...?

And, how can bus iterate over a 'null' device?

> +		it->device = dev;
> +	}
> +	if (dev == NULL)
> +		return 1;

Maybe, this check should move in the if(dev==NULL) above - that way, it 
can in path of 'next_dev_on_bus' yet do the same as function as its 
current location.

> +	if (it->cls != NULL)

In what case would (it->cls_str == NULL) but (it->cls != NULL)?

> +		cls = TAILQ_PREV(it->cls, rte_class_list, next);
> +	cls = rte_class_find(cls, class_next_dev_cmp, ctx);
> +	if (cls != NULL) {
> +		it->cls = cls;
> +		goto end;
> +	}
> +	goto next_dev_on_bus;

Maybe I have completely mixed your intention of this function. So, if 
you still find the above comments naive - maybe you can tell me what you 
are attempting here?
Is it: find next bus and stop if no class specified. find next class as 
well, iff that too was specified?

Reason I am confused is that bus_next_dev_cmp attempts to compare both - 
bus and class, yet class_next_dev_cmp simply stops by comparing class only.

> +end:
> +	it->device = dev;
> +	return dev == NULL;
> +}

A new line should be added here - start of a new function.

> +__rte_experimental
> +struct rte_device *
> +rte_dev_iterator_next(struct rte_dev_iterator *it)
> +{
> +	struct rte_bus *bus = NULL;
> +	int old_errno = rte_errno;
> +	char *bus_str = NULL;
> +	char *cls_str = NULL;
> +
> +	rte_errno = 0;
> +	if (it->bus_str == NULL && it->cls_str == NULL) {
> +		/* Invalid iterator. */
> +		rte_errno = EINVAL;
> +		return NULL;
> +	}
> +	if (it->bus != NULL)
> +		bus = TAILQ_PREV(it->bus, rte_bus_list, next);
> +	if (it->bus_str != NULL) {
> +		bus_str = dev_str_sane_copy(it->bus_str);
> +		if (bus_str == NULL)
> +			goto out;
> +	}
> +	if (it->cls_str != NULL) {
> +		cls_str = dev_str_sane_copy(it->cls_str);
> +		if (cls_str == NULL)
> +			goto out;
> +	}
> +	while ((bus = rte_bus_find(bus, bus_next_dev_cmp,
> +				   CTX(it, bus_str, cls_str)))) {
> +		if (it->device != NULL) {
> +			it->bus = bus;
> +			goto out;
> +		}
> +		if (it->bus_str != NULL ||
> +		    rte_errno != 0)
> +			break;
> +	}
> +	if (rte_errno == 0)
> +		rte_errno = old_errno;
> +out:
> +	free(bus_str);
> +	free(cls_str);
> +	return it->device;
> +}
> diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h
> index fdc812ff8..8638a2bbd 100644
> --- a/lib/librte_eal/common/include/rte_dev.h
> +++ b/lib/librte_eal/common/include/rte_dev.h
> @@ -355,6 +355,32 @@ __rte_experimental
>   int
>   rte_dev_iterator_init(struct rte_dev_iterator *it, const char *str);
>   
> +/**
> + * Iterates on a device iterator.
> + *
> + * Generates a new rte_device handle corresponding to the next element
> + * in the list described in comprehension by the iterator.
> + *
> + * The next object is returned, and the iterator is updated.
> + *
> + * @param it
> + *   Device iterator handle.
> + *
> + * @return
> + *   An rte_device handle if found.
> + *   NULL if an error occurred (rte_errno is set).
> + *   NULL if no device could be found (rte_errno is not set).
> + */
> +__rte_experimental
> +struct rte_device *
> +rte_dev_iterator_next(struct rte_dev_iterator *it);
> +
> +#define RTE_DEV_FOREACH(dev, devstr, it) \
> +	for (rte_dev_iterator_init(it, devstr), \
> +	     dev = rte_dev_iterator_next(it); \
> +	     dev != NULL; \
> +	     dev = rte_dev_iterator_next(it))
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index ac04120d6..4cd5ab3df 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -252,6 +252,7 @@ EXPERIMENTAL {
>   	rte_dev_event_monitor_start;
>   	rte_dev_event_monitor_stop;
>   	rte_dev_iterator_init;
> +	rte_dev_iterator_next;
>   	rte_devargs_add;
>   	rte_devargs_dump;
>   	rte_devargs_insert;
>
  
Gaëtan Rivet July 12, 2018, 3:08 p.m. UTC | #2
Hi Shreyansh,

On Thu, Jul 12, 2018 at 04:28:27PM +0530, Shreyansh Jain wrote:
> On Thursday 12 July 2018 03:15 AM, Gaetan Rivet wrote:
> > Use the iteration hooks in the abstraction layers to perform the
> > requested filtering on the internal device lists.
> > 
> > Signed-off-by: Gaetan Rivet <gaetan.rivet@6wind.com>
> > ---
> >   lib/librte_eal/common/eal_common_dev.c  | 168 ++++++++++++++++++++++++
> >   lib/librte_eal/common/include/rte_dev.h |  26 ++++
> >   lib/librte_eal/rte_eal_version.map      |   1 +
> >   3 files changed, 195 insertions(+)
> > 
> > diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
> > index 63e329bd8..b78845f02 100644
> > --- a/lib/librte_eal/common/eal_common_dev.c
> > +++ b/lib/librte_eal/common/eal_common_dev.c
> > @@ -45,6 +45,28 @@ static struct dev_event_cb_list dev_event_cbs;
> >   /* spinlock for device callbacks */
> >   static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER;
> > +struct dev_next_ctx {
> > +	struct rte_dev_iterator *it;
> > +	const char *bus_str;
> > +	const char *cls_str;
> > +};
> > +
> > +#define CTX(it, bus_str, cls_str) \
> > +	(&(const struct dev_next_ctx){ \
> > +		.it = it, \
> > +		.bus_str = bus_str, \
> > +		.cls_str = cls_str, \
> > +	})
> > +
> > +#define ITCTX(ptr) \
> > +	(((struct dev_next_ctx *)(intptr_t)ptr)->it)
> > +
> > +#define BUSCTX(ptr) \
> > +	(((struct dev_next_ctx *)(intptr_t)ptr)->bus_str)
> > +
> > +#define CLSCTX(ptr) \
> > +	(((struct dev_next_ctx *)(intptr_t)ptr)->cls_str)
> > +
> >   static int cmp_detached_dev_name(const struct rte_device *dev,
> >   	const void *_name)
> >   {
> > @@ -398,3 +420,149 @@ rte_dev_iterator_init(struct rte_dev_iterator *it,
> >   get_out:
> >   	return -rte_errno;
> >   }
> > +
> > +static char *
> > +dev_str_sane_copy(const char *str)
> > +{
> > +	size_t end;
> > +	char *copy;
> > +
> > +	end = strcspn(str, ",/");
> > +	if (str[end] == ',') {
> > +		copy = strdup(&str[end + 1]);
> > +	} else {
> > +		/* '/' or '\0' */
> > +		copy = strdup("");
> > +	}
> 
> Though it doesn't change anything functionally, if you can separate blocks
> of if-else with new lines, it really makes it easier to read.
> Like here...
> 

sure,

> > +	if (copy == NULL) {
> > +		rte_errno = ENOMEM;
> > +	} else {
> > +		char *slash;
> > +
> > +		slash = strchr(copy, '/');
> > +		if (slash != NULL)
> > +			slash[0] = '\0';
> > +	}
> > +	return copy;
> > +}
> > +
> > +static int
> > +class_next_dev_cmp(const struct rte_class *cls,
> > +		   const void *ctx)
> > +{
> > +	struct rte_dev_iterator *it;
> > +	const char *cls_str = NULL;
> > +	void *dev;
> > +
> > +	if (cls->dev_iterate == NULL)
> > +		return 1;
> > +	it = ITCTX(ctx);
> > +	cls_str = CLSCTX(ctx);
> > +	dev = it->class_device;
> > +	/* it->cls_str != NULL means a class
> > +	 * was specified in the devstr.
> > +	 */
> > +	if (it->cls_str != NULL && cls != it->cls)
> > +		return 1;
> > +	/* If an error occurred previously,
> > +	 * no need to test further.
> > +	 */
> > +	if (rte_errno != 0)
> > +		return -1;
> 
> I am guessing here by '..error occurred previously..' you mean sane_copy. If
> so, why wait until this point to return? Anyway the caller (rte_bus_find,
> probably) would only look for '0' or non-zero.
> 

No, rte_errno could be set by a bus / class implementation, for any
error occurring during a call to dev_iterate: maybe a device was lost
(hotplugged), etc. The return value of dev_iterate() cannot transmit an
error as not matching a filter is not an error. The only error channel
is rte_errno.

sane_copy was already checked before and should be cleared at this
point.

> > +	dev = cls->dev_iterate(dev, cls_str, it);
> > +	it->class_device = dev;
> > +	return dev == NULL;
> > +}
> > +
> > +static int
> > +bus_next_dev_cmp(const struct rte_bus *bus,
> > +		 const void *ctx)
> > +{
> > +	struct rte_device *dev = NULL;
> > +	struct rte_class *cls = NULL;
> > +	struct rte_dev_iterator *it;
> > +	const char *bus_str = NULL;
> > +
> > +	if (bus->dev_iterate == NULL)
> > +		return 1;
> > +	it = ITCTX(ctx);
> > +	bus_str = BUSCTX(ctx);
> > +	dev = it->device;
> > +	/* it->bus_str != NULL means a bus
> > +	 * was specified in the devstr.
> > +	 */
> > +	if (it->bus_str != NULL && bus != it->bus)
> > +		return 1;
> > +	/* If an error occurred previously,
> > +	 * no need to test further.
> > +	 */
> > +	if (rte_errno != 0)
> > +		return -1;
> > +	if (it->cls_str == NULL) {
> > +		dev = bus->dev_iterate(dev, bus_str, it);
> > +		goto end;
> 
> This is slightly confusing. If it->cls_str == NULL, you do
> bus->dev_iterate... but
> 
> > +	}
> > +	/* cls_str != NULL */
> > +	if (dev == NULL) {
> > +next_dev_on_bus:
> > +		dev = bus->dev_iterate(dev, bus_str, it);
> 
> When cls_str!=NULL, you still do bus->dev_iterate...
> So, maybe they are OR case resulting in check of dev==NULL and return (as
> being done right now by jumping to out)...?
> 

Yes, this iteration is pretty complex.

The best way to walk through it is to define the possible cases:

1. Iterating on one bus:

   if (bus_str != NULL && cls_str == NULL)

   Simplest case. You got one bus, no classes.
   Just call the current bus dev_iterate() callback, then report
   the result (whether NULL or not).

2. Iterating on one bus and one class:

   if (bus_str != NULL && cls_str != NULL)

   Possible states are:

   a. We are starting the iteration: dev is NULL.
      Iterate on the current bus.

      If device is not NULL anymore, iterate on the class.
      To ensure we stay on the same class, set cls to the previous class
      and call rte_class_find();

      If device is still NULL, return 1 to iterate on the next bus.

   b. We are in the middle of an iteration: dev is not NULL.
      We start iterating on the class to find a possible second instance
      of the same device in the class (e.g. mlx devices with multiple
      eth ports per PCI devices). If none is found, we
      come back to bus->dev_iterate() (bypassing the dev == NULL check),
      restarting this (b) cycle as many times as necessary.
      If the result is NULL, the iteration is finished.

3. Iterating on one class:

    if (bus_str == NULL && cls_str != NULL)

    The most complex case. Same as (2), however we start by the first
    bus, and if a device is NULL we will continue onto the next bus
    within the loop line 554 (within rte_dev_iterator_next).


The core of the complexity here lies in the next_dev_on_bus cycle
described in 2.b.

> And, how can bus iterate over a 'null' device?
> 

A NULL device means that we are starting the iteration: a bus will give
its first device.

> > +		it->device = dev;
> > +	}
> > +	if (dev == NULL)
> > +		return 1;
> 
> Maybe, this check should move in the if(dev==NULL) above - that way, it can
> in path of 'next_dev_on_bus' yet do the same as function as its current
> location.
> 

Yes

> > +	if (it->cls != NULL)
> 
> In what case would (it->cls_str == NULL) but (it->cls != NULL)?
> 

When one rte_device was used to spawn several class_devices: multiple
adapters per PCI addresses for example.

However, in this case, given that the matching would be useless, we skip
the class matching process and only return the rte_device. This single
rte_device is not returned multiple times.

However, if someone was giving:

bus=pci,id=00:02.0/class=eth
(str_sane_copy would set cls_str to "" here)

Then, if 00:02.0 had spawned several eth ports, it would be returned
once for each instance.

This is a pretty ambiguous case. I'm not sure of the best way to deal with
it. My decision here was to simplify the iteration if possible, as I
considered that someone that did not care for the class properties would
not care about counting the instances of the bus device. maybe I'm
wrong.

> > +		cls = TAILQ_PREV(it->cls, rte_class_list, next);
> > +	cls = rte_class_find(cls, class_next_dev_cmp, ctx);
> > +	if (cls != NULL) {
> > +		it->cls = cls;
> > +		goto end;
> > +	}
> > +	goto next_dev_on_bus;
> 
> Maybe I have completely mixed your intention of this function. So, if you
> still find the above comments naive - maybe you can tell me what you are
> attempting here?
> Is it: find next bus and stop if no class specified. find next class as
> well, iff that too was specified?
> 
> Reason I am confused is that bus_next_dev_cmp attempts to compare both - bus
> and class, yet class_next_dev_cmp simply stops by comparing class only.
> 

You are right: bus comparator will produce an iteration on the bus
devices, then on the class devices iff class is specified.

Thus the class comparator only has to iterate on class, because we
called it from the bus iterator.

> > +end:
> > +	it->device = dev;
> > +	return dev == NULL;
> > +}
> 
> A new line should be added here - start of a new function.
> 

yes


I'm pretty sorry about this code, to be honest.
This is a nasty piece with a lot of states to care for.

At least it works. I'd like to have the properties integrated so that
developpers can add their own quickly. In the meantime I could rework
this function. But simple is not easy.
  
Shreyansh Jain July 13, 2018, 7:06 a.m. UTC | #3
(I am not reducing the thread as it contains quite interesting 
discussion - so, you might have to fish for inline comments...)

On Thursday 12 July 2018 08:38 PM, Gaëtan Rivet wrote:
> Hi Shreyansh,
> 
> On Thu, Jul 12, 2018 at 04:28:27PM +0530, Shreyansh Jain wrote:
>> On Thursday 12 July 2018 03:15 AM, Gaetan Rivet wrote:
>>> Use the iteration hooks in the abstraction layers to perform the
>>> requested filtering on the internal device lists.
>>>
>>> Signed-off-by: Gaetan Rivet <gaetan.rivet@6wind.com>
>>> ---
>>>    lib/librte_eal/common/eal_common_dev.c  | 168 ++++++++++++++++++++++++
>>>    lib/librte_eal/common/include/rte_dev.h |  26 ++++
>>>    lib/librte_eal/rte_eal_version.map      |   1 +
>>>    3 files changed, 195 insertions(+)
>>>
>>> diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
>>> index 63e329bd8..b78845f02 100644
>>> --- a/lib/librte_eal/common/eal_common_dev.c
>>> +++ b/lib/librte_eal/common/eal_common_dev.c
>>> @@ -45,6 +45,28 @@ static struct dev_event_cb_list dev_event_cbs;
>>>    /* spinlock for device callbacks */
>>>    static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER;
>>> +struct dev_next_ctx {
>>> +	struct rte_dev_iterator *it;
>>> +	const char *bus_str;
>>> +	const char *cls_str;
>>> +};
>>> +
>>> +#define CTX(it, bus_str, cls_str) \
>>> +	(&(const struct dev_next_ctx){ \
>>> +		.it = it, \
>>> +		.bus_str = bus_str, \
>>> +		.cls_str = cls_str, \
>>> +	})
>>> +
>>> +#define ITCTX(ptr) \
>>> +	(((struct dev_next_ctx *)(intptr_t)ptr)->it)
>>> +
>>> +#define BUSCTX(ptr) \
>>> +	(((struct dev_next_ctx *)(intptr_t)ptr)->bus_str)
>>> +
>>> +#define CLSCTX(ptr) \
>>> +	(((struct dev_next_ctx *)(intptr_t)ptr)->cls_str)
>>> +
>>>    static int cmp_detached_dev_name(const struct rte_device *dev,
>>>    	const void *_name)
>>>    {
>>> @@ -398,3 +420,149 @@ rte_dev_iterator_init(struct rte_dev_iterator *it,
>>>    get_out:
>>>    	return -rte_errno;
>>>    }
>>> +
>>> +static char *
>>> +dev_str_sane_copy(const char *str)
>>> +{
>>> +	size_t end;
>>> +	char *copy;
>>> +
>>> +	end = strcspn(str, ",/");
>>> +	if (str[end] == ',') {
>>> +		copy = strdup(&str[end + 1]);
>>> +	} else {
>>> +		/* '/' or '\0' */
>>> +		copy = strdup("");
>>> +	}
>>
>> Though it doesn't change anything functionally, if you can separate blocks
>> of if-else with new lines, it really makes it easier to read.
>> Like here...
>>
> 
> sure,
> 
>>> +	if (copy == NULL) {
>>> +		rte_errno = ENOMEM;
>>> +	} else {
>>> +		char *slash;
>>> +
>>> +		slash = strchr(copy, '/');
>>> +		if (slash != NULL)
>>> +			slash[0] = '\0';
>>> +	}
>>> +	return copy;
>>> +}
>>> +
>>> +static int
>>> +class_next_dev_cmp(const struct rte_class *cls,
>>> +		   const void *ctx)
>>> +{
>>> +	struct rte_dev_iterator *it;
>>> +	const char *cls_str = NULL;
>>> +	void *dev;
>>> +
>>> +	if (cls->dev_iterate == NULL)
>>> +		return 1;
>>> +	it = ITCTX(ctx);
>>> +	cls_str = CLSCTX(ctx);
>>> +	dev = it->class_device;
>>> +	/* it->cls_str != NULL means a class
>>> +	 * was specified in the devstr.
>>> +	 */
>>> +	if (it->cls_str != NULL && cls != it->cls)
>>> +		return 1;
>>> +	/* If an error occurred previously,
>>> +	 * no need to test further.
>>> +	 */
>>> +	if (rte_errno != 0)
>>> +		return -1;
>>
>> I am guessing here by '..error occurred previously..' you mean sane_copy. If
>> so, why wait until this point to return? Anyway the caller (rte_bus_find,
>> probably) would only look for '0' or non-zero.
>>
> 
> No, rte_errno could be set by a bus / class implementation, for any
> error occurring during a call to dev_iterate: maybe a device was lost
> (hotplugged), etc. The return value of dev_iterate() cannot transmit an
> error as not matching a filter is not an error. The only error channel
> is rte_errno.
> 
> sane_copy was already checked before and should be cleared at this
> point.
> 
>>> +	dev = cls->dev_iterate(dev, cls_str, it);
>>> +	it->class_device = dev;
>>> +	return dev == NULL;
>>> +}
>>> +
>>> +static int
>>> +bus_next_dev_cmp(const struct rte_bus *bus,
>>> +		 const void *ctx)
>>> +{
>>> +	struct rte_device *dev = NULL;
>>> +	struct rte_class *cls = NULL;
>>> +	struct rte_dev_iterator *it;
>>> +	const char *bus_str = NULL;
>>> +
>>> +	if (bus->dev_iterate == NULL)
>>> +		return 1;
>>> +	it = ITCTX(ctx);
>>> +	bus_str = BUSCTX(ctx);
>>> +	dev = it->device;
>>> +	/* it->bus_str != NULL means a bus
>>> +	 * was specified in the devstr.
>>> +	 */
>>> +	if (it->bus_str != NULL && bus != it->bus)
>>> +		return 1;
>>> +	/* If an error occurred previously,
>>> +	 * no need to test further.
>>> +	 */
>>> +	if (rte_errno != 0)
>>> +		return -1;
>>> +	if (it->cls_str == NULL) {
>>> +		dev = bus->dev_iterate(dev, bus_str, it);
>>> +		goto end;
>>
>> This is slightly confusing. If it->cls_str == NULL, you do
>> bus->dev_iterate... but
>>
>>> +	}
>>> +	/* cls_str != NULL */
>>> +	if (dev == NULL) {
>>> +next_dev_on_bus:
>>> +		dev = bus->dev_iterate(dev, bus_str, it);
>>
>> When cls_str!=NULL, you still do bus->dev_iterate...
>> So, maybe they are OR case resulting in check of dev==NULL and return (as
>> being done right now by jumping to out)...?
>>
> 
> Yes, this iteration is pretty complex.
> 
> The best way to walk through it is to define the possible cases:
> 
> 1. Iterating on one bus:
> 
>     if (bus_str != NULL && cls_str == NULL)
> 
>     Simplest case. You got one bus, no classes.
>     Just call the current bus dev_iterate() callback, then report
>     the result (whether NULL or not).
> 
> 2. Iterating on one bus and one class:
> 
>     if (bus_str != NULL && cls_str != NULL)
> 
>     Possible states are:
> 
>     a. We are starting the iteration: dev is NULL.
>        Iterate on the current bus.
> 
>        If device is not NULL anymore, iterate on the class.
>        To ensure we stay on the same class, set cls to the previous class
>        and call rte_class_find();
> 
>        If device is still NULL, return 1 to iterate on the next bus.
> 
>     b. We are in the middle of an iteration: dev is not NULL.
>        We start iterating on the class to find a possible second instance
>        of the same device in the class (e.g. mlx devices with multiple
>        eth ports per PCI devices). If none is found, we
>        come back to bus->dev_iterate() (bypassing the dev == NULL check),
>        restarting this (b) cycle as many times as necessary.
>        If the result is NULL, the iteration is finished.
> 
> 3. Iterating on one class:
> 
>      if (bus_str == NULL && cls_str != NULL)
> 
>      The most complex case. Same as (2), however we start by the first
>      bus, and if a device is NULL we will continue onto the next bus
>      within the loop line 554 (within rte_dev_iterator_next).
> 

This, above, is most useful. I had missed out the case (b) above 
entirely. I will re-review with this in mind. Thanks for writing this.

> 
> The core of the complexity here lies in the next_dev_on_bus cycle
> described in 2.b.
> 
>> And, how can bus iterate over a 'null' device?
>>
> 
> A NULL device means that we are starting the iteration: a bus will give
> its first device.
> 
>>> +		it->device = dev;
>>> +	}
>>> +	if (dev == NULL)
>>> +		return 1;
>>
>> Maybe, this check should move in the if(dev==NULL) above - that way, it can
>> in path of 'next_dev_on_bus' yet do the same as function as its current
>> location.
>>
> 
> Yes
> 
>>> +	if (it->cls != NULL)
>>
>> In what case would (it->cls_str == NULL) but (it->cls != NULL)?
>>
> 
> When one rte_device was used to spawn several class_devices: multiple
> adapters per PCI addresses for example.
> 
> However, in this case, given that the matching would be useless, we skip
> the class matching process and only return the rte_device. This single
> rte_device is not returned multiple times.
> 
> However, if someone was giving:
> 
> bus=pci,id=00:02.0/class=eth
> (str_sane_copy would set cls_str to "" here)
> 
> Then, if 00:02.0 had spawned several eth ports, it would be returned
> once for each instance.
> 
> This is a pretty ambiguous case. I'm not sure of the best way to deal with
> it. My decision here was to simplify the iteration if possible, as I
> considered that someone that did not care for the class properties would
> not care about counting the instances of the bus device. maybe I'm
> wrong.

Frankly, I have nothing extra to add as a use-case. For your approach of 
'simple is better': +1

> 
>>> +		cls = TAILQ_PREV(it->cls, rte_class_list, next);
>>> +	cls = rte_class_find(cls, class_next_dev_cmp, ctx);
>>> +	if (cls != NULL) {
>>> +		it->cls = cls;
>>> +		goto end;
>>> +	}
>>> +	goto next_dev_on_bus;
>>
>> Maybe I have completely mixed your intention of this function. So, if you
>> still find the above comments naive - maybe you can tell me what you are
>> attempting here?
>> Is it: find next bus and stop if no class specified. find next class as
>> well, iff that too was specified?
>>
>> Reason I am confused is that bus_next_dev_cmp attempts to compare both - bus
>> and class, yet class_next_dev_cmp simply stops by comparing class only.
>>
> 
> You are right: bus comparator will produce an iteration on the bus
> devices, then on the class devices iff class is specified.
> 
> Thus the class comparator only has to iterate on class, because we
> called it from the bus iterator.
> 
>>> +end:
>>> +	it->device = dev;
>>> +	return dev == NULL;
>>> +}
>>
>> A new line should be added here - start of a new function.
>>
> 
> yes
> 
> 
> I'm pretty sorry about this code, to be honest.
> This is a nasty piece with a lot of states to care for.

Actually, the cases that you have mentioned above indeed requires a 
complex logic. I am not sure I have any suggestion to make it simpler.
I will read again based on what you have explained.

> 
> At least it works. I'd like to have the properties integrated so that
> developpers can add their own quickly. In the meantime I could rework
> this function. But simple is not easy.
> 

Problem is that the 18.08 window is almost closed - I though I could 
review it within the window but now I am not confident. Maybe someone 
else too can look through the PCI/VDEV part after this patch..
  

Patch

diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
index 63e329bd8..b78845f02 100644
--- a/lib/librte_eal/common/eal_common_dev.c
+++ b/lib/librte_eal/common/eal_common_dev.c
@@ -45,6 +45,28 @@  static struct dev_event_cb_list dev_event_cbs;
 /* spinlock for device callbacks */
 static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER;
 
+struct dev_next_ctx {
+	struct rte_dev_iterator *it;
+	const char *bus_str;
+	const char *cls_str;
+};
+
+#define CTX(it, bus_str, cls_str) \
+	(&(const struct dev_next_ctx){ \
+		.it = it, \
+		.bus_str = bus_str, \
+		.cls_str = cls_str, \
+	})
+
+#define ITCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->it)
+
+#define BUSCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->bus_str)
+
+#define CLSCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->cls_str)
+
 static int cmp_detached_dev_name(const struct rte_device *dev,
 	const void *_name)
 {
@@ -398,3 +420,149 @@  rte_dev_iterator_init(struct rte_dev_iterator *it,
 get_out:
 	return -rte_errno;
 }
+
+static char *
+dev_str_sane_copy(const char *str)
+{
+	size_t end;
+	char *copy;
+
+	end = strcspn(str, ",/");
+	if (str[end] == ',') {
+		copy = strdup(&str[end + 1]);
+	} else {
+		/* '/' or '\0' */
+		copy = strdup("");
+	}
+	if (copy == NULL) {
+		rte_errno = ENOMEM;
+	} else {
+		char *slash;
+
+		slash = strchr(copy, '/');
+		if (slash != NULL)
+			slash[0] = '\0';
+	}
+	return copy;
+}
+
+static int
+class_next_dev_cmp(const struct rte_class *cls,
+		   const void *ctx)
+{
+	struct rte_dev_iterator *it;
+	const char *cls_str = NULL;
+	void *dev;
+
+	if (cls->dev_iterate == NULL)
+		return 1;
+	it = ITCTX(ctx);
+	cls_str = CLSCTX(ctx);
+	dev = it->class_device;
+	/* it->cls_str != NULL means a class
+	 * was specified in the devstr.
+	 */
+	if (it->cls_str != NULL && cls != it->cls)
+		return 1;
+	/* If an error occurred previously,
+	 * no need to test further.
+	 */
+	if (rte_errno != 0)
+		return -1;
+	dev = cls->dev_iterate(dev, cls_str, it);
+	it->class_device = dev;
+	return dev == NULL;
+}
+
+static int
+bus_next_dev_cmp(const struct rte_bus *bus,
+		 const void *ctx)
+{
+	struct rte_device *dev = NULL;
+	struct rte_class *cls = NULL;
+	struct rte_dev_iterator *it;
+	const char *bus_str = NULL;
+
+	if (bus->dev_iterate == NULL)
+		return 1;
+	it = ITCTX(ctx);
+	bus_str = BUSCTX(ctx);
+	dev = it->device;
+	/* it->bus_str != NULL means a bus
+	 * was specified in the devstr.
+	 */
+	if (it->bus_str != NULL && bus != it->bus)
+		return 1;
+	/* If an error occurred previously,
+	 * no need to test further.
+	 */
+	if (rte_errno != 0)
+		return -1;
+	if (it->cls_str == NULL) {
+		dev = bus->dev_iterate(dev, bus_str, it);
+		goto end;
+	}
+	/* cls_str != NULL */
+	if (dev == NULL) {
+next_dev_on_bus:
+		dev = bus->dev_iterate(dev, bus_str, it);
+		it->device = dev;
+	}
+	if (dev == NULL)
+		return 1;
+	if (it->cls != NULL)
+		cls = TAILQ_PREV(it->cls, rte_class_list, next);
+	cls = rte_class_find(cls, class_next_dev_cmp, ctx);
+	if (cls != NULL) {
+		it->cls = cls;
+		goto end;
+	}
+	goto next_dev_on_bus;
+end:
+	it->device = dev;
+	return dev == NULL;
+}
+__rte_experimental
+struct rte_device *
+rte_dev_iterator_next(struct rte_dev_iterator *it)
+{
+	struct rte_bus *bus = NULL;
+	int old_errno = rte_errno;
+	char *bus_str = NULL;
+	char *cls_str = NULL;
+
+	rte_errno = 0;
+	if (it->bus_str == NULL && it->cls_str == NULL) {
+		/* Invalid iterator. */
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	if (it->bus != NULL)
+		bus = TAILQ_PREV(it->bus, rte_bus_list, next);
+	if (it->bus_str != NULL) {
+		bus_str = dev_str_sane_copy(it->bus_str);
+		if (bus_str == NULL)
+			goto out;
+	}
+	if (it->cls_str != NULL) {
+		cls_str = dev_str_sane_copy(it->cls_str);
+		if (cls_str == NULL)
+			goto out;
+	}
+	while ((bus = rte_bus_find(bus, bus_next_dev_cmp,
+				   CTX(it, bus_str, cls_str)))) {
+		if (it->device != NULL) {
+			it->bus = bus;
+			goto out;
+		}
+		if (it->bus_str != NULL ||
+		    rte_errno != 0)
+			break;
+	}
+	if (rte_errno == 0)
+		rte_errno = old_errno;
+out:
+	free(bus_str);
+	free(cls_str);
+	return it->device;
+}
diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h
index fdc812ff8..8638a2bbd 100644
--- a/lib/librte_eal/common/include/rte_dev.h
+++ b/lib/librte_eal/common/include/rte_dev.h
@@ -355,6 +355,32 @@  __rte_experimental
 int
 rte_dev_iterator_init(struct rte_dev_iterator *it, const char *str);
 
+/**
+ * Iterates on a device iterator.
+ *
+ * Generates a new rte_device handle corresponding to the next element
+ * in the list described in comprehension by the iterator.
+ *
+ * The next object is returned, and the iterator is updated.
+ *
+ * @param it
+ *   Device iterator handle.
+ *
+ * @return
+ *   An rte_device handle if found.
+ *   NULL if an error occurred (rte_errno is set).
+ *   NULL if no device could be found (rte_errno is not set).
+ */
+__rte_experimental
+struct rte_device *
+rte_dev_iterator_next(struct rte_dev_iterator *it);
+
+#define RTE_DEV_FOREACH(dev, devstr, it) \
+	for (rte_dev_iterator_init(it, devstr), \
+	     dev = rte_dev_iterator_next(it); \
+	     dev != NULL; \
+	     dev = rte_dev_iterator_next(it))
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index ac04120d6..4cd5ab3df 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -252,6 +252,7 @@  EXPERIMENTAL {
 	rte_dev_event_monitor_start;
 	rte_dev_event_monitor_stop;
 	rte_dev_iterator_init;
+	rte_dev_iterator_next;
 	rte_devargs_add;
 	rte_devargs_dump;
 	rte_devargs_insert;