[v7,5/7] bus: add helper to handle sigbus

Message ID 1531137666-10351-6-git-send-email-jia.guo@intel.com (mailing list archive)
State Superseded, archived
Headers
Series [v7,1/7] bus: add hotplug failure handler |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Guo, Jia July 9, 2018, 12:01 p.m. UTC
  This patch aim to add a helper to iterate all buses to find the
corresponding bus to handle the sigbus error.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Acked-by: Shaopeng He <shaopeng.he@intel.com>
---
v7->v6:
no change
---
 lib/librte_eal/common/eal_common_bus.c | 42 ++++++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_private.h    | 12 ++++++++++
 2 files changed, 54 insertions(+)
  

Comments

Andrew Rybchenko July 9, 2018, 1:48 p.m. UTC | #1
On 09.07.2018 15:01, Jeff Guo wrote:
> This patch aim to add a helper to iterate all buses to find the
> corresponding bus to handle the sigbus error.
>
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> Acked-by: Shaopeng He <shaopeng.he@intel.com>
> ---
> v7->v6:
> no change
> ---
>   lib/librte_eal/common/eal_common_bus.c | 42 ++++++++++++++++++++++++++++++++++
>   lib/librte_eal/common/eal_private.h    | 12 ++++++++++
>   2 files changed, 54 insertions(+)
>
> diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
> index 0943851..8856adc 100644
> --- a/lib/librte_eal/common/eal_common_bus.c
> +++ b/lib/librte_eal/common/eal_common_bus.c
> @@ -37,6 +37,7 @@
>   #include <rte_bus.h>
>   #include <rte_debug.h>
>   #include <rte_string_fns.h>
> +#include <rte_errno.h>
>   
>   #include "eal_private.h"
>   
> @@ -242,3 +243,44 @@ rte_bus_get_iommu_class(void)
>   	}
>   	return mode;
>   }
> +
> +static int
> +bus_handle_sigbus(const struct rte_bus *bus,
> +			const void *failure_addr)
> +{
> +	int ret;
> +
> +	if (!bus->sigbus_handler) {
> +		RTE_LOG(ERR, EAL, "Function sigbus_handler not supported by "
> +			"bus (%s)\n", bus->name);

It is not an error. It is OK that some buses cannot handle SIGBUS.

> +		return -1;
> +	}
> +
> +	ret = bus->sigbus_handler(failure_addr);
> +	rte_errno = ret;
> +
> +	return !(bus->sigbus_handler && ret <= 0);

There is no point to check bus->sigbus_handler here. It is already 
checked above.
So, it should be just:
    return ret > 0;
I.e. we should continue search if the address is not handled by any device
on the bus (we should stop if it is handled (ret==0) or failed to to handle
(ret < 0)).

> +}
> +
> +int
> +rte_bus_sigbus_handler(const void *failure_addr)
> +{
> +	struct rte_bus *bus;
> +
> +	int ret = 0;
> +	int old_errno = rte_errno;
> +
> +	rte_errno = 0;
> +
> +	bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
> +	/* failed to handle the sigbus, pass the new errno. */
> +	if (!bus)
> +		ret = 1;
> +	else if (rte_errno == -1)

I'm still thinking it is bad to keep negative value in rte_errno here.

> +		return -1;
> +
> +	/* otherwise restore the old errno. */
> +	rte_errno = old_errno;
> +
> +	return ret;
> +}
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index bdadc4d..2337e71 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -258,4 +258,16 @@ int rte_mp_channel_init(void);
>    */
>   void dev_callback_process(char *device_name, enum rte_dev_event_type event);
>   
> +/**
> + * Iterate all buses to find the corresponding bus, to handle the sigbus error.
> + * @param failure_addr
> + *	Pointer of the fault address of the sigbus error.
> + *
> + * @return
> + *	 0 success to handle the sigbus.
> + *	-1 failed to handle the sigbus
> + *	 1 no bus can handler the sigbus
> + */
> +int rte_bus_sigbus_handler(const void *failure_addr);
> +
>   #endif /* _EAL_PRIVATE_H_ */
  
Guo, Jia July 10, 2018, 8:22 a.m. UTC | #2
On 7/9/2018 9:48 PM, Andrew Rybchenko wrote:
> On 09.07.2018 15:01, Jeff Guo wrote:
>> This patch aim to add a helper to iterate all buses to find the
>> corresponding bus to handle the sigbus error.
>>
>> Signed-off-by: Jeff Guo <jia.guo@intel.com>
>> Acked-by: Shaopeng He <shaopeng.he@intel.com>
>> ---
>> v7->v6:
>> no change
>> ---
>>   lib/librte_eal/common/eal_common_bus.c | 42 
>> ++++++++++++++++++++++++++++++++++
>>   lib/librte_eal/common/eal_private.h    | 12 ++++++++++
>>   2 files changed, 54 insertions(+)
>>
>> diff --git a/lib/librte_eal/common/eal_common_bus.c 
>> b/lib/librte_eal/common/eal_common_bus.c
>> index 0943851..8856adc 100644
>> --- a/lib/librte_eal/common/eal_common_bus.c
>> +++ b/lib/librte_eal/common/eal_common_bus.c
>> @@ -37,6 +37,7 @@
>>   #include <rte_bus.h>
>>   #include <rte_debug.h>
>>   #include <rte_string_fns.h>
>> +#include <rte_errno.h>
>>     #include "eal_private.h"
>>   @@ -242,3 +243,44 @@ rte_bus_get_iommu_class(void)
>>       }
>>       return mode;
>>   }
>> +
>> +static int
>> +bus_handle_sigbus(const struct rte_bus *bus,
>> +            const void *failure_addr)
>> +{
>> +    int ret;
>> +
>> +    if (!bus->sigbus_handler) {
>> +        RTE_LOG(ERR, EAL, "Function sigbus_handler not supported by "
>> +            "bus (%s)\n", bus->name);
>
> It is not an error. It is OK that some buses cannot handle SIGBUS.
>

yes, it is.

>> +        return -1;
>> +    }
>> +
>> +    ret = bus->sigbus_handler(failure_addr);
>> +    rte_errno = ret;
>> +
>> +    return !(bus->sigbus_handler && ret <= 0);
>
> There is no point to check bus->sigbus_handler here. It is already 
> checked above.
> So, it should be just:
>    return ret > 0;
> I.e. we should continue search if the address is not handled by any 
> device
> on the bus (we should stop if it is handled (ret==0) or failed to to 
> handle
> (ret < 0)).
>

i will modify it, thanks.

>> +}
>> +
>> +int
>> +rte_bus_sigbus_handler(const void *failure_addr)
>> +{
>> +    struct rte_bus *bus;
>> +
>> +    int ret = 0;
>> +    int old_errno = rte_errno;
>> +
>> +    rte_errno = 0;
>> +
>> +    bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
>> +    /* failed to handle the sigbus, pass the new errno. */
>> +    if (!bus)
>> +        ret = 1;
>> +    else if (rte_errno == -1)
>
> I'm still thinking it is bad to keep negative value in rte_errno here.
>

i think the rte_errno just no used for the caller if return -1. Since if 
find bus but process failed, will use rte_exit to process whatever the 
rte_errno value. Only return 1 means use the origin sigbus handler that 
will care about the errno.

>> +        return -1;
>> +
>> +    /* otherwise restore the old errno. */
>> +    rte_errno = old_errno;
>> +
>> +    return ret;
>> +}
>> diff --git a/lib/librte_eal/common/eal_private.h 
>> b/lib/librte_eal/common/eal_private.h
>> index bdadc4d..2337e71 100644
>> --- a/lib/librte_eal/common/eal_private.h
>> +++ b/lib/librte_eal/common/eal_private.h
>> @@ -258,4 +258,16 @@ int rte_mp_channel_init(void);
>>    */
>>   void dev_callback_process(char *device_name, enum 
>> rte_dev_event_type event);
>>   +/**
>> + * Iterate all buses to find the corresponding bus, to handle the 
>> sigbus error.
>> + * @param failure_addr
>> + *    Pointer of the fault address of the sigbus error.
>> + *
>> + * @return
>> + *     0 success to handle the sigbus.
>> + *    -1 failed to handle the sigbus
>> + *     1 no bus can handler the sigbus
>> + */
>> +int rte_bus_sigbus_handler(const void *failure_addr);
>> +
>>   #endif /* _EAL_PRIVATE_H_ */
>
  
Gaëtan Rivet July 10, 2018, 8:40 a.m. UTC | #3
On Tue, Jul 10, 2018 at 04:22:23PM +0800, Jeff Guo wrote:
> 
> 
> On 7/9/2018 9:48 PM, Andrew Rybchenko wrote:
> > On 09.07.2018 15:01, Jeff Guo wrote:
> > > This patch aim to add a helper to iterate all buses to find the
> > > corresponding bus to handle the sigbus error.
> > > 
> > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > Acked-by: Shaopeng He <shaopeng.he@intel.com>
> > > ---
> > > v7->v6:
> > > no change
> > > ---
> > >   lib/librte_eal/common/eal_common_bus.c | 42
> > > ++++++++++++++++++++++++++++++++++
> > >   lib/librte_eal/common/eal_private.h    | 12 ++++++++++
> > >   2 files changed, 54 insertions(+)
> > > 
> > > diff --git a/lib/librte_eal/common/eal_common_bus.c
> > > b/lib/librte_eal/common/eal_common_bus.c
> > > index 0943851..8856adc 100644
> > > --- a/lib/librte_eal/common/eal_common_bus.c
> > > +++ b/lib/librte_eal/common/eal_common_bus.c
> > > @@ -37,6 +37,7 @@
> > >   #include <rte_bus.h>
> > >   #include <rte_debug.h>
> > >   #include <rte_string_fns.h>
> > > +#include <rte_errno.h>
> > >     #include "eal_private.h"
> > >   @@ -242,3 +243,44 @@ rte_bus_get_iommu_class(void)
> > >       }
> > >       return mode;
> > >   }
> > > +
> > > +static int
> > > +bus_handle_sigbus(const struct rte_bus *bus,
> > > +            const void *failure_addr)
> > > +{
> > > +    int ret;
> > > +
> > > +    if (!bus->sigbus_handler) {
> > > +        RTE_LOG(ERR, EAL, "Function sigbus_handler not supported by "
> > > +            "bus (%s)\n", bus->name);
> > 
> > It is not an error. It is OK that some buses cannot handle SIGBUS.
> > 
> 
> yes, it is.
> 
> > > +        return -1;
> > > +    }
> > > +
> > > +    ret = bus->sigbus_handler(failure_addr);
> > > +    rte_errno = ret;
> > > +
> > > +    return !(bus->sigbus_handler && ret <= 0);
> > 
> > There is no point to check bus->sigbus_handler here. It is already
> > checked above.
> > So, it should be just:
> >    return ret > 0;
> > I.e. we should continue search if the address is not handled by any
> > device
> > on the bus (we should stop if it is handled (ret==0) or failed to to
> > handle
> > (ret < 0)).
> > 
> 
> i will modify it, thanks.
> 

Why is rte_errno set here?
rte_errno is meant by the bus dev to be set on error. You do not have to
modify it.
ret would already be <0 on error.

At most, you could do something like:

if (ret < 0 && rte_errno == 0)
    rte_errno = ENOTSUP;

Or something akin, with a non-descriptive error hinting that the
developper didn't seem to care about setting errno to something
meaningful (so only partially respecting the API).

> > > +}
> > > +
> > > +int
> > > +rte_bus_sigbus_handler(const void *failure_addr)
> > > +{
> > > +    struct rte_bus *bus;
> > > +
> > > +    int ret = 0;
> > > +    int old_errno = rte_errno;
> > > +
> > > +    rte_errno = 0;
> > > +
> > > +    bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
> > > +    /* failed to handle the sigbus, pass the new errno. */
> > > +    if (!bus)
> > > +        ret = 1;
> > > +    else if (rte_errno == -1)
> > 
> > I'm still thinking it is bad to keep negative value in rte_errno here.
> > 
> 
> i think the rte_errno just no used for the caller if return -1. Since if
> find bus but process failed, will use rte_exit to process whatever the
> rte_errno value. Only return 1 means use the origin sigbus handler that will
> care about the errno.
> 

With the changes above, the check should be something like:

    if (bus == NULL)
        return 1;
    else if (rte_errno != 0)
        return -rte_errno;

    rte_errno = old_errno;
    return 0;

Which would avoid resetting rte_errno on top of whichever value a dev
would have used, and having it set to a negative non-errno value.

(Please do not just use this as-is, if you think this is not a good idea
just tell us why or how you would prefer to do it. I'm only proposing a
way that I think would work.)

Regards,
  
Guo, Jia July 10, 2018, 10:07 a.m. UTC | #4
On 7/10/2018 4:40 PM, Gaëtan Rivet wrote:
> On Tue, Jul 10, 2018 at 04:22:23PM +0800, Jeff Guo wrote:
>>
>> On 7/9/2018 9:48 PM, Andrew Rybchenko wrote:
>>> On 09.07.2018 15:01, Jeff Guo wrote:
>>>> This patch aim to add a helper to iterate all buses to find the
>>>> corresponding bus to handle the sigbus error.
>>>>
>>>> Signed-off-by: Jeff Guo <jia.guo@intel.com>
>>>> Acked-by: Shaopeng He <shaopeng.he@intel.com>
>>>> ---
>>>> v7->v6:
>>>> no change
>>>> ---
>>>>    lib/librte_eal/common/eal_common_bus.c | 42
>>>> ++++++++++++++++++++++++++++++++++
>>>>    lib/librte_eal/common/eal_private.h    | 12 ++++++++++
>>>>    2 files changed, 54 insertions(+)
>>>>
>>>> diff --git a/lib/librte_eal/common/eal_common_bus.c
>>>> b/lib/librte_eal/common/eal_common_bus.c
>>>> index 0943851..8856adc 100644
>>>> --- a/lib/librte_eal/common/eal_common_bus.c
>>>> +++ b/lib/librte_eal/common/eal_common_bus.c
>>>> @@ -37,6 +37,7 @@
>>>>    #include <rte_bus.h>
>>>>    #include <rte_debug.h>
>>>>    #include <rte_string_fns.h>
>>>> +#include <rte_errno.h>
>>>>      #include "eal_private.h"
>>>>    @@ -242,3 +243,44 @@ rte_bus_get_iommu_class(void)
>>>>        }
>>>>        return mode;
>>>>    }
>>>> +
>>>> +static int
>>>> +bus_handle_sigbus(const struct rte_bus *bus,
>>>> +            const void *failure_addr)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    if (!bus->sigbus_handler) {
>>>> +        RTE_LOG(ERR, EAL, "Function sigbus_handler not supported by "
>>>> +            "bus (%s)\n", bus->name);
>>> It is not an error. It is OK that some buses cannot handle SIGBUS.
>>>
>> yes, it is.
>>
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    ret = bus->sigbus_handler(failure_addr);
>>>> +    rte_errno = ret;
>>>> +
>>>> +    return !(bus->sigbus_handler && ret <= 0);
>>> There is no point to check bus->sigbus_handler here. It is already
>>> checked above.
>>> So, it should be just:
>>>     return ret > 0;
>>> I.e. we should continue search if the address is not handled by any
>>> device
>>> on the bus (we should stop if it is handled (ret==0) or failed to to
>>> handle
>>> (ret < 0)).
>>>
>> i will modify it, thanks.
>>
> Why is rte_errno set here?
> rte_errno is meant by the bus dev to be set on error. You do not have to
> modify it.
> ret would already be <0 on error.
>
> At most, you could do something like:
>
> if (ret < 0 && rte_errno == 0)
>      rte_errno = ENOTSUP;
>
> Or something akin, with a non-descriptive error hinting that the
> developper didn't seem to care about setting errno to something
> meaningful (so only partially respecting the API).

  the purpose to set rte_errno here is because of the status of the 
handle need to pass though to the function caller "rte_bus_sigbus_handler",
it could give a chance to check the searching status.

>>>> +}
>>>> +
>>>> +int
>>>> +rte_bus_sigbus_handler(const void *failure_addr)
>>>> +{
>>>> +    struct rte_bus *bus;
>>>> +
>>>> +    int ret = 0;
>>>> +    int old_errno = rte_errno;
>>>> +
>>>> +    rte_errno = 0;
>>>> +
>>>> +    bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
>>>> +    /* failed to handle the sigbus, pass the new errno. */
>>>> +    if (!bus)
>>>> +        ret = 1;
>>>> +    else if (rte_errno == -1)
>>> I'm still thinking it is bad to keep negative value in rte_errno here.
>>>
>> i think the rte_errno just no used for the caller if return -1. Since if
>> find bus but process failed, will use rte_exit to process whatever the
>> rte_errno value. Only return 1 means use the origin sigbus handler that will
>> care about the errno.
>>
> With the changes above, the check should be something like:
>
>      if (bus == NULL)
>          return 1;
>      else if (rte_errno != 0)
>          return -rte_errno;
>
>      rte_errno = old_errno;
>      return 0;
>
> Which would avoid resetting rte_errno on top of whichever value a dev
> would have used, and having it set to a negative non-errno value.
>
> (Please do not just use this as-is, if you think this is not a good idea
> just tell us why or how you would prefer to do it. I'm only proposing a
> way that I think would work.)
>
> Regards,

i think that is the problem to find a better way, i agree to maximum to 
keep the rte_errno should be make sense.
  

Patch

diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index 0943851..8856adc 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -37,6 +37,7 @@ 
 #include <rte_bus.h>
 #include <rte_debug.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
 
 #include "eal_private.h"
 
@@ -242,3 +243,44 @@  rte_bus_get_iommu_class(void)
 	}
 	return mode;
 }
+
+static int
+bus_handle_sigbus(const struct rte_bus *bus,
+			const void *failure_addr)
+{
+	int ret;
+
+	if (!bus->sigbus_handler) {
+		RTE_LOG(ERR, EAL, "Function sigbus_handler not supported by "
+			"bus (%s)\n", bus->name);
+		return -1;
+	}
+
+	ret = bus->sigbus_handler(failure_addr);
+	rte_errno = ret;
+
+	return !(bus->sigbus_handler && ret <= 0);
+}
+
+int
+rte_bus_sigbus_handler(const void *failure_addr)
+{
+	struct rte_bus *bus;
+
+	int ret = 0;
+	int old_errno = rte_errno;
+
+	rte_errno = 0;
+
+	bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
+	/* failed to handle the sigbus, pass the new errno. */
+	if (!bus)
+		ret = 1;
+	else if (rte_errno == -1)
+		return -1;
+
+	/* otherwise restore the old errno. */
+	rte_errno = old_errno;
+
+	return ret;
+}
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index bdadc4d..2337e71 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -258,4 +258,16 @@  int rte_mp_channel_init(void);
  */
 void dev_callback_process(char *device_name, enum rte_dev_event_type event);
 
+/**
+ * Iterate all buses to find the corresponding bus, to handle the sigbus error.
+ * @param failure_addr
+ *	Pointer of the fault address of the sigbus error.
+ *
+ * @return
+ *	 0 success to handle the sigbus.
+ *	-1 failed to handle the sigbus
+ *	 1 no bus can handler the sigbus
+ */
+int rte_bus_sigbus_handler(const void *failure_addr);
+
 #endif /* _EAL_PRIVATE_H_ */