net/vdev_netvsc: handle removal of associated pci device

Message ID 20200819175333.19601-1-stephen@networkplumber.org (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series net/vdev_netvsc: handle removal of associated pci device |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed

Commit Message

Stephen Hemminger Aug. 19, 2020, 5:53 p.m. UTC
  The vdev_netvsc was not detecting when the associated PCI device
(SRIOV) was removed. Because of that it would keep feeding the
same (removed) device to failsafe PMD which would then unsuccessfully
try and probe for it.

Change to use a mark/sweep method to detect that PCI device was
removed, and also only tell failsafe about new PCI devices.
Vdev_netvsc does not have to keep stuffing the pipe with the
same already existing PCI device.

Fixes: e7dc5d7becc5 ("net/vdev_netvsc: implement core functionality")
Cc: matan@mellanox.com, stable@dpdk.org
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/vdev_netvsc/vdev_netvsc.c | 39 +++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 5 deletions(-)
  

Comments

Long Li Sept. 6, 2020, 8:11 a.m. UTC | #1
>Subject: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of associated
>pci device
>
>The vdev_netvsc was not detecting when the associated PCI device
>(SRIOV) was removed. Because of that it would keep feeding the same
>(removed) device to failsafe PMD which would then unsuccessfully try and
>probe for it.
>
>Change to use a mark/sweep method to detect that PCI device was removed,
>and also only tell failsafe about new PCI devices.
>Vdev_netvsc does not have to keep stuffing the pipe with the same already
>existing PCI device.
>
>Fixes: e7dc5d7becc5 ("net/vdev_netvsc: implement core functionality")
>Cc: matan@mellanox.com, stable@dpdk.org
>Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

Reviewed-by: Long Li <longli@microsoft.com>

>---
> drivers/net/vdev_netvsc/vdev_netvsc.c | 39 +++++++++++++++++++++++--
>--
> 1 file changed, 34 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/net/vdev_netvsc/vdev_netvsc.c
>b/drivers/net/vdev_netvsc/vdev_netvsc.c
>index 1ecb0b3e6a34..80a3158012ce 100644
>--- a/drivers/net/vdev_netvsc/vdev_netvsc.c
>+++ b/drivers/net/vdev_netvsc/vdev_netvsc.c
>@@ -66,6 +66,7 @@ struct vdev_netvsc_ctx {
> 	char devargs[256];		   /**< Fail-safe device arguments. */
> 	char if_name[IF_NAMESIZE];	   /**< NetVSC netdevice name. */
> 	unsigned int if_index;		   /**< NetVSC netdevice index. */
>+	int pci_found;			   /**< Device detection */
> 	struct rte_ether_addr if_addr;	   /**< NetVSC MAC address. */
> 	int pipe[2];			   /**< Fail-safe communication pipe.
>*/
> 	char yield[256];		   /**< PCI sub-device arguments. */
>@@ -405,11 +406,18 @@ vdev_netvsc_device_probe(const struct
>if_nameindex *iface,
> 	len = strlen(addr);
> 	if (!len)
> 		return 0;
>+
>+	ctx->pci_found = 1;
>+
>+	/* Skip if this is same device already sent to failsafe */
>+	if (strcmp(addr, ctx->yield) == 0)
>+		return 0;
>+
>+	DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
>+		" interface \"%s\" (index %u)", addr, ctx->if_name,
>+		ctx->if_index);
>+
> 	/* Send PCI device argument to fail-safe PMD instance. */
>-	if (strcmp(addr, ctx->yield))
>-		DRV_LOG(DEBUG, "associating PCI device \"%s\" with
>NetVSC"
>-			" interface \"%s\" (index %u)", addr, ctx->if_name,
>-			ctx->if_index);
> 	memmove(buf, addr, len + 1);
> 	addr = buf;
> 	buf[len] = '\n';
>@@ -452,12 +460,33 @@ vdev_netvsc_alarm(__rte_unused void *arg)
> 	struct vdev_netvsc_ctx *ctx;
> 	int ret;
>
>+	/* 1st pass: clear PCI flag on all devices */
>+	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
>+		ctx->pci_found = 0;
>+	}
>+
>+	/* 2nd pass: scan all system devices to look for changes to this
>+device */
> 	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
> 		ret =
>vdev_netvsc_foreach_iface(vdev_netvsc_device_probe, 0,
> 		      ctx);
>-		if (ret < 0)
>+
>+		if (ret < 0) {
>+			DRV_LOG(NOTICE, "can not scan devices for %s\n",
>+				ctx->if_name);
> 			break;
>+		}
> 	}
>+
>+	/* 3rd pass: detect PCI removal */
>+	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
>+		if (!ctx->pci_found && ctx->yield[0]) {
>+			DRV_LOG(DEBUG, "disassociating PCI device \"%s\"
>from NetVSC"
>+				" interface \"%s\" (index %u)", ctx->yield, ctx-
>>if_name,
>+				ctx->if_index);
>+			ctx->yield[0] = '\0';
>+		}
>+	}
>+
> 	if (!vdev_netvsc_ctx_count)
> 		return;
> 	ret = rte_eal_alarm_set(VDEV_NETVSC_PROBE_MS * 1000,
>--
>2.27.0
  
Matan Azrad Sept. 6, 2020, 12:38 p.m. UTC | #2
Hi Stephen

From: Stephen Hemminger:
> The vdev_netvsc was not detecting when the associated PCI device
> (SRIOV) was removed. Because of that it would keep feeding the same
> (removed) device to failsafe PMD which would then unsuccessfully try and
> probe for it.
> 
> Change to use a mark/sweep method to detect that PCI device was
> removed, and also only tell failsafe about new PCI devices.
> Vdev_netvsc does not have to keep stuffing the pipe with the same already
> existing PCI device.

As I know, the vdev_netvsc driver doesn't call to failsafe if the PCI device is not detected by the readlink command(considered as removed)...
Am I missing something?

>
> Fixes: e7dc5d7becc5 ("net/vdev_netvsc: implement core functionality")
> Cc: matan@mellanox.com, stable@dpdk.org
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  drivers/net/vdev_netvsc/vdev_netvsc.c | 39 +++++++++++++++++++++++-
> ---
>  1 file changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/vdev_netvsc/vdev_netvsc.c
> b/drivers/net/vdev_netvsc/vdev_netvsc.c
> index 1ecb0b3e6a34..80a3158012ce 100644
> --- a/drivers/net/vdev_netvsc/vdev_netvsc.c
> +++ b/drivers/net/vdev_netvsc/vdev_netvsc.c
> @@ -66,6 +66,7 @@ struct vdev_netvsc_ctx {
>         char devargs[256];                 /**< Fail-safe device arguments. */
>         char if_name[IF_NAMESIZE];         /**< NetVSC netdevice name. */
>         unsigned int if_index;             /**< NetVSC netdevice index. */
> +       int pci_found;                     /**< Device detection */
>         struct rte_ether_addr if_addr;     /**< NetVSC MAC address. */
>         int pipe[2];                       /**< Fail-safe communication pipe. */
>         char yield[256];                   /**< PCI sub-device arguments. */
> @@ -405,11 +406,18 @@ vdev_netvsc_device_probe(const struct
> if_nameindex *iface,
>         len = strlen(addr);
>         if (!len)
>                 return 0;
> +

The file code convention is to use blank lines in blocks only after variables declarations.

> +       ctx->pci_found = 1;
> +
> +       /* Skip if this is same device already sent to failsafe */
> +       if (strcmp(addr, ctx->yield) == 0)
> +               return 0;
> +
> +       DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
> +               " interface \"%s\" (index %u)", addr, ctx->if_name,
> +               ctx->if_index);
> +
>         /* Send PCI device argument to fail-safe PMD instance. */
> -       if (strcmp(addr, ctx->yield))
> -               DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
> -                       " interface \"%s\" (index %u)", addr, ctx->if_name,
> -                       ctx->if_index);
>         memmove(buf, addr, len + 1);
>         addr = buf;
>         buf[len] = '\n';
> @@ -452,12 +460,33 @@ vdev_netvsc_alarm(__rte_unused void *arg)
>         struct vdev_netvsc_ctx *ctx;
>         int ret;
> 
> +       /* 1st pass: clear PCI flag on all devices */
> +       LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
> +               ctx->pci_found = 0;
> +       }
> +
> +       /* 2nd pass: scan all system devices to look for changes to this
> + device */
>         LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
>                 ret = vdev_netvsc_foreach_iface(vdev_netvsc_device_probe, 0,
>                       ctx);
> -               if (ret < 0)
> +
> +               if (ret < 0) {
> +                       DRV_LOG(NOTICE, "can not scan devices for %s\n",
> +                               ctx->if_name);
>                         break;
> +               }
>         }
> +
> +       /* 3rd pass: detect PCI removal */
> +       LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
> +               if (!ctx->pci_found && ctx->yield[0]) {
> +                       DRV_LOG(DEBUG, "disassociating PCI device \"%s\" from
> NetVSC"
> +                               " interface \"%s\" (index %u)", ctx->yield, ctx->if_name,
> +                               ctx->if_index);
> +                       ctx->yield[0] = '\0';
> +               }
> +       }
> +
>         if (!vdev_netvsc_ctx_count)
>                 return;
>         ret = rte_eal_alarm_set(VDEV_NETVSC_PROBE_MS * 1000,
> --
> 2.27.0
  
Stephen Hemminger Sept. 6, 2020, 6:33 p.m. UTC | #3
On Sun, 6 Sep 2020 12:38:18 +0000
Matan Azrad <matan@nvidia.com> wrote:

> Hi Stephen
> 
> From: Stephen Hemminger:
> > The vdev_netvsc was not detecting when the associated PCI device
> > (SRIOV) was removed. Because of that it would keep feeding the same
> > (removed) device to failsafe PMD which would then unsuccessfully try and
> > probe for it.
> > 
> > Change to use a mark/sweep method to detect that PCI device was
> > removed, and also only tell failsafe about new PCI devices.
> > Vdev_netvsc does not have to keep stuffing the pipe with the same already
> > existing PCI device.  
> 
> As I know, the vdev_netvsc driver doesn't call to failsafe if the PCI device is not detected by the readlink command(considered as removed)...
> Am I missing something?

The original code is broken because ctx_yield is not cleared, it keeps sending the same value.
It looks like device removal and add was never tested.

If you test removal you will see that vdev_netvsc:
 1. Sends same PCI device repeatedly to failsafe (every alarm call)
    This is harmless, but useless.
 2. When device is removed, keeps doing #1
  
Matan Azrad Sept. 7, 2020, 8:09 a.m. UTC | #4
Hi Stephen

From: Stephen Hemminger:
> On Sun, 6 Sep 2020 12:38:18 +0000
> Matan Azrad <matan@nvidia.com> wrote:
> 
> > Hi Stephen
> >
> > From: Stephen Hemminger:
> > > The vdev_netvsc was not detecting when the associated PCI device
> > > (SRIOV) was removed. Because of that it would keep feeding the same
> > > (removed) device to failsafe PMD which would then unsuccessfully try
> > > and probe for it.
> > >
> > > Change to use a mark/sweep method to detect that PCI device was
> > > removed, and also only tell failsafe about new PCI devices.
> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
> > > already existing PCI device.
> >
> > As I know, the vdev_netvsc driver doesn't call to failsafe if the PCI device is
> not detected by the readlink command(considered as removed)...
> > Am I missing something?
> 
> The original code is broken because ctx_yield is not cleared, it keeps sending
> the same value.

Looking on the code again, It looks like ctx->yield has no effect on the next pipe write,
It is just used for log.

After the PCI interface matching to the netvsc interface, the pipe write is triggered only if the readlink commands success to see the plugged-in PCI device:
readlink /sys/class/net/[iface]/device/subsystem shows "pci"
readlink /sys/class/net/[iface]/device shows the pci device ID.

So, the assumption is when the above readlink failed on the interface the device is removed(plugged-out) and the fd write will not happen.

The code will continue to retry probe again and again until success only for plugged-in pci device matched the netvsc device.

> It looks like device removal and add was never tested.

This is basic test we have to test plug-in plug-out and it passed every day in the last years.

Maybe something new and special in your setup?

> If you test removal you will see that vdev_netvsc:
>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
>     This is harmless, but useless.
>  2. When device is removed, keeps doing #1
  
Long Li Sept. 15, 2020, 4:53 a.m. UTC | #5
>Subject: Re: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of
>associated pci device
>
>Hi Stephen
>
>From: Stephen Hemminger:
>> On Sun, 6 Sep 2020 12:38:18 +0000
>> Matan Azrad <matan@nvidia.com> wrote:
>>
>> > Hi Stephen
>> >
>> > From: Stephen Hemminger:
>> > > The vdev_netvsc was not detecting when the associated PCI device
>> > > (SRIOV) was removed. Because of that it would keep feeding the
>> > > same
>> > > (removed) device to failsafe PMD which would then unsuccessfully
>> > > try and probe for it.
>> > >
>> > > Change to use a mark/sweep method to detect that PCI device was
>> > > removed, and also only tell failsafe about new PCI devices.
>> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
>> > > already existing PCI device.
>> >
>> > As I know, the vdev_netvsc driver doesn't call to failsafe if the
>> > PCI device is
>> not detected by the readlink command(considered as removed)...
>> > Am I missing something?
>>
>> The original code is broken because ctx_yield is not cleared, it keeps
>> sending the same value.
>
>Looking on the code again, It looks like ctx->yield has no effect on the next
>pipe write, It is just used for log.
>
>After the PCI interface matching to the netvsc interface, the pipe write is
>triggered only if the readlink commands success to see the plugged-in PCI
>device:
>readlink /sys/class/net/[iface]/device/subsystem shows "pci"
>readlink /sys/class/net/[iface]/device shows the pci device ID.
>
>So, the assumption is when the above readlink failed on the interface the
>device is removed(plugged-out) and the fd write will not happen.
>
>The code will continue to retry probe again and again until success only for
>plugged-in pci device matched the netvsc device.

Hi Matan,

The original code keeps writing to pipe even it's the same PCI device. The new code writes to pipe for a new device, only once. See the following code:

+	/* Skip if this is same device already sent to failsafe */
+	if (strcmp(addr, ctx->yield) == 0)
+		return 0;

This patch also saves lots of CPU since it no longer writes to pipe all the time. You are correct about the code will continue to probe on a new PCI device. But someone has to do it to handle hot-add.

Thanks,
Long


>
>> It looks like device removal and add was never tested.
>
>This is basic test we have to test plug-in plug-out and it passed every day in
>the last years.
>
>Maybe something new and special in your setup?
>
>> If you test removal you will see that vdev_netvsc:
>>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
>>     This is harmless, but useless.
>>  2. When device is removed, keeps doing #1
  
Matan Azrad Sept. 15, 2020, 7 a.m. UTC | #6
Hi Li

From: Long Li <longli@microsoft.com>
> >Subject: Re: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of
> >associated pci device
> >
> >Hi Stephen
> >
> >From: Stephen Hemminger:
> >> On Sun, 6 Sep 2020 12:38:18 +0000
> >> Matan Azrad <matan@nvidia.com> wrote:
> >>
> >> > Hi Stephen
> >> >
> >> > From: Stephen Hemminger:
> >> > > The vdev_netvsc was not detecting when the associated PCI device
> >> > > (SRIOV) was removed. Because of that it would keep feeding the
> >> > > same
> >> > > (removed) device to failsafe PMD which would then unsuccessfully
> >> > > try and probe for it.
> >> > >
> >> > > Change to use a mark/sweep method to detect that PCI device was
> >> > > removed, and also only tell failsafe about new PCI devices.
> >> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
> >> > > already existing PCI device.
> >> >
> >> > As I know, the vdev_netvsc driver doesn't call to failsafe if the
> >> > PCI device is
> >> not detected by the readlink command(considered as removed)...
> >> > Am I missing something?
> >>
> >> The original code is broken because ctx_yield is not cleared, it
> >> keeps sending the same value.
> >
> >Looking on the code again, It looks like ctx->yield has no effect on
> >the next pipe write, It is just used for log.
> >
> >After the PCI interface matching to the netvsc interface, the pipe
> >write is triggered only if the readlink commands success to see the
> >plugged-in PCI
> >device:
> >readlink /sys/class/net/[iface]/device/subsystem shows "pci"
> >readlink /sys/class/net/[iface]/device shows the pci device ID.
> >
> >So, the assumption is when the above readlink failed on the interface
> >the device is removed(plugged-out) and the fd write will not happen.
> >
> >The code will continue to retry probe again and again until success
> >only for plugged-in pci device matched the netvsc device.
> 
> Hi Matan,
> 
> The original code keeps writing to pipe even it's the same PCI device.

Yes, the vdev_netvsc writes any plugged-in device to the associated netvsc device fd.

> The
> new code writes to pipe for a new device, only once. See the following code:
> 
> +	/* Skip if this is same device already sent to failsafe */
> +	if (strcmp(addr, ctx->yield) == 0)
> +		return 0;
> 

I understand you want to optimize the pipe writing to be written only after plugged-in hot event.

The current solution suffers from race: the PCI device may be plugged-out and plugged-in in short time shorter than the driver alarm delay, then the PCI device plugged-in detection will lost.

My suggestion:
Add validation to the plugged-in device probing state and that it is owned by failsafe(using ownership API) - don't write the pipe if so.

Matan



> This patch also saves lots of CPU since it no longer writes to pipe all the time.
> You are correct about the code will continue to probe on a new PCI device.
> But someone has to do it to handle hot-add.
> 
> Thanks,
> Long
> 
> 
> >
> >> It looks like device removal and add was never tested.
> >
> >This is basic test we have to test plug-in plug-out and it passed every
> >day in the last years.
> >
> >Maybe something new and special in your setup?
> >
> >> If you test removal you will see that vdev_netvsc:
> >>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
> >>     This is harmless, but useless.
> >>  2. When device is removed, keeps doing #1
  
Long Li Sept. 25, 2020, 8:30 p.m. UTC | #7
HI Matan,

While troubleshooting a failure in DPDK on device removal when VF device briefly disappears and comes back, I notice the failsafe driver is trying repeatedly to start a sub device (after this sub device has been successfully configured, but later hot removed from the kernel). This is due to repeated alarms calling fs_dev_start(). I trace into this commit:

ae80146 net/failsafe: fix removed device handling

The implementation of fs_err() is interesting:

+fs_err(struct sub_device *sdev, int err)
+{
+       /* A device removal shouldn't be reported as an error. */
+       if (sdev->remove == 1 || err == -EIO)
+               return rte_errno = 0;
+       return err;
+}

If I change this function to:
@@ -497,7 +497,7 @@ int failsafe_eth_new_event_callback(uint16_t port_id
 fs_err(struct sub_device *sdev, int err)
 {
        /* A device removal shouldn't be reported as an error. */
-       if (sdev->remove == 1 || err == -EIO)
+       if (sdev->remove == 1 && err == -EIO)
                return rte_errno = 0;
        return err;
 }

The hung is going away. I don't know the reason why we use a || in the if(). If a call to rte_eth_dev_start() returning EIO (as the case in fs_dev_start), the best choice would be bail out and fail this sub device.

Can you please take a look?

Thanks,
Long
  
Thomas Monjalon Oct. 19, 2020, 10:33 p.m. UTC | #8
Gaetan, Matan,
Please could you check?


25/09/2020 22:30, Long Li:
> HI Matan,
> 
> While troubleshooting a failure in DPDK on device removal when VF device briefly disappears and comes back, I notice the failsafe driver is trying repeatedly to start a sub device (after this sub device has been successfully configured, but later hot removed from the kernel). This is due to repeated alarms calling fs_dev_start(). I trace into this commit:
> 
> ae80146 net/failsafe: fix removed device handling
> 
> The implementation of fs_err() is interesting:
> 
> +fs_err(struct sub_device *sdev, int err)
> +{
> +       /* A device removal shouldn't be reported as an error. */
> +       if (sdev->remove == 1 || err == -EIO)
> +               return rte_errno = 0;
> +       return err;
> +}
> 
> If I change this function to:
> @@ -497,7 +497,7 @@ int failsafe_eth_new_event_callback(uint16_t port_id
>  fs_err(struct sub_device *sdev, int err)
>  {
>         /* A device removal shouldn't be reported as an error. */
> -       if (sdev->remove == 1 || err == -EIO)
> +       if (sdev->remove == 1 && err == -EIO)
>                 return rte_errno = 0;
>         return err;
>  }
> 
> The hung is going away. I don't know the reason why we use a || in the if(). If a call to rte_eth_dev_start() returning EIO (as the case in fs_dev_start), the best choice would be bail out and fail this sub device.
> 
> Can you please take a look?
> 
> Thanks,
> Long
> 
> ________________________________
> From: Matan Azrad <matan@nvidia.com>
> Sent: Tuesday, September 15, 2020 12:00 AM
> To: Long Li <longli@microsoft.com>; Stephen Hemminger <stephen@networkplumber.org>
> Cc: matan@mellanox.com <matan@mellanox.com>; grive@u246.net <grive@u246.net>; dev@dpdk.org <dev@dpdk.org>; Raslan Darawsheh <rasland@nvidia.com>
> Subject: RE: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of associated pci device
> 
> Hi Li
> 
> From: Long Li <longli@microsoft.com>
> > >Subject: Re: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of
> > >associated pci device
> > >
> > >Hi Stephen
> > >
> > >From: Stephen Hemminger:
> > >> On Sun, 6 Sep 2020 12:38:18 +0000
> > >> Matan Azrad <matan@nvidia.com> wrote:
> > >>
> > >> > Hi Stephen
> > >> >
> > >> > From: Stephen Hemminger:
> > >> > > The vdev_netvsc was not detecting when the associated PCI device
> > >> > > (SRIOV) was removed. Because of that it would keep feeding the
> > >> > > same
> > >> > > (removed) device to failsafe PMD which would then unsuccessfully
> > >> > > try and probe for it.
> > >> > >
> > >> > > Change to use a mark/sweep method to detect that PCI device was
> > >> > > removed, and also only tell failsafe about new PCI devices.
> > >> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
> > >> > > already existing PCI device.
> > >> >
> > >> > As I know, the vdev_netvsc driver doesn't call to failsafe if the
> > >> > PCI device is
> > >> not detected by the readlink command(considered as removed)...
> > >> > Am I missing something?
> > >>
> > >> The original code is broken because ctx_yield is not cleared, it
> > >> keeps sending the same value.
> > >
> > >Looking on the code again, It looks like ctx->yield has no effect on
> > >the next pipe write, It is just used for log.
> > >
> > >After the PCI interface matching to the netvsc interface, the pipe
> > >write is triggered only if the readlink commands success to see the
> > >plugged-in PCI
> > >device:
> > >readlink /sys/class/net/[iface]/device/subsystem shows "pci"
> > >readlink /sys/class/net/[iface]/device shows the pci device ID.
> > >
> > >So, the assumption is when the above readlink failed on the interface
> > >the device is removed(plugged-out) and the fd write will not happen.
> > >
> > >The code will continue to retry probe again and again until success
> > >only for plugged-in pci device matched the netvsc device.
> >
> > Hi Matan,
> >
> > The original code keeps writing to pipe even it's the same PCI device.
> 
> Yes, the vdev_netvsc writes any plugged-in device to the associated netvsc device fd.
> 
> > The
> > new code writes to pipe for a new device, only once. See the following code:
> >
> > +     /* Skip if this is same device already sent to failsafe */
> > +     if (strcmp(addr, ctx->yield) == 0)
> > +             return 0;
> >
> 
> I understand you want to optimize the pipe writing to be written only after plugged-in hot event.
> 
> The current solution suffers from race: the PCI device may be plugged-out and plugged-in in short time shorter than the driver alarm delay, then the PCI device plugged-in detection will lost.
> 
> My suggestion:
> Add validation to the plugged-in device probing state and that it is owned by failsafe(using ownership API) - don't write the pipe if so.
> 
> Matan
> 
> 
> 
> > This patch also saves lots of CPU since it no longer writes to pipe all the time.
> > You are correct about the code will continue to probe on a new PCI device.
> > But someone has to do it to handle hot-add.
> >
> > Thanks,
> > Long
> >
> >
> > >
> > >> It looks like device removal and add was never tested.
> > >
> > >This is basic test we have to test plug-in plug-out and it passed every
> > >day in the last years.
> > >
> > >Maybe something new and special in your setup?
> > >
> > >> If you test removal you will see that vdev_netvsc:
> > >>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
> > >>     This is harmless, but useless.
> > >>  2. When device is removed, keeps doing #1
>
  
Thomas Monjalon Oct. 19, 2020, 10:36 p.m. UTC | #9
Fixing Gaetan's address

20/10/2020 00:33, Thomas Monjalon:
> Gaetan, Matan,
> Please could you check?
> 
> 
> 25/09/2020 22:30, Long Li:
> > HI Matan,
> > 
> > While troubleshooting a failure in DPDK on device removal when VF device briefly disappears and comes back, I notice the failsafe driver is trying repeatedly to start a sub device (after this sub device has been successfully configured, but later hot removed from the kernel). This is due to repeated alarms calling fs_dev_start(). I trace into this commit:
> > 
> > ae80146 net/failsafe: fix removed device handling
> > 
> > The implementation of fs_err() is interesting:
> > 
> > +fs_err(struct sub_device *sdev, int err)
> > +{
> > +       /* A device removal shouldn't be reported as an error. */
> > +       if (sdev->remove == 1 || err == -EIO)
> > +               return rte_errno = 0;
> > +       return err;
> > +}
> > 
> > If I change this function to:
> > @@ -497,7 +497,7 @@ int failsafe_eth_new_event_callback(uint16_t port_id
> >  fs_err(struct sub_device *sdev, int err)
> >  {
> >         /* A device removal shouldn't be reported as an error. */
> > -       if (sdev->remove == 1 || err == -EIO)
> > +       if (sdev->remove == 1 && err == -EIO)
> >                 return rte_errno = 0;
> >         return err;
> >  }
> > 
> > The hung is going away. I don't know the reason why we use a || in the if(). If a call to rte_eth_dev_start() returning EIO (as the case in fs_dev_start), the best choice would be bail out and fail this sub device.
> > 
> > Can you please take a look?
> > 
> > Thanks,
> > Long
> > 
> > ________________________________
> > From: Matan Azrad <matan@nvidia.com>
> > Sent: Tuesday, September 15, 2020 12:00 AM
> > To: Long Li <longli@microsoft.com>; Stephen Hemminger <stephen@networkplumber.org>
> > Cc: matan@mellanox.com <matan@mellanox.com>; grive@u246.net <grive@u246.net>; dev@dpdk.org <dev@dpdk.org>; Raslan Darawsheh <rasland@nvidia.com>
> > Subject: RE: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of associated pci device
> > 
> > Hi Li
> > 
> > From: Long Li <longli@microsoft.com>
> > > >Subject: Re: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of
> > > >associated pci device
> > > >
> > > >Hi Stephen
> > > >
> > > >From: Stephen Hemminger:
> > > >> On Sun, 6 Sep 2020 12:38:18 +0000
> > > >> Matan Azrad <matan@nvidia.com> wrote:
> > > >>
> > > >> > Hi Stephen
> > > >> >
> > > >> > From: Stephen Hemminger:
> > > >> > > The vdev_netvsc was not detecting when the associated PCI device
> > > >> > > (SRIOV) was removed. Because of that it would keep feeding the
> > > >> > > same
> > > >> > > (removed) device to failsafe PMD which would then unsuccessfully
> > > >> > > try and probe for it.
> > > >> > >
> > > >> > > Change to use a mark/sweep method to detect that PCI device was
> > > >> > > removed, and also only tell failsafe about new PCI devices.
> > > >> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
> > > >> > > already existing PCI device.
> > > >> >
> > > >> > As I know, the vdev_netvsc driver doesn't call to failsafe if the
> > > >> > PCI device is
> > > >> not detected by the readlink command(considered as removed)...
> > > >> > Am I missing something?
> > > >>
> > > >> The original code is broken because ctx_yield is not cleared, it
> > > >> keeps sending the same value.
> > > >
> > > >Looking on the code again, It looks like ctx->yield has no effect on
> > > >the next pipe write, It is just used for log.
> > > >
> > > >After the PCI interface matching to the netvsc interface, the pipe
> > > >write is triggered only if the readlink commands success to see the
> > > >plugged-in PCI
> > > >device:
> > > >readlink /sys/class/net/[iface]/device/subsystem shows "pci"
> > > >readlink /sys/class/net/[iface]/device shows the pci device ID.
> > > >
> > > >So, the assumption is when the above readlink failed on the interface
> > > >the device is removed(plugged-out) and the fd write will not happen.
> > > >
> > > >The code will continue to retry probe again and again until success
> > > >only for plugged-in pci device matched the netvsc device.
> > >
> > > Hi Matan,
> > >
> > > The original code keeps writing to pipe even it's the same PCI device.
> > 
> > Yes, the vdev_netvsc writes any plugged-in device to the associated netvsc device fd.
> > 
> > > The
> > > new code writes to pipe for a new device, only once. See the following code:
> > >
> > > +     /* Skip if this is same device already sent to failsafe */
> > > +     if (strcmp(addr, ctx->yield) == 0)
> > > +             return 0;
> > >
> > 
> > I understand you want to optimize the pipe writing to be written only after plugged-in hot event.
> > 
> > The current solution suffers from race: the PCI device may be plugged-out and plugged-in in short time shorter than the driver alarm delay, then the PCI device plugged-in detection will lost.
> > 
> > My suggestion:
> > Add validation to the plugged-in device probing state and that it is owned by failsafe(using ownership API) - don't write the pipe if so.
> > 
> > Matan
> > 
> > 
> > 
> > > This patch also saves lots of CPU since it no longer writes to pipe all the time.
> > > You are correct about the code will continue to probe on a new PCI device.
> > > But someone has to do it to handle hot-add.
> > >
> > > Thanks,
> > > Long
> > >
> > >
> > > >
> > > >> It looks like device removal and add was never tested.
> > > >
> > > >This is basic test we have to test plug-in plug-out and it passed every
> > > >day in the last years.
> > > >
> > > >Maybe something new and special in your setup?
> > > >
> > > >> If you test removal you will see that vdev_netvsc:
> > > >>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
> > > >>     This is harmless, but useless.
> > > >>  2. When device is removed, keeps doing #1
> > 
> 
> 
>
  
Gaëtan Rivet Oct. 20, 2020, 9:13 a.m. UTC | #10
Hi Thomas,

This issue has already been fixed, see:
http://mails.dpdk.org/archives/dev/2020-October/185921.html

It has been integrated, Long was able to test it and confirm it fixed
this issue.

On 20/10/20 00:36 +0200, Thomas Monjalon wrote:
> Fixing Gaetan's address
> 
> 20/10/2020 00:33, Thomas Monjalon:
> > Gaetan, Matan,
> > Please could you check?
> > 
> > 
> > 25/09/2020 22:30, Long Li:
> > > HI Matan,
> > > 
> > > While troubleshooting a failure in DPDK on device removal when VF device briefly disappears and comes back, I notice the failsafe driver is trying repeatedly to start a sub device (after this sub device has been successfully configured, but later hot removed from the kernel). This is due to repeated alarms calling fs_dev_start(). I trace into this commit:
> > > 
> > > ae80146 net/failsafe: fix removed device handling
> > > 
> > > The implementation of fs_err() is interesting:
> > > 
> > > +fs_err(struct sub_device *sdev, int err)
> > > +{
> > > +       /* A device removal shouldn't be reported as an error. */
> > > +       if (sdev->remove == 1 || err == -EIO)
> > > +               return rte_errno = 0;
> > > +       return err;
> > > +}
> > > 
> > > If I change this function to:
> > > @@ -497,7 +497,7 @@ int failsafe_eth_new_event_callback(uint16_t port_id
> > >  fs_err(struct sub_device *sdev, int err)
> > >  {
> > >         /* A device removal shouldn't be reported as an error. */
> > > -       if (sdev->remove == 1 || err == -EIO)
> > > +       if (sdev->remove == 1 && err == -EIO)
> > >                 return rte_errno = 0;
> > >         return err;
> > >  }
> > > 
> > > The hung is going away. I don't know the reason why we use a || in the if(). If a call to rte_eth_dev_start() returning EIO (as the case in fs_dev_start), the best choice would be bail out and fail this sub device.
> > > 
> > > Can you please take a look?
> > > 
> > > Thanks,
> > > Long
> > > 
> > > ________________________________
> > > From: Matan Azrad <matan@nvidia.com>
> > > Sent: Tuesday, September 15, 2020 12:00 AM
> > > To: Long Li <longli@microsoft.com>; Stephen Hemminger <stephen@networkplumber.org>
> > > Cc: matan@mellanox.com <matan@mellanox.com>; grive@u246.net <grive@u246.net>; dev@dpdk.org <dev@dpdk.org>; Raslan Darawsheh <rasland@nvidia.com>
> > > Subject: RE: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of associated pci device
> > > 
> > > Hi Li
> > > 
> > > From: Long Li <longli@microsoft.com>
> > > > >Subject: Re: [dpdk-dev] [PATCH] net/vdev_netvsc: handle removal of
> > > > >associated pci device
> > > > >
> > > > >Hi Stephen
> > > > >
> > > > >From: Stephen Hemminger:
> > > > >> On Sun, 6 Sep 2020 12:38:18 +0000
> > > > >> Matan Azrad <matan@nvidia.com> wrote:
> > > > >>
> > > > >> > Hi Stephen
> > > > >> >
> > > > >> > From: Stephen Hemminger:
> > > > >> > > The vdev_netvsc was not detecting when the associated PCI device
> > > > >> > > (SRIOV) was removed. Because of that it would keep feeding the
> > > > >> > > same
> > > > >> > > (removed) device to failsafe PMD which would then unsuccessfully
> > > > >> > > try and probe for it.
> > > > >> > >
> > > > >> > > Change to use a mark/sweep method to detect that PCI device was
> > > > >> > > removed, and also only tell failsafe about new PCI devices.
> > > > >> > > Vdev_netvsc does not have to keep stuffing the pipe with the same
> > > > >> > > already existing PCI device.
> > > > >> >
> > > > >> > As I know, the vdev_netvsc driver doesn't call to failsafe if the
> > > > >> > PCI device is
> > > > >> not detected by the readlink command(considered as removed)...
> > > > >> > Am I missing something?
> > > > >>
> > > > >> The original code is broken because ctx_yield is not cleared, it
> > > > >> keeps sending the same value.
> > > > >
> > > > >Looking on the code again, It looks like ctx->yield has no effect on
> > > > >the next pipe write, It is just used for log.
> > > > >
> > > > >After the PCI interface matching to the netvsc interface, the pipe
> > > > >write is triggered only if the readlink commands success to see the
> > > > >plugged-in PCI
> > > > >device:
> > > > >readlink /sys/class/net/[iface]/device/subsystem shows "pci"
> > > > >readlink /sys/class/net/[iface]/device shows the pci device ID.
> > > > >
> > > > >So, the assumption is when the above readlink failed on the interface
> > > > >the device is removed(plugged-out) and the fd write will not happen.
> > > > >
> > > > >The code will continue to retry probe again and again until success
> > > > >only for plugged-in pci device matched the netvsc device.
> > > >
> > > > Hi Matan,
> > > >
> > > > The original code keeps writing to pipe even it's the same PCI device.
> > > 
> > > Yes, the vdev_netvsc writes any plugged-in device to the associated netvsc device fd.
> > > 
> > > > The
> > > > new code writes to pipe for a new device, only once. See the following code:
> > > >
> > > > +     /* Skip if this is same device already sent to failsafe */
> > > > +     if (strcmp(addr, ctx->yield) == 0)
> > > > +             return 0;
> > > >
> > > 
> > > I understand you want to optimize the pipe writing to be written only after plugged-in hot event.
> > > 
> > > The current solution suffers from race: the PCI device may be plugged-out and plugged-in in short time shorter than the driver alarm delay, then the PCI device plugged-in detection will lost.
> > > 
> > > My suggestion:
> > > Add validation to the plugged-in device probing state and that it is owned by failsafe(using ownership API) - don't write the pipe if so.
> > > 
> > > Matan
> > > 
> > > 
> > > 
> > > > This patch also saves lots of CPU since it no longer writes to pipe all the time.
> > > > You are correct about the code will continue to probe on a new PCI device.
> > > > But someone has to do it to handle hot-add.
> > > >
> > > > Thanks,
> > > > Long
> > > >
> > > >
> > > > >
> > > > >> It looks like device removal and add was never tested.
> > > > >
> > > > >This is basic test we have to test plug-in plug-out and it passed every
> > > > >day in the last years.
> > > > >
> > > > >Maybe something new and special in your setup?
> > > > >
> > > > >> If you test removal you will see that vdev_netvsc:
> > > > >>  1. Sends same PCI device repeatedly to failsafe (every alarm call)
> > > > >>     This is harmless, but useless.
> > > > >>  2. When device is removed, keeps doing #1
> > > 
> > 
> > 
> > 
> 
> 
> 
> 
>
  

Patch

diff --git a/drivers/net/vdev_netvsc/vdev_netvsc.c b/drivers/net/vdev_netvsc/vdev_netvsc.c
index 1ecb0b3e6a34..80a3158012ce 100644
--- a/drivers/net/vdev_netvsc/vdev_netvsc.c
+++ b/drivers/net/vdev_netvsc/vdev_netvsc.c
@@ -66,6 +66,7 @@  struct vdev_netvsc_ctx {
 	char devargs[256];		   /**< Fail-safe device arguments. */
 	char if_name[IF_NAMESIZE];	   /**< NetVSC netdevice name. */
 	unsigned int if_index;		   /**< NetVSC netdevice index. */
+	int pci_found;			   /**< Device detection */
 	struct rte_ether_addr if_addr;	   /**< NetVSC MAC address. */
 	int pipe[2];			   /**< Fail-safe communication pipe. */
 	char yield[256];		   /**< PCI sub-device arguments. */
@@ -405,11 +406,18 @@  vdev_netvsc_device_probe(const struct if_nameindex *iface,
 	len = strlen(addr);
 	if (!len)
 		return 0;
+
+	ctx->pci_found = 1;
+
+	/* Skip if this is same device already sent to failsafe */
+	if (strcmp(addr, ctx->yield) == 0)
+		return 0;
+
+	DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
+		" interface \"%s\" (index %u)", addr, ctx->if_name,
+		ctx->if_index);
+
 	/* Send PCI device argument to fail-safe PMD instance. */
-	if (strcmp(addr, ctx->yield))
-		DRV_LOG(DEBUG, "associating PCI device \"%s\" with NetVSC"
-			" interface \"%s\" (index %u)", addr, ctx->if_name,
-			ctx->if_index);
 	memmove(buf, addr, len + 1);
 	addr = buf;
 	buf[len] = '\n';
@@ -452,12 +460,33 @@  vdev_netvsc_alarm(__rte_unused void *arg)
 	struct vdev_netvsc_ctx *ctx;
 	int ret;
 
+	/* 1st pass: clear PCI flag on all devices */
+	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
+		ctx->pci_found = 0;
+	}
+
+	/* 2nd pass: scan all system devices to look for changes to this device */
 	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
 		ret = vdev_netvsc_foreach_iface(vdev_netvsc_device_probe, 0,
 		      ctx);
-		if (ret < 0)
+
+		if (ret < 0) {
+			DRV_LOG(NOTICE, "can not scan devices for %s\n",
+				ctx->if_name);
 			break;
+		}
 	}
+
+	/* 3rd pass: detect PCI removal */
+	LIST_FOREACH(ctx, &vdev_netvsc_ctx_list, entry) {
+		if (!ctx->pci_found && ctx->yield[0]) {
+			DRV_LOG(DEBUG, "disassociating PCI device \"%s\" from NetVSC"
+				" interface \"%s\" (index %u)", ctx->yield, ctx->if_name,
+				ctx->if_index);
+			ctx->yield[0] = '\0';
+		}
+	}
+
 	if (!vdev_netvsc_ctx_count)
 		return;
 	ret = rte_eal_alarm_set(VDEV_NETVSC_PROBE_MS * 1000,