[v3,13/13] vdpa/mlx5: disable ROCE

Message ID 1580659433-25581-14-git-send-email-matan@mellanox.com (mailing list archive)
State Accepted, archived
Delegated to: Maxime Coquelin
Headers
Series Introduce mlx5 vDPA driver |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply issues

Commit Message

Matan Azrad Feb. 2, 2020, 4:03 p.m. UTC
  In order to support virtio queue creation by the FW, ROCE mode
should be disabled in the device.

Do it via Netlink, which is equivalent to the following devlink tool commands:
	1. devlink dev param set pci/[pci] name enable_roce value false
	   cmode driverinit
	2. devlink dev reload pci/[pci]
Or via sysfs, which is equivalent to:
	echo 0 > /sys/bus/pci/devices/[pci]/roce_enable

The IB device is matched again after ROCE has been disabled.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 drivers/vdpa/mlx5/Makefile    |   2 +-
 drivers/vdpa/mlx5/meson.build |   2 +-
 drivers/vdpa/mlx5/mlx5_vdpa.c | 191 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 160 insertions(+), 35 deletions(-)
  

Comments

Maxime Coquelin Feb. 3, 2020, 9:27 a.m. UTC | #1
Hi Matan,

On 2/2/20 5:03 PM, Matan Azrad wrote:
> In order to support virtio queue creation by the FW, ROCE mode
> should be disabled in the device.
> 
> Do it by netlink which is like the devlink tool commands:
> 	1. devlink dev param set pci/[pci] name enable_roce value false
> 	   cmode driverinit
>     	2. devlink dev reload pci/[pci]
> Or by sysfs which is like:
> 	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable
> 
> The IB device is matched again after ROCE disabling.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  drivers/vdpa/mlx5/Makefile    |   2 +-
>  drivers/vdpa/mlx5/meson.build |   2 +-
>  drivers/vdpa/mlx5/mlx5_vdpa.c | 191 ++++++++++++++++++++++++++++++++++--------
>  3 files changed, 160 insertions(+), 35 deletions(-)
...
> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
> index 57619d2..710f305 100644
> --- a/drivers/vdpa/mlx5/mlx5_vdpa.c
> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.c

...

> @@ -246,8 +389,7 @@
>  mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		    struct rte_pci_device *pci_dev __rte_unused)
>  {
> -	struct ibv_device **ibv_list;
> -	struct ibv_device *ibv_match = NULL;
> +	struct ibv_device *ibv;
>  	struct mlx5_vdpa_priv *priv = NULL;
>  	struct ibv_context *ctx = NULL;
>  	struct mlx5_hca_attr attr;
> @@ -258,42 +400,25 @@
>  			" driver.");
>  		return 1;
>  	}
> -	errno = 0;
> -	ibv_list = mlx5_glue->get_device_list(&ret);
> -	if (!ibv_list) {
> -		rte_errno = ENOSYS;
> -		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?");
> +	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
> +	if (!ibv) {
> +		DRV_LOG(ERR, "No matching IB device for PCI slot "
> +			PCI_PRI_FMT ".", pci_dev->addr.domain,
> +			pci_dev->addr.bus, pci_dev->addr.devid,
> +			pci_dev->addr.function);
>  		return -rte_errno;
> -	}
> -	while (ret-- > 0) {
> -		struct rte_pci_addr pci_addr;
> -
> -		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name);
> -		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr))
> -			continue;
> -		if (pci_dev->addr.domain != pci_addr.domain ||
> -		    pci_dev->addr.bus != pci_addr.bus ||
> -		    pci_dev->addr.devid != pci_addr.devid ||
> -		    pci_dev->addr.function != pci_addr.function)
> -			continue;
> +	} else {
>  		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
> -			ibv_list[ret]->name);
> -		ibv_match = ibv_list[ret];
> -		break;
> +			ibv->name);
>  	}
> -	mlx5_glue->free_device_list(ibv_list);
> -	if (!ibv_match) {
> -		DRV_LOG(ERR, "No matching IB device for PCI slot "
> -			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
> -			pci_dev->addr.domain, pci_dev->addr.bus,
> -			pci_dev->addr.devid, pci_dev->addr.function);
> -		rte_errno = ENOENT;
> -		return -rte_errno;
> +	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
> +		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
> +			ibv->name);
> +		//return -rte_errno;
>  	}

Is that commented return expected?
  
Maxime Coquelin Feb. 3, 2020, 11 a.m. UTC | #2
On 2/3/20 10:27 AM, Maxime Coquelin wrote:
> Hi Matan,
> 
> On 2/2/20 5:03 PM, Matan Azrad wrote:
>> In order to support virtio queue creation by the FW, ROCE mode
>> should be disabled in the device.
>>
>> Do it by netlink which is like the devlink tool commands:
>> 	1. devlink dev param set pci/[pci] name enable_roce value false
>> 	   cmode driverinit
>>     	2. devlink dev reload pci/[pci]
>> Or by sysfs which is like:
>> 	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable
>>
>> The IB device is matched again after ROCE disabling.
>>
>> Signed-off-by: Matan Azrad <matan@mellanox.com>
>> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
>> Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>  drivers/vdpa/mlx5/Makefile    |   2 +-
>>  drivers/vdpa/mlx5/meson.build |   2 +-
>>  drivers/vdpa/mlx5/mlx5_vdpa.c | 191 ++++++++++++++++++++++++++++++++++--------
>>  3 files changed, 160 insertions(+), 35 deletions(-)
> ...
>> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
>> index 57619d2..710f305 100644
>> --- a/drivers/vdpa/mlx5/mlx5_vdpa.c
>> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
> 
> ...
> 
>> @@ -246,8 +389,7 @@
>>  mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>>  		    struct rte_pci_device *pci_dev __rte_unused)
>>  {
>> -	struct ibv_device **ibv_list;
>> -	struct ibv_device *ibv_match = NULL;
>> +	struct ibv_device *ibv;
>>  	struct mlx5_vdpa_priv *priv = NULL;
>>  	struct ibv_context *ctx = NULL;
>>  	struct mlx5_hca_attr attr;
>> @@ -258,42 +400,25 @@
>>  			" driver.");
>>  		return 1;
>>  	}
>> -	errno = 0;
>> -	ibv_list = mlx5_glue->get_device_list(&ret);
>> -	if (!ibv_list) {
>> -		rte_errno = ENOSYS;
>> -		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?");
>> +	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
>> +	if (!ibv) {
>> +		DRV_LOG(ERR, "No matching IB device for PCI slot "
>> +			PCI_PRI_FMT ".", pci_dev->addr.domain,
>> +			pci_dev->addr.bus, pci_dev->addr.devid,
>> +			pci_dev->addr.function);
>>  		return -rte_errno;
>> -	}
>> -	while (ret-- > 0) {
>> -		struct rte_pci_addr pci_addr;
>> -
>> -		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name);
>> -		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr))
>> -			continue;
>> -		if (pci_dev->addr.domain != pci_addr.domain ||
>> -		    pci_dev->addr.bus != pci_addr.bus ||
>> -		    pci_dev->addr.devid != pci_addr.devid ||
>> -		    pci_dev->addr.function != pci_addr.function)
>> -			continue;
>> +	} else {
>>  		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
>> -			ibv_list[ret]->name);
>> -		ibv_match = ibv_list[ret];
>> -		break;
>> +			ibv->name);
>>  	}
>> -	mlx5_glue->free_device_list(ibv_list);
>> -	if (!ibv_match) {
>> -		DRV_LOG(ERR, "No matching IB device for PCI slot "
>> -			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
>> -			pci_dev->addr.domain, pci_dev->addr.bus,
>> -			pci_dev->addr.devid, pci_dev->addr.function);
>> -		rte_errno = ENOENT;
>> -		return -rte_errno;
>> +	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
>> +		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
>> +			ibv->name);
>> +		//return -rte_errno;
>>  	}
> 
> Is that commented return expected?
> 

Please let me know if I should remove the comment, or remove the return.

Thanks,
Maxime
  
Matan Azrad Feb. 3, 2020, 12:44 p.m. UTC | #3
From: Maxime Coquelin
> Sent: Monday, February 3, 2020 1:00 PM
> To: Matan Azrad <matan@mellanox.com>; dev@dpdk.org; Slava Ovsiienko
> <viacheslavo@mellanox.com>
> Subject: Re: [PATCH v3 13/13] vdpa/mlx5: disable ROCE
> 
> 
> 
> On 2/3/20 10:27 AM, Maxime Coquelin wrote:
> > Hi Matan,
> >
> > On 2/2/20 5:03 PM, Matan Azrad wrote:
> >> In order to support virtio queue creation by the FW, ROCE mode should
> >> be disabled in the device.
> >>
> >> Do it by netlink which is like the devlink tool commands:
> >> 	1. devlink dev param set pci/[pci] name enable_roce value false
> >> 	   cmode driverinit
> >>     	2. devlink dev reload pci/[pci]
> >> Or by sysfs which is like:
> >> 	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable
> >>
> >> The IB device is matched again after ROCE disabling.
> >>
> >> Signed-off-by: Matan Azrad <matan@mellanox.com>
> >> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> >> Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> ---
> >>  drivers/vdpa/mlx5/Makefile    |   2 +-
> >>  drivers/vdpa/mlx5/meson.build |   2 +-
> >>  drivers/vdpa/mlx5/mlx5_vdpa.c | 191
> >> ++++++++++++++++++++++++++++++++++--------
> >>  3 files changed, 160 insertions(+), 35 deletions(-)
> > ...
> >> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c
> >> b/drivers/vdpa/mlx5/mlx5_vdpa.c index 57619d2..710f305 100644
> >> --- a/drivers/vdpa/mlx5/mlx5_vdpa.c
> >> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
> >
> > ...
> >
> >> @@ -246,8 +389,7 @@
> >>  mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> >>  		    struct rte_pci_device *pci_dev __rte_unused)  {
> >> -	struct ibv_device **ibv_list;
> >> -	struct ibv_device *ibv_match = NULL;
> >> +	struct ibv_device *ibv;
> >>  	struct mlx5_vdpa_priv *priv = NULL;
> >>  	struct ibv_context *ctx = NULL;
> >>  	struct mlx5_hca_attr attr;
> >> @@ -258,42 +400,25 @@
> >>  			" driver.");
> >>  		return 1;
> >>  	}
> >> -	errno = 0;
> >> -	ibv_list = mlx5_glue->get_device_list(&ret);
> >> -	if (!ibv_list) {
> >> -		rte_errno = ENOSYS;
> >> -		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs
> loaded?");
> >> +	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
> >> +	if (!ibv) {
> >> +		DRV_LOG(ERR, "No matching IB device for PCI slot "
> >> +			PCI_PRI_FMT ".", pci_dev->addr.domain,
> >> +			pci_dev->addr.bus, pci_dev->addr.devid,
> >> +			pci_dev->addr.function);
> >>  		return -rte_errno;
> >> -	}
> >> -	while (ret-- > 0) {
> >> -		struct rte_pci_addr pci_addr;
> >> -
> >> -		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]-
> >name);
> >> -		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path,
> &pci_addr))
> >> -			continue;
> >> -		if (pci_dev->addr.domain != pci_addr.domain ||
> >> -		    pci_dev->addr.bus != pci_addr.bus ||
> >> -		    pci_dev->addr.devid != pci_addr.devid ||
> >> -		    pci_dev->addr.function != pci_addr.function)
> >> -			continue;
> >> +	} else {
> >>  		DRV_LOG(INFO, "PCI information matches for device
> \"%s\".",
> >> -			ibv_list[ret]->name);
> >> -		ibv_match = ibv_list[ret];
> >> -		break;
> >> +			ibv->name);
> >>  	}
> >> -	mlx5_glue->free_device_list(ibv_list);
> >> -	if (!ibv_match) {
> >> -		DRV_LOG(ERR, "No matching IB device for PCI slot "
> >> -			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
> >> -			pci_dev->addr.domain, pci_dev->addr.bus,
> >> -			pci_dev->addr.devid, pci_dev->addr.function);
> >> -		rte_errno = ENOENT;
> >> -		return -rte_errno;
> >> +	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
> >> +		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
> >> +			ibv->name);
> >> +		//return -rte_errno;
> >>  	}
> >
> > Is that commented return expected?
> >
> 
> Please let me know if I should remove the comment, or remove the return.

Sorry, I forgot to remove the comment marker, good catch!
The return should not be commented out; only the "//" needs to be removed.

Can you do it in integration?

> 
> Thanks,
> Maxime
  
Maxime Coquelin Feb. 3, 2020, 12:45 p.m. UTC | #4
On 2/3/20 1:44 PM, Matan Azrad wrote:
> 
> 
> From: Maxime Coquelin
>> Sent: Monday, February 3, 2020 1:00 PM
>> To: Matan Azrad <matan@mellanox.com>; dev@dpdk.org; Slava Ovsiienko
>> <viacheslavo@mellanox.com>
>> Subject: Re: [PATCH v3 13/13] vdpa/mlx5: disable ROCE
>>
>>
>>
>> On 2/3/20 10:27 AM, Maxime Coquelin wrote:
>>> Hi Matan,
>>>
>>> On 2/2/20 5:03 PM, Matan Azrad wrote:
>>>> In order to support virtio queue creation by the FW, ROCE mode should
>>>> be disabled in the device.
>>>>
>>>> Do it by netlink which is like the devlink tool commands:
>>>> 	1. devlink dev param set pci/[pci] name enable_roce value false
>>>> 	   cmode driverinit
>>>>     	2. devlink dev reload pci/[pci]
>>>> Or by sysfs which is like:
>>>> 	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable
>>>>
>>>> The IB device is matched again after ROCE disabling.
>>>>
>>>> Signed-off-by: Matan Azrad <matan@mellanox.com>
>>>> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
>>>> Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> ---
>>>>  drivers/vdpa/mlx5/Makefile    |   2 +-
>>>>  drivers/vdpa/mlx5/meson.build |   2 +-
>>>>  drivers/vdpa/mlx5/mlx5_vdpa.c | 191
>>>> ++++++++++++++++++++++++++++++++++--------
>>>>  3 files changed, 160 insertions(+), 35 deletions(-)
>>> ...
>>>> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c
>>>> b/drivers/vdpa/mlx5/mlx5_vdpa.c index 57619d2..710f305 100644
>>>> --- a/drivers/vdpa/mlx5/mlx5_vdpa.c
>>>> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
>>>
>>> ...
>>>
>>>> @@ -246,8 +389,7 @@
>>>>  mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>>>>  		    struct rte_pci_device *pci_dev __rte_unused)  {
>>>> -	struct ibv_device **ibv_list;
>>>> -	struct ibv_device *ibv_match = NULL;
>>>> +	struct ibv_device *ibv;
>>>>  	struct mlx5_vdpa_priv *priv = NULL;
>>>>  	struct ibv_context *ctx = NULL;
>>>>  	struct mlx5_hca_attr attr;
>>>> @@ -258,42 +400,25 @@
>>>>  			" driver.");
>>>>  		return 1;
>>>>  	}
>>>> -	errno = 0;
>>>> -	ibv_list = mlx5_glue->get_device_list(&ret);
>>>> -	if (!ibv_list) {
>>>> -		rte_errno = ENOSYS;
>>>> -		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs
>> loaded?");
>>>> +	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
>>>> +	if (!ibv) {
>>>> +		DRV_LOG(ERR, "No matching IB device for PCI slot "
>>>> +			PCI_PRI_FMT ".", pci_dev->addr.domain,
>>>> +			pci_dev->addr.bus, pci_dev->addr.devid,
>>>> +			pci_dev->addr.function);
>>>>  		return -rte_errno;
>>>> -	}
>>>> -	while (ret-- > 0) {
>>>> -		struct rte_pci_addr pci_addr;
>>>> -
>>>> -		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]-
>>> name);
>>>> -		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path,
>> &pci_addr))
>>>> -			continue;
>>>> -		if (pci_dev->addr.domain != pci_addr.domain ||
>>>> -		    pci_dev->addr.bus != pci_addr.bus ||
>>>> -		    pci_dev->addr.devid != pci_addr.devid ||
>>>> -		    pci_dev->addr.function != pci_addr.function)
>>>> -			continue;
>>>> +	} else {
>>>>  		DRV_LOG(INFO, "PCI information matches for device
>> \"%s\".",
>>>> -			ibv_list[ret]->name);
>>>> -		ibv_match = ibv_list[ret];
>>>> -		break;
>>>> +			ibv->name);
>>>>  	}
>>>> -	mlx5_glue->free_device_list(ibv_list);
>>>> -	if (!ibv_match) {
>>>> -		DRV_LOG(ERR, "No matching IB device for PCI slot "
>>>> -			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
>>>> -			pci_dev->addr.domain, pci_dev->addr.bus,
>>>> -			pci_dev->addr.devid, pci_dev->addr.function);
>>>> -		rte_errno = ENOENT;
>>>> -		return -rte_errno;
>>>> +	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
>>>> +		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
>>>> +			ibv->name);
>>>> +		//return -rte_errno;
>>>>  	}
>>>
>>> Is that commented return expected?
>>>
>>
>> Please let me know if I should remove the comment, or remove the return.
> 
> Sorry, forgot the comment , good catch!
> It should not be comment just need to remove "//".

Thanks Matan.

> Can you do it in integration?
Sure, will do now.

Maxime

>>
>> Thanks,
>> Maxime
>
  

Patch

diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
index d4a544c..7153217 100644
--- a/drivers/vdpa/mlx5/Makefile
+++ b/drivers/vdpa/mlx5/Makefile
@@ -29,7 +29,7 @@  CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
 LDLIBS += -lrte_common_mlx5
-LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build
index bb96dad..9c152e5 100644
--- a/drivers/vdpa/mlx5/meson.build
+++ b/drivers/vdpa/mlx5/meson.build
@@ -9,7 +9,7 @@  endif
 
 fmt_name = 'mlx5_vdpa'
 allow_experimental_apis = true
-deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
+deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched']
 sources = files(
 	'mlx5_vdpa.c',
 	'mlx5_vdpa_mem.c',
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index 57619d2..710f305 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -1,15 +1,19 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2019 Mellanox Technologies, Ltd
  */
+#include <unistd.h>
+
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_errno.h>
 #include <rte_bus_pci.h>
+#include <rte_pci.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
 #include <mlx5_devx_cmds.h>
 #include <mlx5_prm.h>
+#include <mlx5_nl.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
@@ -228,6 +232,145 @@ 
 	.get_notify_area = NULL,
 };
 
+/**
+ * Look up the IB device that sits on a given PCI address.
+ *
+ * The IB device list is fetched through the glue layer, scanned from the
+ * last entry to the first, and released before returning.
+ *
+ * @param[in] addr
+ *   PCI address to look for.
+ *
+ * @return
+ *   The matching IB device on success, NULL otherwise and rte_errno is set:
+ *   ENOSYS when the device list cannot be obtained, ENOENT when no listed
+ *   device has this PCI address.
+ */
+static struct ibv_device *
+mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
+{
+	int n;
+	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
+	struct ibv_device *ibv_match = NULL;
+
+	if (!ibv_list) {
+		rte_errno = ENOSYS;
+		return NULL;
+	}
+	while (n-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
+		if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
+			continue;
+		/*
+		 * Compare field by field: a raw memcmp() would also compare
+		 * the structure padding, which is not initialized in the
+		 * local pci_addr.
+		 */
+		if (rte_pci_addr_cmp(addr, &pci_addr))
+			continue;
+		ibv_match = ibv_list[n];
+		break;
+	}
+	if (!ibv_match)
+		rte_errno = ENOENT;
+	/*
+	 * NOTE(review): the returned pointer is used by the caller after the
+	 * list is released - presumably the library keeps the device object
+	 * alive; confirm against the verbs documentation.
+	 */
+	mlx5_glue->free_device_list(ibv_list);
+	return ibv_match;
+}
+
+/**
+ * Try to turn ROCE off through Netlink (devlink).
+ *
+ * A generic Netlink socket is opened, the devlink family id is resolved,
+ * the current ROCE state is read and, only when ROCE is still enabled,
+ * it is set to 0. The socket is closed before returning.
+ *
+ * @param[in] addr
+ *   PCI address of the device in string form.
+ *
+ * @return
+ *   0 when ROCE was disabled or was already disabled, otherwise the
+ *   non-zero value reported by the Netlink helper that failed.
+ */
+static int
+mlx5_vdpa_nl_roce_disable(const char *addr)
+{
+	int sock = mlx5_nl_init(NETLINK_GENERIC);
+	int family;
+	int roce_on;
+	int rc;
+
+	if (sock < 0)
+		return sock;
+	family = mlx5_nl_devlink_family_id_get(sock);
+	if (family < 0) {
+		rc = family;
+		DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
+			" Netlink.");
+	} else {
+		rc = mlx5_nl_enable_roce_get(sock, family, addr, &roce_on);
+		if (rc != 0) {
+			DRV_LOG(DEBUG,
+				"Failed to get ROCE enable by Netlink: %d.",
+				rc);
+		} else if (roce_on == 0) {
+			/* Nothing to change, rc is already 0. */
+			DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
+		} else {
+			rc = mlx5_nl_enable_roce_set(sock, family, addr, 0);
+			if (rc != 0)
+				DRV_LOG(DEBUG,
+					"Failed to disable ROCE by Netlink: %d.",
+					rc);
+			else
+				DRV_LOG(INFO,
+					"ROCE is disabled by Netlink successfully.");
+		}
+	}
+	close(sock);
+	return rc;
+}
+
+/**
+ * Try to turn ROCE off through the sysfs "roce_enable" attribute.
+ *
+ * The attribute is read first; when it already holds 0 nothing is written.
+ * Otherwise "0" is written to it and the write is checked, including the
+ * flush done by fclose().
+ *
+ * @param[in] addr
+ *   PCI address of the device in string form, used to build the sysfs path.
+ *
+ * @return
+ *   0 when ROCE was disabled or was already disabled, a negative errno
+ *   value otherwise and rte_errno is set: ENOTSUP when the attribute cannot
+ *   be opened, EINVAL when its content cannot be parsed, EIO when the write
+ *   fails.
+ */
+static int
+mlx5_vdpa_sys_roce_disable(const char *addr)
+{
+	FILE *file_o;
+	int enable;
+	int ret = 0;
+
+	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
+	file_o = fopen(file_p, "rb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	if (fscanf(file_o, "%d", &enable) != 1) {
+		fclose(file_o);
+		rte_errno = EINVAL;
+		/* Negative, like every other error returned from here. */
+		ret = -EINVAL;
+		goto error;
+	}
+	fclose(file_o);
+	if (!enable) {
+		/* Already off: do not report a disable that never happened. */
+		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
+		return 0;
+	}
+	file_o = fopen(file_p, "wb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	if (fprintf(file_o, "0\n") < 0)
+		ret = -EIO;
+	/* The buffered write reaches the attribute on close, so check it. */
+	if (fclose(file_o) != 0)
+		ret = -EIO;
+	if (ret) {
+		rte_errno = EIO;
+		goto error;
+	}
+	DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
+	return 0;
+error:
+	DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
+	return ret;
+}
+
+/* Number of attempts to find the IB device again after ROCE disabling. */
+#define MLX5_VDPA_MAX_RETRIES 20
+/* Delay between two attempts, in microseconds. */
+#define MLX5_VDPA_USEC 1000
+/**
+ * Disable ROCE on a device and match its IB device again.
+ *
+ * Netlink is tried first, sysfs is the fallback. When one of them
+ * succeeds, the IB device is looked up again by PCI address, retrying up
+ * to MLX5_VDPA_MAX_RETRIES times with MLX5_VDPA_USEC microseconds between
+ * attempts.
+ *
+ * @param[in] addr
+ *   PCI address of the device.
+ * @param[out] ibv
+ *   Updated with the newly matched IB device on success, left untouched
+ *   on failure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise (-rte_errno; rte_errno
+ *   is set to EAGAIN here when the device does not show up again in time).
+ */
+static int
+mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
+{
+	char addr_name[64] = {0};
+
+	rte_pci_device_name(addr, addr_name, sizeof(addr_name));
+	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
+	if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
+	    mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
+		/*
+		 * Succeed to disable ROCE, wait for the IB device to appear
+		 * again after reload.
+		 */
+		int r;
+		struct ibv_device *ibv_new;
+
+		for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
+			ibv_new = mlx5_vdpa_get_ib_device_match(addr);
+			if (ibv_new) {
+				*ibv = ibv_new;
+				return 0;
+			}
+			usleep(MLX5_VDPA_USEC);
+		}
+		DRV_LOG(ERR, "Cannot match device %s after ROCE disable, "
+			"retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
+		rte_errno = EAGAIN;
+	}
+	/* Either both disable methods failed or the re-match timed out. */
+	return -rte_errno;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -246,8 +389,7 @@ 
 mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    struct rte_pci_device *pci_dev __rte_unused)
 {
-	struct ibv_device **ibv_list;
-	struct ibv_device *ibv_match = NULL;
+	struct ibv_device *ibv;
 	struct mlx5_vdpa_priv *priv = NULL;
 	struct ibv_context *ctx = NULL;
 	struct mlx5_hca_attr attr;
@@ -258,42 +400,25 @@ 
 			" driver.");
 		return 1;
 	}
-	errno = 0;
-	ibv_list = mlx5_glue->get_device_list(&ret);
-	if (!ibv_list) {
-		rte_errno = ENOSYS;
-		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?");
+	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
+	if (!ibv) {
+		DRV_LOG(ERR, "No matching IB device for PCI slot "
+			PCI_PRI_FMT ".", pci_dev->addr.domain,
+			pci_dev->addr.bus, pci_dev->addr.devid,
+			pci_dev->addr.function);
 		return -rte_errno;
-	}
-	while (ret-- > 0) {
-		struct rte_pci_addr pci_addr;
-
-		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name);
-		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr))
-			continue;
-		if (pci_dev->addr.domain != pci_addr.domain ||
-		    pci_dev->addr.bus != pci_addr.bus ||
-		    pci_dev->addr.devid != pci_addr.devid ||
-		    pci_dev->addr.function != pci_addr.function)
-			continue;
+	} else {
 		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
-			ibv_list[ret]->name);
-		ibv_match = ibv_list[ret];
-		break;
+			ibv->name);
 	}
-	mlx5_glue->free_device_list(ibv_list);
-	if (!ibv_match) {
-		DRV_LOG(ERR, "No matching IB device for PCI slot "
-			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
-			pci_dev->addr.domain, pci_dev->addr.bus,
-			pci_dev->addr.devid, pci_dev->addr.function);
-		rte_errno = ENOENT;
-		return -rte_errno;
+	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
+		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
+			ibv->name);
+		//return -rte_errno;
 	}
-	ctx = mlx5_glue->dv_open_device(ibv_match);
+	ctx = mlx5_glue->dv_open_device(ibv);
 	if (!ctx) {
-		DRV_LOG(ERR, "Failed to open IB device \"%s\".",
-			ibv_match->name);
+		DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
 		rte_errno = ENODEV;
 		return -rte_errno;
 	}