[v2,3/3] vhost: add device op to offload the interrupt kick

Message ID 168069850278.833254.17836553051894685658.stgit@ebuild.local (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: vhost: add device op to offload the interrupt kick

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/intel-Testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-unit-testing success Testing PASS
ci/iol-abi-testing success Testing PASS

Commit Message

Eelco Chaudron April 5, 2023, 12:41 p.m. UTC
  This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by third-party applications, like OVS, to avoid making
system calls from within their PMD threads.
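
For illustration, a minimal sketch of how an application could hook this
(the app_* names are hypothetical and not part of this patch):

#include <stdbool.h>
#include <stdint.h>
#include <rte_vhost.h>

/* Hypothetical application helper, shown only to sketch the flow. */
extern bool app_defer_kick(int vid, uint16_t queue_id);

static bool
app_guest_notify(int vid, uint16_t queue_id)
{
	/* Queue the kick for later instead of doing the syscall in the
	 * PMD thread; returning true tells vhost it has been handled. */
	return app_defer_kick(vid, queue_id);
}

static const struct rte_vhost_device_ops app_ops = {
	.guest_notify = app_guest_notify,
	/* other callbacks (new_device, destroy_device, ...) go here */
};

/* Later, from a non-PMD context, perform the deferred kick: */
static void
app_do_deferred_kick(int vid, uint16_t queue_id)
{
	rte_vhost_notify_guest(vid, queue_id);
}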

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/vhost/meson.build |    2 +
 lib/vhost/rte_vhost.h |   23 +++++++++++++++-
 lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/vhost/version.map |    9 ++++++
 lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
 lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
 6 files changed, 184 insertions(+), 25 deletions(-)
  

Comments

David Marchand May 10, 2023, 11:44 a.m. UTC | #1
On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>
> This patch adds an operation callback which gets called every time the
> library wants to call eventfd_write(). This eventfd_write() call could
> result in a system call, which could potentially block the PMD thread.
>
> The callback function can decide whether it's ok to handle the
> eventfd_write() now or have the newly introduced function,
> rte_vhost_notify_guest(), called at a later time.
>
> This can be used by 3rd party applications, like OVS, to avoid system
> calls being called as part of the PMD threads.
>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>  lib/vhost/meson.build |    2 +
>  lib/vhost/rte_vhost.h |   23 +++++++++++++++-
>  lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
>  lib/vhost/version.map |    9 ++++++
>  lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
>  lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
>  6 files changed, 184 insertions(+), 25 deletions(-)
>
> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
> index 197a51d936..e93ba6b078 100644
> --- a/lib/vhost/meson.build
> +++ b/lib/vhost/meson.build
> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>          'vdpa_driver.h',
>  )
>  deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
> +
> +use_function_versioning = true
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> index 58a5d4be92..7a10bc36cf 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>          */
>         void (*guest_notified)(int vid);
>
> -       void *reserved[1]; /**< Reserved for future extension */
> +       /**
> +        * If this callback is registered, notification to the guest can
> +        * be handled by the front-end calling rte_vhost_notify_guest().
> +        * If it's not handled, 'false' should be returned. This can be used
> +        * to remove the "slow" eventfd_write() syscall from the datapath.
> +        */
> +       bool (*guest_notify)(int vid, uint16_t queue_id);
>  };
>
>  /**
> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>
>  int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
> + *
> + * Inject the offloaded interrupt into the vhost device's queue. For more
> + * details see the 'guest_notify' vhost device operation.
> + *
> + * @param vid
> + *  vhost device ID
> + * @param queue_id
> + *  virtio queue index
> + */
> +__rte_experimental
> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
> +
>  /**
>   * Register vhost driver. path could be different for multiple
>   * instance support.
> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
> index 669c322e12..787d6bacf8 100644
> --- a/lib/vhost/socket.c
> +++ b/lib/vhost/socket.c
> @@ -15,6 +15,7 @@
>  #include <fcntl.h>
>  #include <pthread.h>
>
> +#include <rte_function_versioning.h>
>  #include <rte_log.h>
>
>  #include "fd_man.h"
> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>         bool async_copy;
>         bool net_compliant_ol_flags;
>         bool stats_enabled;
> +       bool alloc_notify_ops;
>
>         /*
>          * The "supported_features" indicates the feature bits the
> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>                 vsocket->path = NULL;
>         }
>
> +       if (vsocket && vsocket->alloc_notify_ops) {
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> +#pragma GCC diagnostic pop
> +               vsocket->notify_ops = NULL;
> +       }

Rather than selecting the behavior based on a boolean (and here forcing
the compiler to close its eyes), I would instead add a non-const pointer
to the ops (let's say alloc_notify_ops) in vhost_user_socket.
The code can then unconditionally call free(vsocket->alloc_notify_ops);
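
Roughly something like this (just a sketch; the field and function names
are placeholders):

#include <stdlib.h>
#include <rte_vhost.h>

struct vhost_user_socket_sketch {
	const struct rte_vhost_device_ops *notify_ops; /* what the library uses */
	struct rte_vhost_device_ops *malloc_notify_ops; /* our own copy, or NULL */
};

static void
notify_ops_free(struct vhost_user_socket_sketch *vsocket)
{
	/* No cast, no pragma needed: free(NULL) is a no-op. */
	free(vsocket->malloc_notify_ops);
	vsocket->malloc_notify_ops = NULL;
	vsocket->notify_ops = NULL;
}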


> +
>         if (vsocket) {
>                 free(vsocket);
>                 vsocket = NULL;
> @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
>  /*
>   * Register ops so that we can add/remove device to data core.
>   */
> -int
> -rte_vhost_driver_callback_register(const char *path,
> -       struct rte_vhost_device_ops const * const ops)
> +static int
> +rte_vhost_driver_callback_register__(const char *path,

No need for an rte_ prefix on a static symbol.
And we can simply call this internal helper vhost_driver_callback_register().


> +       struct rte_vhost_device_ops const * const ops, bool ops_allocated)
>  {
>         struct vhost_user_socket *vsocket;
>
>         pthread_mutex_lock(&vhost_user.mutex);
>         vsocket = find_vhost_user_socket(path);
> -       if (vsocket)
> +       if (vsocket) {
> +               if (vsocket->alloc_notify_ops) {
> +                       vsocket->alloc_notify_ops = false;
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +                       free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> +#pragma GCC diagnostic pop
> +               }
>                 vsocket->notify_ops = ops;
> +               if (ops_allocated)
> +                       vsocket->alloc_notify_ops = true;
> +       }
>         pthread_mutex_unlock(&vhost_user.mutex);
>
>         return vsocket ? 0 : -1;
>  }
>
> +int __vsym
> +rte_vhost_driver_callback_register_v24(const char *path,
> +       struct rte_vhost_device_ops const * const ops)
> +{
> +       return rte_vhost_driver_callback_register__(path, ops, false);
> +}
> +
> +int __vsym
> +rte_vhost_driver_callback_register_v23(const char *path,
> +       struct rte_vhost_device_ops const * const ops)
> +{
> +       int ret;
> +
> +       /*
> +        * Although the ops structure is a const structure, we do need to
> +        * override the guest_notify operation. This is because with the
> +        * previous APIs it was "reserved" and if any garbage value was passed,
> +        * it could crash the application.
> +        */
> +       if (ops && !ops->guest_notify) {

Hum, as described in the comment above, I don't think we should look
at ops->guest_notify value at all.
Checking ops != NULL should be enough.


> +               struct rte_vhost_device_ops *new_ops;
> +
> +               new_ops = malloc(sizeof(*new_ops));

Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc.
I am unclear about the impact, though.


> +               if (new_ops == NULL)
> +                       return -1;
> +
> +               memcpy(new_ops, ops, sizeof(*new_ops));
> +               new_ops->guest_notify = NULL;
> +
> +               ret = rte_vhost_driver_callback_register__(path, new_ops, true);
> +       } else {
> +               ret = rte_vhost_driver_callback_register__(path, ops, false);
> +       }
> +
> +       return ret;
> +}
> +
> +/* Mark the v23 function as the old version, and v24 as the default version. */
> +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
> +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
> +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
> +               struct rte_vhost_device_ops const * const ops),
> +               rte_vhost_driver_callback_register_v24);
> +
>  struct rte_vhost_device_ops const *
>  vhost_driver_callback_get(const char *path)
>  {
> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
> index d322a4a888..7bcbfd12cf 100644
> --- a/lib/vhost/version.map
> +++ b/lib/vhost/version.map
> @@ -64,6 +64,12 @@ DPDK_23 {
>         local: *;
>  };
>
> +DPDK_24 {
> +       global:
> +
> +       rte_vhost_driver_callback_register;
> +} DPDK_23;
> +
>  EXPERIMENTAL {
>         global:
>
> @@ -98,6 +104,9 @@ EXPERIMENTAL {
>         # added in 22.11
>         rte_vhost_async_dma_unconfigure;
>         rte_vhost_vring_call_nonblock;
> +
> +        # added in 23.07
> +       rte_vhost_notify_guest;
>  };
>
>  INTERNAL {
> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> index 8ff6434c93..79e88f986e 100644
> --- a/lib/vhost/vhost.c
> +++ b/lib/vhost/vhost.c
> @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
>         {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
>         {"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
>         {"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
> +       {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
> +               stats.guest_notifications_offloaded)},
> +       {"guest_notifications_error", offsetof(struct vhost_virtqueue,
> +               stats.guest_notifications_error)},
>         {"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
>         {"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
>         {"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
> @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>         return ret;
>  }
>
> +void
> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
> +{
> +       struct virtio_net *dev = get_device(vid);
> +       struct vhost_virtqueue *vq;
> +
> +       if (!dev || queue_id >= VHOST_MAX_VRING)
> +               return;
> +
> +       vq = dev->virtqueue[queue_id];
> +       if (!vq)
> +               return;
> +
> +       rte_rwlock_read_lock(&vq->access_lock);
> +
> +       if (vq->callfd >= 0) {
> +               int ret = eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +               if (ret) {
> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                               __atomic_fetch_add(&vq->stats.guest_notifications_error,
> +                                       1, __ATOMIC_RELAXED);
> +               } else {
> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                               __atomic_fetch_add(&vq->stats.guest_notifications,
> +                                       1, __ATOMIC_RELAXED);
> +                       if (dev->notify_ops->guest_notified)
> +                               dev->notify_ops->guest_notified(dev->vid);
> +               }
> +       }
> +
> +       rte_rwlock_read_unlock(&vq->access_lock);
> +}
> +
>  void
>  rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>  {
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 37609c7c8d..0ab75b78b5 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -141,6 +141,8 @@ struct virtqueue_stats {
>         uint64_t inflight_completed;
>         /* Counters below are atomic, and should be incremented as such. */
>         uint64_t guest_notifications;
> +       uint64_t guest_notifications_offloaded;
> +       uint64_t guest_notifications_error;
>  };
>
>  /**
> @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>         return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>  }
>
> +static __rte_always_inline void
> +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +       int ret;
> +
> +       if (dev->notify_ops->guest_notify &&
> +           dev->notify_ops->guest_notify(dev->vid, vq->index)) {
> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                       __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
> +                               1, __ATOMIC_RELAXED);
> +               return;
> +       }
> +
> +       ret = eventfd_write(vq->callfd, (eventfd_t) 1);
> +       if (ret) {
> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +                       __atomic_fetch_add(&vq->stats.guest_notifications_error,
> +                               1, __ATOMIC_RELAXED);
> +               return;
> +       }
> +
> +       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +               __atomic_fetch_add(&vq->stats.guest_notifications,
> +                       1, __ATOMIC_RELAXED);
> +       if (dev->notify_ops->guest_notified)
> +               dev->notify_ops->guest_notified(dev->vid);
> +}
> +
> +

One empty line is enough.

>  static __rte_always_inline void
>  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>                         "%s: used_event_idx=%d, old=%d, new=%d\n",
>                         __func__, vhost_used_event(vq), old, new);
>
> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> -                                       (vq->callfd >= 0)) ||
> -                               unlikely(!signalled_used_valid)) {
> -                       eventfd_write(vq->callfd, (eventfd_t) 1);
> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
> -                                       1, __ATOMIC_RELAXED);
> -                       if (dev->notify_ops->guest_notified)
> -                               dev->notify_ops->guest_notified(dev->vid);
> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> +                    unlikely(!signalled_used_valid)) &&

A bit hard to read (even before your change).

After the change, the indentation does not comply with the DPDK coding style.
http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation


But putting indentation aside, is this change equivalent?
-               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-                                       (vq->callfd >= 0)) ||
-                               unlikely(!signalled_used_valid)) {
+               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+                               unlikely(!signalled_used_valid)) &&
+                               vq->callfd >= 0) {


> +                       vhost_vring_inject_irq(dev, vq);
>                 }
>         } else {
>                 /* Kick the guest if necessary. */
>                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>                                 && (vq->callfd >= 0)) {
> -                       eventfd_write(vq->callfd, (eventfd_t)1);
> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
> -                                       1, __ATOMIC_RELAXED);
> -                       if (dev->notify_ops->guest_notified)
> -                               dev->notify_ops->guest_notified(dev->vid);
> +                       vhost_vring_inject_irq(dev, vq);
>                 }
>         }
>  }
> @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>         if (vhost_need_event(off, new, old))
>                 kick = true;
>  kick:
> -       if (kick) {
> -               eventfd_write(vq->callfd, (eventfd_t)1);
> -               if (dev->notify_ops->guest_notified)
> -                       dev->notify_ops->guest_notified(dev->vid);
> -       }
> +       if (kick && vq->callfd >= 0)
> +               vhost_vring_inject_irq(dev, vq);
>  }
>
>  static __rte_always_inline void
> @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
>
>  uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
>  void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
> +
> +/* Versioned functions */
> +int rte_vhost_driver_callback_register_v23(const char *path,
> +       struct rte_vhost_device_ops const * const ops);
> +int rte_vhost_driver_callback_register_v24(const char *path,
> +       struct rte_vhost_device_ops const * const ops);
> +
>  #endif /* _VHOST_NET_CDEV_H_ */
>
  
Eelco Chaudron May 16, 2023, 8:53 a.m. UTC | #2
On 10 May 2023, at 13:44, David Marchand wrote:

> On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echaudro@redhat.com> wrote:
>>
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>  lib/vhost/meson.build |    2 +
>>  lib/vhost/rte_vhost.h |   23 +++++++++++++++-
>>  lib/vhost/socket.c    |   72 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  lib/vhost/version.map |    9 ++++++
>>  lib/vhost/vhost.c     |   38 ++++++++++++++++++++++++++
>>  lib/vhost/vhost.h     |   65 +++++++++++++++++++++++++++++++-------------
>>  6 files changed, 184 insertions(+), 25 deletions(-)
>>
>> diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
>> index 197a51d936..e93ba6b078 100644
>> --- a/lib/vhost/meson.build
>> +++ b/lib/vhost/meson.build
>> @@ -39,3 +39,5 @@ driver_sdk_headers = files(
>>          'vdpa_driver.h',
>>  )
>>  deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
>> +
>> +use_function_versioning = true
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..7a10bc36cf 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>>          */
>>         void (*guest_notified)(int vid);
>>
>> -       void *reserved[1]; /**< Reserved for future extension */
>> +       /**
>> +        * If this callback is registered, notification to the guest can
>> +        * be handled by the front-end calling rte_vhost_notify_guest().
>> +        * If it's not handled, 'false' should be returned. This can be used
>> +        * to remove the "slow" eventfd_write() syscall from the datapath.
>> +        */
>> +       bool (*guest_notify)(int vid, uint16_t queue_id);
>>  };
>>
>>  /**
>> @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>>
>>  int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
>> + *
>> + * Inject the offloaded interrupt into the vhost device's queue. For more
>> + * details see the 'guest_notify' vhost device operation.
>> + *
>> + * @param vid
>> + *  vhost device ID
>> + * @param queue_id
>> + *  virtio queue index
>> + */
>> +__rte_experimental
>> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>> +
>>  /**
>>   * Register vhost driver. path could be different for multiple
>>   * instance support.
>> diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
>> index 669c322e12..787d6bacf8 100644
>> --- a/lib/vhost/socket.c
>> +++ b/lib/vhost/socket.c
>> @@ -15,6 +15,7 @@
>>  #include <fcntl.h>
>>  #include <pthread.h>
>>
>> +#include <rte_function_versioning.h>
>>  #include <rte_log.h>
>>
>>  #include "fd_man.h"
>> @@ -43,6 +44,7 @@ struct vhost_user_socket {
>>         bool async_copy;
>>         bool net_compliant_ol_flags;
>>         bool stats_enabled;
>> +       bool alloc_notify_ops;
>>
>>         /*
>>          * The "supported_features" indicates the feature bits the
>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>                 vsocket->path = NULL;
>>         }
>>
>> +       if (vsocket && vsocket->alloc_notify_ops) {
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> +               vsocket->notify_ops = NULL;
>> +       }
>
> Rather than select the behavior based on a boolean (and here force the
> compiler to close its eyes), I would instead add a non const pointer
> to ops (let's say alloc_notify_ops) in vhost_user_socket.
> The code can then unconditionnally call free(vsocket->alloc_notify_ops);

Good idea, I will make the change in v3.

>> +
>>         if (vsocket) {
>>                 free(vsocket);
>>                 vsocket = NULL;
>> @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
>>  /*
>>   * Register ops so that we can add/remove device to data core.
>>   */
>> -int
>> -rte_vhost_driver_callback_register(const char *path,
>> -       struct rte_vhost_device_ops const * const ops)
>> +static int
>> +rte_vhost_driver_callback_register__(const char *path,
>
> No need for a rte_ prefix on static symbol.
> And we can simply call this internal helper vhost_driver_callback_register().

ACK.

>> +       struct rte_vhost_device_ops const * const ops, bool ops_allocated)
>>  {
>>         struct vhost_user_socket *vsocket;
>>
>>         pthread_mutex_lock(&vhost_user.mutex);
>>         vsocket = find_vhost_user_socket(path);
>> -       if (vsocket)
>> +       if (vsocket) {
>> +               if (vsocket->alloc_notify_ops) {
>> +                       vsocket->alloc_notify_ops = false;
>> +#pragma GCC diagnostic push
>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>> +                       free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>> +#pragma GCC diagnostic pop
>> +               }
>>                 vsocket->notify_ops = ops;
>> +               if (ops_allocated)
>> +                       vsocket->alloc_notify_ops = true;
>> +       }
>>         pthread_mutex_unlock(&vhost_user.mutex);
>>
>>         return vsocket ? 0 : -1;
>>  }
>>
>> +int __vsym
>> +rte_vhost_driver_callback_register_v24(const char *path,
>> +       struct rte_vhost_device_ops const * const ops)
>> +{
>> +       return rte_vhost_driver_callback_register__(path, ops, false);
>> +}
>> +
>> +int __vsym
>> +rte_vhost_driver_callback_register_v23(const char *path,
>> +       struct rte_vhost_device_ops const * const ops)
>> +{
>> +       int ret;
>> +
>> +       /*
>> +        * Although the ops structure is a const structure, we do need to
>> +        * override the guest_notify operation. This is because with the
>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>> +        * it could crash the application.
>> +        */
>> +       if (ops && !ops->guest_notify) {
>
> Hum, as described in the comment above, I don't think we should look
> at ops->guest_notify value at all.
> Checking ops != NULL should be enough.

Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previous ‘reserved[1]’ field is NULL, so we do not need to use a new structure.

I guess your comment would be true if we were introducing a new field in the data structure, rather than replacing a reserved one.

>> +               struct rte_vhost_device_ops *new_ops;
>> +
>> +               new_ops = malloc(sizeof(*new_ops));
>
> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
> I am unclear of the impact though.

I don't think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate on that same node, is there?

Any thoughts or ideas on how to solve this? I hope most people memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future if more extensions get added.
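
For example, both of these initialization patterns (sketch only,
app_new_device() is a hypothetical application callback) leave the former
reserved[1] slot zeroed:

#include <stdlib.h>
#include <string.h>
#include <rte_vhost.h>

extern int app_new_device(int vid);

/* Static const ops: unnamed members, including the old reserved slot,
 * are zero-initialized. */
static const struct rte_vhost_device_ops good_static_ops = {
	.new_device = app_new_device,
};

/* Dynamically allocated ops: memset() before filling in the callbacks. */
static struct rte_vhost_device_ops *
good_alloc_ops(void)
{
	struct rte_vhost_device_ops *ops = malloc(sizeof(*ops));

	if (ops != NULL) {
		memset(ops, 0, sizeof(*ops));
		ops->new_device = app_new_device;
	}
	return ops;
}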

>> +               if (new_ops == NULL)
>> +                       return -1;
>> +
>> +               memcpy(new_ops, ops, sizeof(*new_ops));
>> +               new_ops->guest_notify = NULL;
>> +
>> +               ret = rte_vhost_driver_callback_register__(path, new_ops, true);
>> +       } else {
>> +               ret = rte_vhost_driver_callback_register__(path, ops, false);
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +/* Mark the v23 function as the old version, and v24 as the default version. */
>> +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
>> +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
>> +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
>> +               struct rte_vhost_device_ops const * const ops),
>> +               rte_vhost_driver_callback_register_v24);
>> +
>>  struct rte_vhost_device_ops const *
>>  vhost_driver_callback_get(const char *path)
>>  {
>> diff --git a/lib/vhost/version.map b/lib/vhost/version.map
>> index d322a4a888..7bcbfd12cf 100644
>> --- a/lib/vhost/version.map
>> +++ b/lib/vhost/version.map
>> @@ -64,6 +64,12 @@ DPDK_23 {
>>         local: *;
>>  };
>>
>> +DPDK_24 {
>> +       global:
>> +
>> +       rte_vhost_driver_callback_register;
>> +} DPDK_23;
>> +
>>  EXPERIMENTAL {
>>         global:
>>
>> @@ -98,6 +104,9 @@ EXPERIMENTAL {
>>         # added in 22.11
>>         rte_vhost_async_dma_unconfigure;
>>         rte_vhost_vring_call_nonblock;
>> +
>> +        # added in 23.07
>> +       rte_vhost_notify_guest;
>>  };
>>
>>  INTERNAL {
>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>> index 8ff6434c93..79e88f986e 100644
>> --- a/lib/vhost/vhost.c
>> +++ b/lib/vhost/vhost.c
>> @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
>>         {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
>>         {"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
>>         {"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
>> +       {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
>> +               stats.guest_notifications_offloaded)},
>> +       {"guest_notifications_error", offsetof(struct vhost_virtqueue,
>> +               stats.guest_notifications_error)},
>>         {"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
>>         {"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
>>         {"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
>> @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>>         return ret;
>>  }
>>
>> +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
>> +{
>> +       struct virtio_net *dev = get_device(vid);
>> +       struct vhost_virtqueue *vq;
>> +
>> +       if (!dev ||  queue_id >= VHOST_MAX_VRING)
>> +               return;
>> +
>> +       vq = dev->virtqueue[queue_id];
>> +       if (!vq)
>> +               return;
>> +
>> +       rte_rwlock_read_lock(&vq->access_lock);
>> +
>> +       if (vq->callfd >= 0) {
>> +               int ret = eventfd_write(vq->callfd, (eventfd_t)1);
>> +
>> +               if (ret) {
>> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                               __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> +                                       1, __ATOMIC_RELAXED);
>> +               } else {
>> +                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> +                                       1, __ATOMIC_RELAXED);
>> +                       if (dev->notify_ops->guest_notified)
>> +                               dev->notify_ops->guest_notified(dev->vid);
>> +               }
>> +       }
>> +
>> +       rte_rwlock_read_unlock(&vq->access_lock);
>> +}
>> +
>>  void
>>  rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>>  {
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 37609c7c8d..0ab75b78b5 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -141,6 +141,8 @@ struct virtqueue_stats {
>>         uint64_t inflight_completed;
>>         /* Counters below are atomic, and should be incremented as such. */
>>         uint64_t guest_notifications;
>> +       uint64_t guest_notifications_offloaded;
>> +       uint64_t guest_notifications_error;
>>  };
>>
>>  /**
>> @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>>         return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>>  }
>>
>> +static __rte_always_inline void
>> +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +       int ret;
>> +
>> +       if (dev->notify_ops->guest_notify &&
>> +           dev->notify_ops->guest_notify(dev->vid, vq->index)) {
>> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                       __atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
>> +                               1, __ATOMIC_RELAXED);
>> +               return;
>> +       }
>> +
>> +       ret = eventfd_write(vq->callfd, (eventfd_t) 1);
>> +       if (ret) {
>> +               if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +                       __atomic_fetch_add(&vq->stats.guest_notifications_error,
>> +                               1, __ATOMIC_RELAXED);
>> +               return;
>> +       }
>> +
>> +       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +               __atomic_fetch_add(&vq->stats.guest_notifications,
>> +                       1, __ATOMIC_RELAXED);
>> +       if (dev->notify_ops->guest_notified)
>> +               dev->notify_ops->guest_notified(dev->vid);
>> +}
>> +
>> +
>
> One empty line is enough.

Will remove.

>>  static __rte_always_inline void
>>  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>  {
>> @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>                         "%s: used_event_idx=%d, old=%d, new=%d\n",
>>                         __func__, vhost_used_event(vq), old, new);
>>
>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>> -                                       (vq->callfd >= 0)) ||
>> -                               unlikely(!signalled_used_valid)) {
>> -                       eventfd_write(vq->callfd, (eventfd_t) 1);
>> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> -                                       1, __ATOMIC_RELAXED);
>> -                       if (dev->notify_ops->guest_notified)
>> -                               dev->notify_ops->guest_notified(dev->vid);
>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>> +                    unlikely(!signalled_used_valid)) &&
>> + 		            vq->callfd >= 0) {
>
> A bit hard to read (even before your change).
>
> After the change, indentation does not comply with dpdk coding style.
> http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation

Yes, I always mess up the DPDK indentation style :(
>
> But putting indentation aside, is this change equivalent?
> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> -                                       (vq->callfd >= 0)) ||
> -                               unlikely(!signalled_used_valid)) {
> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> +                               unlikely(!signalled_used_valid)) &&
> +                               vq->callfd >= 0) {

They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0; I guess this was an existing bug ;)

>> +                       vhost_vring_inject_irq(dev, vq);
>>                 }
>>         } else {
>>                 /* Kick the guest if necessary. */
>>                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>>                                 && (vq->callfd >= 0)) {
>> -                       eventfd_write(vq->callfd, (eventfd_t)1);
>> -                       if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -                               __atomic_fetch_add(&vq->stats.guest_notifications,
>> -                                       1, __ATOMIC_RELAXED);
>> -                       if (dev->notify_ops->guest_notified)
>> -                               dev->notify_ops->guest_notified(dev->vid);
>> +                       vhost_vring_inject_irq(dev, vq);
>>                 }
>>         }
>>  }
>> @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>         if (vhost_need_event(off, new, old))
>>                 kick = true;
>>  kick:
>> -       if (kick) {
>> -               eventfd_write(vq->callfd, (eventfd_t)1);
>> -               if (dev->notify_ops->guest_notified)
>> -                       dev->notify_ops->guest_notified(dev->vid);
>> -       }
>> +       if (kick && vq->callfd >= 0)
>> +               vhost_vring_inject_irq(dev, vq);
>>  }
>>
>>  static __rte_always_inline void
>> @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
>>
>>  uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
>>  void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
>> +
>> +/* Versioned functions */
>> +int rte_vhost_driver_callback_register_v23(const char *path,
>> +       struct rte_vhost_device_ops const * const ops);
>> +int rte_vhost_driver_callback_register_v24(const char *path,
>> +       struct rte_vhost_device_ops const * const ops);
>> +
>>  #endif /* _VHOST_NET_CDEV_H_ */
>>
>
>
> -- 
> David Marchand
  
David Marchand May 16, 2023, 10:12 a.m. UTC | #3
On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
> On 10 May 2023, at 13:44, David Marchand wrote:

[snip]

> >> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
> >>                 vsocket->path = NULL;
> >>         }
> >>
> >> +       if (vsocket && vsocket->alloc_notify_ops) {
> >> +#pragma GCC diagnostic push
> >> +#pragma GCC diagnostic ignored "-Wcast-qual"
> >> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
> >> +#pragma GCC diagnostic pop
> >> +               vsocket->notify_ops = NULL;
> >> +       }
> >
> > Rather than select the behavior based on a boolean (and here force the
> > compiler to close its eyes), I would instead add a non const pointer
> > to ops (let's say alloc_notify_ops) in vhost_user_socket.
> > The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>
> Good idea, I will make the change in v3.

Feel free to use a better name for this field :-).

>
> >> +
> >>         if (vsocket) {
> >>                 free(vsocket);
> >>                 vsocket = NULL;

[snip]

> >> +       /*
> >> +        * Although the ops structure is a const structure, we do need to
> >> +        * override the guest_notify operation. This is because with the
> >> +        * previous APIs it was "reserved" and if any garbage value was passed,
> >> +        * it could crash the application.
> >> +        */
> >> +       if (ops && !ops->guest_notify) {
> >
> > Hum, as described in the comment above, I don't think we should look
> > at ops->guest_notify value at all.
> > Checking ops != NULL should be enough.
>
> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>
> I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.

Hum, I don't understand my comment either o_O'.
Too many days off... or maybe my evil twin took over the keyboard.


>
> >> +               struct rte_vhost_device_ops *new_ops;
> >> +
> >> +               new_ops = malloc(sizeof(*new_ops));
> >
> > Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
> > I am unclear of the impact though.
>
> Don’t think there is a portable API that we can use to determine the NUMA for the ops memory and then allocate this on the same numa?
>
> Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure and the reserved[1] part is zero, but it might be a problem in the future if more extensions get added.

Determining the current NUMA node is doable, via
get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR) on 'ops', like what is done for
vq in numa_realloc().
The problem is how to allocate on this node with the libc allocator,
for which I have no idea...
We could go with the DPDK allocator (again, like numa_realloc()).
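
For reference, a minimal sketch of the node lookup part (assuming libnuma
is available, as in numa_realloc(); allocating on that node with the libc
allocator is still the open question):

#include <stdint.h>
#include <numaif.h>

/* Returns the NUMA node backing 'ops', or -1 if it cannot be determined. */
static int
ops_numa_node(const void *ops)
{
	int node;

	if (get_mempolicy(&node, NULL, 0, (void *)(uintptr_t)ops,
			MPOL_F_NODE | MPOL_F_ADDR) < 0)
		return -1;

	return node;
}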


In practice, the passed ops will probably be from a const variable in
the program .data section (for which I think fields are set to 0
unless explicitly initialised), or a memset() will be called for a
dynamic allocation from good citizens.
So we can probably live with the current proposal.
Plus, this is only for one release, since in 23.11 with the ABI bump,
we will drop this compat code.

Maxime, Chenbo, what do you think?


[snip]

> >
> > But putting indentation aside, is this change equivalent?
> > -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
> > -                                       (vq->callfd >= 0)) ||
> > -                               unlikely(!signalled_used_valid)) {
> > +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
> > +                               unlikely(!signalled_used_valid)) &&
> > +                               vq->callfd >= 0) {
>
> They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0, guess this was an existing bug ;)

I think this should be a separate fix.

>
> >> +                       vhost_vring_inject_irq(dev, vq);
  
Eelco Chaudron May 16, 2023, 11:36 a.m. UTC | #4
On 16 May 2023, at 12:12, David Marchand wrote:

> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>> On 10 May 2023, at 13:44, David Marchand wrote:
>
> [snip]
>
>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>                 vsocket->path = NULL;
>>>>         }
>>>>
>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>> +#pragma GCC diagnostic push
>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>> +#pragma GCC diagnostic pop
>>>> +               vsocket->notify_ops = NULL;
>>>> +       }
>>>
>>> Rather than select the behavior based on a boolean (and here force the
>>> compiler to close its eyes), I would instead add a non const pointer
>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>
>> Good idea, I will make the change in v3.
>
> Feel free to use a better name for this field :-).
>
>>
>>>> +
>>>>         if (vsocket) {
>>>>                 free(vsocket);
>>>>                 vsocket = NULL;
>
> [snip]
>
>>>> +       /*
>>>> +        * Although the ops structure is a const structure, we do need to
>>>> +        * override the guest_notify operation. This is because with the
>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>> +        * it could crash the application.
>>>> +        */
>>>> +       if (ops && !ops->guest_notify) {
>>>
>>> Hum, as described in the comment above, I don't think we should look
>>> at ops->guest_notify value at all.
>>> Checking ops != NULL should be enough.
>>
>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>
>> I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.
>
> Hum, I don't understand my comment either o_O'.
> Too many days off... or maybe my evil twin took over the keyboard.
>
>
>>
>>>> +               struct rte_vhost_device_ops *new_ops;
>>>> +
>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>
>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>> I am unclear of the impact though.
>>
>> Don’t think there is a portable API that we can use to determine the NUMA for the ops memory and then allocate this on the same numa?
>>
>> Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure and the reserved[1] part is zero, but it might be a problem in the future if more extensions get added.
>
> Determinining current numa is doable, via 'ops'
> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
> numa_realloc().
> The problem is how to allocate on this numa with the libc allocator
> for which I have no idea...
> We could go with the dpdk allocator (again, like numa_realloc()).
>
>
> In practice, the passed ops will be probably from a const variable in
> the program .data section (for which I think fields are set to 0
> unless explicitly initialised), or a memset() will be called for a
> dynamic allocation from good citizens.
> So we can probably live with the current proposal.
> Plus, this is only for one release, since in 23.11 with the ABI bump,
> we will drop this compat code.
>
> Maxime, Chenbo, what do you think?

I'll wait for their response, but for now I assume we can just keep the NUMA-unaware malloc().

>
> [snip]
>
>>>
>>> But putting indentation aside, is this change equivalent?
>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>> -                                       (vq->callfd >= 0)) ||
>>> -                               unlikely(!signalled_used_valid)) {
>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>> +                               unlikely(!signalled_used_valid)) &&
>>> +                               vq->callfd >= 0) {
>>
>> They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0, guess this was an existing bug ;)
>
> I think this should be a separate fix.

ACK, will add a separate patch in this series to fix it.

>
>>
>>>> +                       vhost_vring_inject_irq(dev, vq);
>
>
> -- 
> David Marchand
  
Maxime Coquelin May 16, 2023, 11:45 a.m. UTC | #5
On 5/16/23 13:36, Eelco Chaudron wrote:
> 
> 
> On 16 May 2023, at 12:12, David Marchand wrote:
> 
>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>
>> [snip]
>>
>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>                  vsocket->path = NULL;
>>>>>          }
>>>>>
>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>> +#pragma GCC diagnostic push
>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>> +#pragma GCC diagnostic pop
>>>>> +               vsocket->notify_ops = NULL;
>>>>> +       }
>>>>
>>>> Rather than select the behavior based on a boolean (and here force the
>>>> compiler to close its eyes), I would instead add a non const pointer
>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>>
>>> Good idea, I will make the change in v3.
>>
>> Feel free to use a better name for this field :-).
>>
>>>
>>>>> +
>>>>>          if (vsocket) {
>>>>>                  free(vsocket);
>>>>>                  vsocket = NULL;
>>
>> [snip]
>>
>>>>> +       /*
>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>> +        * override the guest_notify operation. This is because with the
>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>> +        * it could crash the application.
>>>>> +        */
>>>>> +       if (ops && !ops->guest_notify) {
>>>>
>>>> Hum, as described in the comment above, I don't think we should look
>>>> at ops->guest_notify value at all.
>>>> Checking ops != NULL should be enough.
>>>
>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>
>>> I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.
>>
>> Hum, I don't understand my comment either o_O'.
>> Too many days off... or maybe my evil twin took over the keyboard.
>>
>>
>>>
>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>> +
>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>
>>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>>> I am unclear of the impact though.
>>>
>>> Don’t think there is a portable API that we can use to determine the NUMA for the ops memory and then allocate this on the same numa?
>>>
>>> Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure and the reserved[1] part is zero, but it might be a problem in the future if more extensions get added.
>>
>> Determinining current numa is doable, via 'ops'
>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
>> numa_realloc().
>> The problem is how to allocate on this numa with the libc allocator
>> for which I have no idea...
>> We could go with the dpdk allocator (again, like numa_realloc()).
>>
>>
>> In practice, the passed ops will be probably from a const variable in
>> the program .data section (for which I think fields are set to 0
>> unless explicitly initialised), or a memset() will be called for a
>> dynamic allocation from good citizens.
>> So we can probably live with the current proposal.
>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>> we will drop this compat code.
>>
>> Maxime, Chenbo, what do you think?
> 
> Wait for their response, but for now I assume we can just keep the numa unaware malloc().

Let's keep it as is as we'll get rid of it in 23.11.

>>
>> [snip]
>>
>>>>
>>>> But putting indentation aside, is this change equivalent?
>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>> -                                       (vq->callfd >= 0)) ||
>>>> -                               unlikely(!signalled_used_valid)) {
>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>> +                               unlikely(!signalled_used_valid)) &&
>>>> +                               vq->callfd >= 0) {
>>>
>>> They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0, guess this was an existing bug ;)
>>
>> I think this should be a separate fix.
> 
> ACK, will add a separate patch in this series to fix it.

I also caught & fixed it while implementing my VDUSE series [0].
You can pick it in your series, and I will rebase my series on top of
it.

Thanks,
Maxime

[0]: 
https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/b976e1f226db5c09834148847d994045eb89be93


> 
>>
>>>
>>>>> +                       vhost_vring_inject_irq(dev, vq);
>>
>>
>> -- 
>> David Marchand
>
  
Eelco Chaudron May 16, 2023, 12:07 p.m. UTC | #6
On 16 May 2023, at 13:45, Maxime Coquelin wrote:

> On 5/16/23 13:36, Eelco Chaudron wrote:
>>
>>
>> On 16 May 2023, at 12:12, David Marchand wrote:
>>
>>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>>
>>> [snip]
>>>
>>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>>                  vsocket->path = NULL;
>>>>>>          }
>>>>>>
>>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>>> +#pragma GCC diagnostic push
>>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>>> +#pragma GCC diagnostic pop
>>>>>> +               vsocket->notify_ops = NULL;
>>>>>> +       }
>>>>>
>>>>> Rather than select the behavior based on a boolean (and here force the
>>>>> compiler to close its eyes), I would instead add a non const pointer
>>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>>>
>>>> Good idea, I will make the change in v3.
>>>
>>> Feel free to use a better name for this field :-).
>>>
>>>>
>>>>>> +
>>>>>>          if (vsocket) {
>>>>>>                  free(vsocket);
>>>>>>                  vsocket = NULL;
>>>
>>> [snip]
>>>
>>>>>> +       /*
>>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>>> +        * override the guest_notify operation. This is because with the
>>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>>> +        * it could crash the application.
>>>>>> +        */
>>>>>> +       if (ops && !ops->guest_notify) {
>>>>>
>>>>> Hum, as described in the comment above, I don't think we should look
>>>>> at ops->guest_notify value at all.
>>>>> Checking ops != NULL should be enough.
>>>>
>>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>>
>>>> I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.
>>>
>>> Hum, I don't understand my comment either o_O'.
>>> Too many days off... or maybe my evil twin took over the keyboard.
>>>
>>>
>>>>
>>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>>> +
>>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>>
>>>>> Strictly speaking, we lose the numa affinity of "ops" by calling malloc.
>>>>> I am unclear of the impact though.
>>>>
>>>> Don’t think there is a portable API that we can use to determine the NUMA for the ops memory and then allocate this on the same numa?
>>>>
>>>> Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure and the reserved[1] part is zero, but it might be a problem in the future if more extensions get added.
>>>
>>> Determinining current numa is doable, via 'ops'
>>> get_mempolicy(MPOL_F_NODE | MPOL_F_ADDR), like what is done for vq in
>>> numa_realloc().
>>> The problem is how to allocate on this numa with the libc allocator
>>> for which I have no idea...
>>> We could go with the dpdk allocator (again, like numa_realloc()).
>>>
>>>
>>> In practice, the passed ops will be probably from a const variable in
>>> the program .data section (for which I think fields are set to 0
>>> unless explicitly initialised), or a memset() will be called for a
>>> dynamic allocation from good citizens.
>>> So we can probably live with the current proposal.
>>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>>> we will drop this compat code.
>>>
>>> Maxime, Chenbo, what do you think?
>>
>> Wait for their response, but for now I assume we can just keep the numa unaware malloc().
>
> Let's keep it as is as we'll get rid of it in 23.11.

Thanks for confirming.

>>>
>>> [snip]
>>>
>>>>>
>>>>> But putting indentation aside, is this change equivalent?
>>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>>> -                                       (vq->callfd >= 0)) ||
>>>>> -                               unlikely(!signalled_used_valid)) {
>>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>>> +                               unlikely(!signalled_used_valid)) &&
>>>>> +                               vq->callfd >= 0) {
>>>>
>>>> They are not equal, but in the past eventfd_write() should also not have been called with callfd < 0, guess this was an existing bug ;)
>>>
>>> I think this should be a separate fix.
>>
>> ACK, will add a separate patch in this series to fix it.
>
> I also caught & fixed it while implementing my VDUSE series [0].
> You can pick it in your series, and I will rebase my series on top of
> it.

Thanks for the details, I’ll include your patch in my series.

I will send out a new revision soon (after testing the changes with OVS).

Thanks,

Eelco

> Thanks,
> Maxime
>
> [0]: https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/b976e1f226db5c09834148847d994045eb89be93
>
>
>>
>>>
>>>>
>>>>>> +                       vhost_vring_inject_irq(dev, vq);
>>>
>>>
>>> -- 
>>> David Marchand
>>
  
Eelco Chaudron May 17, 2023, 9:18 a.m. UTC | #7
On 16 May 2023, at 13:36, Eelco Chaudron wrote:

> On 16 May 2023, at 12:12, David Marchand wrote:
>
>> On Tue, May 16, 2023 at 10:53 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>> On 10 May 2023, at 13:44, David Marchand wrote:
>>
>> [snip]
>>
>>>>> @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
>>>>>                 vsocket->path = NULL;
>>>>>         }
>>>>>
>>>>> +       if (vsocket && vsocket->alloc_notify_ops) {
>>>>> +#pragma GCC diagnostic push
>>>>> +#pragma GCC diagnostic ignored "-Wcast-qual"
>>>>> +               free((struct rte_vhost_device_ops *)vsocket->notify_ops);
>>>>> +#pragma GCC diagnostic pop
>>>>> +               vsocket->notify_ops = NULL;
>>>>> +       }
>>>>
>>>> Rather than select the behavior based on a boolean (and here force the
>>>> compiler to close its eyes), I would instead add a non const pointer
>>>> to ops (let's say alloc_notify_ops) in vhost_user_socket.
>>>> The code can then unconditionnally call free(vsocket->alloc_notify_ops);
>>>
>>> Good idea, I will make the change in v3.
>>
>> Feel free to use a better name for this field :-).
>>
>>>
>>>>> +
>>>>>         if (vsocket) {
>>>>>                 free(vsocket);
>>>>>                 vsocket = NULL;
>>
>> [snip]
>>
>>>>> +       /*
>>>>> +        * Although the ops structure is a const structure, we do need to
>>>>> +        * override the guest_notify operation. This is because with the
>>>>> +        * previous APIs it was "reserved" and if any garbage value was passed,
>>>>> +        * it could crash the application.
>>>>> +        */
>>>>> +       if (ops && !ops->guest_notify) {
>>>>
>>>> Hum, as described in the comment above, I don't think we should look
>>>> at ops->guest_notify value at all.
>>>> Checking ops != NULL should be enough.
>>>
>>> Not sure I get you here. If the guest_notify passed by the user is NULL, it means the previously ‘reserved[1]’ field is NULL, so we do not need to use a new structure.
>>>
>>> I guess your comment would be true if we would introduce a new field in the data structure, not replacing a reserved one.
>>
>> Hum, I don't understand my comment either o_O'.
>> Too many days off... or maybe my evil twin took over the keyboard.
>>
>>
>>>
>>>>> +               struct rte_vhost_device_ops *new_ops;
>>>>> +
>>>>> +               new_ops = malloc(sizeof(*new_ops));
>>>>
>>>> Strictly speaking, we lose the NUMA affinity of "ops" by calling malloc.
>>>> I am unclear about the impact, though.
>>>
>>> Don’t think there is a portable API that we can use to determine the NUMA node of the ops memory and then allocate this on the same node?
>>>
>>> Any thoughts or ideas on how to solve this? I hope most people will memset() the ops structure so the reserved[1] part is zero, but it might become a problem in the future if more extensions get added.
>>
>> Determining the current NUMA node is doable, via get_mempolicy(MPOL_F_NODE |
>> MPOL_F_ADDR) on 'ops', like what is done for vq in numa_realloc().
>> The problem is how to allocate on this node with the libc allocator,
>> for which I have no idea...
>> We could go with the DPDK allocator (again, like numa_realloc()).
>>
>>
>> In practice, the passed ops will probably come from a const variable in
>> the program .data section (for which I think fields are set to 0
>> unless explicitly initialised), or a memset() will have been called on a
>> dynamic allocation by good citizens.
>> So we can probably live with the current proposal.
>> Plus, this is only for one release, since in 23.11 with the ABI bump,
>> we will drop this compat code.
>>
>> Maxime, Chenbo, what do you think?
>
> Wait for their response, but for now I assume we can just keep the NUMA-unaware malloc() (a rough sketch of the NUMA-aware alternative is included below, after this mail).
>
>>
>> [snip]
>>
>>>>
>>>> But putting indentation aside, is this change equivalent?
>>>> -               if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>>> -                                       (vq->callfd >= 0)) ||
>>>> -                               unlikely(!signalled_used_valid)) {
>>>> +               if ((vhost_need_event(vhost_used_event(vq), new, old) ||
>>>> +                               unlikely(!signalled_used_valid)) &&
>>>> +                               vq->callfd >= 0) {
>>>
>>> They are not equivalent, but eventfd_write() should not have been called with callfd < 0 in the past either; I guess this was an existing bug ;)
>>
>> I think this should be a separate fix.
>
> ACK, will add a separate patch in this series to fix it.

FYI, I sent out the v3 patch.

//Eelco
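
A rough, untested sketch of the NUMA-aware allocation discussed above, along
the lines of what numa_realloc() does: query the node backing 'ops' with
get_mempolicy() and duplicate the structure with rte_malloc_socket(). The
helper name dup_ops_same_numa() and the fallback behaviour are illustrative
only and not part of this patch; it requires libnuma (-lnuma), and the
matching cleanup would then use rte_free() instead of free().

#include <stdint.h>
#include <string.h>
#include <numaif.h>

#include <rte_malloc.h>
#include <rte_vhost.h>

static struct rte_vhost_device_ops *
dup_ops_same_numa(const struct rte_vhost_device_ops *ops)
{
	struct rte_vhost_device_ops *copy;
	int node = -1;	/* SOCKET_ID_ANY: let rte_malloc pick a node */

	/* Ask the kernel which NUMA node backs the memory holding 'ops'. */
	if (get_mempolicy(&node, NULL, 0, (void *)(uintptr_t)ops,
			MPOL_F_NODE | MPOL_F_ADDR) != 0)
		node = -1;

	copy = rte_malloc_socket(NULL, sizeof(*copy), 0, node);
	if (copy == NULL)
		return NULL;

	memcpy(copy, ops, sizeof(*copy));
	copy->guest_notify = NULL;	/* formerly reserved slot may hold garbage */
	return copy;
}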
  

Patch

diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
index 197a51d936..e93ba6b078 100644
--- a/lib/vhost/meson.build
+++ b/lib/vhost/meson.build
@@ -39,3 +39,5 @@  driver_sdk_headers = files(
         'vdpa_driver.h',
 )
 deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
+
+use_function_versioning = true
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..7a10bc36cf 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@  struct rte_vhost_device_ops {
 	 */
 	void (*guest_notified)(int vid);
 
-	void *reserved[1]; /**< Reserved for future extension */
+	/**
+	 * If this callback is registered, notification to the guest can
+	 * be handled by the front-end calling rte_vhost_notify_guest().
+	 * If it's not handled, 'false' should be returned. This can be used
+	 * to remove the "slow" eventfd_write() syscall from the datapath.
+	 */
+	bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,21 @@  void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Inject the offloaded interrupt into the vhost device's queue. For more
+ * details see the 'guest_notify' vhost device operation.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ */
+__rte_experimental
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 669c322e12..787d6bacf8 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -15,6 +15,7 @@ 
 #include <fcntl.h>
 #include <pthread.h>
 
+#include <rte_function_versioning.h>
 #include <rte_log.h>
 
 #include "fd_man.h"
@@ -43,6 +44,7 @@  struct vhost_user_socket {
 	bool async_copy;
 	bool net_compliant_ol_flags;
 	bool stats_enabled;
+	bool alloc_notify_ops;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -846,6 +848,14 @@  vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
 		vsocket->path = NULL;
 	}
 
+	if (vsocket && vsocket->alloc_notify_ops) {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+		free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		vsocket->notify_ops = NULL;
+	}
+
 	if (vsocket) {
 		free(vsocket);
 		vsocket = NULL;
@@ -1099,21 +1109,75 @@  rte_vhost_driver_unregister(const char *path)
 /*
  * Register ops so that we can add/remove device to data core.
  */
-int
-rte_vhost_driver_callback_register(const char *path,
-	struct rte_vhost_device_ops const * const ops)
+static int
+rte_vhost_driver_callback_register__(const char *path,
+	struct rte_vhost_device_ops const * const ops, bool ops_allocated)
 {
 	struct vhost_user_socket *vsocket;
 
 	pthread_mutex_lock(&vhost_user.mutex);
 	vsocket = find_vhost_user_socket(path);
-	if (vsocket)
+	if (vsocket) {
+		if (vsocket->alloc_notify_ops) {
+			vsocket->alloc_notify_ops = false;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+			free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		}
 		vsocket->notify_ops = ops;
+		if (ops_allocated)
+			vsocket->alloc_notify_ops = true;
+	}
 	pthread_mutex_unlock(&vhost_user.mutex);
 
 	return vsocket ? 0 : -1;
 }
 
+int __vsym
+rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	return rte_vhost_driver_callback_register__(path, ops, false);
+}
+
+int __vsym
+rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	int ret;
+
+	/*
+	 * Although the ops structure is a const structure, we do need to
+	 * override the guest_notify operation. This is because with the
+	 * previous APIs it was "reserved" and if any garbage value was passed,
+	 * it could crash the application.
+	 */
+	if (ops && !ops->guest_notify) {
+		struct rte_vhost_device_ops *new_ops;
+
+		new_ops = malloc(sizeof(*new_ops));
+		if (new_ops == NULL)
+			return -1;
+
+		memcpy(new_ops, ops, sizeof(*new_ops));
+		new_ops->guest_notify = NULL;
+
+		ret = rte_vhost_driver_callback_register__(path, new_ops, true);
+	} else {
+		ret = rte_vhost_driver_callback_register__(path, ops, false);
+	}
+
+	return ret;
+}
+
+/* Mark the v23 function as the old version, and v24 as the default version. */
+VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
+BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
+MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
+		struct rte_vhost_device_ops const * const ops),
+		rte_vhost_driver_callback_register_v24);
+
 struct rte_vhost_device_ops const *
 vhost_driver_callback_get(const char *path)
 {
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index d322a4a888..7bcbfd12cf 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -64,6 +64,12 @@  DPDK_23 {
 	local: *;
 };
 
+DPDK_24 {
+	global:
+
+	rte_vhost_driver_callback_register;
+} DPDK_23;
+
 EXPERIMENTAL {
 	global:
 
@@ -98,6 +104,9 @@  EXPERIMENTAL {
 	# added in 22.11
 	rte_vhost_async_dma_unconfigure;
 	rte_vhost_vring_call_nonblock;
+
+	# added in 23.07
+	rte_vhost_notify_guest;
 };
 
 INTERNAL {
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 8ff6434c93..79e88f986e 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -44,6 +44,10 @@  static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
 	{"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
 	{"size_1519_max_packets",  offsetof(struct vhost_virtqueue, stats.size_bins[7])},
 	{"guest_notifications",    offsetof(struct vhost_virtqueue, stats.guest_notifications)},
+	{"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_offloaded)},
+	{"guest_notifications_error", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_error)},
 	{"iotlb_hits",             offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
 	{"iotlb_misses",           offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
 	{"inflight_submitted",     offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
@@ -1467,6 +1471,40 @@  rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
 	return ret;
 }
 
+void
+rte_vhost_notify_guest(int vid, uint16_t queue_id)
+{
+	struct virtio_net *dev = get_device(vid);
+	struct vhost_virtqueue *vq;
+
+	if (!dev || queue_id >= VHOST_MAX_VRING)
+		return;
+
+	vq = dev->virtqueue[queue_id];
+	if (!vq)
+		return;
+
+	rte_rwlock_read_lock(&vq->access_lock);
+
+	if (vq->callfd >= 0) {
+		int ret = eventfd_write(vq->callfd, (eventfd_t)1);
+
+		if (ret) {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications_error,
+					1, __ATOMIC_RELAXED);
+		} else {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications,
+					1, __ATOMIC_RELAXED);
+			if (dev->notify_ops->guest_notified)
+				dev->notify_ops->guest_notified(dev->vid);
+		}
+	}
+
+	rte_rwlock_read_unlock(&vq->access_lock);
+}
+
 void
 rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
 {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 37609c7c8d..0ab75b78b5 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -141,6 +141,8 @@  struct virtqueue_stats {
 	uint64_t inflight_completed;
 	/* Counters below are atomic, and should be incremented as such. */
 	uint64_t guest_notifications;
+	uint64_t guest_notifications_offloaded;
+	uint64_t guest_notifications_error;
 };
 
 /**
@@ -884,6 +886,35 @@  vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
 	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
 }
 
+static __rte_always_inline void
+vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	int ret;
+
+	if (dev->notify_ops->guest_notify &&
+	    dev->notify_ops->guest_notify(dev->vid, vq->index)) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	ret = eventfd_write(vq->callfd, (eventfd_t) 1);
+	if (ret) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_error,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+		__atomic_fetch_add(&vq->stats.guest_notifications,
+			1, __ATOMIC_RELAXED);
+	if (dev->notify_ops->guest_notified)
+		dev->notify_ops->guest_notified(dev->vid);
+}
+
+
 static __rte_always_inline void
 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -903,26 +934,16 @@  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 			"%s: used_event_idx=%d, old=%d, new=%d\n",
 			__func__, vhost_used_event(vq), old, new);
 
-		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-					(vq->callfd >= 0)) ||
-				unlikely(!signalled_used_valid)) {
-			eventfd_write(vq->callfd, (eventfd_t) 1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+		if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+		     unlikely(!signalled_used_valid)) &&
+		    vq->callfd >= 0) {
+			vhost_vring_inject_irq(dev, vq);
 		}
 	} else {
 		/* Kick the guest if necessary. */
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0)) {
-			eventfd_write(vq->callfd, (eventfd_t)1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+			vhost_vring_inject_irq(dev, vq);
 		}
 	}
 }
@@ -974,11 +995,8 @@  vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	if (vhost_need_event(off, new, old))
 		kick = true;
 kick:
-	if (kick) {
-		eventfd_write(vq->callfd, (eventfd_t)1);
-		if (dev->notify_ops->guest_notified)
-			dev->notify_ops->guest_notified(dev->vid);
-	}
+	if (kick && vq->callfd >= 0)
+		vhost_vring_inject_irq(dev, vq);
 }
 
 static __rte_always_inline void
@@ -1017,4 +1035,11 @@  mbuf_is_consumed(struct rte_mbuf *m)
 
 uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
 void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
+
+/* Versioned functions */
+int rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+int rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+
 #endif /* _VHOST_NET_CDEV_H_ */
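
For reference, a minimal, illustrative sketch (not taken from this patch nor
from OVS) of how an application could consume the new API: guest_notify()
queues the kick on the PMD thread and returns true, and a dedicated thread
later injects it with rte_vhost_notify_guest(). The queue below uses a
mutex/condvar for brevity; a real datapath would use a lock-free ring so the
PMD thread never blocks.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

#include <rte_vhost.h>

/* Tiny deferred-kick queue; names and sizes here are made up for the example. */
#define MAX_KICKS 1024

static struct { int vid; uint16_t qid; } kicks[MAX_KICKS];
static unsigned int nb_kicks;
static pthread_mutex_t kick_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick_cond = PTHREAD_COND_INITIALIZER;

static int app_new_device(int vid) { (void)vid; return 0; }
static void app_destroy_device(int vid) { (void)vid; }

/* Called by the vhost library instead of doing eventfd_write() itself. */
static bool
app_guest_notify(int vid, uint16_t queue_id)
{
	bool queued = false;

	pthread_mutex_lock(&kick_lock);
	if (nb_kicks < MAX_KICKS) {
		kicks[nb_kicks].vid = vid;
		kicks[nb_kicks].qid = queue_id;
		nb_kicks++;
		queued = true;
		pthread_cond_signal(&kick_cond);
	}
	pthread_mutex_unlock(&kick_lock);

	/* Returning false makes the library fall back to eventfd_write(). */
	return queued;
}

/* Runs in a dedicated, non-PMD thread and performs the actual kicks. */
static void *
kick_thread(void *arg)
{
	(void)arg;

	for (;;) {
		int vid;
		uint16_t qid;

		pthread_mutex_lock(&kick_lock);
		while (nb_kicks == 0)
			pthread_cond_wait(&kick_cond, &kick_lock);
		nb_kicks--;
		vid = kicks[nb_kicks].vid;
		qid = kicks[nb_kicks].qid;
		pthread_mutex_unlock(&kick_lock);

		rte_vhost_notify_guest(vid, qid);
	}

	return NULL;
}

static const struct rte_vhost_device_ops app_ops = {
	.new_device = app_new_device,
	.destroy_device = app_destroy_device,
	.guest_notify = app_guest_notify,
};

/*
 * Registration, after rte_vhost_driver_register(path, flags):
 *	pthread_create(&tid, NULL, kick_thread, NULL);
 *	rte_vhost_driver_callback_register(path, &app_ops);
 */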