[dpdk-dev,4/5] uio: new driver with MSI-X support
Commit Message
This is a merge of igb_uio with MSI-X support through
eventfd (similar to VFIO). The driver requires a small change to the
upstream UIO driver to allow UIO drivers to support ioctls.
See:
http://marc.info/?l=linux-kernel&m=143197030217434&w=2
http://www.spinics.net/lists/kernel/msg1993359.html
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
config/common_linuxapp | 1 +
lib/librte_eal/linuxapp/Makefile | 3 +
lib/librte_eal/linuxapp/uio_msi/Makefile | 13 ++
lib/librte_eal/linuxapp/uio_msi/uio_msi.c | 365 ++++++++++++++++++++++++++++++
lib/librte_eal/linuxapp/uio_msi/uio_msi.h | 22 ++
5 files changed, 404 insertions(+)
create mode 100644 lib/librte_eal/linuxapp/uio_msi/Makefile
create mode 100644 lib/librte_eal/linuxapp/uio_msi/uio_msi.c
create mode 100644 lib/librte_eal/linuxapp/uio_msi/uio_msi.h
Comments
On 5/19/2015 1:40 AM, Stephen Hemminger wrote:
> +
> +/* set the mapping between vector # and existing eventfd. */
> +static int set_irq_eventfd(struct uio_msi_pci_dev *udev, u32 vec, int fd)
> +{
> + struct uio_msi_irq_ctx *ctx;
> + struct eventfd_ctx *trigger;
> + int irq, err;
> +
> + if (vec >= udev->num_vectors) {
> + dev_notice(&udev->pdev->dev, "vec %u >= num_vec %u\n",
> + vec, udev->num_vectors);
> + return -ERANGE;
> + }
> +
> + irq = udev->msix[vec].vector;
> +
> + /* Clearup existing irq mapping */
> + ctx = &udev->ctx[vec];
> + if (ctx->trigger) {
> + free_irq(irq, ctx->trigger);
> + eventfd_ctx_put(ctx->trigger);
> + ctx->trigger = NULL;
> + }
> +
> + /* Passing -1 is used to disable interrupt */
> + if (fd < 0)
> + return 0;
> +
> +
One unnecessary blank line here.
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger)) {
> + err = PTR_ERR(trigger);
> + dev_notice(&udev->pdev->dev,
> + "eventfd ctx get failed: %d\n", err);
> + return err;
> + }
> +
> + err = request_irq(irq, uio_msi_irqhandler, 0, ctx->name, trigger);
> + if (err) {
> + dev_notice(&udev->pdev->dev,
> + "request irq failed: %d\n", err);
> + eventfd_ctx_put(trigger);
> + return err;
> + }
> +
> + dev_dbg(&udev->pdev->dev, "map vector %u to fd %d trigger %p\n",
> + vec, fd, trigger);
> + ctx->trigger = trigger;
> + return 0;
> +}
> +
> +static int
> +uio_msi_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg)
> +{
> + struct uio_msi_pci_dev *udev
> + = container_of(info, struct uio_msi_pci_dev, info);
> + struct uio_msi_irq_set hdr;
> + int err;
> +
> + switch (cmd) {
> + case UIO_MSI_IRQ_SET:
> + if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
> + return -EFAULT;
> +
> + mutex_lock(&udev->mutex);
> + err = set_irq_eventfd(udev, hdr.vec, hdr.fd);
> + mutex_unlock(&udev->mutex);
> + break;
> + default:
> + err = -EOPNOTSUPP;
> + }
> + return err;
> +}
"uio_msi_irq_set" is defined to set a single vector at a time. Compared with
the bulk set in "vfio_irq_set", it requires additional syscalls during
uio_msix_enable().
> +
> +static int uio_msi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> + struct uio_msi_pci_dev *udev;
> + int i, err, vectors;
> +
> + udev = kzalloc(sizeof(struct uio_msi_pci_dev), GFP_KERNEL);
> + if (!udev)
> + return -ENOMEM;
> +
> + err = pci_enable_device(pdev);
> + if (err != 0) {
> + dev_err(&pdev->dev, "cannot enable PCI device\n");
> + goto fail_free;
> + }
> +
> + vectors = pci_msix_vec_count(pdev);
> + if (vectors < 0) {
> + dev_err(&pdev->dev, "device does not support MSI-X\n");
> + err = -EINVAL;
> + goto fail_disable;
> + }
pci_msix_vec_count() has only been available since v3.14, so it requires a
compatibility check.
In order to support older kernel versions, a wrapper function such as
'uio_msix_vec_count()' is probably necessary.
I have one overall question: is there a special reason not to enhance igb_uio
but to define a new uio_msi instead? It also looks like this piece could be
added to igb_uio or uio_pci_generic as well. Do you have a plan for the
latter? Thanks.
On Mon, 25 May 2015 14:01:14 +0800
"Liang, Cunming" <cunming.liang@intel.com> wrote:
>
>
> On 5/19/2015 1:40 AM, Stephen Hemminger wrote:
> > +
> > +/* set the mapping between vector # and existing eventfd. */
> > +static int set_irq_eventfd(struct uio_msi_pci_dev *udev, u32 vec, int fd)
> > +{
> > + struct uio_msi_irq_ctx *ctx;
> > + struct eventfd_ctx *trigger;
> > + int irq, err;
> > +
> > + if (vec >= udev->num_vectors) {
> > + dev_notice(&udev->pdev->dev, "vec %u >= num_vec %u\n",
> > + vec, udev->num_vectors);
> > + return -ERANGE;
> > + }
> > +
> > + irq = udev->msix[vec].vector;
> > +
> > + /* Clearup existing irq mapping */
> > + ctx = &udev->ctx[vec];
> > + if (ctx->trigger) {
> > + free_irq(irq, ctx->trigger);
> > + eventfd_ctx_put(ctx->trigger);
> > + ctx->trigger = NULL;
> > + }
> > +
> > + /* Passing -1 is used to disable interrupt */
> > + if (fd < 0)
> > + return 0;
> > +
> > +
> One unnecessary blank line here.
> > + trigger = eventfd_ctx_fdget(fd);
> > + if (IS_ERR(trigger)) {
> > + err = PTR_ERR(trigger);
> > + dev_notice(&udev->pdev->dev,
> > + "eventfd ctx get failed: %d\n", err);
> > + return err;
> > + }
> > +
> > + err = request_irq(irq, uio_msi_irqhandler, 0, ctx->name, trigger);
> > + if (err) {
> > + dev_notice(&udev->pdev->dev,
> > + "request irq failed: %d\n", err);
> > + eventfd_ctx_put(trigger);
> > + return err;
> > + }
> > +
> > + dev_dbg(&udev->pdev->dev, "map vector %u to fd %d trigger %p\n",
> > + vec, fd, trigger);
> > + ctx->trigger = trigger;
> > + return 0;
> > +}
> > +
> > +static int
> > +uio_msi_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg)
> > +{
> > + struct uio_msi_pci_dev *udev
> > + = container_of(info, struct uio_msi_pci_dev, info);
> > + struct uio_msi_irq_set hdr;
> > + int err;
> > +
> > + switch (cmd) {
> > + case UIO_MSI_IRQ_SET:
> > + if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
> > + return -EFAULT;
> > +
> > + mutex_lock(&udev->mutex);
> > + err = set_irq_eventfd(udev, hdr.vec, hdr.fd);
> > + mutex_unlock(&udev->mutex);
> > + break;
> > + default:
> > + err = -EOPNOTSUPP;
> > + }
> > + return err;
> > +}
> "uio_msi_irq_set" is defined to set a single vector at a time. Compared with
> the bulk set in "vfio_irq_set", it requires additional syscalls during
> uio_msix_enable().
The bulk operation in VFIO is actually a bad design;
it forces too many updates when manipulating individual vectors.
Personally, I think the whole VFIO API has some questionable design choices
that I did not want to repeat.
> > +static int uio_msi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > +{
> > + struct uio_msi_pci_dev *udev;
> > + int i, err, vectors;
> > +
> > + udev = kzalloc(sizeof(struct uio_msi_pci_dev), GFP_KERNEL);
> > + if (!udev)
> > + return -ENOMEM;
> > +
> > + err = pci_enable_device(pdev);
> > + if (err != 0) {
> > + dev_err(&pdev->dev, "cannot enable PCI device\n");
> > + goto fail_free;
> > + }
> > +
> > + vectors = pci_msix_vec_count(pdev);
> > + if (vectors < 0) {
> > + dev_err(&pdev->dev, "device does not support MSI-X\n");
> > + err = -EINVAL;
> > + goto fail_disable;
> > + }
> pci_msix_vec_count() has only been available since v3.14, so it requires a
> compatibility check.
> In order to support older kernel versions, a wrapper function such as
> 'uio_msix_vec_count()' is probably necessary.
>
> I have one overall question: is there a special reason not to enhance igb_uio
> but to define a new uio_msi instead? It also looks like this piece could be
> added to igb_uio or uio_pci_generic as well. Do you have a plan for the
> latter? Thanks.
I wanted something that could go upstream. igb_uio has some other things
which make it unlikely to get accepted in its current form, so it seemed best
to start fresh.
Also, I intentionally did not want to address any kernel version earlier
than the version where VFIO was added.
@@ -100,6 +100,7 @@ CONFIG_RTE_EAL_ALLOW_INV_SOCKET_ID=n
CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
CONFIG_RTE_EAL_IGB_UIO=y
CONFIG_RTE_EAL_VFIO=y
+CONFIG_RTE_EAL_UIO_MSI=y
#
# Special configurations in PCI Config Space for high performance
@@ -34,6 +34,9 @@ include $(RTE_SDK)/mk/rte.vars.mk
ifeq ($(CONFIG_RTE_EAL_IGB_UIO),y)
DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += igb_uio
endif
+ifeq ($(CONFIG_RTE_EAL_UIO_MSI),y)
+DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += uio_msi
+endif
DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal
ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)
DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kni
new file mode 100644
@@ -0,0 +1,13 @@
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+MODULE = uio_msi
+MODULE_PATH = drivers/uio/uio_msi
+
+MODULE_CFLAGS += -I$(SRCDIR)
+MODULE_CFLAGS += -I$(RTE_OUTPUT)/include
+MODULE_CFLAGS += -Winline -Wall -Werror
+
+SRCS-y := uio_msi.c
+
+include $(RTE_SDK)/mk/rte.module.mk
new file mode 100644
@@ -0,0 +1,365 @@
+/*-
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright (c) 2015 by Brocade Communications Systems, Inc.
+ * All rights reserved.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/eventfd.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/uio_driver.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/version.h>
+
+#include "uio_msi.h"
+
+#define DRIVER_VERSION "0.1.0"
+#define NON_Q_VECTORS 1
+
+/*
+ * Per-device state for one PCI device bound to this driver.
+ *
+ * The structure embeds the uio_info that is registered with the UIO core;
+ * the open/release/ioctl callbacks in this file recover the enclosing
+ * structure from it with container_of().
+ *
+ * 'msix' and 'ctx' are arrays with num_vectors entries each, allocated in
+ * uio_msi_probe(); ctx[i] holds the eventfd state for msix[i].
+ */
+struct uio_msi_pci_dev {
+ struct uio_info info; /* UIO driver info */
+ struct pci_dev *pdev; /* PCI device */
+ struct mutex mutex; /* open/release/ioctl mutex */
+ int ref_cnt; /* references to device */
+ u16 num_vectors; /* How many MSI-X slots are used */
+ struct msix_entry *msix; /* MSI-x vector table */
+ /* Per-vector eventfd binding; trigger is NULL while nothing is attached */
+ struct uio_msi_irq_ctx {
+ struct eventfd_ctx *trigger; /* MSI-x vector to eventfd */
+ char *name; /* name in /proc/interrupts */
+ } *ctx;
+};
+
+/*
+ * Module parameter: upper bound that uio_msi_probe() applies to the number
+ * of MSI-X vectors reported by the device (default 33).
+ * NOTE(review): registered with permission 0, so presumably it can only be
+ * set at module load time and is not exposed through sysfs -- confirm.
+ */
+static unsigned int max_vectors = 33;
+module_param(max_vectors, uint, 0);
+MODULE_PARM_DESC(max_vectors, "Upper limit on # of MSI-X vectors used");
+
+/*
+ * Interrupt handler installed by set_irq_eventfd() for an MSI-X vector.
+ *
+ * @irq: Linux IRQ number that fired (only used in the debug message)
+ * @arg: cookie passed to request_irq(): the eventfd context to signal
+ *
+ * Signals the eventfd with a count of 1 and always reports IRQ_HANDLED.
+ */
+static irqreturn_t uio_msi_irqhandler(int irq, void *arg)
+{
+	struct eventfd_ctx *efd = arg;
+
+	pr_devel("irq %u trigger %p\n", irq, efd);
+	eventfd_signal(efd, 1);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Bind MSI-X vector @vec of @udev to the eventfd referred to by @fd.
+ *
+ * Any eventfd already attached to the vector is detached first: its IRQ is
+ * freed and its context reference dropped.  A negative @fd stops there and
+ * leaves the vector without a handler.
+ *
+ * Called from uio_msi_ioctl() with udev->mutex held.
+ *
+ * Returns 0 on success, -ERANGE when @vec is not below udev->num_vectors,
+ * or the error from eventfd_ctx_fdget() / request_irq().  In the latter two
+ * cases the vector stays detached, because the previous binding has already
+ * been removed by then.
+ */
+static int set_irq_eventfd(struct uio_msi_pci_dev *udev, u32 vec, int fd)
+{
+	struct device *dev = &udev->pdev->dev;
+	struct uio_msi_irq_ctx *slot;
+	struct eventfd_ctx *efd;
+	int irq, rc;
+
+	if (vec >= udev->num_vectors) {
+		dev_notice(dev, "vec %u >= num_vec %u\n",
+			   vec, udev->num_vectors);
+		return -ERANGE;
+	}
+
+	slot = &udev->ctx[vec];
+	irq = udev->msix[vec].vector;
+
+	/* Tear down whatever eventfd was attached to this vector before. */
+	if (slot->trigger) {
+		free_irq(irq, slot->trigger);
+		eventfd_ctx_put(slot->trigger);
+		slot->trigger = NULL;
+	}
+
+	/* A negative fd (conventionally -1) only disables the interrupt. */
+	if (fd < 0)
+		return 0;
+
+	efd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(efd)) {
+		rc = PTR_ERR(efd);
+		dev_notice(dev, "eventfd ctx get failed: %d\n", rc);
+		return rc;
+	}
+
+	/* The eventfd context doubles as the dev_id cookie for free_irq(). */
+	rc = request_irq(irq, uio_msi_irqhandler, 0, slot->name, efd);
+	if (rc) {
+		dev_notice(dev, "request irq failed: %d\n", rc);
+		eventfd_ctx_put(efd);
+		return rc;
+	}
+
+	dev_dbg(dev, "map vector %u to fd %d trigger %p\n",
+		vec, fd, efd);
+	slot->trigger = efd;
+	return 0;
+}
+
+/*
+ * UIO ioctl callback.
+ *
+ * @info: uio_info embedded in the device's struct uio_msi_pci_dev
+ * @cmd:  ioctl command; only UIO_MSI_IRQ_SET is handled
+ * @arg:  user pointer to a struct uio_msi_irq_set
+ *
+ * Returns the result of set_irq_eventfd(), -EFAULT if the argument cannot
+ * be copied from user space, or -EOPNOTSUPP for any other command.
+ */
+static int
+uio_msi_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg)
+{
+	struct uio_msi_pci_dev *udev;
+	struct uio_msi_irq_set req;
+	int rc;
+
+	if (cmd != UIO_MSI_IRQ_SET)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	udev = container_of(info, struct uio_msi_pci_dev, info);
+
+	/* Serialise against open/release and other ioctl callers. */
+	mutex_lock(&udev->mutex);
+	rc = set_irq_eventfd(udev, req.vec, req.fd);
+	mutex_unlock(&udev->mutex);
+
+	return rc;
+}
+
+/*
+ * UIO open callback: the first opener enables MSI-X on the device, later
+ * openers only take a reference.
+ *
+ * @info:  uio_info embedded in the device's struct uio_msi_pci_dev
+ * @inode: unused
+ *
+ * Returns 0 on success or a negative errno.  A reference is only counted
+ * when the open succeeds, so a failed attempt leaves ref_cnt untouched and
+ * the next open tries to enable MSI-X again.
+ */
+static int
+uio_msi_open(struct uio_info *info, struct inode *inode)
+{
+	struct uio_msi_pci_dev *udev
+		= container_of(info, struct uio_msi_pci_dev, info);
+	int err = 0;
+
+	mutex_lock(&udev->mutex);
+	if (udev->ref_cnt == 0) {
+		err = pci_enable_msix(udev->pdev, udev->msix,
+				      udev->num_vectors);
+		/*
+		 * A positive value is the number of vectors that could have
+		 * been allocated; MSI-X is not enabled in that case, so
+		 * report it as an error rather than returning it from open().
+		 */
+		if (err > 0)
+			err = -ENOSPC;
+	}
+	if (err == 0)
+		udev->ref_cnt++;
+	mutex_unlock(&udev->mutex);
+
+	return err;
+}
+
+/*
+ * UIO release callback: drops one reference; when the last one goes away,
+ * every vector that still has an eventfd attached is detached and MSI-X is
+ * disabled on the device.
+ *
+ * @info:  uio_info embedded in the device's struct uio_msi_pci_dev
+ * @inode: unused
+ *
+ * Always returns 0.
+ */
+static int
+uio_msi_release(struct uio_info *info, struct inode *inode)
+{
+	struct uio_msi_pci_dev *udev
+		= container_of(info, struct uio_msi_pci_dev, info);
+	int vec;
+
+	mutex_lock(&udev->mutex);
+
+	/* Other openers remain: nothing to tear down yet. */
+	if (--udev->ref_cnt != 0)
+		goto unlock;
+
+	for (vec = 0; vec < udev->num_vectors; vec++) {
+		struct uio_msi_irq_ctx *slot = &udev->ctx[vec];
+
+		if (slot->trigger) {
+			free_irq(udev->msix[vec].vector, slot->trigger);
+			eventfd_ctx_put(slot->trigger);
+			slot->trigger = NULL;
+		}
+	}
+	pci_disable_msix(udev->pdev);
+
+unlock:
+	mutex_unlock(&udev->mutex);
+	return 0;
+}
+
+/*
+ * Undo the ioremap() calls made by setup_maps().
+ *
+ * @mem: array of MAX_UIO_MAPS entries (the mem[] table of a uio_info);
+ *       entries whose internal_addr is NULL are skipped.
+ */
+static void
+release_iomaps(struct uio_mem *mem)
+{
+	int idx;
+
+	for (idx = 0; idx < MAX_UIO_MAPS; idx++) {
+		if (!mem[idx].internal_addr)
+			continue;
+
+		iounmap(mem[idx].internal_addr);
+	}
+}
+
+/*
+ * Describe the device's BARs to the UIO core.
+ *
+ * @pdev: PCI device whose BAR0..BAR5 resources are examined
+ * @info: uio_info whose mem[] and port[] tables are filled in
+ *
+ * Memory BARs are ioremap()ed and recorded in info->mem[] (at most
+ * MAX_UIO_MAPS of them); I/O port BARs are recorded in info->port[] (at
+ * most MAX_UIO_PORT_REGIONS).  BARs with a zero start or length, and BARs
+ * beyond those limits, are skipped.
+ *
+ * Returns 0 on success or -EINVAL if an ioremap() fails.  On failure every
+ * mapping made so far is undone and its internal_addr reset to NULL, so a
+ * later release_iomaps() on the same table does not unmap it twice.
+ */
+static int
+setup_maps(struct pci_dev *pdev, struct uio_info *info)
+{
+	int i, m = 0, p = 0, err;
+	static const char * const bar_names[] = {
+		"BAR0", "BAR1", "BAR2", "BAR3", "BAR4", "BAR5",
+	};
+
+	for (i = 0; i < ARRAY_SIZE(bar_names); i++) {
+		unsigned long start = pci_resource_start(pdev, i);
+		unsigned long flags = pci_resource_flags(pdev, i);
+		unsigned long len = pci_resource_len(pdev, i);
+
+		if (start == 0 || len == 0)
+			continue;
+
+		if (flags & IORESOURCE_MEM) {
+			void __iomem *addr;
+
+			if (m >= MAX_UIO_MAPS)
+				continue;
+
+			addr = ioremap(start, len);
+			if (addr == NULL) {
+				err = -EINVAL;
+				goto fail;
+			}
+
+			info->mem[m].name = bar_names[i];
+			info->mem[m].addr = start;
+			info->mem[m].internal_addr = addr;
+			info->mem[m].size = len;
+			info->mem[m].memtype = UIO_MEM_PHYS;
+			++m;
+		} else if (flags & IORESOURCE_IO) {
+			if (p >= MAX_UIO_PORT_REGIONS)
+				continue;
+
+			info->port[p].name = bar_names[i];
+			info->port[p].start = start;
+			info->port[p].size = len;
+			info->port[p].porttype = UIO_PORT_X86;
+			++p;
+		}
+	}
+
+	return 0;
+
+ fail:
+	/* Unmap what was mapped and forget the now-stale pointers. */
+	while (m-- > 0) {
+		iounmap(info->mem[m].internal_addr);
+		info->mem[m].internal_addr = NULL;
+	}
+	return err;
+}
+
+/*
+ * PCI probe callback: allocate the per-device state, size the MSI-X tables,
+ * claim and map the BARs, and register the device with the UIO core.
+ *
+ * @pdev: PCI device being bound
+ * @id:   matching device id (unused)
+ *
+ * MSI-X itself is not enabled here; that happens on the first open of the
+ * UIO device (see uio_msi_open()).
+ *
+ * Returns 0 on success or a negative errno; on failure everything acquired
+ * up to that point is released again.
+ */
+static int uio_msi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct uio_msi_pci_dev *udev;
+	int i, err, vectors;
+
+	udev = kzalloc(sizeof(*udev), GFP_KERNEL);
+	if (!udev)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err != 0) {
+		dev_err(&pdev->dev, "cannot enable PCI device\n");
+		goto fail_free;
+	}
+
+	vectors = pci_msix_vec_count(pdev);
+	if (vectors < 0) {
+		dev_err(&pdev->dev, "device does not support MSI-X\n");
+		err = -EINVAL;
+		goto fail_disable;
+	}
+
+	udev->num_vectors = min_t(u16, vectors, max_vectors);
+
+	/* Every failure from here to pci_request_regions() is -ENOMEM. */
+	err = -ENOMEM;
+
+	/* kcalloc() takes (element count, element size, gfp flags). */
+	udev->msix = kcalloc(udev->num_vectors, sizeof(*udev->msix),
+			     GFP_KERNEL);
+	if (!udev->msix)
+		goto fail_disable;
+
+	udev->ctx = kcalloc(udev->num_vectors, sizeof(*udev->ctx),
+			    GFP_KERNEL);
+	if (!udev->ctx)
+		goto fail_free_msix;
+
+	for (i = 0; i < udev->num_vectors; i++) {
+		udev->msix[i].entry = i;
+
+		udev->ctx[i].name = kasprintf(GFP_KERNEL,
+					      KBUILD_MODNAME "[%d](%s)",
+					      i, pci_name(pdev));
+		if (!udev->ctx[i].name)
+			goto fail_free_ctx;
+	}
+
+	err = pci_request_regions(pdev, "uio_msi");
+	if (err != 0) {
+		dev_err(&pdev->dev, "Cannot request regions\n");
+		goto fail_free_ctx;
+	}
+
+	pci_set_master(pdev);
+
+	/*
+	 * remap resources; setup_maps() unmaps its own partial work on
+	 * failure, so skip release_iomaps() in that case.
+	 */
+	err = setup_maps(pdev, &udev->info);
+	if (err)
+		goto fail_release_regions;
+
+	/* fill uio infos */
+	udev->info.name = "uio_msi";
+	udev->info.version = DRIVER_VERSION;
+	udev->info.priv = udev;
+	udev->pdev = pdev;
+	udev->info.ioctl = uio_msi_ioctl;
+	udev->info.open = uio_msi_open;
+	udev->info.release = uio_msi_release;
+	udev->info.irq = UIO_IRQ_CUSTOM;
+	mutex_init(&udev->mutex);
+
+	/* register uio driver */
+	err = uio_register_device(&pdev->dev, &udev->info);
+	if (err != 0)
+		goto fail_release_iomem;
+
+	pci_set_drvdata(pdev, udev);
+	return 0;
+
+fail_release_iomem:
+	release_iomaps(udev->info.mem);
+fail_release_regions:
+	pci_release_regions(pdev);
+fail_free_ctx:
+	/* ctx[] is zero-initialised, so names not yet allocated are NULL. */
+	for (i = 0; i < udev->num_vectors; i++)
+		kfree(udev->ctx[i].name);
+	kfree(udev->ctx);
+fail_free_msix:
+	kfree(udev->msix);
+fail_disable:
+	pci_disable_device(pdev);
+fail_free:
+	kfree(udev);
+
+	return err;
+}
+
+/*
+ * PCI remove callback: unregister from the UIO core and release everything
+ * uio_msi_probe() acquired.
+ *
+ * @pdev: PCI device being unbound
+ *
+ * The driver data stored by probe is the struct uio_msi_pci_dev itself, so
+ * it is read back as that type and the embedded uio_info is derived from
+ * it, rather than relying on 'info' being the first member.
+ */
+static void uio_msi_remove(struct pci_dev *pdev)
+{
+	struct uio_msi_pci_dev *udev = pci_get_drvdata(pdev);
+	struct uio_info *info = &udev->info;
+	int i;
+
+	uio_unregister_device(info);
+	release_iomaps(info->mem);
+
+	pci_release_regions(pdev);
+	for (i = 0; i < udev->num_vectors; i++)
+		kfree(udev->ctx[i].name);
+	kfree(udev->ctx);
+	kfree(udev->msix);
+	pci_disable_device(pdev);
+
+	pci_set_drvdata(pdev, NULL);
+	kfree(udev);
+}
+
+/*
+ * PCI driver description: probe/remove entry points registered under the
+ * name "uio_msi".
+ * NOTE(review): no .id_table is set here, so presumably devices have to be
+ * bound to this driver explicitly from user space -- confirm.
+ */
+static struct pci_driver uio_msi_pci_driver = {
+ .name = "uio_msi",
+ .probe = uio_msi_probe,
+ .remove = uio_msi_remove,
+};
+
+module_pci_driver(uio_msi_pci_driver);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
+MODULE_DESCRIPTION("UIO driver for MSI-X PCI devices");
new file mode 100644
@@ -0,0 +1,22 @@
+/*
+ * UIO_MSI API definition
+ *
+ * Copyright (c) 2015 by Brocade Communications Systems, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _UIO_PCI_MSI_H
+#define _UIO_PCI_MSI_H
+
+/*
+ * Argument of the UIO_MSI_IRQ_SET ioctl.
+ *
+ * vec: index of the MSI-X vector to configure; the driver rejects values
+ *      that are not below its per-device vector count with -ERANGE
+ * fd:  eventfd to signal when the vector fires; a negative value detaches
+ *      whatever eventfd is currently bound to the vector
+ */
+struct uio_msi_irq_set {
+ u32 vec;
+ int fd;
+};
+
+/* ioctl numbers use type 'I' and start just above UIO_MSI_BASE */
+#define UIO_MSI_BASE 0x86
+/* Bind one MSI-X vector to an eventfd, or unbind it */
+#define UIO_MSI_IRQ_SET _IOW('I', UIO_MSI_BASE+1, struct uio_msi_irq_set)
+
+#endif