kernel -- Import virtio & virtio-block drivers.
authorVenkatesh Srinivas <vsrinivas@ops101.org>
Mon, 24 Dec 2012 18:41:27 +0000 (13:41 -0500)
committerSascha Wildner <saw@online.de>
Sun, 6 Jan 2013 22:13:55 +0000 (23:13 +0100)
virtio-blk provides a paravirtualized storage controller, with one
disk per virtio device.

This driver is based on Tim Bisson's port of FreeBSD's virtio and
virtio-blk devices. Differences from Tim's port:

* Import all FreeBSD updates from 4/16 on.
* Remove indirect descriptor support from virtio device.
* Mark devices as D_MPSAFE; removes mplock around disk routines, they
  are all correctly self-synchronized.
* Implement devstat support.
* Move I/O completion routine to threaded taskqueue.
* Do not hold target spinlock around virtqueue notify.
* Move objcache caches to kmalloc.

18 files changed:
sys/dev/virtual/Makefile
sys/dev/virtual/virtio/Makefile [new file with mode: 0644]
sys/dev/virtual/virtio/block/Makefile [new file with mode: 0644]
sys/dev/virtual/virtio/block/virtio_blk.c [new file with mode: 0644]
sys/dev/virtual/virtio/block/virtio_blk.h [new file with mode: 0644]
sys/dev/virtual/virtio/pci/Makefile [new file with mode: 0644]
sys/dev/virtual/virtio/pci/virtio_bus_if.h [new file with mode: 0644]
sys/dev/virtual/virtio/pci/virtio_if.h [new file with mode: 0644]
sys/dev/virtual/virtio/pci/virtio_pci.c [new file with mode: 0644]
sys/dev/virtual/virtio/pci/virtio_pci.h [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/Makefile [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtio.c [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtio.h [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtio_bus_if.m [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtio_if.m [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtio_ring.h [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtqueue.c [new file with mode: 0644]
sys/dev/virtual/virtio/virtio/virtqueue.h [new file with mode: 0644]

index 3514db7..f1a4af8 100644 (file)
@@ -1,6 +1,6 @@
 .include "${.CURDIR}/../../platform/${MACHINE_PLATFORM}/Makefile.inc"
 
-SUBDIR=        vkernel
+SUBDIR=        vkernel virtio
 
 .for dir in ${SUBDIR}
 .if empty(DEV_SUPPORT:Mvirtual) && \
diff --git a/sys/dev/virtual/virtio/Makefile b/sys/dev/virtual/virtio/Makefile
new file mode 100644 (file)
index 0000000..b83a94b
--- /dev/null
@@ -0,0 +1,28 @@
+#
+# $FreeBSD$
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+SUBDIR= virtio pci block 
+
+.include <bsd.subdir.mk>
diff --git a/sys/dev/virtual/virtio/block/Makefile b/sys/dev/virtual/virtio/block/Makefile
new file mode 100644 (file)
index 0000000..f25b6d7
--- /dev/null
@@ -0,0 +1,39 @@
+#
+# $FreeBSD$
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+.PATH: ${.CURDIR}
+
+KMOD=  virtio_blk
+SRCS=  virtio_blk.c
+SRCS+= virtio_bus_if.h virtio_if.h
+SRCS+= bus_if.h device_if.h 
+
+CFLAGS+=       -I${.CURDIR}/..
+
+MFILES=        kern/bus_if.m kern/device_if.m \
+       dev/virtual/virtio/virtio/virtio_bus_if.m \
+       dev/virtual/virtio/virtio/virtio_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/dev/virtual/virtio/block/virtio_blk.c b/sys/dev/virtual/virtio/block/virtio_blk.c
new file mode 100644 (file)
index 0000000..d172b84
--- /dev/null
@@ -0,0 +1,1030 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/block/virtio_blk.c,v 1.4 2012/04/16 18:29:12 grehan Exp $
+ */
+
+/* Driver for VirtIO block devices. */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+
+#include <sys/buf2.h>
+#include <sys/rman.h>
+#include <sys/disk.h>
+#include <sys/spinlock.h>
+#include <sys/spinlock2.h>
+#include <sys/devicestat.h>
+
+#include <virtio/virtio.h>
+#include <virtio/virtqueue.h>
+#include "virtio_blk.h"
+
+/*
+ * One in-flight block request: the virtio request header, the bio it
+ * services, and the one-byte status the host writes back on completion.
+ */
+struct vtblk_request {
+       struct virtio_blk_outhdr        vbr_hdr;
+       struct bio                      *vbr_bp;
+       uint8_t                         vbr_ack;
+
+       TAILQ_ENTRY(vtblk_request)      vbr_link;
+};
+
+/*
+ * Per-device softc; one virtqueue and one disk per virtio-blk device.
+ * vtblk_mtx guards the bioq, request lists, flags, and virtqueue.
+ */
+struct vtblk_softc {
+       device_t                        vtblk_dev;
+       struct spinlock                 vtblk_mtx;
+       uint64_t                        vtblk_features;
+
+#define VTBLK_FLAG_READONLY            0x0002
+#define VTBLK_FLAG_DETACH              0x0004
+#define VTBLK_FLAG_SUSPEND             0x0008
+#define VTBLK_FLAG_DUMPING             0x0010
+       uint32_t                        vtblk_flags;
+
+       struct virtqueue                *vtblk_vq;
+       struct sglist                   *vtblk_sglist;
+       struct disk                     vtblk_disk;
+       cdev_t                          cdev;
+       struct devstat                  stats;
+
+       /* Pending bios plus free and ready request pools. */
+       struct bio_queue_head           vtblk_bioq;
+       TAILQ_HEAD(, vtblk_request)     vtblk_req_free;
+       TAILQ_HEAD(, vtblk_request)     vtblk_req_ready;
+
+       /* Completion work runs on the threaded taskqueue. */
+       struct task                     vtblk_intr_task;
+
+       int                             vtblk_sector_size;
+       int                             vtblk_max_nsegs;
+       int                             vtblk_unit;
+       int                             vtblk_request_count;
+
+       /* Dedicated request for the kernel dump path (no allocation). */
+       struct vtblk_request            vtblk_dump_request;
+};
+
+/* Human-readable names for negotiated virtio-blk feature bits. */
+static struct virtio_feature_desc vtblk_feature_desc[] = {
+       { VIRTIO_BLK_F_BARRIER,         "HostBarrier"   },
+       { VIRTIO_BLK_F_SIZE_MAX,        "MaxSegSize"    },
+       { VIRTIO_BLK_F_SEG_MAX,         "MaxNumSegs"    },
+       { VIRTIO_BLK_F_GEOMETRY,        "DiskGeometry"  },
+       { VIRTIO_BLK_F_RO,              "ReadOnly"      },
+       { VIRTIO_BLK_F_BLK_SIZE,        "BlockSize"     },
+       { VIRTIO_BLK_F_SCSI,            "SCSICmds"      },
+       { VIRTIO_BLK_F_FLUSH,           "FlushCmd"      },
+       { VIRTIO_BLK_F_TOPOLOGY,        "Topology"      },
+
+       { 0, NULL }
+};
+
+static int     vtblk_modevent(module_t, int, void *);
+
+static int     vtblk_probe(device_t);
+static int     vtblk_attach(device_t);
+static int     vtblk_detach(device_t);
+static int     vtblk_suspend(device_t);
+static int     vtblk_resume(device_t);
+static int     vtblk_shutdown(device_t);
+
+static void    vtblk_negotiate_features(struct vtblk_softc *);
+static int     vtblk_maximum_segments(struct vtblk_softc *,
+                                      struct virtio_blk_config *);
+static int     vtblk_alloc_virtqueue(struct vtblk_softc *);
+static void    vtblk_alloc_disk(struct vtblk_softc *,
+                                struct virtio_blk_config *);
+/*
+ * Interface to the device switch.
+ */
+static d_open_t                vtblk_open;
+static d_strategy_t    vtblk_strategy;
+static d_dump_t                vtblk_dump;
+
+/*
+ * Disk device switch. D_MPSAFE: entry points synchronize themselves
+ * via the softc spinlock, no mplock is taken around them.
+ */
+static struct dev_ops vbd_disk_ops = {
+       { "vbd", 200, D_DISK | D_MPSAFE },
+       .d_open         = vtblk_open,
+       .d_close        = nullclose,
+       .d_read         = physread,
+       .d_write        = physwrite,
+       .d_strategy     = vtblk_strategy,
+       .d_dump         = vtblk_dump,
+};
+
+static void            vtblk_startio(struct vtblk_softc *);
+static struct vtblk_request *vtblk_bio_request(struct vtblk_softc *);
+static int             vtblk_execute_request(struct vtblk_softc *,
+                                             struct vtblk_request *);
+
+static int             vtblk_vq_intr(void *);
+static void            vtblk_intr_task(void *, int);
+
+static void            vtblk_stop(struct vtblk_softc *);
+
+static void            vtblk_prepare_dump(struct vtblk_softc *);
+static int             vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
+static int             vtblk_flush_dump(struct vtblk_softc *);
+static int             vtblk_poll_request(struct vtblk_softc *,
+                                          struct vtblk_request *);
+
+static void            vtblk_drain_vq(struct vtblk_softc *, int);
+static void            vtblk_drain(struct vtblk_softc *);
+
+static int             vtblk_alloc_requests(struct vtblk_softc *);
+static void            vtblk_free_requests(struct vtblk_softc *);
+static struct vtblk_request *vtblk_dequeue_request(struct vtblk_softc *);
+static void            vtblk_enqueue_request(struct vtblk_softc *,
+                                             struct vtblk_request *);
+
+static struct vtblk_request *vtblk_dequeue_ready(struct vtblk_softc *);
+static void            vtblk_enqueue_ready(struct vtblk_softc *,
+                                           struct vtblk_request *);
+
+static void            vtblk_bio_error(struct bio *, int);
+
+/* Tunables. */
+static int vtblk_no_ident = 0;
+TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
+
+/* Features desired/implemented by this driver. */
+#define VTBLK_FEATURES \
+    (VIRTIO_BLK_F_BARRIER              | \
+     VIRTIO_BLK_F_SIZE_MAX             | \
+     VIRTIO_BLK_F_SEG_MAX              | \
+     VIRTIO_BLK_F_GEOMETRY             | \
+     VIRTIO_BLK_F_RO                   | \
+     VIRTIO_BLK_F_BLK_SIZE             | \
+     VIRTIO_BLK_F_FLUSH)
+
+#define VTBLK_MTX(_sc)         &(_sc)->vtblk_mtx
+#define VTBLK_LOCK_INIT(_sc)   spin_init(&(_sc)->vtblk_mtx)
+#define VTBLK_LOCK(_sc)                spin_lock(VTBLK_MTX((_sc)))
+#define VTBLK_TRYLOCK(_sc)     spin_trylock(VTBLK_MTX((_sc)))
+#define VTBLK_UNLOCK(_sc)      spin_unlock(VTBLK_MTX((_sc)))
+#define VTBLK_LOCK_DESTROY(_sc)        spin_uninit(VTBLK_MTX((_sc)))
+
+#define VTBLK_LOCK_ASSERT(_sc)
+#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc)
+
+/*
+ * Each block request uses at least two segments - one for the header
+ * and one for the status.
+ */
+#define VTBLK_MIN_SEGMENTS     2
+
+/* Newbus glue: attach as a child of virtio_pci. */
+static device_method_t vtblk_methods[] = {
+       /* Device methods. */
+       DEVMETHOD(device_probe,         vtblk_probe),
+       DEVMETHOD(device_attach,        vtblk_attach),
+       DEVMETHOD(device_detach,        vtblk_detach),
+       DEVMETHOD(device_suspend,       vtblk_suspend),
+       DEVMETHOD(device_resume,        vtblk_resume),
+       DEVMETHOD(device_shutdown,      vtblk_shutdown),
+
+       { 0, 0 }
+};
+
+static driver_t vtblk_driver = {
+       "vtblk",
+       vtblk_methods,
+       sizeof(struct vtblk_softc)
+};
+static devclass_t vtblk_devclass;
+
+DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
+             vtblk_modevent, NULL);
+MODULE_VERSION(virtio_blk, 1);
+MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
+
+/*
+ * Module event handler. No module-global state needs setup or
+ * teardown; known events succeed, unknown events are rejected.
+ */
+static int
+vtblk_modevent(module_t mod, int type, void *unused)
+{
+       int error;
+
+       error = 0;
+
+       switch (type) {
+       case MOD_LOAD:
+               break;
+       case MOD_UNLOAD:
+               break;
+       case MOD_SHUTDOWN:
+               break;
+       default:
+               error = EOPNOTSUPP;
+               break;
+       }
+
+       return (error);
+}
+
+/* Probe: claim any virtio device whose device type is block. */
+static int
+vtblk_probe(device_t dev)
+{
+
+       if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
+               return (ENXIO);
+
+       device_set_desc(dev, "VirtIO Block Adapter");
+
+       return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Attach: negotiate features, size and allocate the working sglist,
+ * virtqueue and request pool, create the disk, then enable the
+ * virtqueue interrupt. On any failure we fall into the fail: path
+ * and let vtblk_detach() tear down whatever was set up so far.
+ */
+static int
+vtblk_attach(device_t dev)
+{
+       struct vtblk_softc *sc;
+       struct virtio_blk_config blkcfg;
+       int error;
+
+       sc = device_get_softc(dev);
+       sc->vtblk_dev = dev;
+       sc->vtblk_unit = device_get_unit(dev);
+
+       VTBLK_LOCK_INIT(sc);
+
+       bioq_init(&sc->vtblk_bioq);
+       TAILQ_INIT(&sc->vtblk_req_free);
+       TAILQ_INIT(&sc->vtblk_req_ready);
+
+       virtio_set_feature_desc(dev, vtblk_feature_desc);
+       vtblk_negotiate_features(sc);
+
+       if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
+               sc->vtblk_flags |= VTBLK_FLAG_READONLY;
+
+       /* Get local copy of config. */
+       virtio_read_device_config(dev, 0, &blkcfg,
+                                 sizeof(struct virtio_blk_config));
+
+       /*
+        * With the current sglist(9) implementation, it is not easy
+        * for us to support a maximum segment size as adjacent
+        * segments are coalesced. For now, just make sure it's larger
+        * than the maximum supported transfer size.
+        */
+       if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
+               if (blkcfg.size_max < MAXPHYS) {
+                       error = ENOTSUP;
+                       device_printf(dev, "host requires unsupported "
+                           "maximum segment size feature\n");
+                       goto fail;
+               }
+       }
+
+       sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
+        if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
+               error = EINVAL;
+               device_printf(dev, "fewer than minimum number of segments "
+                   "allowed: %d\n", sc->vtblk_max_nsegs);
+               goto fail;
+       }
+
+       /*
+        * Allocate working sglist. The number of segments may be too
+        * large to safely store on the stack.
+        */
+       sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
+       if (sc->vtblk_sglist == NULL) {
+               error = ENOMEM;
+               device_printf(dev, "cannot allocate sglist\n");
+               goto fail;
+       }
+
+       error = vtblk_alloc_virtqueue(sc);
+       if (error) {
+               device_printf(dev, "cannot allocate virtqueue\n");
+               goto fail;
+       }
+
+       error = vtblk_alloc_requests(sc);
+       if (error) {
+               device_printf(dev, "cannot preallocate requests\n");
+               goto fail;
+       }
+
+       vtblk_alloc_disk(sc, &blkcfg);
+
+       TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
+
+       error = virtio_setup_intr(dev);
+       if (error) {
+               device_printf(dev, "cannot setup virtqueue interrupt\n");
+               goto fail;
+       }
+
+       virtqueue_enable_intr(sc->vtblk_vq);
+
+fail:
+       /* error is 0 on the success path; detach only on failure. */
+       if (error)
+               vtblk_detach(dev);
+
+       return (error);
+}
+
+/*
+ * Detach: mark the device going away, stop it, drain the completion
+ * task and all outstanding requests, then release resources. Also
+ * used as the error-unwind path from vtblk_attach().
+ */
+static int
+vtblk_detach(device_t dev)
+{
+       struct vtblk_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       VTBLK_LOCK(sc);
+       sc->vtblk_flags |= VTBLK_FLAG_DETACH;
+       if (device_is_attached(dev))
+               vtblk_stop(sc);
+       VTBLK_UNLOCK(sc);
+
+       /* NOTE(review): assumes the intr task was queued on this cpu's
+        * thread taskqueue (matches vtblk_vq_intr) — confirm. */
+       taskqueue_drain(taskqueue_thread[mycpuid], &sc->vtblk_intr_task);
+
+       vtblk_drain(sc);
+
+       if (sc->vtblk_sglist != NULL) {
+               sglist_free(sc->vtblk_sglist);
+               sc->vtblk_sglist = NULL;
+       }
+
+       VTBLK_LOCK_DESTROY(sc);
+
+       return (0);
+}
+
+/*
+ * Suspend: set the flag that stops vtblk_startio() from dispatching
+ * new requests; in-flight I/O is not waited for (see TODO).
+ */
+static int
+vtblk_suspend(device_t dev)
+{
+       struct vtblk_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       VTBLK_LOCK(sc);
+       sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
+       /* TODO Wait for any inflight IO to complete? */
+       VTBLK_UNLOCK(sc);
+
+       return (0);
+}
+
+/* Resume: clear the suspend flag so dispatch may proceed again. */
+static int
+vtblk_resume(device_t dev)
+{
+       struct vtblk_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       VTBLK_LOCK(sc);
+       sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
+       /* TODO Resume IO? */
+       VTBLK_UNLOCK(sc);
+
+       return (0);
+}
+
+/* Shutdown: nothing to do; the host owns device state across reboot. */
+static int
+vtblk_shutdown(device_t dev)
+{
+       return (0);
+}
+
+/*
+ * Open: succeed unless the softc is missing or the device is
+ * mid-detach.
+ */
+static int
+vtblk_open(struct dev_open_args *ap)
+{
+       struct vtblk_softc *sc;
+       cdev_t dev = ap->a_head.a_dev;
+       sc = dev->si_drv1;
+       if (sc == NULL)
+               return (ENXIO);
+
+       return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
+}
+
+/*
+ * Kernel crash-dump entry point. Uses trylock because we may already
+ * hold the softc lock when the system panics; on first use the device
+ * is reset into polled dump mode (VTBLK_FLAG_DUMPING). A zero-length
+ * call with NULL buffer and offset 0 signals end-of-dump -> flush.
+ */
+static int
+vtblk_dump(struct dev_dump_args *ap)
+{
+       struct vtblk_softc *sc;
+       int error;
+
+       error = 0;
+
+       cdev_t dev = ap->a_head.a_dev;
+       sc = dev->si_drv1;
+
+       if (sc == NULL)
+               return (ENXIO);
+
+       if (VTBLK_TRYLOCK(sc) == 0) {
+               device_printf(sc->vtblk_dev,
+                   "softc already locked, cannot dump...\n");
+               return (EBUSY);
+       }
+
+       if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
+               vtblk_prepare_dump(sc);
+               sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
+       }
+
+       if (ap->a_length > 0) {
+               error = vtblk_write_dump(sc, ap->a_virtual, ap->a_offset,
+                                        ap->a_length);
+       } else if (ap->a_virtual == NULL && ap->a_offset == 0) {
+               error = vtblk_flush_dump(sc);
+       }
+
+       VTBLK_UNLOCK(sc);
+
+       return (error);
+}
+
+/*
+ * Strategy: queue a bio on the softc bioq and kick the dispatcher.
+ * Rejects writes/flushes on a read-only device and everything once
+ * detach has started. Returns 0 on acceptance; on rejection the bio
+ * is completed with an error via vtblk_bio_error().
+ */
+static int
+vtblk_strategy(struct dev_strategy_args *ap)
+{
+       struct vtblk_softc *sc;
+       cdev_t dev = ap->a_head.a_dev;
+       sc = dev->si_drv1;
+       struct bio *bio = ap->a_bio;
+       struct buf *bp = bio->bio_buf;
+
+       if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
+               KKASSERT(bp->b_count > 0);
+       }
+
+       if (sc == NULL) {
+               vtblk_bio_error(bio, EINVAL);
+               return EINVAL;
+       }
+
+       /*
+        * Fail any write (or flush) if RO. Reads must still be allowed
+        * through; the previous check wrongly rejected BUF_CMD_READ,
+        * which made a read-only disk unreadable.
+        *
+        * Unfortunately, there does not seem to be a better way to
+        * report our readonly'ness upward.
+        */
+       if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
+           (bp->b_cmd == BUF_CMD_WRITE || bp->b_cmd == BUF_CMD_FLUSH)) {
+               vtblk_bio_error(bio, EROFS);
+               return (EINVAL);
+       }
+
+       VTBLK_LOCK(sc);
+       if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
+               devstat_start_transaction(&sc->stats);
+               bioqdisksort(&sc->vtblk_bioq, bio);
+               vtblk_startio(sc);
+       } else {
+               vtblk_bio_error(bio, ENXIO);
+       }
+       VTBLK_UNLOCK(sc);
+       return 0;
+}
+
+/* Negotiate the intersection of VTBLK_FEATURES with the host's offer. */
+static void
+vtblk_negotiate_features(struct vtblk_softc *sc)
+{
+       device_t dev;
+       uint64_t features;
+
+       dev = sc->vtblk_dev;
+       features = VTBLK_FEATURES;
+
+       sc->vtblk_features = virtio_negotiate_features(dev, features);
+}
+
+/*
+ * Compute the per-request segment budget: VTBLK_MIN_SEGMENTS for the
+ * header and status, plus either the host's advertised seg_max
+ * (clamped to what a MAXPHYS transfer can need) or a single data
+ * segment when SEG_MAX was not negotiated.
+ */
+static int
+vtblk_maximum_segments(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
+{
+       device_t dev;
+       int nsegs;
+
+       dev = sc->vtblk_dev;
+       nsegs = VTBLK_MIN_SEGMENTS;
+
+       if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
+               nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
+       } else {
+               nsegs += 1;
+       }
+
+       return (nsegs);
+}
+
+/* Allocate the single request virtqueue, sized for vtblk_max_nsegs. */
+static int
+vtblk_alloc_virtqueue(struct vtblk_softc *sc)
+{
+       device_t dev;
+       struct vq_alloc_info vq_info;
+
+       dev = sc->vtblk_dev;
+
+       VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
+                          vtblk_vq_intr, sc, &sc->vtblk_vq,
+                          "%s request", device_get_nameunit(dev));
+
+       return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
+}
+
+/*
+ * Build disk_info from the virtio config (sector size, capacity,
+ * geometry), register devstat, and attach the disk device. The cdev's
+ * si_drv1 backlink lets the dev_ops entry points recover the softc.
+ */
+static void
+vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
+{
+
+       struct disk_info info;
+
+       /* construct the disk_info */
+       bzero(&info, sizeof(info));
+
+       if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
+               sc->vtblk_sector_size = blkcfg->blk_size;
+       else
+               sc->vtblk_sector_size = DEV_BSIZE;
+
+       info.d_media_blksize = sc->vtblk_sector_size;
+       info.d_media_blocks = blkcfg->capacity;
+
+       /* NOTE(review): geometry fields are only meaningful when the
+        * GEOMETRY feature was negotiated — confirm host behavior. */
+       info.d_ncylinders = blkcfg->geometry.cylinders;
+       info.d_nheads = blkcfg->geometry.heads;
+       info.d_secpertrack = blkcfg->geometry.sectors;
+
+       info.d_secpercyl = info.d_secpertrack * info.d_nheads;
+
+       devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
+                         DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
+                         DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
+                         DEVSTAT_PRIORITY_DISK);
+
+       /* attach a generic disk device to ourselves */
+       sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
+                              &vbd_disk_ops);
+
+       sc->cdev->si_drv1 = sc;
+       disk_setdiskinfo(&sc->vtblk_disk, &info);
+}
+
+/*
+ * Dispatch as many queued requests as fit in the virtqueue. Called
+ * with the softc lock held. Ready requests (previously deferred when
+ * the ring was full) are retried before new bios are pulled off the
+ * bioq. The host is notified once, after all enqueues.
+ */
+static void
+vtblk_startio(struct vtblk_softc *sc)
+{
+       struct virtqueue *vq;
+       struct vtblk_request *req;
+       int enq;
+
+       vq = sc->vtblk_vq;
+       enq = 0;
+
+       VTBLK_LOCK_ASSERT(sc);
+
+       if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
+               return;
+
+       while (!virtqueue_full(vq)) {
+               if ((req = vtblk_dequeue_ready(sc)) == NULL)
+                       req = vtblk_bio_request(sc);
+               if (req == NULL)
+                       break;
+
+               /* Enqueue failed (ring space): park on the ready list. */
+               if (vtblk_execute_request(sc, req) != 0) {
+                       vtblk_enqueue_ready(sc, req);
+                       break;
+               }
+
+               enq++;
+       }
+
+       if (enq > 0)
+               virtqueue_notify(vq, &sc->vtblk_mtx);
+}
+
+/*
+ * Take the next bio off the bioq and wrap it in a free request,
+ * translating the buf command into the virtio request type. Returns
+ * NULL when either the bioq or the free-request pool is empty.
+ */
+static struct vtblk_request *
+vtblk_bio_request(struct vtblk_softc *sc)
+{
+       struct bio_queue_head *bioq;
+       struct vtblk_request *req;
+       struct bio *bio;
+       struct buf *bp;
+
+       bioq = &sc->vtblk_bioq;
+
+       if (bioq_first(bioq) == NULL)
+               return (NULL);
+
+       req = vtblk_dequeue_request(sc);
+       if (req == NULL)
+               return (NULL);
+
+       bio = bioq_takefirst(bioq);
+       req->vbr_bp = bio;
+       /* 0xff: not a valid VIRTIO_BLK_S_* status yet. */
+       req->vbr_ack = -1;
+       req->vbr_hdr.ioprio = 1;
+       bp = bio->bio_buf;
+
+       switch (bp->b_cmd) {
+       case BUF_CMD_FLUSH:
+               req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+               break;
+       case BUF_CMD_READ:
+               req->vbr_hdr.type = VIRTIO_BLK_T_IN;
+               req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
+               break;
+       case BUF_CMD_WRITE:
+               req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+               req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
+               break;
+       default:
+               KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
+               req->vbr_hdr.type = -1;
+               break;
+       }
+
+       if (bp->b_flags & B_ORDERED)
+               req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
+
+       return (req);
+}
+
+/*
+ * Build the scatter/gather list for a request (header, optional data
+ * buffer, status byte) and enqueue it on the virtqueue. The data
+ * buffer of a READ and the status byte are device-writable; the rest
+ * is device-readable. Returns the virtqueue_enqueue() error (nonzero
+ * typically means the ring is out of descriptors).
+ */
+static int
+vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+       struct sglist *sg;
+       struct bio *bio;
+       struct buf *bp;
+       int writable, error;
+
+       sg = sc->vtblk_sglist;
+       bio = req->vbr_bp;
+       bp = bio->bio_buf;
+       writable = 0;
+
+       /*
+        * sglist is live throughout this subroutine.
+        */
+       sglist_reset(sg);
+       
+       error = sglist_append(sg, &req->vbr_hdr,
+                             sizeof(struct virtio_blk_outhdr));
+       KASSERT(error == 0, ("error adding header to sglist"));
+       KASSERT(sg->sg_nseg == 1,
+           ("header spanned multiple segments: %d", sg->sg_nseg));
+
+       if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
+               error = sglist_append(sg, bp->b_data, bp->b_bcount);
+               KASSERT(error == 0, ("error adding buffer to sglist"));
+
+               /* BUF_CMD_READ means the host writes into our buffer. */
+               if (bp->b_cmd == BUF_CMD_READ)
+                       writable += sg->sg_nseg - 1;
+       }
+
+       error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
+       KASSERT(error == 0, ("error adding ack to sglist"));
+       writable++;
+
+       KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
+           ("fewer than min segments: %d", sg->sg_nseg));
+
+       error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
+                                 sg->sg_nseg - writable, writable);
+
+       sglist_reset(sg);
+
+       return (error);
+}
+
+/*
+ * Virtqueue interrupt: mask further interrupts and hand completion
+ * processing to the threaded taskqueue (vtblk_intr_task), which will
+ * re-enable interrupts when it drains the ring.
+ */
+static int
+vtblk_vq_intr(void *xsc)
+{
+       struct vtblk_softc *sc;
+
+       sc = xsc;
+
+       virtqueue_disable_intr(sc->vtblk_vq);
+       taskqueue_enqueue(taskqueue_thread[mycpuid], &sc->vtblk_intr_task);
+
+       return (1);
+}
+
+/*
+ * Taskqueue completion handler: reap finished requests from the
+ * virtqueue, complete their bios, recycle the requests, dispatch more
+ * I/O, then re-enable interrupts — retrying if completions raced in
+ * while interrupts were being re-enabled. The lock/unlock ordering
+ * around biodone() is deliberate; do not reorder.
+ */
+static void
+vtblk_intr_task(void *arg, int pending)
+{
+       struct vtblk_softc *sc;
+       struct vtblk_request *req;
+       struct virtqueue *vq;
+       struct bio *bio;
+       struct buf *bp;
+       
+       sc = arg;
+       vq = sc->vtblk_vq;
+
+retry:
+       VTBLK_LOCK(sc);
+       if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
+               VTBLK_UNLOCK(sc);
+               return;
+       }
+
+       while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
+               bio = req->vbr_bp;
+               bp = bio->bio_buf;
+
+               /* Map the host status byte onto buf error state. */
+               if (req->vbr_ack == VIRTIO_BLK_S_OK)
+                       bp->b_resid = 0;
+               else {
+                       bp->b_flags |= B_ERROR;
+                       if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) {
+                               bp->b_error = ENOTSUP;
+                       } else {
+                               bp->b_error = EIO;
+                       }
+               }
+
+               devstat_end_transaction_buf(&sc->stats, bio->bio_buf);
+
+               VTBLK_UNLOCK(sc);
+               /*
+                * Unlocking the controller around biodone() does not allow
+                * processing further device interrupts; when we queued
+                * vtblk_intr_task, we disabled interrupts. It will allow
+                * concurrent vtblk_strategy/_startio command dispatches.
+                */
+               biodone(bio);
+               VTBLK_LOCK(sc);
+
+               vtblk_enqueue_request(sc, req);
+       }
+
+       vtblk_startio(sc);
+
+       if (virtqueue_enable_intr(vq) != 0) {
+               /* 
+                * If new virtqueue entries appeared immediately after
+                * enabling interrupts, process them now. Release and
+                * retake softcontroller lock to try to avoid blocking
+                * I/O dispatch for too long.
+                */
+               virtqueue_disable_intr(vq);
+               VTBLK_UNLOCK(sc);
+               goto retry;
+       }
+
+       VTBLK_UNLOCK(sc);
+}
+
+/* Quiesce the device: mask the virtqueue interrupt and reset it. */
+static void
+vtblk_stop(struct vtblk_softc *sc)
+{
+       virtqueue_disable_intr(sc->vtblk_vq);
+       virtio_stop(sc->vtblk_dev);
+}
+
+/*
+ * Switch the device into polled dump mode: stop it, discard any
+ * in-flight requests without completing their bios, then reinit with
+ * the previously negotiated features and interrupts masked.
+ */
+static void
+vtblk_prepare_dump(struct vtblk_softc *sc)
+{
+       device_t dev;
+       struct virtqueue *vq;
+
+       dev = sc->vtblk_dev;
+       vq = sc->vtblk_vq;
+
+       vtblk_stop(sc);
+
+       /*
+        * Drain all requests caught in-flight in the virtqueue,
+        * skipping biodone(). When dumping, only one request is
+        * outstanding at a time, and we just poll the virtqueue
+        * for the response.
+        */
+       vtblk_drain_vq(sc, 1);
+
+       if (virtio_reinit(dev, sc->vtblk_features) != 0)
+               panic("cannot reinit VirtIO block device during dump");
+
+       virtqueue_disable_intr(vq);
+       virtio_reinit_complete(dev);
+}
+
+/*
+ * Write one chunk of a kernel core dump at the given byte offset,
+ * polling the virtqueue for completion (interrupts are masked in
+ * dump mode).
+ *
+ * A throwaway bio/buf pair lives on the stack; the previous code
+ * bzero'd `buf` (not the local bio — in the kernel that name is the
+ * global buffer pointer, so this corrupted it) and then dereferenced
+ * the never-initialized bio.bio_buf. Zero both locals and wire
+ * bio.bio_buf to the stack buf explicitly.
+ */
+static int
+vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
+                size_t length)
+{
+       struct bio bio;
+       struct buf buf;
+       struct vtblk_request *req;
+       struct buf *bp;
+
+       req = &sc->vtblk_dump_request;
+       req->vbr_ack = -1;
+       req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+       req->vbr_hdr.ioprio = 1;
+       req->vbr_hdr.sector = offset / DEV_BSIZE;
+
+       bzero(&bio, sizeof(struct bio));
+       bzero(&buf, sizeof(struct buf));
+       bio.bio_buf = &buf;
+       req->vbr_bp = &bio;
+       bp = bio.bio_buf;
+
+       bp->b_cmd = BUF_CMD_WRITE;
+       bp->b_data = virtual;
+       bp->b_bcount = length;
+
+       return (vtblk_poll_request(sc, req));
+}
+
+/*
+ * Issue a polled FLUSH at the end of a kernel core dump.
+ *
+ * Same fix as vtblk_write_dump(): use a stack bio/buf pair, zero
+ * both, and wire bio.bio_buf — the previous code bzero'd the wrong
+ * object (`buf`) and dereferenced the uninitialized bio.bio_buf.
+ */
+static int
+vtblk_flush_dump(struct vtblk_softc *sc)
+{
+       struct bio bio;
+       struct buf buf;
+       struct vtblk_request *req;
+       struct buf *bp;
+
+       req = &sc->vtblk_dump_request;
+       req->vbr_ack = -1;
+       req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+       req->vbr_hdr.ioprio = 1;
+       req->vbr_hdr.sector = 0;
+
+       bzero(&bio, sizeof(struct bio));
+       bzero(&buf, sizeof(struct buf));
+       bio.bio_buf = &buf;
+       req->vbr_bp = &bio;
+       bp = bio.bio_buf;
+
+       bp->b_cmd = BUF_CMD_FLUSH;
+
+       return (vtblk_poll_request(sc, req));
+}
+
+/*
+ * Synchronously execute one request: enqueue, notify, and busy-poll
+ * the virtqueue until it completes. Only used on the dump path, where
+ * the queue must be otherwise empty. Returns 0 or an errno derived
+ * from the host status byte.
+ */
+static int
+vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+       device_t dev;
+       struct virtqueue *vq;
+       struct vtblk_request *r;
+       int error;
+
+       dev = sc->vtblk_dev;
+       vq = sc->vtblk_vq;
+
+       if (!virtqueue_empty(vq))
+               return (EBUSY);
+
+       error = vtblk_execute_request(sc, req);
+       if (error)
+               return (error);
+
+       virtqueue_notify(vq, &sc->vtblk_mtx);
+
+       r = virtqueue_poll(vq, NULL);
+       KASSERT(r == req, ("unexpected request response"));
+
+       if (req->vbr_ack != VIRTIO_BLK_S_OK) {
+               error = req->vbr_ack == VIRTIO_BLK_S_UNSUPP ? ENOTSUP : EIO;
+               if (bootverbose)
+                       device_printf(dev,
+                           "vtblk_poll_request: IO error: %d\n", error);
+       }
+
+       return (error);
+}
+
+/*
+ * Reclaim every request still sitting in the virtqueue and return it
+ * to the free list. When skip_done is nonzero (the dump path), the
+ * associated bios are not completed; otherwise they are failed with
+ * ENXIO via vtblk_bio_error().
+ */
+static void
+vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
+{
+       struct virtqueue *vq;
+       struct vtblk_request *req;
+       int last;
+
+       vq = sc->vtblk_vq;
+       last = 0;
+
+       while ((req = virtqueue_drain(vq, &last)) != NULL) {
+               if (!skip_done)
+                       vtblk_bio_error(req->vbr_bp, ENXIO);
+
+               vtblk_enqueue_request(sc, req);
+       }
+
+       KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
+}
+
+/*
+ * Fail and reclaim everything outstanding: requests in the virtqueue,
+ * requests on the ready list, and bios still queued on the bioq, then
+ * free the preallocated request pool. Used on detach/teardown.
+ */
+static void
+vtblk_drain(struct vtblk_softc *sc)
+{
+       struct bio_queue_head *bioq;
+       struct vtblk_request *req;
+       struct bio *bp;
+
+       bioq = &sc->vtblk_bioq;
+
+       if (sc->vtblk_vq != NULL)
+               vtblk_drain_vq(sc, 0);
+
+       /* Ready requests never reached the virtqueue; fail their bios. */
+       while ((req = vtblk_dequeue_ready(sc)) != NULL) {
+               vtblk_bio_error(req->vbr_bp, ENXIO);
+               vtblk_enqueue_request(sc, req);
+       }
+
+       /* Fail bios that were never assigned a request. */
+       while (bioq_first(bioq) != NULL) {
+               bp = bioq_takefirst(bioq);
+               vtblk_bio_error(bp, ENXIO);
+       }
+
+       vtblk_free_requests(sc);
+}
+
+/*
+ * Preallocate the request pool sized to keep the virtqueue full.
+ * Returns 0 (kmalloc with M_WAITOK cannot fail).
+ */
+static int
+vtblk_alloc_requests(struct vtblk_softc *sc)
+{
+       struct vtblk_request *req;
+       int i, nreqs;
+
+       nreqs = virtqueue_size(sc->vtblk_vq);
+
+       /*
+        * Preallocate sufficient requests to keep the virtqueue full.
+        * Each request consumes VTBLK_MIN_SEGMENTS or more descriptors
+        * (indirect descriptors are not supported by this driver), so
+        * divide the queue size accordingly.
+        */
+       nreqs /= VTBLK_MIN_SEGMENTS;
+
+       for (i = 0; i < nreqs; i++) {
+               req = kmalloc(sizeof(struct vtblk_request), M_DEVBUF, M_WAITOK);
+
+               sc->vtblk_request_count++;
+               vtblk_enqueue_request(sc, req);
+       }
+
+       return (0);
+}
+
+/*
+ * Free the entire request pool; asserts that every allocated request
+ * has been returned to the free list first.
+ */
+static void
+vtblk_free_requests(struct vtblk_softc *sc)
+{
+       struct vtblk_request *req;
+
+       while ((req = vtblk_dequeue_request(sc)) != NULL) {
+               sc->vtblk_request_count--;
+               kfree(req, M_DEVBUF);
+       }
+
+       KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
+}
+
+/*
+ * Pop a request from the free list; returns NULL if the pool is
+ * exhausted. Caller must hold whatever lock protects the softc lists.
+ */
+static struct vtblk_request *
+vtblk_dequeue_request(struct vtblk_softc *sc)
+{
+       struct vtblk_request *req;
+
+       req = TAILQ_FIRST(&sc->vtblk_req_free);
+       if (req != NULL)
+               TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
+
+       return (req);
+}
+
+/*
+ * Return a request to the free list; the request is cleared so stale
+ * header/ack fields cannot leak into the next I/O.
+ */
+static void
+vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+       bzero(req, sizeof(struct vtblk_request));
+       TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
+}
+
+/*
+ * Pop a request from the ready list (built but not yet submitted to
+ * the virtqueue); returns NULL if empty.
+ */
+static struct vtblk_request *
+vtblk_dequeue_ready(struct vtblk_softc *sc)
+{
+       struct vtblk_request *req;
+
+       req = TAILQ_FIRST(&sc->vtblk_req_ready);
+       if (req != NULL)
+               TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
+
+       return (req);
+}
+
+/*
+ * Push a built-but-unsubmitted request onto the ready list for later
+ * resubmission (e.g. when the virtqueue was temporarily full).
+ */
+static void
+vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+       TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
+}
+
+/*
+ * Complete a bio with an error. The previous version ignored the
+ * `error` argument entirely and called biodone() directly, so drained
+ * or failed I/O completed as if it had succeeded; record the error on
+ * the underlying buf before completing.
+ */
+static void
+vtblk_bio_error(struct bio *bp, int error)
+{
+       struct buf *bbuf;
+
+       bbuf = bp->bio_buf;
+       bbuf->b_error = error;
+       bbuf->b_flags |= B_ERROR;
+
+       biodone(bp);
+}
diff --git a/sys/dev/virtual/virtio/block/virtio_blk.h b/sys/dev/virtual/virtio/block/virtio_blk.h
new file mode 100644 (file)
index 0000000..4e05e9e
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ *  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/block/virtio_blk.h,v 1.2 2011/12/06 06:28:32 grehan Exp $
+ */
+
+#ifndef _VIRTIO_BLK_H
+#define _VIRTIO_BLK_H
+
+#include <sys/types.h>
+
+/* Feature bits */
+#define VIRTIO_BLK_F_BARRIER   0x0001  /* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX  0x0002  /* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX   0x0004  /* Indicates maximum # of segments */
+#define VIRTIO_BLK_F_GEOMETRY  0x0010  /* Legacy geometry available  */
+#define VIRTIO_BLK_F_RO                0x0020  /* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE  0x0040  /* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI      0x0080  /* Supports scsi command passthru */
+#define VIRTIO_BLK_F_FLUSH     0x0200  /* Cache flush command support */
+#define VIRTIO_BLK_F_TOPOLOGY  0x0400  /* Topology information is available */
+
+#define VIRTIO_BLK_ID_BYTES    20      /* ID string length */
+
+/*
+ * Device configuration space layout, read via the VirtIO config
+ * access methods. Fields beyond `capacity` are valid only when the
+ * corresponding feature bit was negotiated.
+ */
+struct virtio_blk_config {
+       /* The capacity (in 512-byte sectors). */
+       uint64_t capacity;
+       /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */
+       uint32_t size_max;
+       /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
+       uint32_t seg_max;
+       /* Geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */
+       struct virtio_blk_geometry {
+               uint16_t cylinders;
+               uint8_t heads;
+               uint8_t sectors;
+       } geometry;
+
+       /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
+       uint32_t blk_size;
+} __packed;
+
+/*
+ * Command types
+ *
+ * Usage is a bit tricky as some bits are used as flags and some are not.
+ *
+ * Rules:
+ *   VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or
+ *   VIRTIO_BLK_T_BARRIER.  VIRTIO_BLK_T_FLUSH is a command of its own
+ *   and may not be combined with any of the other flags.
+ */
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN                0
+#define VIRTIO_BLK_T_OUT       1
+
+/* This bit says it's a scsi command, not an actual read or write. */
+#define VIRTIO_BLK_T_SCSI_CMD  2
+
+/* Cache flush command */
+#define VIRTIO_BLK_T_FLUSH     4
+
+/* Get device ID command */
+#define VIRTIO_BLK_T_GET_ID    8
+
+/* Barrier before this op. */
+#define VIRTIO_BLK_T_BARRIER   0x80000000
+
+/* VIRTIO_BLK_ID_BYTES (ID string length, 20) is defined above. */
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr {
+       /* VIRTIO_BLK_T* */
+       uint32_t type;
+       /* io priority. */
+       uint32_t ioprio;
+       /* Sector (ie. 512 byte offset) */
+       uint64_t sector;
+};
+
+struct virtio_scsi_inhdr {
+       uint32_t errors;
+       uint32_t data_len;
+       uint32_t sense_len;
+       uint32_t residual;
+};
+
+/* And this is the final byte of the write scatter-gather list. */
+#define VIRTIO_BLK_S_OK                0
+#define VIRTIO_BLK_S_IOERR     1
+#define VIRTIO_BLK_S_UNSUPP    2
+
+#endif /* _VIRTIO_BLK_H */
diff --git a/sys/dev/virtual/virtio/pci/Makefile b/sys/dev/virtual/virtio/pci/Makefile
new file mode 100644 (file)
index 0000000..5e7f21a
--- /dev/null
@@ -0,0 +1,40 @@
+#
+# $FreeBSD$
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+.PATH: ${.CURDIR}
+
+
+CFLAGS+=       -I${.CURDIR}/..
+
+KMOD=  virtio_pci
+SRCS=  virtio_pci.c
+SRCS+= virtio_bus_if.h virtio_if.h
+SRCS+= bus_if.h device_if.h pci_if.h
+
+MFILES=        kern/bus_if.m kern/device_if.m bus/pci/pci_if.m
+MFILES+=dev/virtual/virtio/virtio/virtio_bus_if.m 
+MFILES+=dev/virtual/virtio/virtio/virtio_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/dev/virtual/virtio/pci/virtio_bus_if.h b/sys/dev/virtual/virtio/pci/virtio_bus_if.h
new file mode 100644 (file)
index 0000000..987e77e
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from source file
+ *   @/dev/virtio/virtio_bus_if.m
+ * with
+ *   makeobjops.awk
+ *
+ * See the source file for legal information
+ */
+
+#ifndef _virtio_bus_if_h_
+#define _virtio_bus_if_h_
+
+
+struct vq_alloc_info;
+
+extern struct kobjop_desc virtio_bus_negotiate_features_desc;
+typedef uint64_t virtio_bus_negotiate_features_t(device_t dev,
+                                                 uint64_t child_features);
+static __inline uint64_t VIRTIO_BUS_NEGOTIATE_FEATURES(device_t dev,
+                                                       uint64_t child_features)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_negotiate_features);
+       return ((virtio_bus_negotiate_features_t *) _m)(dev, child_features);
+}
+
+extern struct kobjop_desc virtio_bus_with_feature_desc;
+typedef int virtio_bus_with_feature_t(device_t dev, uint64_t feature);
+static __inline int VIRTIO_BUS_WITH_FEATURE(device_t dev, uint64_t feature)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_with_feature);
+       return ((virtio_bus_with_feature_t *) _m)(dev, feature);
+}
+
+extern struct kobjop_desc virtio_bus_alloc_virtqueues_desc;
+typedef int virtio_bus_alloc_virtqueues_t(device_t dev, int flags, int nvqs,
+                                          struct vq_alloc_info *info);
+static __inline int VIRTIO_BUS_ALLOC_VIRTQUEUES(device_t dev, int flags,
+                                                int nvqs,
+                                                struct vq_alloc_info *info)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_alloc_virtqueues);
+       return ((virtio_bus_alloc_virtqueues_t *) _m)(dev, flags, nvqs, info);
+}
+
+
+#define VIRTIO_ALLOC_VQS_DISABLE_MSIX 0x1
+
+extern struct kobjop_desc virtio_bus_setup_intr_desc;
+typedef int virtio_bus_setup_intr_t(device_t dev);
+static __inline int VIRTIO_BUS_SETUP_INTR(device_t dev)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_setup_intr);
+       return ((virtio_bus_setup_intr_t *) _m)(dev);
+}
+
+extern struct kobjop_desc virtio_bus_stop_desc;
+typedef void virtio_bus_stop_t(device_t dev);
+static __inline void VIRTIO_BUS_STOP(device_t dev)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_stop);
+       ((virtio_bus_stop_t *) _m)(dev);
+}
+
+extern struct kobjop_desc virtio_bus_reinit_desc;
+typedef int virtio_bus_reinit_t(device_t dev, uint64_t features);
+static __inline int VIRTIO_BUS_REINIT(device_t dev, uint64_t features)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_reinit);
+       return ((virtio_bus_reinit_t *) _m)(dev, features);
+}
+
+extern struct kobjop_desc virtio_bus_reinit_complete_desc;
+typedef void virtio_bus_reinit_complete_t(device_t dev);
+static __inline void VIRTIO_BUS_REINIT_COMPLETE(device_t dev)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_reinit_complete);
+       ((virtio_bus_reinit_complete_t *) _m)(dev);
+}
+
+extern struct kobjop_desc virtio_bus_notify_vq_desc;
+typedef void virtio_bus_notify_vq_t(device_t dev, uint16_t queue);
+static __inline void VIRTIO_BUS_NOTIFY_VQ(device_t dev, uint16_t queue)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_notify_vq);
+       ((virtio_bus_notify_vq_t *) _m)(dev, queue);
+}
+
+extern struct kobjop_desc virtio_bus_read_device_config_desc;
+typedef void virtio_bus_read_device_config_t(device_t dev, bus_size_t offset,
+                                             void *dst, int len);
+static __inline void VIRTIO_BUS_READ_DEVICE_CONFIG(device_t dev,
+                                                   bus_size_t offset, void *dst,
+                                                   int len)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_read_device_config);
+       ((virtio_bus_read_device_config_t *) _m)(dev, offset, dst, len);
+}
+
+extern struct kobjop_desc virtio_bus_write_device_config_desc;
+typedef void virtio_bus_write_device_config_t(device_t dev, bus_size_t offset,
+                                              void *src, int len);
+static __inline void VIRTIO_BUS_WRITE_DEVICE_CONFIG(device_t dev,
+                                                    bus_size_t offset,
+                                                    void *src, int len)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_bus_write_device_config);
+       ((virtio_bus_write_device_config_t *) _m)(dev, offset, src, len);
+}
+
+#endif /* _virtio_bus_if_h_ */
diff --git a/sys/dev/virtual/virtio/pci/virtio_if.h b/sys/dev/virtual/virtio/pci/virtio_if.h
new file mode 100644 (file)
index 0000000..eb343a9
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from source file
+ *   @/dev/virtio/virtio_if.m
+ * with
+ *   makeobjops.awk
+ *
+ * See the source file for legal information
+ */
+
+#ifndef _virtio_if_h_
+#define _virtio_if_h_
+
+extern struct kobjop_desc virtio_config_change_desc;
+typedef int virtio_config_change_t(device_t dev);
+static __inline int VIRTIO_CONFIG_CHANGE(device_t dev)
+{
+       kobjop_t _m;
+       KOBJOPLOOKUP(((kobj_t)dev)->ops, virtio_config_change);
+       return ((virtio_config_change_t *) _m)(dev);
+}
+
+#endif /* _virtio_if_h_ */
diff --git a/sys/dev/virtual/virtio/pci/virtio_pci.c b/sys/dev/virtual/virtio/pci/virtio_pci.c
new file mode 100644 (file)
index 0000000..59baca0
--- /dev/null
@@ -0,0 +1,1092 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/pci/virtio_pci.c,v 1.3 2012/04/14 05:48:04 grehan Exp $
+ */
+
+/* Driver for the VirtIO PCI interface. */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+
+#include <bus/pci/pcivar.h>
+#include <bus/pci/pcireg.h>
+
+#include <sys/bus.h>
+#include <sys/param.h>
+#include <sys/rman.h>
+
+
+#include <virtio/virtio.h>
+#include <virtio/virtqueue.h>
+#include "virtio_pci.h"
+
+#include "virtio_bus_if.h"
+#include "virtio_if.h"
+
+struct vtpci_softc {
+       device_t                         vtpci_dev;
+       struct resource                 *vtpci_res;
+       struct resource                 *vtpci_msix_res;
+       uint64_t                         vtpci_features;
+       uint32_t                         vtpci_flags;
+       int                              vtpci_irq_type;
+       int                              vtpci_irq_rid;
+#define VIRTIO_PCI_FLAG_NO_MSI          0x0001
+#define VIRTIO_PCI_FLAG_MSI             0x0002
+#define VIRTIO_PCI_FLAG_NO_MSIX                 0x0010
+#define VIRTIO_PCI_FLAG_MSIX            0x0020
+#define VIRTIO_PCI_FLAG_SHARED_MSIX     0x0040
+
+       device_t                         vtpci_child_dev;
+       struct virtio_feature_desc      *vtpci_child_feat_desc;
+
+       /*
+        * Ideally, each virtqueue that the driver provides a callback for
+        * will receive its own MSIX vector. If there are not sufficient
+        * vectors available, we will then attempt to have all the VQs
+        * share one vector. Note that when using MSIX, the configuration
+        * changed notifications must be on their own vector.
+        *
+        * If MSIX is not available, we will attempt to have the whole
+        * device share one MSI vector, and then, finally, one legacy
+        * interrupt.
+        */
+       int                              vtpci_nvqs;
+       struct vtpci_virtqueue {
+               struct virtqueue *vq;
+
+               /* Index into vtpci_intr_res[] below. Unused, then -1. */
+               int               ires_idx;
+       } vtpci_vqx[VIRTIO_MAX_VIRTQUEUES];
+
+       /*
+        * When using MSIX interrupts, the first element of vtpci_intr_res[]
+        * is always the configuration changed notifications. The remaining
+        * element(s) are used for the virtqueues.
+        *
+        * With MSI and legacy interrupts, only the first element of
+        * vtpci_intr_res[] is used.
+        */
+       int                              vtpci_nintr_res;
+       struct vtpci_intr_resource {
+               struct resource *irq;
+               int              rid;
+               void            *intrhand;
+       } vtpci_intr_res[1 + VIRTIO_MAX_VIRTQUEUES];
+};
+
+static int     vtpci_probe(device_t);
+static int     vtpci_attach(device_t);
+static int     vtpci_detach(device_t);
+static int     vtpci_suspend(device_t);
+static int     vtpci_resume(device_t);
+static int     vtpci_shutdown(device_t);
+static void    vtpci_driver_added(device_t, driver_t *);
+static void    vtpci_child_detached(device_t, device_t);
+static int     vtpci_read_ivar(device_t, device_t, int, uintptr_t *);
+static int     vtpci_write_ivar(device_t, device_t, int, uintptr_t);
+
+static uint64_t        vtpci_negotiate_features(device_t, uint64_t);
+static int     vtpci_with_feature(device_t, uint64_t);
+static int     vtpci_alloc_virtqueues(device_t, int, int,
+                   struct vq_alloc_info *);
+static int     vtpci_setup_intr(device_t);
+static void    vtpci_stop(device_t);
+static int     vtpci_reinit(device_t, uint64_t);
+static void    vtpci_reinit_complete(device_t);
+static void    vtpci_notify_virtqueue(device_t, uint16_t);
+static uint8_t vtpci_get_status(device_t);
+static void    vtpci_set_status(device_t, uint8_t);
+static void    vtpci_read_dev_config(device_t, bus_size_t, void *, int);
+static void    vtpci_write_dev_config(device_t, bus_size_t, void *, int);
+
+static void    vtpci_describe_features(struct vtpci_softc *, const char *,
+                   uint64_t);
+static void    vtpci_probe_and_attach_child(struct vtpci_softc *);
+
+static int     vtpci_alloc_interrupts(struct vtpci_softc *, int, int,
+                   struct vq_alloc_info *);
+static int     vtpci_alloc_intr_resources(struct vtpci_softc *, int,
+                   struct vq_alloc_info *);
+static int     vtpci_alloc_msi(struct vtpci_softc *);
+static int     vtpci_alloc_msix(struct vtpci_softc *, int);
+static int     vtpci_register_msix_vector(struct vtpci_softc *, int, int);
+
+static void    vtpci_free_interrupts(struct vtpci_softc *);
+static void    vtpci_free_virtqueues(struct vtpci_softc *);
+static void    vtpci_release_child_resources(struct vtpci_softc *);
+static void    vtpci_reset(struct vtpci_softc *);
+
+static int     vtpci_legacy_intr(void *);
+static int     vtpci_vq_shared_intr(void *);
+static int     vtpci_vq_intr(void *);
+static int     vtpci_config_intr(void *);
+
+/*
+ * I/O port read/write wrappers.
+ */
+#define vtpci_read_config_1(sc, o)     bus_read_1((sc)->vtpci_res, (o))
+#define vtpci_read_config_2(sc, o)     bus_read_2((sc)->vtpci_res, (o))
+#define vtpci_read_config_4(sc, o)     bus_read_4((sc)->vtpci_res, (o))
+#define vtpci_write_config_1(sc, o, v) bus_write_1((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_2(sc, o, v) bus_write_2((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_4(sc, o, v) bus_write_4((sc)->vtpci_res, (o), (v))
+
+/* Tunables. */
+static int vtpci_disable_msix = 0;
+TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix);
+
+static device_method_t vtpci_methods[] = {
+       /* Device interface. */
+       DEVMETHOD(device_probe,                   vtpci_probe),
+       DEVMETHOD(device_attach,                  vtpci_attach),
+       DEVMETHOD(device_detach,                  vtpci_detach),
+       DEVMETHOD(device_suspend,                 vtpci_suspend),
+       DEVMETHOD(device_resume,                  vtpci_resume),
+       DEVMETHOD(device_shutdown,                vtpci_shutdown),
+
+       /* Bus interface. */
+       DEVMETHOD(bus_driver_added,               vtpci_driver_added),
+       DEVMETHOD(bus_child_detached,             vtpci_child_detached),
+       DEVMETHOD(bus_read_ivar,                  vtpci_read_ivar),
+       DEVMETHOD(bus_write_ivar,                 vtpci_write_ivar),
+
+       /* VirtIO bus interface. */
+       DEVMETHOD(virtio_bus_negotiate_features,  vtpci_negotiate_features),
+       DEVMETHOD(virtio_bus_with_feature,        vtpci_with_feature),
+       DEVMETHOD(virtio_bus_alloc_virtqueues,    vtpci_alloc_virtqueues),
+       DEVMETHOD(virtio_bus_setup_intr,          vtpci_setup_intr),
+       DEVMETHOD(virtio_bus_stop,                vtpci_stop),
+       DEVMETHOD(virtio_bus_reinit,              vtpci_reinit),
+       DEVMETHOD(virtio_bus_reinit_complete,     vtpci_reinit_complete),
+       DEVMETHOD(virtio_bus_notify_vq,           vtpci_notify_virtqueue),
+       DEVMETHOD(virtio_bus_read_device_config,  vtpci_read_dev_config),
+       DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config),
+
+       { 0, 0 }
+};
+
+static driver_t vtpci_driver = {
+       "virtio_pci",
+       vtpci_methods,
+       sizeof(struct vtpci_softc)
+};
+
+devclass_t vtpci_devclass;
+
+DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0);
+MODULE_VERSION(virtio_pci, 1);
+MODULE_DEPEND(virtio_pci, pci, 1, 1, 1);
+MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1);
+
+/*
+ * Match any PCI device in the VirtIO vendor/device-id range with the
+ * supported ABI revision, and set a description naming the device
+ * type carried in the PCI subdevice id.
+ */
+static int
+vtpci_probe(device_t dev)
+{
+       char desc[36];
+       const char *name;
+
+       if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID)
+               return (ENXIO);
+
+       if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN ||
+           pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX)
+               return (ENXIO);
+
+       /* Only the legacy ABI revision is supported. */
+       if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION)
+               return (ENXIO);
+
+       name = virtio_device_name(pci_get_subdevice(dev));
+       if (name == NULL)
+               name = "Unknown";
+
+       ksnprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name);
+       device_set_desc_copy(dev, desc);
+
+       return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Map the legacy I/O-port BAR, note MSI/MSI-X availability, reset the
+ * device, acknowledge it to the host, and create the child device
+ * (e.g. virtio_blk) that drives the actual VirtIO device type.
+ */
+static int
+vtpci_attach(device_t dev)
+{
+       struct vtpci_softc *sc;
+       device_t child;
+       int rid;
+
+       sc = device_get_softc(dev);
+       sc->vtpci_dev = dev;
+
+       pci_enable_busmaster(dev);
+
+       /* All legacy VirtIO registers live behind BAR 0 (I/O space). */
+       rid = PCIR_BAR(0);
+       sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
+           RF_ACTIVE);
+       if (sc->vtpci_res == NULL) {
+               device_printf(dev, "cannot map I/O space\n");
+               return (ENXIO);
+       }
+
+       if (pci_find_extcap(dev, PCIY_MSI, NULL) != 0)
+               sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSI;
+       /* XXX(vsrinivas): Check out how to get MSI-X */
+#if OLD_MSI
+       if (pci_find_extcap(dev, PCIY_MSIX, NULL) == 0) {
+               rid = PCIR_BAR(1);
+               sc->vtpci_msix_res = bus_alloc_resource_any(dev,
+                   SYS_RES_MEMORY, &rid, RF_ACTIVE);
+       }
+#endif
+       /* MSI-X is compiled out above, so this branch is always taken. */
+       if (sc->vtpci_msix_res == NULL)
+               sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSIX;
+
+       vtpci_reset(sc);
+
+       /* Tell the host we've noticed this device. */
+       vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+       if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+               device_printf(dev, "cannot create child device\n");
+               vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+               vtpci_detach(dev);
+               return (ENOMEM);
+       }
+
+       sc->vtpci_child_dev = child;
+       vtpci_probe_and_attach_child(sc);
+
+       return (0);
+}
+
+/*
+ * Tear down: delete the child device (failing if it refuses), reset
+ * the VirtIO device, and release the BAR resources. Also used as the
+ * error-unwind path from vtpci_attach().
+ */
+static int
+vtpci_detach(device_t dev)
+{
+       struct vtpci_softc *sc;
+       device_t child;
+       int error;
+
+       sc = device_get_softc(dev);
+
+       if ((child = sc->vtpci_child_dev) != NULL) {
+               error = device_delete_child(dev, child);
+               if (error)
+                       return (error);
+               sc->vtpci_child_dev = NULL;
+       }
+
+       vtpci_reset(sc);
+
+       if (sc->vtpci_msix_res != NULL) {
+               bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1),
+                   sc->vtpci_msix_res);
+               sc->vtpci_msix_res = NULL;
+       }
+
+       if (sc->vtpci_res != NULL) {
+               bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0),
+                   sc->vtpci_res);
+               sc->vtpci_res = NULL;
+       }
+
+       return (0);
+}
+
+/* Suspend: nothing device-specific; defer to the generic bus method. */
+static int
+vtpci_suspend(device_t dev)
+{
+
+       return (bus_generic_suspend(dev));
+}
+
+/* Resume: nothing device-specific; defer to the generic bus method. */
+static int
+vtpci_resume(device_t dev)
+{
+
+       return (bus_generic_resume(dev));
+}
+
+/*
+ * Shutdown: let children shut down first, then force the host device
+ * to stop so it cannot DMA after the OS is gone.
+ */
+static int
+vtpci_shutdown(device_t dev)
+{
+
+       (void) bus_generic_shutdown(dev);
+       /* Forcibly stop the host device. */
+       vtpci_stop(dev);
+
+       return (0);
+}
+
+/*
+ * A new driver was loaded; retry probing/attaching our child in case
+ * the new driver matches it.
+ */
+static void
+vtpci_driver_added(device_t dev, driver_t *driver)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       vtpci_probe_and_attach_child(sc);
+}
+
+/*
+ * The child driver detached: reset the device and reclaim the
+ * interrupt/virtqueue resources that were allocated on its behalf.
+ */
+static void
+vtpci_child_detached(device_t dev, device_t child)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       vtpci_reset(sc);
+       vtpci_release_child_resources(sc);
+}
+
+/*
+ * Instance-variable read accessor for the child: currently only
+ * VIRTIO_IVAR_DEVTYPE, which is the PCI subdevice id naming the
+ * VirtIO device type.
+ */
+static int
+vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       if (sc->vtpci_child_dev != child)
+               return (ENOENT);
+
+       switch (index) {
+       case VIRTIO_IVAR_DEVTYPE:
+               *result = pci_get_subdevice(dev);
+               break;
+       default:
+               return (ENOENT);
+       }
+
+       return (0);
+}
+
+/*
+ * Instance-variable write accessor for the child: currently only
+ * VIRTIO_IVAR_FEATURE_DESC, the child's feature-description table
+ * used when pretty-printing negotiated features.
+ */
+static int
+vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       if (sc->vtpci_child_dev != child)
+               return (ENOENT);
+
+       switch (index) {
+       case VIRTIO_IVAR_FEATURE_DESC:
+               sc->vtpci_child_feat_desc = (void *) value;
+               break;
+       default:
+               return (ENOENT);
+       }
+
+       return (0);
+}
+
+/*
+ * Negotiate the feature set: intersect what the host offers with what
+ * the child driver requested, filter through the virtqueue layer,
+ * record the result, and write it back to the guest-features register.
+ * Returns the negotiated feature mask.
+ */
+static uint64_t
+vtpci_negotiate_features(device_t dev, uint64_t child_features)
+{
+       struct vtpci_softc *sc;
+       uint64_t host_features, features;
+
+       sc = device_get_softc(dev);
+
+       host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES);
+       vtpci_describe_features(sc, "host", host_features);
+
+       /*
+        * Limit negotiated features to what the driver, virtqueue, and
+        * host all support.
+        */
+       features = host_features & child_features;
+       features = virtqueue_filter_features(features);
+       sc->vtpci_features = features;
+
+       vtpci_describe_features(sc, "negotiated", features);
+       vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features);
+
+       return (features);
+}
+
+/* Return nonzero iff `feature` was negotiated for this device. */
+static int
+vtpci_with_feature(device_t dev, uint64_t feature)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       return ((sc->vtpci_features & feature) != 0);
+}
+
+/*
+ * Allocate `nvqs` virtqueues per the child's vq_alloc_info array:
+ * set up interrupts, optionally bind MSI-X vectors, then for each
+ * queue read its size from the host, allocate the ring, and program
+ * its page frame number. May only be called once per device.
+ *
+ * NOTE(review): on a mid-loop failure, interrupts and already
+ * allocated virtqueues are not freed here -- presumably the child's
+ * detach path releases them via vtpci_release_child_resources();
+ * confirm against callers.
+ */
+static int
+vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs,
+    struct vq_alloc_info *vq_info)
+{
+       struct vtpci_softc *sc;
+       struct vtpci_virtqueue *vqx;
+       struct vq_alloc_info *info;
+       int queue, error;
+       uint16_t vq_size;
+
+       sc = device_get_softc(dev);
+
+       if (sc->vtpci_nvqs != 0 || nvqs <= 0 ||
+           nvqs > VIRTIO_MAX_VIRTQUEUES)
+               return (EINVAL);
+
+       error = vtpci_alloc_interrupts(sc, flags, nvqs, vq_info);
+       if (error) {
+               device_printf(dev, "cannot allocate interrupts\n");
+               return (error);
+       }
+
+       /* With MSI-X, vector 0 carries configuration-change events. */
+       if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+               error = vtpci_register_msix_vector(sc,
+                   VIRTIO_MSI_CONFIG_VECTOR, 0);
+               if (error)
+                       return (error);
+       }
+
+       for (queue = 0; queue < nvqs; queue++) {
+               vqx = &sc->vtpci_vqx[queue];
+               info = &vq_info[queue];
+
+               /* Select the queue, then read its host-decided size. */
+               vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+               vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+               error = virtqueue_alloc(dev, queue, vq_size,
+                   VIRTIO_PCI_VRING_ALIGN, 0xFFFFFFFFUL, info, &vqx->vq);
+               if (error)
+                       return (error);
+
+               if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+                       error = vtpci_register_msix_vector(sc,
+                           VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+                       if (error)
+                               return (error);
+               }
+
+               /* Hand the ring's physical address to the host. */
+               vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+                   virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+
+               *info->vqai_vq = vqx->vq;
+               sc->vtpci_nvqs++;
+       }
+
+       return (0);
+}
+
+/*
+ * VIRTIO_BUS_SETUP_INTR method: wire up the interrupt handlers for the
+ * resources allocated earlier.  Legacy/MSI mode uses one handler that
+ * demultiplexes everything; MSIX mode uses a config-change handler plus
+ * either a single shared queue handler or one handler per virtqueue.
+ */
+static int
+vtpci_setup_intr(device_t dev)
+{
+       struct vtpci_softc *sc = device_get_softc(dev);
+       struct vtpci_intr_resource *ires = &sc->vtpci_intr_res[0];
+       struct vtpci_virtqueue *vqx;
+       int i, error;
+       int flags = INTR_MPSAFE;
+
+       /* Without MSIX, a single handler services the whole device. */
+       if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) == 0) {
+               return (bus_setup_intr(dev, ires->irq, flags,
+                   (driver_intr_t *)vtpci_legacy_intr, sc,
+                   &ires->intrhand, NULL));
+       }
+
+       /* MSIX: resource 0 carries configuration-change interrupts. */
+       error = bus_setup_intr(dev, ires->irq, flags,
+           (driver_intr_t *)vtpci_config_intr, sc, &ires->intrhand, NULL);
+       if (error)
+               return (error);
+
+       /* Shared MSIX: all virtqueues share interrupt resource 1. */
+       if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) {
+               ires = &sc->vtpci_intr_res[1];
+               return (bus_setup_intr(dev, ires->irq, flags,
+                   (driver_intr_t *)vtpci_vq_shared_intr, sc,
+                   &ires->intrhand, NULL));
+       }
+
+       /* Setup an interrupt handler for each virtqueue. */
+       for (i = 0; i < sc->vtpci_nvqs; i++) {
+               vqx = &sc->vtpci_vqx[i];
+               if (vqx->ires_idx < 1)
+                       continue;
+
+               ires = &sc->vtpci_intr_res[vqx->ires_idx];
+               error = bus_setup_intr(dev, ires->irq, flags,
+                   (driver_intr_t *)vtpci_vq_intr, vqx->vq,
+                   &ires->intrhand, NULL);
+               if (error)
+                       return (error);
+       }
+
+       return (0);
+}
+
+/*
+ * VIRTIO_BUS_STOP method: reset the device back to its uninitialized
+ * state.
+ */
+static void
+vtpci_stop(device_t dev)
+{
+
+       vtpci_reset(device_get_softc(dev));
+}
+
+/*
+ * VIRTIO_BUS_REINIT method: re-run the device initialization sequence
+ * (reset, ACK, DRIVER, feature negotiation, virtqueue reprogramming)
+ * using the virtqueues allocated earlier.  The device does not become
+ * usable until vtpci_reinit_complete() sets DRIVER_OK.
+ */
+static int
+vtpci_reinit(device_t dev, uint64_t features)
+{
+       struct vtpci_softc *sc;
+       struct vtpci_virtqueue *vqx;
+       struct virtqueue *vq;
+       int queue, error;
+       uint16_t vq_size;
+
+       sc = device_get_softc(dev);
+
+       /*
+        * Redrive the device initialization. This is a bit of an abuse
+        * of the specification, but both VirtualBox and QEMU/KVM seem
+        * to play nice. We do not allow the host device to change from
+        * what was originally negotiated beyond what the guest driver
+        * changed (MSIX state should not change, number of virtqueues
+        * and their size remain the same, etc).
+        */
+
+       if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET)
+               vtpci_stop(dev);
+
+       /*
+        * Quickly drive the status through ACK and DRIVER. The device
+        * does not become usable again until vtpci_reinit_complete().
+        */
+       vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+       vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+
+       vtpci_negotiate_features(dev, features);
+
+       if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+               error = vtpci_register_msix_vector(sc,
+                   VIRTIO_MSI_CONFIG_VECTOR, 0);
+               if (error)
+                       return (error);
+       }
+
+       /* Reprogram each previously-allocated virtqueue. */
+       for (queue = 0; queue < sc->vtpci_nvqs; queue++) {
+               vqx = &sc->vtpci_vqx[queue];
+               vq = vqx->vq;
+
+               KASSERT(vq != NULL, ("vq %d not allocated", queue));
+               vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+               vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+               error = virtqueue_reinit(vq, vq_size);
+               if (error)
+                       return (error);
+
+               if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+                       error = vtpci_register_msix_vector(sc,
+                           VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+                       if (error)
+                               return (error);
+               }
+
+               /* Re-publish the ring's page frame number to the host. */
+               vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+                   virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+       }
+
+       return (0);
+}
+
+/*
+ * Finish a reinit by setting DRIVER_OK, making the device usable again.
+ */
+static void
+vtpci_reinit_complete(device_t dev)
+{
+
+       vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+/*
+ * VIRTIO_BUS_NOTIFY_VQ method: tell the host that new buffers have been
+ * placed on the given virtqueue.
+ */
+static void
+vtpci_notify_virtqueue(device_t dev, uint16_t queue)
+{
+       struct vtpci_softc *softc = device_get_softc(dev);
+
+       vtpci_write_config_2(softc, VIRTIO_PCI_QUEUE_NOTIFY, queue);
+}
+
+/* Read the current value of the device status register. */
+static uint8_t
+vtpci_get_status(device_t dev)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS));
+}
+
+/*
+ * Update the device status register.  Status bits are cumulative:
+ * except for a RESET, the new value is OR'd with the bits already set
+ * so previously reached initialization stages are preserved.
+ */
+static void
+vtpci_set_status(device_t dev, uint8_t status)
+{
+       struct vtpci_softc *sc;
+
+       sc = device_get_softc(dev);
+
+       if (status != VIRTIO_CONFIG_STATUS_RESET)
+               status |= vtpci_get_status(dev);
+
+       vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status);
+}
+
+/*
+ * VIRTIO_BUS_READ_DEVICE_CONFIG method: copy `length' bytes from the
+ * device-specific configuration space, starting at `offset', into
+ * `dst'.  Uses the widest access (4/2/1 bytes) the remaining length
+ * allows.
+ */
+static void
+vtpci_read_dev_config(device_t dev, bus_size_t offset,
+    void *dst, int length)
+{
+       struct vtpci_softc *sc = device_get_softc(dev);
+       bus_size_t off = VIRTIO_PCI_CONFIG(sc) + offset;
+       uint8_t *d = dst;
+       int size;
+
+       while (length > 0) {
+               if (length >= 4) {
+                       size = 4;
+                       *(uint32_t *)d = vtpci_read_config_4(sc, off);
+               } else if (length >= 2) {
+                       size = 2;
+                       *(uint16_t *)d = vtpci_read_config_2(sc, off);
+               } else {
+                       size = 1;
+                       *d = vtpci_read_config_1(sc, off);
+               }
+
+               d += size;
+               off += size;
+               length -= size;
+       }
+}
+
+/*
+ * VIRTIO_BUS_WRITE_DEVICE_CONFIG method: copy `length' bytes from `src'
+ * into the device-specific configuration space starting at `offset',
+ * using the widest access (4/2/1 bytes) the remaining length allows.
+ */
+static void
+vtpci_write_dev_config(device_t dev, bus_size_t offset,
+    void *src, int length)
+{
+       struct vtpci_softc *sc;
+       bus_size_t off;
+       uint8_t *s;
+       int size;
+
+       sc = device_get_softc(dev);
+       off = VIRTIO_PCI_CONFIG(sc) + offset;
+
+       for (s = src; length > 0; s += size, off += size, length -= size) {
+               if (length >= 4) {
+                       size = 4;
+                       vtpci_write_config_4(sc, off, *(uint32_t *)s);
+               } else if (length >= 2) {
+                       size = 2;
+                       vtpci_write_config_2(sc, off, *(uint16_t *)s);
+               } else {
+                       size = 1;
+                       vtpci_write_config_1(sc, off, *s);
+               }
+       }
+}
+
+/*
+ * Log a feature bitmask via virtio_describe().  Output is suppressed
+ * once the child is attached, unless booted verbose.
+ */
+static void
+vtpci_describe_features(struct vtpci_softc *sc, const char *msg,
+    uint64_t features)
+{
+       device_t dev, child;
+
+       dev = sc->vtpci_dev;
+       child = sc->vtpci_child_dev;
+
+       if (device_is_attached(child) && bootverbose == 0)
+               return;
+
+       virtio_describe(dev, msg, features, sc->vtpci_child_feat_desc);
+}
+
+/*
+ * Probe and attach the child virtio device, driving the status register
+ * through DRIVER and then DRIVER_OK.  On attach failure the status is
+ * marked FAILED, the device reset, child resources released, and status
+ * returned to ACK so a later attach attempt can retry.
+ */
+static void
+vtpci_probe_and_attach_child(struct vtpci_softc *sc)
+{
+       device_t dev, child;
+
+       dev = sc->vtpci_dev;
+       child = sc->vtpci_child_dev;
+
+       if (child == NULL)
+               return;
+
+       /* Only attach children that have not been probed yet. */
+       if (device_get_state(child) != DS_NOTPRESENT)
+               return;
+
+       if (device_probe_child(dev, child) != 0)
+               return;
+
+       vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+       if (DEVICE_ATTACH(child) != 0) {
+               vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+               vtpci_reset(sc);
+               vtpci_release_child_resources(sc);
+
+               /* Reset status for future attempt. */
+               vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+       } else
+               vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+/*
+ * Choose an interrupt strategy (MSIX, MSI, or legacy) and allocate the
+ * corresponding IRQ resources.  MSIX is attempted first unless disabled
+ * by tunable or flags; otherwise one MSI or legacy interrupt is used.
+ *
+ * NOTE(review): vtpci_alloc_msi() currently always returns 1, so the
+ * VIRTIO_PCI_FLAG_MSI branch below never fires -- confirm intent.
+ */
+static int
+vtpci_alloc_interrupts(struct vtpci_softc *sc, int flags, int nvqs,
+    struct vq_alloc_info *vq_info)
+{
+       int i, nvectors, error;
+
+       /*
+        * Only allocate a vector for virtqueues that are actually
+        * expecting an interrupt.
+        */
+       for (nvectors = 0, i = 0; i < nvqs; i++)
+               if (vq_info[i].vqai_intr != NULL)
+                       nvectors++;
+
+       if (vtpci_disable_msix != 0 ||
+           sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSIX ||
+           flags & VIRTIO_ALLOC_VQS_DISABLE_MSIX ||
+           vtpci_alloc_msix(sc, nvectors) != 0) {
+               /*
+                * Use MSI interrupts if available. Otherwise, we fallback
+                * to legacy interrupts.
+                */
+               if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSI) == 0 &&
+                   vtpci_alloc_msi(sc) == 0)
+                       sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSI;
+
+               sc->vtpci_nintr_res = 1;
+       }
+
+       error = vtpci_alloc_intr_resources(sc, nvqs, vq_info);
+
+       return (error);
+}
+
+/*
+ * Allocate the bus IRQ resources (rid 0, shareable, for legacy; rids
+ * from 1 upward for MSI/MSIX) and record for each virtqueue which
+ * interrupt resource index will service it (-1 = no interrupt).
+ */
+static int
+vtpci_alloc_intr_resources(struct vtpci_softc *sc, int nvqs,
+    struct vq_alloc_info *vq_info)
+{
+       device_t dev;
+       struct resource *irq;
+       struct vtpci_virtqueue *vqx;
+       int i, rid, flags, res_idx;
+
+       dev = sc->vtpci_dev;
+       flags = RF_ACTIVE;
+
+       if ((sc->vtpci_flags &
+           (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) == 0) {
+               rid = 0;
+               flags |= RF_SHAREABLE;
+       } else
+               rid = 1;
+
+       for (i = 0; i < sc->vtpci_nintr_res; i++) {
+               irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, flags);
+               if (irq == NULL)
+                       return (ENXIO);
+
+               sc->vtpci_intr_res[i].irq = irq;
+               sc->vtpci_intr_res[i].rid = rid++;
+       }
+
+       /*
+        * Map the virtqueue into the correct index in vq_intr_res[]. Note the
+        * first index is reserved for configuration changes notifications.
+        */
+       for (i = 0, res_idx = 1; i < nvqs; i++) {
+               vqx = &sc->vtpci_vqx[i];
+
+               if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+                       /* Shared MSIX: every virtqueue uses resource 1. */
+                       if (vq_info[i].vqai_intr == NULL)
+                               vqx->ires_idx = -1;
+                       else if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+                               vqx->ires_idx = res_idx;
+                       else
+                               vqx->ires_idx = res_idx++;
+               } else
+                       vqx->ires_idx = -1;
+       }
+
+       return (0);
+}
+
+/*
+ * Attempt to allocate a single MSI interrupt; returns non-zero on
+ * failure.
+ *
+ * NOTE(review): this returns 1 even after pci_alloc_1intr() has been
+ * called, so the caller never sets VIRTIO_PCI_FLAG_MSI and the returned
+ * irq_flags are discarded.  This mirrors the original import; confirm
+ * before relying on MSI operation.
+ */
+static int
+vtpci_alloc_msi(struct vtpci_softc *sc)
+{
+       device_t dev;
+       int nmsi;
+       u_int irq_flags;
+
+       dev = sc->vtpci_dev;
+       nmsi = pci_msi_count(dev);
+
+       /* Bail out if the device advertises no MSI messages. */
+       if (nmsi < 1)
+               return (1);
+
+       sc->vtpci_irq_rid = 0;
+       sc->vtpci_irq_type = pci_alloc_1intr(dev, 1,
+           &sc->vtpci_irq_rid, &irq_flags);
+
+       return (1);
+}
+
+/*
+ * Attempt an MSIX allocation: one vector per interrupt-driven virtqueue
+ * plus one for configuration changes, falling back to a single shared
+ * queue vector.  The whole body is compiled out (OLD_MSI undefined), so
+ * this currently returns 0 without setting any MSIX flags.
+ *
+ * NOTE(review): returning 0 ("success") here skips the MSI/legacy
+ * fallback in vtpci_alloc_interrupts() unless vtpci_disable_msix is
+ * nonzero -- confirm that tunable's default.
+ */
+static int
+vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors)
+{
+       /* XXX(vsrinivas): Huh? Is this how MSI-X works?*/
+       /* XXX(vsrinivas): All of this was disabled... */
+#ifdef OLD_MSI
+       device_t dev;
+       int nmsix, cnt, required;
+
+       dev = sc->vtpci_dev;
+
+       nmsix = pci_msix_count(dev);
+       if (nmsix < 1)
+               return (1);
+
+       /* An additional vector is needed for the config changes. */
+       required = nvectors + 1;
+       if (nmsix >= required) {
+               cnt = required;
+               if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required)
+                       goto out;
+
+               pci_release_msi(dev);
+       }
+
+       /* Attempt shared MSIX configuration. */
+       required = 2;
+       if (nmsix >= required) {
+               cnt = required;
+               if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) {
+                       sc->vtpci_flags |= VIRTIO_PCI_FLAG_SHARED_MSIX;
+                       goto out;
+               }
+
+               pci_release_msi(dev);
+       }
+
+       return (1);
+
+out:
+       sc->vtpci_nintr_res = required;
+       sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSIX;
+
+       if (bootverbose) {
+               if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+                       device_printf(dev, "using shared virtqueue MSIX\n");
+               else
+                       device_printf(dev, "using per virtqueue MSIX\n");
+       }
+#endif
+       return (0);
+}
+
+/*
+ * Program one of the device's MSIX vector registers (config or queue)
+ * and read it back to verify the host accepted it.  res_idx selects the
+ * interrupt resource whose rid maps to the host vector; -1 disables the
+ * vector.
+ */
+static int
+vtpci_register_msix_vector(struct vtpci_softc *sc, int offset, int res_idx)
+{
+       device_t dev;
+       uint16_t vector;
+
+       dev = sc->vtpci_dev;
+
+       if (offset != VIRTIO_MSI_CONFIG_VECTOR &&
+           offset != VIRTIO_MSI_QUEUE_VECTOR)
+               return (EINVAL);
+
+       if (res_idx != -1) {
+               /* Map from rid to host vector. */
+               vector = sc->vtpci_intr_res[res_idx].rid - 1;
+       } else
+               vector = VIRTIO_MSI_NO_VECTOR;
+
+       /* The first resource is special; make sure it is used correctly. */
+       if (res_idx == 0) {
+               KASSERT(vector == 0, ("unexpected config vector"));
+               KASSERT(offset == VIRTIO_MSI_CONFIG_VECTOR,
+                   ("unexpected config offset"));
+       }
+
+       vtpci_write_config_2(sc, offset, vector);
+
+       /* The device reports rejection by reading back NO_VECTOR. */
+       if (vtpci_read_config_2(sc, offset) != vector) {
+               device_printf(dev, "insufficient host resources for "
+                   "MSIX interrupts\n");
+               return (ENODEV);
+       }
+
+       return (0);
+}
+
+/*
+ * Tear down every interrupt handler and release all IRQ resources,
+ * clearing the MSI/MSIX flags.  The loop walks 1 + VIRTIO_MAX_VIRTQUEUES
+ * slots (one config slot plus one per virtqueue); assumes
+ * vtpci_intr_res[] is declared with at least that many entries -- TODO
+ * confirm against the softc definition.
+ */
+static void
+vtpci_free_interrupts(struct vtpci_softc *sc)
+{
+       device_t dev;
+       struct vtpci_intr_resource *ires;
+       int i;
+
+       dev = sc->vtpci_dev;
+       sc->vtpci_nintr_res = 0;
+
+       if (sc->vtpci_flags & (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) {
+               pci_release_msi(dev);
+               sc->vtpci_flags &= ~(VIRTIO_PCI_FLAG_MSI |
+                   VIRTIO_PCI_FLAG_MSIX | VIRTIO_PCI_FLAG_SHARED_MSIX);
+       }
+
+       for (i = 0; i < 1 + VIRTIO_MAX_VIRTQUEUES; i++) {
+               ires = &sc->vtpci_intr_res[i];
+
+               if (ires->intrhand != NULL) {
+                       bus_teardown_intr(dev, ires->irq, ires->intrhand);
+                       ires->intrhand = NULL;
+               }
+
+               if (ires->irq != NULL) {
+                       bus_release_resource(dev, SYS_RES_IRQ, ires->rid,
+                           ires->irq);
+                       ires->irq = NULL;
+               }
+
+               ires->rid = -1;
+       }
+}
+
+/*
+ * Release every allocated virtqueue and reset the queue count.
+ */
+static void
+vtpci_free_virtqueues(struct vtpci_softc *sc)
+{
+       int idx;
+
+       sc->vtpci_nvqs = 0;
+
+       for (idx = 0; idx < VIRTIO_MAX_VIRTQUEUES; idx++) {
+               struct vtpci_virtqueue *slot = &sc->vtpci_vqx[idx];
+
+               if (slot->vq == NULL)
+                       continue;
+               virtqueue_free(slot->vq);
+               slot->vq = NULL;
+       }
+}
+
+/*
+ * Release everything allocated on behalf of the child device:
+ * interrupts first, then the virtqueues themselves.
+ */
+static void
+vtpci_release_child_resources(struct vtpci_softc *sc)
+{
+
+       vtpci_free_interrupts(sc);
+       vtpci_free_virtqueues(sc);
+}
+
+/* Hard-reset the device by writing the RESET status value. */
+static void
+vtpci_reset(struct vtpci_softc *sc)
+{
+
+       /*
+        * Setting the status to RESET sets the host device to
+        * the original, uninitialized state.
+        */
+       vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET);
+}
+
+/*
+ * Legacy (INTx/MSI) interrupt handler.  Reading the ISR both returns
+ * and clears the pending bits; config-change and virtqueue work are
+ * then dispatched accordingly.  Returns the raw ISR value (non-zero if
+ * this device had an interrupt pending).
+ */
+static int
+vtpci_legacy_intr(void *xsc)
+{
+       struct vtpci_softc *sc;
+       struct vtpci_virtqueue *vqx;
+       int i;
+       uint8_t isr;
+
+       sc = xsc;
+       vqx = &sc->vtpci_vqx[0];
+
+       /* Reading the ISR also clears it. */
+       isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR);
+
+       if (isr & VIRTIO_PCI_ISR_CONFIG)
+               vtpci_config_intr(sc);
+
+       /* A single interrupt covers all virtqueues; poll each one. */
+       if (isr & VIRTIO_PCI_ISR_INTR)
+               for (i = 0; i < sc->vtpci_nvqs; i++, vqx++)
+                       virtqueue_intr(vqx->vq);
+
+       return isr;
+}
+
+/*
+ * Shared-MSIX interrupt handler: service every virtqueue and report
+ * whether any of them had work pending.
+ */
+static int
+vtpci_vq_shared_intr(void *xsc)
+{
+       struct vtpci_softc *sc = xsc;
+       struct vtpci_virtqueue *slot = &sc->vtpci_vqx[0];
+       int idx;
+       int handled = 0;
+
+       for (idx = 0; idx < sc->vtpci_nvqs; idx++, slot++)
+               handled |= virtqueue_intr(slot->vq);
+
+       return handled;
+}
+
+/*
+ * Per-virtqueue MSIX interrupt handler; the argument is the virtqueue
+ * itself.  Returns whatever virtqueue_intr() reports.
+ */
+static int
+vtpci_vq_intr(void *xvq)
+{
+       return virtqueue_intr((struct virtqueue *)xvq);
+}
+
+/*
+ * Configuration-change interrupt: forward the event to the child
+ * device, if one is attached.
+ */
+static int
+vtpci_config_intr(void *xsc)
+{
+       struct vtpci_softc *sc = xsc;
+       device_t child = sc->vtpci_child_dev;
+
+       if (child == NULL)
+               return 0;
+
+       return VIRTIO_CONFIG_CHANGE(child);
+}
diff --git a/sys/dev/virtual/virtio/pci/virtio_pci.h b/sys/dev/virtual/virtio/pci/virtio_pci.h
new file mode 100644 (file)
index 0000000..4773146
--- /dev/null
@@ -0,0 +1,87 @@
+/*-
+ * Copyright IBM Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori  <aliguori@us.ibm.com>
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/pci/virtio_pci.h,v 1.2 2011/12/06 06:28:32 grehan Exp $
+ */
+
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_PCI_VENDORID    0x1AF4
+#define VIRTIO_PCI_DEVICEID_MIN        0x1000
+#define VIRTIO_PCI_DEVICEID_MAX        0x103F
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ * Offsets are in bytes from the start of the BAR.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY          16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+                                     * also clears the register (8, RO) */
+/* Only if MSIX is enabled: */
+#define VIRTIO_MSI_CONFIG_VECTOR  20 /* configuration change vector (16, RW) */
+#define VIRTIO_MSI_QUEUE_VECTOR   22 /* vector for selected VQ notifications
+                                       (16, RW) */
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR    0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG  0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR   0xFFFF
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.  Its offset depends on whether the MSIX vector
+ * registers above are present; (sc) must be a struct vtpci_softc
+ * pointer (its vtpci_flags field is consulted).
+ */
+#define VIRTIO_PCI_CONFIG(sc) \
+    (((sc)->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) ? 24 : 20)
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT    12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+#endif /* _VIRTIO_PCI_H */
diff --git a/sys/dev/virtual/virtio/virtio/Makefile b/sys/dev/virtual/virtio/virtio/Makefile
new file mode 100644 (file)
index 0000000..7071ce1
--- /dev/null
@@ -0,0 +1,10 @@
+# Kernel module build glue for the virtio bus core (virtio.ko).
+KMOD=   virtio
+
+SRCS=   virtio.c virtqueue.c virtio_if.h virtio_bus_if.h device_if.h bus_if.h\
+       virtio_bus_if.c virtio_if.c
+
+# Interface (.m) sources from which the *_if.[ch] files are generated.
+MFILES= kern/bus_if.m kern/device_if.m
+MFILES+=dev/virtual/virtio/virtio/virtio_if.m
+MFILES+=dev/virtual/virtio/virtio/virtio_bus_if.m
+
+.include <bsd.kmod.mk>
diff --git a/sys/dev/virtual/virtio/virtio/virtio.c b/sys/dev/virtual/virtio/virtio/virtio.c
new file mode 100644 (file)
index 0000000..ec8fcb3
--- /dev/null
@@ -0,0 +1,281 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/virtio.c,v 1.1 2011/11/18 05:43:43 grehan Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+
+#include <machine/inttypes.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include "virtio.h"
+#include "virtqueue.h"
+
+#include "virtio_if.h"
+#include "virtio_bus_if.h"
+
+/* Forward declarations. */
+static int virtio_modevent(module_t, int, void *);
+static const char *virtio_feature_name(uint64_t, struct virtio_feature_desc *);
+
+/*
+ * Map of virtio device IDs to human-readable device names; the table is
+ * terminated by a NULL name entry.
+ */
+static struct virtio_ident {
+       uint16_t devid;
+       char    *name;
+} virtio_ident_table[] = {
+       { VIRTIO_ID_NETWORK,    "Network"       },
+       { VIRTIO_ID_BLOCK,      "Block"         },
+       { VIRTIO_ID_CONSOLE,    "Console"       },
+       { VIRTIO_ID_ENTROPY,    "Entropy"       },
+       { VIRTIO_ID_BALLOON,    "Balloon"       },
+       { VIRTIO_ID_IOMEMORY,   "IOMemory"      },
+       { VIRTIO_ID_9P,         "9P Transport"  },
+
+       { 0, NULL }
+};
+
+/* Device independent features. */
+static struct virtio_feature_desc virtio_common_feature_desc[] = {
+       { VIRTIO_F_NOTIFY_ON_EMPTY,     "NotifyOnEmpty" },
+       { VIRTIO_RING_F_EVENT_IDX,      "EventIdx"      },
+       { VIRTIO_F_BAD_FEATURE,         "BadFeature"    },
+
+       { 0, NULL }
+};
+
+/*
+ * Return the human-readable name for a virtio device ID, or NULL when
+ * the ID is not in virtio_ident_table[].
+ */
+const char *
+virtio_device_name(uint16_t devid)
+{
+       int i;
+
+       for (i = 0; virtio_ident_table[i].name != NULL; i++) {
+               if (virtio_ident_table[i].devid == devid)
+                       return (virtio_ident_table[i].name);
+       }
+
+       return (NULL);
+}
+
+/*
+ * Query the parent bus for this device's virtio type (a VIRTIO_ID_*
+ * value).  Returns -1 if the ivar read does not fill in devtype.
+ */
+int
+virtio_get_device_type(device_t dev)
+{
+       uintptr_t devtype;
+
+       devtype = -1;
+
+       BUS_READ_IVAR(device_get_parent(dev), dev,
+           VIRTIO_IVAR_DEVTYPE, &devtype);
+
+       return ((int) devtype);
+}
+
+/*
+ * Register a driver-specific feature description table with the parent
+ * bus, used by virtio_describe() when printing feature bits.
+ */
+void
+virtio_set_feature_desc(device_t dev,
+    struct virtio_feature_desc *feature_desc)
+{
+
+       BUS_WRITE_IVAR(device_get_parent(dev), dev,
+           VIRTIO_IVAR_FEATURE_DESC, (uintptr_t) feature_desc);
+}
+
+/* XXX(vsrinivas): Hmm, check this SBUF usage */
+/*
+ * Pretty-print a feature bitmask: "<msg> features: 0x<hex> <Name,...>".
+ * Names come from the driver-supplied table first, then the common
+ * table; unnamed bits are printed in hex.  Falls back to a plain hex
+ * printf if the temporary buffer cannot be allocated.
+ */
+void
+virtio_describe(device_t dev, const char *msg,
+    uint64_t features, struct virtio_feature_desc *feature_desc)
+{
+       struct sbuf sb;
+       uint64_t val;
+       char *buf;
+       const char *name;
+       int n;
+
+       if ((buf = kmalloc(512, M_TEMP, M_NOWAIT)) == NULL) {
+               device_printf(dev, "%s features: 0x%"PRIx64"\n", msg,
+                   features);
+               return;
+       }
+
+       sbuf_new(&sb, buf, 512, SBUF_FIXEDLEN);
+       sbuf_printf(&sb, "%s features: 0x%"PRIx64, msg, features);
+
+       /* Walk the feature bits from most- to least-significant. */
+       for (n = 0, val = 1ULL << 63; val != 0; val >>= 1) {
+               /*
+                * BAD_FEATURE is used to detect broken Linux clients
+                * and therefore is not applicable to FreeBSD.
+                */
+               if (((features & val) == 0) || val == VIRTIO_F_BAD_FEATURE)
+                       continue;
+
+               /* First named bit opens the "<...>" list. */
+               if (n++ == 0)
+                       sbuf_cat(&sb, " <");
+               else
+                       sbuf_cat(&sb, ",");
+
+               name = NULL;
+               if (feature_desc != NULL)
+                       name = virtio_feature_name(val, feature_desc);
+               if (name == NULL)
+                       name = virtio_feature_name(val,
+                           virtio_common_feature_desc);
+
+               if (name == NULL)
+                       sbuf_printf(&sb, "0x%"PRIx64, val);
+               else
+                       sbuf_cat(&sb, name);
+       }
+
+       if (n > 0)
+               sbuf_cat(&sb, ">");
+
+       /* sbuf_finish()'s return convention changed in FreeBSD 9. */
+#if __FreeBSD_version < 900020
+       sbuf_finish(&sb);
+       if (sbuf_overflowed(&sb) == 0)
+#else
+       if (sbuf_finish(&sb) == 0)
+#endif
+               device_printf(dev, "%s\n", sbuf_data(&sb));
+
+       sbuf_delete(&sb);
+       kfree(buf, M_TEMP);
+}
+
+/*
+ * Look up the printable name of a single feature bit in a descriptor
+ * table (terminated by a zero vfd_val entry); NULL when not found.
+ */
+static const char *
+virtio_feature_name(uint64_t val, struct virtio_feature_desc *feature_desc)
+{
+       struct virtio_feature_desc *d;
+
+       for (d = feature_desc; d->vfd_val != 0; d++) {
+               if (d->vfd_val == val)
+                       return (d->vfd_str);
+       }
+
+       return (NULL);
+}
+
+/*
+ * VirtIO bus method wrappers.
+ *
+ * Each wrapper simply forwards the operation to the parent bus (the
+ * virtio transport), which implements the corresponding kobj method.
+ */
+
+/* Negotiate the feature set with the host; returns the accepted bits. */
+uint64_t
+virtio_negotiate_features(device_t dev, uint64_t child_features)
+{
+
+       return (VIRTIO_BUS_NEGOTIATE_FEATURES(device_get_parent(dev),
+           child_features));
+}
+
+/* Allocate nvqs virtqueues as described by info[]. */
+int
+virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+    struct vq_alloc_info *info)
+{
+
+       return (VIRTIO_BUS_ALLOC_VIRTQUEUES(device_get_parent(dev), flags,
+           nvqs, info));
+}
+
+/* Install interrupt handlers for the allocated virtqueues. */
+int
+virtio_setup_intr(device_t dev)
+{
+
+       return (VIRTIO_BUS_SETUP_INTR(device_get_parent(dev)));
+}
+
+/* Non-zero if the given feature bit was negotiated. */
+int
+virtio_with_feature(device_t dev, uint64_t feature)
+{
+
+       return (VIRTIO_BUS_WITH_FEATURE(device_get_parent(dev), feature));
+}
+
+/* Reset the device to its uninitialized state. */
+void
+virtio_stop(device_t dev)
+{
+
+       VIRTIO_BUS_STOP(device_get_parent(dev));
+}
+
+/* Redrive device initialization with the given feature set. */
+int
+virtio_reinit(device_t dev, uint64_t features)
+{
+
+       return (VIRTIO_BUS_REINIT(device_get_parent(dev), features));
+}
+
+/* Mark a reinit finished, making the device usable again. */
+void
+virtio_reinit_complete(device_t dev)
+{
+
+       VIRTIO_BUS_REINIT_COMPLETE(device_get_parent(dev));
+}
+
+/* Read len bytes of device-specific config space into dst. */
+void
+virtio_read_device_config(device_t dev, bus_size_t offset, void *dst, int len)
+{
+
+       VIRTIO_BUS_READ_DEVICE_CONFIG(device_get_parent(dev),
+           offset, dst, len);
+}
+
+/*
+ * Write `len' bytes from `src' into the device-specific config space at
+ * `offset'.  (Parameter renamed from the misleading `dst': it is the
+ * data source, not the destination.)
+ */
+void
+virtio_write_device_config(device_t dev, bus_size_t offset, void *src, int len)
+{
+
+       VIRTIO_BUS_WRITE_DEVICE_CONFIG(device_get_parent(dev),
+           offset, src, len);
+}
+
+/*
+ * Module event handler.  The virtio core keeps no global state, so
+ * load, unload and shutdown all succeed trivially; any other event is
+ * unsupported.  (A previously commented-out MOD_QUIESCE case was
+ * removed; it falls into the default arm either way.)
+ */
+static int
+virtio_modevent(module_t mod, int type, void *unused)
+{
+       int error;
+
+       error = 0;
+
+       switch (type) {
+       case MOD_LOAD:
+       case MOD_UNLOAD:
+       case MOD_SHUTDOWN:
+               break;
+       default:
+               error = EOPNOTSUPP;
+               break;
+       }
+
+       return (error);
+}
+
+/* Module linkage: register the virtio core at driver initialization. */
+static moduledata_t virtio_mod = {
+       "virtio",
+       virtio_modevent,
+       0
+};
+
+DECLARE_MODULE(virtio, virtio_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(virtio, 1);
diff --git a/sys/dev/virtual/virtio/virtio/virtio.h b/sys/dev/virtual/virtio/virtio/virtio.h
new file mode 100644 (file)
index 0000000..f859e19
--- /dev/null
@@ -0,0 +1,141 @@
+/*-
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/virtio.h,v 1.2 2011/12/06 06:28:32 grehan Exp $
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#include <sys/types.h>
+
+struct vq_alloc_info;
+
+/* VirtIO device IDs. */
+#define VIRTIO_ID_NETWORK      0x01
+#define VIRTIO_ID_BLOCK                0x02
+#define VIRTIO_ID_CONSOLE      0x03
+#define VIRTIO_ID_ENTROPY      0x04
+#define VIRTIO_ID_BALLOON      0x05
+#define VIRTIO_ID_IOMEMORY     0x06
+#define VIRTIO_ID_9P           0x09
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Generate interrupt when the virtqueue ring is
+ * completely used, even if we've suppressed them.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+
+/*
+ * The guest should never negotiate this feature; it
+ * is used to detect faulty drivers.
+ */
+#define VIRTIO_F_BAD_FEATURE (1 << 30)
+
+/*
+ * Some VirtIO feature bits (currently bits 28 through 31) are
+ * reserved for the transport being used (eg. virtio_ring), the
+ * rest are per-device feature bits.
+ */
+#define VIRTIO_TRANSPORT_F_START       28
+#define VIRTIO_TRANSPORT_F_END         32
+
+/*
+ * Maximum number of virtqueues per device.
+ */
+#define VIRTIO_MAX_VIRTQUEUES 8
+
+/*
+ * VirtIO instance variables indices.
+ */
+#define VIRTIO_IVAR_DEVTYPE            1
+#define VIRTIO_IVAR_FEATURE_DESC       2
+
+/*
+ * Maps a feature bit mask (vfd_val) to a human-readable name (vfd_str),
+ * used by virtio_describe() when printing negotiated features.
+ */
+struct virtio_feature_desc {
+       uint64_t         vfd_val;
+       char            *vfd_str;
+};
+
+const char *virtio_device_name(uint16_t devid);
+int     virtio_get_device_type(device_t dev);
+void    virtio_set_feature_desc(device_t dev,
+            struct virtio_feature_desc *feature_desc);
+void    virtio_describe(device_t dev, const char *msg,
+            uint64_t features, struct virtio_feature_desc *feature_desc);
+
+/*
+ * VirtIO Bus Methods.
+ */
+uint64_t virtio_negotiate_features(device_t dev, uint64_t child_features);
+int     virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+            struct vq_alloc_info *info);
+int     virtio_setup_intr(device_t dev);
+int     virtio_with_feature(device_t dev, uint64_t feature);
+void    virtio_stop(device_t dev);
+int     virtio_reinit(device_t dev, uint64_t features);
+void    virtio_reinit_complete(device_t dev);
+
+/*
+ * Read/write a variable amount from the device specific (ie, network)
+ * configuration region. This region is encoded in the same endian as
+ * the guest.
+ */
+void    virtio_read_device_config(device_t dev, bus_size_t offset,
+            void *dst, int length);
+void    virtio_write_device_config(device_t dev, bus_size_t offset,
+            void *src, int length);
+
+/*
+ * Inlined device specific read/write functions for common lengths.
+ *
+ * VIRTIO_RDWR_DEVICE_CONFIG(size, type) expands to a pair of helpers,
+ * virtio_read_dev_config_<size>() and virtio_write_dev_config_<size>(),
+ * which access a <type>-sized field of the device config space at the
+ * given offset through the variable-length routines above.
+ */
+#define VIRTIO_RDWR_DEVICE_CONFIG(size, type)                          \
+static inline type                                                     \
+__CONCAT(virtio_read_dev_config_,size)(device_t dev,                   \
+    bus_size_t offset)                                                 \
+{                                                                      \
+       type val;                                                       \
+       virtio_read_device_config(dev, offset, &val, sizeof(type));     \
+       return (val);                                                   \
+}                                                                      \
+                                                                       \
+static inline void                                                     \
+__CONCAT(virtio_write_dev_config_,size)(device_t dev,                  \
+    bus_size_t offset, type val)                                       \
+{                                                                      \
+       virtio_write_device_config(dev, offset, &val, sizeof(type));    \
+}
+
+VIRTIO_RDWR_DEVICE_CONFIG(1, uint8_t);
+VIRTIO_RDWR_DEVICE_CONFIG(2, uint16_t);
+VIRTIO_RDWR_DEVICE_CONFIG(4, uint32_t);
+
+#endif /* _VIRTIO_H_ */
diff --git a/sys/dev/virtual/virtio/virtio/virtio_bus_if.m b/sys/dev/virtual/virtio/virtio/virtio_bus_if.m
new file mode 100644 (file)
index 0000000..b9a3872
--- /dev/null
@@ -0,0 +1,90 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/sys/dev/virtio/virtio_bus_if.m,v 1.1 2011/11/18 05:43:43 grehan Exp $
+
+#include <sys/bus.h>
+
+INTERFACE virtio_bus;
+
+HEADER {
+struct vq_alloc_info;
+};
+
+# Offer the child's feature bits to the transport; returns the subset
+# that was actually negotiated with the host.
+METHOD uint64_t negotiate_features {
+       device_t        dev;
+       uint64_t        child_features;
+};
+
+# Returns non-zero if 'feature' was negotiated.
+METHOD int with_feature {
+       device_t        dev;
+       uint64_t        feature;
+};
+
+# Allocate the device's 'nvqs' virtqueues as described by 'info'.
+METHOD int alloc_virtqueues {
+       device_t        dev;
+       int             flags;
+       int             nvqs;
+       struct vq_alloc_info *info;
+};
+HEADER {
+#define VIRTIO_ALLOC_VQS_DISABLE_MSIX 0x1
+};
+
+# Hook up interrupt delivery for the allocated virtqueues.
+METHOD int setup_intr {
+       device_t        dev;
+};
+
+# Stop the device.
+METHOD void stop {
+       device_t        dev;
+};
+
+# Reinitialize the device with the given feature set.
+METHOD int reinit {
+       device_t        dev;
+       uint64_t        features;
+};
+
+# Signal that reinitialization is complete.
+METHOD void reinit_complete {
+       device_t        dev;
+};
+
+# Notify the host that virtqueue 'queue' has new buffers.
+METHOD void notify_vq {
+       device_t        dev;
+       uint16_t        queue;
+};
+
+# Copy 'len' bytes from device-specific config space at 'offset' to 'dst'.
+METHOD void read_device_config {
+       device_t        dev;
+       bus_size_t      offset;
+       void            *dst;
+       int             len;
+};
+
+# Copy 'len' bytes from 'src' into device-specific config space at 'offset'.
+METHOD void write_device_config {
+       device_t        dev;
+       bus_size_t      offset;
+       void            *src;
+       int             len;
+};
diff --git a/sys/dev/virtual/virtio/virtio/virtio_if.m b/sys/dev/virtual/virtio/virtio/virtio_if.m
new file mode 100644 (file)
index 0000000..103c8a9
--- /dev/null
@@ -0,0 +1,43 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/sys/dev/virtio/virtio_if.m,v 1.1 2011/11/18 05:43:43 grehan Exp $
+
+#include <sys/bus.h>
+
+INTERFACE virtio;
+
+CODE {
+       static int
+       virtio_default_config_change(device_t dev)
+       {
+               /* Return that we've handled the change. */
+               return (1);
+       }
+};
+
+# Delivered when the host changes the device configuration space.
+# Drivers override this to react; the default claims the change was
+# handled without doing anything.
+METHOD int config_change {
+       device_t        dev;
+} DEFAULT virtio_default_config_change;
diff --git a/sys/dev/virtual/virtio/virtio/virtio_ring.h b/sys/dev/virtual/virtio/virtio/virtio_ring.h
new file mode 100644 (file)
index 0000000..df0d94d
--- /dev/null
@@ -0,0 +1,164 @@
+/*-
+ * Copyright Rusty Russell IBM Corporation 2007.
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/virtio_ring.h,v 1.3 2012/04/14 05:48:04 grehan Exp $
+ */
+
+#ifndef VIRTIO_RING_H
+#define        VIRTIO_RING_H
+
+#include <sys/types.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers. */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.  */
+#define VRING_AVAIL_F_NO_INTERRUPT      1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next". */
+struct vring_desc {
+        /* Address (guest-physical). */
+        uint64_t addr;
+        /* Length. */
+        uint32_t len;
+        /* The flags as indicated above. */
+        uint16_t flags;
+        /* We chain unused descriptors via this, too. */
+        uint16_t next;
+};
+
+/* Ring of buffers the guest offers to the host; idx is free-running. */
+struct vring_avail {
+        uint16_t flags;
+        uint16_t idx;
+        uint16_t ring[0];
+};
+
+/* uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+        /* Index of start of used descriptor chain. */
+        uint32_t id;
+        /* Total length of the descriptor chain which was written to. */
+        uint32_t len;
+};
+
+/* Ring of buffers the host has consumed; idx is free-running. */
+struct vring_used {
+        uint16_t flags;
+        uint16_t idx;
+        struct vring_used_elem ring[0];
+};
+
+/* Guest-side view of a virtqueue: pointers into one shared memory chunk. */
+struct vring {
+       unsigned int num;
+
+       struct vring_desc *desc;
+       struct vring_avail *avail;
+       struct vring_used *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define        vring_used_event(vr)    ((vr)->avail->ring[(vr)->num])
+#define        vring_avail_event(vr)   (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline int
+vring_size(unsigned int num, unsigned long align)
+{
+       int size;
+
+       /*
+        * Descriptor table plus available ring, rounded up to the
+        * alignment boundary, followed by the used ring.
+        */
+       size = num * sizeof(struct vring_desc) +
+           sizeof(struct vring_avail) + num * sizeof(uint16_t);
+       size = (size + align - 1) & ~(align - 1);
+       size += sizeof(struct vring_used) +
+           num * sizeof(struct vring_used_elem);
+
+       return (size);
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+    unsigned long align)
+{
+       unsigned long used_addr;
+
+       /* The descriptor table starts the chunk; avail follows directly. */
+       vr->num = num;
+       vr->desc = (struct vring_desc *)p;
+       vr->avail = (struct vring_avail *)
+           (p + num * sizeof(struct vring_desc));
+       /* The used ring begins at the next align boundary past avail. */
+       used_addr = ((unsigned long)&vr->avail->ring[num] + align - 1) &
+           ~(align - 1);
+       vr->used = (void *)used_addr;
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ *
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+       uint16_t pending, advance;
+
+       /*
+        * Trigger only if the free-running index just crossed event_idx;
+        * all arithmetic is modulo 2^16 to tolerate index wraparound.
+        */
+       pending = new_idx - event_idx - 1;
+       advance = new_idx - old;
+       return (pending < advance);
+}
+#endif /* VIRTIO_RING_H */
diff --git a/sys/dev/virtual/virtio/virtio/virtqueue.c b/sys/dev/virtual/virtio/virtio/virtqueue.c
new file mode 100644 (file)
index 0000000..511efa3
--- /dev/null
@@ -0,0 +1,639 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/virtqueue.c,v 1.2 2012/04/14 05:48:04 grehan Exp $
+ */
+
+/*
+ * Implements the virtqueue interface as basically described
+ * in the original VirtIO paper.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/sglist.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <sys/spinlock.h>
+#include <sys/spinlock2.h>
+
+#include <machine/cpu.h>
+#include <machine/atomic.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include "virtio.h"
+#include "virtqueue.h"
+#include "virtio_ring.h"
+
+#include "virtio_bus_if.h"
+
+/*
+ * Guest-side bookkeeping for one virtqueue, allocated with a trailing
+ * vq_descx[] array of vq_nentries elements.
+ */
+struct virtqueue {
+       /* Identity and geometry, fixed at allocation time. */
+       device_t                 vq_dev;
+       char                     vq_name[VIRTQUEUE_MAX_NAME_SZ];
+       uint16_t                 vq_queue_index;
+       uint16_t                 vq_nentries;
+       uint32_t                 vq_flags;
+
+#define        VIRTQUEUE_FLAG_EVENT_IDX 0x0002
+
+       /* Physically contiguous memory backing the shared ring. */
+       int                      vq_alignment;
+       int                      vq_ring_size;
+       void                    *vq_ring_mem;
+
+       /* Handler invoked from virtqueue_intr(). */
+       virtqueue_intr_t        *vq_intrhand;
+       void                    *vq_intrhand_arg;
+
+       struct vring             vq_ring;
+       uint16_t                 vq_free_cnt;
+       uint16_t                 vq_queued_cnt;
+       /*
+        * Head of the free chain in the descriptor table. If
+        * there are no free descriptors, this will be set to
+        * VQ_RING_DESC_CHAIN_END.
+        */
+       uint16_t                 vq_desc_head_idx;
+       /*
+        * Last consumed descriptor in the used table,
+        * trails vq_ring.used->idx.
+        */
+       uint16_t                 vq_used_cons_idx;
+
+       /* Per-descriptor driver state: caller cookie and chain length. */
+       struct vq_desc_extra {
+               void              *cookie;
+               uint16_t           ndescs;
+       } vq_descx[0];
+};
+
+/*
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+#define VQASSERT(_vq, _exp, _msg, ...)                         \
+    KASSERT((_exp),("%s: %s - "_msg, __func__, (_vq)->vq_name, \
+       ##__VA_ARGS__))
+
+#define VQ_RING_ASSERT_VALID_IDX(_vq, _idx)                    \
+    VQASSERT((_vq), (_idx) < (_vq)->vq_nentries,               \
+       "invalid ring index: %d, max: %d", (_idx),              \
+       (_vq)->vq_nentries)
+
+#define VQ_RING_ASSERT_CHAIN_TERM(_vq)                         \
+    VQASSERT((_vq), (_vq)->vq_desc_head_idx ==                 \
+       VQ_RING_DESC_CHAIN_END, "full ring terminated "         \
+       "incorrectly: head idx: %d", (_vq)->vq_desc_head_idx)
+
+static void    vq_ring_init(struct virtqueue *);
+static void    vq_ring_update_avail(struct virtqueue *, uint16_t);
+static uint16_t        vq_ring_enqueue_segments(struct virtqueue *,
+                   struct vring_desc *, uint16_t, struct sglist *, int, int);
+static int     vq_ring_must_notify_host(struct virtqueue *);
+static void    vq_ring_notify_host(struct virtqueue *);
+static void    vq_ring_free_chain(struct virtqueue *, uint16_t);
+
+/*
+ * Strip out any transport-reserved feature bits (bit 28 and above)
+ * except EVENT_IDX, which the virtqueue code implements itself.
+ */
+uint64_t
+virtqueue_filter_features(uint64_t features)
+{
+       uint64_t mask;
+
+       /*
+        * Build the mask in 64-bit space; a plain int shift would be
+        * undefined behavior if VIRTIO_TRANSPORT_F_START ever reaches
+        * the width of int.
+        */
+       mask = ((uint64_t)1 << VIRTIO_TRANSPORT_F_START) - 1;
+       mask |= VIRTIO_RING_F_EVENT_IDX;
+
+       return (features & mask);
+}
+
+/*
+ * Allocate and initialize virtqueue 'queue' of 'size' descriptors,
+ * returning it through *vqp. 'highaddr' bounds the physical address of
+ * the ring memory; 'info' supplies the name and interrupt handler.
+ * Returns 0 or an errno; on failure *vqp is left NULL.
+ */
+int
+virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, int align,
+    vm_paddr_t highaddr, struct vq_alloc_info *info, struct virtqueue **vqp)
+{
+       struct virtqueue *vq;
+       int error;
+
+       *vqp = NULL;
+       error = 0;
+
+       /* A size of zero means the host does not provide this virtqueue. */
+       if (size == 0) {
+               device_printf(dev,
+                   "virtqueue %d (%s) does not exist (size is zero)\n",
+                   queue, info->vqai_name);
+               return (ENODEV);
+       } else if (!powerof2(size)) {
+               device_printf(dev,
+                   "virtqueue %d (%s) size is not a power of 2: %d\n",
+                   queue, info->vqai_name, size);
+               return (ENXIO);
+       }
+
+       /* One trailing vq_desc_extra element per descriptor. */
+       vq = kmalloc(sizeof(struct virtqueue) +
+           size * sizeof(struct vq_desc_extra), M_DEVBUF, M_NOWAIT | M_ZERO);
+       if (vq == NULL) {
+               device_printf(dev, "cannot allocate virtqueue\n");
+               return (ENOMEM);
+       }
+
+       vq->vq_dev = dev;
+       strlcpy(vq->vq_name, info->vqai_name, sizeof(vq->vq_name));
+       vq->vq_queue_index = queue;
+       vq->vq_alignment = align;
+       vq->vq_nentries = size;
+       vq->vq_free_cnt = size;
+       vq->vq_intrhand = info->vqai_intr;
+       vq->vq_intrhand_arg = info->vqai_intr_arg;
+
+       if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_EVENT_IDX) != 0)
+               vq->vq_flags |= VIRTQUEUE_FLAG_EVENT_IDX;
+
+       /*
+        * The shared ring must be physically contiguous, page aligned,
+        * and below 'highaddr'.
+        */
+       vq->vq_ring_size = round_page(vring_size(size, align));
+       vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+           M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+       if (vq->vq_ring_mem == NULL) {
+               device_printf(dev,
+                   "cannot allocate memory for virtqueue ring\n");
+               error = ENOMEM;
+               goto fail;
+       }
+
+       vq_ring_init(vq);
+       virtqueue_disable_intr(vq);
+
+       *vqp = vq;
+
+fail:
+       /* On the success path error is 0, so this frees nothing. */
+       if (error)
+               virtqueue_free(vq);
+
+       return (error);
+}
+
+/*
+ * Reset a virtqueue to its freshly-allocated state, e.g. after a device
+ * reset. 'size' must match the original allocation; returns EINVAL if
+ * it does not.
+ */
+int
+virtqueue_reinit(struct virtqueue *vq, uint16_t size)
+{
+       struct vq_desc_extra *dxp;
+       int i;
+
+       if (vq->vq_nentries != size) {
+               device_printf(vq->vq_dev,
+                   "%s: '%s' changed size; old=%hu, new=%hu\n",
+                   __func__, vq->vq_name, vq->vq_nentries, size);
+               return (EINVAL);
+       }
+
+       /* Warn if the virtqueue was not properly cleaned up. */
+       if (vq->vq_free_cnt != vq->vq_nentries) {
+               device_printf(vq->vq_dev,
+                   "%s: warning, '%s' virtqueue not empty, "
+                   "leaking %d entries\n", __func__, vq->vq_name,
+                   vq->vq_nentries - vq->vq_free_cnt);
+       }
+
+       vq->vq_desc_head_idx = 0;
+       vq->vq_used_cons_idx = 0;
+       vq->vq_queued_cnt = 0;
+       vq->vq_free_cnt = vq->vq_nentries;
+
+       /* To be safe, reset all our allocated memory. */
+       bzero(vq->vq_ring_mem, vq->vq_ring_size);
+       for (i = 0; i < vq->vq_nentries; i++) {
+               dxp = &vq->vq_descx[i];
+               dxp->cookie = NULL;
+               dxp->ndescs = 0;
+       }
+
+       vq_ring_init(vq);
+       virtqueue_disable_intr(vq);
+
+       return (0);
+}
+
+void
+virtqueue_free(struct virtqueue *vq)
+{
+       /* Complain if the caller left requests outstanding. */
+       if (vq->vq_free_cnt != vq->vq_nentries) {
+               device_printf(vq->vq_dev, "%s: freeing non-empty virtqueue, "
+                   "leaking %d entries\n", vq->vq_name,
+                   vq->vq_nentries - vq->vq_free_cnt);
+       }
+
+       /* Release the contiguous ring memory, then the virtqueue itself. */
+       if (vq->vq_ring_mem != NULL) {
+               contigfree(vq->vq_ring_mem, vq->vq_ring_size, M_DEVBUF);
+               vq->vq_ring_mem = NULL;
+               vq->vq_ring_size = 0;
+       }
+
+       kfree(vq, M_DEVBUF);
+}
+
+vm_paddr_t
+virtqueue_paddr(struct virtqueue *vq)
+{
+       void *ring_mem;
+
+       /* Physical address of the ring, for programming the device. */
+       ring_mem = vq->vq_ring_mem;
+       return (vtophys(ring_mem));
+}
+
+int
+virtqueue_size(struct virtqueue *vq)
+{
+       /* Number of descriptors in the ring. */
+       return (vq->vq_nentries);
+}
+
+int
+virtqueue_empty(struct virtqueue *vq)
+{
+       /* Empty when every descriptor is back on the free list. */
+       return (vq->vq_free_cnt == vq->vq_nentries);
+}
+
+int
+virtqueue_full(struct virtqueue *vq)
+{
+       /* Full when the free descriptor list is exhausted. */
+       return (vq->vq_free_cnt == 0);
+}
+
+/*
+ * Notify the host of newly queued buffers. The caller's interlock is
+ * dropped around the actual notify, which may trap to the host and
+ * must not be performed while holding the spinlock.
+ */
+void
+virtqueue_notify(struct virtqueue *vq, struct spinlock *interlock)
+{
+       /* Ensure updated avail->idx is visible to host. */
+       cpu_mfence();
+
+       if (vq_ring_must_notify_host(vq)) {
+               spin_unlock(interlock);
+               vq_ring_notify_host(vq);
+               spin_lock(interlock);
+       }
+       vq->vq_queued_cnt = 0;
+}
+
+int
+virtqueue_nused(struct virtqueue *vq)
+{
+       uint16_t nused;
+
+       /* Difference of free-running indices, modulo 2^16. */
+       nused = (uint16_t)(vq->vq_ring.used->idx - vq->vq_used_cons_idx);
+       VQASSERT(vq, nused <= vq->vq_nentries, "used more than available");
+
+       return (nused);
+}
+
+int
+virtqueue_intr(struct virtqueue *vq)
+{
+       /* Nothing to do without a handler or new used-ring entries. */
+       if (vq->vq_intrhand == NULL)
+               return (0);
+       if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+               return (0);
+
+       vq->vq_intrhand(vq->vq_intrhand_arg);
+       return (1);
+}
+
+/*
+ * Enable interrupts on a given virtqueue. Returns 1 if there are
+ * additional entries to process on the virtqueue after we return.
+ */
+int
+virtqueue_enable_intr(struct virtqueue *vq)
+{
+       /*
+        * Enable interrupts, making sure we get the latest index of
+        * what's already been consumed. With EVENT_IDX negotiated,
+        * additionally ask the host to interrupt us once it passes
+        * our consumed index.
+        */
+       vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+       if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX)
+               vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx;
+
+       cpu_mfence();
+
+       /*
+        * Additional items may have been consumed in the time between
+        * since we last checked and enabled interrupts above. Let our
+        * caller know so it processes the new entries.
+        */
+       if (vq->vq_used_cons_idx != vq->vq_ring.used->idx)
+               return (1);
+
+       return (0);
+}
+
+/*
+ * Ask the host to delay the next interrupt until more work has
+ * accumulated. Returns 1 if entries already meet the threshold and the
+ * caller should process them now.
+ */
+int
+virtqueue_postpone_intr(struct virtqueue *vq)
+{
+       uint16_t ndesc;
+
+       /*
+        * Postpone until at least half of the available descriptors
+        * have been consumed.
+        *
+        * XXX Adaptive factor? (Linux uses 3/4)
+        */
+       ndesc = (uint16_t)(vq->vq_ring.avail->idx - vq->vq_used_cons_idx) / 2;
+
+       /* Without EVENT_IDX there is no way to postpone; just re-enable. */
+       if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX)
+               vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx + ndesc;
+       else
+               vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+
+       cpu_mfence();
+
+       /*
+        * Enough items may have already been consumed to meet our
+        * threshold since we last checked. Let our caller know so
+        * it processes the new entries.
+        */
+       if (virtqueue_nused(vq) > ndesc)
+               return (1);
+
+       return (0);
+}
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+       /*
+        * Note this is only considered a hint to the host. With
+        * EVENT_IDX negotiated there is nothing to update here.
+        */
+       if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX)
+               return;
+
+       vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+/*
+ * Enqueue one request: 'readable' segments the host reads followed by
+ * 'writable' segments the host writes. 'cookie' identifies the request
+ * and is returned by virtqueue_dequeue() once the host consumes the
+ * chain. Returns 0, or ENOSPC/EMSGSIZE if too few free descriptors.
+ */
+int
+virtqueue_enqueue(struct virtqueue *vq, void *cookie, struct sglist *sg,
+    int readable, int writable)
+{
+       struct vq_desc_extra *dxp;
+       int needed;
+       uint16_t head_idx, idx;
+
+       needed = readable + writable;
+
+       VQASSERT(vq, cookie != NULL, "enqueuing with no cookie");
+       VQASSERT(vq, needed == sg->sg_nseg,
+           "segment count mismatch, %d, %d", needed, sg->sg_nseg);
+
+       if (needed < 1)
+               return (EINVAL);
+       if (vq->vq_free_cnt == 0)
+               return (ENOSPC);
+       if (vq->vq_free_cnt < needed)
+               return (EMSGSIZE);
+
+       /* Claim the head of the free descriptor chain for this request. */
+       head_idx = vq->vq_desc_head_idx;
+       VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
+       dxp = &vq->vq_descx[head_idx];
+
+       VQASSERT(vq, dxp->cookie == NULL,
+           "cookie already exists for index %d", head_idx);
+       dxp->cookie = cookie;
+       dxp->ndescs = needed;
+
+       /* Fill the descriptors; returns the new head of the free chain. */
+       idx = vq_ring_enqueue_segments(vq, vq->vq_ring.desc, head_idx,
+           sg, readable, writable);
+
+       vq->vq_desc_head_idx = idx;
+       vq->vq_free_cnt -= needed;
+       if (vq->vq_free_cnt == 0)
+               VQ_RING_ASSERT_CHAIN_TERM(vq);
+       else
+               VQ_RING_ASSERT_VALID_IDX(vq, idx);
+
+       /* Make the chain visible in the avail ring. */
+       vq_ring_update_avail(vq, head_idx);
+
+       return (0);
+}
+
+/*
+ * Dequeue one completed request, returning its cookie. If 'len' is
+ * non-NULL it receives the byte count the host reported for the chain.
+ * Returns NULL when the used ring holds no new entries.
+ */
+void *
+virtqueue_dequeue(struct virtqueue *vq, uint32_t *len)
+{
+       struct vring_used_elem *uep;
+       void *cookie;
+       uint16_t used_idx, desc_idx;
+
+       if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+               return (NULL);
+
+       used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1);
+       uep = &vq->vq_ring.used->ring[used_idx];
+
+       /* Fence before reading the entry the host just published. */
+       cpu_mfence();
+       desc_idx = (uint16_t) uep->id;
+       if (len != NULL)
+               *len = uep->len;
+
+       /* Return the descriptor chain to the free list. */
+       vq_ring_free_chain(vq, desc_idx);
+
+       cookie = vq->vq_descx[desc_idx].cookie;
+       VQASSERT(vq, cookie != NULL, "no cookie for index %d", desc_idx);
+       vq->vq_descx[desc_idx].cookie = NULL;
+
+       return (cookie);
+}
+
+void *
+virtqueue_poll(struct virtqueue *vq, uint32_t *len)
+{
+       void *cookie;
+
+       /* We only poll the virtqueue when dumping to virtio-blk. */
+       for (;;) {
+               cookie = virtqueue_dequeue(vq, len);
+               if (cookie != NULL)
+                       break;
+       }
+
+       return (cookie);
+}
+
+void *
+virtqueue_drain(struct virtqueue *vq, int *last)
+{
+       void *cookie;
+       int idx;
+
+       /*
+        * Resume scanning at *last for the next outstanding cookie,
+        * releasing its descriptor chain before handing it back.
+        */
+       for (idx = *last; idx < vq->vq_nentries; idx++) {
+               cookie = vq->vq_descx[idx].cookie;
+               if (cookie != NULL) {
+                       vq->vq_descx[idx].cookie = NULL;
+                       /* Free chain to keep free count consistent. */
+                       vq_ring_free_chain(vq, idx);
+                       *last = idx + 1;
+                       return (cookie);
+               }
+       }
+
+       *last = idx;
+       return (NULL);
+}
+
+/* Debugging aid: print the virtqueue's bookkeeping and ring indices. */
+void
+virtqueue_dump(struct virtqueue *vq)
+{
+
+       if (vq == NULL)
+               return;
+
+       kprintf("VQ: %s - size=%d; free=%d; used=%d; queued=%d; "
+           "desc_head_idx=%d; avail.idx=%d; used_cons_idx=%d; "
+           "used.idx=%d; avail.flags=0x%x; used.flags=0x%x\n",
+           vq->vq_name, vq->vq_nentries, vq->vq_free_cnt,
+           virtqueue_nused(vq), vq->vq_queued_cnt, vq->vq_desc_head_idx,
+           vq->vq_ring.avail->idx, vq->vq_used_cons_idx,
+           vq->vq_ring.used->idx, vq->vq_ring.avail->flags,
+           vq->vq_ring.used->flags);
+}
+
+/*
+ * Lay out the vring structures inside the queue's ring memory and link
+ * every descriptor into one free chain terminated by
+ * VQ_RING_DESC_CHAIN_END.
+ */
+static void
+vq_ring_init(struct virtqueue *vq)
+{
+       struct vring *vr;
+       int i, n;
+
+       n = vq->vq_nentries;
+       vr = &vq->vq_ring;
+
+       vring_init(vr, n, vq->vq_ring_mem, vq->vq_alignment);
+
+       /* Chain descriptors together; the last one ends the free list. */
+       for (i = 0; i < n; i++) {
+               if (i == n - 1)
+                       vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
+               else
+                       vr->desc[i].next = i + 1;
+       }
+}
+
+/*
+ * Publish a completed descriptor chain to the host: place its head
+ * index in the next avail ring slot and advance avail->idx.
+ */
+static void
+vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
+{
+       uint16_t avail_idx;
+
+       /*
+        * Place the head of the descriptor chain into the next slot and make
+        * it usable to the host. The chain is made available now rather than
+        * deferring to virtqueue_notify() in the hopes that if the host is
+        * currently running on another CPU, we can keep it processing the new
+        * descriptor.
+        */
+       avail_idx = vq->vq_ring.avail->idx & (vq->vq_nentries - 1);
+       vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+
+       /* Make the ring-slot write visible before the index update. */
+       cpu_mfence();
+       vq->vq_ring.avail->idx++;
+
+       /* Keep pending count until virtqueue_notify() for debugging. */
+       vq->vq_queued_cnt++;
+}
+
+/*
+ * Fill 'readable + writable' descriptors from the sglist, starting at
+ * head_idx and following the free chain's existing 'next' links.  The
+ * first 'readable' segments are device-readable; the remainder are
+ * flagged VRING_DESC_F_WRITE.  All but the last descriptor carry
+ * VRING_DESC_F_NEXT.  Returns the index one past the chain, i.e. the
+ * new head of the free list.
+ */
+static uint16_t
+vq_ring_enqueue_segments(struct virtqueue *vq, struct vring_desc *desc,
+    uint16_t head_idx, struct sglist *sg, int readable, int writable)
+{
+       struct sglist_seg *seg;
+       struct vring_desc *dp;
+       int i, needed;
+       uint16_t idx;
+
+       needed = readable + writable;
+
+       /* NB: loop increment reads dp->next of the just-filled descriptor. */
+       for (i = 0, idx = head_idx, seg = sg->sg_segs;
+            i < needed;
+            i++, idx = dp->next, seg++) {
+               VQASSERT(vq, idx != VQ_RING_DESC_CHAIN_END,
+                   "premature end of free desc chain");
+
+               dp = &desc[idx];
+               dp->addr = seg->ss_paddr;
+               dp->len = seg->ss_len;
+               dp->flags = 0;
+
+               if (i < needed - 1)
+                       dp->flags |= VRING_DESC_F_NEXT;
+               if (i >= readable)
+                       dp->flags |= VRING_DESC_F_WRITE;
+       }
+
+       return (idx);
+}
+
+/*
+ * Decide whether the host must be kicked for the entries queued since
+ * the last notify.  When VIRTIO_RING_F_EVENT_IDX was negotiated,
+ * compare against the avail-event index the host published; otherwise
+ * honor the host's VRING_USED_F_NO_NOTIFY suppression flag.
+ */
+static int
+vq_ring_must_notify_host(struct virtqueue *vq)
+{
+       uint16_t new_idx, prev_idx, event_idx;
+
+       if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX) {
+               new_idx = vq->vq_ring.avail->idx;
+               /* avail->idx before the vq_queued_cnt entries were added. */
+               prev_idx = new_idx - vq->vq_queued_cnt;
+               event_idx = vring_avail_event(&vq->vq_ring);
+
+               return (vring_need_event(event_idx, new_idx, prev_idx) != 0);
+       }
+
+       return ((vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) == 0);
+}
+
+/* Kick the host: tell the device this queue has new avail entries. */
+static void
+vq_ring_notify_host(struct virtqueue *vq)
+{
+       VIRTIO_BUS_NOTIFY_VQ(vq->vq_dev, vq->vq_queue_index);
+}
+
+/*
+ * Return the descriptor chain headed by desc_idx to the free list and
+ * credit its length back to vq_free_cnt.  The chain length recorded in
+ * vq_descx at enqueue time is counted down while walking the
+ * VRING_DESC_F_NEXT links, to assert the chain is intact.
+ */
+static void
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+       struct vring_desc *dp;
+       struct vq_desc_extra *dxp;
+
+       VQ_RING_ASSERT_VALID_IDX(vq, desc_idx);
+       dp = &vq->vq_ring.desc[desc_idx];
+       dxp = &vq->vq_descx[desc_idx];
+
+       /* A fully used queue implies the free head is the end marker. */
+       if (vq->vq_free_cnt == 0)
+               VQ_RING_ASSERT_CHAIN_TERM(vq);
+
+       vq->vq_free_cnt += dxp->ndescs;
+       dxp->ndescs--;
+
+       /* Walk to the tail, counting down the expected chain length. */
+       while (dp->flags & VRING_DESC_F_NEXT) {
+               VQ_RING_ASSERT_VALID_IDX(vq, dp->next);
+               dp = &vq->vq_ring.desc[dp->next];
+               dxp->ndescs--;
+       }
+       VQASSERT(vq, dxp->ndescs == 0, "failed to free entire desc chain");
+
+       /*
+        * We must append the existing free chain, if any, to the end of
+        * newly freed chain. If the virtqueue was completely used, then
+        * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
+        */
+       dp->next = vq->vq_desc_head_idx;
+       vq->vq_desc_head_idx = desc_idx;
+}
diff --git a/sys/dev/virtual/virtio/virtio/virtqueue.h b/sys/dev/virtual/virtio/virtio/virtqueue.h
new file mode 100644 (file)
index 0000000..ceaa0e6
--- /dev/null
@@ -0,0 +1,95 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/dev/virtio/virtqueue.h,v 1.2 2012/04/14 05:48:04 grehan Exp $
+ */
+
+#ifndef _VIRTIO_VIRTQUEUE_H
+#define _VIRTIO_VIRTQUEUE_H
+
+#include <sys/types.h>
+
+struct virtqueue;
+struct sglist;
+struct spinlock;
+
+/* The guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ * The host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX                (1 << 29)
+
+/* Device callback for a virtqueue interrupt. */
+typedef int virtqueue_intr_t(void *);
+
+#define VIRTQUEUE_MAX_NAME_SZ  32
+
+/* One for each virtqueue the device wishes to allocate. */
+struct vq_alloc_info {
+       char               vqai_name[VIRTQUEUE_MAX_NAME_SZ]; /* queue name */
+       virtqueue_intr_t  *vqai_intr;     /* interrupt callback */
+       void              *vqai_intr_arg; /* argument passed to vqai_intr */
+       struct virtqueue **vqai_vq;       /* presumably receives the allocated
+                                            virtqueue — confirm vs. caller */
+};
+
+/*
+ * Initialize a vq_alloc_info: format the queue name from _str and the
+ * trailing arguments, and record the interrupt handler, its argument,
+ * and the virtqueue pointer location.  NOTE(review): _nsegs is accepted
+ * but unused here — consistent with indirect descriptor support having
+ * been removed from this port; kept for FreeBSD source compatibility.
+ */
+#define VQ_ALLOC_INFO_INIT(_i,_nsegs,_intr,_arg,_vqp,_str,...) do {    \
+       ksnprintf((_i)->vqai_name, VIRTQUEUE_MAX_NAME_SZ, _str,         \
+           ##__VA_ARGS__);                                             \
+       (_i)->vqai_intr = (_intr);                                      \
+       (_i)->vqai_intr_arg = (_arg);                                   \
+       (_i)->vqai_vq = (_vqp);                                         \
+} while (0)
+
+uint64_t virtqueue_filter_features(uint64_t features);
+
+int     virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
+            int align, vm_paddr_t highaddr, struct vq_alloc_info *info,
+            struct virtqueue **vqp);
+void   *virtqueue_drain(struct virtqueue *vq, int *last);
+void    virtqueue_free(struct virtqueue *vq);
+int     virtqueue_reinit(struct virtqueue *vq, uint16_t size);
+
+int     virtqueue_intr(struct virtqueue *vq);
+int     virtqueue_enable_intr(struct virtqueue *vq);
+int     virtqueue_postpone_intr(struct virtqueue *vq);
+void    virtqueue_disable_intr(struct virtqueue *vq);
+
+/* Get physical address of the virtqueue ring. */
+vm_paddr_t virtqueue_paddr(struct virtqueue *vq);
+
+int     virtqueue_full(struct virtqueue *vq);
+int     virtqueue_empty(struct virtqueue *vq);
+int     virtqueue_size(struct virtqueue *vq);
+int     virtqueue_nused(struct virtqueue *vq);
+void    virtqueue_notify(struct virtqueue *vq, struct spinlock *);
+void    virtqueue_dump(struct virtqueue *vq);
+
+int     virtqueue_enqueue(struct virtqueue *vq, void *cookie,
+            struct sglist *sg, int readable, int writable);
+void   *virtqueue_dequeue(struct virtqueue *vq, uint32_t *len);
+void   *virtqueue_poll(struct virtqueue *vq, uint32_t *len);
+
+#endif /* _VIRTIO_VIRTQUEUE_H */