MPSAFE - tsleep_interlock, BUF/BIO, cluster, swap_pager.
author     Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 15 Jul 2009 02:31:18 +0000 (19:31 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 15 Jul 2009 02:31:18 +0000 (19:31 -0700)
* tsleep_interlock()/tsleep() could miss wakeups during periods of
  heavy cpu activity.  Code in between the two calls could try to send
  an IPI (say, by issuing a wakeup()), and while sending the IPI the
  kernel would be forced to process incoming IPIs synchronously to
  avoid a deadlock, so a wakeup aimed at the interlocked thread could
  be consumed before it had actually gone to sleep.

  The new tsleep_interlock()/tsleep() code adds another TAILQ_ENTRY to
  the thread structure, allowing tsleep_interlock() to formally place
  the thread on the appropriate sleep queue without having to deschedule
  it.  Any wakeup which occurs between the interlock and the real
  tsleep() call removes the thread from the queue, and the later
  tsleep() call recognizes this and simply returns without sleeping.

  The new tsleep() call requires PINTERLOCKED to be passed in its flags
  so that it knows the thread has already been placed on a sleep queue.
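
  A minimal sketch of the new calling convention (foo_lock_sleep() and
  its arguments are placeholder names, not from the commit; the real
  conversions, such as sim_lock_sleep(), appear in the diff below):

	static int
	foo_lock_sleep(void *ident, struct lock *lk, const char *wmesg,
		       int timo)
	{
		int error;

		tsleep_interlock(ident, 0);	/* queue on sleepq, stay scheduled */
		lockmgr(lk, LK_RELEASE);	/* wakeups in this window are no longer lost */
		error = tsleep(ident, PINTERLOCKED, wmesg, timo);
		lockmgr(lk, LK_EXCLUSIVE);
		return (error);
	}

  The critical section bracket that callers previously needed around the
  pair is no longer required; see the crit_enter()/crit_exit() removals
  throughout the diff below.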

* Continue making BUF/BIO MPSAFE.  Remove B_ASYNC and B_WANT from
  buf->b_flags and add a new bio->bio_flags field to the bio, with
  BIO_SYNC, BIO_WANT, and BIO_DONE flags.  Use atomic_cmpset_int()
  (aka cmpxchg) to interlock biodone() against biowait().

  vn_strategy() and dev_dstrategy() call semantics now require
  synchronous BIOs to install a bio_done function and set BIO_SYNC in
  the bio.
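
  A minimal sketch of the new synchronous BIO convention, modeled on the
  geteblk()-based readers converted in the diff below ("dev", "blksize"
  and the "exsrd" wmesg are placeholder names, not from the commit):

	struct buf *bp;
	int error;

	bp = geteblk(blksize);
	bp->b_bio1.bio_offset = (off_t)0;
	bp->b_bio1.bio_done = biodone_sync;	/* synchronous BIOs must install this */
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bcount = blksize;
	bp->b_cmd = BUF_CMD_READ;
	dev_dstrategy(dev, &bp->b_bio1);
	error = biowait(&bp->b_bio1, "exsrd");	/* biowait() now takes the bio and a wmesg */
	bp->b_flags |= B_INVAL | B_AGE;
	brelse(bp);

  biowait() and the installed biodone_sync() interlock on the new
  BIO_DONE/BIO_WANT bits via atomic_cmpset_int().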

* Clean up the cluster code a bit.

* Redo the swap_pager code.  Instead of issuing I/O during the collection,
  which depended on critical sections to avoid races in the cluster append,
  we now build the entire collection first and then dispatch the I/O.
  This allows us to use only async completion for the BIOs, instead of
  a hybrid sync-or-async completion.

65 files changed:
sys/bus/cam/cam_sim.c
sys/bus/cam/cam_xpt.c
sys/dev/disk/ahci/ahci_dragonfly.c
sys/dev/disk/aic7xxx/aic_osm_lib.c
sys/dev/disk/ata/ata-raid.c
sys/dev/disk/fd/fd.c
sys/dev/disk/nata/ata-raid.c
sys/dev/disk/sili/sili_dragonfly.c
sys/dev/disk/vn/vn.c
sys/dev/drm/drmP.h
sys/dev/drm/drm_drv.c
sys/dev/drm/drm_lock.c
sys/dev/drm/radeon_cp.c
sys/dev/netif/iwi/if_iwi.c
sys/dev/raid/aac/aac.c
sys/dev/raid/vinum/vinuminterrupt.c
sys/dev/raid/vinum/vinumio.c
sys/dev/raid/vinum/vinumrequest.c
sys/dev/raid/vinum/vinumrevive.c
sys/dev/sound/pcm/sound.c
sys/kern/kern_device.c
sys/kern/kern_lock.c
sys/kern/kern_physio.c
sys/kern/kern_synch.c
sys/kern/kern_umtx.c
sys/kern/lwkt_ipiq.c
sys/kern/lwkt_serialize.c
sys/kern/lwkt_thread.c
sys/kern/subr_bus.c
sys/kern/subr_diskgpt.c
sys/kern/subr_disklabel32.c
sys/kern/subr_disklabel64.c
sys/kern/subr_diskmbr.c
sys/kern/sys_pipe.c
sys/kern/vfs_aio.c
sys/kern/vfs_bio.c
sys/kern/vfs_cluster.c
sys/kern/vfs_subr.c
sys/kern/vfs_vnops.c
sys/net/tap/if_tap.c
sys/netproto/smb/smb_subr.c
sys/sys/bio.h
sys/sys/buf.h
sys/sys/buf2.h
sys/sys/systm.h
sys/sys/thread.h
sys/vfs/gnu/ext2fs/ext2_bmap.c
sys/vfs/gnu/ext2fs/ext2_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_subs.c
sys/vfs/mfs/mfs_vnops.c
sys/vfs/nfs/nfs_bio.c
sys/vfs/nfs/nfs_serv.c
sys/vfs/nfs/nfs_syscalls.c
sys/vfs/nfs/nfs_vnops.c
sys/vfs/nwfs/nwfs_io.c
sys/vfs/nwfs/nwfs_vnops.c
sys/vfs/smbfs/smbfs_io.c
sys/vfs/smbfs/smbfs_vnops.c
sys/vfs/specfs/spec_vnops.c
sys/vfs/ufs/ffs_balloc.c
sys/vfs/ufs/ffs_inode.c
sys/vfs/ufs/ffs_rawread.c
sys/vfs/ufs/ufs_bmap.c
sys/vm/swap_pager.c

index 6a9f021..fcb770d 100644 (file)
@@ -101,8 +101,7 @@ sim_lock_sleep(void *ident, int flags, const char *wmesg, int timo,
        if (lock != &sim_mplock) {
                /* lock should be held already */
                KKASSERT(lockstatus(lock, curthread) != 0);
-               crit_enter();
-               tsleep_interlock(ident);
+               tsleep_interlock(ident, flags);
                lockmgr(lock, LK_RELEASE);
                retval = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        } else {
@@ -111,7 +110,6 @@ sim_lock_sleep(void *ident, int flags, const char *wmesg, int timo,
 
        if (lock != &sim_mplock) {
                lockmgr(lock, LK_EXCLUSIVE);
-               crit_exit();
        }
 
        return (retval);
index 41e8824..be9dcef 100644 (file)
@@ -1416,11 +1416,9 @@ xpt_scanner_thread(void *dummy)
 #endif
                }
                xsoftc.ccb_scanq_running = 0;
-               crit_enter();
-               tsleep_interlock(&xsoftc.ccb_scanq);
+               tsleep_interlock(&xsoftc.ccb_scanq, 0);
                xpt_unlock_buses();
                tsleep(&xsoftc.ccb_scanq, PINTERLOCKED, "ccb_scanq", 0);
-               crit_exit();
        }
 }
 
index 5c54583..7ecc319 100644 (file)
@@ -293,11 +293,9 @@ ahci_port_thread(void *arg)
        while ((mask & AP_SIGF_STOP) == 0) {
                atomic_clear_int(&ap->ap_signal, mask);
                ahci_port_thread_core(ap, mask);
-               crit_enter();
-               tsleep_interlock(&ap->ap_thread);
+               tsleep_interlock(&ap->ap_thread, 0);
                if (ap->ap_signal == 0)
                        tsleep(&ap->ap_thread, PINTERLOCKED, "ahport", 0);
-               crit_exit();
                mask = ap->ap_signal;
        }
        ap->ap_thread = NULL;
index d30736a..d3268d5 100644 (file)
@@ -101,12 +101,10 @@ aic_terminate_recovery_thread(struct aic_softc *aic)
         * Sleep on a slightly different location 
         * for this interlock just for added safety.
         */
-       crit_enter();
        aic_lock(aic);
-       tsleep_interlock(aic->platform_data);
+       tsleep_interlock(aic->platform_data, 0);
        aic_unlock(aic);
        tsleep(aic->platform_data, PINTERLOCKED, "thtrm", 0);
-       crit_exit();
 }
 
 static void
@@ -120,12 +118,10 @@ aic_recovery_thread(void *arg)
                
                if (LIST_EMPTY(&aic->timedout_scbs) != 0
                 && (aic->flags & AIC_SHUTDOWN_RECOVERY) == 0) {
-                       crit_enter();
-                       tsleep_interlock(aic);
+                       tsleep_interlock(aic, 0);
                        aic_unlock(aic);
                        tsleep(aic, PINTERLOCKED, "idle", 0);
                        aic_lock(aic);
-                       crit_exit();
                }
 
                if ((aic->flags & AIC_SHUTDOWN_RECOVERY) != 0)
index 68d90cc..13e7604 100644 (file)
@@ -66,7 +66,6 @@ static struct dev_ops ar_ops = {
 /* prototypes */
 static void ar_attach_raid(struct ar_softc *, int);
 static void ar_done(struct bio *);
-static void ar_sync_done(struct bio *);
 static void ar_config_changed(struct ar_softc *, int);
 static int ar_rebuild(struct ar_softc *);
 static int ar_highpoint_read_conf(struct ad_softc *, struct ar_softc **);
@@ -730,13 +729,6 @@ ar_done(struct bio *bio)
 }
 
 static void
-ar_sync_done(struct bio *bio)
-{
-    bio->bio_buf->b_cmd = BUF_CMD_DONE;
-    wakeup(bio);
-}
-
-static void
 ar_config_changed(struct ar_softc *rdp, int writeback)
 {
     int disk, flags;
@@ -1400,10 +1392,12 @@ ar_rw(struct ad_softc *adp, u_int32_t lba, int count, caddr_t data, int flags)
     bp->b_data = data;
     bp->b_bio1.bio_offset = (off_t)lba << DEV_BSHIFT;
     bp->b_bcount = count;
-    if (flags & AR_WAIT)
-       bp->b_bio1.bio_done = ar_sync_done;
-    else
+    if (flags & AR_WAIT) {
+       bp->b_bio1.bio_flags |= BIO_SYNC;
+       bp->b_bio1.bio_done = biodone_sync;
+    } else {
        bp->b_bio1.bio_done = ar_rw_done;
+    }
     if (flags & AR_READ)
        bp->b_cmd = BUF_CMD_READ;
     if (flags & AR_WRITE)
@@ -1413,11 +1407,14 @@ ar_rw(struct ad_softc *adp, u_int32_t lba, int count, caddr_t data, int flags)
     dev_dstrategy(adp->dev, &bp->b_bio1);
 
     if (flags & AR_WAIT) {
-       while ((retry++ < (15*hz/10)) && (error = !(bp->b_cmd == BUF_CMD_DONE)))
-           error = tsleep(&bp->b_bio1, 0, "arrw", 10);
+       while (retry++ < (15*hz/10))
+           error = biowait_timeout(&bp->b_bio1, "arrw", 10);
        if (!error && (bp->b_flags & B_ERROR))
            error = bp->b_error;
-       kfree(bp, M_AR);
+       if (error == EWOULDBLOCK)
+           bp->b_bio1.bio_done = ar_rw_done;
+       else
+           kfree(bp, M_AR);
     }
     return error;
 }
index a50f848..b595da3 100644 (file)
@@ -2214,13 +2214,6 @@ retrier(struct fdc_data *fdc)
        return (1);
 }
 
-static void
-fdformat_wakeup(struct bio *bio)
-{
-       bio->bio_buf->b_cmd = BUF_CMD_DONE;
-       wakeup(bio);
-}
-
 static int
 fdformat(cdev_t dev, struct fd_formb *finfo, struct ucred *cred)
 {
@@ -2246,7 +2239,8 @@ fdformat(cdev_t dev, struct fd_formb *finfo, struct ucred *cred)
                (fd->ft.sectrac * fd->ft.heads)
                + finfo->head * fd->ft.sectrac) * fdblk;
        bp->b_bio1.bio_driver_info = dev;
-       bp->b_bio1.bio_done = fdformat_wakeup;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
+       bp->b_bio1.bio_done = biodone_sync;
 
        bp->b_bcount = sizeof(struct fd_idfield_data) * finfo->fd_formb_nsecs;
        bp->b_data = (caddr_t)finfo;
@@ -2255,14 +2249,7 @@ fdformat(cdev_t dev, struct fd_formb *finfo, struct ucred *cred)
        dev_dstrategy(dev, &bp->b_bio1);
 
        /* ...and wait for it to complete */
-       crit_enter();
-       while (bp->b_cmd != BUF_CMD_DONE) {
-               rv = tsleep(&bp->b_bio1, 0, "fdform", 20 * hz);
-               if (rv == EWOULDBLOCK)
-                       break;
-       }
-       crit_exit();
-
+       rv = biowait_timeout(&bp->b_bio1, "fdform", 20 * hz);
        if (rv == EWOULDBLOCK) {
                /* timed out */
                rv = EIO;
index 1d50afe..1231066 100644 (file)
@@ -899,12 +899,14 @@ ata_raid_dump(struct dev_dump_args *ap)
        /* bio_offset is byte granularity, convert block granularity a_blkno */
        dbuf.b_bio1.bio_offset = (off_t)(ap->a_blkno << DEV_BSHIFT);
        dbuf.b_bio1.bio_caller_info1.ptr = (void *)rdp;
+       dbuf.b_bio1.bio_flags |= BIO_SYNC;
+       dbuf.b_bio1.bio_done = biodone_sync;
        dbuf.b_bcount = dumppages * PAGE_SIZE;
        dbuf.b_data = va;
        dbuf.b_cmd = BUF_CMD_WRITE;
        dev_dstrategy(rdp->cdev, &dbuf.b_bio1);
        /* wait for completion, unlock the buffer, check status */
-       if (biowait(&dbuf)) {
+       if (biowait(&dbuf.b_bio1, "dumpw")) {
            BUF_UNLOCK(&dbuf);
            return(dbuf.b_error ? dbuf.b_error : EIO);
        }
index 6641094..3de0914 100644 (file)
@@ -293,11 +293,9 @@ sili_port_thread(void *arg)
        while ((mask & AP_SIGF_STOP) == 0) {
                atomic_clear_int(&ap->ap_signal, mask);
                sili_port_thread_core(ap, mask);
-               crit_enter();
-               tsleep_interlock(&ap->ap_thread);
+               tsleep_interlock(&ap->ap_thread, 0);
                if (ap->ap_signal == 0)
                        tsleep(&ap->ap_thread, PINTERLOCKED, "ahport", 0);
-               crit_exit();
                mask = ap->ap_signal;
        }
        ap->ap_thread = NULL;
index decaa74..a3f06f5 100644 (file)
@@ -274,8 +274,6 @@ vnopen(struct dev_open_args *ap)
  *     Run strategy routine for VN device.  We use VOP_READ/VOP_WRITE calls
  *     for vnode-backed vn's, and the new vm_pager_strategy() call for
  *     vm_object-backed vn's.
- *
- *     Currently B_ASYNC is only partially handled - for OBJT_SWAP I/O only.
  */
 static int
 vnstrategy(struct dev_strategy_args *ap)
index 7cfce1a..7b7a3fe 100644 (file)
@@ -289,12 +289,10 @@ for ( ret = 0 ; !ret && !(condition) ; ) {                        \
        DRM_UNLOCK();                                           \
        lwkt_serialize_enter(&dev->irq_lock);                   \
        if (!(condition)) {                                     \
-            crit_enter();                                      \
-            tsleep_interlock(&(queue));                                \
+            tsleep_interlock(&(queue), PCATCH);                        \
             lwkt_serialize_exit(&dev->irq_lock);               \
             ret = -tsleep(&(queue), PCATCH | PINTERLOCKED,     \
                          "drmwtq", (timeout));                 \
-            crit_exit();                                       \
        } else {                                                \
                lwkt_serialize_exit(&dev->irq_lock);            \
        }                                                       \
index 0cc834e..f4c2cf0 100644 (file)
@@ -660,12 +660,10 @@ int drm_close(struct dev_close_args *ap)
                                break;  /* Got lock */
                        }
                        /* Contention */
-                       crit_enter();
-                       tsleep_interlock((void *)&dev->lock.lock_queue);
+                       tsleep_interlock((void *)&dev->lock.lock_queue, PCATCH);
                        DRM_UNLOCK();
                        retcode = tsleep((void *)&dev->lock.lock_queue,
-                           PCATCH | PINTERLOCKED, "drmlk2", 0);
-                       crit_exit();
+                                        PCATCH | PINTERLOCKED, "drmlk2", 0);
                        DRM_LOCK();
                        if (retcode)
                                break;
index 3306398..4e71c21 100644 (file)
@@ -78,12 +78,10 @@ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv)
                }
 
                /* Contention */
-               crit_enter();
-               tsleep_interlock((void *)&dev->lock.lock_queue);
+               tsleep_interlock((void *)&dev->lock.lock_queue, PCATCH);
                DRM_UNLOCK();
                ret = tsleep((void *)&dev->lock.lock_queue,
                             PCATCH | PINTERLOCKED, "drmlk2", 0);
-               crit_exit();
                DRM_LOCK();
                if (ret != 0)
                        break;
index fc9260e..db5d1fa 100644 (file)
@@ -1688,25 +1688,23 @@ void radeon_do_release(struct drm_device * dev)
                        if ((dev_priv->flags & RADEON_FAMILY_MASK) >= CHIP_R600) {
                                while ((ret = r600_do_cp_idle(dev_priv)) != 0) {
                                        DRM_DEBUG("radeon_do_cp_idle %d\n", ret);
-                                       crit_enter();
-                                       tsleep_interlock(&dev->lock.lock_queue);
+                                       tsleep_interlock(&dev->lock.lock_queue,
+                                                        PCATCH);
                                        DRM_UNLOCK();
                                        ret = tsleep(&dev->lock.lock_queue,
                                                    PCATCH | PINTERLOCKED,
                                                    "rdnrel", 0);
-                                       crit_exit();
                                        DRM_LOCK();
                                }
                        } else {
                                while ((ret = radeon_do_cp_idle(dev_priv)) != 0) {
                                        DRM_DEBUG("radeon_do_cp_idle %d\n", ret);
-                                       crit_enter();
-                                       tsleep_interlock(&dev->lock.lock_queue);
+                                       tsleep_interlock(&dev->lock.lock_queue,
+                                                        PCATCH);
                                        DRM_UNLOCK();
                                        ret = tsleep(&dev->lock.lock_queue,
                                                    PCATCH | PINTERLOCKED,
                                                    "rdnrel", 0);
-                                       crit_exit();
                                        DRM_LOCK();
                                }
                        }
index 1f33f68..d5d705d 100644 (file)
@@ -223,12 +223,10 @@ iwi_fw_monitor(void *arg)
                 * especially when attaching fails.
                 */
                if ((sc->flags & IWI_FLAG_EXIT) == 0) {
-                       crit_enter();
-                       tsleep_interlock(IWI_FW_WAKE_MONITOR(sc));
+                       tsleep_interlock(IWI_FW_WAKE_MONITOR(sc), 0);
                        lwkt_serialize_exit(ifp->if_serializer);
                        error = tsleep(IWI_FW_WAKE_MONITOR(sc),
                                       PINTERLOCKED, "iwifwm", 0);
-                       crit_exit();
                        lwkt_serialize_enter(ifp->if_serializer);
                }
 
@@ -261,13 +259,11 @@ iwi_fw_monitor(void *arg)
                                if (sc->flags & IWI_FLAG_EXIT)
                                        break;
 
-                               crit_enter();
-                               tsleep_interlock(IWI_FW_CMD_ACKED(sc));
+                               tsleep_interlock(IWI_FW_CMD_ACKED(sc), 0);
                                lwkt_serialize_exit(ifp->if_serializer);
                                error = tsleep(IWI_FW_CMD_ACKED(sc),
                                               PINTERLOCKED,
                                               "iwirun", boff * hz);
-                               crit_exit();
                                lwkt_serialize_enter(ifp->if_serializer);
                        }
                }
@@ -570,11 +566,9 @@ iwi_detach(device_t dev)
                sc->flags |= IWI_FLAG_EXIT;
                wakeup(IWI_FW_WAKE_MONITOR(sc));
 
-               crit_enter();
-               tsleep_interlock(IWI_FW_EXIT_MONITOR(sc));
+               tsleep_interlock(IWI_FW_EXIT_MONITOR(sc), 0);
                lwkt_serialize_exit(ifp->if_serializer);
                tsleep(IWI_FW_EXIT_MONITOR(sc), PINTERLOCKED, "iwiexi", 0);
-               crit_exit();
                /* No need to hold serializer again */
 
                if_printf(ifp, "fw monitor exited\n");
@@ -1626,11 +1620,9 @@ iwi_cmd(struct iwi_softc *sc, uint8_t type, void *data, uint8_t len, int async)
        if (!async) {
                ASSERT_SERIALIZED(ifp->if_serializer);
 
-               crit_enter();
-               tsleep_interlock(IWI_FW_CMD_ACKED(sc));
+               tsleep_interlock(IWI_FW_CMD_ACKED(sc), 0);
                lwkt_serialize_exit(ifp->if_serializer);
                ret = tsleep(IWI_FW_CMD_ACKED(sc), PINTERLOCKED, "iwicmd", hz);
-               crit_exit();
                lwkt_serialize_enter(ifp->if_serializer);
        } else {
                ret = 0;
@@ -2291,11 +2283,9 @@ iwi_load_firmware(struct iwi_softc *sc, void *fw, int size)
        CSR_WRITE_4(sc, IWI_CSR_CTL, tmp | IWI_CTL_ALLOW_STANDBY);
 
        /* wait at most one second for firmware initialization to complete */
-       crit_enter();
-       tsleep_interlock(IWI_FW_INITIALIZED(sc));
+       tsleep_interlock(IWI_FW_INITIALIZED(sc), 0);
        lwkt_serialize_exit(ifp->if_serializer);
        error = tsleep(IWI_FW_INITIALIZED(sc), PINTERLOCKED, "iwiinit", hz);
-       crit_exit();
        lwkt_serialize_enter(ifp->if_serializer);
        if (error != 0) {
                device_printf(sc->sc_dev, "timeout waiting for firmware "
index c869d89..ffe55a2 100644 (file)
@@ -921,13 +921,11 @@ aac_command_thread(struct aac_softc *sc)
        while ((sc->aifflags & AAC_AIFFLAGS_EXIT) == 0) {
                retval = 0;
                if ((sc->aifflags & AAC_AIFFLAGS_PENDING) == 0) {
-                       crit_enter();
-                       tsleep_interlock(sc->aifthread);
+                       tsleep_interlock(sc->aifthread, 0);
                        AAC_LOCK_RELEASE(&sc->aac_io_lock);
                        retval = tsleep(sc->aifthread, PINTERLOCKED,
                                        "aifthd", AAC_PERIODIC_INTERVAL * hz);
                        AAC_LOCK_ACQUIRE(&sc->aac_io_lock);
-                       crit_exit();
                }
                /*
                 * First see if any FIBs need to be allocated.  This needs
@@ -1367,12 +1365,10 @@ aac_wait_command(struct aac_command *cm)
        aac_startio(sc);
        /* Lock is held */
        KKASSERT(lockstatus(&sc->aac_io_lock, curthread) != 0);
-       crit_enter();
-       tsleep_interlock(cm);
+       tsleep_interlock(cm, 0);
        AAC_LOCK_RELEASE(&sc->aac_io_lock);
        error = tsleep(cm, PINTERLOCKED, "aacwait", 0);
        AAC_LOCK_ACQUIRE(&sc->aac_io_lock);
-       crit_exit();
        return(error);
 }
 
@@ -3135,12 +3131,10 @@ aac_ioctl_sendfib(struct aac_softc *sc, caddr_t ufib)
                event->ev_callback = aac_ioctl_event;
                event->ev_arg = &cm;
                aac_add_event(sc, event);
-               crit_enter();
-               tsleep_interlock(&cm);
+               tsleep_interlock(&cm, 0);
                AAC_LOCK_RELEASE(&sc->aac_io_lock);
                tsleep(&cm, PINTERLOCKED, "sendfib", 0);
                AAC_LOCK_ACQUIRE(&sc->aac_io_lock);
-               crit_exit();
        }
        AAC_LOCK_RELEASE(&sc->aac_io_lock);
 
index 09cff0d..24c35b5 100644 (file)
@@ -284,6 +284,7 @@ sdio_done(struct bio *bio)
        SD[sbp->sdno].writes++;
        SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
     }
+    biodone_sync(bio);
     biodone(sbp->bio);                                     /* complete the caller's I/O */
     BUF_UNLOCK(&sbp->b);
     BUF_LOCKFREE(&sbp->b);
index 3702fae..9595127 100644 (file)
@@ -290,11 +290,13 @@ driveio(struct drive *drive, char *buf, size_t length, off_t offset, buf_cmd_t c
        bp = geteblk(len);                                  /* get a buffer header */
        bp->b_cmd = cmd;
        bp->b_bio1.bio_offset = offset;                     /* disk offset */
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        saveaddr = bp->b_data;
        bp->b_data = buf;
        bp->b_bcount = len;
        vn_strategy(drive->vp, &bp->b_bio1);
-       error = biowait(bp);
+       error = biowait(&bp->b_bio1, (cmd == BUF_CMD_READ ? "drvrd" : "drvwr"));
        bp->b_data = saveaddr;
        bp->b_flags |= B_INVAL | B_AGE;
        bp->b_flags &= ~B_ERROR;
index 9e089f7..d809d4a 100644 (file)
@@ -813,7 +813,7 @@ build_rq_buffer(struct rqelement *rqe, struct plex *plex)
 
     /* Initialize the buf struct */
     /* copy these flags from user bp */
-    bp->b_flags = ubp->b_flags & (B_ORDERED | B_NOCACHE | B_ASYNC);
+    bp->b_flags = ubp->b_flags & (B_ORDERED | B_NOCACHE);
     bp->b_cmd = ubp->b_cmd;
 #ifdef VINUMDEBUG
     if (rqe->flags & XFR_BUFLOCKED)                        /* paranoia */
@@ -956,6 +956,7 @@ sdio(struct bio *bio)
     initbufbio(&sbp->b);
     sbp->b.b_bio1.bio_offset = bio->bio_offset + ((off_t)sd->driveoffset << DEV_BSHIFT);
     sbp->b.b_bio1.bio_done = sdio_done;                            /* come here on completion */
+    sbp->b.b_bio1.bio_flags |= BIO_SYNC;
     sbp->bio = bio;                                        /* note the address of the original header */
     sbp->sdno = sd->sdno;                                  /* note for statistics */
     sbp->driveno = sd->driveno;
index 6333ab9..c2904f2 100644 (file)
@@ -150,6 +150,8 @@ revive_block(int sdno)
        bp->b_bcount = size;
        bp->b_resid = bp->b_bcount;
        bp->b_bio1.bio_offset = (off_t)plexblkno << DEV_BSHIFT;             /* start here */
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        if (isstriped(plex))                                /* we need to lock striped plexes */
            lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
        if (vol != NULL)                                    /* it's part of a volume, */
@@ -163,7 +165,7 @@ revive_block(int sdno)
 
        bp->b_cmd = BUF_CMD_READ;
        vinumstart(dev, &bp->b_bio1, 1);
-       biowait(bp);
+       biowait(&bp->b_bio1, "drvrd");
     }
 
     if (bp->b_flags & B_ERROR)
@@ -178,7 +180,7 @@ revive_block(int sdno)
        bp->b_bio1.bio_offset = (off_t)sd->revived << DEV_BSHIFT;                   /* write it to here */
        bp->b_bio1.bio_driver_info = dev;
        sdio(&bp->b_bio1);                                  /* perform the I/O */
-       biowait(bp);
+       biowait(&bp->b_bio1, "drvwr");
        if (bp->b_flags & B_ERROR)
            error = bp->b_error;
        else {
@@ -293,7 +295,7 @@ parityops(struct vinum_ioctl_msg *data)
            pbp->b_cmd = BUF_CMD_WRITE;
            pbp->b_resid = pbp->b_bcount;
            sdio(&pbp->b_bio1);                             /* write the parity block */
-           biowait(pbp);
+           biowait(&pbp->b_bio1, "drvwr");
        }
        if (((op == checkparity)
                || (op == rebuildandcheckparity))
@@ -439,7 +441,7 @@ parityrebuild(struct plex *plex,
      */
     for (sdno = 0; sdno < plex->subdisks; sdno++) {        /* for each subdisk */
        if ((sdno != psd) || (op != rebuildparity)) {
-           biowait(bpp[sdno]);
+           biowait(&bpp[sdno]->b_bio1, "drvio");
            if (bpp[sdno]->b_flags & B_ERROR)               /* can't read, */
                error = bpp[sdno]->b_error;
            else if (sdno != psd) {                         /* update parity */
@@ -536,7 +538,7 @@ initsd(int sdno, int verify)
        bzero(bp->b_data, bp->b_bcount);
        bp->b_cmd = BUF_CMD_WRITE;
        sdio(&bp->b_bio1);                  /* perform the I/O */
-       biowait(bp);
+       biowait(&bp->b_bio1, "drvwr");
        if (bp->b_flags & B_ERROR)
            error = bp->b_error;
        if ((error == 0) && verify) {                       /* check that it got there */
@@ -546,7 +548,7 @@ initsd(int sdno, int verify)
            bp->b_bio1.bio_driver_info = VINUM_SD(sdno);
            bp->b_cmd = BUF_CMD_READ;               /* read it back */
            sdio(&bp->b_bio1);
-           biowait(bp);
+           biowait(&bp->b_bio1, "drvrd");
            /*
             * XXX Bug fix code.  This is hopefully no
             * longer needed (21 February 2000).
index 4acc95b..1fb2082 100644 (file)
@@ -128,12 +128,10 @@ snd_mtxsleep(void *addr, sndlock_t lock, int flags, const char *wmesg, int timo)
 {
        int r;
 
-       crit_enter();
-       tsleep_interlock(addr);
+       tsleep_interlock(addr, flags);
        snd_mtxunlock(lock);
        r = tsleep(addr, flags | PINTERLOCKED, wmesg, timo);
        snd_mtxlock(lock);
-       crit_exit();
        return(r);
 }
 
index 30bb88c..c6085bf 100644 (file)
@@ -247,6 +247,7 @@ dev_dstrategy(cdev_t dev, struct bio *bio)
            track = &dev->si_track_write;
        bio_track_ref(track);
        bio->bio_track = track;
+       KKASSERT((bio->bio_flags & BIO_DONE) == 0);
        (void)dev->si_ops->d_strategy(&ap);
 }
 
@@ -255,10 +256,12 @@ dev_dstrategy_chain(cdev_t dev, struct bio *bio)
 {
        struct dev_strategy_args ap;
 
-       KKASSERT(bio->bio_track != NULL);
        ap.a_head.a_desc = &dev_strategy_desc;
        ap.a_head.a_dev = dev;
        ap.a_bio = bio;
+
+       KKASSERT(bio->bio_track != NULL);
+       KKASSERT((bio->bio_flags & BIO_DONE) == 0);
        (void)dev->si_ops->d_strategy(&ap);
 }
 
index afcb251..9ae8167 100644 (file)
@@ -395,10 +395,10 @@ debuglockmgr(struct lock *lkp, u_int flags,
                        if (lkp->lk_lockholder != td &&
                            lkp->lk_lockholder != LK_KERNTHREAD) {
                                spin_unlock_wr(&lkp->lk_spinlock);
-                               panic("lockmgr: pid %d, not %s thr %p unlocking",
-                                   (td->td_proc ? td->td_proc->p_pid : -99),
+                               panic("lockmgr: pid %d, not %s thr %p/%p unlocking",
+                                   (td->td_proc ? td->td_proc->p_pid : -1),
                                    "exclusive lock holder",
-                                   lkp->lk_lockholder);
+                                   td, lkp->lk_lockholder);
                        }
                        if (lkp->lk_lockholder != LK_KERNTHREAD) {
                                COUNT(td, -1);
index 58e66d1..f930f4e 100644 (file)
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
-static void
-physwakeup(struct bio *bio)
-{
-       bio->bio_buf->b_cmd = BUF_CMD_DONE;
-       wakeup(bio);
-}
-
 static int
 physio(cdev_t dev, struct uio *uio, int ioflag)
 {
@@ -76,7 +69,8 @@ physio(cdev_t dev, struct uio *uio, int ioflag)
 
                        reinitbufbio(bp);       /* clear translation cache */
                        bp->b_bio1.bio_offset = uio->uio_offset;
-                       bp->b_bio1.bio_done = physwakeup;
+                       bp->b_bio1.bio_done = biodone_sync;
+                       bp->b_bio1.bio_flags |= BIO_SYNC;
 
                        /* 
                         * Setup for mapping the request into kernel memory.
@@ -133,10 +127,7 @@ physio(cdev_t dev, struct uio *uio, int ioflag)
                                bp->b_bcount = bcount;
                        }
                        dev_dstrategy(dev, &bp->b_bio1);
-                       crit_enter();
-                       while (bp->b_cmd != BUF_CMD_DONE)
-                               tsleep(&bp->b_bio1, 0, "physstr", 0);
-                       crit_exit();
+                       biowait(&bp->b_bio1, "physstr");
 
                        iolen = bp->b_bcount - bp->b_resid;
                        if (uio->uio_segflg == UIO_USERSPACE) {
index be4727b..23095b3 100644 (file)
@@ -111,7 +111,7 @@ static fixpt_t cexp[3] = {
 };
 
 static void    endtsleep (void *);
-static void    unsleep_and_wakeup_thread(struct thread *td);
+static void    tsleep_wakeup(struct thread *td);
 static void    loadav (void *arg);
 static void    schedcpu (void *arg);
 
@@ -326,6 +326,116 @@ sleep_gdinit(globaldata_t gd)
 }
 
 /*
+ * This is a dandy function that allows us to interlock tsleep/wakeup
+ * operations with unspecified upper level locks, such as lockmgr locks,
+ * simply by holding a critical section.  The sequence is:
+ *
+ *     (acquire upper level lock)
+ *     tsleep_interlock(blah)
+ *     (release upper level lock)
+ *     tsleep(blah, ...)
+ *
+ * Basically this function queues us on the tsleep queue without actually
+ * descheduling us.  When tsleep() is later called with PINTERLOCKED it
+ * assumes the thread was already queued, otherwise it queues it there.
+ *
+ * Thus it is possible to receive the wakeup prior to going to sleep and
+ * the race conditions are covered.
+ */
+static __inline void
+_tsleep_interlock(globaldata_t gd, void *ident, int flags)
+{
+       thread_t td = gd->gd_curthread;
+       int id;
+
+       crit_enter_quick(td);
+       if (td->td_flags & TDF_TSLEEPQ) {
+               id = LOOKUP(td->td_wchan);
+               TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_sleepq);
+               if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
+                       atomic_clear_int(&slpque_cpumasks[id], gd->gd_cpumask);
+       } else {
+               td->td_flags |= TDF_TSLEEPQ;
+       }
+       id = LOOKUP(ident);
+       TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[id], td, td_sleepq);
+       atomic_set_int(&slpque_cpumasks[id], gd->gd_cpumask);
+       td->td_wchan = ident;
+       td->td_wdomain = flags & PDOMAIN_MASK;
+       crit_exit_quick(td);
+}
+
+void
+tsleep_interlock(void *ident, int flags)
+{
+       _tsleep_interlock(mycpu, ident, flags);
+}
+
+/*
+ * Remove thread from sleepq.  Must be called with a critical section held.
+ */
+static __inline void
+_tsleep_remove(thread_t td)
+{
+       globaldata_t gd = mycpu;
+       int id;
+
+       KKASSERT(td->td_gd == gd);
+       if (td->td_flags & TDF_TSLEEPQ) {
+               td->td_flags &= ~TDF_TSLEEPQ;
+               id = LOOKUP(td->td_wchan);
+               TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_sleepq);
+               if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
+                       atomic_clear_int(&slpque_cpumasks[id], gd->gd_cpumask);
+               td->td_wchan = NULL;
+               td->td_wdomain = 0;
+       }
+}
+
+void
+tsleep_remove(thread_t td)
+{
+       _tsleep_remove(td);
+}
+
+/*
+ * This function removes a thread from the tsleep queue and schedules
+ * it.  This function may act asynchronously.  The target thread may be
+ * sleeping on a different cpu.
+ *
+ * This function must be called while in a critical section but if the
+ * target thread is sleeping on a different cpu we cannot safely probe
+ * td_flags.
+ */
+static __inline
+void
+_tsleep_wakeup(struct thread *td)
+{
+       globaldata_t gd = mycpu;
+
+#ifdef SMP
+       if (td->td_gd != gd) {
+               lwkt_send_ipiq(td->td_gd, (ipifunc1_t)tsleep_wakeup, td);
+               return;
+       }
+#endif
+       _tsleep_remove(td);
+       if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
+               td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
+               lwkt_schedule(td);
+       }
+}
+
+static
+void
+tsleep_wakeup(struct thread *td)
+{
+       _tsleep_wakeup(td);
+}
+
+
+/*
  * General sleep call.  Suspends the current process until a wakeup is
  * performed on the specified identifier.  The process will then be made
  * runnable with the specified priority.  Sleeps at most timo/hz seconds
@@ -387,7 +497,6 @@ tsleep(void *ident, int flags, const char *wmesg, int timo)
         * The entire sequence through to where we actually sleep must
         * run without breaking the critical section.
         */
-       id = LOOKUP(ident);
        catch = flags & PCATCH;
        error = 0;
        sig = 0;
@@ -451,23 +560,18 @@ tsleep(void *ident, int flags, const char *wmesg, int timo)
         *
         * Even the usched->release function just above can muff it up.
         */
-       if ((flags & PINTERLOCKED) &&
-           (slpque_cpumasks[id] & gd->gd_cpumask) == 0) {
-               logtsleep2(ilockfail, ident);
-               goto resume;
+       if (flags & PINTERLOCKED) {
+               if ((td->td_flags & TDF_TSLEEPQ) == 0) {
+                       logtsleep2(ilockfail, ident);
+                       goto resume;
+               }
+       } else {
+               id = LOOKUP(ident);
+               _tsleep_interlock(gd, ident, flags);
        }
-
-       /*
-        * Move our thread to the correct queue and setup our wchan, etc.
-        */
        lwkt_deschedule_self(td);
-       td->td_flags |= TDF_TSLEEPQ;
-       TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[id], td, td_threadq);
-       atomic_set_int(&slpque_cpumasks[id], gd->gd_cpumask);
-
-       td->td_wchan = ident;
+       td->td_flags |= TDF_TSLEEP_DESCHEDULED;
        td->td_wmesg = wmesg;
-       td->td_wdomain = flags & PDOMAIN_MASK;
 
        /*
         * Setup the timeout, if any
@@ -524,14 +628,16 @@ tsleep(void *ident, int flags, const char *wmesg, int timo)
        }
 
        /*
-        * Since td_threadq is used both for our run queue AND for the
-        * tsleep hash queue, we can't still be on it at this point because
-        * we've gotten cpu back.
+        * Make sure we have been removed from the sleepq.  This should
+        * have been done for us already.
         */
-       KASSERT((td->td_flags & TDF_TSLEEPQ) == 0, ("tsleep: impossible thread flags %08x", td->td_flags));
-       td->td_wchan = NULL;
+       _tsleep_remove(td);
        td->td_wmesg = NULL;
-       td->td_wdomain = 0;
+       if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
+               td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
+               kprintf("td %p (%s) unexpectedly rescheduled\n",
+                       td, td->td_comm);
+       }
 
        /*
         * Figure out the correct error return.  If interrupted by a
@@ -564,43 +670,6 @@ resume:
 }
 
 /*
- * This is a dandy function that allows us to interlock tsleep/wakeup
- * operations with unspecified upper level locks, such as lockmgr locks,
- * simply by holding a critical section.  The sequence is:
- *
- *     (enter critical section)
- *     (acquire upper level lock)
- *     tsleep_interlock(blah)
- *     (release upper level lock)
- *     tsleep(blah, ...)
- *     (exit critical section)
- *
- * Basically this function sets our cpumask for the ident which informs
- * other cpus that our cpu 'might' be waiting (or about to wait on) the
- * hash index related to the ident.  The critical section prevents another
- * cpu's wakeup() from being processed on our cpu until we are actually
- * able to enter the tsleep().  Thus, no race occurs between our attempt
- * to release a resource and sleep, and another cpu's attempt to acquire
- * a resource and call wakeup.
- *
- * There isn't much of a point to this function unless you call it while
- * holding a critical section.
- */
-static __inline void
-_tsleep_interlock(globaldata_t gd, void *ident)
-{
-       int id = LOOKUP(ident);
-
-       atomic_set_int(&slpque_cpumasks[id], gd->gd_cpumask);
-}
-
-void
-tsleep_interlock(void *ident)
-{
-       _tsleep_interlock(mycpu, ident);
-}
-
-/*
  * Interlocked spinlock sleep.  An exclusively held spinlock must
  * be passed to msleep().  The function will atomically release the
  * spinlock and tsleep on the ident, then reacquire the spinlock and
@@ -616,12 +685,10 @@ msleep(void *ident, struct spinlock *spin, int flags,
        globaldata_t gd = mycpu;
        int error;
 
-       crit_enter_gd(gd);
-       _tsleep_interlock(gd, ident);
+       _tsleep_interlock(gd, ident, flags);
        spin_unlock_wr_quick(gd, spin);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        spin_lock_wr_quick(gd, spin);
-       crit_exit_gd(gd);
 
        return (error);
 }
@@ -636,16 +703,15 @@ int
 serialize_sleep(void *ident, struct lwkt_serialize *slz, int flags,
                const char *wmesg, int timo)
 {
+       globaldata_t gd = mycpu;
        int ret;
 
        ASSERT_SERIALIZED(slz);
 
-       crit_enter();
-       tsleep_interlock(ident);
+       _tsleep_interlock(gd, ident, flags);
        lwkt_serialize_exit(slz);
        ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        lwkt_serialize_enter(slz);
-       crit_exit();
 
        return ret;
 }
@@ -657,7 +723,7 @@ serialize_sleep(void *ident, struct lwkt_serialize *slz, int flags,
  *
  * Setting TDF_SINTR will cause new signals to directly schedule us.
  *
- * This routine is typically called while in a critical section.
+ * This routine must be called while in a critical section.
  */
 int
 lwkt_sleep(const char *wmesg, int flags)
@@ -713,7 +779,7 @@ endtsleep(void *arg)
         * the cpu owning the thread.  proc flags are only manipulated
         * by the holder of the MP lock.  We have both.
         */
-       if (td->td_flags & TDF_TSLEEPQ) {
+       if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
                td->td_flags |= TDF_TIMEOUT;
 
                if ((lp = td->td_lwp) != NULL) {
@@ -721,43 +787,13 @@ endtsleep(void *arg)
                        if (lp->lwp_proc->p_stat != SSTOP)
                                setrunnable(lp);
                } else {
-                       unsleep_and_wakeup_thread(td);
+                       _tsleep_wakeup(td);
                }
        }
        crit_exit();
 }
 
 /*
- * Unsleep and wakeup a thread.  This function runs without the MP lock
- * which means that it can only manipulate thread state on the owning cpu,
- * and cannot touch the process state at all.
- */
-static
-void
-unsleep_and_wakeup_thread(struct thread *td)
-{
-       globaldata_t gd = mycpu;
-       int id;
-
-#ifdef SMP
-       if (td->td_gd != gd) {
-               lwkt_send_ipiq(td->td_gd, (ipifunc1_t)unsleep_and_wakeup_thread, td);
-               return;
-       }
-#endif
-       crit_enter();
-       if (td->td_flags & TDF_TSLEEPQ) {
-               td->td_flags &= ~TDF_TSLEEPQ;
-               id = LOOKUP(td->td_wchan);
-               TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_threadq);
-               if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
-                       atomic_clear_int(&slpque_cpumasks[id], gd->gd_cpumask);
-               lwkt_schedule(td);
-       }
-       crit_exit();
-}
-
-/*
  * Make all processes sleeping on the specified identifier runnable.
  * count may be zero or one only.
  *
@@ -787,20 +823,18 @@ _wakeup(void *ident, int domain)
        qp = &gd->gd_tsleep_hash[id];
 restart:
        for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
-               ntd = TAILQ_NEXT(td, td_threadq);
+               ntd = TAILQ_NEXT(td, td_sleepq);
                if (td->td_wchan == ident && 
                    td->td_wdomain == (domain & PDOMAIN_MASK)
                ) {
-                       KKASSERT(td->td_flags & TDF_TSLEEPQ);
-                       td->td_flags &= ~TDF_TSLEEPQ;
-                       TAILQ_REMOVE(qp, td, td_threadq);
-                       if (TAILQ_FIRST(qp) == NULL) {
-                               atomic_clear_int(&slpque_cpumasks[id],
-                                                gd->gd_cpumask);
+                       KKASSERT(td->td_gd == gd);
+                       _tsleep_remove(td);
+                       if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
+                               td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
+                               lwkt_schedule(td);
+                               if (domain & PWAKEUP_ONE)
+                                       goto done;
                        }
-                       lwkt_schedule(td);
-                       if (domain & PWAKEUP_ONE)
-                               goto done;
                        goto restart;
                }
        }
@@ -949,7 +983,7 @@ setrunnable(struct lwp *lp)
        if (lp->lwp_stat == LSSTOP)
                lp->lwp_stat = LSSLEEP;
        if (lp->lwp_stat == LSSLEEP && (lp->lwp_flag & LWP_BREAKTSLEEP))
-               unsleep_and_wakeup_thread(lp->lwp_thread);
+               _tsleep_wakeup(lp->lwp_thread);
        crit_exit();
 }
 
index b765651..3e76048 100644 (file)
@@ -132,7 +132,7 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
        }
        waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset);
        crit_enter();
-       tsleep_interlock(waddr);
+       tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX);
        if (*(int *)(sf_buf_kva(sf) + offset) == uap->value) {
            vm_page_init_action(&action, umtx_sleep_page_action_cow, waddr);
            vm_page_register_action(m, &action, VMEVENT_COW);
index 7a5c115..5d319a8 100644 (file)
@@ -600,13 +600,11 @@ lwkt_synchronize_ipiqs(const char *wmesg)
     other_cpumask = mycpu->gd_other_cpus & smp_active_mask;
     lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq, &other_cpumask);
 
-    crit_enter();
     while (other_cpumask != 0) {
-       tsleep_interlock(&other_cpumask);
+       tsleep_interlock(&other_cpumask, 0);
        if (other_cpumask != 0)
            tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
     }
-    crit_exit();
 }
 
 #endif
index a29fac6..3ad0eb3 100644 (file)
@@ -286,14 +286,13 @@ static void
 lwkt_serialize_sleep(void *info)
 {
     lwkt_serialize_t s = info;
-    crit_enter();
-    tsleep_interlock(s);
+
+    tsleep_interlock(s, 0);
     if (atomic_intr_cond_test(&s->interlock) != 0) {
        logslz(sleep_beg, s);
        tsleep(s, PINTERLOCKED, "slize", 0);
        logslz(sleep_end, s);
     }
-    crit_exit();
 }
 
 #ifdef SMP
@@ -336,14 +335,12 @@ lwkt_serialize_adaptive_sleep(void *arg)
            return;
     }
 
-    crit_enter();
-    tsleep_interlock(s);
+    tsleep_interlock(s, 0);
     if (atomic_intr_cond_test(&s->interlock) != 0) {
        logslz(sleep_beg, s);
        tsleep(s, PINTERLOCKED, "slize", 0);
        logslz(sleep_end, s);
     }
-    crit_exit();
 }
 
 #endif /* SMP */
index ea02c27..8d4198d 100644 (file)
@@ -1195,6 +1195,9 @@ lwkt_schedule_noresched(thread_t td)
  * At any point after lwkt_giveaway() is called, the target cpu may
  * 'pull' the thread by calling lwkt_acquire().
  *
+ * We have to make sure the thread is not sitting on a per-cpu tsleep
+ * queue or it will blow up when it moves to another cpu.
+ *
  * MPSAFE - must be called under very specific conditions.
  */
 void
@@ -1203,6 +1206,8 @@ lwkt_giveaway(thread_t td)
        globaldata_t gd = mycpu;
 
        crit_enter_gd(gd);
+       if (td->td_flags & TDF_TSLEEPQ)
+               tsleep_remove(td);
        KKASSERT(td->td_gd == gd);
        TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
        td->td_flags |= TDF_MIGRATING;
@@ -1322,6 +1327,10 @@ lwkt_setpri_self(int pri)
  * moving our thread to the tdallq of the target cpu, IPI messaging the
  * target cpu, and switching out.  TDF_MIGRATING prevents scheduling
  * races while the thread is being migrated.
+ *
+ * We must be sure to remove ourselves from the current cpu's tsleepq
+ * before potentially moving to another queue.  The thread can be on
+ * a tsleepq due to a left-over tsleep_interlock().
  */
 #ifdef SMP
 static void lwkt_setcpu_remote(void *arg);
@@ -1335,6 +1344,8 @@ lwkt_setcpu_self(globaldata_t rgd)
 
     if (td->td_gd != rgd) {
        crit_enter_quick(td);
+       if (td->td_flags & TDF_TSLEEPQ)
+               tsleep_remove(td);
        td->td_flags |= TDF_MIGRATING;
        lwkt_deschedule_self(td);
        TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
index 7d84e19..57b1c92 100644 (file)
@@ -223,11 +223,9 @@ devread(struct dev_read_args *ap)
                        lockmgr(&devsoftc.lock, LK_RELEASE);
                        return (EAGAIN);
                }
-               crit_enter();
-               tsleep_interlock(&devsoftc);
+               tsleep_interlock(&devsoftc, PCATCH);
                lockmgr(&devsoftc.lock, LK_RELEASE);
                rv = tsleep(&devsoftc, PCATCH | PINTERLOCKED, "devctl", 0);
-               crit_exit();
                lockmgr(&devsoftc.lock, LK_EXCLUSIVE);
                if (rv) {
                        /*
index ff9d8dc..d00a8a8 100644 (file)
@@ -90,10 +90,12 @@ gptinit(cdev_t dev, struct disk_info *info, struct diskslices **sspp)
        dname = dev_dname(wdev);
        bp1 = geteblk((int)info->d_media_blksize);
        bp1->b_bio1.bio_offset = info->d_media_blksize;
+       bp1->b_bio1.bio_done = biodone_sync;
+       bp1->b_bio1.bio_flags |= BIO_SYNC;
        bp1->b_bcount = info->d_media_blksize;
        bp1->b_cmd = BUF_CMD_READ;
        dev_dstrategy(wdev, &bp1->b_bio1);
-       if (biowait(bp1) != 0) {
+       if (biowait(&bp1->b_bio1, "gptrd") != 0) {
                kprintf("%s: reading GPT @ block 1: error %d\n",
                        dname, bp1->b_error);
                error = EIO;
@@ -141,10 +143,12 @@ gptinit(cdev_t dev, struct disk_info *info, struct diskslices **sspp)
         */
        bp2 = geteblk((int)(table_blocks * info->d_media_blksize));
        bp2->b_bio1.bio_offset = (off_t)table_lba * info->d_media_blksize;
+       bp2->b_bio1.bio_done = biodone_sync;
+       bp2->b_bio1.bio_flags |= BIO_SYNC;
        bp2->b_bcount = table_blocks * info->d_media_blksize;
        bp2->b_cmd = BUF_CMD_READ;
        dev_dstrategy(wdev, &bp2->b_bio1);
-       if (biowait(bp2) != 0) {
+       if (biowait(&bp2->b_bio1, "gptrd") != 0) {
                kprintf("%s: reading GPT partition table @ %lld: error %d\n",
                        dname,
                        (long long)bp2->b_bio1.bio_offset,
index 3372fe4..d9e29c5 100644 (file)
@@ -178,11 +178,13 @@ l32_readdisklabel(cdev_t dev, struct diskslice *sp, disklabel_t *lpp,
 
        bp = geteblk(secsize);
        bp->b_bio1.bio_offset = (off_t)LABELSECTOR32 * secsize;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = secsize;
        bp->b_flags &= ~B_INVAL;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(dev, &bp->b_bio1);
-       if (biowait(bp))
+       if (biowait(&bp->b_bio1, "labrd"))
                msg = "I/O error";
        else for (dlp = (struct disklabel32 *)bp->b_data;
            dlp <= (struct disklabel32 *)((char *)bp->b_data +
@@ -305,6 +307,8 @@ l32_writedisklabel(cdev_t dev, struct diskslices *ssp, struct diskslice *sp,
                return (EXDEV);                 /* not quite right */
        bp = geteblk((int)lp->d_secsize);
        bp->b_bio1.bio_offset = (off_t)LABELSECTOR32 * lp->d_secsize;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = lp->d_secsize;
 #if 1
        /*
@@ -317,7 +321,7 @@ l32_writedisklabel(cdev_t dev, struct diskslices *ssp, struct diskslice *sp,
        bp->b_flags &= ~B_INVAL;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(dkmodpart(dev, WHOLE_SLICE_PART), &bp->b_bio1);
-       error = biowait(bp);
+       error = biowait(&bp->b_bio1, "labrd");
        if (error)
                goto done;
        for (dlp = (struct disklabel32 *)bp->b_data;
@@ -333,9 +337,11 @@ l32_writedisklabel(cdev_t dev, struct diskslices *ssp, struct diskslice *sp,
                                error = EINVAL;
                        } else {
                                bp->b_cmd = BUF_CMD_WRITE;
+                               bp->b_bio1.bio_done = biodone_sync;
+                               bp->b_bio1.bio_flags |= BIO_SYNC;
                                dev_dstrategy(dkmodpart(dev, WHOLE_SLICE_PART),
                                              &bp->b_bio1);
-                               error = biowait(bp);
+                               error = biowait(&bp->b_bio1, "labwr");
                        }
                        goto done;
                }
@@ -348,8 +354,10 @@ done:
        *dlp = *lp;
        bp->b_flags &= ~B_INVAL;
        bp->b_cmd = BUF_CMD_WRITE;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        BUF_STRATEGY(bp, 1);
-       error = biowait(bp);
+       error = biowait(&bp->b_bio1, "labwr");
 #endif
        bp->b_flags |= B_INVAL | B_AGE;
        brelse(bp);
index ae85b27..9769c20 100644 (file)
@@ -126,12 +126,14 @@ l64_readdisklabel(cdev_t dev, struct diskslice *sp, disklabel_t *lpp,
 
        bp = geteblk(bpsize);
        bp->b_bio1.bio_offset = 0;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = bpsize;
        bp->b_flags &= ~B_INVAL;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(dev, &bp->b_bio1);
 
-       if (biowait(bp)) {
+       if (biowait(&bp->b_bio1, "labrd")) {
                msg = "I/O error";
        } else {
                dlp = (struct disklabel64 *)bp->b_data;
@@ -300,6 +302,8 @@ l64_writedisklabel(cdev_t dev, struct diskslices *ssp,
 
        bp = geteblk(bpsize);
        bp->b_bio1.bio_offset = 0;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = bpsize;
 
        /*
@@ -309,7 +313,7 @@ l64_writedisklabel(cdev_t dev, struct diskslices *ssp,
        bp->b_flags &= ~B_INVAL;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(dkmodpart(dev, WHOLE_SLICE_PART), &bp->b_bio1);
-       error = biowait(bp);
+       error = biowait(&bp->b_bio1, "labrd");
        if (error)
                goto done;
 
@@ -317,8 +321,10 @@ l64_writedisklabel(cdev_t dev, struct diskslices *ssp,
        bcopy(&lp->d_magic, &dlp->d_magic,
              sizeof(*lp) - offsetof(struct disklabel64, d_magic));
        bp->b_cmd = BUF_CMD_WRITE;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        dev_dstrategy(dkmodpart(dev, WHOLE_SLICE_PART), &bp->b_bio1);
-       error = biowait(bp);
+       error = biowait(&bp->b_bio1, "labwr");
 done:
        bp->b_flags |= B_INVAL | B_AGE;
        brelse(bp);
index 7bd2d08..08ae8e4 100644 (file)
@@ -124,10 +124,12 @@ reread_mbr:
        wdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), WHOLE_SLICE_PART);
        bp = geteblk((int)info->d_media_blksize);
        bp->b_bio1.bio_offset = (off_t)mbr_offset * info->d_media_blksize;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = info->d_media_blksize;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(wdev, &bp->b_bio1);
-       if (biowait(bp) != 0) {
+       if (biowait(&bp->b_bio1, "mbrrd") != 0) {
                diskerr(&bp->b_bio1, wdev, 
                        "reading primary partition table: error",
                        LOG_PRINTF, 0);
@@ -427,10 +429,12 @@ mbr_extended(cdev_t dev, struct disk_info *info, struct diskslices *ssp,
        /* Read extended boot record. */
        bp = geteblk((int)info->d_media_blksize);
        bp->b_bio1.bio_offset = (off_t)ext_offset * info->d_media_blksize;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bcount = info->d_media_blksize;
        bp->b_cmd = BUF_CMD_READ;
        dev_dstrategy(dev, &bp->b_bio1);
-       if (biowait(bp) != 0) {
+       if (biowait(&bp->b_bio1, "mbrrd") != 0) {
                diskerr(&bp->b_bio1, dev,
                        "reading extended partition table: error",
                        LOG_PRINTF, 0);
index 613dfee..e1c3fa4 100644 (file)
@@ -605,11 +605,9 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
                 * are held.
                 */
                rpipe->pipe_state |= PIPE_WANTR;
-               crit_enter();
-               tsleep_interlock(rpipe);
+               tsleep_interlock(rpipe, PCATCH);
                lwkt_reltoken(&wlock);
                error = tsleep(rpipe, PCATCH | PINTERLOCKED, "piperd", 0);
-               crit_exit();
                ++pipe_rblocked_count;
                if (error)
                        break;
index 48a2a6f..a68802e 100644 (file)
@@ -922,6 +922,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
        bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ?
                    BUF_CMD_WRITE : BUF_CMD_READ;
        bp->b_bio1.bio_done = aio_physwakeup;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        bp->b_bio1.bio_offset = cb->aio_offset;
 
        /* Bring buffer into kernel space. */
@@ -953,6 +954,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
        notify = 0;
        crit_enter();
        
+#if 0
        /*
         * If we had an error invoking the request, or an error in processing
         * the request before we have returned, we process it as an error in
@@ -979,6 +981,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
                        notify = 1;
                }
        }
+#endif
        crit_exit();
        if (notify)
                KNOTE(&aiocbe->klist, 0);
@@ -1004,18 +1007,12 @@ aio_fphysio(struct aiocblist *iocb)
 
        bp = iocb->bp;
 
-       crit_enter();
-       while (bp->b_cmd != BUF_CMD_DONE) {
-               if (tsleep(bp, 0, "physstr", aiod_timeout)) {
-                       if (bp->b_cmd != BUF_CMD_DONE) {
-                               crit_exit();
-                               return EINPROGRESS;
-                       } else {
-                               break;
-                       }
-               }
+       error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout);
+       if (error) {
+               if (error == EWOULDBLOCK)
+                       return EINPROGRESS;
+               break;
        }
-       crit_exit();
 
        /* Release mapping into kernel space. */
        vunmapbuf(bp);
@@ -2026,8 +2023,7 @@ aio_physwakeup(struct bio *bio)
                                        process_signal, aiocbe);
                }
        }
-       bp->b_cmd = BUF_CMD_DONE;
-       wakeup(bp);
+       biodone_sync(bio);
 }
 #endif /* VFS_AIO */
 
index 6ce3743..55b2276 100644 (file)
@@ -238,8 +238,8 @@ runningbufwakeup(struct buf *bp)
        int totalspace;
 
        if ((totalspace = bp->b_runningbufspace) != 0) {
-               runningbufspace -= totalspace;
-               --runningbufcount;
+               atomic_subtract_int(&runningbufspace, totalspace);
+               atomic_subtract_int(&runningbufcount, 1);
                bp->b_runningbufspace = 0;
                if (runningbufreq && runningbufspace <= lorunningspace) {
                        runningbufreq = 0;
@@ -305,6 +305,18 @@ waitrunningbufspace(int limit)
 }
 
 /*
+ * buf_dirty_count_severe:
+ *
+ *     Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+       return (runningbufspace + dirtybufspace >= hidirtybufspace ||
+               dirtybufcount >= nbuf / 2);
+}
+
+/*
  * vfs_buf_test_cache:
  *
  *     Called when a buffer is extended.  This function clears the B_CACHE
@@ -406,7 +418,6 @@ bd_wait(int totalspace)
 
        while (totalspace > 0) {
                bd_heatup();
-               crit_enter();
                if (totalspace > runningbufspace + dirtybufspace)
                        totalspace = runningbufspace + dirtybufspace;
                count = totalspace / BKVASIZE;
@@ -416,11 +427,9 @@ bd_wait(int totalspace)
                spin_lock_wr(&needsbuffer_spin);
                i = (bd_wake_index + count) & BD_WAKE_MASK;
                ++bd_wake_ary[i];
-               tsleep_interlock(&bd_wake_ary[i]);
+               tsleep_interlock(&bd_wake_ary[i], 0);
                spin_unlock_wr(&needsbuffer_spin);
-
                tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz);
-               crit_exit();
 
                totalspace = runningbufspace + dirtybufspace - hidirtybufspace;
        }
@@ -529,11 +538,10 @@ bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo)
         * Full-on.  Note that the wait flag may only be atomically set if
         * the active count is non-zero.
         */
-       crit_enter();   /* for tsleep_interlock */
        error = 0;
        while ((active = track->bk_active) != 0) {
                desired = active | 0x80000000;
-               tsleep_interlock(track);
+               tsleep_interlock(track, slp_flags);
                if (active == desired ||
                    atomic_cmpset_int(&track->bk_active, active, desired)) {
                        error = tsleep(track, slp_flags | PINTERLOCKED,
@@ -542,7 +550,6 @@ bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo)
                                break;
                }
        }
-       crit_exit();
        return (error);
 }
 
@@ -643,12 +650,14 @@ initbufbio(struct buf *bp)
        bp->b_bio1.bio_offset = NOOFFSET;
        bp->b_bio1.bio_next = &bp->b_bio2;
        bp->b_bio1.bio_done = NULL;
+       bp->b_bio1.bio_flags = 0;
 
        bp->b_bio2.bio_buf = bp;
        bp->b_bio2.bio_prev = &bp->b_bio1;
        bp->b_bio2.bio_offset = NOOFFSET;
        bp->b_bio2.bio_next = NULL;
        bp->b_bio2.bio_done = NULL;
+       bp->b_bio2.bio_flags = 0;
 }
 
 /*
@@ -802,14 +811,14 @@ bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp)
        /* if not found in cache, do some I/O */
        if ((bp->b_flags & B_CACHE) == 0) {
                get_mplock();
-               KASSERT(!(bp->b_flags & B_ASYNC),
-                       ("bread: illegal async bp %p", bp));
-               bp->b_flags &= ~(B_ERROR | B_INVAL);
+               bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
                bp->b_cmd = BUF_CMD_READ;
+               bp->b_bio1.bio_done = biodone_sync;
+               bp->b_bio1.bio_flags |= BIO_SYNC;
                vfs_busy_pages(vp, bp);
                vn_strategy(vp, &bp->b_bio1);
                rel_mplock();
-               return (biowait(bp));
+               return (biowait(&bp->b_bio1, "biord"));
        }
        return (0);
 }
@@ -837,8 +846,10 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
        /* if not found in cache, do some I/O */
        if ((bp->b_flags & B_CACHE) == 0) {
                get_mplock();
-               bp->b_flags &= ~(B_ERROR | B_INVAL);
+               bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
                bp->b_cmd = BUF_CMD_READ;
+               bp->b_bio1.bio_done = biodone_sync;
+               bp->b_bio1.bio_flags |= BIO_SYNC;
                vfs_busy_pages(vp, bp);
                vn_strategy(vp, &bp->b_bio1);
                ++readwait;
@@ -852,8 +863,7 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
 
                if ((rabp->b_flags & B_CACHE) == 0) {
                        rel_mplock();
-                       rabp->b_flags |= B_ASYNC;
-                       rabp->b_flags &= ~(B_ERROR | B_INVAL);
+                       rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
                        rabp->b_cmd = BUF_CMD_READ;
                        vfs_busy_pages(vp, rabp);
                        BUF_KERNPROC(rabp);
@@ -864,13 +874,15 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
                }
        }
        if (readwait)
-               rv = biowait(bp);
+               rv = biowait(&bp->b_bio1, "biord");
        return (rv);
 }
 
 /*
  * bwrite:
  *
+ *     Synchronous write, waits for completion.
+ *
  *     Write, release buffer on completion.  (Done by iodone
  *     if async).  Do not bother writing anything if the buffer
  *     is invalid.
@@ -884,25 +896,23 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
 int
 bwrite(struct buf *bp)
 {
-       int oldflags;
+       int error;
 
        if (bp->b_flags & B_INVAL) {
                brelse(bp);
                return (0);
        }
-
-       oldflags = bp->b_flags;
-
        if (BUF_REFCNTNB(bp) == 0)
                panic("bwrite: buffer is not busy???");
-       crit_enter();
 
        /* Mark the buffer clean */
        bundirty(bp);
 
-       bp->b_flags &= ~B_ERROR;
+       bp->b_flags &= ~(B_ERROR | B_EINTR);
        bp->b_flags |= B_CACHE;
        bp->b_cmd = BUF_CMD_WRITE;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
        vfs_busy_pages(bp->b_vp, bp);
 
        /*
@@ -915,16 +925,67 @@ bwrite(struct buf *bp)
                ++runningbufcount;
        }
 
-       crit_exit();
-       if (oldflags & B_ASYNC)
-               BUF_KERNPROC(bp);
        vn_strategy(bp->b_vp, &bp->b_bio1);
+       error = biowait(&bp->b_bio1, "biows");
+       brelse(bp);
+       return (error);
+}
 
-       if ((oldflags & B_ASYNC) == 0) {
-               int rtval = biowait(bp);
+/*
+ * bawrite:
+ *
+ *     Asynchronous write.  Start output on a buffer, but do not wait for
+ *     it to complete.  The buffer is released when the output completes.
+ *
+ *     bwrite() ( or the VOP routine anyway ) is responsible for handling
+ *     B_INVAL buffers.  Not us.
+ */
+void
+bawrite(struct buf *bp)
+{
+       if (bp->b_flags & B_INVAL) {
                brelse(bp);
-               return (rtval);
+               return;
+       }
+       if (BUF_REFCNTNB(bp) == 0)
+               panic("bawrite: buffer is not busy???");
+
+       /* Mark the buffer clean */
+       bundirty(bp);
+
+       bp->b_flags &= ~(B_ERROR | B_EINTR);
+       bp->b_flags |= B_CACHE;
+       bp->b_cmd = BUF_CMD_WRITE;
+       KKASSERT(bp->b_bio1.bio_done == NULL);
+       vfs_busy_pages(bp->b_vp, bp);
+
+       /*
+        * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
+        * valid for vnode-backed buffers.
+        */
+       bp->b_runningbufspace = bp->b_bufsize;
+       if (bp->b_runningbufspace) {
+               runningbufspace += bp->b_runningbufspace;
+               ++runningbufcount;
        }
+
+       BUF_KERNPROC(bp);
+       vn_strategy(bp->b_vp, &bp->b_bio1);
+}
+
+/*
+ * bowrite:
+ *
+ *     Ordered write.  Start output on a buffer, and flag it so that the
+ *     device will write it in the order it was queued.  The buffer is
+ *     released when the output completes.  bwrite() ( or the VOP routine
+ *     anyway ) is responsible for handling B_INVAL buffers.
+ */
+int
+bowrite(struct buf *bp)
+{
+       bp->b_flags |= B_ORDERED;
+       bawrite(bp);
        return (0);
 }
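With B_ASYNC gone, call sites now pick the write flavor explicitly rather than setting a flag
and calling bwrite().  A sketch of a typical flush path, mirroring the vinvalbuf/vtruncbuf
changes later in this patch (`wantsync` is illustrative):

        /* Flush a delayed-write buffer, synchronously or not. */
        bremfree(bp);
        if (wantsync)
                error = bwrite(bp);     /* waits via biowait(), brelse()s the buf */
        else
                bawrite(bp);            /* BUF_KERNPROC() + vn_strategy(); released on completion */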
 
@@ -1081,49 +1142,6 @@ bundirty(struct buf *bp)
 }
 
 /*
- * bawrite:
- *
- *     Asynchronous write.  Start output on a buffer, but do not wait for
- *     it to complete.  The buffer is released when the output completes.
- *
- *     bwrite() ( or the VOP routine anyway ) is responsible for handling 
- *     B_INVAL buffers.  Not us.
- */
-void
-bawrite(struct buf *bp)
-{
-       bp->b_flags |= B_ASYNC;
-       bwrite(bp);
-}
-
-/*
- * bowrite:
- *
- *     Ordered write.  Start output on a buffer, and flag it so that the 
- *     device will write it in the order it was queued.  The buffer is 
- *     released when the output completes.  bwrite() ( or the VOP routine
- *     anyway ) is responsible for handling B_INVAL buffers.
- */
-int
-bowrite(struct buf *bp)
-{
-       bp->b_flags |= B_ORDERED | B_ASYNC;
-       return (bwrite(bp));
-}
-
-/*
- * buf_dirty_count_severe:
- *
- *     Return true if we have too many dirty buffers.
- */
-int
-buf_dirty_count_severe(void)
-{
-       return (runningbufspace + dirtybufspace >= hidirtybufspace ||
-               dirtybufcount >= nbuf / 2);
-}
-
-/*
  * brelse:
  *
  *     Release a busy buffer and, if requested, free its resources.  The
@@ -1472,7 +1490,7 @@ brelse(struct buf *bp)
        /*
         * Clean up temporary flags and unlock the buffer.
         */
-       bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT);
+       bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT);
        BUF_UNLOCK(bp);
 }
 
@@ -1548,7 +1566,7 @@ bqrelse(struct buf *bp)
         * Final cleanup and unlock.  Clear bits that are only used while a
         * buffer is actively locked.
         */
-       bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_RELBUF);
+       bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF);
        BUF_UNLOCK(bp);
 }
 
@@ -1595,12 +1613,15 @@ vfs_vmio_release(struct buf *bp)
                         * no valid data.  We also free the page if the
                         * buffer was used for direct I/O.
                         */
+#if 0
                        if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
                                        m->hold_count == 0) {
                                vm_page_busy(m);
                                vm_page_protect(m, VM_PROT_NONE);
                                vm_page_free(m);
-                       } else if (bp->b_flags & B_DIRECT) {
+                       } else
+#endif
+                       if (bp->b_flags & B_DIRECT) {
                                vm_page_try_to_free(m);
                        } else if (vm_page_count_severe()) {
                                vm_page_try_to_cache(m);
@@ -1701,16 +1722,14 @@ vfs_bio_awrite(struct buf *bp)
                }
        }
 
-       bremfree(bp);
-       bp->b_flags |= B_ASYNC;
-
        /*
         * default (old) behavior, writing out only one block
         *
         * XXX returns b_bufsize instead of b_bcount for nwritten?
         */
        nwritten = bp->b_bufsize;
-       bwrite(bp);
+       bremfree(bp);
+       bawrite(bp);
 
        return nwritten;
 }
@@ -1916,7 +1935,6 @@ restart:
                if (qindex == BQUEUE_CLEAN) {
                        get_mplock();
                        if (bp->b_flags & B_VMIO) {
-                               bp->b_flags &= ~B_ASYNC;
                                get_mplock();
                                vfs_vmio_release(bp);
                                rel_mplock();
@@ -2160,7 +2178,6 @@ recoverbufpages(void)
 
                get_mplock();
                if (bp->b_flags & B_VMIO) {
-                       bp->b_flags &= ~B_ASYNC;
                        bp->b_flags |= B_DIRECT;    /* try to free pages */
                        vfs_vmio_release(bp);
                }
@@ -2560,6 +2577,10 @@ vfs_setdirty(struct buf *bp)
  *     Locate and return the specified buffer.  Unless flagged otherwise,
  *     a locked buffer will be returned if it exists or NULL if it does not.
  *
+ *     findblk()'d buffers are still on the bufqueues and if you intend
+ *     to use your (locked NON-TEST) buffer you need to bremfree(bp)
+ *     and possibly do other stuff to it.
+ *
  *     FINDBLK_TEST    - Do not lock the buffer.  The caller is responsible
  *                       for locking the buffer and ensuring that it remains
  *                       the desired buffer after locking.
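The findblk() note above is a caller-visible rule.  A minimal sketch of honoring it, assuming
the existing findblk(vp, loffset, flags) interface and a non-TEST lookup:

        bp = findblk(vp, loffset, 0);           /* returns a locked buf or NULL */
        if (bp) {
                bremfree(bp);                   /* still on the bufqueues until this */
                /* ... modify, rewrite, or invalidate the buffer ... */
                brelse(bp);                     /* or bqrelse()/bwrite() as appropriate */
        }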
@@ -3244,40 +3265,75 @@ allocbuf(struct buf *bp, int size)
 /*
  * biowait:
  *
- *     Wait for buffer I/O completion, returning error status.  The buffer
- *     is left locked on return.  B_EINTR is converted into an EINTR error
- *     and cleared.
+ *     Wait for buffer I/O completion, returning error status. B_EINTR
+ *     is converted into an EINTR error but not cleared (since a chain
+ *     of biowait() calls may occur).
  *
- *     NOTE!  The original b_cmd is lost on return, since b_cmd will be
- *     set to BUF_CMD_DONE.
+ *     On return bpdone() will have been called but the buffer will remain
+ *     locked and will not have been brelse()'d.
+ *
+ *     NOTE!  If a timeout is specified and ETIMEDOUT occurs the I/O is
+ *     likely still in progress on return.
+ *
+ *     NOTE!  This operation is on a BIO, not a BUF.
+ *
+ *     NOTE!  BIO_DONE is cleared by vn_strategy()
  *
  * MPSAFE
  */
-int
-biowait(struct buf *bp)
+static __inline int
+_biowait(struct bio *bio, const char *wmesg, int to)
 {
-       if (bp->b_cmd != BUF_CMD_DONE) {
-               crit_enter();
-               for (;;) {
-                       tsleep_interlock(bp);
-                       if (bp->b_cmd == BUF_CMD_DONE)
-                               break;
-                       if (bp->b_cmd == BUF_CMD_READ)
-                               tsleep(bp, PINTERLOCKED, "biord", 0);
+       struct buf *bp = bio->bio_buf;
+       u_int32_t flags;
+       u_int32_t nflags;
+       int error;
+
+       KKASSERT(bio == &bp->b_bio1);
+       for (;;) {
+               flags = bio->bio_flags;
+               if (flags & BIO_DONE)
+                       break;
+               nflags = flags | BIO_WANT;
+               tsleep_interlock(bio, 0);
+               if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
+                       if (wmesg)
+                               error = tsleep(bio, PINTERLOCKED, wmesg, to);
+                       else if (bp->b_cmd == BUF_CMD_READ)
+                               error = tsleep(bio, PINTERLOCKED, "biord", to);
                        else
-                               tsleep(bp, PINTERLOCKED, "biowr", 0);
+                               error = tsleep(bio, PINTERLOCKED, "biowr", to);
+                       if (error) {
+                               kprintf("tsleep error biowait %d\n", error);
+                               return (error);
+                       }
+                       break;
                }
-               crit_exit();
        }
-       if (bp->b_flags & B_EINTR) {
-               bp->b_flags &= ~B_EINTR;
+
+       /*
+        * Finish up.
+        */
+       KKASSERT(bp->b_cmd == BUF_CMD_DONE);
+       bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
+       if (bp->b_flags & B_EINTR)
                return (EINTR);
-       }
-       if (bp->b_flags & B_ERROR) {
+       if (bp->b_flags & B_ERROR)
                return (bp->b_error ? bp->b_error : EIO);
-       } else {
-               return (0);
-       }
+       return (0);
+}
+
+int
+biowait(struct bio *bio, const char *wmesg)
+{
+       return(_biowait(bio, wmesg, 0));
+}
+
+int
+biowait_timeout(struct bio *bio, const char *wmesg, int to)
+{
+       return(_biowait(bio, wmesg, to));
 }
 
 /*
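For reference, the caller-side convention the new biowait() expects is the one bread()/bwrite()
above now follow; roughly (sketch, not part of the patch):

        /* Issue a synchronous read on b_bio1 under the new rules. */
        bp->b_cmd = BUF_CMD_READ;
        bp->b_bio1.bio_done = biodone_sync;     /* required for sync BIOs */
        bp->b_bio1.bio_flags |= BIO_SYNC;
        vfs_busy_pages(vp, bp);
        vn_strategy(vp, &bp->b_bio1);
        error = biowait(&bp->b_bio1, "biord");  /* buf stays locked; brelse() when done */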
@@ -3308,19 +3364,20 @@ vn_strategy(struct vnode *vp, struct bio *bio)
                 track = &vp->v_track_read;
         else
                 track = &vp->v_track_write;
+       KKASSERT((bio->bio_flags & BIO_DONE) == 0);
        bio->bio_track = track;
        bio_track_ref(track);
         vop_strategy(*vp->v_ops, vp, bio);
 }
 
 /*
- * biodone:
+ * bpdone:
  *
- *     Finish I/O on a buffer, optionally calling a completion function.
- *     This is usually called from an interrupt so process blocking is
- *     not allowed.
+ *     Finish I/O on a buffer after all BIOs have been processed.
+ *     Called when the bio chain is exhausted or by biowait.  If called
+ *     by biowait, elseit is typically 0.
  *
- *     biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ *     bpdone is also responsible for setting B_CACHE in a B_VMIO bp.
  *     In a non-VMIO bp, B_CACHE will be set on the next getblk() 
  *     assuming B_INVAL is clear.
  *
@@ -3328,56 +3385,24 @@ vn_strategy(struct vnode *vp, struct bio *bio)
  *     read error occured, or if the op was a write.  B_CACHE is never
  *     set if the buffer is invalid or otherwise uncacheable.
  *
- *     biodone does not mess with B_INVAL, allowing the I/O routine or the
+ *     bpdone does not mess with B_INVAL, allowing the I/O routine or the
  *     initiator to leave B_INVAL set to brelse the buffer out of existance
  *     in the biodone routine.
  */
 void
-biodone(struct bio *bio)
+bpdone(struct buf *bp, int elseit)
 {
-       struct buf *bp = bio->bio_buf;
        buf_cmd_t cmd;
 
-       crit_enter();
-
        KASSERT(BUF_REFCNTNB(bp) > 0, 
                ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp)));
        KASSERT(bp->b_cmd != BUF_CMD_DONE, 
                ("biodone: bp %p already done!", bp));
 
-       runningbufwakeup(bp);
-
        /*
-        * Run up the chain of BIO's.   Leave b_cmd intact for the duration.
+        * No more BIOs are left.  All completion functions have been dealt
+        * with, now we clean up the buffer.
         */
-       while (bio) {
-               biodone_t *done_func; 
-               struct bio_track *track;
-
-               /*
-                * BIO tracking.  Most but not all BIOs are tracked.
-                */
-               if ((track = bio->bio_track) != NULL) {
-                       bio_track_rel(track);
-                       bio->bio_track = NULL;
-               }
-
-               /*
-                * A bio_done function terminates the loop.  The function
-                * will be responsible for any further chaining and/or 
-                * buffer management.
-                *
-                * WARNING!  The done function can deallocate the buffer!
-                */
-               if ((done_func = bio->bio_done) != NULL) {
-                       bio->bio_done = NULL;
-                       done_func(bio);
-                       crit_exit();
-                       return;
-               }
-               bio = bio->bio_prev;
-       }
-
        cmd = bp->b_cmd;
        bp->b_cmd = BUF_CMD_DONE;
 
@@ -3387,8 +3412,8 @@ biodone(struct bio *bio)
        if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) {
                if (cmd == BUF_CMD_FREEBLKS)
                        bp->b_flags |= B_NOCACHE;
-               brelse(bp);
-               crit_exit();
+               if (elseit)
+                       brelse(bp);
                return;
        }
 
@@ -3409,7 +3434,6 @@ biodone(struct bio *bio)
                bdirty(bp);
        }
 
-
        if (bp->b_flags & B_VMIO) {
                int i;
                vm_ooffset_t foff;
@@ -3444,10 +3468,13 @@ biodone(struct bio *bio)
                 * routines.
                 */
                iosize = bp->b_bcount - bp->b_resid;
-               if (cmd == BUF_CMD_READ && (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) {
+               if (cmd == BUF_CMD_READ &&
+                   (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) {
                        bp->b_flags |= B_CACHE;
                }
 
+               crit_enter();
+               get_mplock();
                for (i = 0; i < bp->b_xio.xio_npages; i++) {
                        int bogusflag = 0;
                        int resid;
@@ -3522,23 +3549,96 @@ biodone(struct bio *bio)
                }
                if (obj)
                        vm_object_pip_wakeupn(obj, 0);
+               rel_mplock();
+               crit_exit();
        }
 
        /*
-        * For asynchronous completions, release the buffer now. The brelse
-        * will do a wakeup there if necessary - so no need to do a wakeup
-        * here in the async case. The sync case always needs to do a wakeup.
+        * Finish up by releasing the buffer.  There are no more synchronous
+        * or asynchronous completions, those were handled by bio_done
+        * callbacks.
         */
-
-       if (bp->b_flags & B_ASYNC) {
-               if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
+       if (elseit) {
+               if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF))
                        brelse(bp);
                else
                        bqrelse(bp);
-       } else {
-               wakeup(bp);
        }
-       crit_exit();
+}
+
+/*
+ * Normal biodone.
+ */
+void
+biodone(struct bio *bio)
+{
+       struct buf *bp = bio->bio_buf;
+
+       runningbufwakeup(bp);
+
+       /*
+        * Run up the chain of BIO's.   Leave b_cmd intact for the duration.
+        */
+       while (bio) {
+               biodone_t *done_func;
+               struct bio_track *track;
+
+               /*
+                * BIO tracking.  Most but not all BIOs are tracked.
+                */
+               if ((track = bio->bio_track) != NULL) {
+                       bio_track_rel(track);
+                       bio->bio_track = NULL;
+               }
+
+               /*
+                * A bio_done function terminates the loop.  The function
+                * will be responsible for any further chaining and/or
+                * buffer management.
+                *
+                * WARNING!  The done function can deallocate the buffer!
+                */
+               if ((done_func = bio->bio_done) != NULL) {
+                       bio->bio_done = NULL;
+                       done_func(bio);
+                       return;
+               }
+               bio = bio->bio_prev;
+       }
+
+       /*
+        * If we've run out of bio's do normal [a]synchronous completion.
+        */
+       bpdone(bp, 1);
+}
+
+/*
+ * Synchronous biodone - this terminates a synchronous BIO.
+ *
+ * bpdone() is called with elseit=FALSE, leaving the buffer completed
+ * but still locked.  The caller must brelse() the buffer after waiting
+ * for completion.
+ */
+void
+biodone_sync(struct bio *bio)
+{
+       struct buf *bp = bio->bio_buf;
+       int flags;
+       int nflags;
+
+       KKASSERT(bio == &bp->b_bio1);
+       bpdone(bp, 0);
+
+       for (;;) {
+               flags = bio->bio_flags;
+               nflags = (flags | BIO_DONE) & ~BIO_WANT;
+
+               if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
+                       if (flags & BIO_WANT)
+                               wakeup(bio);
+                       break;
+               }
+       }
 }
 
 /*
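From the device side nothing sync-specific is needed: the driver completes the BIO as before
and the chain decides what happens next.  A hedged sketch with hypothetical driver state:

        static void
        example_xfer_done(struct example_softc *sc, struct bio *bio, int failed)
        {
                struct buf *bp = bio->bio_buf;

                if (failed) {
                        bp->b_flags |= B_ERROR;
                        bp->b_error = EIO;
                }
                /*
                 * Runs the bio chain: an installed bio_done (biodone_sync for
                 * synchronous requests) is called, otherwise bpdone(bp, 1)
                 * releases the buffer.
                 */
                biodone(bio);
        }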
@@ -3822,7 +3922,7 @@ vfs_bio_clrbuf(struct buf *bp)
        int i, mask = 0;
        caddr_t sa, ea;
        if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
-               bp->b_flags &= ~(B_INVAL|B_ERROR);
+               bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR);
                if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
                    (bp->b_loffset & PAGE_MASK) == 0) {
                        mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
index c0c11d8..31dbe66 100644 (file)
@@ -70,7 +70,7 @@ static struct cluster_save *
 static struct buf *
        cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
                            off_t doffset, int blksize, int run, 
-                           struct buf *fbp, int doasync);
+                           struct buf *fbp);
 static void cluster_callback (struct bio *);
 
 
@@ -114,7 +114,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
                maxra = nbuf/8;
 
        /*
-        * get the requested block
+        * Get the requested block.
         */
        *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
        origoffset = loffset;
@@ -158,6 +158,13 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
                off_t firstread = bp->b_loffset;
                int nblks;
 
+               /*
+                * Set-up synchronous read for bp.
+                */
+               bp->b_cmd = BUF_CMD_READ;
+               bp->b_bio1.bio_done = biodone_sync;
+               bp->b_bio1.bio_flags |= BIO_SYNC;
+
                KASSERT(firstread != NOOFFSET, 
                        ("cluster_read: no buffer offset"));
                if (firstread + totread > filesize)
@@ -181,7 +188,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
                                nblks = burstbytes / blksize;
 
                        bp = cluster_rbuild(vp, filesize, loffset,
-                                           doffset, blksize, nblks, bp, 0);
+                                           doffset, blksize, nblks, bp);
                        loffset += bp->b_bufsize;
                } else {
 single_block_read:
@@ -195,10 +202,11 @@ single_block_read:
        }
 
        /*
-        * Handle the synchronous read.  This only occurs if B_CACHE was
-        * not set.  bp (and rbp) could be either a cluster bp or a normal
-        * bp depending on the what cluster_rbuild() decided to do.  If
-        * it is a cluster bp, vfs_busy_pages() has already been called.
+        * If B_CACHE was not set issue bp.  bp will either be an
+        * asynchronous cluster buf or a synchronous single-buf.
+        * If it is a single buf it will be the same as reqbp.
+        *
+        * NOTE: Once an async cluster buf is issued bp becomes invalid.
         */
        if (bp) {
 #if defined(CLUSTERDEBUG)
@@ -206,19 +214,12 @@ single_block_read:
                        kprintf("S(%lld,%d,%d) ",
                            bp->b_loffset, bp->b_bcount, seqcount);
 #endif
-               bp->b_cmd = BUF_CMD_READ;
                if ((bp->b_flags & B_CLUSTER) == 0)
                        vfs_busy_pages(vp, bp);
                bp->b_flags &= ~(B_ERROR|B_INVAL);
-               if ((bp->b_flags & B_ASYNC) || bp->b_bio1.bio_done != NULL)
-                       BUF_KERNPROC(bp);
                vn_strategy(vp, &bp->b_bio1);
-               if (bp->b_flags & B_ERROR) {
-                       if ((error = bp->b_error) == 0)
-                               error = EIO;
-               } else {
-                       error = 0;
-               }
+               error = 0;
+               /* bp invalid now */
        }
 
        /*
@@ -267,11 +268,16 @@ single_block_read:
                if (ntoread > seqcount)
                        ntoread = seqcount;
 
+               /*
+                * rbp: async read
+                */
+               rbp->b_cmd = BUF_CMD_READ;
                rbp->b_flags |= B_RAM/* | B_AGE*/;
+
                if (burstbytes) {
                        rbp = cluster_rbuild(vp, filesize, loffset,
                                             doffset, blksize, 
-                                            ntoread, rbp, 1);
+                                            ntoread, rbp);
                } else {
                        rbp->b_bio2.bio_offset = doffset;
                }
@@ -290,30 +296,40 @@ single_block_read:
                }
 #endif
                rbp->b_flags &= ~(B_ERROR|B_INVAL);
-               rbp->b_flags |= B_ASYNC;
-               rbp->b_cmd = BUF_CMD_READ;
 
                if ((rbp->b_flags & B_CLUSTER) == 0)
                        vfs_busy_pages(vp, rbp);
-               BUF_KERNPROC(rbp);                      /* B_ASYNC */
+               BUF_KERNPROC(rbp);
                vn_strategy(vp, &rbp->b_bio1);
+               /* rbp invalid now */
        }
-no_read_ahead:
 
-       if (reqbp)
-               return (biowait(reqbp));
-       else
-               return (error);
+       /*
+        * Wait for our original buffer to complete its I/O.  reqbp will
+        * be NULL if the original buffer was B_CACHE.  We are returning
+        * (*bpp) which is the same as reqbp when reqbp != NULL.
+        */
+no_read_ahead:
+       if (reqbp) {
+               KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
+               error = biowait(&reqbp->b_bio1, "clurd");
+       }
+       return (error);
 }
 
 /*
  * If blocks are contiguous on disk, use this to provide clustered
  * read ahead.  We will read as many blocks as possible sequentially
  * and then parcel them up into logical blocks in the buffer hash table.
+ *
+ * This function either returns a cluster buf or it returns fbp.  fbp is
+ * already expected to be set up as a synchronous or asynchronous request.
+ *
+ * If a cluster buf is returned it will always be async.
  */
 static struct buf *
-cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, 
-       off_t doffset, int blksize, int run, struct buf *fbp, int doasync)
+cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
+              int blksize, int run, struct buf *fbp)
 {
        struct buf *bp, *tbp;
        off_t boffset;
@@ -335,8 +351,9 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
        }
 
        bp = trypbuf(&cluster_pbuf_freecnt);
-       if (bp == NULL)
+       if (bp == NULL) {
                return tbp;
+       }
 
        /*
         * We are synthesizing a buffer out of vm_page_t's, but
@@ -346,9 +363,9 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
         */
        bp->b_data = (char *)((vm_offset_t)bp->b_data |
            ((vm_offset_t)tbp->b_data & PAGE_MASK));
-       bp->b_flags |= B_ASYNC | B_CLUSTER | B_VMIO;
+       bp->b_flags |= B_CLUSTER | B_VMIO;
        bp->b_cmd = BUF_CMD_READ;
-       bp->b_bio1.bio_done = cluster_callback;
+       bp->b_bio1.bio_done = cluster_callback;         /* default to async */
        bp->b_bio1.bio_caller_info1.cluster_head = NULL;
        bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
        bp->b_loffset = loffset;
@@ -439,14 +456,12 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
                                break;
                        }
                }
+
                /*
-                * The first buffer is setup async if doasync is specified.
-                * All other buffers in the cluster are setup async.  This
-                * way the caller can decide how to deal with the requested
-                * buffer.
+                * The passed-in tbp (i == 0) will already be set up for
+                * async or sync operation.  All other tbp's acquired in
+                * our loop are set up for async operation.
                 */
-               if (i || doasync)
-                       tbp->b_flags |= B_ASYNC;
                tbp->b_cmd = BUF_CMD_READ;
                BUF_KERNPROC(tbp);
                cluster_append(&bp->b_bio1, tbp);
@@ -493,9 +508,9 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset,
                panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
                    bp->b_bufsize, bp->b_kvasize);
        }
-
        pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
                (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
+       BUF_KERNPROC(bp);
        return (bp);
 }
 
@@ -814,7 +829,6 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
                bp->b_flags &= ~B_ERROR;
                bp->b_flags |= B_CLUSTER | B_BNOCLIP |
                        (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
-               bp->b_bio1.bio_done = cluster_callback;
                bp->b_bio1.bio_caller_info1.cluster_head = NULL;
                bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
 
@@ -910,7 +924,6 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
 
                        bundirty(tbp);
                        tbp->b_flags &= ~B_ERROR;
-                       tbp->b_flags |= B_ASYNC;
                        tbp->b_cmd = BUF_CMD_WRITE;
                        BUF_KERNPROC(tbp);
                        cluster_append(&bp->b_bio1, tbp);
@@ -932,15 +945,16 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
                totalwritten += bp->b_bufsize;
                bp->b_dirtyoff = 0;
                bp->b_dirtyend = bp->b_bufsize;
-               bp->b_flags |= B_ASYNC;
+               bp->b_bio1.bio_done = cluster_callback;
                bp->b_cmd = BUF_CMD_WRITE;
+
                vfs_busy_pages(vp, bp);
                bp->b_runningbufspace = bp->b_bufsize;
                if (bp->b_runningbufspace) {
                        runningbufspace += bp->b_runningbufspace;
                        ++runningbufcount;
                }
-               BUF_KERNPROC(bp);       /* B_ASYNC */
+               BUF_KERNPROC(bp);
                vn_strategy(vp, &bp->b_bio1);
 
                bytes -= i;
index 7b4b7df..ebb72b1 100644 (file)
@@ -403,8 +403,7 @@ vinvalbuf_bp(struct buf *bp, void *data)
                                vfs_bio_awrite(bp);
                        } else {
                                bremfree(bp);
-                               bp->b_flags |= B_ASYNC;
-                               bwrite(bp);
+                               bawrite(bp);
                        }
                } else {
                        bremfree(bp);
@@ -418,12 +417,10 @@ vinvalbuf_bp(struct buf *bp, void *data)
                 */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF);
-               bp->b_flags &= ~B_ASYNC;
                brelse(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
-               bp->b_flags &= ~B_ASYNC;
                brelse(bp);
        }
        return(0);
@@ -547,7 +544,6 @@ vtruncbuf_bp_trunc(struct buf *bp, void *data)
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
-               bp->b_flags &= ~B_ASYNC;
                brelse(bp);
        }
        return(1);
@@ -582,12 +578,10 @@ vtruncbuf_bp_metasync(struct buf *bp, void *data)
                                BUF_UNLOCK(bp);
                } else {
                        bremfree(bp);
-                       if (bp->b_vp == vp) {
-                               bp->b_flags |= B_ASYNC;
-                       } else {
-                               bp->b_flags &= ~B_ASYNC;
-                       }
-                       bwrite(bp);
+                       if (bp->b_vp == vp)
+                               bawrite(bp);
+                       else
+                               bwrite(bp);
                }
                return(1);
        } else {
index c460944..fbe55ca 100644 (file)
@@ -496,11 +496,9 @@ vn_get_fpf_offset(struct file *fp)
                flags = fp->f_flag;
                if (flags & FOFFSETLOCK) {
                        nflags = flags | FOFFSETWAKE;
-                       crit_enter();
-                       tsleep_interlock(&fp->f_flag);
+                       tsleep_interlock(&fp->f_flag, 0);
                        if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
                                tsleep(&fp->f_flag, PINTERLOCKED, "fpoff", 0);
-                       crit_exit();
                } else {
                        nflags = flags | FOFFSETLOCK;
                        if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
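The same shape, advertising a WANT bit with atomic_cmpset_int() while already interlocked and
then sleeping, shows up in bio_track_wait(), the HAMMER locks, and biowait() itself.  A generic
sketch with hypothetical flag names:

        for (;;) {
                flags = obj->flags;                     /* hypothetical flag word */
                if ((flags & OBJ_BUSY) == 0)
                        break;
                nflags = flags | OBJ_WANTED;
                tsleep_interlock(obj, 0);
                if (atomic_cmpset_int(&obj->flags, flags, nflags))
                        tsleep(obj, PINTERLOCKED, "objwt", 0);
        }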
index af13274..f0cad2f 100644 (file)
@@ -779,11 +779,9 @@ tapread(struct dev_read_args *ap)
                                return (EWOULDBLOCK);
                        }
                        tp->tap_flags |= TAP_RWAIT;
-                       crit_enter();
-                       tsleep_interlock(tp);
+                       tsleep_interlock(tp, PCATCH);
                        ifnet_deserialize_all(ifp);
                        error = tsleep(tp, PCATCH | PINTERLOCKED, "taprd", 0);
-                       crit_exit();
                        if (error)
                                return (error);
                } else {
index d5fc709..59fba6d 100644 (file)
@@ -419,13 +419,11 @@ smb_sleep(void *chan, struct smb_slock *sl, int slpflags, const char *wmesg, int
        int error;
 
        if (sl) {
-               crit_enter();
-               tsleep_interlock(chan);
+               tsleep_interlock(chan, slpflags);
                smb_sl_unlock(sl);
                error = tsleep(chan, slpflags | PINTERLOCKED, wmesg, timo);
                if ((slpflags & PDROP) == 0)
                        smb_sl_lock(sl);
-               crit_exit();
        } else {
                error = tsleep(chan, slpflags, wmesg, timo);
        }
index 41f23ab..eb21b8b 100644 (file)
@@ -67,6 +67,7 @@ struct bio {
        biodone_t       *bio_done;      /* Caller completion function */
        off_t           bio_offset;     /* Logical offset relative to device */
        void            *bio_driver_info;
+       int             bio_flags;
        union {
                void    *ptr;
                off_t   offset;
@@ -83,6 +84,13 @@ struct bio {
        } bio_caller_info2;
 };
 
+/*
+ * BIO flags, used for strategy/biodone/biodone_sync interactions.
+ */
+#define BIO_SYNC       0x00000001
+#define BIO_WANT       0x20000000
+#define BIO_DONE       0x40000000
+
 void bio_start_transaction(struct bio *, struct bio_track *);
 
 #endif
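A strategy routine can inspect these bits to decide how much work to do in the caller's
context; the nfs_strategy/nwfs_strategy changes below do exactly this.  A hedged sketch with
hypothetical helpers:

        static int
        example_strategy(struct vop_strategy_args *ap)
        {
                struct bio *bio = ap->a_bio;

                if (bio->bio_flags & BIO_SYNC) {
                        /* caller is (or will be) parked in biowait(); run it now */
                        return (example_doio(ap->a_vp, bio));
                }
                /* otherwise queue for a worker which biodone()s it later */
                return (example_queue(ap->a_vp, bio));
        }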
index c08afba..44039b6 100644 (file)
@@ -215,11 +215,6 @@ struct buf {
  *
  * Notes:
  *
- *     B_ASYNC         VOP calls on bp's are usually async whether or not
- *                     B_ASYNC is set, but some subsystems, such as NFS, like 
- *                     to know what is best for the caller so they can
- *                     optimize the I/O.
- *
  *     B_PAGING        Indicates that bp is being used by the paging system or
  *                     some paging system and that the bp is not linked into
  *                     the b_vp's clean/dirty linked lists or ref counts.
@@ -283,28 +278,28 @@ struct buf {
 
 #define        B_AGE           0x00000001      /* Reuse more quickly */
 #define        B_NEEDCOMMIT    0x00000002      /* Append-write in progress. */
-#define        B_ASYNC         0x00000004      /* Start I/O, do not wait. */
+#define        B_UNUSED2       0x00000004
 #define        B_DIRECT        0x00000008      /* direct I/O flag (pls free vmio) */
 #define        B_DEFERRED      0x00000010      /* vfs-controlled deferment */
 #define        B_CACHE         0x00000020      /* Bread found us in the cache. */
 #define        B_HASHED        0x00000040      /* Indexed via v_rbhash_tree */
 #define        B_DELWRI        0x00000080      /* Delay I/O until buffer reused. */
 #define        B_BNOCLIP       0x00000100      /* EOF clipping b_bcount not allowed */
-#define        B_UNUSED0200    0x00000200
+#define        B_UNUSED9       0x00000200
 #define        B_EINTR         0x00000400      /* I/O was interrupted */
 #define        B_ERROR         0x00000800      /* I/O error occurred. */
-#define        B_UNUSED1000    0x00001000      /* Unused */
+#define        B_UNUSED12      0x00001000      /* Unused */
 #define        B_INVAL         0x00002000      /* Does not contain valid info. */
 #define        B_LOCKED        0x00004000      /* Locked in core (not reusable). */
 #define        B_NOCACHE       0x00008000      /* Destroy buffer AND backing store */
 #define        B_MALLOC        0x00010000      /* malloced b_data */
 #define        B_CLUSTEROK     0x00020000      /* Pagein op, so swap() can count it. */
-#define        B_UNUSED40000   0x00040000
+#define        B_UNUSED18      0x00040000
 #define        B_RAW           0x00080000      /* Set by physio for raw transfers. */
 #define        B_HEAVY         0x00100000      /* Heavy-weight buffer */
 #define        B_DIRTY         0x00200000      /* Needs writing later. */
 #define        B_RELBUF        0x00400000      /* Release VMIO buffer. */
-#define        B_WANT          0x00800000      /* Used by vm_pager.c */
+#define        B_UNUSED23      0x00800000      /* Request wakeup on done */
 #define        B_VNCLEAN       0x01000000      /* On vnode clean list */
 #define        B_VNDIRTY       0x02000000      /* On vnode dirty list */
 #define        B_PAGING        0x04000000      /* volatile paging I/O -- bypass VMIO */
@@ -312,14 +307,14 @@ struct buf {
 #define B_RAM          0x10000000      /* Read ahead mark (flag) */
 #define B_VMIO         0x20000000      /* VMIO flag */
 #define B_CLUSTER      0x40000000      /* pagein op, so swap() can count it */
-#define B_UNUSED80000000 0x80000000
+#define B_UNUSED31     0x80000000      /* synchronous operation done */
 
 #define PRINT_BUF_FLAGS "\20"  \
        "\40unused31\37cluster\36vmio\35ram\34ordered" \
-       "\33paging\32vndirty\31vnclean\30want\27relbuf\26dirty" \
+       "\33paging\32vndirty\31vnclean\30unused23\27relbuf\26dirty" \
        "\25unused20\24raw\23unused18\22clusterok\21malloc\20nocache" \
        "\17locked\16inval\15unused12\14error\13eintr\12unused9\11bnoclip" \
-       "\10delwri\7hashed\6cache\5deferred\4direct\3async\2needcommit\1age"
+       "\10delwri\7hashed\6cache\5deferred\4direct\3unused2\2needcommit\1age"
 
 #define        NOOFFSET        (-1LL)          /* No buffer offset calculated yet */
 
@@ -413,8 +408,11 @@ struct buf *geteblk (int);
 void regetblk(struct buf *bp);
 struct bio *push_bio(struct bio *);
 struct bio *pop_bio(struct bio *);
-int    biowait (struct buf *);
+int    biowait (struct bio *, const char *);
+int    biowait_timeout (struct bio *, const char *, int);
+void   bpdone (struct buf *, int);
 void   biodone (struct bio *);
+void   biodone_sync (struct bio *);
 
 void   cluster_append(struct bio *, struct buf *);
 int    cluster_read (struct vnode *, off_t, off_t, int,
index ed2a850..cfc0487 100644 (file)
@@ -281,6 +281,20 @@ buf_checkwrite(struct buf *bp)
        return(0);
 }
 
+/*
+ * Chained biodone.  The bio callback was made and the callback function
+ * wishes to chain the biodone.  If no BIO's are left we call bpdone()
+ * with elseit=TRUE (asynchronous completion).
+ */
+static __inline void
+biodone_chain(struct bio *bio)
+{
+       if (bio->bio_prev)
+               biodone(bio->bio_prev);
+       else
+               bpdone(bio->bio_buf, 1);
+}
+
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF2_H_ */
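biodone_chain() is meant for bio_done callbacks on pushed BIOs that want to do per-layer work
and then let normal completion continue.  A minimal sketch, with the layer bookkeeping left
hypothetical:

        static void
        example_layer_done(struct bio *bio)
        {
                /* per-layer cleanup would go here (hypothetical) */
                biodone_chain(bio);     /* biodone() the previous bio, or bpdone(bp, 1) */
        }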
index 43c72b5..bf70dca 100644 (file)
@@ -333,7 +333,8 @@ int tsleep (void *, int, const char *, int);
 int    msleep (void *, struct spinlock *, int, const char *, int);
 int    serialize_sleep(void *, struct lwkt_serialize *, int,
                        const char *, int);
-void   tsleep_interlock (void *chan);
+void   tsleep_interlock (void *, int);
+void   tsleep_remove (struct thread *);
 int    lwkt_sleep (const char *, int);
 void   tstop (void);
 void   wakeup (void *chan);
index 8b00648..1be8cd8 100644 (file)
@@ -215,6 +215,7 @@ struct caps_kinfo;
 struct thread {
     TAILQ_ENTRY(thread) td_threadq;
     TAILQ_ENTRY(thread) td_allq;
+    TAILQ_ENTRY(thread) td_sleepq;
     lwkt_port  td_msgport;     /* built-in message port for replies */
     struct lwp *td_lwp;        /* (optional) associated lwp */
     struct proc        *td_proc;       /* (optional) associated process */
@@ -295,7 +296,7 @@ struct thread {
 #define TDF_WAKEREQ            0x4000  /* resume_kproc */
 #define TDF_TIMEOUT            0x8000  /* tsleep timeout */
 #define TDF_INTTHREAD          0x00010000      /* interrupt thread */
-#define TDF_UNUSED20000                0x00020000
+#define TDF_TSLEEP_DESCHEDULED 0x00020000      /* tsleep core deschedule */
 #define TDF_BLOCKED            0x00040000      /* Thread is blocked */
 #define TDF_PANICWARN          0x00080000      /* panic warning in switch */
 #define TDF_BLOCKQ             0x00100000      /* on block queue */
index 228beaf..aef2608 100644 (file)
@@ -215,12 +215,19 @@ ext2_bmaparray(struct vnode *vp, ext2_daddr_t bn, ext2_daddr_t *bnp,
                        if (!daddr)
                                panic("ext2_bmaparray: indirect block not in cache");
 #endif
+                       /*
+                        * This runs through ext2_strategy using bio2 to
+                        * cache the disk offset, then comes back through
+                        * bio1.  So we want to wait on bio1
+                        */
+                       bp->b_bio1.bio_done = biodone_sync;
+                       bp->b_bio1.bio_flags |= BIO_SYNC;
                        bp->b_bio2.bio_offset = fsbtodoff(fs, daddr);
                        bp->b_flags &= ~(B_INVAL|B_ERROR);
                        bp->b_cmd = BUF_CMD_READ;
                        vfs_busy_pages(bp->b_vp, bp);
                        vn_strategy(bp->b_vp, &bp->b_bio1);
-                       error = biowait(bp);
+                       error = biowait(&bp->b_bio1, "biord");
                        if (error) {
                                brelse(bp);
                                return (error);
index cfd27e2..f1ad3ac 100644 (file)
@@ -405,9 +405,11 @@ ext2_indirtrunc(struct inode *ip, daddr_t lbn, off_t doffset, daddr_t lastbn,
                if (bp->b_bcount > bp->b_bufsize)
                        panic("ext2_indirtrunc: bad buffer size");
                bp->b_bio2.bio_offset = doffset;
+               bp->b_bio1.bio_done = biodone_sync;
+               bp->b_bio1.bio_flags |= BIO_SYNC;
                vfs_busy_pages(bp->b_vp, bp);
                vn_strategy(vp, &bp->b_bio1);
-               error = biowait(bp);
+               error = biowait(&bp->b_bio1, "biord");
        }
        if (error) {
                brelse(bp);
index b091525..ca44a9b 100644 (file)
@@ -61,7 +61,6 @@ static void hammer_io_direct_write_complete(struct bio *nbio);
 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
 static void hammer_io_set_modlist(struct hammer_io *io);
 static void hammer_io_flush_mark(hammer_volume_t volume);
-static void hammer_io_flush_sync_done(struct bio *bio);
 
 
 /*
@@ -125,24 +124,23 @@ hammer_io_disassociate(hammer_io_structure_t iou)
 
 /*
  * Wait for any physical IO to complete
+ *
+ * XXX we aren't interlocked against a spinlock or anything so there
+ *     is a small window in the interlock / io->running == 0 test.
  */
 void
 hammer_io_wait(hammer_io_t io)
 {
        if (io->running) {
-               crit_enter();
-               tsleep_interlock(io);
-               io->waiting = 1;
                for (;;) {
-                       tsleep(io, PINTERLOCKED, "hmrflw", 0);
+                       io->waiting = 1;
+                       tsleep_interlock(io, 0);
                        if (io->running == 0)
                                break;
-                       tsleep_interlock(io);
-                       io->waiting = 1;
+                       tsleep(io, PINTERLOCKED, "hmrflw", hz);
                        if (io->running == 0)
                                break;
                }
-               crit_exit();
        }
 }
 
@@ -1447,36 +1445,15 @@ hammer_io_flush_sync(hammer_mount_t hmp)
                        bp->b_bcount = 0;
                        bp->b_cmd = BUF_CMD_FLUSH;
                        bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
-                       bp->b_bio1.bio_done = hammer_io_flush_sync_done;
-                       bp->b_flags |= B_ASYNC;
+                       bp->b_bio1.bio_done = biodone_sync;
+                       bp->b_bio1.bio_flags |= BIO_SYNC;
                        bp_base = bp;
                        vn_strategy(volume->devvp, &bp->b_bio1);
                }
        }
        while ((bp = bp_base) != NULL) {
                bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
-               while (bp->b_cmd != BUF_CMD_DONE) {
-                       crit_enter();
-                       tsleep_interlock(&bp->b_cmd);
-                       if (bp->b_cmd != BUF_CMD_DONE)
-                               tsleep(&bp->b_cmd, PINTERLOCKED, "hmrFLS", 0);
-                       crit_exit();
-               }
-               bp->b_flags &= ~B_ASYNC;
+               biowait(&bp->b_bio1, "hmrFLS");
                relpbuf(bp, NULL);
        }
 }
-
-/*
- * Callback to deal with completed flush commands to the device.
- */
-static void
-hammer_io_flush_sync_done(struct bio *bio)
-{
-       struct buf *bp;
-
-       bp = bio->bio_buf;
-       bp->b_cmd = BUF_CMD_DONE;
-       wakeup(&bp->b_cmd);
-}
-
index 49cb15e..5c73ce1 100644 (file)
@@ -68,14 +68,12 @@ hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident)
                        }
                        nlv = lv | HAMMER_LOCKF_WANTED;
                        ++hammer_contention_count;
-                       crit_enter();
-                       tsleep_interlock(lock);
+                       tsleep_interlock(lock, 0);
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
                                tsleep(lock, PINTERLOCKED, ident, 0);
                                if (hammer_debug_locks)
                                        kprintf("hammer_lock_ex: try again\n");
                        }
-                       crit_exit();
                }
        }
 }
@@ -150,12 +148,10 @@ hammer_lock_sh(struct hammer_lock *lock)
                } else {
                        nlv = lv | HAMMER_LOCKF_WANTED;
                        ++hammer_contention_count;
-                       crit_enter();
-                       tsleep_interlock(lock);
+                       tsleep_interlock(lock, 0);
                        if (atomic_cmpset_int(&lock->lockval, lv, nlv)) {
                                tsleep(lock, PINTERLOCKED, "hmrlck", 0);
                        }
-                       crit_exit();
                }
        }
 }
index 87cfd53..8c40df7 100644 (file)
@@ -143,7 +143,6 @@ mfs_freeblks(struct vop_freeblks_args *ap)
        struct vnode *vp = ap->a_vp;
 
        bp = geteblk(ap->a_length);
-       bp->b_flags |= B_ASYNC;
        bp->b_cmd = BUF_CMD_FREEBLKS;
        bp->b_bio1.bio_offset = ap->a_offset;
        bp->b_bcount = ap->a_length;
index 7256e04..bbc0261 100644 (file)
@@ -69,6 +69,7 @@
 static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
                                   int size, struct thread *td);
 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
+static void nfsiodone_sync(struct bio *bio);
 
 extern int nfs_numasync;
 extern int nfs_pbuf_freecnt;
@@ -434,7 +435,6 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
                            if (!rabp)
                                return (EINTR);
                            if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
-                               rabp->b_flags |= B_ASYNC;
                                rabp->b_cmd = BUF_CMD_READ;
                                vfs_busy_pages(vp, rabp);
                                if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
@@ -497,6 +497,8 @@ again:
 
                if ((bp->b_flags & B_CACHE) == 0) {
                    bp->b_cmd = BUF_CMD_READ;
+                   bp->b_bio2.bio_done = nfsiodone_sync;
+                   bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
                    error = nfs_doio(vp, &bp->b_bio2, td);
                    if (error) {
@@ -525,6 +527,8 @@ again:
                        return (EINTR);
                if ((bp->b_flags & B_CACHE) == 0) {
                    bp->b_cmd = BUF_CMD_READ;
+                   bp->b_bio2.bio_done = nfsiodone_sync;
+                   bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
                    error = nfs_doio(vp, &bp->b_bio2, td);
                    if (error) {
@@ -551,6 +555,8 @@ again:
 
                if ((bp->b_flags & B_CACHE) == 0) {
                    bp->b_cmd = BUF_CMD_READ;
+                   bp->b_bio2.bio_done = nfsiodone_sync;
+                   bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
                    error = nfs_doio(vp, &bp->b_bio2, td);
                    if (error) {
@@ -580,6 +586,8 @@ again:
                                return (EINTR);
                            if ((bp->b_flags & B_CACHE) == 0) {
                                    bp->b_cmd = BUF_CMD_READ;
+                                   bp->b_bio2.bio_done = nfsiodone_sync;
+                                   bp->b_bio2.bio_flags |= BIO_SYNC;
                                    vfs_busy_pages(vp, bp);
                                    error = nfs_doio(vp, &bp->b_bio2, td);
                                    /*
@@ -624,7 +632,6 @@ again:
                                               NFS_DIRBLKSIZ, td);
                        if (rabp) {
                            if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
-                               rabp->b_flags |= B_ASYNC;
                                rabp->b_cmd = BUF_CMD_READ;
                                vfs_busy_pages(vp, rabp);
                                if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
@@ -952,6 +959,8 @@ again:
 
                if ((bp->b_flags & B_CACHE) == 0) {
                        bp->b_cmd = BUF_CMD_READ;
+                       bp->b_bio2.bio_done = nfsiodone_sync;
+                       bp->b_bio2.bio_flags |= BIO_SYNC;
                        vfs_busy_pages(vp, bp);
                        error = nfs_doio(vp, &bp->b_bio2, td);
                        if (error) {
@@ -1313,6 +1322,9 @@ again:
  * Do an I/O operation to/from a cache block. This may be called
  * synchronously or from an nfsiod.  The BIO is normalized for DEV_BSIZE.
  *
+ * A locked, completed I/O is returned and the caller is responsible for
+ * brelse()'ing it.
+ *
  * NOTE! TD MIGHT BE NULL
  */
 int
@@ -1450,7 +1462,7 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
                uiop->uio_rw = UIO_WRITE;
                nfsstats.write_bios++;
 
-               if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
+               if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
                    iomode = NFSV3WRITE_UNSTABLE;
                else
                    iomode = NFSV3WRITE_FILESYNC;
@@ -1483,7 +1495,7 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
                 * For an interrupted write, the buffer is still valid
                 * and the write hasn't been pushed to the server yet,
                 * so we can't set B_ERROR and report the interruption
-                * by setting B_EINTR. For the B_ASYNC case, B_EINTR
+                * by setting B_EINTR. For the async case, B_EINTR
                 * is not relevant, so the rpc attempt is essentially
                 * a noop.  For the case of a V3 write rpc not being
                 * committed to stable storage, the block is still
@@ -1503,7 +1515,7 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
                        bp->b_flags &= ~(B_INVAL|B_NOCACHE);
                        if ((bp->b_flags & B_PAGING) == 0)
                            bdirty(bp);
-                       if (error && (bp->b_flags & B_ASYNC) == 0)
+                       if (error)
                            bp->b_flags |= B_EINTR;
                        crit_exit();
                } else {
@@ -1573,3 +1585,13 @@ nfs_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
        return(error);
 }
 
+/*
+ * Synchronous completion for nfs_doio.  Call bpdone() with elseit=FALSE.
+ * Caller is responsible for brelse()'ing the bp.
+ */
+static void
+nfsiodone_sync(struct bio *bio)
+{
+       bio->bio_flags = 0;
+       bpdone(bio->bio_buf, 0);
+}
index 3d22d29..2aba4d5 100644 (file)
@@ -3487,11 +3487,14 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
                                else
                                        bp = NULL;
                        }
-                       if (bp && (bp->b_flags & B_DELWRI)) {
-                               bremfree(bp);
-                               bp->b_flags &= ~B_ASYNC;
-                               bwrite(bp);
-                               ++nfs_commit_miss;
+                       if (bp) {
+                               if (bp->b_flags & B_DELWRI) {
+                                       bremfree(bp);
+                                       bwrite(bp);
+                                       ++nfs_commit_miss;
+                               } else {
+                                       BUF_UNLOCK(bp);
+                               }
                        }
                        ++nfs_commit_blks;
                        if (cnt < iosize)
index e1c913e..95374e8 100644 (file)
@@ -944,6 +944,7 @@ nfssvc_iod(struct thread *td)
                    wakeup(&nmp->nm_bioq);
                }
                nfs_doio((struct vnode *)bio->bio_driver_info, bio, NULL);
+
                /*
                 * If there are more than one iod on this mount, then defect
                 * so that the iods can be shared out fairly between the mounts
index 34ad5e9..a9fdc81 100644 (file)
@@ -2877,10 +2877,10 @@ nfs_strategy(struct vop_strategy_args *ap)
        KASSERT(BUF_REFCNT(bp) > 0,
                ("nfs_strategy: buffer %p not locked", bp));
 
-       if (bp->b_flags & B_ASYNC)
-               td = NULL;
-       else
+       if (bio->bio_flags & BIO_SYNC)
                td = curthread; /* XXX */
+       else
+               td = NULL;
 
         /*
         * We probably don't need to push an nbio any more since no
@@ -2895,7 +2895,7 @@ nfs_strategy(struct vop_strategy_args *ap)
         * queue the request, wake it up and wait for completion
         * otherwise just do it ourselves.
         */
-       if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(ap->a_vp, nbio, td))
+       if ((bio->bio_flags & BIO_SYNC) || nfs_asyncio(ap->a_vp, nbio, td))
                error = nfs_doio(ap->a_vp, nbio, td);
        return (error);
 }
@@ -3219,7 +3219,6 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error)
                                 * start the transaction in order to
                                 * immediately biodone() it.
                                 */
-                               bp->b_flags |= B_ASYNC;
                                bundirty(bp);
                                bp->b_flags &= ~B_ERROR;
                                bp->b_dirtyoff = bp->b_dirtyend = 0;
index 42dcfdc..1f373b9 100644 (file)
@@ -332,7 +332,7 @@ nwfs_doio(struct vnode *vp, struct bio *bio, struct ucred *cr, struct thread *td
                 * For an interrupted write, the buffer is still valid
                 * and the write hasn't been pushed to the server yet,
                 * so we can't set B_ERROR and report the interruption
-                * by setting B_EINTR. For the B_ASYNC case, B_EINTR
+                * by setting B_EINTR. For the async case, B_EINTR
                 * is not relevant, so the rpc attempt is essentially
                 * a noop.  For the case of a V3 write rpc not being
                 * committed to stable storage, the block is still
@@ -346,12 +346,9 @@ nwfs_doio(struct vnode *vp, struct bio *bio, struct ucred *cr, struct thread *td
 
                        crit_enter();
                        bp->b_flags &= ~(B_INVAL|B_NOCACHE);
-                       if ((bp->b_flags & B_ASYNC) == 0)
-                           bp->b_flags |= B_EINTR;
                        if ((bp->b_flags & B_PAGING) == 0)
                            bdirty(bp);
-                       if ((bp->b_flags & B_ASYNC) == 0)
-                           bp->b_flags |= B_EINTR;
+                       bp->b_flags |= B_EINTR;
                        crit_exit();
                } else {
                        if (error) {
index 0a1ce19..f4062c5 100644 (file)
@@ -740,19 +740,18 @@ static int
 nwfs_strategy(struct vop_strategy_args *ap)
 {
        struct bio *bio = ap->a_bio;
-       struct buf *bp = bio->bio_buf;
        int error = 0;
        struct thread *td = NULL;
 
        NCPVNDEBUG("\n");
-       if ((bp->b_flags & B_ASYNC) == 0)
+       if ((bio->bio_flags & BIO_SYNC))
                td = curthread;         /* YYY dunno if this is legal */
        /*
         * If the op is asynchronous and an i/o daemon is waiting
         * queue the request, wake it up and wait for completion
         * otherwise just do it ourselves.
         */
-       if ((bp->b_flags & B_ASYNC) == 0 )
+       if (bio->bio_flags & BIO_SYNC)
                error = nwfs_doio(ap->a_vp, bio, proc0.p_ucred, td);
        return (error);
 }
index e72c050..e6c2048 100644 (file)
@@ -355,7 +355,7 @@ smbfs_doio(struct vnode *vp, struct bio *bio, struct ucred *cr, struct thread *t
                 * For an interrupted write, the buffer is still valid
                 * and the write hasn't been pushed to the server yet,
                 * so we can't set BIO_ERROR and report the interruption
-                * by setting B_EINTR. For the B_ASYNC case, B_EINTR
+                * by setting B_EINTR. For the async case, B_EINTR
                 * is not relevant, so the rpc attempt is essentially
                 * a noop.  For the case of a V3 write rpc not being
                 * committed to stable storage, the block is still
@@ -369,12 +369,9 @@ smbfs_doio(struct vnode *vp, struct bio *bio, struct ucred *cr, struct thread *t
 
                        crit_enter();
                        bp->b_flags &= ~(B_INVAL|B_NOCACHE);
-                       if ((bp->b_flags & B_ASYNC) == 0)
-                           bp->b_flags |= B_EINTR;
                        if ((bp->b_flags & B_PAGING) == 0)
                            bdirty(bp);
-                       if ((bp->b_flags & B_ASYNC) == 0)
-                           bp->b_flags |= B_EINTR;
+                       bp->b_flags |= B_EINTR;
                        crit_exit();
                } else {
                        if (error) {
index df54519..8ebf83e 100644 (file)
@@ -798,15 +798,14 @@ static int
 smbfs_strategy(struct vop_strategy_args *ap)
 {
        struct bio *bio = ap->a_bio;
-       struct buf *bp = bio->bio_buf;
        struct thread *td = NULL;
        int error = 0;
 
        SMBVDEBUG("\n");
-       if ((bp->b_flags & B_ASYNC) == 0)
+       if (bio->bio_flags & BIO_SYNC)
                td = curthread;         /* XXX */
 
-       if ((bp->b_flags & B_ASYNC) == 0 )
+       if (bio->bio_flags & BIO_SYNC)
                error = smbfs_doio(ap->a_vp, bio, proc0.p_ucred, td);
        return error;
 }
index 7c34cad..bf679cb 100644 (file)
@@ -138,8 +138,6 @@ spec_vnoperate(struct vop_generic_args *ap)
        return (VOCALL(&spec_vnode_vops, ap));
 }
 
-static void spec_getpages_iodone (struct bio *bio);
-
 /*
  * Open a special file.
  *
@@ -513,15 +511,15 @@ spec_strategy(struct vop_strategy_args *ap)
        KKASSERT(vp->v_rdev != NULL);   /* XXX */
        if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
                if (bp->b_cmd == BUF_CMD_READ) {
-                       if (bp->b_flags & B_ASYNC)
-                               mp->mnt_stat.f_asyncreads++;
-                       else
+                       if (bio->bio_flags & BIO_SYNC)
                                mp->mnt_stat.f_syncreads++;
-               } else {
-                       if (bp->b_flags & B_ASYNC)
-                               mp->mnt_stat.f_asyncwrites++;
                        else
+                               mp->mnt_stat.f_asyncreads++;
+               } else {
+                       if (bio->bio_flags & BIO_SYNC)
                                mp->mnt_stat.f_syncwrites++;
+                       else
+                               mp->mnt_stat.f_asyncwrites++;
                }
        }
 
@@ -822,13 +820,6 @@ spec_advlock(struct vop_advlock_args *ap)
        return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP);
 }
 
-static void
-spec_getpages_iodone(struct bio *bio)
-{
-       bio->bio_buf->b_cmd = BUF_CMD_DONE;
-       wakeup(bio->bio_buf);
-}
-
 /*
  * spec_getpages() - get pages associated with device vnode.
  *
@@ -893,21 +884,15 @@ spec_getpages(struct vop_getpages_args *ap)
        }
 
        bp->b_bio1.bio_offset = offset;
-       bp->b_bio1.bio_done = spec_getpages_iodone;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
 
        mycpu->gd_cnt.v_vnodein++;
        mycpu->gd_cnt.v_vnodepgsin += pcount;
 
        /* Do the input. */
        vn_strategy(ap->a_vp, &bp->b_bio1);
-
-       crit_enter();
-
-       /* We definitely need to be at splbio here. */
-       while (bp->b_cmd != BUF_CMD_DONE)
-               tsleep(bp, 0, "spread", 0);
-
-       crit_exit();
+       biowait(&bp->b_bio1, "spread");
 
        if (bp->b_flags & B_ERROR) {
                if (bp->b_error)
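
spec_getpages() now relies on the generic biodone_sync()/biowait() pair instead of a private done routine and a tsleep() loop on b_cmd.  Reduced to its core, the synchronous dispatch convention used in this hunk looks roughly like the sketch below; the wrapper function is illustrative and not part of the patch:

	/*
	 * Sketch: dispatch one buffer synchronously.  The BIO must have a
	 * done function (biodone_sync) and BIO_SYNC set before vn_strategy();
	 * biowait() then sleeps until the I/O completes and returns the
	 * buffer's error status.
	 */
	static int
	sync_dispatch_sketch(struct vnode *vp, struct buf *bp)
	{
		struct bio *bio = &bp->b_bio1;

		bio->bio_done = biodone_sync;
		bio->bio_flags |= BIO_SYNC;
		vn_strategy(vp, bio);
		return (biowait(bio, "sketch"));
	}
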
index 670eac4..c0c5b51 100644 (file)
@@ -450,7 +450,6 @@ fail:
                 */
                bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0);
                bp->b_flags |= (B_INVAL | B_RELBUF);
-               bp->b_flags &= ~B_ASYNC;
                brelse(bp);
                deallocated += fs->fs_bsize;
        }
index d29383d..4a2c2a3 100644 (file)
@@ -466,7 +466,13 @@ ffs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, ufs_daddr_t dbn,
                bp->b_cmd = BUF_CMD_READ;
                if (bp->b_bcount > bp->b_bufsize)
                        panic("ffs_indirtrunc: bad buffer size");
+               /*
+                * BIO is bio2 which chains back to bio1.  We wait
+                * on bio1.
+                */
                bp->b_bio2.bio_offset = dbtodoff(fs, dbn);
+               bp->b_bio1.bio_done = biodone_sync;
+               bp->b_bio1.bio_flags |= BIO_SYNC;
                vfs_busy_pages(vp, bp);
                /*
                 * Access the block device layer using the device vnode
@@ -479,7 +485,7 @@ ffs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, ufs_daddr_t dbn,
                 */
                bio_start_transaction(&bp->b_bio1, &vp->v_track_read);
                vn_strategy(ip->i_devvp, &bp->b_bio2);
-               error = biowait(bp);
+               error = biowait(&bp->b_bio1, "biord");
        }
        if (error) {
                brelse(bp);
index 2af2612..37f6354 100644 (file)
@@ -62,9 +62,6 @@ int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 
 void ffs_rawread_setup(void);
 
-static void ffs_rawreadwakeup(struct bio *bio);
-
-
 SYSCTL_DECL(_vfs_ffs);
 
 static int ffsrawbufcnt = 4;
@@ -166,10 +163,16 @@ ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset,
                if (iolen != 0)
                        len -= PAGE_SIZE;
        }
+
+       /*
+        * Raw disk address is in bio2, but we wait for it to
+        * chain to bio1.
+        */
        bp->b_flags &= ~B_ERROR;
        bp->b_loffset = loffset;
        bp->b_bio2.bio_offset = NOOFFSET;
-       bp->b_bio2.bio_done = ffs_rawreadwakeup;
+       bp->b_bio1.bio_done = biodone_sync;
+       bp->b_bio1.bio_flags |= BIO_SYNC;
 
        blockoff = (loffset % bsize) / DEV_BSIZE;
 
@@ -274,10 +277,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio)
                        }
                }
                
-               crit_enter();
-               while (bp->b_cmd != BUF_CMD_DONE)
-                       tsleep((caddr_t)&bp->b_bio2, 0, "rawrd", 0);
-               crit_exit();
+               biowait(&bp->b_bio1, "rawrd");
                
                vunmapbuf(bp);
                
@@ -338,10 +338,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio)
        if (bp != NULL)
                relpbuf(bp, &ffsrawbufcnt);
        if (nbp != NULL) {                      /* Run down readahead buffer */
-               crit_enter();
-               while (nbp->b_cmd != BUF_CMD_DONE)
-                       tsleep(&nbp->b_bio2, 0, "rawrd", 0);
-               crit_exit();
+               biowait(&nbp->b_bio1, "rawrd");
                vunmapbuf(nbp);
                relpbuf(nbp, &ffsrawbufcnt);
        }
@@ -415,10 +412,3 @@ ffs_rawread(struct vnode *vp,
        return 0;
 }
 
-
-static void
-ffs_rawreadwakeup(struct bio *bio)
-{
-       bio->bio_buf->b_cmd = BUF_CMD_DONE;
-       wakeup(bio);
-}
index d26bf0f..cf8b04c 100644 (file)
@@ -207,12 +207,19 @@ ufs_bmaparray(struct vnode *vp, ufs_daddr_t bn, ufs_daddr_t *bnp,
                        if (!daddr)
                                panic("ufs_bmaparray: indirect block not in cache");
 #endif
+                       /*
+                        * cached disk addr in bio2, do I/O on bio1.  It
+                        * will probably hit the vfs's strategy function
+                        * which will then use the cached offset in bio2.
+                        */
+                       bp->b_bio1.bio_done = biodone_sync;
+                       bp->b_bio1.bio_flags |= BIO_SYNC;
                        bp->b_bio2.bio_offset = fsbtodoff(fs, daddr);
                        bp->b_flags &= ~(B_INVAL|B_ERROR);
                        bp->b_cmd = BUF_CMD_READ;
                        vfs_busy_pages(bp->b_vp, bp);
                        vn_strategy(bp->b_vp, &bp->b_bio1);
-                       error = biowait(bp);
+                       error = biowait(&bp->b_bio1, "biord");
                        if (error) {
                                brelse(bp);
                                return (error);
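
In both the FFS and UFS hunks above the translated device offset is cached in b_bio2 while the synchronization is attached to b_bio1, which the dispatched BIO chains back to on completion.  Condensed into one place (the helper name is made up; vfs_busy_pages()/bio_start_transaction() bookkeeping and error handling are elided):

	/*
	 * Sketch: read a block whose device offset is already known.  bio2
	 * carries the translated offset; completion chains back to bio1, so
	 * the sync flag and the wait both apply to bio1.
	 */
	static int
	read_translated_block_sketch(struct vnode *devvp, struct buf *bp,
				     off_t doffset)
	{
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio2.bio_offset = doffset;	/* cached device offset */
		bp->b_bio1.bio_done = biodone_sync;	/* wait happens on bio1 */
		bp->b_bio1.bio_flags |= BIO_SYNC;
		vn_strategy(devvp, &bp->b_bio2);
		return (biowait(&bp->b_bio1, "biord"));
	}
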
index f8fb496..1b9eabc 100644 (file)
 #define SWM_FREE       0x02    /* free, period                 */
 #define SWM_POP                0x04    /* pop out                      */
 
-#define AUTOCHAINDONE  ((struct buf *)(intptr_t)-1)
-
 /*
  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
  * in the old system.
@@ -218,7 +216,6 @@ int nswap_lowat = 128;              /* in pages, swap_pager_almost_full warn */
 int nswap_hiwat = 512;         /* in pages, swap_pager_almost_full warn */
 
 static __inline void   swp_sizecheck (void);
-static void    swp_pager_sync_iodone (struct bio *bio);
 static void    swp_pager_async_iodone (struct bio *bio);
 
 /*
@@ -856,8 +853,8 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        vm_pindex_t biox_blkno = 0;
        int count;
        char *data;
-       struct bio *biox = NULL;
-       struct buf *bufx = NULL;
+       struct bio *biox;
+       struct buf *bufx;
        struct bio_track *track;
 
        /*
@@ -889,8 +886,6 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        count = howmany(bp->b_bcount, PAGE_SIZE);
        data = bp->b_data;
 
-       crit_enter();
-
        /*
         * Deal with BUF_CMD_FREEBLKS
         */
@@ -900,7 +895,6 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                 *                needed.
                 */
                swp_pager_meta_free(object, start, count);
-               crit_exit();
                bp->b_resid = 0;
                biodone(bio);
                return;
@@ -919,10 +913,12 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        nbio->bio_caller_info1.cluster_head = NULL;
        nbio->bio_caller_info2.cluster_tail = NULL;
 
+       biox = NULL;
+       bufx = NULL;
+
        /*
         * Execute read or write
         */
-
        while (count > 0) {
                daddr_t blk;
 
@@ -930,7 +926,6 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                 * Obtain block.  If block not found and writing, allocate a
                 * new block and build it into the object.
                 */
-
                blk = swp_pager_meta_ctl(object, start, 0);
                if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
                        blk = swp_pager_getswapspace(1);
@@ -950,13 +945,11 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                 *      - we cross a physical disk boundry in the
                 *        stripe.
                 */
-
                if (
                    biox && (biox_blkno + btoc(bufx->b_bcount) != blk ||
                     ((biox_blkno ^ blk) & dmmax_mask)
                    )
                ) {
-                       crit_exit();
                        if (bp->b_cmd == BUF_CMD_READ) {
                                ++mycpu->gd_cnt.v_swapin;
                                mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
@@ -967,17 +960,11 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                        }
 
                        /*
-                        * Flush the biox to the swap device.
+                        * Finished with this buf.
                         */
-                       if (bufx->b_bcount) {
-                               if (bufx->b_cmd != BUF_CMD_READ)
-                                       bufx->b_dirtyend = bufx->b_bcount;
-                               BUF_KERNPROC(bufx);
-                               vn_strategy(swapdev_vp, biox);
-                       } else {
-                               biodone(biox);
-                       }
-                       crit_enter();
+                       KKASSERT(bufx->b_bcount != 0);
+                       if (bufx->b_cmd != BUF_CMD_READ)
+                               bufx->b_dirtyend = bufx->b_bcount;
                        biox = NULL;
                        bufx = NULL;
                }
@@ -1001,8 +988,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                                bufx = getpbuf(NULL);
                                biox = &bufx->b_bio1;
                                cluster_append(nbio, bufx);
-                               bufx->b_flags |= (bufx->b_flags & B_ORDERED) |
-                                               B_ASYNC;
+                               bufx->b_flags |= (bufx->b_flags & B_ORDERED);
                                bufx->b_cmd = bp->b_cmd;
                                biox->bio_done = swap_chain_iodone;
                                biox->bio_offset = (off_t)blk << PAGE_SHIFT;
@@ -1021,11 +1007,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        /*
         *  Flush out last buffer
         */
-       crit_exit();
-
        if (biox) {
-               if ((bp->b_flags & B_ASYNC) == 0)
-                       bufx->b_flags &= ~B_ASYNC;
                if (bufx->b_cmd == BUF_CMD_READ) {
                        ++mycpu->gd_cnt.v_swapin;
                        mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
@@ -1034,43 +1016,32 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                        mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
                        bufx->b_dirtyend = bufx->b_bcount;
                }
-               if (bufx->b_bcount) {
-                       if (bufx->b_cmd != BUF_CMD_READ)
-                               bufx->b_dirtyend = bufx->b_bcount;
-                       BUF_KERNPROC(bufx);
-                       vn_strategy(swapdev_vp, biox);
-               } else {
-                       biodone(biox);
-               }
+               KKASSERT(bufx->b_bcount);
+               if (bufx->b_cmd != BUF_CMD_READ)
+                       bufx->b_dirtyend = bufx->b_bcount;
                /* biox, bufx = NULL */
        }
 
        /*
-        * Wait for completion.  Now that we are no longer using
-        * cluster_append, use the cluster_tail field to indicate
-        * auto-completion if there are still I/O's in progress.
+        * Now initiate all the I/O.  Be careful looping on our chain as
+        * I/O's may complete while we are still initiating them.
         */
-       if (bp->b_flags & B_ASYNC) {
-               crit_enter();
-               if (nbio->bio_caller_info1.cluster_head == NULL) {
-                       biodone(bio);
-               } else {
-                       nbio->bio_caller_info2.cluster_tail = AUTOCHAINDONE;
-               }
-               crit_exit();
-       } else {
-               crit_enter();
-               while (nbio->bio_caller_info1.cluster_head != NULL) {
-                       bp->b_flags |= B_WANT;
-                       tsleep(bp, 0, "bpchain", 0);
-               }
-               if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
-                       bp->b_flags |= B_ERROR;
-                       bp->b_error = EINVAL;
-               }
-               biodone(bio);
-               crit_exit();
+       nbio->bio_caller_info2.cluster_tail = NULL;
+       bufx = nbio->bio_caller_info1.cluster_head;
+
+       while (bufx) {
+               biox = &bufx->b_bio1;
+               BUF_KERNPROC(bufx);
+               bufx = bufx->b_cluster_next;
+               vn_strategy(swapdev_vp, biox);
        }
+
+       /*
+        * Completion of the cluster will also call biodone_chain(nbio).
+        * We never call biodone(nbio) so we don't have to worry about
+        * setting up a bio_done callback.  It's handled in the sub-IO.
+        */
+       /**/
 }
 
 static void
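
The dispatch loop added above has to sample b_cluster_next before calling vn_strategy(), because a sub-BIO can complete (and its buffer be freed by swap_chain_iodone()) while later sub-BIOs are still being issued; that is the point of the "Be careful looping on our chain" comment.  In outline, with explanatory comments added (illustrative restatement, not additional code):

	/*
	 * Dispatch phase: walk the cluster built earlier and start each
	 * sub-buffer.  The next pointer is fetched before the strategy call
	 * because the completion handler may free bufx at any time after it.
	 */
	bufx = nbio->bio_caller_info1.cluster_head;
	while (bufx) {
		biox = &bufx->b_bio1;
		BUF_KERNPROC(bufx);		/* buffer handed to the I/O system */
		bufx = bufx->b_cluster_next;	/* sample before dispatch */
		vn_strategy(swapdev_vp, biox);
	}
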
@@ -1080,6 +1051,7 @@ swap_chain_iodone(struct bio *biox)
        struct buf *bufx;       /* chained sub-buffer */
        struct bio *nbio;       /* parent nbio with chain glue */
        struct buf *bp;         /* original bp associated with nbio */
+       int chain_empty;
 
        bufx = biox->bio_buf;
        nbio = biox->bio_caller_info1.cluster_parent;
@@ -1090,52 +1062,40 @@ swap_chain_iodone(struct bio *biox)
         */
         KKASSERT(bp != NULL);
        if (bufx->b_flags & B_ERROR) {
-               bp->b_flags |= B_ERROR;
+               atomic_set_int(&bp->b_flags, B_ERROR);
                bp->b_error = bufx->b_error;
        } else if (bufx->b_resid != 0) {
-               bp->b_flags |= B_ERROR;
+               atomic_set_int(&bp->b_flags, B_ERROR);
                bp->b_error = EINVAL;
        } else {
-               bp->b_resid -= bufx->b_bcount;
+               atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
        }
 
        /*
-        * Remove us from the chain.  It is sufficient to clean up 
-        * cluster_head.  Once the chain is operational cluster_tail
-        * may be used to indicate AUTOCHAINDONE.  Note that I/O's
-        * can complete while the swap system is still appending new
-        * BIOs to the chain.
+        * Remove us from the chain.
         */
+       spin_lock_wr(&bp->b_lock.lk_spinlock);
        nextp = &nbio->bio_caller_info1.cluster_head;
        while (*nextp != bufx) {
                KKASSERT(*nextp != NULL);
                nextp = &(*nextp)->b_cluster_next;
        }
        *nextp = bufx->b_cluster_next;
-       if (bp->b_flags & B_WANT) {
-               bp->b_flags &= ~B_WANT;
-               wakeup(bp);
-       }
+       chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
+       spin_unlock_wr(&bp->b_lock.lk_spinlock);
 
        /*
-        * Clean up bufx.  If this was the last buffer in the chain
-        * and AUTOCHAINDONE was set, finish off the original I/O
-        * as well.
-        *
-        * nbio was just a fake BIO layer to hold the cluster links,
-        * we can issue the biodone() on the layer above it.
+        * Clean up bufx.  If the chain is now empty we finish out
+        * the parent.  Note that we may be racing other completions
+        * so we must use the chain_empty status from above.
         */
-       if (nbio->bio_caller_info1.cluster_head == NULL &&
-           nbio->bio_caller_info2.cluster_tail == AUTOCHAINDONE
-       ) {
-               nbio->bio_caller_info2.cluster_tail = NULL;
+       if (chain_empty) {
                if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
-                       bp->b_flags |= B_ERROR;
+                       atomic_set_int(&bp->b_flags, B_ERROR);
                        bp->b_error = EINVAL;
                }
-               biodone(nbio->bio_prev);
+               biodone_chain(nbio);
         }
-        bufx->b_flags &= ~B_ASYNC;
         relpbuf(bufx, NULL);
 }
 
@@ -1518,7 +1478,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * asynchronous
                 */
                if (sync == FALSE) {
-                       bp->b_flags |= B_ASYNC;
                        bio->bio_done = swp_pager_async_iodone;
                        BUF_KERNPROC(bp);
                        vn_strategy(swapdev_vp, bio);
@@ -1529,22 +1488,17 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                }
 
                /*
-                * synchronous
-                */
-
-               bio->bio_done = swp_pager_sync_iodone;
-               vn_strategy(swapdev_vp, bio);
-
-               /*
+                * Issue synchronously.
+                *
                 * Wait for the sync I/O to complete, then update rtvals.
                 * We just set the rtvals[] to VM_PAGER_PEND so we can call
                 * our async completion routine at the end, thus avoiding a
                 * double-free.
                 */
-               crit_enter();
-
-               while (bp->b_cmd != BUF_CMD_DONE)
-                       tsleep(bp, 0, "swwrt", 0);
+               bio->bio_done = biodone_sync;
+               bio->bio_flags |= BIO_SYNC;
+               vn_strategy(swapdev_vp, bio);
+               biowait(bio, "swwrt");
 
                for (j = 0; j < n; ++j)
                        rtvals[i+j] = VM_PAGER_PEND;
@@ -1553,10 +1507,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * Now that we are through with the bp, we can call the
                 * normal async completion, which frees everything up.
                 */
-
                swp_pager_async_iodone(bio);
-
-               crit_exit();
        }
 }
 
@@ -1567,26 +1518,6 @@ swap_pager_newswap(void)
 }
 
 /*
- *     swap_pager_sync_iodone:
- *
- *     Completion routine for synchronous reads and writes from/to swap.
- *     We just mark the bp is complete and wake up anyone waiting on it.
- *
- *     This routine may not block.  This routine is called at splbio()
- *     or better.
- */
-
-static void
-swp_pager_sync_iodone(struct bio *bio)
-{
-       struct buf *bp = bio->bio_buf;
-
-       bp->b_flags &= ~B_ASYNC;
-       bp->b_cmd = BUF_CMD_DONE;
-       wakeup(bp);
-}
-
-/*
  *     swp_pager_async_iodone:
  *
  *     Completion routine for asynchronous reads and writes from/to swap.
@@ -1600,7 +1531,6 @@ swp_pager_sync_iodone(struct bio *bio)
  *
  *     This routine may not block.
  */
-
 static void
 swp_pager_async_iodone(struct bio *bio)
 {
@@ -1634,7 +1564,6 @@ swp_pager_async_iodone(struct bio *bio)
        /*
         * remove the mapping for kernel virtual
         */
-
        pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
 
        /*
@@ -1645,7 +1574,6 @@ swp_pager_async_iodone(struct bio *bio)
         * but do not free it in the rlist.  The errornous block(s) are thus
         * never reallocated as swap.  Redirty the page and continue.
         */
-
        for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                vm_page_t m = bp->b_xio.xio_pages[i];
 
@@ -1796,10 +1724,10 @@ swp_pager_async_iodone(struct bio *bio)
         */
        if (bp->b_cmd == BUF_CMD_READ)
                nswptr = &nsw_rcount;
-       else if (bp->b_flags & B_ASYNC)
-               nswptr = &nsw_wcount_async;
-       else
+       else if (bio->bio_flags & BIO_SYNC)
                nswptr = &nsw_wcount_sync;
+       else
+               nswptr = &nsw_wcount_async;
        bp->b_cmd = BUF_CMD_DONE;
        relpbuf(bp, nswptr);
        crit_exit();
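
swap_chain_iodone() has to tolerate sub-BIO completions racing both the dispatch loop and one another: the parent buffer's error and residual are updated with atomic operations, the unlink and the empty-chain test are done under the parent's spinlock, and only the completion that empties the chain finishes the parent with biodone_chain().  A reduced sketch of that sequence, with the second error branch and unlink details elided:

	/* Propagate status to the parent bp; peer completions may race this. */
	if (bufx->b_flags & B_ERROR) {
		atomic_set_int(&bp->b_flags, B_ERROR);
		bp->b_error = bufx->b_error;
	} else {
		atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
	}

	/* Unlink bufx and test for an empty chain atomically with the unlink. */
	spin_lock_wr(&bp->b_lock.lk_spinlock);
	/* ... unlink bufx from the cluster_head list ... */
	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
	spin_unlock_wr(&bp->b_lock.lk_spinlock);

	if (chain_empty)
		biodone_chain(nbio);	/* finish the parent exactly once */
	relpbuf(bufx, NULL);
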