HAMMER - Add live dedup sysctl and support
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
index a92e9c4..8ddb6a2 100644 (file)
@@ -47,6 +47,9 @@
 #include <sys/file.h>
 #include <vm/vm_extern.h>
 #include <vfs/fifofs/fifo.h>
+
+#include <sys/mplock2.h>
+
 #include "hammer.h"
 
 /*
@@ -160,7 +163,7 @@ void
 hammer_knote(struct vnode *vp, int flags)
 {
        if (flags)
-               KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
+               KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
 }
 
 #ifdef DEBUG_TRUNCATE
@@ -188,28 +191,121 @@ hammer_vop_vnoperate(struct vop_generic_args *)
  * fsync() an inode to disk and wait for it to be completely committed
  * such that the information would not be undone if a crash occured after
  * return.
+ *
+ * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
+ *      a REDO log.  A sysctl is provided to relax HAMMER's fsync()
+ *      operation.
+ *
+ *      Ultimately the combination of a REDO log and use of fast storage
+ *      to front-end cluster caches will make fsync fast, but it ain't
+ *      here yet.  And, in any case, we need real transactional
+ *      all-or-nothing features which are not restricted to a single file.
  */
 static
 int
 hammer_vop_fsync(struct vop_fsync_args *ap)
 {
        hammer_inode_t ip = VTOI(ap->a_vp);
+       hammer_mount_t hmp = ip->hmp;
+       int waitfor = ap->a_waitfor;
+       int mode;
+
+       lwkt_gettoken(&hmp->fs_token);
+
+       /*
+        * Fsync rule relaxation (default is either full synchronous flush
+        * or REDO semantics with synchronous flush).
+        */
+       if (ap->a_flags & VOP_FSYNC_SYSCALL) {
+               switch(hammer_fsync_mode) {
+               case 0:
+mode0:
+                       /* no REDO, full synchronous flush */
+                       goto skip;
+               case 1:
+mode1:
+                       /* no REDO, full asynchronous flush */
+                       if (waitfor == MNT_WAIT)
+                               waitfor = MNT_NOWAIT;
+                       goto skip;
+               case 2:
+                       /* REDO semantics, synchronous flush */
+                       if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+                               goto mode0;
+                       mode = HAMMER_FLUSH_UNDOS_AUTO;
+                       break;
+               case 3:
+                       /* REDO semantics, relaxed asynchronous flush */
+                       if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+                               goto mode1;
+                       mode = HAMMER_FLUSH_UNDOS_RELAXED;
+                       if (waitfor == MNT_WAIT)
+                               waitfor = MNT_NOWAIT;
+                       break;
+               case 4:
+                       /* ignore the fsync() system call */
+                       lwkt_reltoken(&hmp->fs_token);
+                       return(0);
+               default:
+                       /* we have to do something */
+                       mode = HAMMER_FLUSH_UNDOS_RELAXED;
+                       if (waitfor == MNT_WAIT)
+                               waitfor = MNT_NOWAIT;
+                       break;
+               }
 
+               /*
+                * Fast fsync only needs to flush the UNDO/REDO fifo if
+                * HAMMER_INODE_REDO is non-zero and the only modifications
+                * made to the file are write or write-extends.
+                */
+               if ((ip->flags & HAMMER_INODE_REDO) &&
+                   (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
+               ) {
+                       ++hammer_count_fsyncs;
+                       hammer_flusher_flush_undos(hmp, mode);
+                       ip->redo_count = 0;
+                       lwkt_reltoken(&hmp->fs_token);
+                       return(0);
+               }
+
+               /*
+                * REDO is enabled by fsync(), the idea being we really only
+                * want to lay down REDO records when programs are using
+                * fsync() heavily.  The first fsync() on the file starts
+                * the gravy train going and later fsync()s keep it hot by
+                * resetting the redo_count.
+                *
+                * We weren't running REDOs before now so we have to fall
+                * through and do a full fsync of what we have.
+                */
+               if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+                   (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
+                       ip->flags |= HAMMER_INODE_REDO;
+                       ip->redo_count = 0;
+               }
+       }
+skip:
+
+       /*
+        * Do a full flush sequence.
+        */
        ++hammer_count_fsyncs;
-       vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
+       vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
-       if (ap->a_waitfor == MNT_WAIT) {
+       if (waitfor == MNT_WAIT) {
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
+       lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
 }
 
 /*
  * hammer_vop_read { vp, uio, ioflag, cred }
  *
- * MPALMOSTSAFE
+ * MPSAFE (for the cache safe does not require fs_token)
  */
 static
 int
@@ -217,6 +313,7 @@ hammer_vop_read(struct vop_read_args *ap)
 {
        struct hammer_transaction trans;
        hammer_inode_t ip;
+       hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
@@ -225,12 +322,13 @@ hammer_vop_read(struct vop_read_args *ap)
        int seqcount;
        int ioseqcount;
        int blksize;
-       int got_mplock;
        int bigread;
+       int got_fstoken;
 
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
+       hmp = ip->hmp;
        error = 0;
        uio = ap->a_uio;
 
@@ -238,32 +336,18 @@ hammer_vop_read(struct vop_read_args *ap)
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
-       seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
-       ioseqcount = ap->a_ioflag >> 16;
+       seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
+       ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;
 
-       /*
-        * Temporary hack until more of HAMMER can be made MPSAFE.
-        */
-#ifdef SMP
-       if (curthread->td_mpcount) {
-               got_mplock = -1;
-               hammer_start_transaction(&trans, ip->hmp);
-       } else {
-               got_mplock = 0;
-       }
-#else
-       hammer_start_transaction(&trans, ip->hmp);
-       got_mplock = -1;
-#endif
-
        /*
         * If reading or writing a huge amount of data we have to break
         * atomicy and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);
+       got_fstoken = 0;
 
        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -297,9 +381,9 @@ hammer_vop_read(struct vop_read_args *ap)
                /*
                 * MPUNSAFE
                 */
-               if (got_mplock == 0) {
-                       got_mplock = 1;
-                       get_mplock();
+               if (got_fstoken == 0) {
+                       lwkt_gettoken(&hmp->fs_token);
+                       got_fstoken = 1;
                        hammer_start_transaction(&trans, ip->hmp);
                }
 
@@ -316,17 +400,23 @@ hammer_vop_read(struct vop_read_args *ap)
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
-                                            blksize, MAXPHYS,
-                                            seqcount, &bp);
+                                            blksize, uio->uio_resid,
+                                            seqcount * BKVASIZE, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
-                       kprintf("error %d\n", error);
                        brelse(bp);
                        break;
                }
 skip:
+               if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
+                       kprintf("doff %016jx read file %016jx@%016jx\n",
+                               (intmax_t)bp->b_bio2.bio_offset,
+                               (intmax_t)ip->obj_id,
+                               (intmax_t)bp->b_loffset);
+               }
+               bp->b_flags &= ~B_IODEBUG;
 
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = blksize - offset;
@@ -348,15 +438,14 @@ skip:
         * XXX only update the atime if we had to get the MP lock.
         * XXX hack hack hack, fixme.
         */
-       if (got_mplock) {
+       if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
-                       hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+                       hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
-               if (got_mplock > 0)
-                       rel_mplock();
+               lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
 }
@@ -379,7 +468,6 @@ hammer_vop_write(struct vop_write_args *ap)
        int error;
        int n;
        int flags;
-       int delta;
        int seqcount;
        int bigwrite;
 
@@ -397,6 +485,7 @@ hammer_vop_write(struct vop_write_args *ap)
        /*
         * Create a transaction to cover the operations we perform.
         */
+       lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;
 
@@ -414,11 +503,13 @@ hammer_vop_write(struct vop_write_args *ap)
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
+               lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
+               lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
 
@@ -426,8 +517,15 @@ hammer_vop_write(struct vop_write_args *ap)
         * If reading or writing a huge amount of data we have to break
         * atomicy and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
+        *
+        * Preset redo_count so we stop generating REDOs earlier if the
+        * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
+       if ((ip->flags & HAMMER_INODE_REDO) &&
+           ip->redo_count < hammer_limit_redo) {
+               ip->redo_count += uio->uio_resid;
+       }
 
        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -438,6 +536,9 @@ hammer_vop_write(struct vop_write_args *ap)
                int fixsize = 0;
                int blksize;
                int blkmask;
+               int trivial;
+               int endofblk;
+               off_t nsize;
 
                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
@@ -464,6 +565,22 @@ hammer_vop_write(struct vop_write_args *ap)
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);
 
+               /*
+                * Control the number of pending records associated with
+                * this inode.  If too many have accumulated start a
+                * flush.  Try to maintain a pipeline with the flusher.
+                */
+               if (ip->rsv_recs >= hammer_limit_inode_recs) {
+                       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+               }
+               if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
+                       while (ip->rsv_recs >= hammer_limit_inode_recs) {
+                               tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
+                       }
+                       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+               }
+
+#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
@@ -503,6 +620,7 @@ hammer_vop_write(struct vop_write_args *ap)
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
+#endif
 
                /*
                 * Calculate the blocksize at the current offset and figure
@@ -512,10 +630,26 @@ hammer_vop_write(struct vop_write_args *ap)
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
-               if (n > uio->uio_resid)
+               if (n > uio->uio_resid) {
                        n = uio->uio_resid;
-               if (uio->uio_offset + n > ip->ino_data.size) {
-                       vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
+                       endofblk = 0;
+               } else {
+                       endofblk = 1;
+               }
+               nsize = uio->uio_offset + n;
+               if (nsize > ip->ino_data.size) {
+                       if (uio->uio_offset > ip->ino_data.size)
+                               trivial = 0;
+                       else
+                               trivial = 1;
+                       nvextendbuf(ap->a_vp,
+                                   ip->ino_data.size,
+                                   nsize,
+                                   hammer_blocksize(ip->ino_data.size),
+                                   hammer_blocksize(nsize),
+                                   hammer_blockoff(ip->ino_data.size),
+                                   hammer_blockoff(nsize),
+                                   trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }
@@ -561,9 +695,35 @@ hammer_vop_write(struct vop_write_args *ap)
                        if (error == 0)
                                bheavy(bp);
                }
-               if (error == 0) {
-                       error = uiomove((char *)bp->b_data + offset,
-                                       n, uio);
+               if (error == 0)
+                       error = uiomove(bp->b_data + offset, n, uio);
+
+               /*
+                * Generate REDO records if enabled and redo_count will not
+                * exceed the limit.
+                *
+                * If redo_count exceeds the limit we stop generating records
+                * and clear HAMMER_INODE_REDO.  This will cause the next
+                * fsync() to do a full meta-data sync instead of just an
+                * UNDO/REDO fifo update.
+                *
+                * When clearing HAMMER_INODE_REDO any pre-existing REDOs
+                * will still be tracked.  The tracks will be terminated
+                * when the related meta-data (including possible data
+                * modifications which are not tracked via REDO) is
+                * flushed.
+                */
+               if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
+                       if (ip->redo_count < hammer_limit_redo) {
+                               bp->b_flags |= B_VFSFLAG1;
+                               error = hammer_generate_redo(&trans, ip,
+                                                    base_offset + offset,
+                                                    HAMMER_REDO_WRITE,
+                                                    bp->b_data + offset,
+                                                    (size_t)n);
+                       } else {
+                               ip->flags &= ~HAMMER_INODE_REDO;
+                       }
                }
 
                /*
@@ -573,8 +733,9 @@ hammer_vop_write(struct vop_write_args *ap)
                if (error) {
                        brelse(bp);
                        if (fixsize) {
-                               vtruncbuf(ap->a_vp, ip->ino_data.size,
-                                         hammer_blocksize(ip->ino_data.size));
+                               nvtruncbuf(ap->a_vp, ip->ino_data.size,
+                                         hammer_blocksize(ip->ino_data.size),
+                                         hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }
@@ -583,14 +744,13 @@ hammer_vop_write(struct vop_write_args *ap)
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
-                       flags = HAMMER_INODE_DDIRTY;
-                       vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
+                       flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
-               hammer_modify_inode(ip, flags);
+               hammer_modify_inode(&trans, ip, flags);
 
                /*
                 * Once we dirty the buffer any cached zone-X offset
@@ -602,23 +762,58 @@ hammer_vop_write(struct vop_write_args *ap)
 
                /*
                 * Final buffer disposition.
+                *
+                * Because meta-data updates are deferred, HAMMER is
+                * especially sensitive to excessive bdwrite()s because
+                * the I/O stream is not broken up by disk reads.  So the
+                * buffer cache simply cannot keep up.
+                *
+                * WARNING!  blksize is variable.  cluster_write() is
+                *           expected to not blow up if it encounters
+                *           buffers that do not match the passed blksize.
+                *
+                * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
+                *        The ip->rsv_recs check should burst-flush the data.
+                *        If we queue it immediately the buf could be left
+                *        locked on the device queue for a very long time.
+                *
+                * NOTE!  To avoid degenerate stalls due to mismatched block
+                *        sizes we only honor IO_DIRECT on the write which
+                *        abuts the end of the buffer.  However, we must
+                *        honor IO_SYNC in case someone is silly enough to
+                *        configure a HAMMER file as swap, or when HAMMER
+                *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
-               } else if (ap->a_ioflag & IO_DIRECT) {
+               } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else {
+#if 0
+               if (offset + n == blksize) {
+                       if (hammer_cluster_enable == 0 ||
+                           (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
+                               bawrite(bp);
+                       } else {
+                               cluster_write(bp, ip->ino_data.size,
+                                             blksize, seqcount);
+                       }
+               } else {
+#endif
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
 /*
  * hammer_vop_access { vp, mode, cred }
+ *
+ * MPSAFE - does not require fs_token
  */
 static
 int
@@ -640,6 +835,8 @@ hammer_vop_access(struct vop_access_args *ap)
 
 /*
  * hammer_vop_advlock { vp, id, op, fl, flags }
+ *
+ * MPSAFE - does not require fs_token
  */
 static
 int
@@ -652,12 +849,30 @@ hammer_vop_advlock(struct vop_advlock_args *ap)
 
 /*
  * hammer_vop_close { vp, fflag }
+ *
+ * We can only sync-on-close for normal closes.  XXX disabled for now.
  */
 static
 int
 hammer_vop_close(struct vop_close_args *ap)
 {
-       /*hammer_inode_t ip = VTOI(ap->a_vp);*/
+#if 0
+       struct vnode *vp = ap->a_vp;
+       hammer_inode_t ip = VTOI(vp);
+       int waitfor;
+       if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
+               if (vn_islocked(vp) == LK_EXCLUSIVE &&
+                   (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
+                       if (ip->flags & HAMMER_INODE_CLOSESYNC)
+                               waitfor = MNT_WAIT;
+                       else
+                               waitfor = MNT_NOWAIT;
+                       ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
+                                      HAMMER_INODE_CLOSEASYNC);
+                       VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
+               }
+       }
+#endif
        return (vop_stdclose(ap));
 }
 
@@ -675,20 +890,23 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
+       hammer_mount_t hmp;
        int error;
 
        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
        /*
         * Create a transaction to cover the operations we perform.
         */
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -703,6 +921,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
+               lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
 
@@ -733,6 +952,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -744,7 +964,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
  * The atime field is stored in the B-Tree element and allowed to be
  * updated without cycling the element.
  *
- * MPSAFE
+ * MPSAFE - does not require fs_token
  */
 static
 int
@@ -825,8 +1045,6 @@ hammer_vop_getattr(struct vop_getattr_args *ap)
 
        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
-       /* mtime uniquely identifies any adjustments made to the file XXX */
-       vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
@@ -857,6 +1075,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
 {
        struct hammer_transaction trans;
        struct namecache *ncp;
+       hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
@@ -884,8 +1103,10 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
+       hmp = dip->hmp;
 
-       hammer_simple_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;
 
        for (i = 0; i < nlen; ++i) {
@@ -1040,6 +1261,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
        }
 done:
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1066,6 +1288,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        int64_t parent_obj_id;
        u_int32_t parent_obj_localization;
        hammer_tid_t asof;
@@ -1073,11 +1296,13 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 
        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
+       hmp = dip->hmp;
 
        /*
         * Whos are parent?  This could be the root of a pseudo-filesystem
         * whos parent is in another localization domain.
         */
+       lwkt_gettoken(&hmp->fs_token);
        parent_obj_id = dip->ino_data.parent_obj_id;
        if (dip->obj_id == HAMMER_OBJID_ROOT)
                parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
@@ -1086,19 +1311,20 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 
        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
-                  asof != dip->hmp->asof) {
+                  asof != hmp->asof) {
                        parent_obj_id = dip->obj_id;
-                       asof = dip->hmp->asof;
+                       asof = hmp->asof;
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                  (long long)dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
+                       lwkt_reltoken(&hmp->fs_token);
                        return ENOENT;
                }
        }
 
-       hammer_simple_transaction(&trans, dip->hmp);
+       hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;
 
        ip = hammer_get_inode(&trans, dip, parent_obj_id,
@@ -1111,6 +1337,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1125,6 +1352,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
+       hammer_mount_t hmp;
        int error;
 
        if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
@@ -1133,6 +1361,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);
+       hmp = dip->hmp;
 
        if (dip->obj_localization != ip->obj_localization)
                return(EXDEV);
@@ -1141,13 +1370,14 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
        /*
         * Create a transaction to cover the operations we perform.
         */
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -1169,6 +1399,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap)
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, NOTE_LINK);
        hammer_knote(ap->a_dvp, NOTE_WRITE);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1186,20 +1417,23 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
+       hammer_mount_t hmp;
        int error;
 
        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
        /*
         * Create a transaction to cover the operations we perform.
         */
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -1213,6 +1447,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
                hkprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
+               lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
        /*
@@ -1242,6 +1477,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1259,20 +1495,23 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
+       hammer_mount_t hmp;
        int error;
 
        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
        /*
         * Create a transaction to cover the operations we perform.
         */
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -1287,6 +1526,7 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
+               lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
 
@@ -1315,11 +1555,14 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
 /*
  * hammer_vop_open { vp, mode, cred, fp }
+ *
+ * MPSAFE (does not require fs_token)
  */
 static
 int
@@ -1355,6 +1598,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        struct uio *uio;
        hammer_base_elm_t base;
        int error;
@@ -1369,6 +1613,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;
+       hmp = ip->hmp;
 
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
@@ -1382,7 +1627,8 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
                cookie_index = 0;
        }
 
-       hammer_simple_transaction(&trans, ip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_simple_transaction(&trans, hmp);
 
        /*
         * Handle artificial entries
@@ -1497,6 +1743,7 @@ done:
                        *ap->a_cookies = cookies;
                }
        }
+       lwkt_reltoken(&hmp->fs_token);
        return(error);
 }
 
@@ -1510,12 +1757,16 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        char buf[32];
        u_int32_t localization;
        hammer_pseudofs_inmem_t pfsm;
        int error;
 
        ip = VTOI(ap->a_vp);
+       hmp = ip->hmp;
+
+       lwkt_gettoken(&hmp->fs_token);
 
        /*
         * Shortcut if the symlink data was stuffed into ino_data.
@@ -1534,7 +1785,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
                    ip->obj_asof == HAMMER_MAX_TID &&
                    ip->obj_localization == 0 &&
                    strncmp(ptr, "@@PFS", 5) == 0) {
-                       hammer_simple_transaction(&trans, ip->hmp);
+                       hammer_simple_transaction(&trans, hmp);
                        bcopy(ptr + 5, buf, 5);
                        buf[5] = 0;
                        localization = strtoul(buf, NULL, 10) << 16;
@@ -1564,17 +1815,18 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
                                bytes = strlen(buf);
                        }
                        if (pfsm)
-                               hammer_rel_pseudofs(trans.hmp, pfsm);
+                               hammer_rel_pseudofs(hmp, pfsm);
                        hammer_done_transaction(&trans);
                }
                error = uiomove(ptr, bytes, ap->a_uio);
+               lwkt_reltoken(&hmp->fs_token);
                return(error);
        }
 
        /*
         * Long version
         */
-       hammer_simple_transaction(&trans, ip->hmp);
+       hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 
@@ -1607,6 +1859,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return(error);
 }
 
@@ -1619,21 +1872,25 @@ hammer_vop_nremove(struct vop_nremove_args *ap)
 {
        struct hammer_transaction trans;
        struct hammer_inode *dip;
+       hammer_mount_t hmp;
        int error;
 
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (hammer_nohistory(dip) == 0 &&
-           (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+           (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
                return (error);
        }
 
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1650,6 +1907,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
        struct hammer_inode *fdip;
        struct hammer_inode *tdip;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        struct hammer_cursor cursor;
        int64_t namekey;
        u_int32_t max_iterations;
@@ -1667,6 +1925,8 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
        ip = VTOI(fncp->nc_vp);
        KKASSERT(ip != NULL);
 
+       hmp = ip->hmp;
+
        if (fdip->obj_localization != tdip->obj_localization)
                return(EXDEV);
        if (fdip->obj_localization != ip->obj_localization)
@@ -1678,10 +1938,11 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
-       hammer_start_transaction(&trans, fdip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -1700,7 +1961,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        ip->ino_data.ctime = trans.time;
-                       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
@@ -1774,18 +2035,34 @@ retry:
 
        /*
         * Cleanup and tell the kernel that the rename succeeded.
+        *
+        * NOTE: ip->vp, if non-NULL, cannot be directly referenced
+        *       without formally acquiring the vp since the vp might
+        *       have zero refs on it, or in the middle of a reclaim,
+        *       etc.
         */
         hammer_done_cursor(&cursor);
        if (error == 0) {
                cache_rename(ap->a_fnch, ap->a_tnch);
                hammer_knote(ap->a_fdvp, NOTE_WRITE);
                hammer_knote(ap->a_tdvp, NOTE_WRITE);
-               if (ip->vp)
-                       hammer_knote(ip->vp, NOTE_RENAME);
+               while (ip->vp) {
+                       struct vnode *vp;
+
+                       error = hammer_get_vnode(ip, &vp);
+                       if (error == 0 && vp) {
+                               vn_unlock(vp);
+                               hammer_knote(ip->vp, NOTE_RENAME);
+                               vrele(vp);
+                               break;
+                       }
+                       kprintf("Debug: HAMMER ip/vp race2 avoided\n");
+               }
        }
 
 failed:
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1798,21 +2075,25 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
 {
        struct hammer_transaction trans;
        struct hammer_inode *dip;
+       hammer_mount_t hmp;
        int error;
 
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (hammer_nohistory(dip) == 0 &&
-           (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+           (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
                return (error);
        }
 
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
        hammer_done_transaction(&trans);
        if (error == 0)
                hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -1825,21 +2106,25 @@ hammer_vop_markatime(struct vop_markatime_args *ap)
 {
        struct hammer_transaction trans;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
 
        ip = VTOI(ap->a_vp);
        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
+       hmp = ip->hmp;
+       if (hmp->mp->mnt_flag & MNT_NOATIME)
                return (0);
-       hammer_start_transaction(&trans, ip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        ip->ino_data.atime = trans.time;
-       hammer_modify_inode(ip, HAMMER_INODE_ATIME);
+       hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, NOTE_ATTRIB);
+       lwkt_reltoken(&hmp->fs_token);
        return (0);
 }
 
@@ -1851,31 +2136,36 @@ int
 hammer_vop_setattr(struct vop_setattr_args *ap)
 {
        struct hammer_transaction trans;
-       struct vattr *vap;
        struct hammer_inode *ip;
+       struct vattr *vap;
+       hammer_mount_t hmp;
        int modflags;
        int error;
        int truncating;
        int blksize;
        int kflags;
+#if 0
        int64_t aligned_size;
+#endif
        u_int32_t flags;
 
        vap = ap->a_vap;
        ip = ap->a_vp->v_data;
        modflags = 0;
        kflags = 0;
+       hmp = ip->hmp;
 
        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return(EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (hammer_nohistory(ip) == 0 &&
-           (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
+           (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
                return (error);
        }
 
-       hammer_start_transaction(&trans, ip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
        error = 0;
 
@@ -1935,28 +2225,58 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                case VREG:
                        if (vap->va_size == ip->ino_data.size)
                                break;
+
+                       /*
+                        * Log the operation if in fast-fsync mode or if
+                        * there are unterminated redo write records present.
+                        *
+                        * The second check is needed so the recovery code
+                        * properly truncates write redos even if the nominal
+                        * REDO operation is turned off due to excessive
+                        * writes, because the related records might be
+                        * destroyed and never lay down a TERM_WRITE.
+                        */
+                       if ((ip->flags & HAMMER_INODE_REDO) ||
+                           (ip->flags & HAMMER_INODE_RDIRTY)) {
+                               error = hammer_generate_redo(&trans, ip,
+                                                            vap->va_size,
+                                                            HAMMER_REDO_TRUNC,
+                                                            NULL, 0);
+                       }
+                       blksize = hammer_blocksize(vap->va_size);
+
                        /*
                         * XXX break atomicy, we can deadlock the backend
                         * if we do not release the lock.  Probably not a
                         * big deal here.
                         */
-                       blksize = hammer_blocksize(vap->va_size);
                        if (vap->va_size < ip->ino_data.size) {
-                               vtruncbuf(ap->a_vp, vap->va_size, blksize);
+                               nvtruncbuf(ap->a_vp, vap->va_size,
+                                          blksize,
+                                          hammer_blockoff(vap->va_size));
                                truncating = 1;
                                kflags |= NOTE_WRITE;
                        } else {
-                               vnode_pager_setsize(ap->a_vp, vap->va_size);
+                               nvextendbuf(ap->a_vp,
+                                           ip->ino_data.size,
+                                           vap->va_size,
+                                           hammer_blocksize(ip->ino_data.size),
+                                           hammer_blocksize(vap->va_size),
+                                           hammer_blockoff(ip->ino_data.size),
+                                           hammer_blockoff(vap->va_size),
+                                           0);
                                truncating = 0;
                                kflags |= NOTE_WRITE | NOTE_EXTEND;
                        }
                        ip->ino_data.size = vap->va_size;
                        ip->ino_data.mtime = trans.time;
+                       /* XXX safe to use SDIRTY instead of DDIRTY here? */
                        modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
 
                        /*
-                        * on-media truncation is cached in the inode until
-                        * the inode is synchronized.
+                        * On-media truncation is cached in the inode until
+                        * the inode is synchronized.  We must immediately
+                        * handle any frontend records.
                         */
                        if (truncating) {
                                hammer_ip_frontend_trunc(ip, vap->va_size);
@@ -1988,34 +2308,20 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                                }
                        }
 
+#if 0
                        /*
-                        * If truncating we have to clean out a portion of
-                        * the last block on-disk.  We do this in the
-                        * front-end buffer cache.
+                        * When truncating, nvtruncbuf() may have cleaned out
+                        * a portion of the last block on-disk in the buffer
+                        * cache.  We must clean out any frontend records
+                        * for blocks beyond the new last block.
                         */
                        aligned_size = (vap->va_size + (blksize - 1)) &
                                       ~(int64_t)(blksize - 1);
                        if (truncating && vap->va_size < aligned_size) {
-                               struct buf *bp;
-                               int offset;
-
                                aligned_size -= blksize;
-
-                               offset = (int)vap->va_size & (blksize - 1);
-                               error = bread(ap->a_vp, aligned_size,
-                                             blksize, &bp);
                                hammer_ip_frontend_trunc(ip, aligned_size);
-                               if (error == 0) {
-                                       bzero(bp->b_data + offset,
-                                             blksize - offset);
-                                       /* must de-cache direct-io offset */
-                                       bp->b_bio2.bio_offset = NOOFFSET;
-                                       bdwrite(bp);
-                               } else {
-                                       kprintf("ERROR %d\n", error);
-                                       brelse(bp);
-                               }
                        }
+#endif
                        break;
                case VDATABASE:
                        if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
@@ -2062,9 +2368,10 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
        }
 done:
        if (error == 0)
-               hammer_modify_inode(ip, modflags);
+               hammer_modify_inode(&trans, ip, modflags);
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -2078,8 +2385,9 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
-       struct nchandle *nch;
        hammer_record_t record;
+       struct nchandle *nch;
+       hammer_mount_t hmp;
        int error;
        int bytes;
 
@@ -2087,16 +2395,18 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 
        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
-       if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
+       if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);
 
        /*
         * Create a transaction to cover the operations we perform.
         */
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
 
        /*
@@ -2110,6 +2420,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
+               lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
 
@@ -2141,7 +2452,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
                 */
                if (error == 0) {
                        nip->ino_data.size = bytes;
-                       hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
+                       hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error == 0)
@@ -2164,6 +2475,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
                }
        }
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return (error);
 }
 
@@ -2176,20 +2488,24 @@ hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
 {
        struct hammer_transaction trans;
        struct hammer_inode *dip;
+       hammer_mount_t hmp;
        int error;
 
        dip = VTOI(ap->a_dvp);
+       hmp = dip->hmp;
 
        if (hammer_nohistory(dip) == 0 &&
-           (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
+           (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
                return (error);
        }
 
-       hammer_start_transaction(&trans, dip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
                                ap->a_cred, ap->a_flags, -1);
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
 
        return (error);
 }
@@ -2202,10 +2518,15 @@ int
 hammer_vop_ioctl(struct vop_ioctl_args *ap)
 {
        struct hammer_inode *ip = ap->a_vp->v_data;
+       hammer_mount_t hmp = ip->hmp;
+       int error;
 
        ++hammer_stats_file_iopsr;
-       return(hammer_ioctl(ip, ap->a_command, ap->a_data,
-                           ap->a_fflag, ap->a_cred));
+       lwkt_gettoken(&hmp->fs_token);
+       error = hammer_ioctl(ip, ap->a_command, ap->a_data,
+                            ap->a_fflag, ap->a_cred);
+       lwkt_reltoken(&hmp->fs_token);
+       return (error);
 }
 
 static
@@ -2229,8 +2550,9 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
        KKASSERT(mp->mnt_data != NULL);
        hmp = (struct hammer_mount *)mp->mnt_data;
 
-       switch(ap->a_op) {
+       lwkt_gettoken(&hmp->fs_token);
 
+       switch(ap->a_op) {
        case MOUNTCTL_SET_EXPORT:
                if (ap->a_ctllen != sizeof(struct export_args))
                        error = EINVAL;
@@ -2251,7 +2573,8 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
                usedbytes = *ap->a_res;
 
                if (usedbytes > 0 && usedbytes < ap->a_buflen) {
-                       usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
+                       usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
+                                                   ap->a_buf,
                                                    ap->a_buflen - usedbytes,
                                                    &error);
                }
@@ -2263,6 +2586,7 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap)
                error = vop_stdmountctl(ap);
                break;
        }
+       lwkt_reltoken(&hmp->fs_token);
        return(error);
 }
 
@@ -2298,6 +2622,9 @@ hammer_vop_strategy(struct vop_strategy_args *ap)
                biodone(ap->a_bio);
                break;
        }
+
+       /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
+
        return (error);
 }
 
@@ -2319,6 +2646,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct hammer_inode *dip;
+       hammer_mount_t hmp;
        struct hammer_cursor cursor;
        hammer_base_elm_t base;
        hammer_off_t disk_offset;
@@ -2336,6 +2664,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = ap->a_vp->v_data;
+       hmp = ip->hmp;
 
        /*
         * The zone-2 disk offset may have been set by the cluster code via
@@ -2346,7 +2675,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
        nbio = push_bio(bio);
        if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
            HAMMER_ZONE_LARGE_DATA) {
-               error = hammer_io_direct_read(ip->hmp, nbio, NULL);
+               lwkt_gettoken(&hmp->fs_token);
+               error = hammer_io_direct_read(hmp, nbio, NULL);
+               lwkt_reltoken(&hmp->fs_token);
                return (error);
        }
 
@@ -2354,7 +2685,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
         * Well, that sucked.  Do it the hard way.  If all the stars are
         * aligned we may still be able to issue a direct-read.
         */
-       hammer_simple_transaction(&trans, ip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_simple_transaction(&trans, hmp);
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 
        /*
@@ -2455,8 +2787,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                 * buffers and frontend-owned in-memory records synchronously.
                 */
                if (ip->flags & HAMMER_INODE_TRUNCATED) {
-                       if (hammer_cursor_ondisk(&cursor) ||
-                           cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
+                       if (hammer_cursor_ondisk(&cursor)/* ||
+                           cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
                                if (ip->trunc_off <= rec_offset)
                                        n = 0;
                                else if (ip->trunc_off < rec_offset + n)
@@ -2488,8 +2820,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                        KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
                                 HAMMER_ZONE_LARGE_DATA);
                        nbio->bio_offset = disk_offset;
-                       error = hammer_io_direct_read(trans.hmp, nbio,
-                                                     cursor.leaf);
+                       error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
+                       if (hammer_live_dedup)
+                               hammer_dedup_cache_add(ip, cursor.leaf);
                        goto done;
                } else if (n) {
                        error = hammer_ip_resolve_data(&cursor);
@@ -2501,6 +2834,13 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                if (error)
                        break;
 
+               /*
+                * We have to be sure that the only elements added to the
+                * dedup cache are those which are already on-media.
+                */
+               if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
+                       hammer_dedup_cache_add(ip, cursor.leaf);
+
                /*
                 * Iterate until we have filled the request.
                 */
@@ -2550,6 +2890,7 @@ done:
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
        return(error);
 }
 
@@ -2574,6 +2915,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 {
        struct hammer_transaction trans;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        struct hammer_cursor cursor;
        hammer_base_elm_t base;
        int64_t rec_offset;
@@ -2590,6 +2932,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
 
        ++hammer_stats_file_iopsr;
        ip = ap->a_vp->v_data;
+       hmp = ip->hmp;
 
        /*
         * We can only BMAP regular files.  We can't BMAP database files,
@@ -2609,7 +2952,8 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
         * Scan the B-Tree to acquire blockmap addresses, then translate
         * to raw addresses.
         */
-       hammer_simple_transaction(&trans, ip->hmp);
+       lwkt_gettoken(&hmp->fs_token);
+       hammer_simple_transaction(&trans, hmp);
 #if 0
        kprintf("bmap_beg %016llx ip->cache %p\n",
                (long long)ap->a_loffset, ip->cache[1]);
@@ -2711,7 +3055,11 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
                        }
                        last_offset = rec_offset + rec_len;
                        last_disk_offset = disk_offset + rec_len;
+
+                       if (hammer_live_dedup)
+                               hammer_dedup_cache_add(ip, cursor.leaf);
                }
+
                error = hammer_ip_next(&cursor);
        }
 
@@ -2734,6 +3082,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
+       lwkt_reltoken(&hmp->fs_token);
 
        /*
         * If we couldn't find any records or the records we did find were
@@ -2826,6 +3175,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
                return(EROFS);
        }
 
+       lwkt_gettoken(&hmp->fs_token);
+
        /*
         * Interlock with inode destruction (no in-kernel or directory
         * topology visibility).  If we queue new IO while trying to
@@ -2840,6 +3191,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
            (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
                bp->b_resid = 0;
                biodone(ap->a_bio);
+               lwkt_reltoken(&hmp->fs_token);
                return(0);
        }
 
@@ -2869,8 +3221,24 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 
        record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
                                    bytes, &error);
+
+       /*
+        * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
+        * in hammer_vop_write().  We must flag the record so the proper
+        * REDO_TERM_WRITE entry is generated during the flush.
+        */
        if (record) {
-               hammer_io_direct_write(hmp, record, bio);
+               if (bp->b_flags & B_VFSFLAG1) {
+                       record->flags |= HAMMER_RECF_REDO;
+                       bp->b_flags &= ~B_VFSFLAG1;
+               }
+               if (record->flags & HAMMER_RECF_DEDUPED) {
+                       bp->b_resid = 0;
+                       hammer_ip_replace_bulk(hmp, record);
+                       biodone(ap->a_bio);
+               } else {
+                       hammer_io_direct_write(hmp, bio, record);
+               }
                if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
                        hammer_flush_inode(ip, 0);
        } else {
@@ -2879,6 +3247,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
                bp->b_flags |= B_ERROR;
                biodone(ap->a_bio);
        }
+       lwkt_reltoken(&hmp->fs_token);
        return(error);
 }
 
@@ -2895,6 +3264,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
+       hammer_mount_t hmp;
        struct hammer_cursor cursor;
        int64_t namekey;
        u_int32_t max_iterations;
@@ -2909,6 +3279,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
         */
        dip = VTOI(dvp);
        ncp = nch->ncp;
+       hmp = dip->hmp;
 
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
@@ -2965,7 +3336,7 @@ retry:
        if (error == 0) {
                hammer_unlock(&cursor.ip->lock);
                ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
-                                     dip->hmp->asof,
+                                     hmp->asof,
                                      cursor.data->entry.localization,
                                      0, &error);
                hammer_lock_sh(&cursor.ip->lock);
@@ -3003,6 +3374,9 @@ retry:
                 *
                 * If any changes whatsoever have been made to the cursor
                 * set EDEADLK and retry.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor()
+                *          function.
                 */
                if (error == 0 && ip && ip->ino_data.obj_type ==
                                        HAMMER_OBJTYPE_DIRECTORY) {
@@ -3032,10 +3406,30 @@ retry:
                if (error == 0) {
                        cache_setunresolved(nch);
                        cache_setvp(nch, NULL);
-                       /* XXX locking */
-                       if (ip && ip->vp) {
-                               hammer_knote(ip->vp, NOTE_DELETE);
-                               cache_inval_vp(ip->vp, CINV_DESTROY);
+
+                       /*
+                        * NOTE: ip->vp, if non-NULL, cannot be directly
+                        *       referenced without formally acquiring the
+                        *       vp since the vp might have zero refs on it,
+                        *       or in the middle of a reclaim, etc.
+                        *
+                        * NOTE: The cache_setunresolved() can rip the vp
+                        *       out from under us since the vp may not have
+                        *       any refs, in which case ip->vp will be NULL
+                        *       from the outset.
+                        */
+                       while (ip && ip->vp) {
+                               struct vnode *vp;
+
+                               error = hammer_get_vnode(ip, &vp);
+                               if (error == 0 && vp) {
+                                       vn_unlock(vp);
+                                       hammer_knote(ip->vp, NOTE_DELETE);
+                                       cache_inval_vp(ip->vp, CINV_DESTROY);
+                                       vrele(vp);
+                                       break;
+                               }
+                               kprintf("Debug: HAMMER ip/vp race1 avoided\n");
                        }
                }
                if (ip)
@@ -3054,7 +3448,6 @@ retry:
  ************************************************************************
  *
  */
-
 static int
 hammer_vop_fifoclose (struct vop_close_args *ap)
 {
@@ -3105,11 +3498,11 @@ static int filt_hammerwrite(struct knote *kn, long hint);
 static int filt_hammervnode(struct knote *kn, long hint);
 
 static struct filterops hammerread_filtops =
-       { 1, NULL, filt_hammerdetach, filt_hammerread };
+       { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
 static struct filterops hammerwrite_filtops =
-       { 1, NULL, filt_hammerdetach, filt_hammerwrite };
+       { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
 static struct filterops hammervnode_filtops =
-       { 1, NULL, filt_hammerdetach, filt_hammervnode };
+       { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
 
 static
 int
@@ -3117,7 +3510,6 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
 {
        struct vnode *vp = ap->a_vp;
        struct knote *kn = ap->a_kn;
-       lwkt_tokref vlock;
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
@@ -3130,14 +3522,12 @@ hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
                kn->kn_fop = &hammervnode_filtops;
                break;
        default:
-               return (1);
+               return (EOPNOTSUPP);
        }
 
        kn->kn_hook = (caddr_t)vp;
 
-       lwkt_gettoken(&vlock, &vp->v_token);
-       SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
-       lwkt_reltoken(&vlock);
+       knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 
        return(0);
 }
@@ -3146,12 +3536,8 @@ static void
 filt_hammerdetach(struct knote *kn)
 {
        struct vnode *vp = (void *)kn->kn_hook;
-       lwkt_tokref vlock;
 
-       lwkt_gettoken(&vlock, &vp->v_token);
-       SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
-                    kn, knote, kn_selnext);
-       lwkt_reltoken(&vlock);
+       knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 }
 
 static int
@@ -3159,12 +3545,19 @@ filt_hammerread(struct knote *kn, long hint)
 {
        struct vnode *vp = (void *)kn->kn_hook;
        hammer_inode_t ip = VTOI(vp);
+       hammer_mount_t hmp = ip->hmp;
+       off_t off;
 
        if (hint == NOTE_REVOKE) {
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);
                return(1);
        }
-       kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
+       lwkt_gettoken(&hmp->fs_token);  /* XXX use per-ip-token */
+       off = ip->ino_data.size - kn->kn_fp->f_offset;
+       kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
+       lwkt_reltoken(&hmp->fs_token);
+       if (kn->kn_sfflags & NOTE_OLDAPI)
+               return(1);
        return (kn->kn_data != 0);
 }