HAMMER 60J/Many: Mirroring
author Matthew Dillon <dillon@dragonflybsd.org>
Thu, 10 Jul 2008 04:44:33 +0000 (04:44 +0000)
committer Matthew Dillon <dillon@dragonflybsd.org>
Thu, 10 Jul 2008 04:44:33 +0000 (04:44 +0000)
Finish implementing the core mirroring algorithm.  The last bit was to add
support for no-history deletions on the master.  The same support also covers
masters which have pruned records away prior to the mirroring operation.
As with the work done previously, the algorithm is 100% queue-less and
has no age limitations.  You could wait a month, and then do a mirroring
update from master to slave, and the algorithm will efficiently handle it.

The basic issue that this commit tackles is what to do when records are
physically deleted from the master.  When this occurs the mirror master
cannot provide a list of records to delete to its slaves.

The solution is to use the mirror TID propagation to physically identify
swaths of the B-Tree in which a deletion MAY have taken place.  The
mirroring code uses this information to generate PASS and SKIP mrecords.

A PASS identifies a record (sans its data payload) that remains within
the identified swath and should already exist on the target.  The
mirroring target does a simultaneous iteration of the same swath on the
target B-Tree and deletes records not identified by the master.

A SKIP is the heart of the algorithm's efficiency.  The same mirror TID
stored in the B-Tree can also identify large swaths of the B-Tree for which
*NO* deletions have taken place (which will be most of the B-Tree).  One
SKIP Record can identify an arbitrarily large swath.  The target uses
the SKIP record to skip that swath on the target.  No scan takes place.
SKIP records can be generated from any internal node of the B-Tree and cover
that node's entire sub-tree.

This also provides us with the feature where the retention policy can be
completely different between a master and a mirror, or between mirrors.
When the slave identifies a record that must be deleted through the above
algorithm it only needs to mark it as historically deleted; it does not
have to physically delete the record.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_ioctl.h
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_subs.c
sys/vfs/hammer/hammer_vnops.c

index 94f23da..ea1070d 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.108 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.109 2008/07/10 04:44:33 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -886,7 +886,8 @@ int hammer_btree_iterate_reverse(hammer_cursor_t cursor);
 int    hammer_btree_insert(hammer_cursor_t cursor,
                            hammer_btree_leaf_elm_t elm, int *doprop);
 int    hammer_btree_delete(hammer_cursor_t cursor);
-void   hammer_btree_do_propagation(hammer_cursor_t cursor, hammer_inode_t ip,
+void   hammer_btree_do_propagation(hammer_cursor_t cursor,
+                           hammer_pseudofs_inmem_t pfsm,
                            hammer_btree_leaf_elm_t leaf);
 int    hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2);
 int    hammer_btree_chkts(hammer_tid_t ts, hammer_base_elm_t key);
index 8713a4c..495702f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.67 2008/07/08 04:34:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.68 2008/07/10 04:44:33 dillon Exp $
  */
 
 /*
@@ -91,6 +91,7 @@ static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,
                        hammer_tid_t mirror_tid);
 static void hammer_make_separator(hammer_base_elm_t key1,
                        hammer_base_elm_t key2, hammer_base_elm_t dest);
+static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);
 
 /*
  * Iterate records after a search.  The cursor is iterated forwards past
@@ -231,13 +232,15 @@ hammer_btree_iterate(hammer_cursor_t cursor)
 
                        /*
                         * If running the mirror filter see if we can skip
-                        * the entire sub-tree.
+                        * one or more entire sub-trees.  If we can we
+                        * return the internal node and the caller processes
+                        * the skipped range (see mirror_read)
                         */
                        if (cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) {
                                if (elm->internal.mirror_tid <
-                                   cursor->mirror_tid) {
-                                       ++cursor->index;
-                                       continue;
+                                   cursor->cmirror->mirror_tid) {
+                                       hammer_cursor_mirror_filter(cursor);
+                                       return(0);
                                }
                        }
 
@@ -317,6 +320,51 @@ hammer_btree_iterate(hammer_cursor_t cursor)
        return(error);
 }
 
+/*
+ * We hit an internal element that we could skip as part of a mirroring
+ * scan.  Calculate the entire range being skipped.
+ *
+ * It is important to include any gaps between the parent's left_bound
+ * and the node's left_bound, and same goes for the right side.
+ */
+static void
+hammer_cursor_mirror_filter(hammer_cursor_t cursor)
+{
+       struct hammer_cmirror *cmirror;
+       hammer_node_ondisk_t ondisk;
+       hammer_btree_elm_t elm;
+
+       ondisk = cursor->node->ondisk;
+       cmirror = cursor->cmirror;
+
+       /*
+        * Calculate the skipped range
+        */
+       elm = &ondisk->elms[cursor->index];
+       if (cursor->index == 0)
+               cmirror->skip_beg = *cursor->left_bound;
+       else
+               cmirror->skip_beg = elm->internal.base;
+       while (cursor->index < ondisk->count) {
+               if (elm->internal.mirror_tid >= cmirror->mirror_tid)
+                       break;
+               ++cursor->index;
+               ++elm;
+       }
+       if (cursor->index == ondisk->count)
+               cmirror->skip_end = *cursor->right_bound;
+       else
+               cmirror->skip_end = elm->internal.base;
+
+       /*
+        * clip the returned result.
+        */
+       if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
+               cmirror->skip_beg = cursor->key_beg;
+       if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
+               cmirror->skip_end = cursor->key_end;
+}
+
 /*
  * Iterate in the reverse direction.  This is used by the pruning code to
  * avoid overlapping records.
@@ -330,6 +378,9 @@ hammer_btree_iterate_reverse(hammer_cursor_t cursor)
        int r;
        int s;
 
+       /* mirror filtering not supported for reverse iteration */
+       KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);
+
        /*
         * Skip past the current record.  For various reasons the cursor
         * may end up set to -1 or set to point at the end of the current
@@ -2093,9 +2144,6 @@ btree_remove(hammer_cursor_t cursor)
        } else {
                KKASSERT(parent->ondisk->count > 1);
 
-               /*
-                * Delete the subtree reference in the parent
-                */
                hammer_modify_node_all(cursor->trans, parent);
                ondisk = parent->ondisk;
                KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
@@ -2103,6 +2151,36 @@ btree_remove(hammer_cursor_t cursor)
                elm = &ondisk->elms[cursor->parent_index];
                KKASSERT(elm->internal.subtree_offset == node->node_offset);
                KKASSERT(ondisk->count > 0);
+
+               /*
+                * We must retain the highest mirror_tid.  The deleted
+                * range is now encompassed by the element to the left.
+                * If we are already at the left edge the new left edge
+                * inherits mirror_tid.
+                *
+                * Note that bounds of the parent to our parent may create
+                * a gap to the left of our left-most node or to the right
+                * of our right-most node.  The gap is silently included
+                * in the mirror_tid's area of effect from the point of view
+                * of the scan.
+                */
+               if (cursor->parent_index) {
+                       if (elm[-1].internal.mirror_tid <
+                           elm[0].internal.mirror_tid) {
+                               elm[-1].internal.mirror_tid =
+                                   elm[0].internal.mirror_tid;
+                       }
+               } else {
+                       if (elm[1].internal.mirror_tid <
+                           elm[0].internal.mirror_tid) {
+                               elm[1].internal.mirror_tid =
+                                   elm[0].internal.mirror_tid;
+                       }
+               }
+
+               /*
+                * Delete the subtree reference in the parent
+                */
                bcopy(&elm[1], &elm[0],
                      (ondisk->count - cursor->parent_index) * esize);
                --ondisk->count;
@@ -2128,10 +2206,10 @@ btree_remove(hammer_cursor_t cursor)
  * are propagating the mirror_tid for.
  */
 void
-hammer_btree_do_propagation(hammer_cursor_t cursor, hammer_inode_t ip,
+hammer_btree_do_propagation(hammer_cursor_t cursor,
+                           hammer_pseudofs_inmem_t pfsm,
                            hammer_btree_leaf_elm_t leaf)
 {
-       hammer_pseudofs_inmem_t pfsm;
        hammer_cursor_t ncursor;
        hammer_tid_t mirror_tid;
        int error;
@@ -2139,10 +2217,11 @@ hammer_btree_do_propagation(hammer_cursor_t cursor, hammer_inode_t ip,
        /*
         * We only propagate the mirror_tid up if we are in master or slave
         * mode.  We do not bother if we are in no-mirror mode.
+        *
+        * If pfsm is NULL we propagate (from mirror_write).
         */
-       pfsm = ip->pfsm;
-       KKASSERT(pfsm != NULL);
-       if (pfsm->pfsd.master_id < 0 &&
+       if (pfsm &&
+           pfsm->pfsd.master_id < 0 &&
            (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0) {
                return;
        }
index c96c1ee..66f7e3d 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.24 2008/07/07 00:24:31 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.25 2008/07/10 04:44:33 dillon Exp $
  */
 
+struct hammer_cmirror;
+
 /*
  * The hammer_cursor structure is the primary in-memory management structure
  * for B-Tree operations.  
@@ -90,7 +92,7 @@ struct hammer_cursor {
        struct hammer_base_elm key_beg;
        struct hammer_base_elm key_end;
        hammer_tid_t    asof;
-       hammer_tid_t    mirror_tid;
+       struct hammer_cmirror *cmirror;
 
        /*
         * Related data and record references.  Note that the related buffers
@@ -142,6 +144,21 @@ typedef struct hammer_cursor *hammer_cursor_t;
  */
 #define HAMMER_CURSOR_INITMASK         (~0)
 
+/*
+ * Mirror scan extension structure.  Caller sets mirror_tid to restrict
+ * the scan.  If the iteration is able to skip one or more internal nodes
+ * it returns an internal node with skip_beg/end set to the skipped range.
+ *
+ * If the first element of an internal node is skipped skip_beg will use
+ * the left_bound inherited from the parent, and the same for the last
+ * element.  This is because gaps can develop in the bounds.
+ */
+struct hammer_cmirror {
+       hammer_tid_t    mirror_tid;
+       struct hammer_base_elm skip_beg;
+       struct hammer_base_elm skip_end;
+};
+
 /*
  * NOTE: iprec can be NULL, but the address-of does not indirect through
  * it so we are ok.
index 9d3a142..847e37b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.48 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.49 2008/07/10 04:44:33 dillon Exp $
  */
 
 #ifndef VFS_HAMMER_DISK_H_
@@ -354,6 +354,8 @@ typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t;
 #define HAMMER_HEAD_ALIGN              8
 #define HAMMER_HEAD_ALIGN_MASK         (HAMMER_HEAD_ALIGN - 1)
 #define HAMMER_TAIL_ONDISK_SIZE                8
+#define HAMMER_HEAD_DOALIGN(bytes)     \
+       (((bytes) + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK)
 
 struct hammer_fifo_head {
        u_int16_t hdr_signature;
@@ -680,8 +682,17 @@ struct hammer_pseudofs_data {
        uuid_t          shared_uuid;    /* shared uuid (match required) */
        uuid_t          unique_uuid;    /* unique uuid of this master/slave */
        int32_t         master_id;      /* 0-15 (-1 if slave) */
-       int32_t         mirror_flags;   /* (reserved) */
+       int32_t         mirror_flags;   /* misc flags */
        char            label[64];      /* filesystem space label */
+       char            prune_path[64]; /* softlink dir for pruning */
+       int16_t         prune_time;     /* how long to spend pruning */
+       int16_t         prune_freq;     /* how often we prune */
+       int16_t         reblock_time;   /* how long to spend reblocking */
+       int16_t         reblock_freq;   /* how often we reblock */
+       int32_t         snapshot_freq;  /* how often we create a snapshot */
+       int32_t         prune_min;      /* do not prune recent history */
+       int32_t         prune_max;      /* do not retain history beyond here */
+       int32_t         reserved[16];
 };
 
 typedef struct hammer_pseudofs_data *hammer_pseudofs_data_t;
index bb6515f..e264db0 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.96 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.97 2008/07/10 04:44:33 dillon Exp $
  */
 
 #include "hammer.h"
@@ -794,6 +794,9 @@ retry:
 
 /*
  * Create a root directory for a PFS if one does not alredy exist.
+ *
+ * The PFS root stands alone so we must also bump the nlinks count
+ * to prevent it from being destroyed on release.
  */
 int
 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
@@ -810,6 +813,10 @@ hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
                vap.va_mode = 0755;
                vap.va_type = VDIR;
                error = hammer_create_inode(trans, &vap, cred, NULL, pfsm, &ip);
+               if (error == 0) {
+                       ++ip->ino_data.nlinks;
+                       hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+               }
        }
        if (ip)
                hammer_rel_inode(ip, 0);
index c615311..ad2c2aa 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.18 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.19 2008/07/10 04:44:33 dillon Exp $
  */
 /*
  * HAMMER ioctl's.  This file can be #included from userland
  */
 struct hammer_ioc_head {
        int32_t         flags;
-       int32_t         reserved01;
+       int32_t         error;
        int32_t         reserved02[4];
 };
 
+#define HAMMER_IOC_HEAD_ERROR  0x00008000
 #define HAMMER_IOC_HEAD_INTR   0x00010000
 #define HAMMER_IOC_DO_BTREE    0x00020000      /* reblocker */
 #define HAMMER_IOC_DO_INODES   0x00040000      /* reblocker */
@@ -242,23 +243,65 @@ struct hammer_ioc_mirror_rw {
  * NOTE: crc is for the data block starting at rec_size, not including the
  * data[] array.
  */
-typedef struct hammer_ioc_mrecord {
+struct hammer_ioc_mrecord_head {
        u_int32_t               signature;      /* signature for byte order */
        u_int32_t               rec_crc;
        u_int32_t               rec_size;
        u_int32_t               type;
-       struct hammer_btree_leaf_elm leaf;
+       /* extended */
+};
+
+typedef struct hammer_ioc_mrecord_head *hammer_ioc_mrecord_head_t;
+
+struct hammer_ioc_mrecord_rec {
+       struct hammer_ioc_mrecord_head  head;
+       struct hammer_btree_leaf_elm    leaf;
        /* extended by data */
-} *hammer_ioc_mrecord_t;
+};
 
-#define HAMMER_MREC_TYPE_RESERVED      0
-#define HAMMER_MREC_TYPE_REC           1
-#define HAMMER_MREC_TYPE_PFSD          2
-#define HAMMER_MREC_TYPE_UPDATE                3
-#define HAMMER_MREC_TYPE_SYNC          4
+struct hammer_ioc_mrecord_skip {
+       struct hammer_ioc_mrecord_head  head;
+       struct hammer_base_elm          skip_beg;
+       struct hammer_base_elm          skip_end;
+};
+
+struct hammer_ioc_mrecord_update {
+       struct hammer_ioc_mrecord_head  head;
+       hammer_tid_t                    tid;
+};
 
-#define HAMMER_MREC_CRCOFF     (offsetof(struct hammer_ioc_mrecord, rec_size))
-#define HAMMER_MREC_HEADSIZE   sizeof(struct hammer_ioc_mrecord)
+struct hammer_ioc_mrecord_sync {
+       struct hammer_ioc_mrecord_head  head;
+};
+
+struct hammer_ioc_mrecord_pfs {
+       struct hammer_ioc_mrecord_head  head;
+       u_int32_t                       version;
+       u_int32_t                       reserved01;
+       struct hammer_pseudofs_data     pfsd;
+};
+
+union hammer_ioc_mrecord_any {
+       struct hammer_ioc_mrecord_head  head;
+       struct hammer_ioc_mrecord_rec   rec;
+       struct hammer_ioc_mrecord_skip  skip;
+       struct hammer_ioc_mrecord_update update;
+       struct hammer_ioc_mrecord_update sync;
+       struct hammer_ioc_mrecord_pfs   pfs;
+};
+
+typedef union hammer_ioc_mrecord_any *hammer_ioc_mrecord_any_t;
+
+#define HAMMER_MREC_TYPE_RESERVED      0
+#define HAMMER_MREC_TYPE_REC           1       /* record w/ data */
+#define HAMMER_MREC_TYPE_PFSD          2       /* (userland only) */
+#define HAMMER_MREC_TYPE_UPDATE                3       /* (userland only) */
+#define HAMMER_MREC_TYPE_SYNC          4       /* (userland only) */
+#define HAMMER_MREC_TYPE_SKIP          5       /* skip-range */
+#define HAMMER_MREC_TYPE_PASS          6       /* record for cmp only (pass) */
+
+#define HAMMER_MREC_CRCOFF     (offsetof(struct hammer_ioc_mrecord_head, rec_size))
+#define HAMMER_MREC_HEADSIZE   sizeof(struct hammer_ioc_mrecord_head)
 
 #define HAMMER_IOC_MIRROR_SIGNATURE    0x4dd97272U
 #define HAMMER_IOC_MIRROR_SIGNATURE_REV        0x7272d94dU
index 6c08d3b..49486c5 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.9 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.10 2008/07/10 04:44:33 dillon Exp $
  */
 /*
  * HAMMER mirroring ioctls - serialize and deserialize modifications made
 #include "hammer.h"
 
 static int hammer_mirror_check(hammer_cursor_t cursor,
-                               struct hammer_ioc_mrecord *mrec);
+                               struct hammer_ioc_mrecord_rec *mrec);
 static int hammer_mirror_update(hammer_cursor_t cursor,
-                               struct hammer_ioc_mrecord *mrec);
+                               struct hammer_ioc_mrecord_rec *mrec);
 static int hammer_mirror_write(hammer_cursor_t cursor,
-                               struct hammer_ioc_mrecord *mrec,
-                               hammer_inode_t ip, char *udata);
+                               struct hammer_ioc_mrecord_rec *mrec,
+                               char *udata);
+static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
+                               struct hammer_ioc_mrecord_rec *mrec,
+                               struct hammer_ioc_mirror_rw *mirror,
+                               u_int32_t localization,
+                               char *uptr);
+static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
+                               struct hammer_ioc_mrecord_rec *mrec,
+                               struct hammer_ioc_mirror_rw *mirror,
+                               u_int32_t localization);
+static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
+                               struct hammer_ioc_mrecord_skip *mrec,
+                               struct hammer_ioc_mirror_rw *mirror,
+                               u_int32_t localization);
+static int hammer_mirror_delete_at_cursor(hammer_cursor_t cursor,
+                               struct hammer_ioc_mirror_rw *mirror);
 static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
                                hammer_btree_leaf_elm_t leaf);
 
@@ -64,16 +79,18 @@ int
 hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
                       struct hammer_ioc_mirror_rw *mirror)
 {
+       struct hammer_cmirror cmirror;
        struct hammer_cursor cursor;
-       struct hammer_ioc_mrecord mrec;
+       union hammer_ioc_mrecord_any mrec;
        hammer_btree_leaf_elm_t elm;
-       const int head_size = HAMMER_MREC_HEADSIZE;
        const int crc_start = HAMMER_MREC_CRCOFF;
        char *uptr;
        int error;
        int data_len;
        int bytes;
+       int eatdisk;
        u_int32_t localization;
+       u_int32_t rec_crc;
 
        localization = (u_int32_t)mirror->pfs_id << 16;
 
@@ -85,8 +102,10 @@ hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
                return(EINVAL);
 
        mirror->key_cur = mirror->key_beg;
+       mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        mirror->key_cur.localization += localization;
        bzero(&mrec, sizeof(mrec));
+       bzero(&cmirror, sizeof(cmirror));
 
 retry:
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
@@ -96,6 +115,7 @@ retry:
        }
        cursor.key_beg = mirror->key_cur;
        cursor.key_end = mirror->key_end;
+       cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
        cursor.key_end.localization += localization;
 
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
@@ -107,26 +127,88 @@ retry:
         * field stored with internal and leaf nodes to shortcut the scan.
         */
        cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
-       cursor.mirror_tid = mirror->tid_beg;
+       cursor.cmirror = &cmirror;
+       cmirror.mirror_tid = mirror->tid_beg;
 
        error = hammer_btree_first(&cursor);
        while (error == 0) {
                /*
-                * Leaf node.  Only return elements modified in the range
-                * requested by userland.
+                * An internal node can be returned in mirror-filtered
+                * mode and indicates that the scan is returning a skip
+                * range in the cursor->cmirror structure.
+                */
+               uptr = (char *)mirror->ubuf + mirror->count;
+               if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
+                       /*
+                        * Check space
+                        */
+                       mirror->key_cur = cmirror.skip_beg;
+                       bytes = sizeof(mrec.skip);
+                       if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
+                           mirror->size) {
+                               break;
+                       }
+
+                       /*
+                        * Fill mrec
+                        */
+                       mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
+                       mrec.head.type = HAMMER_MREC_TYPE_SKIP;
+                       mrec.head.rec_size = bytes;
+                       mrec.skip.skip_beg = cmirror.skip_beg;
+                       mrec.skip.skip_end = cmirror.skip_end;
+                       mrec.head.rec_crc = crc32(&mrec.head.rec_size,
+                                                bytes - crc_start);
+                       error = copyout(&mrec, uptr, bytes);
+                       eatdisk = 0;
+                       goto didwrite;
+               }
+
+               /*
+                * Leaf node.  In full-history mode we could filter out
+                * elements modified outside the user-requested TID range.
+                *
+                * However, such elements must be returned so the writer
+                * can compare them against the target to determine what
+                * needs to be deleted on the target, particularly for
+                * no-history mirrors.
                 */
                KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
+               mirror->key_cur = elm->base;
 
-               if (elm->base.create_tid < mirror->tid_beg ||
-                   elm->base.create_tid >= mirror->tid_end) {
-                       if (elm->base.delete_tid < mirror->tid_beg ||
-                           elm->base.delete_tid >= mirror->tid_end) {
-                               goto skip;
+               if ((elm->base.create_tid < mirror->tid_beg ||
+                   elm->base.create_tid > mirror->tid_end) &&
+                   (elm->base.delete_tid < mirror->tid_beg ||
+                   elm->base.delete_tid > mirror->tid_end)) {
+                       bytes = sizeof(mrec.rec);
+                       if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) >
+                           mirror->size) {
+                               break;
                        }
-               }
 
-               mirror->key_cur = elm->base;
+                       /*
+                        * Fill mrec.  PASS records are records which are
+                        * outside the TID range needed for the mirror
+                        * update.  They are sent without any data payload
+                        * because the mirroring target must still compare
+                        * records that fall outside the SKIP ranges to
+                        * determine what might need to be deleted.  Such
+                        * deletions are needed if the master or files on
+                        * the master are no-history, or if the slave is
+                        * so far behind the master has already been pruned.
+                        */
+                       mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
+                       mrec.head.type = HAMMER_MREC_TYPE_PASS;
+                       mrec.head.rec_size = bytes;
+                       mrec.rec.leaf = *elm;
+                       mrec.head.rec_crc = crc32(&mrec.head.rec_size,
+                                                bytes - crc_start);
+                       error = copyout(&mrec, uptr, bytes);
+                       eatdisk = 1;
+                       goto didwrite;
+                       
+               }
 
                /*
                 * Yield to more important tasks
@@ -152,10 +234,9 @@ retry:
                        if (error)
                                break;
                }
-               bytes = sizeof(struct hammer_ioc_mrecord) + data_len;
-               bytes = (bytes + HAMMER_HEAD_ALIGN_MASK) &
-                       ~HAMMER_HEAD_ALIGN_MASK;
-               if (mirror->count + bytes > mirror->size)
+
+               bytes = sizeof(mrec.rec) + data_len;
+               if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size)
                        break;
 
                /*
@@ -166,24 +247,37 @@ retry:
                 * is not considered deleted from the point of view of
                 * userland and delete_tid is cleared.
                 */
-               mrec.signature = HAMMER_IOC_MIRROR_SIGNATURE;
-               mrec.type = HAMMER_MREC_TYPE_REC;
-               mrec.rec_size = bytes;
-               mrec.leaf = *elm;
+               mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
+               mrec.head.type = HAMMER_MREC_TYPE_REC;
+               mrec.head.rec_size = bytes;
+               mrec.rec.leaf = *elm;
                if (elm->base.delete_tid >= mirror->tid_end)
-                       mrec.leaf.base.delete_tid = 0;
-               mrec.rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
-               uptr = (char *)mirror->ubuf + mirror->count;
-               error = copyout(&mrec, uptr, head_size);
+                       mrec.rec.leaf.base.delete_tid = 0;
+               rec_crc = crc32(&mrec.head.rec_size,
+                               sizeof(mrec.rec) - crc_start);
+               if (data_len)
+                       rec_crc = crc32_ext(cursor.data, data_len, rec_crc);
+               mrec.head.rec_crc = rec_crc;
+               error = copyout(&mrec, uptr, sizeof(mrec.rec));
                if (data_len && error == 0) {
-                       error = copyout(cursor.data, uptr + head_size,
+                       error = copyout(cursor.data, uptr + sizeof(mrec.rec),
                                        data_len);
                }
-               if (error == 0)
-                       mirror->count += bytes;
-skip:
+               eatdisk = 1;
+
+               /*
+                * eatdisk controls whether we skip the current cursor
+                * position on the next scan or not.  If doing a SKIP
+                * the cursor is already positioned properly for the next
+                * scan and eatdisk will be 0.
+                */
+didwrite:
                if (error == 0) {
-                       cursor.flags |= HAMMER_CURSOR_ATEDISK;
+                       mirror->count += HAMMER_HEAD_DOALIGN(bytes);
+                       if (eatdisk)
+                               cursor.flags |= HAMMER_CURSOR_ATEDISK;
+                       else
+                               cursor.flags &= ~HAMMER_CURSOR_ATEDISK;
                        error = hammer_btree_iterate(&cursor);
                }
        }
@@ -204,8 +298,7 @@ failed:
 }
 
 /*
- * Copy records from userland to the target mirror.  Records which already
- * exist may only have their delete_tid updated.
+ * Copy records from userland to the target mirror.
  *
  * The PFS is identified in the mirror structure.  The passed ip is just
  * some directory in the overall HAMMER filesystem and has nothing to
@@ -216,110 +309,395 @@ int
 hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
                       struct hammer_ioc_mirror_rw *mirror)
 {
+       union hammer_ioc_mrecord_any mrec;
        struct hammer_cursor cursor;
-       struct hammer_ioc_mrecord mrec;
-       const int head_size = HAMMER_MREC_HEADSIZE;
-       const int crc_start = HAMMER_MREC_CRCOFF;
-       u_int32_t rec_crc;
+       u_int32_t localization;
        int error;
+       int bytes;
        char *uptr;
-       u_int32_t localization;
 
        localization = (u_int32_t)mirror->pfs_id << 16;
 
+       /*
+        * Validate the mirror structure and relocalize the tracking keys.
+        */
        if (mirror->size < 0 || mirror->size > 0x70000000)
                return(EINVAL);
+       mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK;
+       mirror->key_beg.localization += localization;
+       mirror->key_end.localization &= HAMMER_LOCALIZE_MASK;
+       mirror->key_end.localization += localization;
+       mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
+       mirror->key_cur.localization += localization;
 
+       /*
+        * Set up our tracking cursor for the loop.  The tracking cursor
+        * is used to delete records that are no longer present on the
+        * master.  The last handled record at key_cur must be skipped.
+        */
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
-retry:
-       hammer_normalize_cursor(&cursor);
 
-       while (error == 0 && mirror->count + head_size <= mirror->size) {
+       cursor.key_beg = mirror->key_cur;
+       cursor.key_end = mirror->key_end;
+       cursor.flags |= HAMMER_CURSOR_BACKEND;
+       error = hammer_btree_first(&cursor);
+       if (error == 0)
+               cursor.flags |= HAMMER_CURSOR_ATEDISK;
+       if (error == ENOENT)
+               error = 0;
+
+       /*
+        * Loop until our input buffer has been exhausted.
+        */
+       while (error == 0 &&
+              mirror->count + sizeof(mrec.head) <= mirror->size) {
+
                /*
                 * Acquire and validate header
                 */
+               if ((bytes = mirror->size - mirror->count) > sizeof(mrec))
+                       bytes = sizeof(mrec);
                uptr = (char *)mirror->ubuf + mirror->count;
-               error = copyin(uptr, &mrec, head_size);
+               error = copyin(uptr, &mrec, bytes);
                if (error)
                        break;
-               rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
-               if (mrec.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
+               if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
                        error = EINVAL;
                        break;
                }
-               if (mrec.type != HAMMER_MREC_TYPE_REC) {
+               if (mrec.head.rec_size < sizeof(mrec.head) ||
+                   mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE ||
+                   mirror->count + mrec.head.rec_size > mirror->size) {
                        error = EINVAL;
                        break;
                }
-               if (rec_crc != mrec.rec_crc) {
-                       error = EINVAL;
+
+               switch(mrec.head.type) {
+               case HAMMER_MREC_TYPE_SKIP:
+                       if (mrec.head.rec_size != sizeof(mrec.skip))
+                               error = EINVAL;
+                       if (error == 0)
+                               error = hammer_ioc_mirror_write_skip(&cursor, &mrec.skip, mirror, localization);
                        break;
-               }
-               if (mrec.rec_size < head_size ||
-                   mrec.rec_size > head_size + HAMMER_XBUFSIZE + 16 ||
-                   mirror->count + mrec.rec_size > mirror->size) {
-                       error = EINVAL;
+               case HAMMER_MREC_TYPE_REC:
+                       if (mrec.head.rec_size < sizeof(mrec.rec))
+                               error = EINVAL;
+                       if (error == 0)
+                               error = hammer_ioc_mirror_write_rec(&cursor, &mrec.rec, mirror, localization, uptr + sizeof(mrec.rec));
                        break;
-               }
-               if (mrec.leaf.data_len < 0 || 
-                   mrec.leaf.data_len > HAMMER_XBUFSIZE ||
-                   sizeof(struct hammer_ioc_mrecord) + mrec.leaf.data_len > mrec.rec_size) {
+               case HAMMER_MREC_TYPE_PASS:
+                       if (mrec.head.rec_size != sizeof(mrec.rec))
+                               error = EINVAL;
+                       if (error == 0)
+                               error = hammer_ioc_mirror_write_pass(&cursor, &mrec.rec, mirror, localization);
+                       break;
+               default:
                        error = EINVAL;
+                       break;
                }
 
                /*
-                * Re-localize for target.  relocalization of data is handled
-                * by hammer_mirror_write().
+                * Retry the current record on deadlock, otherwise setup
+                * for the next loop.
                 */
-               mrec.leaf.base.localization &= HAMMER_LOCALIZE_MASK;
-               mrec.leaf.base.localization += localization;
+               if (error == EDEADLK) {
+                       while (error == EDEADLK) {
+                               hammer_recover_cursor(&cursor);
+                               error = hammer_cursor_upgrade(&cursor);
+                       }
+               } else {
+                       if (error == EALREADY)
+                               error = 0;
+                       if (error == 0) {
+                               mirror->count += 
+                                       HAMMER_HEAD_DOALIGN(mrec.head.rec_size);
+                       }
+               }
+       }
+       hammer_done_cursor(&cursor);
+
+       /*
+        * cumulative error 
+        */
+       if (error) {
+               mirror->head.flags |= HAMMER_IOC_HEAD_ERROR;
+               mirror->head.error = error;
+       }
+
+       /*
+        * ioctls don't update the RW data structure if an error is returned,
+        * always return 0.
+        */
+       return(0);
+}
+
+/*
+ * Handle skip records.
+ *
+ * We must iterate from the last resolved record position at mirror->key_cur
+ * to skip_beg and delete any records encountered.
+ *
+ * mirror->key_cur must be carefully set when we succeed in processing
+ * this mrec.
+ */
+static int
+hammer_ioc_mirror_write_skip(hammer_cursor_t cursor,
+                            struct hammer_ioc_mrecord_skip *mrec,
+                            struct hammer_ioc_mirror_rw *mirror,
+                            u_int32_t localization)
+{
+       int error;
+
+       /*
+        * Relocalize the skip range
+        */
+       mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK;
+       mrec->skip_beg.localization += localization;
+       mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK;
+       mrec->skip_end.localization += localization;
+
+       /*
+        * Iterate from current position to skip_beg, deleting any records
+        * we encounter.
+        */
+       cursor->key_end = mrec->skip_beg;
+       cursor->flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_iterate(cursor);
+       while (error == 0) {
+               error = hammer_mirror_delete_at_cursor(cursor, mirror);
+               if (error == 0)
+                       error = hammer_btree_iterate(cursor);
+       }
+
+       /*
+        * ENOENT just means we hit the end of our iteration.
+        */
+       if (error == ENOENT)
+               error = 0;
+
+       /*
+        * Now skip past the skip (which is the whole point of
+        * having a skip record).  The sender has not sent us any records
+        * for the skip area so we wouldn't know what to keep and what
+        * to delete anyway.
+        *
+        * Clear ATEDISK because skip_end is non-inclusive, so we can't
+        * count an exact match if we happened to get one.
+        */
+       if (error == 0) {
+               mirror->key_cur = mrec->skip_end;
+               cursor->key_beg = mrec->skip_end;
+               error = hammer_btree_lookup(cursor);
+               cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
+               if (error == ENOENT)
+                       error = 0;
+       }
+       return(error);
+}
+
+/*
+ * Handle B-Tree records.
+ *
+ * We must iterate to mrec->base.key (non-inclusively), and then process
+ * the record.  We are allowed to write a new record or delete an existing
+ * record, but cannot replace an existing record.
+ *
+ * mirror->key_cur must be carefully set when we succeed in processing
+ * this mrec.
+ */
+static int
+hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
+                           struct hammer_ioc_mrecord_rec *mrec,
+                           struct hammer_ioc_mirror_rw *mirror,
+                           u_int32_t localization,
+                           char *uptr)
+{
+       hammer_transaction_t trans;
+       u_int32_t rec_crc;
+       int error;
+
+       trans = cursor->trans;
+       rec_crc = crc32(mrec, sizeof(*mrec));
+
+       if (mrec->leaf.data_len < 0 || 
+           mrec->leaf.data_len > HAMMER_XBUFSIZE ||
+           mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) {
+               return(EINVAL);
+       }
+
+       /*
+        * Re-localize for target.  relocalization of data is handled
+        * by hammer_mirror_write().
+        */
+       mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
+       mrec->leaf.base.localization += localization;
+
+       /*
+        * Delete records through until we reach (non-inclusively) the
+        * target record.
+        */
+       cursor->key_end = mrec->leaf.base;
+       cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
+       cursor->flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_iterate(cursor);
+       while (error == 0) {
+               error = hammer_mirror_delete_at_cursor(cursor, mirror);
+               if (error == 0)
+                       error = hammer_btree_iterate(cursor);
+       }
+       if (error == ENOENT)
+               error = 0;
+
+       /*
+        * Locate the record.
+        *
+        * If the record exists only the delete_tid may be updated.
+        *
+        * If the record does not exist we create it.  For now we
+        * ignore records with a non-zero delete_tid.  Note that
+        * mirror operations are effectively an as-of operation and
+        * delete_tid can be 0 for mirroring purposes even if it is
+        * not actually 0 at the originator.
+        */
+       cursor->key_beg = mrec->leaf.base;
+       cursor->flags |= HAMMER_CURSOR_BACKEND;
+       cursor->flags &= ~HAMMER_CURSOR_INSERT;
+       error = hammer_btree_lookup(cursor);
+
+       if (error == 0 && hammer_mirror_check(cursor, mrec)) {
+               hammer_sync_lock_sh(trans);
+               error = hammer_mirror_update(cursor, mrec);
+               hammer_sync_unlock(trans);
+       } else if (error == ENOENT && mrec->leaf.base.delete_tid == 0) {
+               hammer_sync_lock_sh(trans);
+               error = hammer_mirror_write(cursor, mrec, uptr);
+               hammer_sync_unlock(trans);
+       } else if (error == ENOENT) {
+               error = 0;
+       }
+       if (error == 0 || error == EALREADY)
+               mirror->key_cur = mrec->leaf.base;
+       return(error);
+}
+
+/*
+ * This works like write_rec but no write or update is necessary,
+ * and no data payload is included so we couldn't do a write even
+ * if we wanted to.
+ *
+ * We must still iterate for deletions, and we can validate the
+ * record header which is a good way to test for corrupted mirror
+ * targets XXX.
+ *
+ * mirror->key_cur must be carefully set when we succeed in processing
+ * this mrec.
+ */
+static
+int
+hammer_ioc_mirror_write_pass(hammer_cursor_t cursor,
+                            struct hammer_ioc_mrecord_rec *mrec,
+                            struct hammer_ioc_mirror_rw *mirror,
+                            u_int32_t localization)
+{
+       hammer_transaction_t trans;
+       u_int32_t rec_crc;
+       int error;
+
+       trans = cursor->trans;
+       rec_crc = crc32(mrec, sizeof(*mrec));
+
+       /*
+        * Re-localize the record key for the target.  A PASS record
+        * carries no data payload, so only the key needs adjusting.
+        */
+       mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK;
+       mrec->leaf.base.localization += localization;
 
+       /*
+        * Delete records through until we reach (non-inclusively) the
+        * target record.
+        */
+       cursor->key_end = mrec->leaf.base;
+       cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE;
+       cursor->flags |= HAMMER_CURSOR_BACKEND;
+
+       error = hammer_btree_iterate(cursor);
+       while (error == 0) {
+               error = hammer_mirror_delete_at_cursor(cursor, mirror);
+               if (error == 0)
+                       error = hammer_btree_iterate(cursor);
+       }
+       if (error == ENOENT)
+               error = 0;
+
+       /*
+        * Locate the record and get past it by setting ATEDISK.
+        */
+       if (error == 0) {
+               mirror->key_cur = mrec->leaf.base;
+               cursor->key_beg = mrec->leaf.base;
+               cursor->flags |= HAMMER_CURSOR_BACKEND;
+               cursor->flags &= ~HAMMER_CURSOR_INSERT;
+               error = hammer_btree_lookup(cursor);
+               if (error == 0)
+                       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+               else
+                       cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
+               if (error == ENOENT)
+                       error = 0;
+       }
+       return(error);
+}
+
+/*
+ * As part of the mirror write we iterate across swaths of records
+ * on the target which no longer exist on the source, and mark them
+ * deleted.
+ */
+static
+int
+hammer_mirror_delete_at_cursor(hammer_cursor_t cursor,
+                              struct hammer_ioc_mirror_rw *mirror)
+{
+       hammer_transaction_t trans;
+       hammer_btree_elm_t elm;
+
+       elm = &cursor->node->ondisk->elms[cursor->index];
+       KKASSERT(elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
+
+       kprintf("mirror_delete %016llx %016llx\n", elm->leaf.base.obj_id, elm->leaf.base.key);
+
+       trans = cursor->trans;
+       hammer_sync_lock_sh(trans);
+
+       if (elm->leaf.base.delete_tid == 0) {
                /*
-                * Locate the record.
-                *
-                * If the record exists only the delete_tid may be updated.
-                *
-                * If the record does not exist we create it.  For now we
-                * ignore records with a non-zero delete_tid.  Note that
-                * mirror operations are effective an as-of operation and
-                * delete_tid can be 0 for mirroring purposes even if it is
-                * not actually 0 at the originator.
+                * We don't know when the originator deleted the element
+                * because it was destroyed; tid_end is used as its delete_tid.
                 */
-               hammer_normalize_cursor(&cursor);
-               cursor.key_beg = mrec.leaf.base;
-               cursor.flags |= HAMMER_CURSOR_BACKEND;
-               cursor.flags &= ~HAMMER_CURSOR_INSERT;
-               error = hammer_btree_lookup(&cursor);
-
-               if (error == 0 && hammer_mirror_check(&cursor, &mrec)) {
-                       hammer_sync_lock_sh(trans);
-                       error = hammer_mirror_update(&cursor, &mrec);
-                       hammer_sync_unlock(trans);
-               } else if (error == ENOENT && mrec.leaf.base.delete_tid == 0) {
-                       hammer_sync_lock_sh(trans);
-                       error = hammer_mirror_write(&cursor, &mrec, ip,
-                                                   uptr + head_size);
-                       hammer_sync_unlock(trans);
-               } else if (error == ENOENT) {
-                       error = 0;
-               }
+               KKASSERT(elm->base.create_tid < mirror->tid_end);
+               hammer_modify_node(trans, cursor->node, elm, sizeof(*elm));
+               elm->base.delete_tid = mirror->tid_end;
+               elm->leaf.delete_ts = time_second;
+               hammer_modify_node_done(cursor->node);
 
                /*
-                * Clean for loop.  It is ok if the record already exists
-                * on the target.
+                * Track a count of active inodes.
                 */
-               if (error == EDEADLK) {
-                       hammer_done_cursor(&cursor);
-                       error = hammer_init_cursor(trans, &cursor, NULL, NULL);
-                       goto retry;
+               if (elm->base.obj_type == HAMMER_RECTYPE_INODE) {
+                       hammer_modify_volume_field(trans,
+                                                  trans->rootvol,
+                                                  vol0_stat_inodes);
+                       --trans->hmp->rootvol->ondisk->vol0_stat_inodes;
+                       hammer_modify_volume_done(trans->rootvol);
                }
-
-               if (error == EALREADY)
-                       error = 0;
-               if (error == 0)
-                       mirror->count += mrec.rec_size;
        }
-       hammer_done_cursor(&cursor);
+       hammer_sync_unlock(trans);
+
+       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+
        return(0);
 }
 
@@ -332,7 +710,7 @@ retry:
  */
 static
 int
-hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
+hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec)
 {
        hammer_btree_leaf_elm_t leaf = cursor->leaf;
 
@@ -348,7 +726,8 @@ hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
  */
 static
 int
-hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
+hammer_mirror_update(hammer_cursor_t cursor,
+                    struct hammer_ioc_mrecord_rec *mrec)
 {
        hammer_transaction_t trans;
        hammer_btree_leaf_elm_t elm;
@@ -369,6 +748,11 @@ hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
        elm->delete_ts = mrec->leaf.delete_ts;
        hammer_modify_node_done(cursor->node);
 
+       /*
+        * Cursor is left on the current element, we want to skip it now.
+        */
+       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+
        /*
         * Track a count of active inodes.
         */
@@ -388,8 +772,9 @@ hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
  */
 static
 int
-hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
-                   hammer_inode_t ip, char *udata)
+hammer_mirror_write(hammer_cursor_t cursor,
+                   struct hammer_ioc_mrecord_rec *mrec,
+                   char *udata)
 {
        hammer_transaction_t trans;
        hammer_buffer_t data_buffer;
@@ -399,19 +784,6 @@ hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
        int error;
        int doprop;
 
-#if 0
-       /* 
-        * removed: all records are now duplicated, including the root
-        * inode.
-        */
-       if (mrec->leaf.base.obj_id == HAMMER_OBJID_ROOT) {
-               if (mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE ||
-                   mrec->leaf.base.rec_type == HAMMER_RECTYPE_FIX) {
-                       return(0);
-               }
-       }
-#endif
-
        trans = cursor->trans;
        data_buffer = NULL;
 
@@ -446,7 +818,7 @@ hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
                goto failed;
 
        /*
-        * Do the insertion
+        * Do the insertion.  This can fail with an EDEADLK or EALREADY.
         */
        cursor->flags |= HAMMER_CURSOR_INSERT;
        error = hammer_btree_lookup(cursor);
@@ -455,10 +827,15 @@ hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
                        error = EALREADY;
                goto failed;
        }
-       error = 0;
 
        error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);
 
+       /*
+        * Cursor is left on the current element, we want to skip it now.
+        */
+       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+       cursor->flags &= ~HAMMER_CURSOR_INSERT;
+
        /*
         * Track a count of active inodes.
         */
@@ -485,7 +862,7 @@ hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
        }
 
        if (error == 0 && doprop)
-               hammer_btree_do_propagation(cursor, ip, &mrec->leaf);
+               hammer_btree_do_propagation(cursor, NULL, &mrec->leaf);
 
 failed:
        /*
index ec72df5..b99c470 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.84 2008/07/08 04:34:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.85 2008/07/10 04:44:33 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1142,7 +1142,8 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
         */
        if (error == 0) {
                if (doprop) {
-                       hammer_btree_do_propagation(cursor, record->ip,
+                       hammer_btree_do_propagation(cursor,
+                                                   record->ip->pfsm,
                                                    &record->leaf);
                }
                if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
@@ -2054,7 +2055,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
         */
        if (doprop) {
                KKASSERT(cursor->ip != NULL);
-               hammer_btree_do_propagation(cursor, cursor->ip, leaf);
+               hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
        }
        return (error);
 }
index 5f17a33..83f78f6 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.32 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.33 2008/07/10 04:44:33 dillon Exp $
  */
 /*
  * HAMMER structural locking
@@ -462,7 +462,7 @@ hammer_str_to_tid(const char *str, int *ispfs, u_int32_t *localizationp)
        tid = strtouq(str, &ptr, 0);
        if (*ptr == ':') {
                *ispfs = 1;
-               *localizationp = strtoul(ptr + 1, NULL, 0) << 16;
+               *localizationp = strtoul(ptr + 1, NULL, 10) << 16;
        } else {
                *ispfs = 0;
        }
index c1050e2..9cffcc4 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.84 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.85 2008/07/10 04:44:33 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -1346,23 +1346,10 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
 
        ip = VTOI(ap->a_vp);
 
-       /*
-        * Special softlink for PFS access, created by hammer pfs-create
-        */
-
-       if (ip->obj_id == HAMMER_OBJID_ROOT && ip->obj_localization &&
-           ip->obj_asof == HAMMER_MAX_TID) {
-               ksnprintf(buf, sizeof(buf), "@@0x%016llx:0x%04x",
-                       ip->pfsm->pfsd.sync_end_tid,
-                       ip->obj_localization >> 16);
-               error = uiomove(buf, strlen(buf), ap->a_uio);
-               return(error);
-       }
-
        /*
         * Shortcut if the symlink data was stuffed into ino_data.
         *
-        * Also expand special @@PFSxxxxx softlinks.
+        * Also expand special "@@PFS%05d" softlinks.
         */
        if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
                char *ptr;
@@ -1378,10 +1365,18 @@ hammer_vop_readlink(struct vop_readlink_args *ap)
                        pfsm = hammer_load_pseudofs(&trans, localization,
                                                    &error);
                        if (error == 0) {
-                               ksnprintf(buf, sizeof(buf),
-                                        "@@0x%016llx:%05d",
-                                        pfsm->pfsd.sync_end_tid,
-                                        localization >> 16);
+                               if (pfsm->pfsd.mirror_flags &
+                                   HAMMER_PFSD_SLAVE) {
+                                       ksnprintf(buf, sizeof(buf),
+                                                 "@@0x%016llx:%05d",
+                                                 pfsm->pfsd.sync_end_tid,
+                                                 localization >> 16);
+                               } else {
+                                       ksnprintf(buf, sizeof(buf),
+                                                 "@@0x%016llx:%05d",
+                                                 HAMMER_MAX_TID,
+                                                 localization >> 16);
+                               }
                                ptr = buf;
                                bytes = strlen(buf);
                        }