hammer2 - Major restructuring, part 3/several
authorMatthew Dillon <dillon@apollo.backplane.com>
Thu, 2 May 2013 20:06:28 +0000 (13:06 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Thu, 2 May 2013 20:06:28 +0000 (13:06 -0700)
* Use hammer2_chain_duplicate() to move hardlinks to common parents.

* Adjust flush algorithms, particularly when it comes to figuring out
  when we can clear the MOVED bit on a chain.

* chain RBTREE sorts on delete_tid in addition to sorting on the key.
  Add ranged searches.

  The frontend no longer modifies indirect blocks.  Instead, chains in
  the RBTREE are used to placehold both insertions and deletions.

* Stabilization pass.  (still work to do on chain refs / unmount and
  optimizing the flush).

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vnops.c

index fed1ad6..432db1f 100644 (file)
@@ -142,7 +142,7 @@ struct hammer2_chain {
        struct hammer2_mount    *hmp;
        struct hammer2_chain    *duplink;       /* duplication link */
 
-       hammer2_tid_t   create_tid;             /* snapshot/flush filter */
+       hammer2_tid_t   modify_tid;             /* snapshot/flush filter */
        hammer2_tid_t   delete_tid;
        struct buf      *bp;                    /* physical data buffer */
        u_int           bytes;                  /* physical data size */
@@ -436,7 +436,8 @@ void hammer2_inode_put(hammer2_inode_t *ip);
 void hammer2_inode_free(hammer2_inode_t *ip);
 void hammer2_inode_ref(hammer2_inode_t *ip);
 void hammer2_inode_drop(hammer2_inode_t *ip);
-void hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_chain_t **chainp);
+void hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
+                       hammer2_chain_t *chain);
 int hammer2_inode_calc_alloc(hammer2_key_t filesize);
 
 hammer2_inode_t *hammer2_inode_create(hammer2_trans_t *trans,
@@ -479,6 +480,8 @@ int hammer2_chain_lock(hammer2_chain_t *chain, int how);
 void hammer2_chain_moved(hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_trans_t *trans,
                                hammer2_chain_t *chain, int flags);
+hammer2_inode_data_t *hammer2_chain_modify_ip(hammer2_trans_t *trans,
+                               hammer2_inode_t *ip, int flags);
 void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                struct buf *bp,
                                hammer2_chain_t *parent,
@@ -503,9 +506,10 @@ int hammer2_chain_create(hammer2_trans_t *trans,
                                hammer2_chain_t **chainp,
                                hammer2_key_t key, int keybits,
                                int type, size_t bytes);
-void hammer2_chain_duplicate(hammer2_trans_t *trans,
-                               hammer2_chain_t *parent, int i,
-                               hammer2_chain_t **chainp);
+void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                               int i,
+                               hammer2_chain_t **chainp,
+                               hammer2_blockref_t *bref);
 void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                                hammer2_chain_t *chain);
 void hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain);
index 1bff81b..17e3653 100644 (file)
@@ -124,6 +124,8 @@ hammer2_chain_parent_setsubmod(hammer2_chain_t *chain)
        while ((parent = chain->parent) != NULL) {
                core = parent->core;
                spin_lock(&core->cst.spin);
+               while (parent->duplink)
+                       parent = parent->duplink;
                if (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                        spin_unlock(&core->cst.spin);
                        break;
@@ -314,6 +316,15 @@ void
 hammer2_chain_drop(hammer2_chain_t *chain)
 {
        u_int refs;
+       u_int need = 0;
+
+#if 1
+       if (chain->flags & HAMMER2_CHAIN_MOVED)
+               ++need;
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED)
+               ++need;
+       KKASSERT(chain->refs > need);
+#endif
 
        while (chain) {
                refs = chain->refs;
@@ -810,7 +821,7 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         *       structure.
         */
        hammer2_chain_delete(trans, parent, chain);
-       hammer2_chain_duplicate(trans, parent, chain->index, &chain);
+       hammer2_chain_duplicate(trans, parent, chain->index, &chain, NULL);
 
        /*
         * Set MODIFIED and add a chain ref to prevent destruction.  Both
@@ -820,7 +831,7 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         *
         * If the chain is already marked MODIFIED then we can safely
         * return the previous allocation to the pool without having to
-        * worry about snapshots.
+        * worry about snapshots.  XXX check flush synchronization.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
@@ -910,10 +921,12 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
        hammer2_chain_parent_setsubmod(chain);
+       *chainp = chain;
 }
 
 /*
- * Convert a locked chain that was retrieved read-only to read-write.
+ * Convert a locked chain that was retrieved read-only to read-write,
+ * duplicating it if necessary to satisfy active flush points.
  *
  * If not already marked modified a new physical block will be allocated
  * and assigned to the bref.
@@ -931,6 +944,25 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
  * This function may return a different chain than was passed, in which case
  * the old chain will be unlocked and the new chain will be locked.
  */
+hammer2_inode_data_t *
+hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                            int flags)
+{
+#if 0
+       hammer2_chain_t *ochain;
+
+       ochain = ip->chain;
+#endif
+       hammer2_chain_modify(trans, ip->chain, flags);
+#if 0
+       if (ochain != ip->chain) {
+               hammer2_chain_ref(ip->chain);
+               hammer2_chain_drop(ochain);
+       }
+#endif
+       return(&ip->chain->data->ipdata);
+}
+
 void
 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
 {
@@ -951,13 +983,18 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
                chain->bref.modify_tid = trans->sync_tid;
 
        /*
-        * If the chain is already marked MODIFIED we can just return.
+        * If the chain is already marked MODIFIED we can usually just
+        * return.
+        *
+        * WARNING!  It is possible that a prior lock/modify sequence
+        *           retired the buffer.  During this lock/modify sequence
+        *           MODIFIED may still be set but the buffer could wind up
+        *           clean.  Since the caller is going to modify the buffer
+        *           further we have to be sure that DIRTYBP is set again.
         *
-        * However, it is possible that a prior lock/modify sequence
-        * retired the buffer.  During this lock/modify sequence MODIFIED
-        * may still be set but the buffer could wind up clean.  Since
-        * the caller is going to modify the buffer further we have to
-        * be sure that DIRTYBP is set again.
+        * WARNING!  Currently the caller is responsible for handling
+        *           any delete/duplication roll of the chain to account
+        *           for modifications crossing synchronization points.
         */
        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
@@ -976,6 +1013,12 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
        hammer2_chain_ref(chain);
 
        /*
+        * Adjust chain->modify_tid so the flusher knows when the
+        * modification occurred.
+        */
+       chain->modify_tid = trans->sync_tid;
+
+       /*
         * We must allocate the copy-on-write block.
         *
         * If the data is embedded no other action is required.
@@ -1116,21 +1159,69 @@ hammer2_modify_volume(hammer2_mount_t *hmp)
  * chain is returned with a reference and without a lock, or NULL
  * if not found.
  *
- * NOTE: A chain on-media might exist for this index when NULL is returned.
+ * This function returns the chain at the specified index with the highest
+ * delete_tid.  The caller must check whether the chain is flagged
+ * CHAIN_DELETED or not.
  *
- * NOTE: Can only be used to locate chains which have not been deleted.
+ * NOTE: If no chain is found the caller usually must check the on-media
+ *      array to determine if a blockref exists at the index.
  */
+struct hammer2_chain_find_info {
+       hammer2_chain_t *best;
+       hammer2_tid_t   delete_tid;
+       int index;
+};
+
+static
+int
+hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
+{
+       struct hammer2_chain_find_info *info = data;
+
+       if (child->index < info->index)
+               return(-1);
+       if (child->index > info->index)
+               return(1);
+       return(0);
+}
+
+static
+int
+hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
+{
+       struct hammer2_chain_find_info *info = data;
+
+       if (info->delete_tid < child->delete_tid) {
+               info->delete_tid = child->delete_tid;
+               info->best = child;
+       }
+       return(0);
+}
+
+static
+hammer2_chain_t *
+hammer2_chain_find_locked(hammer2_chain_t *parent, int index)
+{
+       struct hammer2_chain_find_info info;
+
+       info.index = index;
+       info.delete_tid = 0;
+       info.best = NULL;
+
+       RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
+               hammer2_chain_find_cmp, hammer2_chain_find_callback,
+               &info);
+
+       return (info.best);
+}
+
 hammer2_chain_t *
 hammer2_chain_find(hammer2_chain_t *parent, int index)
 {
-       hammer2_chain_t dummy;
        hammer2_chain_t *chain;
 
-       dummy.flags = 0;
-       dummy.index = index;
-       dummy.delete_tid = HAMMER2_MAX_TID;
        spin_lock(&parent->core->cst.spin);
-       chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree, &dummy);
+       chain = hammer2_chain_find_locked(parent, index);
        if (chain)
                hammer2_chain_ref(chain);
        spin_unlock(&parent->core->cst.spin);
@@ -1362,6 +1453,9 @@ hammer2_chain_lookup(hammer2_chain_t **parentp,
                hammer2_chain_ref(parent);              /* ref old parent */
                hammer2_chain_unlock(parent);           /* unlock old parent */
                parent = parent->parent;
+               while (parent->duplink)
+                       parent = parent->duplink;
+
                                                        /* lock new parent */
                hammer2_chain_lock(parent, how_maybe);
                hammer2_chain_drop(*parentp);           /* drop old parent */
@@ -1432,7 +1526,8 @@ again:
        for (i = 0; i < count; ++i) {
                tmp = hammer2_chain_find(parent, i);
                if (tmp) {
-                       KKASSERT((tmp->flags & HAMMER2_CHAIN_DELETED) == 0);
+                       if (tmp->flags & HAMMER2_CHAIN_DELETED)
+                               continue;
                        bref = &tmp->bref;
                        KKASSERT(bref->type != 0);
                } else if (base == NULL || base[i].type == 0) {
@@ -1577,6 +1672,8 @@ again:
 
                i = parent->index + 1;
                nparent = parent->parent;
+               while (nparent->duplink)
+                       nparent = nparent->duplink;
                hammer2_chain_ref(nparent);     /* ref new parent */
                hammer2_chain_unlock(parent);   /* unlock old parent */
                                                /* lock new parent */
@@ -1634,7 +1731,10 @@ again2:
        while (i < count) {
                tmp = hammer2_chain_find(parent, i);
                if (tmp) {
-                       KKASSERT((tmp->flags & HAMMER2_CHAIN_DELETED) == 0);
+                       if (tmp->flags & HAMMER2_CHAIN_DELETED) {
+                               ++i;
+                               continue;
+                       }
                        bref = &tmp->bref;
                } else if (base == NULL || base[i].type == 0) {
                        ++i;
@@ -1709,7 +1809,7 @@ again2:
  * must be locked and held.  Do not pass the inode chain to this function
  * unless that is the chain returned by the failed lookup.
  *
- * (chain) is either NULL, a newly allocated chain, or a chain allocated
+ * (*chainp) is either NULL, a newly allocated chain, or a chain allocated
  * via hammer2_chain_duplicate().  When not NULL, the passed-in chain must
  * NOT be attached to any parent, and will be attached by this function.
  * This mechanic is used by the rename code.
@@ -1740,9 +1840,9 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t *parent,
 {
        hammer2_mount_t *hmp;
        hammer2_chain_t *chain;
+       hammer2_chain_t *child;
        hammer2_blockref_t dummy;
        hammer2_blockref_t *base;
-       hammer2_chain_t dummy_chain;
        int unlock_parent = 0;
        int allocated = 0;
        int error = 0;
@@ -1857,24 +1957,19 @@ again:
         * new slots can only transition from empty if the parent is
         * locked exclusively.
         */
-       bzero(&dummy_chain, sizeof(dummy_chain));
-       dummy_chain.delete_tid = HAMMER2_MAX_TID;
 
        spin_lock(&parent->core->cst.spin);
        for (i = 0; i < count; ++i) {
-               if (base == NULL) {
-                       dummy_chain.index = i;
-                       if (RB_FIND(hammer2_chain_tree,
-                                   &parent->core->rbtree, &dummy_chain) == NULL) {
+               child = hammer2_chain_find_locked(parent, i);
+               if (child) {
+                       if (child->flags & HAMMER2_CHAIN_DELETED)
                                break;
-                       }
-               } else if (base[i].type == 0) {
-                       dummy_chain.index = i;
-                       if (RB_FIND(hammer2_chain_tree,
-                                   &parent->core->rbtree, &dummy_chain) == NULL) {
-                               break;
-                       }
+                       continue;
                }
+               if (base == NULL)
+                       break;
+               if (base[i].type == 0)
+                       break;
        }
        spin_unlock(&parent->core->cst.spin);
 
@@ -1930,8 +2025,11 @@ again:
 
        /*
         * (allocated) indicates that this is a newly-created chain element
-        * rather than a renamed chain element.  In this situation we want
-        * to place the chain element in the MODIFIED state.
+        * rather than a renamed chain element.
+        *
+        * In this situation we want to place the chain element in
+        * the MODIFIED state.  The caller expects it to NOT be in the
+        * INITIAL state.
         *
         * The data area will be set up as follows:
         *
@@ -1967,16 +2065,9 @@ again:
                }
        } else {
                /*
-                * When reconnecting inodes we have to call setsubmod()
-                * to ensure that its state propagates up the newly
-                * connected parent.
-                *
-                * Make sure MOVED is set but do not update bref_flush.  If
-                * the chain is undergoing modification bref_flush will be
-                * updated when it gets flushed.  If it is not then the
-                * bref may not have been flushed yet and we do not want to
-                * set MODIFIED here as this could result in unnecessary
-                * reallocations.
+                * When reconnecting a chain we must set MOVED and setsubmod
+                * so the flush recognizes that it must update the bref in
+                * the parent.
                 */
                if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                        hammer2_chain_ref(chain);
@@ -2012,12 +2103,14 @@ done:
  *      appears atomic on the media.
  */
 void
-hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
-                       int i, hammer2_chain_t **chainp)
+hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
+                       hammer2_chain_t **chainp, hammer2_blockref_t *bref)
 {
        hammer2_mount_t *hmp = trans->hmp;
        hammer2_blockref_t *base;
-       hammer2_chain_t *chain;
+       hammer2_chain_t *ochain;
+       hammer2_chain_t *nchain;
+       hammer2_chain_t *scan;
        size_t bytes;
        int count;
 
@@ -2027,31 +2120,80 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * to the same bref (the same media block), and copying any inline
         * data.
         */
-       KKASSERT(((*chainp)->flags & HAMMER2_CHAIN_INITIAL) == 0);
-       chain = hammer2_chain_alloc(hmp, &(*chainp)->bref);
-       hammer2_chain_core_alloc(chain, (*chainp)->core);
+       ochain = *chainp;
+       if (bref == NULL)
+               bref = &ochain->bref;
+       nchain = hammer2_chain_alloc(hmp, bref);
+       hammer2_chain_core_alloc(nchain, ochain->core);
 
        bytes = (hammer2_off_t)1 <<
-               (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
-       chain->bytes = bytes;
+               (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
+       nchain->bytes = bytes;
 
-       switch(chain->bref.type) {
+       /*
+        * Be sure to copy the INITIAL flag as well or we could end up
+        * loading garbage from the bref.
+        */
+       if (ochain->flags & HAMMER2_CHAIN_INITIAL)
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
+
+       /*
+        * If the old chain is modified the new one must be too,
+        * but we only want to allocate a new bref.
+        */
+       if (ochain->flags & HAMMER2_CHAIN_MODIFIED) {
+               /*
+                * When duplicating chains the MODIFIED state is inherited.
+                * A new bref typically must be allocated.  However, file
+                * data chains may already have the data offset assigned
+                * to a logical buffer cache buffer so we absolutely cannot
+                * allocate a new bref here for TYPE_DATA.
+                *
+                * Basically the flusher core only dumps media topology
+                * and meta-data, not file data.  The VOP_FSYNC code deals
+                * with the file data.  XXX need back-pointer to inode.
+                */
+               if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_ref(nchain);
+               } else {
+                       hammer2_chain_modify(trans, nchain,
+                                            HAMMER2_MODIFY_OPTDATA);
+               }
+       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
+               /*
+                * When duplicating chains in the INITITAL state we need
+                * to ensure that the chain is marked modified so a
+                * block is properly assigned to it, otherwise the MOVED
+                * bit won't do the right thing.
+                */
+               KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
+               hammer2_chain_modify(trans, nchain, HAMMER2_MODIFY_OPTDATA);
+       }
+       if (parent || (ochain->flags & HAMMER2_CHAIN_MOVED)) {
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
+               hammer2_chain_ref(nchain);
+       }
+       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
+
+       switch(nchain->bref.type) {
        case HAMMER2_BREF_TYPE_VOLUME:
                panic("hammer2_chain_duplicate: cannot be called w/volhdr");
                break;
        case HAMMER2_BREF_TYPE_INODE:
                KKASSERT(bytes == HAMMER2_INODE_BYTES);
-               if ((*chainp)->data) {
-                       chain->data = kmalloc(sizeof(chain->data->ipdata),
+               if (ochain->data) {
+                       nchain->data = kmalloc(sizeof(nchain->data->ipdata),
                                              hmp->minode, M_WAITOK | M_ZERO);
-                       chain->data->ipdata = (*chainp)->data->ipdata;
+                       nchain->data->ipdata = ochain->data->ipdata;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-#if 0
-               panic("hammer2_chain_duplicate: cannot be used to"
-                     "create an indirect block");
-#endif
+               if ((nchain->flags & HAMMER2_CHAIN_MODIFIED) &&
+                   nchain->data) {
+                       bcopy(ochain->data, nchain->data,
+                             nchain->bytes);
+               }
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
@@ -2061,25 +2203,45 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        case HAMMER2_BREF_TYPE_DATA:
        default:
+               if ((nchain->flags & HAMMER2_CHAIN_MODIFIED) &&
+                   nchain->data) {
+                       bcopy(ochain->data, nchain->data,
+                             nchain->bytes);
+               }
                /* leave chain->data NULL */
-               KKASSERT(chain->data == NULL);
+               KKASSERT(nchain->data == NULL);
                break;
        }
 
        /*
         * Both chains must be locked for us to be able to set the
-        * duplink.  To avoid buffer cache deadlocks we do not try
-        * to resolve the new chain until after we've unlocked the
-        * old one.
-        */
-       hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
-       KKASSERT((*chainp)->duplink == NULL);
-       (*chainp)->duplink = chain;     /* inherits excess ref from alloc */
-       hammer2_chain_unlock(*chainp);
-       *chainp = chain;
-       hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
-       hammer2_chain_unlock(chain);
+        * duplink.  The caller may expect valid data.
+        *
+        * Unmodified duplicated blocks may have the same bref, we
+        * must be careful to avoid buffer cache deadlocks so we
+        * unlock the old chain before resolving the new one.
+        *
+        * Insert nchain at the end of the duplication list.
+        */
+       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
+
+       for (;;) {
+               for (scan = ochain; scan->duplink; scan = scan->duplink)
+                       ;
+               spin_lock(&scan->core->cst.spin);
+               if (scan->duplink == NULL) {
+                       /* inherits excess ref from alloc */
+                       scan->duplink = nchain;
+                       spin_unlock(&scan->core->cst.spin);
+                       break;
+               }
+               spin_unlock(&scan->core->cst.spin);
+       }
 
+       hammer2_chain_unlock(ochain);
+       *chainp = nchain;
+       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_unlock(nchain);
 
        /*
         * If parent is not NULL, insert into the parent at the requested
@@ -2125,22 +2287,22 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
                KKASSERT(i >= 0 && i < count);
                KKASSERT(base == NULL || base[i].type == 0);
 
-               chain->parent = parent;
-               chain->index = i;
-               KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
+               nchain->parent = parent;
+               nchain->index = i;
+               KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
                KKASSERT(parent->refs > 0);
                spin_lock(&parent->core->cst.spin);
-               if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, chain))
+               if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, nchain))
                        panic("hammer2_chain_link: collision");
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-               hammer2_chain_ref(parent);      /* chain->parent ref */
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
+               hammer2_chain_ref(parent);      /* nchain->parent ref */
                spin_unlock(&parent->core->cst.spin);
 
-               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(chain);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+               if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       hammer2_chain_ref(nchain);
+                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
                }
-               hammer2_chain_parent_setsubmod(chain);
+               hammer2_chain_parent_setsubmod(nchain);
        }
 }
 
@@ -2195,6 +2357,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        hammer2_blockref_t *base;
        hammer2_blockref_t *bref;
        hammer2_chain_t *chain;
+       hammer2_chain_t *child;
        hammer2_chain_t *ichain;
        hammer2_chain_t dummy;
        hammer2_key_t key = create_key;
@@ -2213,7 +2376,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
        *errorp = 0;
 
-       hammer2_chain_modify(trans, parent, HAMMER2_MODIFY_OPTDATA);
+       /*hammer2_chain_modify(trans, parent, HAMMER2_MODIFY_OPTDATA);*/
        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
                base = NULL;
 
@@ -2275,12 +2438,11 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        for (i = 0; i < count; ++i) {
                int nkeybits;
 
-               dummy.index = i;
-               chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree,
-                               &dummy);
-               if (chain) {
-                       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
-                       bref = &chain->bref;
+               child = hammer2_chain_find_locked(parent, i);
+               if (child) {
+                       if (child->flags & HAMMER2_CHAIN_DELETED)
+                               continue;
+                       bref = &child->bref;
                } else if (base && base[i].type) {
                        bref = &base[i];
                } else {
@@ -2293,8 +2455,13 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                 * that we will later cut in half (two halves @ nkeybits - 1).
                 */
                nkeybits = keybits;
-               if (nkeybits < bref->keybits)
+               if (nkeybits < bref->keybits) {
+                       if (bref->keybits > 64) {
+                               kprintf("bad bref index %d chain %p bref %p\n", i, chain, bref);
+                               Debugger("fubar");
+                       }
                        nkeybits = bref->keybits;
+               }
                while (nkeybits < 64 &&
                       (~(((hammer2_key_t)1 << nkeybits) - 1) &
                        (key ^ bref->key)) != 0) {
@@ -2400,6 +2567,13 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        hammer2_chain_drop(ichain);     /* excess ref from alloc */
 
        /*
+        * We have to mark it modified to allocate its block, but use
+        * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
+        * it won't be acted upon by the flush code.
+        */
+       hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA);
+
+       /*
         * Iterate the original parent and move the matching brefs into
         * the new indirect block.
         *
@@ -2414,12 +2588,14 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                 * anyway so we can avoid checking the cache when the media
                 * has a key.
                 */
-               dummy.index = i;
-               chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree,
-                               &dummy);
-               if (chain) {
-                       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
-                       bref = &chain->bref;
+               child = hammer2_chain_find_locked(parent, i);
+               if (child) {
+                       if (child->flags & HAMMER2_CHAIN_DELETED) {
+                               if (ichain->index < 0)
+                                       ichain->index = i;
+                               continue;
+                       }
+                       bref = &child->bref;
                } else if (base && base[i].type) {
                        bref = &base[i];
                } else {
@@ -2464,12 +2640,8 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                spin_unlock(&parent->core->cst.spin);
                chain = hammer2_chain_get(parent, i, HAMMER2_LOOKUP_NODATA);
                hammer2_chain_delete(trans, parent, chain);
-               hammer2_chain_duplicate(trans, ichain, i, &chain);
+               hammer2_chain_duplicate(trans, ichain, i, &chain, NULL);
 
-#if 0
-               if (base)
-                       bzero(&base[i], sizeof(base[i]));
-#endif
                hammer2_chain_unlock(chain);
                KKASSERT(parent->refs > 0);
                chain = NULL;
@@ -2489,6 +2661,9 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * The insertion shouldn't race as this is a completely new block
         * and the parent is locked.
         */
+       if (ichain->index < 0)
+               kprintf("indirect parent %p count %d key %016jx/%d\n",
+                       parent, count, (intmax_t)key, keybits);
        KKASSERT(ichain->index >= 0);
        KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
        spin_lock(&parent->core->cst.spin);
@@ -2498,6 +2673,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        ichain->parent = parent;
        hammer2_chain_ref(parent);      /* ichain->parent ref */
        spin_unlock(&parent->core->cst.spin);
+       KKASSERT(parent->duplink == NULL); /* XXX mus be inside spin */
 
        /*
         * Mark the new indirect block modified after insertion, which
@@ -2511,7 +2687,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * our moved blocks, then call setsubmod() to set the bit
         * recursively.
         */
-       hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA);
+       /*hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA);*/
        hammer2_chain_parent_setsubmod(ichain);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
 
@@ -2544,41 +2720,31 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
 
 /*
  * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
- * remove the parent's bref reference to chain, generating a modification
- * on the parent.
- *
- * We do not attempt to defer adjustment of the parent bref to the chain
- * as this could become quite complex with multiple deletions / replacements.
- * Intead, a modification is generated in the parent which can cause it to
- * be duplicated if the current parent's data is required for a flush in
- * progress.
+ * set chain->delete_tid.
  *
- * NOTE: We can trivially adjust the parent if it is in the INITIAL state.
- *
- * NOTE: The flush code handles the actual removal of the chain from
- *      the BTREE (also, depending on synchronization points, the
- *      chain may still be relevant to the flush).
- *
- * NOTE: chain->delete_tid distinguishes deleted chains from live chains,
- *      by setting it to something less than HAMMER2_MAX_TID the
- *      chain_lookup(), chain_next(), and chain_get() functions will
- *      not have visibility.
+ * This function does NOT generate a modification to the parent.  Such
+ * modifications are handled by the flush code and are properly merged
+ * using the flush synchronization point.  The find/get code will
+ * properly overload the RBTREE check on top of the bref check to detect
+ * deleted entries prior to flush.
  *
  * This function is NOT recursive.  Any entity already pushed into the
  * chain (such as an inode) may still need visibility into its contents,
  * as well as the ability to read and modify the contents.  For example,
  * for an unlinked file which is still open.
+ *
+ * NOTE: This function does NOT set chain->modify_tid, allowing future
+ *      code to distinguish between live and deleted chains by testing
+ *      sync_tid.
  */
 void
 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                     hammer2_chain_t *chain)
 {
-       hammer2_mount_t *hmp = trans->hmp;
-       hammer2_blockref_t *base;
-       int count;
-
-       if (chain->parent != parent)
+       if (chain->parent->core != parent->core)
                panic("hammer2_chain_delete: parent mismatch");
+       if (parent->duplink)
+               panic("hammer2_chain_delete: deleting via stale path");
        KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
 
        /*
@@ -2588,55 +2754,9 @@ hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                return;
 
        /*
-        * Mark the parent modified so our base[] pointer remains valid
-        * while we move entries.  For the optimized indirect block
-        * case mark the parent moved instead.
-        *
-        * Calculate the blockref reference in the parent and zero it out.
-        */
-       switch(parent->bref.type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               hammer2_chain_modify(trans, parent,
-                                    HAMMER2_MODIFY_NO_MODIFY_TID);
-               base = &parent->data->ipdata.u.blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-       case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
-       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               hammer2_chain_modify(trans, parent,
-                                    HAMMER2_MODIFY_OPTDATA |
-                                    HAMMER2_MODIFY_NO_MODIFY_TID);
-               if (parent->flags & HAMMER2_CHAIN_INITIAL)
-                       base = NULL;
-               else
-                       base = &parent->data->npdata.blockref[0];
-               count = parent->bytes / sizeof(hammer2_blockref_t);
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
-               hammer2_chain_modify(trans, parent,
-                                    HAMMER2_MODIFY_NO_MODIFY_TID);
-               base = &hmp->voldata.sroot_blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       default:
-               panic("hammer2_chain_delete: unrecognized blockref type: %d",
-                     parent->bref.type);
-               base = NULL;    /* NOT REACHED */
-               count = 0;      /* NOT REACHED */
-               break;          /* NOT REACHED */
-       }
-       KKASSERT(chain->index >= 0 && chain->index < count);
-
-       /*
-        * Clean out the blockref immediately.
-        */
-       if (base)
-               bzero(&base[chain->index], sizeof(*base));
-
-       /*
-        * Must set MOVED along with DELETED for the flush code to recognize
-        * the operation and properly disconnect the chain in-memory.
+        * We must set MOVED along with DELETED for the flush code to
+        * recognize the operation and properly disconnect the chain
+        * in-memory.
         *
         * The setting of DELETED causes finds, lookups, and _next iterations
         * to no longer recognize the chain.  RB_SCAN()s will still have
index 83bec77..4e883c3 100644 (file)
@@ -179,7 +179,8 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
                 * Flush pass1 on root.  SUBMODIFIED can remain set after
                 * this call for numerous reasons, including write failures,
                 * but most likely due to only a partial flush being
-                * requested.
+                * requested or the chain element belongs to the wrong
+                * synchronization point.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
@@ -208,9 +209,10 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
 }
 
 /*
- * (chain) is locked by the caller and remains locked on return.
- * This function is keyed off of SUBMODIFIED but must make fine-grained
- * choices based on the synchronization point we are flushing to.
+ * This is the core of the chain flushing code.  The chain is locked by the
+ * caller and remains locked on return.  This function is keyed off of
+ * the SUBMODIFIED bit but must make fine-grained choices based on the
+ * synchronization point we are flushing to.
  *
  * If the flush accomplished any work chain will be flagged MOVED
  * indicating a copy-on-write propagation back up is required.
@@ -228,6 +230,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
+       hammer2_tid_t saved_sync;
        size_t bbytes;
        size_t boff;
        char *bdata;
@@ -239,6 +242,32 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        hmp = info->hmp;
 
        /*
+        * A chain modified beyond our flush point is ignored by the current
+        * flush.  We could safely flush such chains if we wanted to (they
+        * just wouldn't propagate back up and be left with MOVED set), but
+        * doing so could lead to an infinite flush loop under heavy
+        * filesystem write loads.  By ignoring such elements the flush
+        * will only deal with changes as-of when the flush was started.
+        *
+        * NOTE: Unmodified chains set modify_tid to 0, allowing us to reach
+        *       deeper chains.
+        */
+       if (chain->modify_tid > info->sync_tid)
+               return;
+
+       /*
+        * Restrict the synchronization point when we encounter a
+        * delete/duplicate chain.  We do not do this for deletions
+        * at the end of the linked list because they represent an
+        * operation occuring within the flush range, whereas flushes
+        * through deleted chains which have been duplicated represent
+        * only changes made through that deletion point.
+        */
+       saved_sync = info->sync_tid;
+       if (chain->duplink && chain->delete_tid < saved_sync)
+               info->sync_tid = chain->delete_tid;
+
+       /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
@@ -247,6 +276,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
+               hammer2_tid_t saved_mirror;
 
                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
@@ -286,7 +316,9 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                 *       the scan.
                 */
                saved_parent = info->parent;
+               saved_mirror = info->mirror_tid;
                info->parent = chain;
+               info->mirror_tid = chain->bref.mirror_tid;
 
                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
@@ -320,11 +352,18 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
                        NULL, hammer2_chain_flush_scan2, info);
                spin_unlock(&chain->core->cst.spin);
+               chain->bref.mirror_tid = info->mirror_tid;
+               info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                hammer2_chain_drop(chain);
        }
 
        /*
+        * Restore sync_tid in case it was restricted by a delete/duplicate.
+        */
+       info->sync_tid = saved_sync;
+
+       /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;
@@ -343,6 +382,15 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        }
 
        /*
+        * The DESTROYED flag is set when an inode is physically deleted
+        * and no longer referenced (no open descriptors).   We can
+        * safely clear the MODIFIED bit.
+        *
+        * The MOVED bit has to be left intact as this flags the zeroing
+        * of the bref in the parent chain.
+        *
+        * XXX
+        *
         * Chain objects flagged for complete destruction recurse down from
         * their inode.  The inode will have already been removed from
         * its parent.  We have no need to disconnect the children from
@@ -350,11 +398,6 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
         * waste time and storage with copy-on-write operations), so
         * we can clear both the MODIFIED bit and the MOVED bit.
         *
-        * However, delete_tid must be within the synchronization zone
-        * for us to act on this bit.  Open-but-deleted files have to
-        * be managed by the cluster such that they are not subjected to
-        * reclamation.
-        *
         * DESTROYED chains stop processing here.
         */
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
@@ -365,10 +408,12 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
+#if 0
                if (chain->flags & HAMMER2_CHAIN_MOVED) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(chain);
                }
+#endif
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (destroyed)",
                                info->depth, info->depth, "",
@@ -378,8 +423,8 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        }
 
        /*
-        * If MODIFIED is not set or modify_tid is > sync_tid we have
-        * nothing to do.
+        * A degenerate flush might not have flushed anything and thus not
+        * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
@@ -387,18 +432,11 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
         * Both bits can be set along with DELETED due to a deletion if
         * modified data within the synchronization zone and the chain
         * was then deleted beyond the zone, in which case we still have
-        * to flush for synchronization point consistency.
+        * to flush for synchronization point consistency.  Otherwise though
+        * DELETED and MODIFIED are treated as separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;
-       if (chain->bref.modify_tid > info->sync_tid) {
-               if (hammer2_debug & 0x0008) {
-                       kprintf("%*.*s} %p/%d %04x (skip - beyond sync_tid)",
-                               info->depth, info->depth, "",
-                               chain, chain->refs, chain->flags);
-               }
-               return;
-       }
 
        /*
         * Issue flush.
@@ -442,15 +480,17 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
 
        /*
         * If this is part of a recursive flush we can go ahead and write
-        * out the buffer cache buffer and pass a new bref back up the chain.
+        * out the buffer cache buffer and pass a new bref back up the chain
+        * via the MOVED bit.
         *
-        * This will never be a volume header.
+        * Volume headers are NOT flushed here as they require special
+        * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * The volume header is flushed manually by the syncer, not
-                * here.
+                * here.  All we do is adjust the crc's.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
@@ -481,7 +521,8 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
-                * Make sure the buffer(s) have been flushed out here.
+                * Make sure any device buffer(s) have been flushed out here.
+                * (there aren't usually any to flush).
                 */
                bbytes = chain->bytes;
                pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
@@ -587,11 +628,15 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
  * Flush helper scan1 (recursive)
  *
  * Flushes the children of the caller's chain (parent) and updates
- * the blockref.
+ * the blockref, restricted by sync_tid.
  *
  * Ripouts during the loop should not cause any problems.  Because we are
  * flushing to a synchronization point, modification races will occur after
  * sync_tid and do not have to be flushed anyway.
+ *
+ * It is also ok if the parent is chain_duplicate()'d while unlocked because
+ * the delete/duplication will install a delete_tid that is still larger than
+ * our current sync_tid.
  */
 static int
 hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
@@ -601,15 +646,22 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        /*hammer2_mount_t *hmp = info->hmp;*/
        int diddeferral;
 
+#if 0
+       kprintf("flush %p,%d [%08x] -> %p [%08x] %jx\n",
+               parent, child->index, parent->flags,
+               child, child->flags, child->bref.key);
+#endif
+
        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
-        * a safety also recursive if MODIFIED is also set.  Return early
+        * a safety also recurse if MODIFIED is also set.  Return early
         * if neither bit is set.
         */
        if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
                             HAMMER2_CHAIN_MODIFIED)) == 0) {
                return (0);
        }
+       hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);
 
        /*
@@ -623,28 +675,31 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
                             HAMMER2_CHAIN_MODIFIED)) == 0) {
                hammer2_chain_unlock(child);
-               hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_drop(child);
+               hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
 
        /*
-        * Propagate the DESTROYED flag if found set as well as deal with
-        * delete_tid.  This also causes SUBMODIFIED to be propagated
-        * downward to keep the recursion going.
-        *
-        * In the case of delete_tid, nothing need be done.  Destruction
-        * occurs after any deletions and destruction of internal chains
-        * where delete_tid may be 0 (since we don't bother to copy-on-write
-        * the propagation of a deletion) will pass the conditional just
-        * fine.
+        * The DESTROYED flag can only be initially set on an unreferenced
+        * deleted inode and will propagate downward via the mechanic below.
+        * Such inode chains have been deleted for good and should no longer
+        * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by just
-        * setting HAMMER2_CHAIN_DESTROYED at the top level.
+        * setting HAMMER2_CHAIN_DESTROYED at the top level and then will
+        * cause the chains to be terminated and related buffers to be
+        * invalidated and not flushed out.
+        *
+        * We have to be careful not to propagate the DESTROYED flag if
+        * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
+           child->delete_tid <= info->sync_tid &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
+               KKASSERT(child->duplink == NULL);
                atomic_set_int(&child->flags,
                               HAMMER2_CHAIN_DESTROYED |
                               HAMMER2_CHAIN_SUBMODIFIED);
@@ -660,12 +715,9 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        info->diddeferral += diddeferral;
 
        hammer2_chain_unlock(child);
+       hammer2_chain_drop(child);
 
-       /*
-        * Always resolve when relocking the parent meta-data so Scan2
-        * has the indirect block data in-hand to handle the MOVED bit.
-        */
-       hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+       hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
 
        spin_lock(&parent->core->cst.spin);
        return (0);
@@ -675,32 +727,79 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
  * Flush helper scan2 (non-recursive)
  *
  * This pass on a chain's children propagates any MOVED or DELETED
- * elements back up the chain towards the root.  The bref's modify_tid
- * must be within the synchronization zone for MOVED to be recognized
- * and delete_tid must be within the synchronization zone for DELETED
- * to be recognized.
+ * elements back up the chain towards the root after those elements have
+ * been fully flushed.  Unlike scan1, this function is NOT recursive and
+ * the parent remains locked across the entire scan.
+ *
+ * Moves     - MOVED elements need to propagate their bref up to the parent.
+ *            all parents from element->parent through the duplink chain
+ *            must be updated.  The flag can only be reset once SUBMODIFIED
+ *            has been cleared for all parents in the chain.
+ *
+ *            A secondary bcmp of the bref is made to catch out-of-order
+ *            flushes and not re-sync parents which are already correct.
+ *
+ * Deletions - Deletions are handled via delete_tid coupled with the MOVED
+ *            flag.  When a deletion is detected the parent's bref to the
+ *            child is properly cleared.  MOVED is always set when a deletion
+ *            is made.  A deleted element is an element where delete_tid !=
+ *            HAMMER2_MAX_TID.
+ *
+ * NOTE!  We must re-set SUBMODIFIED on the parent(s) as appropriate, and
+ *       due to the above conditions it is possible to do this and still
+ *       have some children flagged MOVED depending on the synchronization.
  *
- * We must re-set SUBMODIFIED if appropriate.
+ * NOTE!  A deletion is a visbility issue, there can still be referenced to
+ *       deleted elements (for example, to an unlinked file which is still
+ *       open), and there can also be multiple chains pointing to the same
+ *       bref where some are deleted and some are not (for example due to
+ *       a rename).   So a chain marked for deletion is basically considered
+ *       to be live until it is explicitly destroyed.  Such chains can also
+ *       be freed once all MOVED and MODIFIED handling is done.
  */
 static int
 hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
 {
-       enum { HC_NONE, HC_DELETE, HC_UPDATE } action = HC_NONE;
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_mount_t *hmp = info->hmp;
        hammer2_blockref_t *base;
        int count;
+       int child_flags;
+
+       /*
+        * Ignore children modified beyond our flush point.  If the parent
+        * is deleted within our flush we don't have to re-set SUBMODIFIED,
+        * otherwise we must set it according to the child's flags so
+        * SUBMODIFIED remains flagged for later flushes.
+        *
+        * NOTE: modify_tid is only updated for modifications, NOT for
+        *       deletions (delete_tid is updated for deletions).  Also note
+        *       that delete_tid will ALWAYS be >= modify_tid.
+        *
+        * XXX spin-lock on child->modify_tid ?
+        */
+       if (child->modify_tid > info->sync_tid) {
+               if (parent->delete_tid <= info->sync_tid)
+                       child_flags = 0;
+               else
+                       child_flags = child->flags;
+               goto finalize;
+       }
 
        /*
         * Check update conditions prior to locking child.
         * We may not be able to safely test the 64-bit TIDs
         * but we can certainly test the flags.
+        *
+        * NOTE: DELETED always also sets MOVED.
         */
-       if ((child->flags & (HAMMER2_CHAIN_DELETED |
-                            HAMMER2_CHAIN_MOVED)) == 0) {
+       if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               child_flags = child->flags;
                goto finalize;
        }
+
+       hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);
 
        /*
@@ -708,53 +807,58 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         * the child from being destroyed out from under our operation
         * so we can lock the child safely without worrying about it
         * getting ripped up (?).
+        *
+        * We can only update parents where child->parent matches.  The
+        * child->parent link will migrate along the chain but the flush
+        * order must be enforced absolutely.  Parent reflushed after the
+        * child has passed them by should skip due to the modify_tid test.
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
 
-       /*
-        * Full condition check.  We can only update and clear MOVED
-        * if the child is deleted or updated within our synchronization
-        * zone.
-        */
-       if ((child->flags & HAMMER2_CHAIN_DELETED) &&
-           child->delete_tid <= info->sync_tid) {
-               action = HC_DELETE;
-       } else if ((child->flags & HAMMER2_CHAIN_MOVED) &&
-                  child->bref.modify_tid <= info->sync_tid) {
-               action = HC_UPDATE;
-       } else {
+       if (child->parent != parent) {
+               child_flags = child->flags;
                hammer2_chain_unlock(child);
                spin_lock(&parent->core->cst.spin);
                goto finalize;
        }
 
        /*
-        * If the parent is to be deleted then we can clear MOVED
-        * in the child without updating the parent.  That is, it
-        * doesn't matter that the parent->child blockref is left intact
-        * because the parent is going to be deleted too.  This little
-        * bit of code will result in major optimizations of recursive
-        * file tree deletions and truncations.
-        */
-       if ((parent->flags & HAMMER2_CHAIN_DELETED) &&
-           parent->delete_tid <= info->sync_tid) {
-               goto cleanup;
-       }
-
-       /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
-        * but can be reached on recursive deletions.  We can optimize
+        * but can be reached on recursive deletions.
+        *
+        * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(info->trans, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
+       if (parent->flags & HAMMER2_CHAIN_INITIAL)
+               kprintf("parent %p INITIAL still set! data %p\n",
+                       parent, parent->data);
 
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
+                * properly duplicate the inode headers and do proper flush
+                * range checks (all the children should be beyond the flush
+                * point).  For now just don't sync the non-applicable
+                * children.
+                *
+                * XXX Can also occur due to hardlink consolidation.  We
+                * set OPFLAG_DIRECTDATA to prevent the indirect and data
+                * blocks from syncing ot the hardlink pointer.
+                */
+#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
-               base = &parent->data->ipdata.u.blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
+#endif
+               if (parent->data->ipdata.op_flags &
+                   HAMMER2_OPFLAG_DIRECTDATA) {
+                       base = NULL;
+               } else {
+                       base = &parent->data->ipdata.u.blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+               }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                if (parent->data) {
@@ -786,40 +890,61 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         * WARNING! Deleted chains may still be used by the filesystem
         *          in a later duplication, for example in a rename()
         *          operation.  Also any topological movement of the
-        *          related blocks.
+        *          related blocks.  We only mess with the parent
+        *          block array, we do not mess with the child!
         *
         *          We adjust the parent's bref pointer to the child but
         *          we do not modify the contents of the child.
         */
-       if (action == HC_DELETE) {
+       if (child->delete_tid <= info->sync_tid) {
                if (base) {
                        KKASSERT(child->index < count);
                        bzero(&base[child->index], sizeof(child->bref));
+                       if (info->mirror_tid < child->delete_tid)
+                               info->mirror_tid = child->delete_tid;
                }
        } else {
                if (base) {
                        KKASSERT(child->index < count);
                        base[child->index] = child->bref;
+                       if (info->mirror_tid < child->modify_tid)
+                               info->mirror_tid = child->modify_tid;
                }
        }
        KKASSERT(child->index >= 0);
 
-       if (parent->bref.mirror_tid < child->bref.mirror_tid) {
-               parent->bref.mirror_tid = child->bref.mirror_tid;
+       /*
+        * Propagate mirror_tid back up.
+        */
+       if (info->mirror_tid < child->bref.mirror_tid) {
+               info->mirror_tid = child->bref.mirror_tid;
        }
        if (parent->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }
 
-cleanup:
-
        /*
-        * Cleanup the child's MOVED flag and unlock the child.
+        * We can only clear the MOVED flag when the child has progressed
+        * to the last parent in the duplication chain.
+        *
+        * MOVED might not be set if we are reflushing this chain due to
+        * the previous chain overwriting the same index in the parent.
         */
-       if (child->flags & HAMMER2_CHAIN_MOVED) {
-               atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
-               hammer2_chain_drop(child);      /* flag */
+       if (child->parent == parent && parent->duplink) {
+               hammer2_chain_ref(parent->duplink);
+               child->parent = parent->duplink;
+               child->modify_tid = child->parent->modify_tid;
+               hammer2_chain_drop(parent);
+       }
+       if ((child->flags & HAMMER2_CHAIN_MOVED) && parent->duplink == NULL) {
+               if (child->delete_tid == HAMMER2_MAX_TID) {
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
+                       hammer2_chain_drop(child);      /* flag */
+               } else if (child->delete_tid <= info->sync_tid) {
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
+                       hammer2_chain_drop(child);      /* flag */
+               }
        }
 
        /*
@@ -828,7 +953,9 @@ cleanup:
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
+       child_flags = child->flags;
        hammer2_chain_unlock(child);
+       hammer2_chain_drop(child);
        spin_lock(&parent->core->cst.spin);
 
        /*
@@ -838,7 +965,7 @@ cleanup:
         * up.
         */
 finalize:
-       if (child->flags & (HAMMER2_CHAIN_MOVED |
+       if (child_flags & (HAMMER2_CHAIN_MOVED |
                            HAMMER2_CHAIN_DELETED |
                            HAMMER2_CHAIN_MODIFIED |
                            HAMMER2_CHAIN_SUBMODIFIED)) {
index 3266ba6..7bcdb64 100644 (file)
@@ -63,7 +63,7 @@ hammer2_inode_drop(hammer2_inode_t *ip)
        hammer2_chain_t *chain;
        u_int refs;
 
-       for (;;) {
+       while (ip) {
                refs = ip->refs;
                cpu_ccfence();
                if (refs == 1) {
@@ -85,10 +85,8 @@ hammer2_inode_drop(hammer2_inode_t *ip)
                                 * ip->pip.  We can simply loop on it.
                                 */
                                kfree(ip, hmp->minode);
-                               if (pip == NULL)
-                                       break;
                                ip = pip;
-                               /* continue */
+                               /* continue with pip (can be NULL) */
                        }
                } else {
                        if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
@@ -478,7 +476,8 @@ retry:
 /*
  * Create a duplicate of (ochain) in the specified target directory (dip).
  * ochain must represent an inode.  The new chain is returned locked and
- * referenced.
+ * referenced and its filename is changed to a representation of the
+ * inode number "0xBLAH", as a hidden directory entry.
  */
 hammer2_chain_t *
 hammer2_inode_duplicate(hammer2_trans_t *trans, hammer2_chain_t *ochain,
@@ -488,7 +487,9 @@ hammer2_inode_duplicate(hammer2_trans_t *trans, hammer2_chain_t *ochain,
        hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
+       hammer2_chain_t *tmpchain;
        hammer2_key_t lhc;
+       hammer2_blockref_t bref;
 
        *errorp = 0;
        hmp = dip->hmp;
@@ -497,7 +498,8 @@ hammer2_inode_duplicate(hammer2_trans_t *trans, hammer2_chain_t *ochain,
 
        /*
         * Locate the inode or indirect block to create the new
-        * entry in.
+        * entry in.  lhc represents the inode number so there is
+        * no collision iteration.
         *
         * There should be no key collisions with invisible inode keys.
         */
@@ -514,6 +516,7 @@ retry:
         * Create entry in common parent directory.
         */
        if (*errorp == 0) {
+               KKASSERT(chain == NULL);
                *errorp = hammer2_chain_create(trans, parent, &chain,
                                               lhc, 0,
                                               HAMMER2_BREF_TYPE_INODE,/* n/a */
@@ -531,47 +534,43 @@ retry:
                goto retry;
        }
 
-       hammer2_chain_lookup_done(parent);
-
        /*
         * Handle the error case
         */
        if (*errorp) {
                KKASSERT(chain == NULL);
+               hammer2_chain_lookup_done(parent);
                return (NULL);
        }
 
        /*
-        * XXX This is currently a horrible hack.  Well, if we wanted to
-        *     duplicate a file, i.e. as in a snapshot, we definitely
-        *     would have to flush it first.
+        * Use chain as a placeholder for our duplication.
         *
-        *     For hardlink target generation we can theoretically move any
-        *     active chain structures without flushing, but that gets really
-        *     iffy for code which follows chain->parent and ip->pip links.
+        * Gain a second lock on ochain for the duplication function to
+        * unlock, maintain the original lock across the calls.
         *
-        * XXX only works with files.  Duplicating a directory hierarchy
-        *     requires a flush but doesn't deal with races post-flush.
-        *     Well, it would work I guess, but you might catch some files
-        *     mid-operation.
-        *
-        * We cannot leave ochain with any in-memory chains because (for a
-        * hardlink), ochain will become a OBJTYPE_HARDLINK which is just a
-        * pointer to the real hardlink's inum and can't have any sub-chains.
-        * XXX might be 0-ref chains left.
+        * This is a bit messy.
         */
-       hammer2_chain_flush(trans, ochain);
-       /*KKASSERT(RB_EMPTY(&ochain.rbhead));*/
+       hammer2_chain_delete(trans, parent, chain);
+       hammer2_chain_lock(ochain, HAMMER2_RESOLVE_ALWAYS);
+       tmpchain = ochain;
+       bref = tmpchain->bref;
+       bref.key = lhc;         /* key for create placeholder must match */
+       bref.keybits = 0;
+       hammer2_chain_duplicate(trans, parent, chain->index, &tmpchain, &bref);
+       hammer2_chain_lookup_done(parent);
 
-       hammer2_chain_modify(trans, chain, 0);
-       nipdata = &chain->data->ipdata;
-       *nipdata = ochain->data->ipdata;
+       hammer2_chain_unlock(chain);
+       chain = tmpchain;
+       tmpchain = NULL;        /* safety */
 
        /*
         * Directory entries are inodes but this is a hidden hardlink
         * target.  The name isn't used but to ease debugging give it
         * a name after its inode number.
         */
+       hammer2_chain_modify(trans, chain, 0);
+       nipdata = &chain->data->ipdata;
        ksnprintf(nipdata->filename, sizeof(nipdata->filename),
                  "0x%016jx", (intmax_t)nipdata->inum);
        nipdata->name_len = strlen(nipdata->filename);
@@ -691,7 +690,7 @@ hammer2_inode_connect(hammer2_trans_t *trans, hammer2_inode_t *dip,
                         */
                        nchain = ochain;
                        ochain = NULL;
-                       hammer2_chain_duplicate(trans, NULL, -1, &nchain);
+                       hammer2_chain_duplicate(trans, NULL, -1, &nchain, NULL);
                        error = hammer2_chain_create(trans, parent, &nchain,
                                                     lhc, 0,
                                                     HAMMER2_BREF_TYPE_INODE,
@@ -811,10 +810,18 @@ hammer2_inode_connect(hammer2_trans_t *trans, hammer2_inode_t *dip,
  * Return *nchainp = NULL as a safety.
  */
 void
-hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_chain_t **nchainp)
+hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
+                     hammer2_chain_t *nchain)
 {
-       hammer2_chain_t *nchain = *nchainp;
        hammer2_chain_t *ochain;
+       hammer2_inode_t *opip;
+
+       /*
+        * ip->chain points to the hardlink target, not the hardlink psuedo
+        * inode.  Do not repoint nchain to the pseudo-node.
+        */
+       if (nchain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK)
+               return;
 
        /*
         * Repoint ip->chain if necessary.
@@ -829,10 +836,17 @@ hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_chain_t **nchainp)
                        hammer2_chain_unlock(ochain);
                        hammer2_chain_drop(ochain);     /* for ip->chain */
                }
-       } else {
-               hammer2_chain_unlock(nchain);
+               /* replace locked chain in ip (additional lock) */
+               hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS);
+       }
+       if (ip->pip != pip) {
+               opip = ip->pip;
+               if (pip)
+                       hammer2_inode_ref(pip);
+               ip->pip = pip;
+               if (opip)
+                       hammer2_inode_drop(opip);
        }
-       *nchainp = NULL;
 }
 
 /*
@@ -1060,7 +1074,7 @@ hammer2_inode_calc_alloc(hammer2_key_t filesize)
 /*
  * Given an exclusively locked inode we consolidate its chain for hardlink
  * creation, adding (nlinks) to the file's link count and potentially
- * relocating the file to a directory common to ip->pip and tdip.
+ * relocating the inode to a directory common to ip->pip and tdip.
  *
  * Returns a locked chain in (*chainp) (the chain's lock is in addition to
  * any lock it might already have due to the inode being locked).  *chainp
@@ -1139,12 +1153,15 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
         * to all directory entries referencing the hardlink.
         */
        nchain = hammer2_inode_duplicate(trans, chain, cdip, &error);
+
        if (error == 0) {
                /*
-                * Bump nlinks on duplicated hidden inode.
+                * Bump nlinks on duplicated hidden inode, repoint
+                * ip->chain.
                 */
                hammer2_chain_modify(trans, nchain, 0);
                nchain->data->ipdata.nlinks += nlinks;
+               hammer2_inode_repoint(ip, cdip, nchain);
 
                /*
                 * If the old chain is not a hardlink target then replace
index ab4bb20..3c68588 100644 (file)
@@ -463,8 +463,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                                     pfs->name, strlen(pfs->name),
                                     &error);
        if (error == 0) {
-               hammer2_chain_modify(&trans, nip->chain, 0);
-               nipdata = &nip->chain->data->ipdata;
+               nipdata = hammer2_chain_modify_ip(&trans, nip, 0);
                nipdata->pfs_type = pfs->pfs_type;
                nipdata->pfs_clid = pfs->pfs_clid;
                nipdata->pfs_fsid = pfs->pfs_fsid;
index a725389..f65889c 100644 (file)
@@ -56,8 +56,8 @@
  * NOTE: We don't combine the inode/chain lock because putting away an
  *       inode would otherwise confuse multiple lock holders of the inode.
  *
- * WARNING! hammer2_inode_repoint() expects exactly one exclusive lock
- *         on ip->chain.
+ * WARNING! hammer2_inode_repoint() can replace ip->chain and assumes that
+ *         there is only one 'inode' lock. XXX
  */
 void
 hammer2_inode_lock_ex(hammer2_inode_t *ip)
@@ -68,6 +68,14 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
        ccms_thread_lock(&ip->topo_cst, CCMS_STATE_EXCLUSIVE);
 
        chain = ip->chain;
+       while (chain->duplink)
+               chain = chain->duplink;
+       if (chain != ip->chain) {
+               hammer2_chain_ref(chain);
+               hammer2_chain_drop(ip->chain);
+               ip->chain = chain;
+       }
+
        KKASSERT(chain != NULL);        /* for now */
        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
 }
index 76796b0..016bf5b 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+/*
+ * Kernel Filesystem interface
+ *
+ * NOTE! local ipdata pointers must be reloaded on any modifying operation
+ *      to the inode as its underlying chain may have changed.
+ */
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -51,8 +58,8 @@
 
 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
-static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
-                               int ioflag, int seqcount);
+static int hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
+                               struct uio *uio, int ioflag, int seqcount);
 static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_key_t lbase, int lblksize,
@@ -144,9 +151,38 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
        /*
         * Set SUBMODIFIED so we can detect and propagate the DESTROYED
         * bit in the flush code.
+        *
+        * ip->chain might be stale, correct it before checking as older
+        * versions of the chain are likely marked deleted even if the
+        * file hasn't been.  XXX ip->chain should never be stale on
+        * reclaim.
         */
        hammer2_inode_lock_ex(ip);
        chain = ip->chain;
+       if (chain->duplink)
+               kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);
+#if 0
+       while (chain->duplink)
+               chain = chain->duplink;
+       if (ip->chain != chain) {
+               hammer2_inode_repoint(ip, ip->pip, chain);
+               chain = ip->chain;
+       }
+#endif
+
+       /*
+        * The final close of a deleted file or directory marks it for
+        * destruction.  The DESTROYED flag allows the flusher to shortcut
+        * any modified blocks still unflushed (that is, just ignore them).
+        *
+        * HAMMER2 usually does not try to optimize the freemap by returning
+        * deleted blocks to it as it does not usually know how many snapshots
+        * might be referencing portions of the file/dir.  XXX TODO.
+        *
+        * XXX TODO - However, any modified file as-of when a snapshot is made
+        *            cannot use this optimization as some of the modifications
+        *            may wind up being part of the snapshot.
+        */
        vp->v_data = NULL;
        ip->vp = NULL;
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
@@ -327,6 +363,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                if (error == 0) {
                        if (ipdata->uflags != flags) {
                                hammer2_chain_modify(&trans, ip->chain, 0);
+                               ipdata = &ip->chain->data->ipdata; /* RELOAD */
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
@@ -360,6 +397,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                            ipdata->mode != cur_mode
                        ) {
                                hammer2_chain_modify(&trans, ip->chain, 0);
+                               ipdata = &ip->chain->data->ipdata; /* RELOAD */
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
@@ -382,6 +420,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                        } else {
                                hammer2_extend_file(&trans, ip, vap->va_size);
                        }
+                       ipdata = &ip->chain->data->ipdata; /* RELOAD */
                        domtime = 1;
                        break;
                default:
@@ -393,12 +432,14 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                hammer2_chain_modify(&trans, ip->chain, 0);
+               ipdata = &ip->chain->data->ipdata; /* RELOAD */
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
 #endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
                hammer2_chain_modify(&trans, ip->chain, 0);
+               ipdata = &ip->chain->data->ipdata; /* RELOAD */
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }
@@ -411,6 +452,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
                        hammer2_chain_modify(&trans, ip->chain, 0);
+                       ipdata = &ip->chain->data->ipdata; /* RELOAD */
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
@@ -664,6 +706,7 @@ hammer2_vop_write(struct vop_write_args *ap)
 {
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
+       hammer2_trans_t trans;
        thread_t td;
        struct vnode *vp;
        struct uio *uio;
@@ -711,8 +754,11 @@ hammer2_vop_write(struct vop_write_args *ap)
         * might wind up being copied into the embedded data area.
         */
        hammer2_inode_lock_ex(ip);
-       error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
+       hammer2_trans_init(&trans, ip->hmp);
+       error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount);
        hammer2_inode_unlock_ex(ip);
+       hammer2_trans_done(&trans);
+
        return (error);
 }
 
@@ -774,10 +820,9 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
  */
 static
 int
-hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
-                  int ioflag, int seqcount)
+hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
+                  struct uio *uio, int ioflag, int seqcount)
 {
-       hammer2_trans_t trans;
        hammer2_inode_data_t *ipdata;
        hammer2_key_t old_eof;
        struct buf *bp;
@@ -794,8 +839,6 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
        kflags = 0;
        error = 0;
 
-       hammer2_trans_init(&trans, ip->hmp);
-
        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
@@ -804,13 +847,16 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
         * Doing this now makes it easier to calculate buffer sizes in
         * the loop.
         */
+       KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
        old_eof = ipdata->size;
        if (uio->uio_offset + uio->uio_resid > ipdata->size) {
                modified = 1;
-               hammer2_extend_file(&trans, ip,
+               hammer2_extend_file(trans, ip,
                                    uio->uio_offset + uio->uio_resid);
+               ipdata = &ip->chain->data->ipdata;      /* RELOAD */
                kflags |= NOTE_EXTEND;
        }
+       KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
 
        /*
         * UIO write loop
@@ -916,8 +962,9 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                 * strategy code will take care of it in that case.
                 */
                bp->b_bio2.bio_offset =
-                       hammer2_assign_physical(&trans, ip,
+                       hammer2_assign_physical(trans, ip,
                                                lbase, lblksize, &error);
+               ipdata = &ip->chain->data->ipdata;      /* RELOAD */
                if (error) {
                        brelse(bp);
                        break;
@@ -972,18 +1019,24 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
         * the entire write is a failure and we have to back-up.
         */
        if (error && ipdata->size != old_eof) {
-               hammer2_truncate_file(&trans, ip, old_eof);
+               hammer2_truncate_file(trans, ip, old_eof);
+               ipdata = &ip->chain->data->ipdata;      /* RELOAD */
        } else if (modified) {
-               hammer2_chain_modify(&trans, ip->chain, 0);
+               hammer2_chain_modify(trans, ip->chain, 0);
+               ipdata = &ip->chain->data->ipdata;      /* RELOAD */
                hammer2_update_time(&ipdata->mtime);
        }
        hammer2_knote(ip->vp, kflags);
-       hammer2_trans_done(&trans);
+
        return error;
 }
 
 /*
- * Assign physical storage to a logical block.
+ * Assign physical storage to a logical block.  This function creates the
+ * related meta-data chains representing the data blocks and marks them
+ * MODIFIED.  We could mark them MOVED instead but ultimately I need to
+ * XXX code the flusher to check that the related logical buffer is
+ * flushed.
  *
  * NOOFFSET is returned if the data is inode-embedded.  In this case the
  * strategy code will simply bcopy() the data into the inode.
@@ -1295,6 +1348,7 @@ hammer2_extend_file(hammer2_trans_t *trans,
                            0, HAMMER2_EMBEDDED_BYTES,
                            0, (int)nsize,
                            1);
+               /* ipdata = &ip->chain->data->ipdata; RELOAD */
                return;
        }
 
@@ -1316,6 +1370,7 @@ hammer2_extend_file(hammer2_trans_t *trans,
                    0, nblksize,
                    0, (int)nsize & HAMMER2_PBUFMASK,
                    1);
+       ipdata = &ip->chain->data->ipdata;
 
        /*
         * Early return if we have no more work to do.
@@ -1776,7 +1831,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        error = hammer2_inode_connect(&trans, dip, &chain, name, name_len);
        if (error == 0) {
-               hammer2_inode_repoint(ip, &chain);
+               hammer2_inode_repoint(ip, ip->pip, chain);
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, ap->a_vp);
        }
@@ -1904,7 +1959,9 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
                        auio.uio_td = curthread;
                        aiov.iov_base = ap->a_target;
                        aiov.iov_len = bytes;
-                       error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
+                       error = hammer2_write_file(nip, &trans,
+                                                  &auio, IO_APPEND, 0);
+                       nipdata = &nip->chain->data->ipdata; /* RELOAD */
                        /* XXX handle error */
                        error = 0;
                }
@@ -2066,6 +2123,10 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
         * does nothing other than return the locked chain.
         *
         * The returned chain will be locked.
+        *
+        * WARNING!  We do not currently have a local copy of ipdata but
+        *           we do use one later remember that it must be reloaded
+        *           on any modification to the inode, including connects.
         */
        hammer2_inode_lock_ex(ip);
        error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
@@ -2099,7 +2160,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
                                      &chain, tname, tname_len);
        if (error == 0) {
                KKASSERT(chain != NULL);
-               hammer2_inode_repoint(ip, &chain);
+               hammer2_inode_repoint(ip, tdip, chain);
                cache_rename(ap->a_fnch, ap->a_tnch);
        }
 done: