hammer2 - multi-target mount part 2/many
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 30 Mar 2014 00:09:08 +0000 (17:09 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 30 Mar 2014 00:13:04 +0000 (17:13 -0700)
* Split media and in-memory tracking of TIDs.  In-memory tracking is
  effectively global, while on-media TIDs are now split into an independent
  domain for each PFS plus one for the super-root and freemap topology.

  Introduces hammer2_xid_t and reformulates some of the TID and XID
  constants.

* Refactor the flush code to no longer depend on media modify_tid/mirror_tid
  values during a flush.  In-memory XIDs are used for all flush recursions.

* Refactor the transaction mechanisms in order to allow XIDs to cover multiple
  TID domains.  For the moment, the transaction management structure is
  global but the code is designed to allow us to have multiple management
  domains in a future commit.

* Associate a PMP structure with each storage topology's super-root and
  freemap (still a bit rough).

* Each PFS now accounts for its own inode numbers and transaction ids,
  independent of other PFSs.  This also means that both inode numbers
  and transaction ids are PFS-specific, so multiple storage media which
  are part of the same PFS will use the PFS-centric tracking rather than
  per-media tracking.

* Clean up a few memory leaks and remove some debugging from the chain code.

* Refactor mount and unmount code to handle the new PFS abstraction.

* Refactor the volume data lock/unlock/modify mechanics somewhat.

14 files changed:
sbin/hammer2/cmd_debug.c
sbin/newfs_hammer2/newfs_hammer2.c
sys/vfs/hammer2/TODO
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_cluster.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index 549c4d8..b19f7a4 100644 (file)
@@ -554,8 +554,8 @@ show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref, int dofreemap)
                }
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
-               printf("alloc_tid=%016jx freemap_tid=%016jx ",
-                       media.voldata.alloc_tid,
+               printf("mirror_tid=%016jx freemap_tid=%016jx ",
+                       media.voldata.mirror_tid,
                        media.voldata.freemap_tid);
                if (dofreemap) {
                        bscan = &media.voldata.freemap_blockset.blockref[0];
index 3d62933..e9ff39f 100644 (file)
@@ -555,6 +555,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        rawip->pfs_fsid = Hammer2_PfsCLID;
        rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
        rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+       rawip->pfs_inum = 16;   /* first allocatable inode number */
 
        /* rawip->u.blockset is left empty */
 
@@ -574,6 +575,8 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        root_blockref.type = HAMMER2_BREF_TYPE_INODE;
        root_blockref.methods = HAMMER2_ENC_CHECK(HAMMER2_CHECK_ISCSI32) |
                                HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
+       root_blockref.mirror_tid = 16;
+       root_blockref.flags = HAMMER2_BREF_FLAG_PFSROOT;
 
        /*
         * Format the super-root directory inode, giving it one directory
@@ -605,11 +608,17 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
         * random FSID, making it possible to mirror an entire HAMMER2 disk
         * snapshots and all if desired.  PFS ids are used to match up
         * mirror sources and targets and cluster copy sources and targets.
+        *
+        * (XXX whole-disk logical mirroring is not really supported in
+        *  the first attempt because each PFS is in its own modify/mirror
+        *  transaction id domain, so normal mechanics cannot cross a PFS
+        *  boundary).
         */
        rawip->pfs_clid = Hammer2_SupCLID;
        rawip->pfs_fsid = Hammer2_SupFSID;
        rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
-       rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+       rawip->op_flags |= HAMMER2_OPFLAG_SUPROOT;
+       rawip->pfs_inum = 16;   /* first allocatable inode number */
 
        /*
         * The super-root has one directory entry pointing at the named
@@ -627,6 +636,7 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        sroot_blockref.type = HAMMER2_BREF_TYPE_INODE;
        sroot_blockref.methods = HAMMER2_ENC_CHECK(HAMMER2_CHECK_ISCSI32) |
                                 HAMMER2_ENC_COMP(HAMMER2_COMP_AUTOZERO);
+       sroot_blockref.mirror_tid = 16;
        rawip = NULL;
 
        /*
@@ -667,9 +677,8 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        vol->allocator_beg = alloc_base;
 
        vol->sroot_blockset.blockref[0] = sroot_blockref;
-       vol->mirror_tid = 0;
-       vol->alloc_tid = 16;    /* first transaction id */
-       vol->inode_tid = 16;    /* first allocatable inode number */
+       vol->mirror_tid = 16;   /* all blockref mirror TIDs set to 16 */
+       vol->freemap_tid = 16;  /* all blockref mirror TIDs set to 16 */
        vol->icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                        hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC1_OFF,
                                       HAMMER2_VOLUME_ICRC1_SIZE);
@@ -718,8 +727,8 @@ alloc_direct(hammer2_off_t *basep, hammer2_blockref_t *bref, size_t bytes)
                ++radix;
        }
        assert(bytes == 1);
-       if (radix < HAMMER2_MIN_RADIX)
-               radix = HAMMER2_MIN_RADIX;
+       if (radix < HAMMER2_RADIX_MIN)
+               radix = HAMMER2_RADIX_MIN;
 
        bzero(bref, sizeof(*bref));
        bref->data_off = *basep | radix;
index 525609a..e1871e3 100644 (file)
@@ -1,4 +1,13 @@
 
+* hammer2_xid_t needs to be 64 bits
+
+* snapshot creation must allocate and separately pass a new pmp for the pfs
+  degenerate 'cluster' representing the snapshot.  This theoretically will
+  also allow a snapshot to be generated inside a cluster of more than one
+  node.
+
+* snapshot copy currently also copies uuids and can confuse cluster code
+
 * hidden dir or other dirs/files/modifications made to PFS before
   additional cluster entries added.
 
index 898f81a..d90174d 100644 (file)
@@ -137,6 +137,16 @@ struct hammer2_span;
 struct hammer2_state;
 struct hammer2_msg;
 
+/*
+ * The xid tracks internal transactional updates.
+ *
+ * XXX fix-me, really needs to be 64-bits
+ */
+typedef uint32_t hammer2_xid_t;
+
+#define HAMMER2_XID_MIN        0x00000000U
+#define HAMMER2_XID_MAX 0x7FFFFFFFU
+
 /*
  * The chain structure tracks a portion of the media topology from the
  * root (volume) down.  Chains represent volumes, inodes, indirect blocks,
@@ -262,25 +272,12 @@ struct hammer2_chain {
        hammer2_chain_core_t    *above;
        struct hammer2_state    *state;         /* if active cache msg */
        struct hammer2_mount    *hmp;
-       struct hammer2_pfsmount *pmp;           /* can be NULL */
-
-       hammer2_blockref_t      dsrc;                   /* DEBUG */
-       int                     ninserts;               /* DEBUG */
-       int                     nremoves;               /* DEBUG */
-       hammer2_tid_t           dsrc_dupfromat;         /* DEBUG */
-       uint32_t                dsrc_dupfromflags;      /* DEBUG */
-       int                     dsrc_reason;            /* DEBUG */
-       int                     dsrc_ninserts;          /* DEBUG */
-       uint32_t                dsrc_flags;             /* DEBUG */
-       hammer2_tid_t           dsrc_modify;            /* DEBUG */
-       hammer2_tid_t           dsrc_delete;            /* DEBUG */
-       hammer2_tid_t           dsrc_update_lo;         /* DEBUG */
-       struct hammer2_chain    *dsrc_original;         /* DEBUG */
-
-       hammer2_tid_t   modify_tid;             /* flush filter */
-       hammer2_tid_t   delete_tid;             /* flush filter */
-       hammer2_tid_t   update_lo;              /* flush propagation */
-       hammer2_tid_t   update_hi;              /* setsubmod propagation */
+       struct hammer2_pfsmount *pmp;           /* (pfs-cluster pmp or spmp) */
+
+       hammer2_xid_t   modify_xid;             /* flush filter */
+       hammer2_xid_t   delete_xid;             /* flush filter */
+       hammer2_xid_t   update_xlo;             /* flush propagation */
+       hammer2_xid_t   update_xhi;             /* setsubmod propagation */
        hammer2_key_t   data_count;             /* delta's to apply */
        hammer2_key_t   inode_count;            /* delta's to apply */
        hammer2_io_t    *dio;                   /* physical data buffer */
@@ -331,6 +328,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_ONDBTREE         0x00080000      /* bmapped deletes */
 #define HAMMER2_CHAIN_DUPLICATED       0x00100000      /* fwd delete-dup */
 #define HAMMER2_CHAIN_PFSROOT          0x00200000      /* in pfs->cluster */
+#define HAMMER2_CHAIN_PFSBOUNDARY      0x00400000      /* super->pfs inode */
 
 /*
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
@@ -539,11 +537,9 @@ typedef struct hammer2_inode_unlink hammer2_inode_unlink_t;
  */
 struct hammer2_trans {
        TAILQ_ENTRY(hammer2_trans) entry;
-       struct hammer2_pfsmount *pmp;           /* might be NULL */
-       struct hammer2_mount    *hmp_single;    /* if single-targetted */
-       hammer2_tid_t           orig_tid;
-       hammer2_tid_t           sync_tid;       /* effective transaction id */
-       hammer2_tid_t           inode_tid;
+       struct hammer2_pfsmount *pmp;
+       hammer2_xid_t           sync_xid;
+       hammer2_tid_t           inode_tid;      /* inode number assignment */
        thread_t                td;             /* pointer */
        int                     flags;
        int                     blocked;
@@ -557,7 +553,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_TRANS_CONCURRENT       0x0002  /* concurrent w/flush */
 #define HAMMER2_TRANS_BUFCACHE         0x0004  /* from bioq strategy write */
 #define HAMMER2_TRANS_NEWINODE         0x0008  /* caller allocating inode */
-#define HAMMER2_TRANS_ISALLOCATING     0x0010  /* in allocator */
+#define HAMMER2_TRANS_UNUSED0010       0x0010
 #define HAMMER2_TRANS_PREFLUSH         0x0020  /* preflush state */
 
 #define HAMMER2_FREEMAP_HEUR_NRADIX    4       /* pwr 2 PBUFRADIX-MINIORADIX */
@@ -569,10 +565,23 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_CLUSTER_COPY_NOREF     0x0002  /* do not ref chains */
 
 /*
- * Global (per device) mount structure for device (aka vp->v_mount->hmp)
+ * Transaction Rendezvous
  */
 TAILQ_HEAD(hammer2_trans_queue, hammer2_trans);
 
+struct hammer2_trans_manage {
+       hammer2_xid_t           flush_xid;      /* last flush transaction */
+       hammer2_xid_t           alloc_xid;
+       struct lock             translk;        /* lockmgr lock */
+       struct hammer2_trans_queue transq;      /* modifying transactions */
+       int                     flushcnt;       /* track flush trans */
+};
+
+typedef struct hammer2_trans_manage hammer2_trans_manage_t;
+
+/*
+ * Global (per device) mount structure for device (aka vp->v_mount->hmp)
+ */
 struct hammer2_mount {
        struct vnode    *devvp;         /* device vnode */
        int             ronly;          /* read-only mount */
@@ -587,13 +596,9 @@ struct hammer2_mount {
        int             iofree_count;
        hammer2_chain_t vchain;         /* anchor chain (topology) */
        hammer2_chain_t fchain;         /* anchor chain (freemap) */
-       hammer2_inode_t *sroot;         /* super-root localized to media */
-       struct lock     alloclk;        /* lockmgr lock */
-       struct lock     voldatalk;      /* lockmgr lock */
-       struct hammer2_trans_queue transq; /* all in-progress transactions */
+       struct hammer2_pfsmount *spmp;  /* super-root pmp for transactions */
+       struct lock     vollk;          /* lockmgr lock */
        hammer2_off_t   heur_freemap[HAMMER2_FREEMAP_HEUR];
-       int             flushcnt;       /* #of flush trans on the list */
-
        int             volhdrno;       /* last volhdrno written */
        hammer2_volume_data_t voldata;
        hammer2_volume_data_t volsync;  /* synchronized voldata */
@@ -621,6 +626,7 @@ struct hammer2_pfsmount {
        TAILQ_ENTRY(hammer2_pfsmount) mntentry; /* hammer2_pfslist */
        uuid_t                  pfs_clid;
        uuid_t                  pfs_fsid;
+       hammer2_mount_t         *spmp_hmp;      /* (spmp only) */
        hammer2_inode_t         *iroot;         /* PFS root inode */
        hammer2_inode_t         *ihidden;       /* PFS hidden directory */
        struct lock             lock;           /* PFS lock for certain ops */
@@ -632,7 +638,10 @@ struct hammer2_pfsmount {
        struct malloc_type      *mmsg;
        kdmsg_iocom_t           iocom;
        struct spinlock         inum_spin;      /* inumber lookup */
-       struct hammer2_inode_tree inum_tree;
+       struct hammer2_inode_tree inum_tree;    /* (not applicable to spmp) */
+       hammer2_tid_t           alloc_tid;
+       hammer2_tid_t           flush_tid;
+       hammer2_tid_t           inode_tid;
        long                    inmem_inodes;
        long                    inmem_dirty_chains;
        int                     count_lwinprog; /* logical write in prog */
@@ -747,8 +756,6 @@ hammer2_cluster_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
 hammer2_cluster_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
 void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_cluster_t *chain);
 void hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_cluster_t *chain);
-void hammer2_voldata_lock(hammer2_mount_t *hmp);
-void hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify);
 ccms_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
 void hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, ccms_state_t ostate);
 ccms_state_t hammer2_inode_lock_upgrade(hammer2_inode_t *ip);
@@ -765,6 +772,8 @@ void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
 u_int64_t hammer2_timespec_to_time(const struct timespec *ts);
 u_int32_t hammer2_to_unix_xid(const uuid_t *uuid);
 void hammer2_guid_to_uuid(uuid_t *uuid, u_int32_t guid);
+hammer2_xid_t hammer2_trans_newxid(hammer2_pfsmount_t *pmp);
+void hammer2_trans_manage_init(void);
 
 hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
 int hammer2_getradix(size_t bytes);
@@ -824,7 +833,9 @@ void hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp);
 /*
  * hammer2_chain.c
  */
-void hammer2_modify_volume(hammer2_mount_t *hmp);
+void hammer2_voldata_lock(hammer2_mount_t *hmp);
+void hammer2_voldata_unlock(hammer2_mount_t *hmp);
+void hammer2_voldata_modify(hammer2_mount_t *hmp);
 hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
                                hammer2_pfsmount_t *pmp,
                                hammer2_trans_t *trans,
@@ -866,9 +877,9 @@ hammer2_chain_t *hammer2_chain_scan(hammer2_chain_t *parent,
                                hammer2_chain_t *chain,
                                int *cache_indexp, int flags);
 
-int hammer2_chain_create(hammer2_trans_t *trans,
-                               hammer2_chain_t **parentp,
+int hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                                hammer2_chain_t **chainp,
+                               hammer2_pfsmount_t *pmp,
                                hammer2_key_t key, int keybits,
                                int type, size_t bytes);
 void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
@@ -908,7 +919,7 @@ void hammer2_chain_refactor(hammer2_chain_t **chainp);
  * hammer2_trans.c
  */
 void hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
-                               hammer2_mount_t *hmp, int flags);
+                               int flags);
 void hammer2_trans_done(hammer2_trans_t *trans);
 
 /*
index 56e1415..967d68d 100644 (file)
@@ -142,7 +142,10 @@ hammer2_isclusterable(hammer2_chain_t *chain)
 }
 
 /*
- * Recursively set update_hi starting at chain up through to the root.
+ * Recursively set update_xhi starting at chain and moving upward.  Stop early
+ * if we hit a PFS transition (PFS flush code will have to detect the case
+ * and perform an update within its own transaction).  The transaction xid
+ * is only good within the current PFS.
  *
  * This controls top-down visibility for flushes.  The child has just one
  * 'above' core, but the core itself can be multi-homed with parents iterated
@@ -154,21 +157,23 @@ hammer2_isclusterable(hammer2_chain_t *chain)
  * allocating which requires the live tree).  The flush keeps track of its
  * recursion itself.
  *
- * XXX SMP races
+ * XXX SMP races.  For now we do not allow concurrent transactions with
+ *     different transaction ids and there should be no race, but if we do
+ *     later on there will be a problem.
  */
 void
 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
        hammer2_chain_core_t *above;
 
-       if (chain->update_hi < trans->sync_tid)
-               chain->update_hi = trans->sync_tid;
+       if (chain->update_xhi < trans->sync_xid)
+               chain->update_xhi = trans->sync_xid;
 
        while ((above = chain->above) != NULL) {
                spin_lock(&above->cst.spin);
                chain = TAILQ_LAST(&above->ownerq, h2_core_list);
-               if (chain->update_hi < trans->sync_tid)
-                       chain->update_hi = trans->sync_tid;
+               if (chain->update_xhi < trans->sync_xid)
+                       chain->update_xhi = trans->sync_xid;
                spin_unlock(&above->cst.spin);
        }
 }
@@ -182,6 +187,9 @@ hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
  * NULL.  The caller must call chain_core_alloc() to allocate or associate
  * a core with the chain.
  *
+ * chain->pmp inherits pmp unless the chain is an inode (other than the
+ * super-root inode).
+ *
  * NOTE: Returns a referenced but unlocked (because there is no core) chain.
  */
 hammer2_chain_t *
@@ -206,8 +214,6 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
                 * purposes.  The pmp can be NULL.
                 */
                chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
-               if (pmp)
-                       chain->pmp = pmp;
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
        case HAMMER2_BREF_TYPE_FREEMAP:
@@ -219,16 +225,26 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
                      bref->type);
        }
 
+       /*
+        * Initialize the new chain structure.
+        */
+       chain->pmp = pmp;
        chain->hmp = hmp;
        chain->bref = *bref;
        chain->bytes = bytes;
        chain->refs = 1;
        chain->flags = HAMMER2_CHAIN_ALLOCATED;
-       chain->delete_tid = HAMMER2_MAX_TID;
+       chain->delete_xid = HAMMER2_XID_MAX;
+
+       /*
+        * Set the PFS boundary flag if this chain represents a PFS root.
+        */
+       if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
+               chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY;
 
        /*
-        * Set modify_tid if a transaction is creating the inode.
-        * Enforce update_lo = 0 so nearby transactions do not think
+        * Set modify_xid if a transaction is creating the inode.
+        * Enforce update_xlo = 0 so nearby transactions do not think
         * it has been flushed when it hasn't.
         *
         * NOTE: When loading a chain from backing store or creating a
@@ -236,8 +252,8 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
         *       for setting these fields.
         */
        if (trans) {
-               chain->modify_tid = trans->sync_tid;
-               chain->update_lo = 0;
+               chain->modify_xid = trans->sync_xid;
+               chain->update_xlo = 0;
        }
 
        return (chain);
@@ -291,8 +307,8 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                 *
                 * It is possible for the DUPLICATED flag to already be
                 * set when called via a flush operation because flush
-                * operations may have to work on elements with delete_tid's
-                * beyond the flush sync_tid.  In this situation we must
+                * operations may have to work on elements with delete_xid's
+                * beyond the flush sync_xid.  In this situation we must
                 * ensure that nchain is placed just after ochain in the
                 * ownerq and that the DUPLICATED flag is set on nchain so
                 * 'live' operations skip past it to the correct chain.
@@ -306,10 +322,7 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                 *          more than just adjusting a block table.
                 */
                if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
-                       KKASSERT((trans->flags &
-                                 (HAMMER2_TRANS_ISFLUSH |
-                                  HAMMER2_TRANS_ISALLOCATING)) ==
-                                HAMMER2_TRANS_ISFLUSH);
+                       KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
                        atomic_set_int(&nchain->flags,
                                       HAMMER2_CHAIN_DUPLICATED);
                }
@@ -331,7 +344,7 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                 * on forward-indexed ochains.  We must properly insert
                 * nchain relative to ochain.
                 */
-               if (trans && trans->sync_tid < ochain->modify_tid) {
+               if (trans && trans->sync_xid < ochain->modify_xid) {
                        TAILQ_INSERT_BEFORE(ochain, nchain, core_entry);
                } else {
                        TAILQ_INSERT_AFTER(&core->ownerq, ochain,
@@ -1359,9 +1372,6 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        chain = *chainp;
        hmp = chain->hmp;
 
-       KKASSERT(chain->bref.mirror_tid != trans->sync_tid ||
-                (chain->flags & HAMMER2_CHAIN_MODIFIED));
-
        /*
         * data is not optional for freemap chains (we must always be sure
         * to copy the data on COW storage allocations).
@@ -1382,7 +1392,7 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         *
         * The freemap and volume header special chains are never D-Dd.
         */
-       if (chain->modify_tid != trans->sync_tid &&        /* cross boundary */
+       if (chain->modify_xid != trans->sync_xid &&        /* cross boundary */
            (flags & HAMMER2_MODIFY_INPLACE) == 0) {       /* from d-d */
                if (chain != &hmp->fchain && chain != &hmp->vchain) {
                        KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
@@ -1427,7 +1437,7 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        if (chain != &hmp->vchain && chain != &hmp->fchain) {
                if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
                     ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
-                     chain->modify_tid != trans->sync_tid)
+                     chain->modify_xid != trans->sync_xid)
                ) {
                        hammer2_freemap_alloc(trans, chain, chain->bytes);
                        /* XXX failed allocation */
@@ -1439,21 +1449,18 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        }
 
        /*
-        * Update modify_tid.  XXX special-case vchain/fchain because they
+        * Update modify_xid.  XXX special-case vchain/fchain because they
         * are always modified in-place.  Otherwise the chain being modified
         * must not be part of a future transaction.
         */
        if (chain == &hmp->vchain || chain == &hmp->fchain) {
-               if (chain->modify_tid <= trans->sync_tid)
-                       chain->modify_tid = trans->sync_tid;
+               if (chain->modify_xid <= trans->sync_xid)
+                       chain->modify_xid = trans->sync_xid;
        } else {
-               KKASSERT(chain->modify_tid <= trans->sync_tid);
-               chain->modify_tid = trans->sync_tid;
+               KKASSERT(chain->modify_xid <= trans->sync_xid);
+               chain->modify_xid = trans->sync_xid;
        }
 
-       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
-               chain->bref.modify_tid = trans->sync_tid;
-
        /*
         * Do not COW BREF_TYPE_DATA when OPTDATA is set.  This is because
         * data modifications are done via the logical buffer cache so COWing
@@ -1556,26 +1563,36 @@ skip2:
 }
 
 /*
- * Mark the volume as having been modified.  This short-cut version
- * does not have to lock the volume's chain, which allows the ioctl
- * code to make adjustments to connections without deadlocking.  XXX
- *
- * No ref is made on vchain when flagging it MODIFIED.
+ * Volume header data locks
  */
 void
-hammer2_modify_volume(hammer2_mount_t *hmp)
+hammer2_voldata_lock(hammer2_mount_t *hmp)
 {
-       hammer2_voldata_lock(hmp);
-       hammer2_voldata_unlock(hmp, 1);
+       lockmgr(&hmp->vollk, LK_EXCLUSIVE);
+}
+
+void
+hammer2_voldata_unlock(hammer2_mount_t *hmp)
+{
+       lockmgr(&hmp->vollk, LK_RELEASE);
+}
+
+void
+hammer2_voldata_modify(hammer2_mount_t *hmp)
+{
+       if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
+               atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_chain_ref(&hmp->vchain);
+       }
 }
 
 /*
  * This function returns the chain at the nearest key within the specified
- * range with the highest delete_tid.  The core spinlock must be held on
+ * range with the highest delete_xid.  The core spinlock must be held on
  * call and the returned chain will be referenced but not locked.
  *
  * The returned chain may or may not be in a deleted state.  Note that
- * live chains have a delete_tid = MAX_TID.
+ * live chains have a delete_xid = XID_MAX.
  *
  * This function will recurse through chain->rbtree as necessary and will
  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
@@ -1649,7 +1666,7 @@ hammer2_chain_find_deleted(hammer2_chain_t *parent,
                hammer2_chain_find_cmp, hammer2_chain_find_callback,
                &info);
        if ((child = info.best) != NULL) {
-               if (child->delete_tid <= parent->update_lo)
+               if (child->delete_xid <= parent->update_xlo)
                        child = NULL;
        }
        return child;
@@ -1699,43 +1716,43 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
                   child->bref.key <= info->key_beg) {
                /*
                 * If our current best is flush with key_beg and child is
-                * also flush with key_beg choose based on delete_tid.
+                * also flush with key_beg choose based on delete_xid.
                 *
                 * key_next will automatically be limited to the smaller of
                 * the two end-points.
                 */
-               if (child->delete_tid > best->delete_tid)
+               if (child->delete_xid > best->delete_xid)
                        info->best = child;
        } else if (child->bref.key < best->bref.key) {
                /*
                 * Child has a nearer key and best is not flush with key_beg.
                 * Truncate key_next to the old best key iff it had a better
-                * delete_tid.
+                * delete_xid.
                 */
                info->best = child;
-               if (best->delete_tid >= child->delete_tid &&
+               if (best->delete_xid >= child->delete_xid &&
                    (info->key_next > best->bref.key || info->key_next == 0))
                        info->key_next = best->bref.key;
        } else if (child->bref.key == best->bref.key) {
                /*
                 * If our current best is flush with the child then choose
-                * based on delete_tid.
+                * based on delete_xid.
                 *
                 * key_next will automatically be limited to the smaller of
                 * the two end-points.
                 */
-               if (child->delete_tid > best->delete_tid)
+               if (child->delete_xid > best->delete_xid)
                        info->best = child;
        } else {
                /*
                 * Keep the current best but truncate key_next to the child's
-                * base iff the child has a higher delete_tid.
+                * base iff the child has a higher delete_xid.
                 *
                 * key_next will also automatically be limited to the smaller
                 * of the two end-points (probably not necessary for this case
                 * but we do it anyway).
                 */
-               if (child->delete_tid >= best->delete_tid &&
+               if (child->delete_xid >= best->delete_xid &&
                    (info->key_next > child->bref.key || info->key_next == 0))
                        info->key_next = child->bref.key;
        }
@@ -1752,8 +1769,9 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
 
 /*
  * Retrieve the specified chain from a media blockref, creating the
- * in-memory chain structure which reflects it.  modify_tid will be
- * left 0 which forces any modifications to issue a delete-duplicate.
+ * in-memory chain structure which reflects it.  modify_xid will be
+ * set to the min value which forces any modifications to issue a
+ * delete-duplicate.
  *
  * To handle insertion races pass the INSERT_RACE flag along with the
  * generation number of the core.  NULL will be returned if the generation
@@ -1762,6 +1780,9 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
  *
  * Caller must hold the parent locked shared or exclusive since we may
  * need the parent's bref array to find our block.
+ *
+ * WARNING! chain->pmp is left NULL if the bref represents a PFS mount
+ *         point.
  */
 hammer2_chain_t *
 hammer2_chain_get(hammer2_chain_t *parent, int generation,
@@ -1776,16 +1797,19 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
         * Allocate a chain structure representing the existing media
         * entry.  Resulting chain has one ref and is not locked.
         */
-       chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
+       if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
+               chain = hammer2_chain_alloc(hmp, NULL, NULL, bref);
+       else
+               chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
        hammer2_chain_core_alloc(NULL, chain, NULL);
        /* ref'd chain returned */
 
        /*
-        * Set modify_tid and update_lo to the chain's synchronization
+        * Set modify_xid and update_xlo to the chain's synchronization
         * point from the media.
         */
-       chain->modify_tid = chain->bref.mirror_tid;
-       chain->update_lo = chain->bref.mirror_tid;
+       chain->modify_xid = HAMMER2_XID_MIN;
+       chain->update_xlo = HAMMER2_XID_MIN;
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
 
        /*
@@ -2401,7 +2425,7 @@ again:
        spin_lock(&above->cst.spin);
        chain = hammer2_combined_find(parent, base, count,
                                      cache_indexp, &next_key,
-                                     key, HAMMER2_MAX_KEY,
+                                     key, HAMMER2_KEY_MAX,
                                      &bref);
        generation = above->generation;
 
@@ -2510,7 +2534,7 @@ done:
  */
 int
 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
-                    hammer2_chain_t **chainp,
+                    hammer2_chain_t **chainp, hammer2_pfsmount_t *pmp,
                     hammer2_key_t key, int keybits, int type, size_t bytes)
 {
        hammer2_mount_t *hmp;
@@ -2524,6 +2548,9 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        int count;
        int maxloops = 300000;
 
+       /*
+        * Topology may be crossing a PFS boundary.
+        */
        above = parent->core;
        KKASSERT(ccms_thread_lock_owned(&above->cst));
        hmp = parent->hmp;
@@ -2542,7 +2569,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                dummy.keybits = keybits;
                dummy.data_off = hammer2_getradix(bytes);
                dummy.methods = parent->bref.methods;
-               chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy);
+               chain = hammer2_chain_alloc(hmp, pmp, trans, &dummy);
                hammer2_chain_core_alloc(trans, chain, NULL);
 
                /*
@@ -2599,7 +2626,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 *
                 * The chain must be modified in the current transaction
                 * (the duplication code should have done that for us),
-                * and it's modify_tid should be greater than the parent's
+                * and its modify_xid should be greater than the parent's
                 * bref.mirror_tid.  This should cause it to be created under
                 * the new parent.
                 *
@@ -2609,7 +2636,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 *
                 * Do NOT mess with the current state of the INITIAL flag.
                 */
-               KKASSERT(chain->modify_tid == trans->sync_tid);
+               KKASSERT(chain->modify_xid == trans->sync_xid);
                chain->bref.key = key;
                chain->bref.keybits = keybits;
                KKASSERT(chain->above == NULL);
@@ -2808,7 +2835,7 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
         * We want nchain to be our go-to live chain, but ochain may be in
         * a MODIFIED state within the current flush synchronization segment.
         * Force any further modifications of ochain to do another COW
-        * operation even if modify_tid indicates that one is not needed.
+        * operation even if modify_xid indicates that one is not needed.
         *
         * We don't want to set FORCECOW on nchain simply as an optimization,
         * as many duplication calls simply move chains into ichains and
@@ -2829,10 +2856,10 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
         * it with the same core, making it the same size, pointing it
         * to the same bref (the same media block).
         *
-        * Give nchain the same modify_tid that we previously ensured was
+        * Give nchain the same modify_xid that we previously ensured was
         * sufficiently advanced to trigger a block table insertion on flush.
         *
-        * nchain copies ochain's data and must inherit ochain->update_lo.
+        * nchain copies ochain's data and must inherit ochain->update_xlo.
         *
         * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
         *       hammer2_chain_alloc()
@@ -2849,14 +2876,16 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        bytes = (hammer2_off_t)1 <<
                (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
        nchain->bytes = bytes;
-       nchain->modify_tid = ochain->modify_tid;
-       nchain->update_lo = ochain->update_lo;
+       nchain->modify_xid = ochain->modify_xid;
+       nchain->update_xlo = ochain->update_xlo;
        nchain->inode_reason = ochain->inode_reason + 0x100000;
        atomic_set_int(&nchain->flags,
                       ochain->flags & (HAMMER2_CHAIN_INITIAL |
                                        HAMMER2_CHAIN_FORCECOW |
-                                       HAMMER2_CHAIN_UNLINKED));
-       if (ochain->modify_tid == trans->sync_tid)
+                                       HAMMER2_CHAIN_UNLINKED |
+                                       HAMMER2_CHAIN_PFSROOT |
+                                       HAMMER2_CHAIN_PFSBOUNDARY));
+       if (ochain->modify_xid == trans->sync_xid)
                atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
 
        /*
@@ -2869,12 +2898,12 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
 
        /*
         * Place nchain in the modified state, instantiate media data
-        * if necessary.  Because modify_tid is already completely
+        * if necessary.  Because modify_xid is already completely
         * synchronized this should not result in a delete-duplicate.
         *
         * We want nchain at the target to look like a new insertion.
         * Forcing the modification to be INPLACE accomplishes this
-        * because we get the same nchain with an updated modify_tid.
+        * because we get the same nchain with an updated modify_xid.
         */
        if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                hammer2_chain_modify(trans, &nchain,
@@ -2903,7 +2932,7 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
                KKASSERT(parent->refs > 0);
 
-               hammer2_chain_create(trans, parentp, &nchain,
+               hammer2_chain_create(trans, parentp, &nchain, nchain->pmp,
                                     nchain->bref.key, nchain->bref.keybits,
                                     nchain->bref.type, nchain->bytes);
                parent = NULL;
@@ -2922,7 +2951,7 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
  *
  * If appropriate, the chain is added to the shadow topology and FLUSH_DELETE
  * is set for flusher visibility.  The caller is responsible for calling
- * setsubmod on chain, so we do not adjust update_hi here.
+ * setsubmod on chain, so we do not adjust update_xhi here.
  */
 static void
 _hammer2_chain_delete_helper(hammer2_trans_t *trans,
@@ -2933,7 +2962,7 @@ _hammer2_chain_delete_helper(hammer2_trans_t *trans,
        hammer2_chain_t *xchain;
 
        KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
-       KKASSERT(trans->sync_tid >= chain->modify_tid);
+       KKASSERT(trans->sync_xid >= chain->modify_xid);
        KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
                                  HAMMER2_CHAIN_ONDBQ |
                                  HAMMER2_CHAIN_ONDBTREE |
@@ -2943,7 +2972,7 @@ _hammer2_chain_delete_helper(hammer2_trans_t *trans,
         * Flag as deleted, reduce live_count and bump the above core's
         * generation.
         */
-       chain->delete_tid = trans->sync_tid;
+       chain->delete_xid = trans->sync_xid;
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        atomic_add_int(&above->live_count, -1);
        ++above->generation;
@@ -3046,7 +3075,7 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         */
        nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
        if ((ochain->flags & HAMMER2_CHAIN_DELETED) ||
-           (ochain->modify_tid > trans->sync_tid)) {
+           (ochain->modify_xid > trans->sync_xid)) {
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
        }
        if (flags & HAMMER2_DELDUP_RECORE)
@@ -3065,7 +3094,7 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * doing this in-place under the same parent the block array
         * inserted/deleted state does not change.
         *
-        * nchain copies ochain's data and must inherit ochain->update_lo.
+        * nchain copies ochain's data and must inherit ochain->update_xlo.
         *
         * If ochain was previously marked FORCECOW we also flag nchain
         * FORCECOW (used during hardlink splits).  FORCECOW forces a
@@ -3080,21 +3109,13 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        atomic_set_int(&nchain->flags,
                       ochain->flags & (HAMMER2_CHAIN_INITIAL |
                                        HAMMER2_CHAIN_FORCECOW |
-                                       HAMMER2_CHAIN_UNLINKED));
-       if (ochain->modify_tid == trans->sync_tid)
+                                       HAMMER2_CHAIN_UNLINKED |
+                                       HAMMER2_CHAIN_PFSROOT |
+                                       HAMMER2_CHAIN_PFSBOUNDARY));
+       if (ochain->modify_xid == trans->sync_xid)
                atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
        nchain->inode_reason = ochain->inode_reason + 0x1000;
-       nchain->update_lo = ochain->update_lo;
-       nchain->dsrc = ochain->bref;                    /* DEBUG */
-       nchain->dsrc_dupfromat = trans->sync_tid;       /* DEBUG */
-       nchain->dsrc_dupfromflags = trans->flags;       /* DEBUG */
-       nchain->dsrc_reason = ochain->inode_reason;     /* DEBUG */
-       nchain->dsrc_ninserts = ochain->ninserts;       /* DEBUG */
-       nchain->dsrc_flags = ochain->flags;             /* DEBUG */
-       nchain->dsrc_modify = ochain->modify_tid;       /* DEBUG */
-       nchain->dsrc_delete = ochain->delete_tid;       /* DEBUG */
-       nchain->dsrc_update_lo = ochain->update_lo;     /* DEBUG */
-       nchain->dsrc_original = ochain;                 /* DEBUG */
+       nchain->update_xlo = ochain->update_xlo;
 
        /*
         * Lock nchain so both chains are now locked (extremely important
@@ -3109,8 +3130,8 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                                  HAMMER2_CHAIN_ONDBQ));
        spin_lock(&above->cst.spin);
 
-       nchain->modify_tid = ochain->modify_tid;
-       nchain->delete_tid = HAMMER2_MAX_TID;
+       nchain->modify_xid = ochain->modify_xid;
+       nchain->delete_xid = HAMMER2_XID_MAX;
 
        if ((nchain->flags & HAMMER2_CHAIN_DELETED) &&
            (oflags & HAMMER2_CHAIN_DUPLICATED)) {
@@ -3124,17 +3145,17 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                 * of ochain.
                 */
                KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
-               KKASSERT(ochain->modify_tid < trans->sync_tid);
-               KKASSERT(ochain->delete_tid > trans->sync_tid);
+               KKASSERT(ochain->modify_xid < trans->sync_xid);
+               KKASSERT(ochain->delete_xid > trans->sync_xid);
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_TEMPORARY);
                hammer2_chain_insert(above, ochain, nchain, 0, 0);
 
                if ((ochain->flags & HAMMER2_CHAIN_DELETED) &&
-                   ochain->modify_tid < trans->sync_tid) {
-                       nchain->delete_tid = ochain->delete_tid;
-                       ochain->delete_tid = trans->sync_tid;
-               } else if (ochain->modify_tid > trans->sync_tid) {
-                       nchain->delete_tid = ochain->modify_tid;
+                   ochain->modify_xid < trans->sync_xid) {
+                       nchain->delete_xid = ochain->delete_xid;
+                       ochain->delete_xid = trans->sync_xid;
+               } else if (ochain->modify_xid > trans->sync_xid) {
+                       nchain->delete_xid = ochain->modify_xid;
                }
        } else if (nchain->flags & HAMMER2_CHAIN_DELETED) {
                /*
@@ -3149,7 +3170,7 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                 * marked deleted.
                 */
                hammer2_chain_insert(above, ochain, nchain, 0, 0);
-               nchain->delete_tid = trans->sync_tid;
+               nchain->delete_xid = trans->sync_xid;
        } else {
                /*
                 * Normal case, delete-duplicate deletes ochain and nchain
@@ -3173,8 +3194,8 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * Finishing fixing up nchain.  A new block will be allocated if
         * crossing a synchronization point (meta-data only).
         *
-        * Calling hammer2_chain_modify() will update modify_tid to
-        * (typically) trans->sync_tid.
+        * Calling hammer2_chain_modify() will update modify_xid to
+        * (typically) trans->sync_xid.
         */
        if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                hammer2_chain_modify(trans, &nchain,
@@ -3360,7 +3381,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * dummy used in later chain allocation (no longer used for lookups).
         */
        bzero(&dummy, sizeof(dummy));
-       dummy.delete_tid = HAMMER2_MAX_TID;
+       dummy.delete_xid = HAMMER2_XID_MAX;
 
        /*
         * When creating an indirect block for a freemap node or leaf
@@ -3433,7 +3454,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * XXX handle flushes.
         */
        key_beg = 0;
-       key_end = HAMMER2_MAX_KEY;
+       key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
        spin_lock(&above->cst.spin);
        loops = 0;
@@ -3651,7 +3672,7 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
         * slots which are overridden with a deletion.
         */
        key_beg = 0;
-       key_end = HAMMER2_MAX_KEY;
+       key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
        spin_lock(&above->cst.spin);
 
@@ -3764,7 +3785,7 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
         * range into the new indirect block.
         */
        key_beg = 0;
-       key_end = HAMMER2_MAX_KEY;
+       key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
        spin_lock(&above->cst.spin);
 
@@ -3896,7 +3917,7 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
 
 /*
  * Sets CHAIN_DELETED and CHAIN_FLUSH_DELETE in the chain being deleted and
- * set chain->delete_tid.  The chain is not actually marked possibly-free
+ * set chain->delete_xid.  The chain is not actually marked possibly-free
  * in the freemap until the deletion is completely flushed out (because
  * a flush which doesn't cover the entire deletion is flushing the deleted
  * chain as if it were live).
@@ -3914,10 +3935,6 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
  * as well as the ability to read and modify the contents.  For example,
  * for an unlinked file which is still open.
  *
- * NOTE: This function does NOT set chain->modify_tid, allowing future
- *      code to distinguish between live and deleted chains by testing
- *      trans->sync_tid vs chain->modify_tid and chain->delete_tid.
- *
  * NOTE: Deletions normally do not occur in the middle of a duplication
  *      chain but we use a trick for hardlink migration that refactors
  *      the originating inode without deleting it, so we make no assumptions
@@ -4133,9 +4150,12 @@ hammer2_combined_find(hammer2_chain_t *parent,
            chain->bref.key == base[i].key) {
                KKASSERT(chain->bref.key == base[i].key);
                if ((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
-                       kprintf("chain not bmapped %p.%d %08x\n", chain, chain->bref.type, chain->flags);
-                       kprintf("in chain mod/del %016jx %016jx\n", chain->modify_tid, chain->delete_tid);
-                       kprintf("and updlo/hi %016jx %016jx\n", chain->update_lo, chain->update_hi);
+                       kprintf("chain not bmapped %p.%d %08x\n",
+                               chain, chain->bref.type, chain->flags);
+                       kprintf("in chain mod/del %08x %08x\n",
+                               chain->modify_xid, chain->delete_xid);
+                       kprintf("and updlo/hi %08x %08x\n",
+                               chain->update_xlo, chain->update_xhi);
                }
                KKASSERT(chain->flags & HAMMER2_CHAIN_BMAPPED);
                bref = &chain->bref;
@@ -4203,10 +4223,6 @@ hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                return;
        }
        bzero(&base[i], sizeof(*base));
-       base[i].mirror_tid = (intptr_t)parent;          /* MEDIA DEBUG */
-       base[i].modify_tid = (intptr_t)child;           /* MEDIA DEBUG */
-       base[i].check.debug.sync_tid = trans->sync_tid; /* MEDIA DEBUG */
-       ++parent->nremoves;                             /* DEBUG */
 
        /*
         * We can only optimize core->live_zero for live chains.
@@ -4268,7 +4284,6 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
                        i = core->live_zero++;
                        base[i] = *elm;
-       ++parent->ninserts;     /* DEBUG */
                        return;
                }
        }
@@ -4302,7 +4317,6 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                                      (i - j - 1) * sizeof(*base));
                                base[i - 1] = *elm;
                        }
-       ++parent->ninserts;     /* DEBUG */
                        goto validate;
                }
                ++k;
@@ -4320,7 +4334,6 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                                        core->live_zero = k + 1;
                        }
                        u = 2;
-       ++parent->ninserts;     /* DEBUG */
                        goto validate;
                }
        }
index 102bd70..9c76174 100644 (file)
@@ -149,7 +149,7 @@ hammer2_cluster_from_chain(hammer2_chain_t *chain)
        cluster->array[0] = chain;
        cluster->nchains = 1;
        cluster->focus = chain;
-       cluster->pmp = chain->pmp;              /* can be NULL */
+       cluster->pmp = chain->pmp;
        cluster->refs = 1;
 
        return cluster;
@@ -202,19 +202,18 @@ hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
 
        rcluster = &pmp->iroot->cluster;
        for (i = 0; i < rcluster->nchains; ++i) {
-               chain = hammer2_chain_alloc(rcluster->array[i]->hmp, pmp,
-                                           trans, bref);
-               chain->pmp = pmp;
+               chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
+                                           pmp, trans, bref);
                chain->hmp = rcluster->array[i]->hmp;
                chain->bref = *bref;
                chain->bytes = bytes;
                chain->refs = 1;
                chain->flags = HAMMER2_CHAIN_ALLOCATED;
-               chain->delete_tid = HAMMER2_MAX_TID;
+               chain->delete_xid = HAMMER2_XID_MAX;
 
                /*
                 * Set modify_xid if a transaction is creating the inode.
-                * Enforce update_lo = 0 so nearby transactions do not think
+                * Enforce update_xlo = 0 so nearby transactions do not think
                 * it has been flushed when it hasn't.
                 *
                 * NOTE: When loading a chain from backing store or creating a
@@ -222,8 +221,8 @@ hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
                 *       responsible for setting these fields.
                 */
                if (trans) {
-                       chain->modify_tid = trans->sync_tid;
-                       chain->update_lo = 0;
+                       chain->modify_xid = trans->sync_xid;
+                       chain->update_xlo = 0;
                }
                cluster->array[i] = chain;
        }
@@ -888,9 +887,8 @@ hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                                cparent->focus = cparent->array[i];
                        continue;
                }
-               error = hammer2_chain_create(trans,
-                                            &cparent->array[i],
-                                            &cluster->array[i],
+               error = hammer2_chain_create(trans, &cparent->array[i],
+                                            &cluster->array[i], pmp,
                                             key, keybits, type, bytes);
                KKASSERT(error == 0);
                if (cparent->focus == NULL)
@@ -1041,8 +1039,9 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
        vat.va_type = VDIR;
        vat.va_mode = 0755;
        ncluster = NULL;
-       nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred,
-                                  pfs->name, name_len, &ncluster, &error);
+       nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
+                                  proc0.p_ucred, pfs->name, name_len,
+                                  &ncluster, &error);
 
        if (nip) {
                wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
index e65fbd9..521ab7e 100644 (file)
  * currently supports up to 8 copies, which brings the address space down
  * to 66 bits and gives us 2 bits of leeway.
  */
-#define HAMMER2_MIN_ALLOC      1024    /* minimum allocation size */
-#define HAMMER2_MIN_RADIX      10      /* minimum allocation size 2^N */
-#define HAMMER2_MAX_ALLOC      65536   /* maximum allocation size */
-#define HAMMER2_MAX_RADIX      16      /* maximum allocation size 2^N */
-#define HAMMER2_KEY_RADIX      64      /* number of bits in key */
+#define HAMMER2_ALLOC_MIN      1024    /* minimum allocation size */
+#define HAMMER2_RADIX_MIN      10      /* minimum allocation size 2^N */
+#define HAMMER2_ALLOC_MAX      65536   /* maximum allocation size */
+#define HAMMER2_RADIX_MAX      16      /* maximum allocation size 2^N */
+#define HAMMER2_RADIX_KEY      64      /* number of bits in key */
 
 /*
  * MINALLOCSIZE                - The minimum allocation size.  This can be smaller
@@ -359,12 +359,12 @@ typedef uint32_t hammer2_crc32_t;
 /*
  * Miscellaneous ranges (all are unsigned).
  */
-#define HAMMER2_MIN_TID                1ULL
-#define HAMMER2_MAX_TID                0xFFFFFFFFFFFFFFFFULL
-#define HAMMER2_MIN_KEY                0ULL
-#define HAMMER2_MAX_KEY                0xFFFFFFFFFFFFFFFFULL
-#define HAMMER2_MIN_OFFSET     0ULL
-#define HAMMER2_MAX_OFFSET     0xFFFFFFFFFFFFFFFFULL
+#define HAMMER2_TID_MIN                1ULL
+#define HAMMER2_TID_MAX                0xFFFFFFFFFFFFFFFFULL
+#define HAMMER2_KEY_MIN                0ULL
+#define HAMMER2_KEY_MAX                0xFFFFFFFFFFFFFFFFULL
+#define HAMMER2_OFFSET_MIN     0ULL
+#define HAMMER2_OFFSET_MAX     0xFFFFFFFFFFFFFFFFULL
 
 /*
  * HAMMER2 data offset special cases and masking.
@@ -377,7 +377,7 @@ typedef uint32_t hammer2_crc32_t;
  * to as a power of 2.  The theoretical minimum radix is thus 6 (The space
  * needed in the low bits of the data offset field).  However, the practical
  * minimum allocation chunk size is 1KB (a radix of 10), so HAMMER2 sets
- * HAMMER2_MIN_RADIX to 10.  The maximum radix is currently 16 (64KB), but
+ * HAMMER2_RADIX_MIN to 10.  The maximum radix is currently 16 (64KB), but
  * we fully intend to support larger extents in the future.
  */
 #define HAMMER2_OFF_BAD                ((hammer2_off_t)-1)
@@ -483,13 +483,6 @@ struct hammer2_blockref {          /* MUST BE EXACTLY 64 BYTES */
 
 typedef struct hammer2_blockref hammer2_blockref_t;
 
-#if 0
-#define HAMMER2_BREF_SYNC1             0x01    /* modification synchronized */
-#define HAMMER2_BREF_SYNC2             0x02    /* modification committed */
-#define HAMMER2_BREF_DESYNCCHLD                0x04    /* desynchronize children */
-#define HAMMER2_BREF_DELETED           0x80    /* indicates a deletion */
-#endif
-
 #define HAMMER2_BLOCKREF_BYTES         64      /* blockref struct in bytes */
 
 /*
@@ -505,6 +498,8 @@ typedef struct hammer2_blockref hammer2_blockref_t;
 #define HAMMER2_BREF_TYPE_FREEMAP      254     /* pseudo-type */
 #define HAMMER2_BREF_TYPE_VOLUME       255     /* pseudo-type */
 
+#define HAMMER2_BREF_FLAG_PFSROOT      0x01    /* see also related opflag */
+
 #define HAMMER2_ENC_CHECK(n)           ((n) << 4)
 #define HAMMER2_DEC_CHECK(n)           (((n) >> 4) & 15)
 
@@ -569,8 +564,8 @@ typedef struct hammer2_blockset hammer2_blockset_t;
 #if (1 << HAMMER2_PBUFRADIX) != HAMMER2_PBUFSIZE
 #error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent"
 #endif
-#if (1 << HAMMER2_MIN_RADIX) != HAMMER2_MIN_ALLOC
-#error "HAMMER2_MIN_RADIX and HAMMER2_MIN_ALLOC are inconsistent"
+#if (1 << HAMMER2_RADIX_MIN) != HAMMER2_ALLOC_MIN
+#error "HAMMER2_RADIX_MIN and HAMMER2_ALLOC_MIN are inconsistent"
 #endif
 
 /*
@@ -765,8 +760,9 @@ struct hammer2_inode_data {
 typedef struct hammer2_inode_data hammer2_inode_data_t;
 
 #define HAMMER2_OPFLAG_DIRECTDATA      0x01
-#define HAMMER2_OPFLAG_PFSROOT         0x02
+#define HAMMER2_OPFLAG_PFSROOT         0x02    /* (see also bref flag) */
 #define HAMMER2_OPFLAG_COPYIDS         0x04    /* copyids override parent */
+#define HAMMER2_OPFLAG_SUPROOT         0x08
 
 #define HAMMER2_OBJTYPE_UNKNOWN                0
 #define HAMMER2_OBJTYPE_DIRECTORY      1
@@ -919,9 +915,17 @@ struct hammer2_volume_data {
        hammer2_off_t   allocator_size;         /* 0060 Total data space */
        hammer2_off_t   allocator_free;         /* 0068 Free space */
        hammer2_off_t   allocator_beg;          /* 0070 Initial allocations */
+
+       /*
+        * mirror_tid reflects the highest committed super-root change
+        * freemap_tid reflects the highest committed freemap change
+        *
+        * NOTE: mirror_tid does not track (and should not track) changes
+        *       made to or under PFS roots.
+        */
        hammer2_tid_t   mirror_tid;             /* 0078 committed tid (vol) */
-       hammer2_tid_t   alloc_tid;              /* 0080 Alloctable modify tid */
-       hammer2_tid_t   inode_tid;              /* 0088 Inode allocator tid */
+       hammer2_tid_t   reserved0080;           /* 0080 */
+       hammer2_tid_t   reserved0088;           /* 0088 */
        hammer2_tid_t   freemap_tid;            /* 0090 committed tid (fmap) */
        hammer2_tid_t   bulkfree_tid;           /* 0098 bulkfree incremental */
        hammer2_tid_t   reserved00A0[5];        /* 00A0-00C7 */
index 7ef9da1..35201c8 100644 (file)
  * SUCH DAMAGE.
  */
 
+/*
+ *                     TRANSACTION AND FLUSH HANDLING
+ *
+ * Deceptively simple but actually fairly difficult to implement properly is
+ * how I would describe it.
+ *
+ * The biggest problem is that each PFS may belong to a cluster so its
+ * media modify_tid and mirror_tid fields are in a completely different
+ * domain than the topology related to the super-root.  Most of the code
+ * operates using modify_xid and delete_xid which are local identifiers.
+ *
+ * The second biggest problem is that we really want to allow flushes to run
+ * concurrently with new front-end operations, which means that the in-memory
+ * topology of hammer2_chain structures can represent both current state and
+ * snapshot-for-flush state.
+ */
+
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -59,7 +76,7 @@ struct hammer2_flush_info {
        int             cache_index;
        int             domodify;
        struct h2_flush_deferral_list flush_list;
-       hammer2_tid_t   sync_tid;       /* flush synchronization point */
+       hammer2_xid_t   sync_xid;       /* memory synchronization point */
 };
 
 typedef struct hammer2_flush_info hammer2_flush_info_t;
@@ -88,7 +105,7 @@ static __inline
 int
 h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
 {
-       return (chain->delete_tid <= info->sync_tid);
+       return (chain->delete_xid <= info->sync_xid);
 }
 
 #if 0
@@ -111,6 +128,39 @@ hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
 }
 #endif
 
+/*
+ * For now use a global transaction manager.  What we ultimately want to do
+ * is give each non-overlapping hmp/pmp group its own transaction manager.
+ *
+ * Transactions govern XID tracking on the physical media (the hmp), but they
+ * also govern TID tracking which is per-PFS and thus might cross multiple
+ * hmp's.  So we can't just stuff tmanage into hammer2_mount or
+ * hammer2_pfsmount.
+ */
+static hammer2_trans_manage_t  tmanage;
+
+void
+hammer2_trans_manage_init(void)
+{
+       lockinit(&tmanage.translk, "h2trans", 0, 0);    /* global trans lock */
+       TAILQ_INIT(&tmanage.transq);                    /* no transactions queued */
+       tmanage.flush_xid = 1;                          /* initial flush XID */
+       tmanage.alloc_xid = tmanage.flush_xid + 1;      /* next XID for newxid() */
+}
+
+hammer2_xid_t
+hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
+{
+       hammer2_xid_t xid;
+
+       for (;;) {      /* retry until a non-zero XID is obtained */
+               xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1); /* old value */
+               if (xid)        /* 0 is never returned to the caller */
+                       break;
+       }
+       return xid;
+}
+
 /*
  * Transaction support functions for writing to the filesystem.
  *
@@ -130,34 +180,21 @@ hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
  * Buffer-cache transactions operate as fs_ops but never block.  A
  * buffer-cache flush will run either before or after the current pending
  * flush depending on its state.
- *
- * NOTE: The sync_tid for a flush's freemap allocation will match the
- *      sync_tid of the following <concurrent_fs_ops> transaction(s).
- *      The freemap topology will be out-of-step by one transaction id
- *      in order to give the flusher a stable freemap topology to flush
- *      out.  This is fixed up at mount-time using a quick incremental
- *      scan.
  */
 void
-hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
-                  hammer2_mount_t *hmp, int flags)
+hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
 {
+       hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
 
-       bzero(trans, sizeof(*trans));
-       if (pmp) {
-               trans->pmp = pmp;
-               KKASSERT(hmp == NULL);
-               hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
-       } else {
-               trans->hmp_single = hmp;
-               KKASSERT(hmp);
-       }
+       tman = &tmanage;
 
-       hammer2_voldata_lock(hmp);
+       bzero(trans, sizeof(*trans));
+       trans->pmp = pmp;
        trans->flags = flags;
        trans->td = curthread;
-       /*trans->delete_gen = 0;*/      /* multiple deletions within trans */
+
+       lockmgr(&tman->translk, LK_EXCLUSIVE);
 
        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
@@ -171,26 +208,28 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                 * transaction id but a flush transaction needs its own
                 * unique TID for proper block table update accounting.
                 */
-               ++hmp->flushcnt;
-               ++hmp->voldata.alloc_tid;
-               trans->sync_tid = hmp->voldata.alloc_tid;
-               trans->orig_tid = trans->sync_tid;
-               ++hmp->voldata.alloc_tid;
-               TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
-               if (TAILQ_FIRST(&hmp->transq) != trans) {
+               ++tman->flushcnt;
+               ++pmp->alloc_tid;
+               pmp->flush_tid = pmp->alloc_tid;
+               tman->flush_xid = hammer2_trans_newxid(pmp);
+               trans->sync_xid = tman->flush_xid;
+               ++pmp->alloc_tid;
+               TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
+               if (TAILQ_FIRST(&tman->transq) != trans) {
                        trans->blocked = 1;
                        while (trans->blocked) {
-                               lksleep(&trans->sync_tid, &hmp->voldatalk,
+                               lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
-       } else if (hmp->flushcnt == 0) {
+       } else if (tman->flushcnt == 0) {
                /*
-                * No flushes are pending, we can go.
+                * No flushes are pending, we can go.  Use prior flush_xid + 1.
+                *
+                * WARNING!  Also see hammer2_chain_setsubmod()
                 */
-               TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
-               trans->sync_tid = hmp->voldata.alloc_tid;
-               trans->orig_tid = trans->sync_tid;
+               TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
+               trans->sync_xid = tman->flush_xid + 1;
 
                /* XXX improve/optimize inode allocation */
        } else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
@@ -202,32 +241,32 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                 * The buffer cache flush takes on the main flush's
                 * transaction id.
                 */
-               TAILQ_FOREACH(head, &hmp->transq, entry) {
+               TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
                trans->flags |= HAMMER2_TRANS_PREFLUSH;
-               TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
-               trans->sync_tid = head->orig_tid;
-               trans->orig_tid = trans->sync_tid;
+               TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
+               trans->sync_xid = head->sync_xid;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
                /* not allowed to block */
        } else {
                /*
                 * A normal transaction is requested while a flush is in
                 * progress.  We insert after the current flush and may
-                * block.  Assign sync_tid = flush's tid + 1.
+                * block.
+                *
+                * WARNING!  Also see hammer2_chain_setsubmod()
                 */
-               TAILQ_FOREACH(head, &hmp->transq, entry) {
+               TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
-               TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
-               trans->sync_tid = head->orig_tid + 1;
-               trans->orig_tid = trans->sync_tid;
+               TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
+               trans->sync_xid = head->sync_xid + 1;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
 
                /*
@@ -240,41 +279,51 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                 * cache ops).
                 */
                if (hammer2_synchronous_flush > 0 ||
-                   TAILQ_FIRST(&hmp->transq) != head) {
+                   TAILQ_FIRST(&tman->transq) != head) {
                        trans->blocked = 1;
                        while (trans->blocked) {
-                               lksleep(&trans->sync_tid,
-                                       &hmp->voldatalk, 0,
+                               lksleep(&trans->sync_xid,
+                                       &tman->translk, 0,
                                        "h2multf", hz);
                        }
                }
        }
        if (flags & HAMMER2_TRANS_NEWINODE) {
-               if (hmp->voldata.inode_tid < HAMMER2_INODE_START)
-                       hmp->voldata.inode_tid = HAMMER2_INODE_START;
-               trans->inode_tid = hmp->voldata.inode_tid++;
+               if (pmp->spmp_hmp) {
+                       /*
+                        * Super-root transaction, all new inodes have an
+                        * inode number of 1.  Normal pfs inode cache
+                        * semantics are not used.
+                        */
+                       trans->inode_tid = 1;
+               } else {
+                       /*
+                        * Normal transaction
+                        */
+                       if (pmp->inode_tid < HAMMER2_INODE_START)
+                               pmp->inode_tid = HAMMER2_INODE_START;
+                       trans->inode_tid = pmp->inode_tid++;
+               }
        }
-       hammer2_voldata_unlock(hmp, 0);
+
+       lockmgr(&tman->translk, LK_RELEASE);
 }
 
 void
 hammer2_trans_done(hammer2_trans_t *trans)
 {
-       hammer2_mount_t *hmp;
+       hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
        hammer2_trans_t *scan;
 
-       if (trans->pmp)
-               hmp = trans->pmp->iroot->cluster.focus->hmp;
-       else
-               hmp = trans->hmp_single;
+       tman = &tmanage;
 
        /*
         * Remove.
         */
-       hammer2_voldata_lock(hmp);
-       TAILQ_REMOVE(&hmp->transq, trans, entry);
-       head = TAILQ_FIRST(&hmp->transq);
+       lockmgr(&tman->translk, LK_EXCLUSIVE);
+       TAILQ_REMOVE(&tman->transq, trans, entry);
+       head = TAILQ_FIRST(&tman->transq);
 
        /*
         * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
@@ -282,7 +331,7 @@ hammer2_trans_done(hammer2_trans_t *trans)
         * stop there, unlike the unblock code following this section).
         */
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
-               --hmp->flushcnt;
+               --tman->flushcnt;
                scan = head;
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        atomic_clear_int(&scan->flags,
@@ -302,7 +351,7 @@ hammer2_trans_done(hammer2_trans_t *trans)
         */
        if (head && head->blocked) {
                head->blocked = 0;
-               wakeup(&head->sync_tid);
+               wakeup(&head->sync_xid);
 
                if (hammer2_synchronous_flush > 0)
                        scan = head;
@@ -311,24 +360,24 @@ hammer2_trans_done(hammer2_trans_t *trans)
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
-                               wakeup(&scan->sync_tid);
+                               wakeup(&scan->sync_xid);
                        }
                        scan = TAILQ_NEXT(scan, entry);
                }
        }
-       hammer2_voldata_unlock(hmp, 0);
+       lockmgr(&tman->translk, LK_RELEASE);
 }
 
 /*
  * Flush the chain and all modified sub-chains through the specified
- * synchronization point (sync_tid), propagating parent chain modifications
- * and mirror_tid updates back up as needed.  Since we are recursing downward
+ * synchronization point, propagating parent chain modifications and
+ * mirror_tid updates back up as needed.  Since we are recursing downward
  * we do not have to deal with the complexities of multi-homed chains (chains
  * with multiple parents).
  *
  * Caller must have interlocked against any non-flush-related modifying
- * operations in progress whos modify_tid values are less than or equal
- * to the passed sync_tid.
+ * operations in progress whose modify_xid values are less than or equal
+ * to the passed sync_xid.
  *
  * Caller must have already vetted synchronization points to ensure they
  * are properly flushed.  Only snapshots and cluster flushes can create
@@ -362,7 +411,7 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.trans = trans;
-       info.sync_tid = trans->sync_tid;
+       info.sync_xid = trans->sync_xid;
        info.cache_index = -1;
 
        core = chain->core;
@@ -459,16 +508,11 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
  *     a chain can be unloaded from memory with the expectation that it can
  *     be reloaded later via the block table at any time.
  *
- * NOTE: chain->bref.modify_tid is different from chain->modify_tid.  COW
- *      propagations for block updates do not update chain->bref.modify_tid,
- *      only chain->bref.mirror_tid.  The MODIFIED bit is set on any
- *      modified chain, including COW propagations, but the flusher normally
- *      just keys off of the FLUSH_* bits.  FLUSH_CREATE will also be set
- *      in this situation.
+ *                     WARNING ON BREF MODIFY_TID/MIRROR_TID
  *
- * NOTE: We are responsible for updating chain->bref.mirror_tid and
- *      chain->update_lo.  The caller is responsible for processing us into
- *      our parent (if any).
+ * blockref.modify_tid and blockref.mirror_tid are consistent only within a
+ * PFS.  This is why we cannot cache sync_tid in the transaction structure.
+ * Instead we access it from the pmp.
  */
 static void
 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
@@ -493,9 +537,9 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         * flushed.
         */
        if (/*(chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&*/
-           (chain->update_lo >= info->sync_tid ||      /* already synced */
-            chain->update_lo >= chain->update_hi)) {   /* old/unchanged */
-               /* update_lo/_hi already filters chain out, do not update */
+           (chain->update_xlo >= info->sync_xid ||     /* already synced */
+            chain->update_xlo >= chain->update_xhi)) { /* old/unchanged */
+               /* update_xlo/_xhi already filters chain out, do not update */
                /* don't update bref.mirror_tid, pass2 is not called */
                return;
        }
@@ -503,12 +547,12 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
        /*
         * mirror_tid should not be forward-indexed
         */
-       KKASSERT(chain->bref.mirror_tid <= info->sync_tid);
+       KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
 
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.  Subchains with lower
-        * modify_tid's will still be accessible via other parents.
+        * modify_xid's will still be accessible via other parents.
         *
         * Do not update bref.mirror_tid here, it will interfere with
         * synchronization.  e.g. inode flush tid 1, concurrent D-D tid 2,
@@ -516,15 +560,15 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         * to 1 during inode flush tid 1 the blockrefs would only be partially
         * updated (and likely panic).
         *
-        * We must update chain->update_lo here to prevent re-entry in this
+        * We must update chain->update_xlo here to prevent re-entry in this
         * flush transaction.
         *
         * (vchain and fchain are exceptions since they cannot be duplicated)
         */
-       if (chain->modify_tid > info->sync_tid &&
+       if (chain->modify_xid > info->sync_xid &&
            chain != &hmp->fchain && chain != &hmp->vchain) {
                /* do not update bref.mirror_tid, pass2 ignores chain */
-               /* chain->update_lo = info->sync_tid; */
+               /* chain->update_xlo = info->sync_xid; */
                return;
        }
 
@@ -533,12 +577,12 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         * (3) Flush live children
         * (4) Flush deleted children
         *
-        * We adjust update_lo if not deferring chain to prevent re-entry
+        * We adjust update_xlo if not deferring chain to prevent re-entry
         * in this flush cycle, but it must be set AFTER the flush in case
         * a deeper flush hits the chain.  Otherwise the deeper flush cannot
         * complete.  We re-check the condition after finishing the flushes.
         *
-        * update_hi was already checked and prevents initial recursions on
+        * update_xhi was already checked and prevents initial recursions on
         * subtrees which have not been modified.
         */
        saved_parent = info->parent;
@@ -591,24 +635,24 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
        }
 
        /*
-        * Stop if deferred, do not update update_lo.
+        * Stop if deferred, do not update update_xlo.
         */
-       if (info->diddeferral)
+       if (info->diddeferral) {
                goto done;
+       }
 
        /*
         * If a block table update is needed place the parent in a modified
         * state, which might delete-duplicate it.
         *
-        * - To prevent loops and other confusion, we synchronize update_lo
+        * - To prevent loops and other confusion, we synchronize update_xlo
         *   for the original chain.
         *
         * - The original parent will not be used by the flush so we can
         *   clear its MODIFIED bit.
         */
        if (info->domodify) {
-               hammer2_chain_modify(info->trans, &info->parent,
-                                    HAMMER2_MODIFY_NO_MODIFY_TID);
+               hammer2_chain_modify(info->trans, &info->parent, 0);
                if (info->parent != chain) {
                        /*
                         * chain        - old
@@ -639,13 +683,9 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                                hammer2_chain_drop(info->parent);
                        }
 #endif
-#if 0
-                       if (chain->bref.mirror_tid < info->sync_tid)
-                               chain->bref.mirror_tid = info->sync_tid;
-#endif
-                       if (chain->update_lo < info->sync_tid)
-                               chain->update_lo = info->sync_tid;
-                       KKASSERT(info->parent->update_lo < info->sync_tid);
+                       if (chain->update_xlo < info->sync_xid)
+                               chain->update_xlo = info->sync_xid;
+                       KKASSERT(info->parent->update_xlo < info->sync_xid);
                        hammer2_chain_drop(chain);
                        hammer2_chain_ref(info->parent);
                }
@@ -665,21 +705,21 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                spin_lock(&core->cst.spin);
                TAILQ_FOREACH(scan, &core->ownerq, core_entry) {
                        /*
-                        * Ignore chains which have already been updated
-                        * Ignore unmodified chains
+                        * Ignore the current parent being processed (we do
+                        * not adjust update_xlo until after the fixup).
                         */
-                       if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-                           (scan->update_lo >= info->sync_tid ||
-                            scan->update_lo >= scan->update_hi)) {
+                       if (scan == chain)
                                continue;
-                       }
 
                        /*
-                        * Ignore the current parent being processed (we do
-                        * not adjust update_lo until after the fixup).
+                        * Ignore chains which have already been updated
+                        * Ignore unmodified chains (lo >= hi).
                         */
-                       if (scan == chain)
+                       if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
+                           (scan->update_xlo >= info->sync_xid ||
+                            scan->update_xlo >= scan->update_xhi)) {
                                continue;
+                       }
 
                        /*
                         * Cannot exhaust all parents if one is not visible
@@ -688,15 +728,15 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                         */
                        if (scan != &scan->hmp->fchain &&
                            scan != &scan->hmp->vchain &&
-                           scan->modify_tid > info->sync_tid) {
+                           scan->modify_xid > info->sync_xid) {
                                break;
                        }
 
                        /*
-                        * Fail if update_lo has not been synchronized to
-                        * at least our sync_tid on any modified parent chain.
+                        * Fail if update_xlo has not been synchronized to
+                        * at least our sync_xid on any modified parent chain.
                         */
-                       if (scan->update_lo < info->sync_tid)
+                       if (scan->update_xlo < info->sync_xid)
                                break;
                }
                spin_unlock(&core->cst.spin);
@@ -790,10 +830,10 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
        }
 
        /*
-        * Synchronize update_lo to prevent reentrant block updates of this
+        * Synchronize update_xlo to prevent reentrant block updates of this
         * parent.
         */
-       chain->update_lo = info->sync_tid;
+       chain->update_xlo = info->sync_xid;
 
        /*
         * Skip the flush if the chain was not placed in a modified state
@@ -808,16 +848,16 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         * Chain is now deterministically being flushed and not being deferred.
         * We've finished running the recursion and the blockref update.
         *
-        * update bref.mirror_tid.  update_lo has already been updated.
+        * update bref.mirror_tid.  update_xlo has already been updated.
         */
-       if (chain->bref.mirror_tid < info->sync_tid)
-               chain->bref.mirror_tid = info->sync_tid;
+       chain->bref.mirror_tid = chain->pmp->flush_tid;
 
        /*
         * Dispose of the modified bit.  FLUSH_CREATE should already be
         * set.
         */
-       KKASSERT(chain->flags & HAMMER2_CHAIN_FLUSH_CREATE);
+       KKASSERT((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
+                chain == &hmp->vchain);
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        hammer2_pfs_memory_wakeup(chain->pmp);
 
@@ -861,10 +901,10 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         * to adjust.
         */
        if (hammer2_debug & 0x1000) {
-               kprintf("Flush %p.%d %016jx/%d sync_tid=%016jx data=%016jx\n",
+               kprintf("Flush %p.%d %016jx/%d sync_xid=%08x data=%016jx\n",
                        chain, chain->bref.type,
                        chain->bref.key, chain->bref.keybits,
-                       info->sync_tid, chain->bref.data_off);
+                       info->sync_xid, chain->bref.data_off);
        }
        if (hammer2_debug & 0x2000) {
                Debugger("Flush hell");
@@ -880,7 +920,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
-               hammer2_modify_volume(hmp);
+               KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
                hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -889,6 +929,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                 * before it flushes vchain.  We must still hold fchain
                 * locked while copying voldata to volsync, however.
                 */
+               hammer2_voldata_lock(hmp);
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
 #if 0
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
@@ -906,7 +947,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                 * must be synchronized to the volume header.
                 */
                hmp->voldata.mirror_tid = chain->bref.mirror_tid;
-               /*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/
+               hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
 
                /*
                 * The volume header is flushed manually by the syncer, not
@@ -933,6 +974,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                hammer2_chain_unlock(&hmp->fchain);
+               hammer2_voldata_unlock(hmp);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
@@ -966,16 +1008,20 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
+               KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
+               break;
        case HAMMER2_BREF_TYPE_INODE:
-               /*
-                * Device-backed.  Buffer will be flushed by the sync
-                * code XXX.
-                */
+               if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
+                       /* might not be mounted as a PFS */
+               } else {
+                       /* can't be mounted as a PFS */
+                       KKASSERT((chain->flags & HAMMER2_CHAIN_PFSROOT) == 0);
+               }
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        default:
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
-               panic("hammer2_chain_flush_core: unsupported embedded bref %d",
+               panic("hammer2_flush_core: unsupported embedded bref %d",
                      chain->bref.type);
                /* NOT REACHED */
        }
@@ -989,7 +1035,7 @@ done:
        info->parent = saved_parent;
        *chainp = chain;
 
-       KKASSERT(chain->bref.mirror_tid <= info->sync_tid);
+       KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
 }
 
 /*
@@ -1001,13 +1047,16 @@ done:
  *
  * Ripouts can move child from rbtree to dbtree or dbq but the caller's
  * flush scan order prevents any chains from being lost.  A child can be
- * executes more than once (update_lo is used to prevent infinite recursions).
+ * executed more than once (update_xlo is used to prevent infinite recursions).
  *
  * WARNING! If we do not call hammer2_flush_core() we must update
  *         bref.mirror_tid ourselves to indicate that the flush has
  *         processed the child.
  *
  * WARNING! parent->core spinlock is held on entry and return.
+ *
+ * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
+ *         not cross a pfs-root boundary.
  */
 static int
 hammer2_flush_pass1(hammer2_chain_t *child, void *data)
@@ -1027,9 +1076,9 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
         */
-       if (child->modify_tid > trans->sync_tid) {
-               KKASSERT(child->delete_tid >= child->modify_tid);
-               /*child->update_lo = info->sync_tid;*/
+       if (child->modify_xid > trans->sync_xid) {
+               KKASSERT(child->delete_xid >= child->modify_xid);
+               /*child->update_xlo = info->sync_xid;*/
                /* do not update mirror_tid, pass2 will ignore chain */
                return (0);
        }
@@ -1047,16 +1096,21 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
 
        /*
+        * Never recurse across a mounted PFS boundary.
+        *
         * Recurse and collect deferral data.  We only recursively sync
-        * (basically) if update_lo has not been updated, indicating that
+        * (basically) if update_xlo has not been updated, indicating that
         * the child has not already been processed.
         */
-       if ((child->flags & HAMMER2_CHAIN_MODIFIED) ||
-           (child->update_lo < info->sync_tid &&
-            child->update_lo < child->update_hi)) {
-               ++info->depth;
-               hammer2_flush_core(info, &child, 0); /* XXX deleting */
-               --info->depth;
+       if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
+           child->pmp == NULL) {
+               if ((child->flags & HAMMER2_CHAIN_MODIFIED) ||
+                   (child->update_xlo < info->sync_xid &&
+                    child->update_xlo < child->update_xhi)) {
+                       ++info->depth;
+                       hammer2_flush_core(info, &child, 0); /* XXX deleting */
+                       --info->depth;
+               }
        }
 
        /*
@@ -1070,9 +1124,9 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
         *      - parent not deleted as-of this transaction
         */
        if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) &&
-           child->delete_tid > trans->sync_tid &&
-           child->modify_tid > parent->update_lo &&
-           parent->delete_tid > trans->sync_tid) {
+           child->delete_xid > trans->sync_xid &&
+           child->modify_xid > parent->update_xlo &&
+           parent->delete_xid > trans->sync_xid) {
                info->domodify = 1;
        }
 
@@ -1085,10 +1139,10 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
         *      - parent not deleted as-of this transaction
         */
        if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
-           child->delete_tid <= trans->sync_tid &&
-           child->modify_tid <= parent->update_lo &&
-           child->delete_tid > parent->update_lo &&
-           parent->delete_tid > trans->sync_tid) {
+           child->delete_xid <= trans->sync_xid &&
+           child->modify_xid <= parent->update_xlo &&
+           child->delete_xid > parent->update_xlo &&
+           parent->delete_xid > trans->sync_xid) {
                info->domodify = 1;
        }
 
@@ -1128,8 +1182,8 @@ hammer2_flush_pass2(hammer2_chain_t *child, void *data)
         * Prefilter - Ignore children created after our flush_tid (not
         *             visible to our flush).
         */
-       if (child->modify_tid > trans->sync_tid) {
-               KKASSERT(child->delete_tid >= child->modify_tid);
+       if (child->modify_xid > trans->sync_xid) {
+               KKASSERT(child->delete_xid >= child->modify_xid);
                return 0;
        }
 
@@ -1184,8 +1238,7 @@ hammer2_flush_pass2(hammer2_chain_t *child, void *data)
        default:
                base = NULL;
                count = 0;
-               panic("hammer2_chain_flush_pass2: "
-                     "unrecognized blockref type: %d",
+               panic("hammer2_flush_pass2: unrecognized blockref type: %d",
                      parent->bref.type);
        }
 
@@ -1196,9 +1249,9 @@ hammer2_flush_pass2(hammer2_chain_t *child, void *data)
         *      - child created prior or during parent synchronization point
         *      - parent not yet synchronized to child's deletion
         */
-       if (child->delete_tid <= trans->sync_tid &&
-           child->modify_tid <= parent->update_lo &&
-           child->delete_tid > parent->update_lo) {
+       if (child->delete_xid <= trans->sync_xid &&
+           child->modify_xid <= parent->update_xlo &&
+           child->delete_xid > parent->update_xlo) {
                /* can't assert BMAPPED because state adjustment may occur
                 * before we are done, and BMAPPED only applies to the live
                 * parent.
@@ -1237,8 +1290,8 @@ hammer2_flush_pass3(hammer2_chain_t *child, void *data)
         * Prefilter - Ignore children created after our flush_tid (not
         *             visible to our flush).
         */
-       if (child->modify_tid > trans->sync_tid) {
-               KKASSERT(child->delete_tid >= child->modify_tid);
+       if (child->modify_xid > trans->sync_xid) {
+               KKASSERT(child->delete_xid >= child->modify_xid);
                return 0;
        }
 
@@ -1293,7 +1346,7 @@ hammer2_flush_pass3(hammer2_chain_t *child, void *data)
        default:
                base = NULL;
                count = 0;
-               panic("hammer2_chain_flush_pass2: "
+               panic("hammer2_flush_pass3: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }
@@ -1304,8 +1357,8 @@ hammer2_flush_pass3(hammer2_chain_t *child, void *data)
         *      - child not deleted or deletion is beyond transaction id
         *      - child created beyond parent synchronization point
         */
-       if (child->delete_tid > trans->sync_tid &&
-           child->modify_tid > parent->update_lo) {
+       if (child->delete_xid > trans->sync_xid &&
+           child->modify_xid > parent->update_xlo) {
                if (base) {
                        hammer2_rollup_stats(parent, child, 1);
                        hammer2_base_insert(trans, parent, base, count,
@@ -1338,8 +1391,8 @@ hammer2_flush_pass4(hammer2_chain_t *child, void *data)
         * Prefilter - Ignore children created after our flush_tid (not
         *             visible to our flush).
         */
-       if (child->modify_tid > trans->sync_tid) {
-               KKASSERT(child->delete_tid >= child->modify_tid);
+       if (child->modify_xid > trans->sync_xid) {
+               KKASSERT(child->delete_xid >= child->modify_xid);
                return 0;
        }
 
@@ -1364,9 +1417,9 @@ hammer2_flush_pass4(hammer2_chain_t *child, void *data)
                 * Deleting from blockmap, move child out of dbtree
                 * and clear BMAPPED.  Child should not be on RBTREE.
                 */
-               if (child->delete_tid <= trans->sync_tid &&
-                   child->modify_tid <= parent->update_lo &&
-                   child->delete_tid > parent->update_lo &&
+               if (child->delete_xid <= trans->sync_xid &&
+                   child->modify_xid <= parent->update_xlo &&
+                   child->delete_xid > parent->update_xlo &&
                    (child->flags & HAMMER2_CHAIN_BMAPPED)) {
                        KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE);
                        RB_REMOVE(hammer2_chain_tree, &above->dbtree, child);
@@ -1416,8 +1469,8 @@ hammer2_flush_pass5(hammer2_chain_t *child, void *data)
         * Prefilter - Ignore children created after our flush_tid (not
         *             visible to our flush).
         */
-       if (child->modify_tid > trans->sync_tid) {
-               KKASSERT(child->delete_tid >= child->modify_tid);
+       if (child->modify_xid > trans->sync_xid) {
+               KKASSERT(child->delete_xid >= child->modify_xid);
                return 0;
        }
 
@@ -1441,8 +1494,8 @@ hammer2_flush_pass5(hammer2_chain_t *child, void *data)
                /*
                 * Inserting into blockmap, place child in rbtree or dbtree.
                 */
-               if (child->delete_tid > trans->sync_tid &&
-                   child->modify_tid > parent->update_lo &&
+               if (child->delete_xid > trans->sync_xid &&
+                   child->modify_xid > parent->update_xlo &&
                    (child->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
                        if (child->flags & HAMMER2_CHAIN_ONDBQ) {
                                TAILQ_REMOVE(&above->dbq, child, db_entry);
@@ -1501,7 +1554,7 @@ hammer2_flush_pass5(hammer2_chain_t *child, void *data)
                        hammer2_chain_drop(child);
                }
                if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
-                   child->delete_tid <= trans->sync_tid) {
+                   child->delete_xid <= trans->sync_xid) {
                        KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) ||
                                 (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0);
                        /* XXX delete-duplicate chain insertion mech wrong */
index b7f29a3..2ca555a 100644 (file)
@@ -122,46 +122,9 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
        }
 
        /*
-        * Calculate new index (our 'allocation').  We have to be careful
-        * here as there can be two different transaction ids running
-        * concurrently when a flush is in-progress.
-        *
-        * We also want to make sure, for algorithmic repeatability, that
-        * the index sequences are monotonic with transaction ids.  Some
-        * skipping is allowed as long as we ensure that all four volume
-        * header backups have consistent freemaps.
-        *
-        * FLUSH  NORMAL FLUSH  NORMAL FLUSH  NORMAL FLUSH  NORMAL
-        * N+=1   N+=2
-        * (0->1) (1->3) (3->4) (4->6) (6->7) (7->9) (9->10) (10->12)
-        *
-        * [-concurrent-][-concurrent-][-concurrent-][-concurrent-]
-        *
-        * (alternative first NORMAL might be 0->2 if flush had not yet
-        *  modified the chain, this is the worst case).
+        * Calculate new index (our 'allocation').
         */
-       if ((trans->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
-               /*
-                * Normal transactions always run with the highest TID.
-                * But if a flush is in-progress we want to reserve a slot
-                * for the flush with a lower TID running concurrently to
-                * do a delete-duplicate.
-                */
-               index = (index + 2) % HAMMER2_ZONE_FREEMAP_COPIES;
-       } else if (trans->flags & HAMMER2_TRANS_ISALLOCATING) {
-               /*
-                * Flush transaction, hammer2_freemap.c itself is doing a
-                * delete-duplicate during an allocation within the freemap.
-                */
-               index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
-       } else {
-               /*
-                * Flush transaction, hammer2_flush.c is doing a
-                * delete-duplicate on the freemap while flushing
-                * hmp->fchain.
-                */
-               index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
-       }
+       index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
 
        /*
         * Calculate the block offset of the reserved block.  This will
@@ -222,12 +185,6 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
  *
  * ip and bpref are only used as a heuristic to determine locality of
  * reference.  bref->key may also be used heuristically.
- *
- * WARNING! When called from a flush we have to use the 'live' sync_tid
- *         and not the flush sync_tid.  The live sync_tid is the flush
- *         sync_tid + 1.  That is, freemap allocations which occur during
- *         a flush are not part of the flush.  Crash-recovery will restore
- *         any lost allocations.
  */
 int
 hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain,
@@ -250,39 +207,20 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain,
        radix = hammer2_getradix(bytes);
        KKASSERT((size_t)1 << radix == bytes);
 
-       /*
-        * Freemap blocks themselves are simply assigned from the reserve
-        * area, not allocated from the freemap.
-        */
        if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
            bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
-               return (hammer2_freemap_reserve(trans, chain, radix));
+               /*
+                * Freemap blocks themselves are assigned from the reserve
+                * area, not allocated from the freemap.
+                */
+               error = hammer2_freemap_reserve(trans, chain, radix);
+               return error;
        }
 
-#if 0
-       /*
-        * (this mechanic is no longer used, DOMAYFREE is used only by
-        * the bulk freemap scan now).
-        *
-        * Mark previously allocated block as possibly freeable.  There might
-        * be snapshots and other races so we can't just mark it fully free.
-        * (XXX optimize this for the current-transaction create+delete case)
-        */
-       if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) {
-               hammer2_freemap_adjust(trans, hmp, bref,
-                                      HAMMER2_FREEMAP_DOMAYFREE);
-       }
-#endif
+       KKASSERT(bytes >= HAMMER2_ALLOC_MIN && bytes <= HAMMER2_ALLOC_MAX);
 
-       /*
-        * Setting ISALLOCATING ensures correct operation even when the
-        * flusher itself is making allocations.
-        */
-       KKASSERT(bytes >= HAMMER2_MIN_ALLOC && bytes <= HAMMER2_MAX_ALLOC);
-       KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0);
-       atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
        if (trans->flags & (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_PREFLUSH))
-               ++trans->sync_tid;
+               ++trans->sync_xid;
 
        /*
         * Calculate the starting point for our allocation search.
@@ -344,9 +282,8 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain,
        hmp->heur_freemap[hindex] = iter.bnext;
        hammer2_chain_unlock(parent);
 
-       atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
        if (trans->flags & (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_PREFLUSH))
-               --trans->sync_tid;
+               --trans->sync_xid;
 
        return (error);
 }
@@ -406,7 +343,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                kprintf("freemap create L1 @ %016jx bpref %016jx\n",
                        key, iter->bpref);
 #endif
-               error = hammer2_chain_create(trans, parentp, &chain,
+               error = hammer2_chain_create(trans, parentp, &chain, hmp->spmp,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE);
@@ -588,7 +525,7 @@ hammer2_bmap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp,
            bmap->linear < HAMMER2_SEGSIZE) {
                KKASSERT(bmap->linear >= 0 &&
                         bmap->linear + size <= HAMMER2_SEGSIZE &&
-                        (bmap->linear & (HAMMER2_MIN_ALLOC - 1)) == 0);
+                        (bmap->linear & (HAMMER2_ALLOC_MIN - 1)) == 0);
                offset = bmap->linear;
                i = offset / (HAMMER2_SEGSIZE / 8);
                j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 30;
@@ -685,8 +622,9 @@ success:
        *basep += offset;
 
        hammer2_voldata_lock(hmp);
+       hammer2_voldata_modify(hmp);
        hmp->voldata.allocator_free -= size;  /* XXX */
-       hammer2_voldata_unlock(hmp, 1);
+       hammer2_voldata_unlock(hmp);
 
        return(0);
 }
@@ -789,12 +727,6 @@ hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
  * the moment we depend on the bulk freescan to actually free blocks.  It
  * will still call this routine with a non-zero how to stage possible frees
  * and to do the actual free.
- *
- * WARNING! When called from a flush we have to use the 'live' sync_tid
- *         and not the flush sync_tid.  The live sync_tid is the flush
- *         sync_tid + 1.  That is, freemap allocations which occur during
- *         a flush are not part of the flush.  Crash-recovery will restore
- *         any lost allocations.
  */
 void
 hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
@@ -826,7 +758,7 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
 
        radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
        data_off &= ~HAMMER2_OFF_MASK_RADIX;
-       KKASSERT(radix <= HAMMER2_MAX_RADIX);
+       KKASSERT(radix <= HAMMER2_RADIX_MAX);
 
        bytes = (size_t)1 << radix;
        class = (bref->type << 8) | hammer2_devblkradix(radix);
@@ -839,10 +771,6 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                return;
 
        KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
-       KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0);
-       atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
-       if (trans->flags & (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_PREFLUSH))
-               ++trans->sync_tid;
 
        /*
         * Lookup the level1 freemap chain.  The chain must exist.
@@ -876,7 +804,7 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
         * bref.check.freemap structure.
         */
        if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
-               error = hammer2_chain_create(trans, &parent, &chain,
+               error = hammer2_chain_create(trans, &parent, &chain, hmp->spmp,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE);
@@ -1041,7 +969,4 @@ again:
        hammer2_chain_unlock(chain);
 done:
        hammer2_chain_unlock(parent);
-       atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
-       if (trans->flags & (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_PREFLUSH))
-               --trans->sync_tid;
 }
index 8a5cd14..67d1705 100644 (file)
@@ -261,14 +261,15 @@ hammer2_inode_lookup(hammer2_pfsmount_t *pmp, hammer2_tid_t inum)
 {
        hammer2_inode_t *ip;
 
-       if (pmp) {
+       KKASSERT(pmp);
+       if (pmp->spmp_hmp) {
+               ip = NULL;
+       } else {
                spin_lock(&pmp->inum_spin);
                ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
                if (ip)
                        hammer2_inode_ref(ip);
                spin_unlock(&pmp->inum_spin);
-       } else {
-               ip = NULL;
        }
        return(ip);
 }
@@ -301,12 +302,10 @@ hammer2_inode_drop(hammer2_inode_t *ip)
                        /*
                         * Transition to zero, must interlock with
                         * the inode inumber lookup tree (if applicable).
-                        *
-                        * NOTE: The super-root inode has no pmp.
                         */
                        pmp = ip->pmp;
-                       if (pmp)
-                               spin_lock(&pmp->inum_spin);
+                       KKASSERT(pmp);
+                       spin_lock(&pmp->inum_spin);
 
                        if (atomic_cmpset_int(&ip->refs, 1, 0)) {
                                KKASSERT(ip->topo_cst.count == 0);
@@ -316,8 +315,7 @@ hammer2_inode_drop(hammer2_inode_t *ip)
                                        RB_REMOVE(hammer2_inode_tree,
                                                  &pmp->inum_tree, ip);
                                }
-                               if (pmp)
-                                       spin_unlock(&pmp->inum_spin);
+                               spin_unlock(&pmp->inum_spin);
 
                                pip = ip->pip;
                                ip->pip = NULL;
@@ -334,21 +332,12 @@ hammer2_inode_drop(hammer2_inode_t *ip)
                                 * dispose of our implied reference from
                                 * ip->pip.  We can simply loop on it.
                                 */
-                               if (pmp) {
-                                       KKASSERT((ip->flags &
-                                                 HAMMER2_INODE_SROOT) == 0);
-                                       kfree(ip, pmp->minode);
-                                       atomic_add_long(&pmp->inmem_inodes, -1);
-                               } else {
-                                       KKASSERT(ip->flags &
-                                                HAMMER2_INODE_SROOT);
-                                       kfree(ip, M_HAMMER2);
-                               }
+                               kfree(ip, pmp->minode);
+                               atomic_add_long(&pmp->inmem_inodes, -1);
                                ip = pip;
                                /* continue with pip (can be NULL) */
                        } else {
-                               if (pmp)
-                                       spin_unlock(&ip->pmp->inum_spin);
+                               spin_unlock(&ip->pmp->inum_spin);
                        }
                } else {
                        /*
@@ -514,9 +503,6 @@ hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
  *
  * The hammer2_inode structure regulates the interface between the high level
  * kernel VNOPS API and the filesystem backend (the chains).
- *
- * WARNING!  The mount code is allowed to pass dip == NULL for iroot and
- *          is allowed to pass pmp == NULL and dip == NULL for sroot.
  */
 hammer2_inode_t *
 hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
@@ -527,6 +513,7 @@ hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
        const hammer2_inode_data_t *nipdata;
 
        KKASSERT(hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
+       KKASSERT(pmp);
 
        /*
         * Interlocked lookup/ref of the inode.  This code is only needed
@@ -541,7 +528,13 @@ again:
                        break;
 
                ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE);
-               if ((nip->flags & HAMMER2_INODE_ONRBTREE) == 0) { /* race */
+
+               /*
+                * Handle SMP race (not applicable to the super-root spmp
+                * which can't index inodes due to duplicative inode numbers).
+                */
+               if (pmp->spmp_hmp == NULL &&
+                   (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
                        ccms_thread_unlock(&nip->topo_cst);
                        hammer2_inode_drop(nip);
                        continue;
@@ -553,15 +546,12 @@ again:
        /*
         * We couldn't find the inode number, create a new inode.
         */
-       if (pmp) {
-               nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
-               atomic_add_long(&pmp->inmem_inodes, 1);
-               hammer2_pfs_memory_inc(pmp);
-               hammer2_pfs_memory_wakeup(pmp);
-       } else {
-               nip = kmalloc(sizeof(*nip), M_HAMMER2, M_WAITOK | M_ZERO);
+       nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
+       atomic_add_long(&pmp->inmem_inodes, 1);
+       hammer2_pfs_memory_inc(pmp);
+       hammer2_pfs_memory_wakeup(pmp);
+       if (pmp->spmp_hmp)
                nip->flags = HAMMER2_INODE_SROOT;
-       }
 
        /*
         * Initialize nip's cluster
@@ -595,7 +585,7 @@ again:
         * Attempt to add the inode.  If it fails we raced another inode
         * get.  Undo all the work and try again.
         */
-       if (pmp) {
+       if (pmp->spmp_hmp == NULL) {
                spin_lock(&pmp->inum_spin);
                if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
                        spin_unlock(&pmp->inum_spin);
@@ -622,6 +612,9 @@ again:
  * under the super-root, so the inode number is set to 1 in this case.
  *
  * dip is not locked on entry.
+ *
+ * NOTE: When used to create a snapshot, the inode is temporarily associated
+ *      with the super-root spmp. XXX should pass new pmp for snapshot.
  */
 hammer2_inode_t *
 hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
@@ -1439,7 +1432,7 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
         * Find the hidden directory
         */
        bzero(&key_dummy, sizeof(key_dummy));
-       hammer2_trans_init(&trans, pmp, NULL, 0);
+       hammer2_trans_init(&trans, pmp, 0);
 
        cparent = hammer2_inode_lock_ex(pmp->iroot);
        cluster = hammer2_cluster_lookup(cparent, &key_dummy,
@@ -1456,7 +1449,7 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
                 */
                count = 0;
                scan = hammer2_cluster_lookup(cluster, &key_next,
-                                             0, HAMMER2_MAX_TID,
+                                             0, HAMMER2_TID_MAX,
                                              HAMMER2_LOOKUP_NODATA, &ddflag);
                while (scan) {
                        if (hammer2_cluster_type(scan) ==
@@ -1465,7 +1458,7 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
                                ++count;
                        }
                        scan = hammer2_cluster_next(cluster, scan, &key_next,
-                                                   0, HAMMER2_MAX_TID,
+                                                   0, HAMMER2_TID_MAX,
                                                    HAMMER2_LOOKUP_NODATA);
                }
 
index fa85505..2fcc1bc 100644 (file)
@@ -192,7 +192,7 @@ hammer2_ioctl_remote_scan(hammer2_inode_t *ip, void *data)
 
        hammer2_voldata_lock(hmp);
        remote->copy1 = hmp->voldata.copyinfo[copyid];
-       hammer2_voldata_unlock(hmp, 0);
+       hammer2_voldata_unlock(hmp);
 
        /*
         * Adjust nextid (GET only)
@@ -236,12 +236,12 @@ hammer2_ioctl_remote_add(hammer2_inode_t *ip, void *data)
                        goto failed;
                }
        }
-       hammer2_modify_volume(hmp);
+       hammer2_voldata_modify(hmp);
        remote->copy1.copyid = copyid;
        hmp->voldata.copyinfo[copyid] = remote->copy1;
        hammer2_volconf_update(pmp, copyid);
 failed:
-       hammer2_voldata_unlock(hmp, 1);
+       hammer2_voldata_unlock(hmp);
        return (error);
 }
 
@@ -276,11 +276,11 @@ hammer2_ioctl_remote_del(hammer2_inode_t *ip, void *data)
                        goto failed;
                }
        }
-       hammer2_modify_volume(hmp);
+       hammer2_voldata_modify(hmp);
        hmp->voldata.copyinfo[copyid].copyid = 0;
        hammer2_volconf_update(pmp, copyid);
 failed:
-       hammer2_voldata_unlock(hmp, 1);
+       hammer2_voldata_unlock(hmp);
        return (error);
 }
 
@@ -300,8 +300,9 @@ hammer2_ioctl_remote_rep(hammer2_inode_t *ip, void *data)
                return (EINVAL);
 
        hammer2_voldata_lock(hmp);
+       hammer2_voldata_modify(hmp);
        /*hammer2_volconf_update(pmp, copyid);*/
-       hammer2_voldata_unlock(hmp, 1);
+       hammer2_voldata_unlock(hmp);
 
        return(0);
 }
@@ -330,7 +331,7 @@ hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data)
                return (EINVAL);
 
        hammer2_voldata_lock(hmp);
-       hammer2_voldata_unlock(hmp, 0);
+       hammer2_voldata_unlock(hmp);
 
        return(0);
 }
@@ -362,7 +363,7 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
        error = 0;
        hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
        pfs = data;
-       cparent = hammer2_inode_lock_ex(hmp->sroot);
+       cparent = hammer2_inode_lock_ex(hmp->spmp->iroot);
        rcluster = hammer2_inode_lock_ex(ip->pmp->iroot);
 
        /*
@@ -429,7 +430,7 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                pfs->name_next = (hammer2_key_t)-1;
                error = ENOENT;
        }
-       hammer2_inode_unlock_ex(hmp->sroot, cparent);
+       hammer2_inode_unlock_ex(hmp->spmp->iroot, cparent);
 
        return (error);
 }
@@ -454,7 +455,7 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
        error = 0;
        hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
        pfs = data;
-       cparent = hammer2_inode_lock_sh(hmp->sroot);
+       cparent = hammer2_inode_lock_sh(hmp->spmp->iroot);
 
        pfs->name[sizeof(pfs->name) - 1] = 0;
        len = strlen(pfs->name);
@@ -493,7 +494,7 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
        } else {
                error = ENOENT;
        }
-       hammer2_inode_unlock_sh(hmp->sroot, cparent);
+       hammer2_inode_unlock_sh(hmp->spmp->iroot, cparent);
 
        return (error);
 }
@@ -520,8 +521,8 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                return(EINVAL);
        pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure 0-termination */
 
-       hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
-       nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL,
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
+       nip = hammer2_inode_create(&trans, hmp->spmp->iroot, NULL, NULL,
                                     pfs->name, strlen(pfs->name),
                                     &ncluster, &error);
        if (error == 0) {
@@ -557,8 +558,8 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
        int error;
 
        hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
-       hammer2_trans_init(&trans, ip->pmp, NULL, 0);
-       error = hammer2_unlink_file(&trans, hmp->sroot,
+       hammer2_trans_init(&trans, ip->pmp, 0);
+       error = hammer2_unlink_file(&trans, hmp->spmp->iroot,
                                    pfs->name, strlen(pfs->name),
                                    2, NULL, NULL);
        hammer2_trans_done(&trans);
@@ -581,7 +582,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
 
        hammer2_vfs_sync(ip->pmp->mp, MNT_WAIT);
 
-       hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
        cparent = hammer2_inode_lock_ex(ip);
        error = hammer2_cluster_snapshot(&trans, cparent, pfs);
        hammer2_inode_unlock_ex(ip, cparent);
@@ -626,7 +627,7 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
        int error = 0;
        int dosync = 0;
 
-       hammer2_trans_init(&trans, ip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, ip->pmp, 0);
        cparent = hammer2_inode_lock_ex(ip);
        ripdata = &hammer2_cluster_data(cparent)->ipdata;
 
index 09667ab..0f2772f 100644 (file)
@@ -64,23 +64,6 @@ hammer2_mount_unlock(hammer2_mount_t *hmp)
        ccms_thread_unlock(&hmp->vchain.core->cst);
 }
 
-void
-hammer2_voldata_lock(hammer2_mount_t *hmp)
-{
-       lockmgr(&hmp->voldatalk, LK_EXCLUSIVE);
-}
-
-void
-hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify)
-{
-       if (modify &&
-           (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
-               atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
-               hammer2_chain_ref(&hmp->vchain);
-       }
-       lockmgr(&hmp->voldatalk, LK_RELEASE);
-}
-
 /*
  * Return the directory entry type for an inode.
  *
@@ -286,15 +269,15 @@ hammer2_dirhash(const unsigned char *name, size_t len)
  * the specified number of bytes.
  *
  * Always returns at least the minimum media allocation
- * size radix, HAMMER2_MIN_RADIX (10), which is 1KB.
+ * size radix, HAMMER2_RADIX_MIN (10), which is 1KB.
  */
 int
 hammer2_allocsize(size_t bytes)
 {
        int radix;
 
-       if (bytes < HAMMER2_MIN_ALLOC)
-               bytes = HAMMER2_MIN_ALLOC;
+       if (bytes < HAMMER2_ALLOC_MIN)
+               bytes = HAMMER2_ALLOC_MIN;
        if (bytes == HAMMER2_PBUFSIZE)
                radix = HAMMER2_PBUFRADIX;
        else if (bytes >= 16384)
@@ -302,7 +285,7 @@ hammer2_allocsize(size_t bytes)
        else if (bytes >= 1024)
                radix = 10;
        else
-               radix = HAMMER2_MIN_RADIX;
+               radix = HAMMER2_RADIX_MIN;
 
        while (((size_t)1 << radix) < bytes)
                ++radix;
@@ -323,8 +306,8 @@ hammer2_getradix(size_t bytes)
                radix = HAMMER2_PBUFRADIX;
        else if (bytes >= HAMMER2_LBUFSIZE)
                radix = HAMMER2_LBUFRADIX;
-       else if (bytes >= HAMMER2_MIN_ALLOC)    /* clamp */
-               radix = HAMMER2_MIN_RADIX;
+       else if (bytes >= HAMMER2_ALLOC_MIN)    /* clamp */
+               radix = HAMMER2_RADIX_MIN;
        else
                radix = 0;
 
@@ -398,7 +381,7 @@ hammer2_calc_physical(hammer2_inode_t *ip,
                return (0);
        eofbytes = (int)(ipdata->size - lbase);
        pblksize = lblksize;
-       while (pblksize >= eofbytes && pblksize >= HAMMER2_MIN_ALLOC)
+       while (pblksize >= eofbytes && pblksize >= HAMMER2_ALLOC_MIN)
                pblksize >>= 1;
        pblksize <<= 1;
 
index 257c158..f8753e6 100644 (file)
@@ -300,6 +300,8 @@ hammer2_vfs_init(struct vfsconf *conf)
 
        hammer2_limit_dirty_chains = desiredvnodes / 10;
 
+       hammer2_trans_manage_init();
+
        return (error);
 }
 
@@ -312,6 +314,36 @@ hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
        return 0;
 }
 
+/*
+ * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
+ * mounts and the spmp structure for media (hmp) structures.
+ *
+ * The structure is allocated zeroed (M_ZERO), after which this routine
+ * creates its two private malloc types (minode, mmsg) and initializes
+ * its lock, the inode-number spinlock and RB tree, the unlink queue and
+ * its spinlock, and the write-thread mutex and bio queue.
+ *
+ * ipdata    - Inode data supplying pfs_inum and pfs_clid, or NULL.  When
+ *             NULL, inode_tid and pfs_clid are left zero and the caller
+ *             must fill them in itself (hammer2_vfs_mount() does this
+ *             for the super-root spmp).
+ * alloc_tid - Media transaction id to start after.  Both alloc_tid and
+ *             flush_tid in the new structure are set to alloc_tid + 1.
+ *
+ * Returns the newly allocated structure.  The allocation is made with
+ * M_WAITOK and the result is not checked for NULL here.
+ */
+static hammer2_pfsmount_t *
+hammer2_pfsalloc(const hammer2_inode_data_t *ipdata, hammer2_tid_t alloc_tid)
+{
+       hammer2_pfsmount_t *pmp;
+
+       pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
+       kmalloc_create(&pmp->minode, "HAMMER2-inodes");
+       kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
+       lockinit(&pmp->lock, "pfslk", 0, 0);
+       spin_init(&pmp->inum_spin);
+       RB_INIT(&pmp->inum_tree);
+       TAILQ_INIT(&pmp->unlinkq);
+       spin_init(&pmp->unlinkq_spin);
+
+       pmp->alloc_tid = alloc_tid + 1;   /* our first media transaction id */
+       pmp->flush_tid = pmp->alloc_tid;
+       if (ipdata) {   /* NULL when allocating the super-root spmp */
+               pmp->inode_tid = ipdata->pfs_inum + 1;
+               pmp->pfs_clid = ipdata->pfs_clid;
+       }
+       mtx_init(&pmp->wthread_mtx);
+       bioq_init(&pmp->wthread_bioq);
+
+       return pmp;
+}
+
 /*
  * Mount or remount HAMMER2 fileystem from physical media
  *
@@ -339,6 +371,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 {
        struct hammer2_mount_info info;
        hammer2_pfsmount_t *pmp;
+       hammer2_pfsmount_t *spmp;
        hammer2_mount_t *hmp;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
@@ -347,10 +380,10 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *rchain;
-       hammer2_chain_t *schain;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ipdata;
+       hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
@@ -454,6 +487,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
+               hammer2_chain_t *schain;
+               hammer2_xid_t xid;
+
                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;
 
@@ -486,9 +522,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
 
-               lockinit(&hmp->alloclk, "h2alloc", 0, 0);
-               lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
-               TAILQ_INIT(&hmp->transq);
+               lockinit(&hmp->vollk, "h2vol", 0, 0);
 
                /*
                 * vchain setup. vchain.data is embedded.
@@ -501,7 +535,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
-               hmp->vchain.delete_tid = HAMMER2_MAX_TID;
+               hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
+               hmp->vchain.delete_xid = HAMMER2_XID_MAX;
 
                hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
                /* hmp->vchain.u.xxx is left NULL */
@@ -521,10 +556,11 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
+               hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
-               hmp->fchain.delete_tid = HAMMER2_MAX_TID;
+               hmp->fchain.delete_xid = HAMMER2_XID_MAX;
 
                hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
                /* hmp->fchain.u.xxx is left NULL */
@@ -546,12 +582,25 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                 * Really important to get these right or flush will get
                 * confused.
                 */
+               hmp->spmp = hammer2_pfsalloc(NULL, hmp->voldata.mirror_tid);
+               kprintf("alloc spmp %p tid %016jx\n",
+                       hmp->spmp, hmp->voldata.mirror_tid);
+               spmp = hmp->spmp;
+               spmp->inode_tid = 1;
+
+               xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
-               hmp->vchain.modify_tid = hmp->voldata.mirror_tid;
-               hmp->vchain.update_lo = hmp->voldata.mirror_tid;
+               hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
+               hmp->vchain.modify_xid = xid;
+               hmp->vchain.update_xlo = xid;
+               hmp->vchain.update_xhi = xid;
+               hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
-               hmp->fchain.modify_tid = hmp->voldata.freemap_tid;
-               hmp->fchain.update_lo = hmp->voldata.freemap_tid;
+               hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
+               hmp->fchain.modify_xid = xid;
+               hmp->fchain.update_xlo = xid;
+               hmp->fchain.update_xhi = xid;
+               hmp->fchain.pmp = spmp;
 
                /*
                 * First locate the super-root inode, which is key 0
@@ -575,27 +624,39 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                }
 
                /*
+                * Sanity-check schain's pmp, finish initializing spmp.
+                */
+               KKASSERT(schain->pmp == spmp);
+               spmp->pfs_clid = schain->data->ipdata.pfs_clid;
+
+               /*
+                * NOTE: The CHAIN_PFSROOT is not set on the super-root inode.
                 * NOTE: inode_get sucks up schain's lock.
                 */
-               atomic_set_int(&schain->flags, HAMMER2_CHAIN_PFSROOT);
                cluster = hammer2_cluster_from_chain(schain);
-               hmp->sroot = hammer2_inode_get(NULL, NULL, cluster);
-               hammer2_inode_ref(hmp->sroot);
-               hammer2_inode_unlock_ex(hmp->sroot, cluster);
+               spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
+               spmp->spmp_hmp = hmp;
+               hammer2_inode_ref(spmp->iroot);
+               hammer2_inode_unlock_ex(spmp->iroot, cluster);
                schain = NULL;
-               /* leave hmp->sroot with one ref */
+               /* leave spmp->iroot with one ref */
 
                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        /* XXX do something with error */
                }
+       } else {
+               spmp = hmp->spmp;
        }
        ++hmp->pmp_count;
 
        /*
         * Lookup mount point under the media-localized super-root.
+        *
+        * cluster->pmp will incorrectly point to spmp and must be fixed
+        * up later on.
         */
-       cparent = hammer2_inode_lock_ex(hmp->sroot);
+       cparent = hammer2_inode_lock_ex(spmp->iroot);
        lhc = hammer2_dirhash(label, strlen(label));
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
@@ -610,7 +671,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK, 0);
        }
-       hammer2_inode_unlock_ex(hmp->sroot, cparent);
+       hammer2_inode_unlock_ex(spmp->iroot, cparent);
 
        if (cluster == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
@@ -623,6 +684,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 
        for (i = 0; i < cluster->nchains; ++i) {
                rchain = cluster->array[i];
+               KKASSERT(rchain->pmp == NULL);
                if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
                        kprintf("hammer2_mount: PFS label already mounted!\n");
                        hammer2_cluster_unlock(cluster);
@@ -650,11 +712,15 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         * point.  If it is, add us to the cluster.
         */
        ipdata = &hammer2_cluster_data(cluster)->ipdata;
+       hammer2_cluster_bref(cluster, &bref);
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
-               if (bcmp(&pmp->pfs_clid, &ipdata->pfs_clid,
-                   sizeof(pmp->pfs_clid)) == 0)
+               if (pmp->spmp_hmp == NULL &&
+                   bcmp(&pmp->pfs_clid, &ipdata->pfs_clid,
+                        sizeof(pmp->pfs_clid)) == 0) {
                        break;
+               }
        }
+
        if (pmp) {
                int i;
                int j;
@@ -679,6 +745,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                kprintf("hammer2_vfs_mount: Adding pfs to existing cluster\n");
                j = pmp->iroot->cluster.nchains;
                for (i = 0; i < cluster->nchains; ++i) {
+                       rchain = cluster->array[i];
+                       KKASSERT(rchain->pmp == NULL);
+                       rchain->pmp = pmp;
                        hammer2_chain_ref(cluster->array[i]);
                        pmp->iroot->cluster.array[j] = cluster->array[i];
                        ++j;
@@ -700,16 +769,15 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         *
         * From this point on we have to call hammer2_unmount() on failure.
         */
-       pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
-
-       kmalloc_create(&pmp->minode, "HAMMER2-inodes");
-       kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
-       lockinit(&pmp->lock, "pfslk", 0, 0);
-       spin_init(&pmp->inum_spin);
-       RB_INIT(&pmp->inum_tree);
-       TAILQ_INIT(&pmp->unlinkq);
-       spin_init(&pmp->unlinkq_spin);
-       pmp->pfs_clid = ipdata->pfs_clid;
+       pmp = hammer2_pfsalloc(ipdata, bref.mirror_tid);
+       kprintf("PMP mirror_tid is %016jx\n", bref.mirror_tid);
+       for (i = 0; i < cluster->nchains; ++i) {
+               rchain = cluster->array[i];
+               KKASSERT(rchain->pmp == NULL);
+               rchain->pmp = pmp;
+               atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
+       }
+       cluster->pmp = pmp;
 
        kdmsg_iocom_init(&pmp->iocom, pmp,
                         KDMSG_IOCOMF_AUTOCONN |
@@ -748,12 +816,6 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         * After this point hammer2_vfs_unmount() has visibility on hmp
         * and manual hmp1/hmp2 calls are not needed on fatal errors.
         */
-       for (i = 0; i < cluster->nchains; ++i) {
-               rchain = cluster->array[i];
-               KKASSERT(rchain->pmp == NULL);  /* tracking pmp for rchain */
-               rchain->pmp = pmp;
-               atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
-       }
        pmp->iroot = hammer2_inode_get(pmp, NULL, cluster);
        hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
        hammer2_inode_unlock_ex(pmp->iroot, cluster);
@@ -761,9 +823,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        /*
         * The logical file buffer bio write thread handles things
         * like physical block assignment and compression.
+        *
+        * (only applicable to pfs mounts, not applicable to spmp)
         */
-       mtx_init(&pmp->wthread_mtx);
-       bioq_init(&pmp->wthread_bioq);
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);
@@ -782,6 +844,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 
        /*
         * With the cluster operational install ihidden.
+        * (only applicable to pfs mounts, not applicable to spmp)
         */
        hammer2_inode_install_hidden(pmp);
 
@@ -838,7 +901,7 @@ hammer2_write_thread(void *arg)
                }
                cparent = NULL;
 
-               hammer2_trans_init(&trans, pmp, NULL, HAMMER2_TRANS_BUFCACHE);
+               hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);
 
                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
@@ -849,7 +912,7 @@ hammer2_write_thread(void *arg)
                                bio->bio_flags |= BIO_DONE;
                                wakeup(bio);
                                hammer2_trans_done(&trans);
-                               hammer2_trans_init(&trans, pmp, NULL,
+                               hammer2_trans_init(&trans, pmp,
                                                   HAMMER2_TRANS_BUFCACHE);
                                continue;
                        }
@@ -946,7 +1009,7 @@ hammer2_assign_physical(hammer2_trans_t *trans,
         * logical buffer cache buffer.
         */
        *errorp = 0;
-       KKASSERT(pblksize >= HAMMER2_MIN_ALLOC);
+       KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
 retry:
        dparent = hammer2_cluster_lookup_init(cparent, 0);
        cluster = hammer2_cluster_lookup(dparent, &key_dummy,
@@ -1601,31 +1664,29 @@ hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
        hammer2_voldata_lock(hmp);
        if (((hmp->vchain.flags | hmp->fchain.flags) &
             HAMMER2_CHAIN_MODIFIED) ||
-           hmp->vchain.update_hi > hmp->voldata.mirror_tid ||
-           hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
-               hammer2_voldata_unlock(hmp, 0);
+           hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
+           hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
+               hammer2_voldata_unlock(hmp);
                hammer2_vfs_sync(mp, MNT_WAIT);
                /*hammer2_vfs_sync(mp, MNT_WAIT);*/
        } else {
-               hammer2_voldata_unlock(hmp, 0);
+               hammer2_voldata_unlock(hmp);
        }
        if (hmp->pmp_count == 0) {
                if (((hmp->vchain.flags | hmp->fchain.flags) &
                     HAMMER2_CHAIN_MODIFIED) ||
-                   (hmp->vchain.update_hi >
-                    hmp->voldata.mirror_tid) ||
-                   (hmp->fchain.update_hi >
-                    hmp->voldata.freemap_tid)) {
+                   hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
+                   hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
                        kprintf("hammer2_unmount: chains left over "
                                "after final sync\n");
-                       kprintf("    vchain %08x update_hi %jx/%jx\n",
+                       kprintf("    vchain %08x update_xlo/hi %08x/%08x\n",
                                hmp->vchain.flags,
-                               hmp->voldata.mirror_tid,
-                               hmp->vchain.update_hi);
-                       kprintf("    fchain %08x update_hi %jx/%jx\n",
+                               hmp->vchain.update_xlo,
+                               hmp->vchain.update_xhi);
+                       kprintf("    fchain %08x update_xhi/hi %08x/%08x\n",
                                hmp->fchain.flags,
-                               hmp->voldata.freemap_tid,
-                               hmp->fchain.update_hi);
+                               hmp->fchain.update_xlo,
+                               hmp->fchain.update_xhi);
 
                        if (hammer2_debug & 0x0010)
                                Debugger("entered debugger");
@@ -1637,6 +1698,7 @@ static
 void
 hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
 {
+       hammer2_pfsmount_t *spmp;
        struct vnode *devvp;
        int dumpcnt;
        int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
@@ -1646,9 +1708,17 @@ hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
         * device.
         */
        if (hmp->pmp_count == 0) {
-               if (hmp->sroot) {
-                       hammer2_inode_drop(hmp->sroot);
-                       hmp->sroot = NULL;
+               /*
+                * Clean up SPMP and the super-root inode
+                */
+               spmp = hmp->spmp;
+               if (spmp) {
+                       if (spmp->iroot) {
+                               hammer2_inode_drop(spmp->iroot);
+                               spmp->iroot = NULL;
+                       }
+                       hmp->spmp = NULL;
+                       kfree(spmp, M_HAMMER2);
                }
 
                /*
@@ -1664,6 +1734,42 @@ hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
                        devvp = NULL;
                }
 
+               /*
+                * Clear vchain/fchain flags that might prevent final cleanup
+                * of these chains.
+                */
+               if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
+                       atomic_clear_int(&hmp->vchain.flags,
+                                        HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_drop(&hmp->vchain);
+               }
+               if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
+                       atomic_clear_int(&hmp->vchain.flags,
+                                        HAMMER2_CHAIN_FLUSH_CREATE);
+                       hammer2_chain_drop(&hmp->vchain);
+               }
+               if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
+                       atomic_clear_int(&hmp->vchain.flags,
+                                        HAMMER2_CHAIN_FLUSH_DELETE);
+                       hammer2_chain_drop(&hmp->vchain);
+               }
+
+               if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
+                       atomic_clear_int(&hmp->fchain.flags,
+                                        HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_drop(&hmp->fchain);
+               }
+               if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
+                       atomic_clear_int(&hmp->fchain.flags,
+                                        HAMMER2_CHAIN_FLUSH_CREATE);
+                       hammer2_chain_drop(&hmp->fchain);
+               }
+               if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
+                       atomic_clear_int(&hmp->fchain.flags,
+                                        HAMMER2_CHAIN_FLUSH_DELETE);
+                       hammer2_chain_drop(&hmp->fchain);
+               }
+
                /*
                 * Final drop of embedded freemap root chain to
                 * clean up fchain.core (fchain structure is not
@@ -1786,20 +1892,29 @@ hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
  *
  * Updates to the free block table are allowed to lag flushes by one
  * transaction.  In case of a crash, then on a fresh mount we must do an
- * incremental scan of transaction id voldata.mirror_tid and make sure the
- * related blocks have been marked allocated.
+ * incremental scan of the last committed transaction id and make sure that
+ * all related blocks have been marked allocated.
  *
+ * The super-root topology and each PFS have their own transaction id
+ * domains, so we must track PFS boundary transitions.
  */
 struct hammer2_recovery_elm {
        TAILQ_ENTRY(hammer2_recovery_elm) entry;
        hammer2_chain_t *chain;
+       hammer2_tid_t sync_tid;
 };
 
 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
 
+struct hammer2_recovery_info {
+       struct hammer2_recovery_list list;
+       int     depth;
+};
+
 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                        hammer2_chain_t *parent,
-                       struct hammer2_recovery_list *list, int depth);
+                       struct hammer2_recovery_info *info,
+                       hammer2_tid_t sync_tid);
 
 #define HAMMER2_RECOVERY_MAXDEPTH      10
 
@@ -1808,27 +1923,33 @@ int
 hammer2_recovery(hammer2_mount_t *hmp)
 {
        hammer2_trans_t trans;
-       struct hammer2_recovery_list list;
+       struct hammer2_recovery_info info;
        struct hammer2_recovery_elm *elm;
        hammer2_chain_t *parent;
+       hammer2_tid_t sync_tid;
        int error;
        int cumulative_error = 0;
 
-       hammer2_trans_init(&trans, NULL, hmp, 0);
+       hammer2_trans_init(&trans, hmp->spmp, 0);
 
-       TAILQ_INIT(&list);
+       sync_tid = 0;
+       TAILQ_INIT(&info.list);
+       info.depth = 0;
        parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
-       cumulative_error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0);
+       cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
+                                                &info, sync_tid);
        hammer2_chain_lookup_done(parent);
 
-       while ((elm = TAILQ_FIRST(&list)) != NULL) {
-               TAILQ_REMOVE(&list, elm, entry);
+       while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
+               TAILQ_REMOVE(&info.list, elm, entry);
                parent = elm->chain;
+               sync_tid = elm->sync_tid;
                kfree(elm, M_HAMMER2);
 
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
                                           HAMMER2_RESOLVE_NOREF);
-               error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0);
+               error = hammer2_recovery_scan(&trans, hmp, parent,
+                                             &info, sync_tid);
                hammer2_chain_unlock(parent);
                if (error)
                        cumulative_error = error;
@@ -1842,28 +1963,15 @@ static
 int
 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                      hammer2_chain_t *parent,
-                     struct hammer2_recovery_list *list, int depth)
+                     struct hammer2_recovery_info *info,
+                     hammer2_tid_t sync_tid)
 {
        hammer2_chain_t *chain;
        int cache_index;
        int cumulative_error = 0;
+       int pfs_boundary = 0;
        int error;
 
-       /*
-        * Defer operation if depth limit reached.
-        */
-       if (depth >= HAMMER2_RECOVERY_MAXDEPTH) {
-               struct hammer2_recovery_elm *elm;
-
-               elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
-               elm->chain = parent;
-               hammer2_chain_ref(parent);
-               TAILQ_INSERT_TAIL(list, elm, entry);
-               /* unlocked by caller */
-
-               return(0);
-       }
-
        /*
         * Adjust freemap to ensure that the block(s) are marked allocated.
         */
@@ -1885,11 +1993,17 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                 * for recursion.
                 */
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_chain_unlock(parent);
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        /* not applicable to recovery scan */
+                       hammer2_chain_unlock(parent);
                        return 0;
                }
+               if ((parent->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) &&
+                   info->depth != 0) {
+                       pfs_boundary = 1;
+                       sync_tid = parent->bref.mirror_tid - 1;
+               }
+               hammer2_chain_unlock(parent);
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
@@ -1909,6 +2023,24 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                return EDOM;
        }
 
+       /*
+        * Defer operation if depth limit reached or if we are crossing a
+        * PFS boundary.
+        */
+       if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH || pfs_boundary) {
+               struct hammer2_recovery_elm *elm;
+
+               elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
+               elm->chain = parent;
+               elm->sync_tid = sync_tid;
+               hammer2_chain_ref(parent);
+               TAILQ_INSERT_TAIL(&info->list, elm, entry);
+               /* unlocked by caller */
+
+               return(0);
+       }
+
+
        /*
         * Recursive scan of the last flushed transaction only.  We are
         * doing this without pmp assignments so don't leave the chains
@@ -1919,9 +2051,11 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                                   HAMMER2_LOOKUP_NODATA);
        while (chain) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
-               if (chain->bref.mirror_tid >= hmp->voldata.alloc_tid - 1) {
+               if (chain->bref.mirror_tid >= sync_tid) {
+                       ++info->depth;
                        error = hammer2_recovery_scan(trans, hmp, chain,
-                                                     list, depth + 1);
+                                                     info, sync_tid);
+                       --info->depth;
                        if (error)
                                cumulative_error = error;
                }
@@ -1952,7 +2086,9 @@ int
 hammer2_vfs_sync(struct mount *mp, int waitfor)
 {
        struct hammer2_sync_info info;
+       hammer2_inode_t *iroot;
        hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
        int flags;
@@ -1962,6 +2098,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        int i;
 
        pmp = MPTOPMP(mp);
+       iroot = pmp->iroot;
+       KKASSERT(iroot);
+       KKASSERT(iroot->pmp == pmp);
 
        /*
         * We can't acquire locks on existing vnodes while in a transaction
@@ -1990,9 +2129,10 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         * should theoretically not be possible for any new file buffers
         * to be instantiated during this sequence.
         */
-       hammer2_trans_init(&info.trans, pmp, NULL, HAMMER2_TRANS_ISFLUSH |
-                                                  HAMMER2_TRANS_PREFLUSH);
+       hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
+                                            HAMMER2_TRANS_PREFLUSH);
        hammer2_run_unlinkq(&info.trans, pmp);
+
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
@@ -2015,16 +2155,75 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        hammer2_bioq_sync(info.trans.pmp);
        atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
 
+       total_error = 0;
+
+       /*
+        * Flush all storage elements making up the cluster
+        *
+        * We must also flush any deleted siblings because the super-root
+        * flush won't do it for us.  They all must be staged or the
+        * super-root flush will not be able to update its block table
+        * properly.
+        *
+        * XXX currently done serially instead of concurrently
+        */
+       for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
+               chain = iroot->cluster.array[i];
+               if (chain) {
+                       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_flush(&info.trans, &chain);
+                       hammer2_chain_unlock(chain);
+               }
+               if (chain) {
+                       hammer2_chain_t *nchain;
+                       chain = TAILQ_FIRST(&chain->core->ownerq);
+                       hammer2_chain_ref(chain);
+                       while (chain) {
+                               hammer2_chain_lock(chain,
+                                                  HAMMER2_RESOLVE_ALWAYS);
+                               hammer2_flush(&info.trans, &chain);
+                               hammer2_chain_unlock(chain);
+                               nchain = TAILQ_NEXT(chain, core_entry);
+                               if (nchain)
+                                       hammer2_chain_ref(nchain);
+                               hammer2_chain_drop(chain);
+                               chain = nchain;
+                       }
+               }
+       }
 #if 0
+       hammer2_trans_done(&info.trans);
+#endif
+
        /*
-        * Start the flush transaction and flush all meta-data.
+        * Flush all volume roots to synchronize PFS flushes with the
+        * storage media.  Use a super-root transaction for each one.
+        *
+        * The flush code will detect super-root -> pfs-root chain
+        * transitions using the last pfs-root flush.
         */
-       hammer2_trans_init(&info.trans, pmp, NULL, HAMMER2_TRANS_ISFLUSH);
+       for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
+               chain = iroot->cluster.array[i];
+               if (chain == NULL)
+                       continue;
+
+               hmp = chain->hmp;
+#if 0
+               hammer2_trans_init(&info.trans, hmp->spmp,
+                                  HAMMER2_TRANS_ISFLUSH);
 #endif
 
-       total_error = 0;
-       for (i = 0; pmp->iroot && i < pmp->iroot->cluster.nchains; ++i) {
-               hmp = pmp->iroot->cluster.array[i]->hmp;
+               /*
+                * Force an update of the XID from the PFS root to the
+                * topology root.  We couldn't do this from the PFS
+                * transaction because a SPMP transaction is needed.
+                * This does not modify blocks; instead, it allows the
+                * flush code to find the transition point and then
+                * update on the way back up.
+                */
+               parent = TAILQ_LAST(&chain->above->ownerq, h2_core_list);
+               KKASSERT(chain->pmp != parent->pmp);
+               hammer2_chain_setsubmod(&info.trans, parent);
 
                /*
                 * Media mounts have two 'roots', vchain for the topology
@@ -2035,25 +2234,25 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                 * ahead of the topology.  We depend on the bulk free scan
                 * code to deal with any loose ends.
                 */
-#if 1
+               hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
-               kprintf("sync tid test fmap %016jx %016jx\n",
-                       hmp->fchain.update_hi, hmp->voldata.freemap_tid);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
-                       /* this will also modify vchain as a side effect */
+                   hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
+                       /*
+                        * This will also modify vchain as a side effect, so
+                        * mark vchain as modified now.
+                        */
+                       hammer2_voldata_modify(hmp);
                        chain = &hmp->fchain;
                        hammer2_flush(&info.trans, &chain);
                        KKASSERT(chain == &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
-#endif
+               hammer2_chain_unlock(&hmp->vchain);
 
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
-               kprintf("sync tid test vmap %016jx %016jx\n",
-                       hmp->vchain.update_hi, hmp->voldata.mirror_tid);
                if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.update_hi > hmp->voldata.mirror_tid) {
+                   hmp->vchain.update_xhi > hmp->vchain.update_xlo) {
                        chain = &hmp->vchain;
                        hammer2_flush(&info.trans, &chain);
                        KKASSERT(chain == &hmp->vchain);
@@ -2066,7 +2265,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 #if 0
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.update_hi > hmp->voldata.freemap_tid ||
+                   hmp->fchain.update_xhi > hmp->fchain.update_xlo ||
                    force_fchain) {
                        /* this will also modify vchain as a side effect */
                        chain = &hmp->fchain;
@@ -2136,6 +2335,10 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                }
                if (error)
                        total_error = error;
+
+#if 0
+               hammer2_trans_done(&info.trans);
+#endif
        }
        hammer2_trans_done(&info.trans);
 
@@ -2468,7 +2671,7 @@ hammer2_autodmsg(kdmsg_msg_t *msg)
                                continue;
                        hammer2_volconf_update(pmp, copyid);
                }
-               hammer2_voldata_unlock(hmp, 0);
+               hammer2_voldata_unlock(hmp);
        }
        if ((msg->any.head.cmd & DMSGF_DELETE) &&
            msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
@@ -2661,16 +2864,16 @@ hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
                chain->bref.key, chain->bref.keybits,
                chain->bref.mirror_tid);
 
-       kprintf("%*.*s      [%08x] (%s) mod=%016jx del=%016jx "
-               "lo=%08jx hi=%08jx refs=%d\n",
+       kprintf("%*.*s      [%08x] (%s) mod=%08x del=%08x "
+               "lo=%08x hi=%08x refs=%d\n",
                tab, tab, "",
                chain->flags,
                ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
-               chain->modify_tid,
-               chain->delete_tid,
-               chain->update_lo,
-               chain->update_hi,
+               chain->modify_xid,
+               chain->delete_xid,
+               chain->update_xlo,
+               chain->update_xhi,
                chain->refs);
 
        kprintf("%*.*s      core %p [%08x]",
index 24f0278..13f4c0a 100644 (file)
@@ -308,10 +308,10 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
 
 #if 0
        /* XXX can't do this yet */
-       hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_ISFLUSH);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_ISFLUSH);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
 #endif
-       hammer2_trans_init(&trans, ip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, ip->pmp, 0);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
 
        /*
@@ -439,7 +439,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                return(EROFS);
 
        hammer2_pfs_memory_wait(ip->pmp);
-       hammer2_trans_init(&trans, ip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, ip->pmp, 0);
        cluster = hammer2_inode_lock_ex(ip);
        ripdata = &hammer2_cluster_data(cluster)->ipdata;
        error = 0;
@@ -884,7 +884,7 @@ hammer2_vop_write(struct vop_write_args *ap)
         * The transaction interlocks against flushes initiations
         * (note: but will run concurrently with the actual flush).
         */
-       hammer2_trans_init(&trans, ip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, ip->pmp, 0);
        error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
        hammer2_trans_done(&trans);
 
@@ -1276,7 +1276,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                kprintf("hammer2: need to unconsolidate hardlink for %s\n",
                        chain->data->ipdata.filename);
                /* XXX retain shared lock on dip? (currently not held) */
-               hammer2_trans_init(&trans, dip->pmp, NULL, 0);
+               hammer2_trans_init(&trans, dip->pmp, 0);
                hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
                hammer2_trans_done(&trans);
        }
@@ -1370,7 +1370,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        cluster = NULL;
 
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len, &cluster, &error);
        cluster->focus->inode_reason = 1;
@@ -1489,7 +1489,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        ip = VTOI(ap->a_vp);
        hammer2_pfs_memory_wait(ip->pmp);
-       hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
 
        /*
         * The common parent directory must be locked first to avoid deadlocks.
@@ -1556,7 +1556,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        ncluster = NULL;
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
@@ -1602,7 +1602,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        ncluster = NULL;
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
@@ -1648,7 +1648,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        ncparent = NULL;
 
        ap->a_vap->va_type = VLNK;      /* enforce type */
@@ -1741,7 +1741,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
        name_len = ncp->nc_nlen;
 
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, dip->pmp, 0);
        error = hammer2_unlink_file(&trans, dip, name, name_len,
                                    0, NULL, ap->a_nch);
        hammer2_trans_done(&trans);
@@ -1773,7 +1773,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
        name_len = ncp->nc_nlen;
 
        hammer2_pfs_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, dip->pmp, 0);
        hammer2_run_unlinkq(&trans, dip->pmp);
        error = hammer2_unlink_file(&trans, dip, name, name_len,
                                    1, NULL, ap->a_nch);
@@ -1828,7 +1828,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
        tname_len = tncp->nc_nlen;
 
        hammer2_pfs_memory_wait(tdip->pmp);
-       hammer2_trans_init(&trans, tdip->pmp, NULL, 0);
+       hammer2_trans_init(&trans, tdip->pmp, 0);
 
        /*
         * ip is the inode being renamed.  If this is a hardlink then