hammer2 - Refactor frontend part 17
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 28 Jun 2015 23:49:48 +0000 (16:49 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Mon, 29 Jun 2015 00:01:38 +0000 (17:01 -0700)
This ends the major refactoring.  All major cluster ops have been converted
to XOPs.  The new XOP APIs will be stabilized in subsequent commits; the
issues will primarily be slave scan deadlocks.

* Refactor the fsync and slave scan code to use the XOP interface.

* Clean up hammer2_cluster.c, removing numerous functions which are no
  longer used.

sys/vfs/hammer2/DESIGN
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_cluster.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_iocom.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_thread.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c
sys/vfs/hammer2/hammer2_xops.c

index 420aede..25158a7 100644 (file)
@@ -37,7 +37,7 @@
   will allocate new blocks up to the root in order to propagate block table
   changes and transaction ids.
 
-* Incremental update scans are trivial by design.
+* Incremental synchronization is queueless and trivial by design.
 
 * Multiple roots, with many features.  This is implemented via the super-root
   concept.  When mounting a HAMMER2 filesystem you specify a device path and
 * Roots are really no different from snapshots (HAMMER1 distinguished between
   its root mount and its PFS's.  HAMMER2 does not).
 
+* I/O and chain locking thread separation.  I/O stalls and lock stalls can
+  cause any filesystem which purports to operate over multiple physical and
+  network devices to implode.  HAMMER2 incorporates a frontend/backend design
+  which separates media operations into support threads and allows the
+  frontend to validate the cluster, proceed with an operation, and disconnect
+  any remaining running operation even when backend ops have not completed
+  on all nodes.  This allows the frontend to return 'early' (so to speak).
+
+* Early return on best data-path supported by virtue of the above.  In a
+  multi-master system, frontend ops will issue I/O on all cluster elements
+  concurrently and will return the instant incoming data validates the
+  cluster.
+
 * Snapshots are writable (in HAMMER1 snapshots were read-only).
 
 * Snapshots are explicit but trivial to create.  In HAMMER1 snapshots were
@@ -85,7 +98,7 @@
   discrepancies, the synchronization thread will use the quorum to figure
   out which information is not correct and update accordingly.
 
-* Support for multiple compression algorithms configured on subdirectory
+* Support for multiple compression algorithms configured on subdirectory
   tree basis and on a file basis.  Block compression up to 64KB will be used.
   Only compression ratios at powers of 2 that are at least 2:1 (e.g. 2:1,
   4:1, 8:1, etc) will work in this scheme because physical block allocations
@@ -493,10 +506,7 @@ will contain a cluster identifier that helps HAMMER2 identify and integrate
 with the nodes making up the cluster.  HAMMER2 will automatically integrate
 *all* entries under the super-root when you mount one of them.  You have to
 mount at least one for HAMMER2 to integrate the block device in the larger
-cluster.  This mount will typically be a SOFT_MASTER, DUMMY, SLAVE, or CACHE
-mount that simply serves to cause hammer to integrate the rest of the
-represented cluster.  ALL CLUSTER ELEMENTS ARE TREATED ACCORDING TO TYPE
-NO MATTER WHICH ONE YOU MOUNT.
+cluster.
 
 For cluster servers every HAMMER2-formatted partition has a "LOCAL" MASTER
 which can be mounted in order to make the rest of the elements under the
index 39b6993..637c815 100644 (file)
@@ -499,9 +499,10 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  * Flags passed to hammer2_chain_delete()
  */
 #define HAMMER2_DELETE_PERMANENT       0x0001
-#define HAMMER2_DELETE_NOSTATS         0x0002
 
-#define HAMMER2_INSERT_NOSTATS         0x0002
+/*
+ * Flags passed to hammer2_chain_insert() or hammer2_chain_rename()
+ */
 #define HAMMER2_INSERT_PFSROOT         0x0004
 
 /*
@@ -572,13 +573,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_XOPMASK_VOP    0x80000000U
 
 struct hammer2_cluster_item {
-#if 0
-       hammer2_mtx_link_t      async_link;
-#endif
        hammer2_chain_t         *chain;
-#if 0
-       struct hammer2_cluster  *cluster;       /* link back to cluster */
-#endif
        int                     cache_index;
        uint32_t                flags;
 };
@@ -717,7 +712,6 @@ struct hammer2_inode {
        u_int                   refs;           /* +vpref, +flushref */
        uint8_t                 comp_heuristic;
        hammer2_inode_meta_t    meta;           /* copy of meta-data */
-       hammer2_blockref_t      bref;           /* copy of bref statistics */
        hammer2_off_t           osize;
 };
 
@@ -840,19 +834,18 @@ struct hammer2_xop_head {
        uint32_t                chk_mask;
        int                     state;
        int                     error;
+       hammer2_key_t           collect_key;
        char                    *name;
        size_t                  name_len;
        char                    *name2;
        size_t                  name2_len;
-       hammer2_key_t           lkey;
-       hammer2_key_t           nkey;
        hammer2_xop_fifo_t      collect[HAMMER2_MAXCLUSTER];
        hammer2_cluster_t       cluster;        /* help collections */
 };
 
 typedef struct hammer2_xop_head hammer2_xop_head_t;
 
-struct hammer2_xop_vfsroot {
+struct hammer2_xop_ipcluster {
        hammer2_xop_head_t      head;
 };
 
@@ -865,6 +858,7 @@ struct hammer2_xop_strategy {
 
 struct hammer2_xop_readdir {
        hammer2_xop_head_t      head;
+       hammer2_key_t           lkey;
 };
 
 struct hammer2_xop_nresolve {
@@ -893,6 +887,12 @@ struct hammer2_xop_scanlhc {
        hammer2_key_t           lhc;
 };
 
+struct hammer2_xop_scanall {
+       hammer2_xop_head_t      head;
+       hammer2_key_t           key_beg;        /* inclusive */
+       hammer2_key_t           key_end;        /* inclusive */
+};
+
 struct hammer2_xop_lookup {
        hammer2_xop_head_t      head;
        hammer2_key_t           lhc;
@@ -909,8 +909,18 @@ struct hammer2_xop_destroy {
        hammer2_xop_head_t      head;
 };
 
+struct hammer2_xop_fsync {
+       hammer2_xop_head_t      head;
+       hammer2_inode_meta_t    meta;
+       hammer2_off_t           osize;
+       u_int                   ipflags;
+       int                     clear_directdata;
+};
+
 struct hammer2_xop_unlinkall {
        hammer2_xop_head_t      head;
+       hammer2_key_t           key_beg;
+       hammer2_key_t           key_end;
 };
 
 struct hammer2_xop_connect {
@@ -927,19 +937,21 @@ typedef struct hammer2_xop_nresolve hammer2_xop_nresolve_t;
 typedef struct hammer2_xop_nlink hammer2_xop_nlink_t;
 typedef struct hammer2_xop_unlink hammer2_xop_unlink_t;
 typedef struct hammer2_xop_nrename hammer2_xop_nrename_t;
-typedef struct hammer2_xop_vfsroot hammer2_xop_vfsroot_t;
+typedef struct hammer2_xop_ipcluster hammer2_xop_ipcluster_t;
 typedef struct hammer2_xop_strategy hammer2_xop_strategy_t;
 typedef struct hammer2_xop_create hammer2_xop_create_t;
 typedef struct hammer2_xop_destroy hammer2_xop_destroy_t;
+typedef struct hammer2_xop_fsync hammer2_xop_fsync_t;
 typedef struct hammer2_xop_unlinkall hammer2_xop_unlinkall_t;
 typedef struct hammer2_xop_scanlhc hammer2_xop_scanlhc_t;
+typedef struct hammer2_xop_scanall hammer2_xop_scanall_t;
 typedef struct hammer2_xop_lookup hammer2_xop_lookup_t;
 typedef struct hammer2_xop_connect hammer2_xop_connect_t;
 typedef struct hammer2_xop_flush hammer2_xop_flush_t;
 
 union hammer2_xop {
        hammer2_xop_head_t      head;
-       hammer2_xop_vfsroot_t   xop_vfsroot;
+       hammer2_xop_ipcluster_t xop_ipcluster;
        hammer2_xop_readdir_t   xop_readdir;
        hammer2_xop_nresolve_t  xop_nresolve;
        hammer2_xop_nlink_t     xop_nlink;
@@ -948,8 +960,10 @@ union hammer2_xop {
        hammer2_xop_strategy_t  xop_strategy;
        hammer2_xop_create_t    xop_create;
        hammer2_xop_destroy_t   xop_destroy;
+       hammer2_xop_fsync_t     xop_fsync;
        hammer2_xop_unlinkall_t xop_unlinkall;
        hammer2_xop_scanlhc_t   xop_scanlhc;
+       hammer2_xop_scanall_t   xop_scanall;
        hammer2_xop_lookup_t    xop_lookup;
        hammer2_xop_flush_t     xop_flush;
        hammer2_xop_connect_t   xop_connect;
@@ -1083,6 +1097,7 @@ struct hammer2_pfs {
        hammer2_inode_t         *ihidden;       /* PFS hidden directory */
        uint8_t                 pfs_types[HAMMER2_MAXCLUSTER];
        char                    *pfs_names[HAMMER2_MAXCLUSTER];
+       hammer2_dev_t           *pfs_hmps[HAMMER2_MAXCLUSTER];
        hammer2_trans_t         trans;
        struct lock             lock;           /* PFS lock for certain ops */
        struct netexport        export;         /* nfs export */
@@ -1104,7 +1119,7 @@ struct hammer2_pfs {
        struct spinlock         list_spin;
        struct h2_unlk_list     unlinkq;        /* last-close unlink */
        hammer2_thread_t        sync_thrs[HAMMER2_MAXCLUSTER];
-       uint32_t                flags;          /* cached cluster flags */
+       uint32_t                cluster_flags;  /* cached cluster flags */
        hammer2_xop_group_t     xop_groups[HAMMER2_XOPGROUPS];
 };
 
@@ -1239,6 +1254,8 @@ void hammer2_inode_lock(hammer2_inode_t *ip, int how);
 void hammer2_inode_unlock(hammer2_inode_t *ip);
 hammer2_cluster_t *hammer2_inode_cluster(hammer2_inode_t *ip, int how);
 hammer2_chain_t *hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how);
+hammer2_chain_t *hammer2_inode_chain_and_parent(hammer2_inode_t *ip,
+                       int clindex, hammer2_chain_t **parentp, int how);
 hammer2_mtx_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
 void hammer2_inode_lock_temp_restore(hammer2_inode_t *ip,
                        hammer2_mtx_state_t ostate);
@@ -1290,15 +1307,13 @@ hammer2_inode_t *hammer2_inode_create(hammer2_inode_t *dip,
                        const uint8_t *name, size_t name_len, hammer2_key_t lhc,
                        hammer2_key_t inum, uint8_t type, uint8_t target_type,
                        int flags, int *errorp);
-int hammer2_inode_connect_simple(hammer2_inode_t *dip, hammer2_inode_t *ip,
+int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
                        const char *name, size_t name_len,
                        hammer2_key_t lhc);
 hammer2_inode_t *hammer2_inode_common_parent(hammer2_inode_t *fdip,
                        hammer2_inode_t *tdip);
-void hammer2_inode_fsync(hammer2_inode_t *ip, hammer2_cluster_t *cparent);
+void hammer2_inode_fsync(hammer2_inode_t *ip);
 int hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen);
-int hammer2_parent_find(hammer2_cluster_t **cparentp,
-                       hammer2_cluster_t *cluster);
 void hammer2_inode_install_hidden(hammer2_pfs_t *pmp);
 
 /*
@@ -1317,6 +1332,7 @@ void hammer2_chain_lock(hammer2_chain_t *chain, int how);
 void hammer2_chain_load_data(hammer2_chain_t *chain);
 const hammer2_media_data_t *hammer2_chain_rdata(hammer2_chain_t *chain);
 hammer2_media_data_t *hammer2_chain_wdata(hammer2_chain_t *chain);
+int hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp);
 
 int hammer2_chain_hardlink_find(hammer2_inode_t *dip,
                                hammer2_chain_t **parentp,
@@ -1356,8 +1372,6 @@ int hammer2_chain_create(hammer2_chain_t **parentp,
 void hammer2_chain_rename(hammer2_blockref_t *bref,
                                hammer2_chain_t **parentp,
                                hammer2_chain_t *chain, int flags);
-int hammer2_chain_snapshot(hammer2_chain_t **chainp,
-                               hammer2_ioc_pfs_t *pmp);
 void hammer2_chain_delete(hammer2_chain_t *parent,
                                hammer2_chain_t *chain, int flags);
 void hammer2_flush(hammer2_chain_t *chain, int istop);
@@ -1438,6 +1452,8 @@ void hammer2_xop_reinit(hammer2_xop_head_t *xop);
 void hammer2_xop_helper_create(hammer2_pfs_t *pmp);
 void hammer2_xop_helper_cleanup(hammer2_pfs_t *pmp);
 void hammer2_xop_start(hammer2_xop_head_t *xop, hammer2_xop_func_t func);
+void hammer2_xop_start_except(hammer2_xop_head_t *xop, hammer2_xop_func_t func,
+                               int notidx);
 int hammer2_xop_collect(hammer2_xop_head_t *xop, int flags);
 void hammer2_xop_retire(hammer2_xop_head_t *xop, uint32_t mask);
 int hammer2_xop_active(hammer2_xop_head_t *xop);
@@ -1447,16 +1463,18 @@ int hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain,
 /*
  * XOP backends in hammer2_xops.c
  */
-void hammer2_xop_vfsroot(hammer2_xop_t *xop, int clidx);
+void hammer2_xop_ipcluster(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_readdir(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_nresolve(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_unlink(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_nrename(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_nlink(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_scanlhc(hammer2_xop_t *xop, int clidx);
+void hammer2_xop_scanall(hammer2_xop_t *xop, int clidx);
 void hammer2_xop_lookup(hammer2_xop_t *xop, int clidx);
 void hammer2_inode_xop_create(hammer2_xop_t *xop, int clidx);
 void hammer2_inode_xop_destroy(hammer2_xop_t *xop, int clidx);
+void hammer2_inode_xop_fsync(hammer2_xop_t *xop, int clidx);
 void hammer2_inode_xop_unlinkall(hammer2_xop_t *xop, int clidx);
 void hammer2_inode_xop_connect(hammer2_xop_t *xop, int clidx);
 void hammer2_inode_xop_flush(hammer2_xop_t *xop, int clidx);
@@ -1470,7 +1488,6 @@ int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
 /*
  * hammer2_vfsops.c
  */
-void hammer2_clusterctl_wakeup(kdmsg_iocom_t *iocom);
 void hammer2_volconf_update(hammer2_dev_t *hmp, int index);
 void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx);
 int hammer2_vfs_sync(struct mount *mp, int waitflags);
@@ -1494,8 +1511,6 @@ void hammer2_freemap_adjust(hammer2_dev_t *hmp,
  */
 uint8_t hammer2_cluster_type(hammer2_cluster_t *cluster);
 const hammer2_media_data_t *hammer2_cluster_rdata(hammer2_cluster_t *cluster);
-const hammer2_media_data_t *hammer2_cluster_rdata_bytes(
-                               hammer2_cluster_t *cluster, size_t *bytesp);
 hammer2_media_data_t *hammer2_cluster_wdata(hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_from_chain(hammer2_chain_t *chain);
 void hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref);
@@ -1504,41 +1519,12 @@ hammer2_cluster_t *hammer2_cluster_alloc(hammer2_pfs_t *pmp,
 void hammer2_cluster_ref(hammer2_cluster_t *cluster);
 void hammer2_cluster_drop(hammer2_cluster_t *cluster);
 void hammer2_cluster_lock(hammer2_cluster_t *cluster, int how);
-void hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how);
 int hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t lokey,
                        int flags);
 void hammer2_cluster_resolve(hammer2_cluster_t *cluster);
 void hammer2_cluster_forcegood(hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_copy(hammer2_cluster_t *ocluster);
 void hammer2_cluster_unlock(hammer2_cluster_t *cluster);
-void hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx);
-void hammer2_cluster_modify(hammer2_cluster_t *cluster,
-                       int flags);
-hammer2_inode_data_t *hammer2_cluster_modify_ip(hammer2_inode_t *ip,
-                       hammer2_cluster_t *cluster, int flags);
-void hammer2_cluster_modsync(hammer2_cluster_t *cluster);
-hammer2_cluster_t *hammer2_cluster_lookup_init(hammer2_cluster_t *cparent,
-                       int flags);
-void hammer2_cluster_lookup_done(hammer2_cluster_t *cparent);
-hammer2_cluster_t *hammer2_cluster_lookup(hammer2_cluster_t *cparent,
-                       hammer2_key_t *key_nextp,
-                       hammer2_key_t key_beg, hammer2_key_t key_end,
-                       int flags);
-hammer2_cluster_t *hammer2_cluster_next(hammer2_cluster_t *cparent,
-                       hammer2_cluster_t *cluster,
-                       hammer2_key_t *key_nextp,
-                       hammer2_key_t key_beg, hammer2_key_t key_end,
-                       int flags);
-void hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
-                       hammer2_cluster_t *cluster,
-                       hammer2_key_t *key_nextp,
-                       hammer2_key_t key_beg,
-                       hammer2_key_t key_end,
-                       int i, int flags);
-void hammer2_cluster_delete(hammer2_cluster_t *pcluster,
-                       hammer2_cluster_t *cluster, int flags);
-int hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
-                       hammer2_ioc_pfs_t *pmp);
 
 int hammer2_bulk_scan(hammer2_chain_t *parent,
                        int (*func)(hammer2_chain_t *chain, void *info),
index 4600df5..9b5a42a 100644 (file)
@@ -3023,10 +3023,8 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                 *          inode stats (and thus asserting if there is no
                 *          chain->data loaded).
                 */
-               hammer2_chain_delete(parent, chain,
-                                    HAMMER2_DELETE_NOSTATS);
-               hammer2_chain_rename(NULL, &ichain, chain,
-                                    HAMMER2_INSERT_NOSTATS);
+               hammer2_chain_delete(parent, chain, 0);
+               hammer2_chain_rename(NULL, &ichain, chain, 0);
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                KKASSERT(parent->refs > 0);
@@ -4123,3 +4121,97 @@ done:
        *chainp = rchain;
        return (rchain ? EINVAL : 0);
 }
+
+/*
+ * Create a snapshot of (chain) as a new directory under the super-root,
+ * labeled pmp->name.  The originating hammer2_inode must be exclusively
+ * locked for safety.  Returns the error from hammer2_inode_create().
+ *
+ * The ioctl code has already synced the filesystem.
+ */
+int
+hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp)
+{
+       hammer2_dev_t *hmp;
+       const hammer2_inode_data_t *ripdata;
+       hammer2_inode_data_t *wipdata;
+       hammer2_chain_t *nchain;
+       hammer2_inode_t *nip;
+       size_t name_len;
+       hammer2_key_t lhc;
+       struct vattr vat;
+#if 0
+       uuid_t opfs_clid;
+#endif
+       int error;
+
+       kprintf("snapshot %s\n", pmp->name);
+
+       name_len = strlen(pmp->name);
+       lhc = hammer2_dirhash(pmp->name, name_len);
+
+       /*
+        * Get the source inode data and device (the clid copy is disabled)
+        */
+       ripdata = &chain->data->ipdata;
+#if 0
+       opfs_clid = ripdata->meta.pfs_clid;
+#endif
+       hmp = chain->hmp;
+
+       /*
+        * Create the snapshot directory under the super-root.
+        *
+        * Set PFS type, generate a unique filesystem id, and generate
+        * a cluster id.  Reusing the source clid for a PFS root is
+        * currently disabled (#if 0 below); a fresh clid is always used.
+        *
+        * Copy the (flushed) blockref array by brute force; using
+        * chain_duplicate() would make the shared core hard to disentangle.
+        *
+        * NOTE(review): lhc computed above is never read; 0 is passed.
+        */
+       VATTR_NULL(&vat);
+       vat.va_type = VDIR;
+       vat.va_mode = 0755;
+       nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
+                                  pmp->name, name_len, 0,
+                                  1, 0, 0,
+                                  HAMMER2_INSERT_PFSROOT, &error);
+
+       if (nip) {
+               hammer2_inode_modify(nip);
+               nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_modify(nchain, 0);
+               wipdata = &nchain->data->ipdata;
+
+               nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
+               nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
+               nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
+               kern_uuidgen(&nip->meta.pfs_fsid, 1);
+
+               /*
+                * Give the snapshot its own private cluster id.  As a
+                * snapshot no further synchronization with the original
+                * cluster will be done.
+                */
+#if 0
+               if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
+                       nip->meta.pfs_clid = opfs_clid;
+               else
+                       kern_uuidgen(&nip->meta.pfs_clid, 1);
+#endif
+               kern_uuidgen(&nip->meta.pfs_clid, 1);
+               nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
+
+               /* XXX hack blockset copy */
+               /* XXX doesn't work with real cluster */
+               wipdata->meta = nip->meta;
+               wipdata->u.blockset = ripdata->u.blockset;
+               hammer2_flush(nchain, 1);
+               hammer2_chain_unlock(nchain);
+               hammer2_chain_drop(nchain);
+               hammer2_inode_unlock(nip);
+       }
+       return (error);
+}
index 5aaff03..9d3c64c 100644 (file)
@@ -55,8 +55,6 @@
  * Cluster operations can be broken down into three pieces:
  *
  * (1) Chain locking and data retrieval.
- *             hammer2_cluster_lock()
- *             hammer2_cluster_parent()
  *
  *     - Most complex functions, quorum management on transaction ids.
  *
  *
  * (3) Modifying Operations
  *             hammer2_cluster_create()
- *             hammer2_cluster_rename()
- *             hammer2_cluster_delete()
- *             hammer2_cluster_modify()
- *             hammer2_cluster_modsync()
  *
  *     - Can usually punt on failures, operation continues unless quorum
  *       is lost.  If quorum is lost, must wait for resynchronization
@@ -257,7 +251,7 @@ hammer2_cluster_drop(hammer2_cluster_t *cluster)
  *         necessarily match.
  */
 void
-hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
+hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
 {
        hammer2_chain_t *chain;
        int i;
@@ -275,8 +269,6 @@ hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
         * Lock chains and resolve state.
         */
        for (i = 0; i < cluster->nchains; ++i) {
-               if (i == idx)
-                       continue;
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
@@ -284,12 +276,6 @@ hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how)
        }
 }
 
-void
-hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
-{
-       hammer2_cluster_lock_except(cluster, -1, how);
-}
-
 /*
  * Calculate the clustering state for the cluster and set its focus.
  * This routine must be called with care.  For example, it should not
@@ -677,10 +663,6 @@ skip4:
         * Determine if the cluster was successfully locked for the
         * requested operation and generate an error code.  The cluster
         * will not be locked (or ref'd) if an error is returned.
-        *
-        * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
-        * to determine if reading or writing is possible.  If writing, the
-        * cluster still requires a call to hammer2_cluster_modify() first.
         */
        atomic_set_int(&cluster->flags, nflags);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
@@ -1063,10 +1045,6 @@ skip4:
         * Determine if the cluster was successfully locked for the
         * requested operation and generate an error code.  The cluster
         * will not be locked (or ref'd) if an error is returned.
-        *
-        * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
-        * to determine if reading or writing is possible.  If writing, the
-        * cluster still requires a call to hammer2_cluster_modify() first.
         */
        atomic_set_int(&cluster->flags, nflags);
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
@@ -1138,7 +1116,7 @@ hammer2_cluster_copy(hammer2_cluster_t *ocluster)
  * Unlock a cluster.  Refcount and focus is maintained.
  */
 void
-hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
+hammer2_cluster_unlock(hammer2_cluster_t *cluster)
 {
        hammer2_chain_t *chain;
        int i;
@@ -1152,620 +1130,12 @@ hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx)
        atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);
 
        for (i = 0; i < cluster->nchains; ++i) {
-               if (i == idx)
-                       continue;
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_unlock(chain);
        }
 }
 
-void
-hammer2_cluster_unlock(hammer2_cluster_t *cluster)
-{
-       hammer2_cluster_unlock_except(cluster, -1);
-}
-
-/*
- * Set an inode's cluster modified, marking the related chains RW and
- * duplicating them if necessary.
- *
- * The passed-in chain is a localized copy of the chain previously acquired
- * when the inode was locked (and possilby replaced in the mean time), and
- * must also be updated.  In fact, we update it first and then synchronize
- * the inode's cluster cache.
- */
-hammer2_inode_data_t *
-hammer2_cluster_modify_ip(hammer2_inode_t *ip,
-                         hammer2_cluster_t *cluster, int flags)
-{
-       hammer2_inode_modify(ip);
-       hammer2_cluster_modify(cluster, flags);
-       hammer2_inode_repoint(ip, NULL, cluster);
-       return (&hammer2_cluster_wdata(cluster)->ipdata);
-}
-
-/*
- * Adjust the cluster's chains to allow modification and adjust the
- * focus.  Data will be accessible on return.
- *
- * If our focused master errors on modify, re-resolve the cluster to
- * try to select a different master.
- */
-void
-hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags)
-{
-       hammer2_chain_t *chain;
-       int resolve_again;
-       int i;
-
-       resolve_again = 0;
-       for (i = 0; i < cluster->nchains; ++i) {
-               if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
-                       cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
-                       continue;
-               }
-               chain = cluster->array[i].chain;
-               if (chain == NULL)
-                       continue;
-               if (chain->error)
-                       continue;
-               hammer2_chain_modify(chain, flags);
-               if (cluster->focus == chain && chain->error) {
-                       cluster->error = chain->error;
-                       resolve_again = 1;
-               }
-       }
-       if (resolve_again)
-               hammer2_cluster_resolve(cluster);
-}
-
-/*
- * Synchronize modifications from the focus to other chains in a cluster.
- * Convenient because nominal API users can just modify the contents of the
- * focus (at least for non-blockref data).
- *
- * Nominal front-end operations only edit non-block-table data in a single
- * chain.  This code copies such modifications to the other chains in the
- * cluster.  Blocktable modifications are handled on a chain-by-chain basis
- * by both the frontend and the backend and will explode in fireworks if
- * blindly copied.
- */
-void
-hammer2_cluster_modsync(hammer2_cluster_t *cluster)
-{
-       hammer2_chain_t *focus;
-       hammer2_chain_t *scan;
-       const hammer2_inode_data_t *ripdata;
-       hammer2_inode_data_t *wipdata;
-       int i;
-
-       focus = cluster->focus;
-       KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);
-
-       for (i = 0; i < cluster->nchains; ++i) {
-               if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
-                       continue;
-               scan = cluster->array[i].chain;
-               if (scan == NULL || scan == focus)
-                       continue;
-               if (scan->error)
-                       continue;
-               KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
-               KKASSERT(focus->bytes == scan->bytes &&
-                        focus->bref.type == scan->bref.type);
-               switch(focus->bref.type) {
-               case HAMMER2_BREF_TYPE_INODE:
-                       ripdata = &focus->data->ipdata;
-                       wipdata = &scan->data->ipdata;
-                       if ((ripdata->meta.op_flags &
-                           HAMMER2_OPFLAG_DIRECTDATA) == 0) {
-                               bcopy(ripdata, wipdata,
-                                     offsetof(hammer2_inode_data_t, u));
-                               break;
-                       }
-                       /* fall through to full copy */
-               case HAMMER2_BREF_TYPE_DATA:
-                       bcopy(focus->data, scan->data, focus->bytes);
-                       break;
-               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
-               case HAMMER2_BREF_TYPE_FREEMAP:
-               case HAMMER2_BREF_TYPE_VOLUME:
-                       panic("hammer2_cluster_modsync: illegal node type");
-                       /* NOT REACHED */
-                       break;
-               default:
-                       panic("hammer2_cluster_modsync: unknown node type");
-                       break;
-               }
-       }
-}
-
-/*
- * Lookup initialization/completion API.  Returns a locked, fully resolved
- * cluster with one ref.
- */
-hammer2_cluster_t *
-hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
-{
-       hammer2_cluster_t *cluster;
-
-       cluster = hammer2_cluster_copy(cparent);
-       if (flags & HAMMER2_LOOKUP_SHARED) {
-               hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
-                                             HAMMER2_RESOLVE_SHARED);
-       } else {
-               hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
-       }
-       hammer2_cluster_resolve(cluster);
-
-       return (cluster);
-}
-
-void
-hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
-{
-       if (cparent) {
-               hammer2_cluster_unlock(cparent);
-               hammer2_cluster_drop(cparent);
-       }
-}
-
-/*
- * Locate first match or overlap under parent, return a new, locked, resolved
- * cluster with one ref.
- *
- * Must never be called with HAMMER2_LOOKUP_MATCHIND.
- */
-hammer2_cluster_t *
-hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
-                    hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
-{
-       hammer2_pfs_t *pmp;
-       hammer2_cluster_t *cluster;
-       hammer2_chain_t *chain;
-       hammer2_key_t key_accum;
-       hammer2_key_t key_next;
-       int null_count;
-       int rflags;
-       int i;
-
-       KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
-
-       pmp = cparent->pmp;                             /* can be NULL */
-       key_accum = *key_nextp;
-       null_count = 0;
-       if (flags & HAMMER2_LOOKUP_SHARED)
-               rflags = HAMMER2_RESOLVE_SHARED;
-       else
-               rflags = 0;
-
-       cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
-       cluster->pmp = pmp;                             /* can be NULL */
-       cluster->refs = 1;
-       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-               cluster->flags |= HAMMER2_CLUSTER_LOCKED;
-
-       /*
-        * Iterating earlier cluster elements with later elements still
-        * locked is a problem, so we have to unlock the parent and then
-        * re-lock as we go.
-        */
-       hammer2_cluster_unlock(cparent);
-       cparent->flags |= HAMMER2_CLUSTER_LOCKED;
-
-       /*
-        * Pass-1, issue lookups.
-        */
-       for (i = 0; i < cparent->nchains; ++i) {
-               cluster->array[i].flags = cparent->array[i].flags;
-               key_next = *key_nextp;
-
-               /*
-                * Always relock the parent as we go.
-                */
-               if (cparent->array[i].chain) {
-                       hammer2_chain_lock(cparent->array[i].chain, rflags);
-               }
-
-               /*
-                * Nothing to base the lookup, or parent was not synchronized.
-                */
-               if (cparent->array[i].chain == NULL ||
-                   (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) {
-                       ++null_count;
-                       continue;
-               }
-
-               chain = hammer2_chain_lookup(&cparent->array[i].chain,
-                                            &key_next,
-                                            key_beg, key_end,
-                                            &cparent->array[i].cache_index,
-                                            flags);
-               cluster->array[i].chain = chain;
-               if (chain == NULL) {
-                       ++null_count;
-               }
-               if (key_accum > key_next)
-                       key_accum = key_next;
-       }
-
-       /*
-        * Cleanup
-        */
-       cluster->nchains = i;
-       *key_nextp = key_accum;
-
-       /*
-        * The cluster must be resolved, out of sync elements may be present.
-        *
-        * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
-        */
-       if (null_count != i)
-               hammer2_cluster_resolve(cluster);
-       if (null_count == i ||
-           (cluster->focus == NULL &&
-            (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
-               if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-                       hammer2_cluster_unlock(cluster);
-               hammer2_cluster_drop(cluster);
-               cluster = NULL;
-       }
-
-       return (cluster);
-}
-
-/*
- * Locate next match or overlap under parent, replace the passed-in cluster.
- * The returned cluster is a new, locked, resolved cluster with one ref.
- *
- * Must never be called with HAMMER2_LOOKUP_MATCHIND.
- */
-hammer2_cluster_t *
-hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                    hammer2_key_t *key_nextp,
-                    hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
-{
-       hammer2_chain_t *ochain;
-       hammer2_chain_t *nchain;
-       hammer2_key_t key_accum;
-       hammer2_key_t key_next;
-       int parent_index;
-       int cluster_index;
-       int null_count;
-       int rflags;
-       int i;
-
-       KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);
-
-       key_accum = *key_nextp;
-       null_count = 0;
-       parent_index = cparent->focus_index;    /* save prior focus */
-       cluster_index = cluster->focus_index;
-       if (flags & HAMMER2_LOOKUP_SHARED)
-               rflags = HAMMER2_RESOLVE_SHARED;
-       else
-               rflags = 0;
-
-       cluster->focus = NULL;          /* XXX needed any more? */
-       /*cparent->focus = NULL;*/
-       cluster->focus_index = 0;       /* XXX needed any more? */
-       /*cparent->focus_index = 0;*/
-
-       cluster->ddflag = 0;
-
-       /*
-        * The parent is always locked on entry, the iterator may be locked
-        * depending on flags.
-        *
-        * We must temporarily unlock the passed-in clusters to avoid a
-        * deadlock between elements of the cluster with other threads.
-        * We will fixup the lock in the loop.
-        *
-        * Note that this will clear the focus.
-        *
-        * Reflag the clusters as locked, because we will relock them
-        * as we go.
-        */
-       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
-               hammer2_cluster_unlock(cluster);
-               cluster->flags |= HAMMER2_CLUSTER_LOCKED;
-       }
-       hammer2_cluster_unlock(cparent);
-       cparent->flags |= HAMMER2_CLUSTER_LOCKED;
-
-       for (i = 0; i < cparent->nchains; ++i) {
-               key_next = *key_nextp;
-               ochain = cluster->array[i].chain;
-
-               /*
-                * Always relock the parent as we go.
-                */
-               if (cparent->array[i].chain)
-                       hammer2_chain_lock(cparent->array[i].chain, rflags);
-
-               /*
-                * Nothing to iterate from.  These cases can occur under
-                * normal operations.  For example, during synchronization
-                * a slave might reach the end of its scan while records
-                * are still left on the master(s).
-                */
-               if (ochain == NULL) {
-                       ++null_count;
-                       continue;
-               }
-               if (cparent->array[i].chain == NULL ||
-                   (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
-                   (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
-                       /* ochain has not yet been relocked */
-                       hammer2_chain_drop(ochain);
-                       cluster->array[i].chain = NULL;
-                       ++null_count;
-                       continue;
-               }
-
-               /*
-                * Relock the child if necessary.  Parent and child will then
-                * be locked as expected by hammer2_chain_next() and flags.
-                */
-               if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-                       hammer2_chain_lock(ochain, rflags);
-               nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
-                                           &key_next, key_beg, key_end,
-                                           &cparent->array[i].cache_index,
-                                           flags);
-               /* ochain now invalid but can still be used for focus check */
-               if (parent_index == i) {
-                       cparent->focus_index = i;
-                       cparent->focus = cparent->array[i].chain;
-               }
-
-               cluster->array[i].chain = nchain;
-               if (nchain == NULL) {
-                       ++null_count;
-               }
-               if (key_accum > key_next)
-                       key_accum = key_next;
-       }
-
-       /*
-        * Cleanup
-        */
-       cluster->nchains = i;
-       *key_nextp = key_accum;
-
-       /*
-        * The cluster must be resolved, out of sync elements may be present.
-        *
-        * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
-        */
-       if (null_count != i)
-               hammer2_cluster_resolve(cluster);
-       if (null_count == i ||
-           (cluster->focus == NULL &&
-            (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
-               if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-                       hammer2_cluster_unlock(cluster);
-               hammer2_cluster_drop(cluster);
-               cluster = NULL;
-       }
-       return(cluster);
-}
-
-/*
- * Advance just one chain in the cluster and recalculate the invalid bit.
- * The cluster index is allowed to be flagged invalid on input and is
- * recalculated on return.
- *
- * (used during synchronization to advance past a chain being deleted).
- *
- * The chain being advanced must not be the focus and the clusters in
- * question must have already passed normal cluster_lookup/cluster_next
- * checks.
- *
- * The cluster always remains intact on return, so void function.
- */
-void
-hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
-                                 hammer2_cluster_t *cluster,
-                                 hammer2_key_t *key_nextp,
-                                 hammer2_key_t key_beg,
-                                 hammer2_key_t key_end,
-                                 int i, int flags)
-{
-       hammer2_chain_t *ochain;
-       hammer2_chain_t *nchain;
-       hammer2_chain_t *focus;
-       hammer2_key_t key_accum;
-       hammer2_key_t key_next;
-       int ddflag;
-
-       key_accum = *key_nextp;
-       key_next = *key_nextp;
-       ochain = cluster->array[i].chain;
-       if (ochain == NULL)
-               goto done;
-       KKASSERT(ochain != cluster->focus);
-
-       nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
-                                   &key_next, key_beg, key_end,
-                                   &cparent->array[i].cache_index,
-                                   flags);
-       /* ochain now invalid */
-       if (cparent->focus_index == i)
-               cparent->focus = cparent->array[i].chain;
-
-       /*
-        * Install nchain.  Note that nchain can be NULL, and can also
-        * be in an unlocked state depending on flags.
-        */
-       cluster->array[i].chain = nchain;
-       cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
-
-       if (key_accum > key_next)
-               key_accum = key_next;
-
-       focus = cluster->focus;
-       if (focus == NULL)
-               goto done;
-       if (nchain == NULL)
-               goto done;
-#if 0
-       if (nchain == focus)    /* ASSERTED NOT TRUE */
-               ...
-#endif
-       ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
-       if (nchain->bref.type != focus->bref.type ||
-           nchain->bref.key != focus->bref.key ||
-           nchain->bref.keybits != focus->bref.keybits ||
-           nchain->bref.modify_tid != focus->bref.modify_tid ||
-           nchain->bytes != focus->bytes ||
-           ddflag != cluster->ddflag) {
-               cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
-       }
-
-done:
-       *key_nextp = key_accum;
-#if 0
-       /*
-        * For now don't re-resolve cluster->flags.
-        */
-       hammer2_cluster_resolve(cluster);
-#endif
-}
-
-/*
- * Mark a cluster deleted
- */
-void
-hammer2_cluster_delete(hammer2_cluster_t *cparent,
-                      hammer2_cluster_t *cluster, int flags)
-{
-       hammer2_chain_t *chain;
-       hammer2_chain_t *parent;
-       int i;
-
-       if (cparent == NULL) {
-               kprintf("cparent is NULL\n");
-               return;
-       }
-
-       for (i = 0; i < cluster->nchains; ++i) {
-               if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
-                       cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
-                       continue;
-               }
-               parent = cparent->array[i].chain;
-               chain = cluster->array[i].chain;
-               if (chain == NULL)
-                       continue;
-               if (chain->parent != parent) {
-                       kprintf("hammer2_cluster_delete: parent "
-                               "mismatch chain=%p parent=%p against=%p\n",
-                               chain, chain->parent, parent);
-               } else {
-                       hammer2_chain_delete(parent, chain, flags);
-               }
-       }
-}
-
-/*
- * Create a snapshot of the specified {parent, ochain} with the specified
- * label.  The originating hammer2_inode must be exclusively locked for
- * safety.
- *
- * The ioctl code has already synced the filesystem.
- */
-int
-hammer2_cluster_snapshot(hammer2_cluster_t *ocluster,
-                      hammer2_ioc_pfs_t *pmp)
-{
-       hammer2_dev_t *hmp;
-       const hammer2_inode_data_t *ripdata;
-       hammer2_inode_data_t *wipdata;
-       hammer2_chain_t *nchain;
-       hammer2_inode_t *nip;
-       size_t name_len;
-       hammer2_key_t lhc;
-       struct vattr vat;
-#if 0
-       uuid_t opfs_clid;
-#endif
-       int error;
-
-       kprintf("snapshot %s\n", pmp->name);
-
-       name_len = strlen(pmp->name);
-       lhc = hammer2_dirhash(pmp->name, name_len);
-
-       /*
-        * Get the clid
-        */
-       ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
-#if 0
-       opfs_clid = ripdata->meta.pfs_clid;
-#endif
-       hmp = ocluster->focus->hmp;     /* XXX find synchronized local disk */
-
-       /*
-        * Create the snapshot directory under the super-root
-        *
-        * Set PFS type, generate a unique filesystem id, and generate
-        * a cluster id.  Use the same clid when snapshotting a PFS root,
-        * which theoretically allows the snapshot to be used as part of
-        * the same cluster (perhaps as a cache).
-        *
-        * Copy the (flushed) blockref array.  Theoretically we could use
-        * chain_duplicate() but it becomes difficult to disentangle
-        * the shared core so for now just brute-force it.
-        */
-       VATTR_NULL(&vat);
-       vat.va_type = VDIR;
-       vat.va_mode = 0755;
-       nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred,
-                                  pmp->name, name_len, 0,
-                                  1, 0, 0,
-                                  HAMMER2_INSERT_PFSROOT, &error);
-
-       if (nip) {
-               hammer2_inode_modify(nip);
-               nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_chain_modify(nchain, 0);
-               wipdata = &nchain->data->ipdata;
-
-               nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
-               nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
-               nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
-               kern_uuidgen(&nip->meta.pfs_fsid, 1);
-
-               /*
-                * Give the snapshot its own private cluster id.  As a
-                * snapshot no further synchronization with the original
-                * cluster will be done.
-                */
-#if 0
-               if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
-                       nip->meta.pfs_clid = opfs_clid;
-               else
-                       kern_uuidgen(&nip->meta.pfs_clid, 1);
-#endif
-               kern_uuidgen(&nip->meta.pfs_clid, 1);
-               nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
-
-               /* XXX hack blockset copy */
-               /* XXX doesn't work with real cluster */
-               KKASSERT(ocluster->nchains == 1);
-               wipdata->meta = nip->meta;
-               wipdata->u.blockset = ripdata->u.blockset;
-               hammer2_flush(nchain, 1);
-               hammer2_chain_unlock(nchain);
-               hammer2_chain_drop(nchain);
-               hammer2_inode_unlock(nip);
-       }
-       return (error);
-}
-
 /************************************************************************
  *                             CLUSTER I/O                             *
  ************************************************************************
@@ -1788,14 +1158,6 @@ hammer2_cluster_rdata(hammer2_cluster_t *cluster)
        return(cluster->focus->data);
 }
 
-const hammer2_media_data_t *
-hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp)
-{
-       KKASSERT(cluster->focus != NULL);
-       *bytesp = cluster->focus->bytes;
-       return(cluster->focus->data);
-}
-
 hammer2_media_data_t *
 hammer2_cluster_wdata(hammer2_cluster_t *cluster)
 {
index 3198593..184101c 100644 (file)
@@ -163,6 +163,53 @@ hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how)
        return chain;
 }
 
+/*
+ * Return the chain at cluster index (clindex) of the inode together with
+ * that chain's parent.  Both are returned referenced and locked according
+ * to (how); the parent is stored in *parentp.
+ *
+ * Lock order must be (parent, chain), so the chain is temporarily
+ * unlocked while the parent lock is acquired and the topology is then
+ * re-validated.  If the inode's slot or the chain's parent changed in
+ * the mean time everything is released and the operation is retried.
+ *
+ * Returns NULL and sets *parentp to NULL if the inode has no chain at
+ * clindex.  *parentp may also be NULL on a non-NULL return if the chain
+ * has no parent.
+ */
+hammer2_chain_t *
+hammer2_inode_chain_and_parent(hammer2_inode_t *ip, int clindex,
+                              hammer2_chain_t **parentp, int how)
+{
+       hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
+
+       for (;;) {
+               hammer2_spin_sh(&ip->cluster_spin);
+               if (clindex >= ip->cluster.nchains)
+                       chain = NULL;
+               else
+                       chain = ip->cluster.array[clindex].chain;
+
+               /*
+                * Nothing at this index, there is no chain to lock and
+                * no parent to resolve.  Do not fall through and
+                * dereference a NULL chain.
+                */
+               if (chain == NULL) {
+                       hammer2_spin_unsh(&ip->cluster_spin);
+                       *parentp = NULL;
+                       return NULL;
+               }
+               hammer2_chain_ref(chain);
+               hammer2_spin_unsh(&ip->cluster_spin);
+               hammer2_chain_lock(chain, how);
+
+               /*
+                * Get parent, lock order must be (parent, chain).
+                *
+                * Defensive: tolerate a chain without a parent instead
+                * of passing NULL to the ref/lock functions.
+                */
+               parent = chain->parent;
+               if (parent) {
+                       hammer2_chain_ref(parent);
+                       hammer2_chain_unlock(chain);
+                       hammer2_chain_lock(parent, how);
+                       hammer2_chain_lock(chain, how);
+               }
+               if (ip->cluster.array[clindex].chain == chain &&
+                   chain->parent == parent) {
+                       break;
+               }
+
+               /*
+                * The slot or the parent linkage changed while the chain
+                * was unlocked.  Release everything and retry.
+                */
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+               if (parent) {
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_drop(parent);
+               }
+       }
+       *parentp = parent;
+
+       return chain;
+}
+
 void
 hammer2_inode_unlock(hammer2_inode_t *ip)
 {
@@ -549,7 +596,6 @@ again:
        if (cluster) {
                nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
                nip->meta = nipdata->meta;
-               hammer2_cluster_bref(cluster, &nip->bref);
                atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
                hammer2_inode_repoint(nip, NULL, cluster);
        } else {
@@ -809,9 +855,9 @@ done2:
  * lhc collisions).
  */
 int
-hammer2_inode_connect_simple(hammer2_inode_t *dip, hammer2_inode_t *ip,
-                            const char *name, size_t name_len,
-                            hammer2_key_t lhc)
+hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
+                     const char *name, size_t name_len,
+                     hammer2_key_t lhc)
 {
        hammer2_xop_scanlhc_t *sxop;
        hammer2_xop_connect_t *xop;
@@ -1107,8 +1153,8 @@ hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen)
         */
        if (isopen) {
                hammer2_inode_lock(pmp->ihidden, 0);
-               error = hammer2_inode_connect_simple(pmp->ihidden, ip,
-                                                    NULL, 0, ip->meta.inum);
+               error = hammer2_inode_connect(pmp->ihidden, ip,
+                                             NULL, 0, ip->meta.inum);
                hammer2_inode_unlock(pmp->ihidden);
        } else {
                error = 0;
@@ -1184,7 +1230,8 @@ hammer2_inode_install_hidden(hammer2_pfs_t *pmp)
 
                hammer2_inode_lock(pmp->ihidden, 0);
                xop = &hammer2_xop_alloc(pmp->ihidden)->xop_unlinkall;
-               xop->head.lkey = 0;
+               xop->key_beg = HAMMER2_KEY_MIN;
+               xop->key_end = HAMMER2_KEY_MAX;
                hammer2_xop_start(&xop->head, hammer2_inode_xop_unlinkall);
 
                while ((error = hammer2_xop_collect(&xop->head, 0)) == 0) {
@@ -1268,102 +1315,46 @@ hammer2_inode_modify(hammer2_inode_t *ip)
  * Synchronize the inode's frontend state with the chain state prior
  * to any explicit flush of the inode or any strategy write call.
  *
- * Called with a locked inode.
+ * Called with a locked inode inside a transaction.
  */
 void
-hammer2_inode_fsync(hammer2_inode_t *ip, hammer2_cluster_t *cparent)
+hammer2_inode_fsync(hammer2_inode_t *ip)
 {
-       int clear_directdata = 0;
-
-       /* temporary hack, allow cparent to be NULL */
-       if (cparent == NULL) {
-               cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_inode_fsync(ip, cparent);
-               hammer2_cluster_unlock(cparent);
-               hammer2_cluster_drop(cparent);
-               return;
-       }
-
-       if ((ip->flags & HAMMER2_INODE_RESIZED) == 0) {
-               /* do nothing */
-       } else if (ip->meta.size < ip->osize) {
-               /*
-                * We must delete any chains beyond the EOF.  The chain
-                * straddling the EOF will be pending in the bioq.
-                */
-               hammer2_cluster_t *dparent;
-               hammer2_cluster_t *cluster;
-               hammer2_key_t lbase;
-               hammer2_key_t key_next;
-
-               lbase = (ip->meta.size + HAMMER2_PBUFMASK64) &
-                       ~HAMMER2_PBUFMASK64;
-               dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
-               cluster = hammer2_cluster_lookup(dparent, &key_next,
-                                                lbase, (hammer2_key_t)-1,
-                                                HAMMER2_LOOKUP_NODATA);
-               while (cluster) {
-                       /*
-                        * Degenerate embedded case, nothing to loop on
-                        */
-                       switch (hammer2_cluster_type(cluster)) {
-                       case HAMMER2_BREF_TYPE_INODE:
-                               hammer2_cluster_unlock(cluster);
-                               hammer2_cluster_drop(cluster);
-                               cluster = NULL;
-                               break;
-                       case HAMMER2_BREF_TYPE_DATA:
-                               hammer2_cluster_delete(dparent, cluster,
-                                                  HAMMER2_DELETE_PERMANENT);
-                               /* fall through */
-                       default:
-                               cluster = hammer2_cluster_next(dparent, cluster,
-                                                  &key_next,
-                                                  key_next, (hammer2_key_t)-1,
-                                                  HAMMER2_LOOKUP_NODATA);
-                               break;
+       if (ip->flags & (HAMMER2_INODE_RESIZED | HAMMER2_INODE_MODIFIED)) {
+               hammer2_xop_fsync_t *xop;
+               int error;
+
+               xop = &hammer2_xop_alloc(ip)->xop_fsync;
+               xop->clear_directdata = 0;
+               if (ip->flags & HAMMER2_INODE_RESIZED) {
+                       if ((ip->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
+                           ip->meta.size > HAMMER2_EMBEDDED_BYTES) {
+                               ip->meta.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
+                               xop->clear_directdata = 1;
                        }
+                       xop->osize = ip->osize;
+               } else {
+                       xop->osize = ip->meta.size;     /* safety */
                }
-               hammer2_cluster_lookup_done(dparent);
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
-               KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
-       } else if (ip->meta.size > ip->osize) {
-               /*
-                * When resizing larger we may not have any direct-data
-                * available.
-                */
-               if ((ip->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
-                   ip->meta.size > HAMMER2_EMBEDDED_BYTES) {
-                       ip->meta.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
-                       clear_directdata = 1;
-               }
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
-               KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
-       } else {
-               /*
-                * RESIZED was set but size didn't change.
-                */
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
-               KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
-       }
+               xop->ipflags = ip->flags;
+               xop->meta = ip->meta;
 
-       /*
-        * Sync inode meta-data
-        */
-       if (ip->flags & HAMMER2_INODE_MODIFIED) {
-               hammer2_inode_data_t *wipdata;
-
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               hammer2_cluster_modify(cparent, 0);
-               hammer2_inode_repoint(ip, NULL, cparent);
-
-               wipdata = &hammer2_cluster_wdata(cparent)->ipdata;
-               wipdata->meta = ip->meta;
-               if (clear_directdata) {
-                       bzero(&wipdata->u.blockset,
-                             sizeof(wipdata->u.blockset));
+               atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED |
+                                            HAMMER2_INODE_MODIFIED);
+               hammer2_xop_start(&xop->head, hammer2_inode_xop_fsync);
+               error = hammer2_xop_collect(&xop->head, 0);
+               hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+               if (error == ENOENT)
+                       error = 0;
+               if (error) {
+                       kprintf("hammer2: unable to fsync inode %p\n", ip);
+                       /*
+                       atomic_set_int(&ip->flags,
+                                      xop->ipflags & (HAMMER2_INODE_RESIZED |
+                                                      HAMMER2_INODE_MODIFIED));
+                       */
+                       /* XXX return error somehow? */
                }
-               hammer2_cluster_modsync(cparent);
        }
 }
 
@@ -1538,7 +1529,7 @@ hammer2_inode_xop_unlinkall(hammer2_xop_t *arg, int clindex)
        parent = hammer2_inode_chain(xop->head.ip, clindex,
                                     HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(&parent, &key_next,
-                                    HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
+                                    xop->key_beg, xop->key_end,
                                     &cache_index,
                                     HAMMER2_LOOKUP_ALWAYS);
        while (chain) {
@@ -1548,7 +1539,7 @@ hammer2_inode_xop_unlinkall(hammer2_xop_t *arg, int clindex)
                                          HAMMER2_RESOLVE_SHARED);
                hammer2_xop_feed(&xop->head, chain, clindex, chain->error);
                chain = hammer2_chain_next(&parent, chain, &key_next,
-                                          key_next, HAMMER2_KEY_MAX,
+                                          key_next, xop->key_end,
                                           &cache_index,
                                           HAMMER2_LOOKUP_ALWAYS |
                                           HAMMER2_LOOKUP_NOUNLOCK);
@@ -1641,3 +1632,87 @@ fail:
                hammer2_chain_drop(chain);
        }
 }
+
+/*
+ * Backend for hammer2_inode_fsync(), executed for cluster index (clindex).
+ *
+ * Using the snapshot of the inode state stored in the xop (ipflags, meta,
+ * osize, clear_directdata):
+ *
+ * - If the inode was resized smaller, permanently delete the data chains
+ *   beyond the new EOF.
+ * - Copy the frontend meta-data into the inode chain, zeroing the blockset
+ *   area first when clear_directdata is set.
+ *
+ * The result is always reported through hammer2_xop_feed() with a NULL
+ * chain and the error code (0 on success, EIO if the inode has no chain
+ * at clindex, or the chain's own error).
+ */
+void
+hammer2_inode_xop_fsync(hammer2_xop_t *arg, int clindex)
+{
+       hammer2_xop_fsync_t *xop = &arg->xop_fsync;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       int error;
+
+       parent = hammer2_inode_chain(xop->head.ip, clindex,
+                                    HAMMER2_RESOLVE_ALWAYS);
+       chain = NULL;
+       if (parent == NULL) {
+               error = EIO;
+               goto done;
+       }
+       if (parent->error) {
+               error = parent->error;
+               goto done;
+       }
+
+       error = 0;
+
+       if ((xop->ipflags & HAMMER2_INODE_RESIZED) == 0) {
+               /* osize must be ignored */
+       } else if (xop->meta.size < xop->osize) {
+               /*
+                * We must delete any chains beyond the EOF.  The chain
+                * straddling the EOF will be pending in the bioq.
+                */
+               hammer2_key_t lbase;
+               hammer2_key_t key_next;
+               int cache_index = -1;
+
+               lbase = (xop->meta.size + HAMMER2_PBUFMASK64) &
+                       ~HAMMER2_PBUFMASK64;
+               chain = hammer2_chain_lookup(&parent, &key_next,
+                                            lbase, HAMMER2_KEY_MAX,
+                                            &cache_index,
+                                            HAMMER2_LOOKUP_NODATA |
+                                            HAMMER2_LOOKUP_NODIRECT);
+               while (chain) {
+                       /*
+                        * Only data chains are expected here.  The
+                        * lookup is issued with NODIRECT, so getting
+                        * an inode back is asserted against.
+                        */
+                       switch (chain->bref.type) {
+                       case HAMMER2_BREF_TYPE_INODE:
+                               KKASSERT(0);
+                               break;
+                       case HAMMER2_BREF_TYPE_DATA:
+                               hammer2_chain_delete(parent, chain,
+                                                    HAMMER2_DELETE_PERMANENT);
+                               break;
+                       default:
+                               /* leave other chain types alone */
+                               break;
+                       }
+                       chain = hammer2_chain_next(&parent, chain, &key_next,
+                                                  key_next, HAMMER2_KEY_MAX,
+                                                  &cache_index,
+                                                  HAMMER2_LOOKUP_NODATA |
+                                                  HAMMER2_LOOKUP_NODIRECT);
+               }
+
+               /*
+                * The lookup/next calls are handed &parent and may have
+                * repointed it at a deeper chain during the scan.  The
+                * meta-data update below must operate on the inode chain,
+                * so re-acquire it if parent is no longer the inode.
+                */
+               if (parent->bref.type != HAMMER2_BREF_TYPE_INODE) {
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_drop(parent);
+                       parent = hammer2_inode_chain(xop->head.ip, clindex,
+                                                    HAMMER2_RESOLVE_ALWAYS);
+                       if (parent == NULL) {
+                               error = EIO;
+                               goto done;
+                       }
+                       if (parent->error) {
+                               error = parent->error;
+                               goto done;
+                       }
+               }
+       }
+
+       /*
+        * Sync the inode meta-data, potentially clear the blockset area
+        * of direct data so it can be used for blockrefs.
+        */
+       hammer2_chain_modify(parent, 0);
+       parent->data->ipdata.meta = xop->meta;
+       if (xop->clear_directdata) {
+               bzero(&parent->data->ipdata.u.blockset,
+                     sizeof(parent->data->ipdata.u.blockset));
+       }
+done:
+       /*
+        * Release whatever is still held, then report the result for
+        * this cluster index to the collector.
+        */
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+       }
+       if (parent) {
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
+       }
+       hammer2_xop_feed(&xop->head, NULL, clindex, error);
+}
+
index 6e5791c..04d9092 100644 (file)
@@ -284,12 +284,13 @@ static void
 hammer2_update_spans(hammer2_dev_t *hmp, kdmsg_state_t *state)
 {
        const hammer2_inode_data_t *ripdata;
-       hammer2_cluster_t *cparent;
-       hammer2_cluster_t *cluster;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
        hammer2_pfs_t *spmp;
        hammer2_key_t key_next;
        kdmsg_msg_t *rmsg;
        size_t name_len;
+       int cache_index = -1;
 
        /*
         * Lookup mount point under the media-localized super-root.
@@ -299,15 +300,19 @@ hammer2_update_spans(hammer2_dev_t *hmp, kdmsg_state_t *state)
         */
        spmp = hmp->spmp;
        hammer2_inode_lock(spmp->iroot, 0);
-       cparent = hammer2_inode_cluster(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
-       cluster = hammer2_cluster_lookup(cparent, &key_next,
-                                        HAMMER2_KEY_MIN,
-                                        HAMMER2_KEY_MAX,
-                                        0);
-       while (cluster) {
-               if (hammer2_cluster_type(cluster) != HAMMER2_BREF_TYPE_INODE)
+
+       parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
+       chain = NULL;
+       if (parent == NULL)
+               goto done;
+       chain = hammer2_chain_lookup(&parent, &key_next,
+                                    HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
+                                    &cache_index,
+                                    0);
+       while (chain) {
+               if (chain->bref.type != HAMMER2_BREF_TYPE_INODE)
                        continue;
-               ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+               ripdata = &chain->data->ipdata;
                kprintf("UPDATE SPANS: %s\n", ripdata->filename);
 
                rmsg = kdmsg_msg_alloc(&hmp->iocom.state0,
@@ -327,16 +332,20 @@ hammer2_update_spans(hammer2_dev_t *hmp, kdmsg_state_t *state)
 
                kdmsg_msg_write(rmsg);
 
-               cluster = hammer2_cluster_next(cparent, cluster,
-                                              &key_next,
-                                              key_next,
-                                              HAMMER2_KEY_MAX,
+               chain = hammer2_chain_next(&parent, chain, &key_next,
+                                              key_next, HAMMER2_KEY_MAX,
+                                              &cache_index,
                                               0);
        }
        hammer2_inode_unlock(spmp->iroot);
-       if (cparent) {
-               hammer2_cluster_unlock(cparent);
-               hammer2_cluster_drop(cparent);
+done:
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+       }
+       if (parent) {
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
        }
 }
 
index 9f95248..b77befe 100644 (file)
@@ -162,10 +162,14 @@ hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data, int fflag,
 static int
 hammer2_ioctl_version_get(hammer2_inode_t *ip, void *data)
 {
-       hammer2_dev_t *hmp = ip->pmp->iroot->cluster.focus->hmp;
        hammer2_ioc_version_t *version = data;
+       hammer2_dev_t *hmp;
 
-       version->version = hmp->voldata.version;
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp)
+               version->version = hmp->voldata.version;
+       else
+               version->version = -1;
        return 0;
 }
 
@@ -201,10 +205,14 @@ hammer2_ioctl_recluster(hammer2_inode_t *ip, void *data)
 static int
 hammer2_ioctl_remote_scan(hammer2_inode_t *ip, void *data)
 {
-       hammer2_dev_t *hmp = ip->pmp->iroot->cluster.focus->hmp;
+       hammer2_dev_t *hmp;
        hammer2_ioc_remote_t *remote = data;
        int copyid = remote->copyid;
 
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
        if (copyid < 0 || copyid >= HAMMER2_COPYID_COUNT)
                return (EINVAL);
 
@@ -239,10 +247,12 @@ hammer2_ioctl_remote_add(hammer2_inode_t *ip, void *data)
        int copyid = remote->copyid;
        int error = 0;
 
+       hmp = pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
        if (copyid >= HAMMER2_COPYID_COUNT)
                return (EINVAL);
 
-       hmp = pmp->iroot->cluster.focus->hmp; /* XXX */
        hammer2_voldata_lock(hmp);
        if (copyid < 0) {
                for (copyid = 1; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
@@ -275,7 +285,9 @@ hammer2_ioctl_remote_del(hammer2_inode_t *ip, void *data)
        int copyid = remote->copyid;
        int error = 0;
 
-       hmp = pmp->iroot->cluster.focus->hmp; /* XXX */
+       hmp = pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
        if (copyid >= HAMMER2_COPYID_COUNT)
                return (EINVAL);
        remote->copy1.path[sizeof(remote->copy1.path) - 1] = 0;
@@ -312,8 +324,9 @@ hammer2_ioctl_remote_rep(hammer2_inode_t *ip, void *data)
        hammer2_dev_t *hmp;
        int copyid = remote->copyid;
 
-       hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
-
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
        if (copyid < 0 || copyid >= HAMMER2_COPYID_COUNT)
                return (EINVAL);
 
@@ -344,7 +357,9 @@ hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data)
        hammer2_dev_t *hmp;
        int copyid = remote->copyid;
 
-       hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
        if (copyid < 0 || copyid >= HAMMER2_COPYID_COUNT)
                return (EINVAL);
 
@@ -382,11 +397,13 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
        int cache_index = -1;
        int error;
 
-       error = 0;
-       hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
-       pfs = data;
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
 
+       pfs = data;
        save_key = pfs->name_key;
+       error = 0;
 
        /*
         * Setup
@@ -417,6 +434,7 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                if (parent == NULL) {
                        hammer2_chain_unlock(chain);
                        hammer2_chain_drop(chain);
+                       chain = NULL;
                        break;
                }
                chain = hammer2_chain_next(&parent, chain, &key_next,
@@ -441,16 +459,20 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                ripdata = NULL; /* safety */
 
                /*
-                * Calculate name_next.
+                * Calculate name_next, if any.
                 */
-               chain = hammer2_chain_next(&parent, chain, &key_next,
-                                           key_next, HAMMER2_KEY_MAX,
-                                           &cache_index,
-                                           HAMMER2_LOOKUP_SHARED);
-               if (chain)
-                       pfs->name_next = chain->bref.key;
-               else
+               if (parent == NULL) {
                        pfs->name_next = (hammer2_key_t)-1;
+               } else {
+                       chain = hammer2_chain_next(&parent, chain, &key_next,
+                                                   key_next, HAMMER2_KEY_MAX,
+                                                   &cache_index,
+                                                   HAMMER2_LOOKUP_SHARED);
+                       if (chain)
+                               pfs->name_next = chain->bref.key;
+                       else
+                               pfs->name_next = (hammer2_key_t)-1;
+               }
        } else {
                pfs->name_next = (hammer2_key_t)-1;
                error = ENOENT;
@@ -493,9 +515,13 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
        int error;
        size_t len;
 
-       error = 0;
-       hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
        pfs = data;
+       error = 0;
+
        hammer2_inode_lock(hmp->spmp->iroot, HAMMER2_RESOLVE_SHARED);
        parent = hammer2_inode_chain(hmp->spmp->iroot, 0,
                                     HAMMER2_RESOLVE_ALWAYS |
@@ -564,7 +590,10 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
        hammer2_inode_t *nip;
        int error;
 
-       hmp = ip->pmp->iroot->cluster.focus->hmp; /* XXX */
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
        pfs = data;
        nip = NULL;
 
@@ -633,15 +662,19 @@ static int
 hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_pfs_t *pfs = data;
-       hammer2_pfs_t *spmp;
+       hammer2_dev_t   *hmp;
+       hammer2_pfs_t   *spmp;
        hammer2_xop_unlink_t *xop;
        hammer2_inode_t *dip;
        int error;
 
        pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure termination */
 
-       /* XXX */
-       spmp = ip->pmp->iroot->cluster.focus->hmp->spmp;
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
+       spmp = hmp->spmp;
        dip = spmp->iroot;
        hammer2_trans_init(spmp, 0);
        hammer2_inode_lock(dip, 0);
@@ -664,7 +697,8 @@ static int
 hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_pfs_t *pfs = data;
-       hammer2_cluster_t *cparent;
+       hammer2_dev_t   *hmp;
+       hammer2_chain_t *chain;
        int error;
 
        if (pfs->name[0] == 0)
@@ -672,15 +706,21 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
        if (pfs->name[sizeof(pfs->name)-1] != 0)
                return(EINVAL);
 
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
        hammer2_vfs_sync(ip->pmp->mp, MNT_WAIT);
 
        hammer2_trans_init(ip->pmp, HAMMER2_TRANS_ISFLUSH);
        hammer2_inode_lock(ip, 0);
-       cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS);
-       error = hammer2_cluster_snapshot(cparent, pfs);
+
+       chain = hammer2_inode_chain(ip, 0, HAMMER2_RESOLVE_ALWAYS);
+       error = hammer2_chain_snapshot(chain, pfs);
+       hammer2_chain_unlock(chain);
+       hammer2_chain_drop(chain);
+
        hammer2_inode_unlock(ip);
-       hammer2_cluster_unlock(cparent);
-       hammer2_cluster_drop(cparent);
        hammer2_trans_done(ip->pmp);
 
        return (error);
@@ -693,20 +733,23 @@ static int
 hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_inode_t *ino;
+       hammer2_chain_t *chain;
        int error;
+       int i;
 
        ino = data;
+       error = 0;
 
        hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
-       if (ip->cluster.focus) {
-               /* XXX */
-               ino->data_count = ip->cluster.focus->bref.data_count;
-               ino->inode_count = ip->cluster.focus->bref.inode_count;
-               error = 0;
-       } else {
-               ino->data_count = -1;
-               ino->inode_count = -1;
-               error = EIO;
+       ino->data_count = 0;
+       ino->inode_count = 0;
+       for (i = 0; i < ip->cluster.nchains; ++i) {
+               if ((chain = ip->cluster.array[i].chain) != NULL) {
+                       if (ino->data_count < chain->bref.data_count)
+                               ino->data_count = chain->bref.data_count;
+                       if (ino->inode_count < chain->bref.inode_count)
+                               ino->inode_count = chain->bref.inode_count;
+               }
        }
        bzero(&ino->ip_data, sizeof(ino->ip_data));
        ino->ip_data.meta = ip->meta;
@@ -785,9 +828,13 @@ int
 hammer2_ioctl_bulkfree_scan(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_bulkfree_t *bfi = data;
-       hammer2_dev_t *hmp = ip->pmp->iroot->cluster.focus->hmp;
+       hammer2_dev_t *hmp;
        int error;
 
+       hmp = ip->pmp->pfs_hmps[0];
+       if (hmp == NULL)
+               return (EINVAL);
+
        /* XXX run local cluster targets only */
        error = hammer2_bulkfree_pass(hmp, bfi);
 
index 5802d84..1cacdb4 100644 (file)
  */
 #include "hammer2.h"
 
+typedef struct hammer2_deferred_ip {
+       struct hammer2_deferred_ip *next;
+       hammer2_inode_t *ip;
+} hammer2_deferred_ip_t;
+
+typedef struct hammer2_deferred_list {
+       hammer2_deferred_ip_t   *base;
+       int                     count;
+} hammer2_deferred_list_t;
+
+
 #define HAMMER2_THREAD_DEBUG 1
 
-static int hammer2_sync_slaves(hammer2_thread_t *thr,
-                       hammer2_cluster_t *cparent, int *errors);
-static void hammer2_update_pfs_status(hammer2_thread_t *thr,
-                       hammer2_cluster_t *cparent);
+static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
+                               hammer2_deferred_list_t *list);
+#if 0
+static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
+                               nerror = hammer2_sync_insert(
+                                               thr, &parent, &chain,
+                                               focus->bref.modify_tid,
+                                               idx, focus);
+#endif
 static int hammer2_sync_insert(hammer2_thread_t *thr,
-                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                       hammer2_tid_t modify_tid,
-                       int i, int *errors);
+                       hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                       hammer2_tid_t modify_tid, int idx,
+                       hammer2_chain_t *focus);
 static int hammer2_sync_destroy(hammer2_thread_t *thr,
-                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                       int i, int *errors);
+                       hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                       int idx);
 static int hammer2_sync_replace(hammer2_thread_t *thr,
-                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                       hammer2_tid_t modify_tid,
-                       int i, int *errors);
+                       hammer2_chain_t *parent, hammer2_chain_t *chain,
+                       hammer2_tid_t modify_tid, int idx,
+                       hammer2_chain_t *focus);
 
 /****************************************************************************
  *                         HAMMER2 THREAD API                              *
@@ -151,6 +167,18 @@ hammer2_thr_unfreeze(hammer2_thread_t *thr)
        lockmgr(&thr->lk, LK_RELEASE);
 }
 
+static
+int
+hammer2_thr_break(hammer2_thread_t *thr)
+{
+       if (thr->flags & (HAMMER2_THREAD_STOP |
+                         HAMMER2_THREAD_REMASTER |
+                         HAMMER2_THREAD_FREEZE)) {
+               return 1;
+       }
+       return 0;
+}
+
 /****************************************************************************
  *                         HAMMER2 SYNC THREADS                            *
  ****************************************************************************/
@@ -168,13 +196,13 @@ void
 hammer2_primary_sync_thread(void *arg)
 {
        hammer2_thread_t *thr = arg;
-       hammer2_cluster_t *cparent;
-       hammer2_chain_t *chain;
        hammer2_pfs_t *pmp;
-       int errors[HAMMER2_MAXCLUSTER];
+       hammer2_deferred_list_t list;
+       hammer2_deferred_ip_t *defer;
        int error;
 
        pmp = thr->pmp;
+       bzero(&list, sizeof(list));
 
        lockmgr(&thr->lk, LK_EXCLUSIVE);
        while ((thr->flags & HAMMER2_THREAD_STOP) == 0) {
@@ -205,42 +233,63 @@ hammer2_primary_sync_thread(void *arg)
                /*
                 * Synchronization scan.
                 */
-               hammer2_trans_init(pmp, 0);
-               hammer2_inode_lock(pmp->iroot, 0);
-               cparent = hammer2_inode_cluster(pmp->iroot,
-                                               HAMMER2_RESOLVE_ALWAYS);
-               hammer2_update_pfs_status(thr, cparent);
-               hammer2_inode_unlock(pmp->iroot);
-               bzero(errors, sizeof(errors));
                kprintf("sync_slaves clindex %d\n", thr->clindex);
+               hammer2_trans_init(pmp, 0);
 
-               /*
-                * We are the syncer, not a normal frontend operator,
-                * so force cparent good to prime the scan.
-                */
-               hammer2_cluster_forcegood(cparent);
-               error = hammer2_sync_slaves(thr, cparent, errors);
-               if (error)
-                       kprintf("hammer2_sync_slaves: error %d\n", error);
-               chain = cparent->array[thr->clindex].chain;
-
-               /*
-                * Retain chain for our node and release the cluster.
-                */
-               hammer2_chain_ref(chain);
-               hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_cluster_unlock(cparent);
-               hammer2_cluster_drop(cparent);
+               hammer2_inode_ref(pmp->iroot);
+               for (;;) {
+                       int didbreak = 0;
+                       error = hammer2_sync_slaves(thr, pmp->iroot, &list);
+                       if (error != EAGAIN)
+                               break;
+                       while ((defer = list.base) != NULL) {
+                               hammer2_inode_t *nip;
+
+                               nip = defer->ip;
+                               error = hammer2_sync_slaves(thr, nip, &list);
+                               if (error && error != EAGAIN)
+                                       break;
+                               if (hammer2_thr_break(thr)) {
+                                       didbreak = 1;
+                                       break;
+                               }
+
+                               /*
+                                * If no additional defers occurred we can
+                                * remove this one, otherwise keep it on
+                                * the list and retry once the additional
+                                * defers have completed.
+                                */
+                               if (defer == list.base) {
+                                       list.base = defer->next;
+                                       kfree(defer, M_HAMMER2);
+                                       defer = NULL;   /* safety */
+                                       hammer2_inode_drop(nip);
+                               }
+                       }
 
-               /*
-                * Flush the chain.
-                */
-               hammer2_flush(chain, 1);
-               hammer2_chain_unlock(chain);
-               hammer2_chain_drop(chain);
+                       /*
+                        * If the thread is being remastered, frozen, or
+                        * stopped, clean up any left-over deferrals.
+                        */
+                       if (didbreak) {
+                               kprintf("didbreak\n");
+                               while ((defer = list.base) != NULL) {
+                                       hammer2_inode_drop(defer->ip);
+                                       list.base = defer->next;
+                                       kfree(defer, M_HAMMER2);
+                               }
+                               error = EINPROGRESS;
+                               break;
+                       }
+               }
 
+               hammer2_inode_drop(pmp->iroot);
                hammer2_trans_done(pmp);
 
+               if (error)
+                       kprintf("hammer2_sync_slaves: error %d\n", error);
+
                /*
                 * Wait for event, or 5-second poll.
                 */
@@ -252,21 +301,21 @@ hammer2_primary_sync_thread(void *arg)
        /* thr structure can go invalid after this point */
 }
 
+#if 0
 /*
  * Given a locked cluster created from pmp->iroot, update the PFS's
  * reporting status.
  */
 static
 void
-hammer2_update_pfs_status(hammer2_thread_t *thr, hammer2_cluster_t *cparent)
+hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
 {
        hammer2_pfs_t *pmp = thr->pmp;
-       uint32_t flags;
 
-       flags = cparent->flags & HAMMER2_CLUSTER_ZFLAGS;
-       if (pmp->flags == flags)
+       flags &= HAMMER2_CLUSTER_ZFLAGS;
+       if (pmp->cluster_flags == flags)
                return;
-       pmp->flags = flags;
+       pmp->cluster_flags = flags;
 
        kprintf("pfs %p", pmp);
        if (flags & HAMMER2_CLUSTER_MSYNCED)
@@ -295,7 +344,9 @@ hammer2_update_pfs_status(hammer2_thread_t *thr, hammer2_cluster_t *cparent)
                kprintf(" no-slaves-visible");
        kprintf("\n");
 }
+#endif
 
+#if 0
 static
 void
 dumpcluster(const char *label,
@@ -334,396 +385,332 @@ dumpcluster(const char *label,
                kprintf("\n");
        }
 }
+#endif
 
 /*
- * TODO - have cparent use a shared lock normally instead of exclusive,
- *       (needs to be upgraded for slave adjustments).
+ * Each out of sync node sync-thread must issue an all-nodes XOP scan of
+ * the inode.  This creates a multiplication effect since the XOP scan itself
+ * issues to all nodes.  However, this is the only way we can safely
+ * synchronize nodes which might have disparate I/O bandwidths and the only
+ * way we can safely deal with stalled nodes.
  */
 static
 int
-hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_cluster_t *cparent,
-                   int *errors)
+hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
+                   hammer2_deferred_list_t *list)
 {
-       hammer2_pfs_t *pmp;
-       hammer2_cluster_t *cluster;
-       hammer2_cluster_t *scluster;
-       hammer2_chain_t *focus;
+       hammer2_xop_scanall_t *xop;
+       hammer2_chain_t *parent;
        hammer2_chain_t *chain;
+       hammer2_pfs_t *pmp;
        hammer2_key_t key_next;
+       int cache_index = -1;
+       int needrescan;
+       int didwork;
        int error;
        int nerror;
        int idx;
        int n;
-       int nowork;
-       int dorecursion;
 
-       pmp = thr->pmp;
+       pmp = ip->pmp;
        idx = thr->clindex;     /* cluster node we are responsible for */
+       needrescan = 0;
+       didwork = 0;
 
+#if 0
        /*
         * Nothing to do if all slaves are synchronized.
         * Nothing to do if cluster not authoritatively readable.
         */
-       if (pmp->flags & HAMMER2_CLUSTER_SSYNCED)
+       if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
                return(0);
-       if ((pmp->flags & HAMMER2_CLUSTER_RDHARD) == 0)
+       if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
                return(HAMMER2_ERROR_INCOMPLETE);
+#endif
 
        error = 0;
 
        /*
-        * XXX snapshot the source to provide a stable source to copy.
+        * The inode is left unlocked during the scan.  Issue a XOP
+        * that does *not* include our cluster index to iterate
+        * properly synchronized elements and resolve our cluster index
+        * against it.
         */
-
-       /*
-        * Update all local slaves (remote slaves are handled by the sync
-        * threads on their respective hosts).
-        *
-        * Do a full topology scan, insert/delete elements on slaves as
-        * needed.  cparent must be ref'd so we can unlock and relock it
-        * on the recursion.
-        *
-        * ALLNODES - Allows clusters with a NULL focus to be returned if
-        *            elements remain on other nodes.
-        */
-       hammer2_cluster_ref(cparent);
-       cluster = hammer2_cluster_lookup(cparent, &key_next,
-                                        HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
-                                        HAMMER2_LOOKUP_NODATA |
-                                        HAMMER2_LOOKUP_NOLOCK |
-                                        HAMMER2_LOOKUP_NODIRECT |
-                                        HAMMER2_LOOKUP_ALLNODES);
-       dumpcluster("lookup", cparent, cluster);
-
-       /*
-        * Scan elements
-        */
-       while (cluster) {
+       hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
+       xop = &hammer2_xop_alloc(ip)->xop_scanall;
+       xop->key_beg = HAMMER2_KEY_MIN;
+       xop->key_end = HAMMER2_KEY_MAX;
+       hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
+       parent = hammer2_inode_chain(ip, idx,
+                                    HAMMER2_RESOLVE_ALWAYS |
+                                    HAMMER2_RESOLVE_SHARED);
+
+       hammer2_inode_unlock(ip);
+
+       chain = hammer2_chain_lookup(&parent, &key_next,
+                                    HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
+                                    &cache_index,
+                                    HAMMER2_LOOKUP_SHARED |
+                                    HAMMER2_LOOKUP_NODIRECT |
+                                    HAMMER2_LOOKUP_NODATA);
+       error = hammer2_xop_collect(&xop->head, 0);
+
+       for (;;) {
                /*
-                * nowork is adjusted during the loop,
-                * dorecursion is calculated here.
+                * We are done if our scan is done and the XOP scan is done.
+                * We are done if the XOP scan failed (that is, we don't
+                * have authoritative data to synchronize with).
                 */
-               nowork = 1;
-               focus = cluster->focus;
-               if (focus && focus->bref.type == HAMMER2_BREF_TYPE_INODE)
-                       dorecursion = 1;
-               else
-                       dorecursion = 0;
-
-               if (idx == 3 && (hammer2_debug & 1) && focus)
-                       kprintf("scan3 focus %d.%016jx %d.%016jx\n",
-                           (cparent ? cparent->focus->bref.type : 0xFF),
-                           (cparent ? cparent->focus->bref.key : (uintmax_t)-1LLU),
-                           focus->bref.type, focus->bref.key);
-repeat1:
-               /*
-                * Synchronize chains to focus
-                */
-               if (idx >= cluster->nchains)
-                       goto skip1;
-               chain = cluster->array[idx].chain;
-               if (idx == 3 && (hammer2_debug & 1) && chain)
-                       kprintf("scan3 slave %d.%016jx %d.%016jx\n",
-                           ((cparent && cparent->array[idx].chain) ? cparent->array[idx].chain->bref.type : 0xFF),
-                           ((cparent && cparent->array[idx].chain) ? cparent->array[idx].chain->bref.key : (uintmax_t)-1LLU),
-                           cluster->array[idx].chain->bref.type,
-                           cluster->array[idx].chain->bref.key);
-               if (idx == 3 && (hammer2_debug & 1) && chain == NULL)
-                       kprintf("scan3 slave %d.%16jx NULL\n",
-                           ((cparent && cparent->array[idx].chain) ? cparent->array[idx].chain->bref.type : 0xFF),
-                           ((cparent && cparent->array[idx].chain) ? cparent->array[idx].chain->bref.key : (uintmax_t)-1LLU)
-                       );
+               int advance_local = 0;
+               int advance_xop = 0;
+               int dodefer = 0;
+               hammer2_chain_t *focus;
 
-               /*
-                * Disable recursion for this index and loop up
-                * if a chain error is detected.
-                *
-                * A NULL chain is ok, it simply indicates that
-                * the slave reached the end of its scan, but we
-                * might have stuff from the master that still
-                * needs to be copied in.
-                */
-               if (chain && chain->error) {
-                       kprintf("chain error index %d: %d\n",
-                               idx, chain->error);
-                       errors[idx] = chain->error;
-                       error = chain->error;
-                       cluster->array[idx].flags |= HAMMER2_CITEM_INVALID;
-                       goto skip1;
-               }
+               if (chain == NULL && error == ENOENT)
+                       break;
+               if (error && error != ENOENT)
+                       break;
 
                /*
-                * Skip if the slave already has the record (everything
-                * matches including the modify_tid).  Note that the
-                * mirror_tid does not have to match, mirror_tid is
-                * a per-block-device entity.
+                * Compare
                 */
-               if (chain &&
-                   (cluster->array[idx].flags & HAMMER2_CITEM_INVALID) == 0) {
-                       goto skip1;
+               if (chain && error == ENOENT) {
+                       /*
+                        * If we have local chains but the XOP scan is done,
+                        * the chains need to be deleted.
+                        */
+                       n = -1;
+                       focus = NULL;
+               } else if (chain == NULL) {
+                       /*
+                        * If our local scan is done but the XOP scan is not,
+                        * we need to create the missing chain(s).
+                        */
+                       n = 1;
+                       focus = xop->head.cluster.focus;
+               } else {
+                       /*
+                        * Otherwise compare to determine the action
+                        * needed.
+                        */
+                       focus = xop->head.cluster.focus;
+                       n = hammer2_chain_cmp(chain, focus);
                }
 
                /*
-                * Invalid element needs to be updated.
-                */
-               nowork = 0;
-
-               /*
-                * Otherwise adjust the slave.  Compare the focus to
-                * the chain.  Note that focus and chain can
-                * independently be NULL.
+                * Take action based on comparison results.
                 */
-               KKASSERT(cluster->focus == focus);
-               if (focus) {
-                       if (chain)
-                               n = hammer2_chain_cmp(focus, chain);
-                       else
-                               n = -1; /* end-of-scan on slave */
-               } else {
-                       if (chain)
-                               n = 1;  /* end-of-scan on focus */
-                       else
-                               n = 0;  /* end-of-scan on both */
-               }
-
                if (n < 0) {
                        /*
-                        * slave chain missing, create missing chain.
-                        *
-                        * If we are going to recurse we have to set
-                        * the initial modify_tid to 0 until the
-                        * sub-tree is completely synchronized.
-                        * Setting (n = 0) in this situation forces
-                        * the replacement call to run on the way
-                        * back up after the sub-tree has
-                        * synchronized.
+                        * Delete extraneous local data.  This will
+                        * automatically advance the chain.
                         */
-                       if (dorecursion) {
-                               nerror = hammer2_sync_insert(
-                                               thr, cparent, cluster,
+                       nerror = hammer2_sync_destroy(thr, &parent, &chain,
+                                                     idx);
+                       didwork = 1;
+               } else if (n == 0 && chain->bref.modify_tid !=
+                                    focus->bref.modify_tid) {
+                       /*
+                        * Matching key but local data or meta-data requires
+                        * updating.  If we will recurse, we still need to
+                        * update to compatible content first but we do not
+                        * synchronize modify_tid until the entire recursion
+                        * has completed successfully.
+                        */
+                       if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
+                               nerror = hammer2_sync_replace(
+                                               thr, parent, chain,
                                                0,
-                                               idx, errors);
-                               if (nerror == 0)
-                                       n = 0;
+                                               idx, focus);
+                               dodefer = 1;
                        } else {
-                               nerror = hammer2_sync_insert(
-                                               thr, cparent, cluster,
+                               nerror = hammer2_sync_replace(
+                                               thr, parent, chain,
                                                focus->bref.modify_tid,
-                                               idx, errors);
+                                               idx, focus);
                        }
-               } else if (n > 0) {
-                       /*
-                        * excess slave chain, destroy
-                        */
-                       nerror = hammer2_sync_destroy(thr,
-                                                     cparent, cluster,
-                                                     idx, errors);
-                       hammer2_cluster_next_single_chain(
-                               cparent, cluster,
-                               &key_next,
-                               HAMMER2_KEY_MIN,
-                               HAMMER2_KEY_MAX,
-                               idx,
-                               HAMMER2_LOOKUP_NODATA |
-                               HAMMER2_LOOKUP_NOLOCK |
-                               HAMMER2_LOOKUP_NODIRECT |
-                               HAMMER2_LOOKUP_ALLNODES);
+                       didwork = 1;
+               } else if (n == 0) {
                        /*
-                        * Re-execute same index, there might be more
-                        * items to delete before this slave catches
-                        * up to the focus.
+                        * 100% match, advance both
                         */
-                       goto repeat1;
-               } else {
+                       advance_local = 1;
+                       advance_xop = 1;
+                       nerror = 0;
+               } else if (n > 0) {
                        /*
-                        * Key matched but INVALID was set which likely
-                        * means that modify_tid is out of sync.
-                        *
-                        * If we are going to recurse we have to do
-                        * a partial replacement of the parent to
-                        * ensure that the block array is compatible.
-                        * For example, the current slave inode might
-                        * be flagged DIRECTDATA when the focus is not.
-                        * We must set modify_tid to 0 for now and
-                        * will fix it when recursion is complete.
+                        * Insert missing local data.
                         *
-                        * If we are not going to recurse we can do
-                        * a normal replacement.
-                        *
-                        * focus && chain can both be NULL on a match.
+                        * If we will recurse, we still need to update to
+                        * compatible content first but we do not synchronize
+                        * modify_tid until the entire recursion has
+                        * completed successfully.
                         */
-                       if (dorecursion) {
-                               nerror = hammer2_sync_replace(
-                                               thr, cparent, cluster,
+                       if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
+                               nerror = hammer2_sync_insert(
+                                               thr, &parent, &chain,
                                                0,
-                                               idx, errors);
-                       } else if (focus) {
-                               nerror = hammer2_sync_replace(
-                                               thr, cparent, cluster,
-                                               focus->bref.modify_tid,
-                                               idx, errors);
+                                               idx, focus);
+                               dodefer = 1;
                        } else {
-                               nerror = 0;
+                               nerror = hammer2_sync_insert(
+                                               thr, &parent, &chain,
+                                               focus->bref.modify_tid,
+                                               idx, focus);
                        }
+                       advance_local = 1;
+                       advance_xop = 1;
+                       didwork = 1;
                }
-               if (nerror)
-                       error = nerror;
-               /* finished primary synchronization of chains */
-
-skip1:
-#if 0
-               /*
-                * Operation may have modified cparent, we must replace
-                * iroot->cluster if we are at the top level.
-                */
-               if (thr->depth == 0)
-                       hammer2_inode_repoint_one(pmp->iroot, cparent, idx);
-#endif
-               KKASSERT(cluster->focus == focus);
-
-               /*
-                * If no work to do this iteration, skip any recursion.
-                */
-               if (nowork)
-                       goto skip2;
 
                /*
-                * EXECUTE RECURSION (skip if no recursion)
+                * We cannot recurse depth-first because the XOP is still
+                * running in node threads for this scan.  Create a placemarker
+                * by obtaining and recording the hammer2_inode.
                 *
-                * Indirect blocks are absorbed by the iteration so we only
-                * have to recurse on inodes.
+                * We excluded our node from the XOP so we must temporarily
+                * add it to xop->head.cluster so it is properly incorporated
+                * into the inode.
                 *
-                * Do not resolve scluster, it represents the iteration
-                * parent and while it is logically in-sync the physical
-                * elements might not match due to the presence of indirect
-                * blocks and such.
+                * The deferral is pushed onto a LIFO list for bottom-up
+                * synchronization.
                 */
-               if (dorecursion == 0)
-                       goto skip2;
-               if (thr->depth > 20) {
-                       kprintf("depth limit reached\n");
-                       nerror = HAMMER2_ERROR_DEPTH;
-               } else {
-                       hammer2_cluster_unlock(cparent);
-                       scluster = hammer2_cluster_copy(cluster);
-                       hammer2_cluster_lock(scluster, HAMMER2_RESOLVE_ALWAYS);
-                       ++thr->depth;
-                       nerror = hammer2_sync_slaves(thr, scluster, errors);
-                       --thr->depth;
-                       hammer2_cluster_unlock(scluster);
-                       hammer2_cluster_drop(scluster);
-                       /* XXX modify_tid on scluster */
-                       /* flush needs to not update modify_tid */
-                       hammer2_cluster_lock(cparent, HAMMER2_RESOLVE_ALWAYS);
+               if (error == 0 && dodefer) {
+                       hammer2_inode_t *nip;
+                       hammer2_deferred_ip_t *defer;
+
+                       KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
+
+                       defer = kmalloc(sizeof(*defer), M_HAMMER2,
+                                       M_WAITOK | M_ZERO);
+                       KKASSERT(xop->head.cluster.array[idx].chain == NULL);
+                       xop->head.cluster.array[idx].flags =
+                                                       HAMMER2_CITEM_INVALID;
+                       xop->head.cluster.array[idx].chain = chain;
+                       nip = hammer2_inode_get(pmp, ip, &xop->head.cluster);
+                       xop->head.cluster.array[idx].chain = NULL;
+
+                       kprintf("DEFER INODE %p->%p\n", ip, nip);
+                       hammer2_inode_ref(nip);
+                       hammer2_inode_unlock(nip);
+
+                       defer->next = list->base;
+                       defer->ip = nip;
+                       list->base = defer;
+                       ++list->count;
+                       needrescan = 1;
                }
-               if (nerror)
-                       goto skip2;
 
                /*
-                * Fixup parent nodes on the way back up from the recursion
-                * if no error occurred.  The modify_tid for these nodes
-                * would have been set to 0 and must be set to their final
-                * value.
+                * If at least one deferral was added and the deferral
+                * list has grown too large, stop adding more.  This
+                * will trigger an EAGAIN return.
                 */
-               chain = cluster->array[idx].chain;
-               if (chain == NULL || chain->error)
-                       goto skip2;
-               /*
-                * should not be set but must fixup parents.
-               if ((cluster->array[idx].flags & HAMMER2_CITEM_INVALID) == 0)
-                       goto skip2;
-               */
+               if (needrescan && list->count > 1000)
+                       break;
 
                /*
-                * At this point we have to have key-matched non-NULL
-                * elements.
+                * Advancements for iteration.
                 */
-               n = hammer2_chain_cmp(focus, chain);
-               if (n != 0) {
-                       kprintf("hammer2_sync_slaves: illegal "
-                               "post-recursion state %d\n", n);
-                       goto skip2;
+               if (advance_xop) {
+                       error = hammer2_xop_collect(&xop->head, 0);
                }
+               if (advance_local) {
+                       chain = hammer2_chain_next(&parent, chain, &key_next,
+                                                  key_next, HAMMER2_KEY_MAX,
+                                                  &cache_index,
+                                                  HAMMER2_LOOKUP_SHARED |
+                                                  HAMMER2_LOOKUP_NODIRECT |
+                                                  HAMMER2_LOOKUP_NODATA);
+               }
+       }
+       hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+       }
+       if (parent) {
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
+       }
 
-               /*
-                * Update modify_tid on the way back up.
-                */
-               nerror = hammer2_sync_replace(
-                               thr, cparent, cluster,
-                               focus->bref.modify_tid,
-                               idx, errors);
-               if (nerror)
-                       error = nerror;
-
-#if 0
-               /*
-                * Operation may modify cparent, must replace
-                * iroot->cluster if we are at the top level.
-                */
-               if (thr->depth == 0)
-                       hammer2_inode_repoint_one(pmp->iroot, cparent, idx);
-#endif
+       /*
+        * If we added deferrals we want the caller to synchronize them
+        * and then call us again.
+        *
+        * NOTE: In this situation we do not yet want to synchronize our
+        *       inode, setting the error code also has that effect.
+        */
+       if (error == 0 && needrescan)
+               error = EAGAIN;
 
-skip2:
-               /*
-                * Iterate.
-                */
-               dumpcluster("adjust", cparent, cluster);
-               cluster = hammer2_cluster_next(cparent, cluster,
-                                              &key_next,
-                                              HAMMER2_KEY_MIN,
-                                              HAMMER2_KEY_MAX,
-                                              HAMMER2_LOOKUP_NODATA |
-                                              HAMMER2_LOOKUP_NOLOCK |
-                                              HAMMER2_LOOKUP_NODIRECT |
-                                              HAMMER2_LOOKUP_ALLNODES);
-               dumpcluster("nextcl", cparent, cluster);
+       /*
+        * If no error occurred and work was performed, synchronize the
+        * inode meta-data itself.
+        *
+        * XXX inode lock was lost
+        */
+       if (error == 0 && didwork) {
+               hammer2_xop_ipcluster_t *xop2;
+               hammer2_chain_t *focus;
+
+               xop2 = &hammer2_xop_alloc(ip)->xop_ipcluster;
+               hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
+                                        idx);
+               error = hammer2_xop_collect(&xop2->head, 0);
+               if (error == 0) {
+                       focus = xop2->head.cluster.focus;
+                       kprintf("syncthr: update inode\n");
+                       chain = hammer2_inode_chain_and_parent(ip, idx,
+                                                   &parent,
+                                                   HAMMER2_RESOLVE_ALWAYS |
+                                                   HAMMER2_RESOLVE_SHARED);
+
+                       KKASSERT(parent != NULL);
+                       nerror = hammer2_sync_replace(
+                                       thr, parent, chain,
+                                       focus->bref.modify_tid,
+                                       idx, xop2->head.cluster.focus);
+                       hammer2_chain_unlock(chain);
+                       hammer2_chain_drop(chain);
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_drop(parent);
+                       /* XXX */
+               }
+               hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
        }
-       hammer2_cluster_drop(cparent);
-       if (cluster)
-               hammer2_cluster_drop(cluster);
 
        return error;
 }
 
 /*
- * cparent is locked exclusively, with an extra ref, cluster is not locked.
+ * Create a missing chain by copying the focus from another device.
+ *
+ * On entry *parentp and focus are both locked shared.  The chain will be
+ * created and returned in *chainp also locked shared.
  */
 static
 int
 hammer2_sync_insert(hammer2_thread_t *thr,
-                   hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                   hammer2_tid_t modify_tid, int i, int *errors)
+                   hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                   hammer2_tid_t modify_tid, int idx,
+                   hammer2_chain_t *focus)
 {
-       hammer2_chain_t *focus;
        hammer2_chain_t *chain;
-       hammer2_key_t dummy;
 
-       focus = cluster->focus;
 #if HAMMER2_THREAD_DEBUG
        if (hammer2_debug & 1)
        kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
-               cparent->array[i].chain, 
-               cparent->array[i].chain->bref.type,
-               cparent->array[i].chain->bref.key,
-               i, focus->bref.type, focus->bref.key, modify_tid);
+               *parentp, 
+               (*parentp)->bref.type,
+               (*parentp)->bref.key,
+               idx,
+               focus->bref.type, focus->bref.key, modify_tid);
 #endif
 
-       /*
-        * We have to do a lookup to position ourselves at the correct
-        * parent when inserting a record into a new slave because the
-        * cluster iteration for this slave might not be pointing to the
-        * right place.  Our expectation is that the record will not be
-        * found.
-        */
-       hammer2_cluster_unlock_except(cparent, i);
-       chain = hammer2_chain_lookup(&cparent->array[i].chain, &dummy,
-                                    focus->bref.key, focus->bref.key,
-                                    &cparent->array[i].cache_index,
-                                    HAMMER2_LOOKUP_NODIRECT);
-       if (cparent->focus_index == i)
-               cparent->focus = cparent->array[i].chain;
-       KKASSERT(chain == NULL);
+       hammer2_chain_unlock(*parentp);
+       hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
+       /* reissue lookup? */
 
        /*
         * Create the missing chain.
@@ -731,17 +718,10 @@ hammer2_sync_insert(hammer2_thread_t *thr,
         * Have to be careful to avoid deadlocks.
         */
        chain = NULL;
-       if (cluster->focus_index < i)
-               hammer2_chain_lock(focus, HAMMER2_RESOLVE_ALWAYS);
-       hammer2_chain_create(&cparent->array[i].chain,
-                            &chain, thr->pmp,
+       hammer2_chain_create(parentp, &chain, thr->pmp,
                             focus->bref.key, focus->bref.keybits,
                             focus->bref.type, focus->bytes,
                             0);
-       if (cluster->focus_index > i)
-               hammer2_chain_lock(focus, HAMMER2_RESOLVE_ALWAYS);
-       if (cparent->focus_index == i)
-               cparent->focus = cparent->array[i].chain;
        hammer2_chain_modify(chain, HAMMER2_MODIFY_KEEPMODIFY);
 
        /*
@@ -785,63 +765,72 @@ hammer2_sync_insert(hammer2_thread_t *thr,
        /*
         * Avoid ordering deadlock when relocking cparent.
         */
-       if (i == 0) {
-               hammer2_cluster_lock_except(cparent, i, HAMMER2_RESOLVE_ALWAYS);
-       } else {
-               hammer2_chain_unlock(cparent->array[i].chain);
-               hammer2_cluster_lock(cparent, HAMMER2_RESOLVE_ALWAYS);
-       }
-
-       /*
-        * Enter item into (unlocked) cluster.
-        *
-        * Must clear invalid for iteration to work properly.
-        */
-       if (cluster->array[i].chain)
-               hammer2_chain_drop(cluster->array[i].chain);
-       cluster->array[i].chain = chain;
-       cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
+       hammer2_chain_unlock(*parentp);
+       hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
+                                    HAMMER2_RESOLVE_ALWAYS);
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
 
        return 0;
 }
 
 /*
- * cparent is locked exclusively, with an extra ref, cluster is not locked.
+ * Destroy an extranious chain.
+ *
+ * Both *parentp and *chainp are locked shared.
+ *
+ * On return, *chainp will be adjusted to point to the next element in the
+ * iteration and locked shared.
  */
 static
 int
 hammer2_sync_destroy(hammer2_thread_t *thr,
-                    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                    int i, int *errors)
+                    hammer2_chain_t **parentp, hammer2_chain_t **chainp,
+                    int idx)
 {
        hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
+       hammer2_key_t key_next;
+       hammer2_key_t save_key;
+       int cache_index = -1;
+
+       chain = *chainp;
 
-       chain = cluster->array[i].chain;
 #if HAMMER2_THREAD_DEBUG
        if (hammer2_debug & 1)
        kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
-               cparent, cluster,
-               i, chain->bref.type, chain->bref.key);
+               *parentp, chain,
+               idx, chain->bref.type, chain->bref.key);
 #endif
+
+       save_key = chain->bref.key;
+       if (save_key != HAMMER2_KEY_MAX)
+               ++save_key;
+
        /*
         * Try to avoid unnecessary I/O.
         *
         * XXX accounting not propagated up properly.  We might have to do
         *     a RESOLVE_MAYBE here and pass 0 for the flags.
         */
+       hammer2_chain_unlock(chain);    /* relock exclusive */
+       hammer2_chain_unlock(*parentp);
+       hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
        hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
-       hammer2_chain_delete(cparent->array[i].chain, chain,
-                            HAMMER2_DELETE_NOSTATS |
-                            HAMMER2_DELETE_PERMANENT);
-       hammer2_chain_unlock(chain);
-
-       /*
-        * The element is not valid in that it doesn't match the other
-        * elements, but we have to mark it valid here to allow the
-        * cluster_next() call to advance this index to the next element.
-        */
-       cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
 
+       hammer2_chain_delete(*parentp, chain, HAMMER2_DELETE_PERMANENT);
+       hammer2_chain_unlock(chain);
+       hammer2_chain_drop(chain);
+       chain = NULL;                   /* safety */
+
+       hammer2_chain_unlock(*parentp); /* relock shared */
+       hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
+                                    HAMMER2_RESOLVE_ALWAYS);
+       *chainp = hammer2_chain_lookup(&parent, &key_next,
+                                    save_key, HAMMER2_KEY_MAX,
+                                    &cache_index,
+                                    HAMMER2_LOOKUP_SHARED |
+                                    HAMMER2_LOOKUP_NODIRECT |
+                                    HAMMER2_LOOKUP_NODATA);
        return 0;
 }
 
@@ -852,32 +841,26 @@ hammer2_sync_destroy(hammer2_thread_t *thr,
 static
 int
 hammer2_sync_replace(hammer2_thread_t *thr,
-                    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                    hammer2_tid_t modify_tid, int i, int *errors)
+                    hammer2_chain_t *parent, hammer2_chain_t *chain,
+                    hammer2_tid_t modify_tid, int idx,
+                    hammer2_chain_t *focus)
 {
-       hammer2_chain_t *focus;
-       hammer2_chain_t *chain;
        int nradix;
        uint8_t otype;
 
-       focus = cluster->focus;
-       chain = cluster->array[i].chain;
 #if HAMMER2_THREAD_DEBUG
        if (hammer2_debug & 1)
-       kprintf("replace rec %p/%p slave %d %d.%016jx mod=%016jx\n",
-               cparent, cluster,
-               i, focus->bref.type, focus->bref.key, modify_tid);
+       kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
+               chain,
+               idx,
+               focus->bref.type, focus->bref.key, modify_tid);
 #endif
-       if (cluster->focus_index < i)
-               hammer2_chain_lock(focus, HAMMER2_RESOLVE_ALWAYS);
+       hammer2_chain_unlock(chain);
        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-       if (cluster->focus_index >= i)
-               hammer2_chain_lock(focus, HAMMER2_RESOLVE_ALWAYS);
        if (chain->bytes != focus->bytes) {
                /* XXX what if compressed? */
                nradix = hammer2_getradix(chain->bytes);
-               hammer2_chain_resize(NULL, cparent->array[i].chain, chain,
-                                    nradix, 0);
+               hammer2_chain_resize(NULL, parent, chain, nradix, 0);
        }
        hammer2_chain_modify(chain, HAMMER2_MODIFY_KEEPMODIFY);
        otype = chain->bref.type;
@@ -927,13 +910,9 @@ hammer2_sync_replace(hammer2_thread_t *thr,
                break;
        }
 
-       hammer2_chain_unlock(focus);
        hammer2_chain_unlock(chain);
-
-       /*
-        * Must clear invalid for iteration to work properly.
-        */
-       cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
+                                 HAMMER2_RESOLVE_MAYBE);
 
        return 0;
 }
@@ -968,8 +947,7 @@ hammer2_xop_alloc(hammer2_inode_t *ip)
        xop->head.func = NULL;
        xop->head.state = 0;
        xop->head.error = 0;
-       xop->head.lkey = 0;
-       xop->head.nkey = 0;
+       xop->head.collect_key = 0;
 
        xop->head.cluster.nchains = ip->cluster.nchains;
        xop->head.cluster.pmp = ip->pmp;
@@ -1021,8 +999,7 @@ hammer2_xop_reinit(hammer2_xop_head_t *xop)
 {
        xop->state = 0;
        xop->error = 0;
-       xop->lkey = 0;
-       xop->nkey = 0;
+       xop->collect_key = 0;
        xop->run_mask = HAMMER2_XOPMASK_VOP;
 }
 
@@ -1070,7 +1047,8 @@ hammer2_xop_helper_cleanup(hammer2_pfs_t *pmp)
  * XXX optimize single-target case.
  */
 void
-hammer2_xop_start(hammer2_xop_head_t *xop, hammer2_xop_func_t func)
+hammer2_xop_start_except(hammer2_xop_head_t *xop, hammer2_xop_func_t func,
+                        int notidx)
 {
        hammer2_xop_group_t *xgrp;
        hammer2_thread_t *thr;
@@ -1088,7 +1066,7 @@ hammer2_xop_start(hammer2_xop_head_t *xop, hammer2_xop_func_t func)
 
        for (i = 0; i < xop->ip->cluster.nchains; ++i) {
                thr = &xgrp->thrs[i];
-               if (thr->td) {
+               if (thr->td && i != notidx) {
                        lockmgr(&thr->lk, LK_EXCLUSIVE);
                        if (thr->td &&
                            (thr->flags & HAMMER2_THREAD_STOP) == 0) {
@@ -1102,6 +1080,12 @@ hammer2_xop_start(hammer2_xop_head_t *xop, hammer2_xop_func_t func)
        }
 }
 
+void
+hammer2_xop_start(hammer2_xop_head_t *xop, hammer2_xop_func_t func)
+{
+       hammer2_xop_start_except(xop, func, -1); /* -1: skip no node */
+}
+
 /*
  * Retire a XOP.  Used by both the VOP frontend and by the XOP backend.
  */
@@ -1217,7 +1201,14 @@ hammer2_xop_active(hammer2_xop_head_t *xop)
  * ref on the chain but loses the lock (we unlock here).
  *
  * WARNING!  The chain is moving between two different threads, it must
- *          be locked SHARED, not exclusive.
+ *          be locked SHARED to retain its data mapping, not exclusive.
+ *          When multiple operations are in progress at once, chains fed
+ *          back to the frontend for collection can wind up being locked
+ *          in different orders, only a shared lock can prevent a deadlock.
+ *
+ *          Exclusive locks may only be used by a XOP backend node thread
+ *          temporarily, with no direct or indirect dependencies (aka
+ *          blocking/waiting) on other nodes.
  */
 int
 hammer2_xop_feed(hammer2_xop_head_t *xop, hammer2_chain_t *chain,
@@ -1268,10 +1259,7 @@ done:
  * (Frontend) collect a response from a running cluster op.
  *
  * Responses are fed from all appropriate nodes concurrently
- * and collected into a cohesive response >= nkey.  lkey is
- * then set to nkey and nkey is advanced prior to return.
- * The caller may depend on xop->lkey reflecting the current
- * key of the returned response.
+ * and collected into a cohesive response >= collect_key.
  *
  * The collector will return the instant quorum or other requirements
  * are met, even if some nodes get behind or become non-responsive.
@@ -1318,7 +1306,7 @@ loop:
                chain = xop->cluster.array[i].chain;
                if (chain == NULL) {
                        adv = 1;
-               } else if (chain->bref.key < xop->nkey) {
+               } else if (chain->bref.key < xop->collect_key) {
                        adv = 1;
                } else {
                        keynull &= ~HAMMER2_CHECK_NULL;
@@ -1395,24 +1383,24 @@ loop:
        }
        if (error == ESRCH) {
                if (lokey != HAMMER2_KEY_MAX) {
-                       xop->nkey = lokey + 1;
+                       xop->collect_key = lokey + 1;
                        goto loop;
                }
                error = ENOENT;
        }
        if (error == EDEADLK) {
-               kprintf("hammer2: no quorum possible lkey %016jx\n",
+               kprintf("hammer2: no quorum possible lokey %016jx\n",
                        lokey);
                if (lokey != HAMMER2_KEY_MAX) {
-                       xop->nkey = lokey + 1;
+                       xop->collect_key = lokey + 1;
                        goto loop;
                }
                error = ENOENT;
        }
        if (lokey == HAMMER2_KEY_MAX)
-               xop->nkey = lokey;
+               xop->collect_key = lokey;
        else
-               xop->nkey = lokey + 1;
+               xop->collect_key = lokey + 1;
 done:
        return error;
 }
index c7003df..51a809d 100644 (file)
@@ -413,6 +413,7 @@ hammer2_pfsalloc(hammer2_chain_t *chain, const hammer2_inode_data_t *ripdata,
                iroot->cluster.array[j].chain = chain;
                pmp->pfs_types[j] = ripdata->meta.pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
+               pmp->pfs_hmps[j] = chain->hmp;
 
                /*
                 * If the PFS is already mounted we must account
@@ -559,7 +560,6 @@ hammer2_pfsfree_scan(hammer2_dev_t *hmp)
 {
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
-       hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int didfreeze;
        int i;
@@ -583,26 +583,27 @@ again:
                 * in-progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
-               cluster = &iroot->cluster;
-               for (i = 0; i < cluster->nchains; ++i) {
-                       rchain = cluster->array[i].chain;
-                       if (rchain == NULL || rchain->hmp != hmp)
-                               continue;
-                       break;
+               for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
+                       if (pmp->pfs_hmps[i] == hmp)
+                               break;
                }
-               if (i != cluster->nchains) {
+               if (i != HAMMER2_MAXCLUSTER) {
                        /*
                         * Make sure all synchronization threads are locked
                         * down.
                         */
-                       for (i = 0; i < iroot->cluster.nchains; ++i) {
+                       for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
+                               if (pmp->pfs_hmps[i] == NULL)
+                                       continue;
                                hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
                                for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                        hammer2_thr_freeze_async(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
                        }
-                       for (i = 0; i < iroot->cluster.nchains; ++i) {
+                       for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
+                               if (pmp->pfs_hmps[i] == NULL)
+                                       continue;
                                hammer2_thr_freeze(&pmp->sync_thrs[i]);
                                for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                        hammer2_thr_freeze(
@@ -625,27 +626,28 @@ again:
                        /*
                         * Remove the chain from matching elements of the PFS.
                         */
-                       for (i = 0; i < cluster->nchains; ++i) {
-                               rchain = cluster->array[i].chain;
-                               if (rchain == NULL || rchain->hmp != hmp)
+                       for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
+                               if (pmp->pfs_hmps[i] != hmp)
                                        continue;
                                hammer2_thr_delete(&pmp->sync_thrs[i]);
                                for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                        hammer2_thr_delete(
                                                &pmp->xop_groups[j].thrs[i]);
                                }
-                               rchain = cluster->array[i].chain;
-                               cluster->array[i].chain = NULL;
+                               rchain = iroot->cluster.array[i].chain;
+                               iroot->cluster.array[i].chain = NULL;
                                pmp->pfs_types[i] = 0;
                                if (pmp->pfs_names[i]) {
                                        kfree(pmp->pfs_names[i], M_HAMMER2);
                                        pmp->pfs_names[i] = NULL;
                                }
-                               hammer2_chain_drop(rchain);
-
-                               /* focus hint */
-                               if (cluster->focus == rchain)
-                                       cluster->focus = NULL;
+                               if (rchain) {
+                                       hammer2_chain_drop(rchain);
+                                       /* focus hint */
+                                       if (iroot->cluster.focus == rchain)
+                                               iroot->cluster.focus = NULL;
+                               }
+                               pmp->pfs_hmps[i] = NULL;
                        }
                        hammer2_mtx_unlock(&iroot->lock);
                        didfreeze = 1;  /* remaster, unfreeze down below */
@@ -654,21 +656,19 @@ again:
                }
 
                /*
-                * Cleanup trailing chains.  Do not reorder chains (for now).
-                * XXX might remove more than we intended.
+                * Cleanup trailing chains.  Gaps may remain.
                 */
-               while (i > 0) {
-                       if (cluster->array[i - 1].chain)
+               for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
+                       if (pmp->pfs_hmps[i])
                                break;
-                       --i;
                }
-               cluster->nchains = i;
+               iroot->cluster.nchains = i + 1;
 
                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
-               if (cluster->nchains == 0) {
+               if (iroot->cluster.nchains == 0) {
                        kprintf("unmount hmp %p last ref to PMP=%p\n",
                                hmp, pmp);
                        hammer2_pfsfree(pmp);
@@ -680,7 +680,9 @@ again:
                 * flag and unfreeze it.
                 */
                if (didfreeze) {
-                       for (i = 0; i < iroot->cluster.nchains; ++i) {
+                       for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
+                               if (pmp->pfs_hmps[i] == NULL)
+                                       continue;
                                hammer2_thr_remaster(&pmp->sync_thrs[i]);
                                hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
                                for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
@@ -1016,6 +1018,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->meta.pfs_type;
+               spmp->pfs_hmps[0] = hmp;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot);
                hammer2_cluster_unlock(cluster);
@@ -1518,21 +1521,21 @@ hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
        hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
 
        while (pmp->inode_tid == 0) {
-               hammer2_xop_vfsroot_t *xop;
+               hammer2_xop_ipcluster_t *xop;
                hammer2_inode_meta_t *meta;
 
-               xop = &hammer2_xop_alloc(pmp->iroot)->xop_vfsroot;
-               hammer2_xop_start(&xop->head, hammer2_xop_vfsroot);
+               xop = &hammer2_xop_alloc(pmp->iroot)->xop_ipcluster;
+               hammer2_xop_start(&xop->head, hammer2_xop_ipcluster);
                error = hammer2_xop_collect(&xop->head, 0);
 
                if (error == 0) {
                        meta = &xop->head.cluster.focus->data->ipdata.meta;
                        pmp->iroot->meta = *meta;
-                       pmp->iroot->bref = xop->head.cluster.focus->bref;
                        pmp->inode_tid = meta->pfs_inum + 1;
                        if (pmp->inode_tid < HAMMER2_INODE_START)
                                pmp->inode_tid = HAMMER2_INODE_START;
-                       pmp->modify_tid = pmp->iroot->bref.modify_tid + 1;
+                       pmp->modify_tid =
+                               xop->head.cluster.focus->bref.modify_tid + 1;
                        kprintf("PFS: Starting inode %jd\n",
                                (intmax_t)pmp->inode_tid);
                        kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
@@ -1597,28 +1600,39 @@ hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
        hammer2_pfs_t *pmp;
        hammer2_dev_t *hmp;
        hammer2_blockref_t bref;
+       int i;
 
        /*
         * NOTE: iroot might not have validated the cluster yet.
         */
        pmp = MPTOPMP(mp);
-       if (pmp->iroot->cluster.focus == NULL)
-               return EINVAL;
-
-       KKASSERT(pmp->iroot->cluster.nchains >= 1);
-       hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
-       bref = pmp->iroot->cluster.focus->bref; /* no lock */
 
-       mp->mnt_stat.f_files = bref.inode_count;
+       mp->mnt_stat.f_files = 0;
        mp->mnt_stat.f_ffree = 0;
-       mp->mnt_stat.f_blocks = (bref.data_count +
-                                hmp->voldata.allocator_free) /
-                               mp->mnt_vstat.f_bsize;
-       mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free /
-                               mp->mnt_vstat.f_bsize;
-       mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
-
-       *sbp = mp->mnt_stat;
+       mp->mnt_stat.f_blocks = 0;
+       mp->mnt_stat.f_bfree = 0;
+       mp->mnt_stat.f_bavail = 0;
+
+       for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
+               hmp = pmp->pfs_hmps[i];
+               if (hmp == NULL)
+                       continue;
+               if (pmp->iroot->cluster.array[i].chain)
+                       bref = pmp->iroot->cluster.array[i].chain->bref;
+               else
+                       bzero(&bref, sizeof(bref));
+
+               mp->mnt_stat.f_files = bref.inode_count;
+               mp->mnt_stat.f_ffree = 0;
+               mp->mnt_stat.f_blocks = (bref.data_count +
+                                        hmp->voldata.allocator_free) /
+                                       mp->mnt_vstat.f_bsize;
+               mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free /
+                                       mp->mnt_vstat.f_bsize;
+               mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
+
+               *sbp = mp->mnt_stat;
+       }
        return (0);
 }
 
@@ -1629,29 +1643,41 @@ hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
        hammer2_pfs_t *pmp;
        hammer2_dev_t *hmp;
        hammer2_blockref_t bref;
+       int i;
 
        /*
         * NOTE: iroot might not have validated the cluster yet.
         */
        pmp = MPTOPMP(mp);
-       if (pmp->iroot->cluster.focus == NULL)
-               return EINVAL;
-
-       KKASSERT(pmp->iroot->cluster.nchains >= 1);
-       hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
-       bref = pmp->iroot->cluster.focus->bref; /* no lock */
 
-       mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
-       mp->mnt_vstat.f_files = bref.inode_count;
+       mp->mnt_vstat.f_bsize = 0;
+       mp->mnt_vstat.f_files = 0;
        mp->mnt_vstat.f_ffree = 0;
-       mp->mnt_vstat.f_blocks = (bref.data_count +
-                                hmp->voldata.allocator_free) /
-                               mp->mnt_vstat.f_bsize;
-       mp->mnt_vstat.f_bfree = hmp->voldata.allocator_free /
-                               mp->mnt_vstat.f_bsize;
-       mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
-
-       *sbp = mp->mnt_vstat;
+       mp->mnt_vstat.f_blocks = 0;
+       mp->mnt_vstat.f_bfree = 0;
+       mp->mnt_vstat.f_bavail = 0;
+
+       for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
+               hmp = pmp->pfs_hmps[i];
+               if (hmp == NULL)
+                       continue;
+               if (pmp->iroot->cluster.array[i].chain)
+                       bref = pmp->iroot->cluster.array[i].chain->bref;
+               else
+                       bzero(&bref, sizeof(bref));
+
+               mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
+               mp->mnt_vstat.f_files = bref.inode_count;
+               mp->mnt_vstat.f_ffree = 0;
+               mp->mnt_vstat.f_blocks = (bref.data_count +
+                                        hmp->voldata.allocator_free) /
+                                       mp->mnt_vstat.f_bsize;
+               mp->mnt_vstat.f_bfree = hmp->voldata.allocator_free /
+                                       mp->mnt_vstat.f_bsize;
+               mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
+
+               *sbp = mp->mnt_vstat;
+       }
        return (0);
 }
 
@@ -1988,7 +2014,7 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
        if ((ip->flags & HAMMER2_INODE_MODIFIED) ||
            !RB_EMPTY(&vp->v_rbdirty_tree)) {
                vfsync(vp, info->waitfor, 1, NULL, NULL);
-               hammer2_inode_fsync(ip, NULL);
+               hammer2_inode_fsync(ip);
        }
        if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
            RB_EMPTY(&vp->v_rbdirty_tree)) {
index 46af5a5..2cea2ad 100644 (file)
@@ -225,7 +225,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         */
        hammer2_inode_lock(ip, 0);
        if (ip->flags & HAMMER2_INODE_MODIFIED)
-               hammer2_inode_fsync(ip, NULL);
+               hammer2_inode_fsync(ip);
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp);
 
@@ -261,6 +261,8 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;
+       hammer2_chain_t *chain;
+       int i;
 
        LOCKSTART;
        vp = ap->a_vp;
@@ -286,7 +288,13 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_mtime);
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_atime);
        vap->va_gen = 1;
-       vap->va_bytes = ip->bref.data_count;
+       vap->va_bytes = 0;
+       for (i = 0; i < ip->cluster.nchains; ++i) {
+               if ((chain = ip->cluster.array[i].chain) != NULL) {
+                       if (vap->va_bytes < chain->bref.data_count)
+                               vap->va_bytes = chain->bref.data_count;
+               }
+       }
        vap->va_type = hammer2_get_vtype(ip->meta.type);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ip->meta.uid;
@@ -441,7 +449,7 @@ done:
         * block table.
         */
        if (ip->flags & HAMMER2_INODE_RESIZED)
-               hammer2_inode_fsync(ip, NULL);
+               hammer2_inode_fsync(ip);
 
        /*
         * Cleanup.
@@ -551,7 +559,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
         * double lock shared locks as this will screw up upgrades.
         */
        xop = &hammer2_xop_alloc(ip)->xop_readdir;
-       xop->head.lkey = lkey;
+       xop->lkey = lkey;
        hammer2_xop_start(&xop->head, hammer2_xop_readdir);
 
        for (;;) {
@@ -953,14 +961,14 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                hammer2_mtx_ex(&ip->lock);
                hammer2_truncate_file(ip, old_eof);
                if (ip->flags & HAMMER2_INODE_MODIFIED)
-                       hammer2_inode_fsync(ip, NULL);
+                       hammer2_inode_fsync(ip);
                hammer2_mtx_unlock(&ip->lock);
        } else if (modified) {
                hammer2_mtx_ex(&ip->lock);
                hammer2_inode_modify(ip);
                hammer2_update_time(&ip->meta.mtime);
                if (ip->flags & HAMMER2_INODE_MODIFIED)
-                       hammer2_inode_fsync(ip, NULL);
+                       hammer2_inode_fsync(ip);
                hammer2_mtx_unlock(&ip->lock);
                hammer2_knote(ip->vp, kflags);
        }
index c304a9f..631d5e0 100644 (file)
@@ -67,9 +67,9 @@
  * to the inode_tid and modify_tid.
  */
 void
-hammer2_xop_vfsroot(hammer2_xop_t *arg, int clindex)
+hammer2_xop_ipcluster(hammer2_xop_t *arg, int clindex)
 {
-       hammer2_xop_vfsroot_t *xop = &arg->xop_vfsroot;
+       hammer2_xop_ipcluster_t *xop = &arg->xop_ipcluster;
        hammer2_chain_t *chain;
        int error;
 
@@ -100,7 +100,7 @@ hammer2_xop_readdir(hammer2_xop_t *arg, int clindex)
        int cache_index = -1;
        int error = 0;
 
-       lkey = xop->head.lkey;
+       lkey = xop->lkey;
        if (hammer2_debug & 0x0020)
                kprintf("xop_readdir %p lkey=%016jx\n", xop, lkey);
 
@@ -124,7 +124,7 @@ hammer2_xop_readdir(hammer2_xop_t *arg, int clindex)
                                     &cache_index, HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                chain = hammer2_chain_lookup(&parent, &key_next,
-                                            lkey, (hammer2_key_t)-1,
+                                            lkey, HAMMER2_KEY_MAX,
                                             &cache_index,
                                             HAMMER2_LOOKUP_SHARED);
        }
@@ -133,7 +133,7 @@ hammer2_xop_readdir(hammer2_xop_t *arg, int clindex)
                if (error)
                        break;
                chain = hammer2_chain_next(&parent, chain, &key_next,
-                                          key_next, (hammer2_key_t)-1,
+                                          key_next, HAMMER2_KEY_MAX,
                                           &cache_index,
                                           HAMMER2_LOOKUP_SHARED |
                                           HAMMER2_LOOKUP_NOUNLOCK);
@@ -718,7 +718,7 @@ done:
 }
 
 /*
- * Lookup a specific key.
+ * Generic lookup of a specific key.
  *
  * Used by the inode hidden directory code to find the hidden directory.
  */
@@ -765,3 +765,55 @@ done:
                hammer2_chain_drop(parent);
        }
 }
+
+/*
+ * Generic scan: feed each chain in [key_beg, key_end] under ip to the XOP.
+ */
+void
+hammer2_xop_scanall(hammer2_xop_t *arg, int clindex)
+{
+       hammer2_xop_scanall_t *xop = &arg->xop_scanall;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_key_t key_next;
+       int cache_index = -1;
+       int error = 0;
+
+       /*
+        * The inode's chain for this cluster index is the iterator.  If it
+        * cannot be acquired only the terminating feed at done: is issued.
+        */
+       parent = hammer2_inode_chain(xop->head.ip, clindex,
+                                    HAMMER2_RESOLVE_ALWAYS |
+                                    HAMMER2_RESOLVE_SHARED);
+       if (parent == NULL) {
+               kprintf("xop_scanall: NULL parent\n");
+               goto done;
+       }
+
+       /*
+        * Generic scan of exact records.  Note that indirect blocks are
+        * automatically recursed and will not be returned.
+        */
+       chain = hammer2_chain_lookup(&parent, &key_next,
+                                    xop->key_beg, xop->key_end,
+                                    &cache_index, HAMMER2_LOOKUP_SHARED |
+                                                  HAMMER2_LOOKUP_NODIRECT);
+       while (chain) {
+               error = hammer2_xop_feed(&xop->head, chain, clindex, 0);
+               if (error)      /* stop scanning; error goes to final feed */
+                       break;
+               chain = hammer2_chain_next(&parent, chain, &key_next,
+                                          key_next, xop->key_end,
+                                          &cache_index,
+                                          HAMMER2_LOOKUP_SHARED |
+                                          HAMMER2_LOOKUP_NODIRECT |
+                                          HAMMER2_LOOKUP_NOUNLOCK);
+       }
+       if (chain)      /* NOTE(review): dropped w/o unlock, unlike parent */
+               hammer2_chain_drop(chain);
+       hammer2_chain_unlock(parent);
+       hammer2_chain_drop(parent);
+done:  /* always terminate our feed with a NULL chain and the final error */
+       hammer2_xop_feed(&xop->head, NULL, clindex, error);
+}