hammer2 - refactor filesystem sync 1/N
author		Matthew Dillon <dillon@apollo.backplane.com>
		Mon, 5 Nov 2018 01:31:07 +0000 (17:31 -0800)
committer	Matthew Dillon <dillon@apollo.backplane.com>
		Wed, 5 Dec 2018 18:28:39 +0000 (10:28 -0800)
* Change H2 to allow filesystem syncs and modifying frontend
  operations to run concurrently.

* FLUSH transactions no longer block modifying frontend
  transactions.

* Change the filesystem sync operation to place all modified
  inodes on pmp->syncq (merging in any inodes already on
  pmp->sideq), and then iterate the syncq to flush each
  inode, as sketched below.

  After this is done, stage 2 will flush the meta-data tree
  leading to each inode.

  This code will also handle delayed inode creation and
  destruction ops, which require modifications to the meta-data
  tree governing the inodes themselves (so we don't want the
  frontend to do it and interfere with the flush).
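
  A condensed sketch of the two-stage flow, simplified from the new
  hammer2_vfs_sync_pmp() in the hammer2_vfsops.c diff below (locking
  details, vnode handling, and error paths omitted):

	/* Stage 1: move SIDEQ inodes to SYNCQ, then flush each inode */
	hammer2_spin_ex(&pmp->list_spin);
	TAILQ_FOREACH(ip, &pmp->sideq, entry) {
		atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
		atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ);
	}
	TAILQ_CONCAT(&pmp->syncq, &pmp->sideq, entry);
	pmp->sideq_count = 0;
	hammer2_spin_unex(&pmp->list_spin);

	hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
	while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
		/* dequeue ip, clear SYNCQ, lock vp+ip exclusively */
		hammer2_inode_chain_sync(ip);
		hammer2_inode_chain_flush(ip);
		/* unlock ip and drop the ref it held from SYNCQ */
	}

	/* Stage 2: XOP flush of the topology above the inode boundaries */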

* A modifying operation against an inode already queued to an
  in-progress filesystem sync now reorders that inode to the
  front of the sync and waits for the sync of the inode to
  complete before proceeding.

  This is handled by blocking in the exclusive inode lock code
  (condensed below).
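
  A condensed view of the new exclusive-lock path in
  hammer2_inode_lock() (see the hammer2_inode.c diff below):

	hammer2_mtx_ex(&ip->lock);
	while ((ip->flags & HAMMER2_INODE_SYNCQ) && pmp) {
		hammer2_spin_ex(&pmp->list_spin);
		if (ip->flags & HAMMER2_INODE_SYNCQ) {
			/* requeue at the head so the syncer flushes
			 * this inode first, then sleep until the
			 * flush gets past it */
			atomic_set_int(&ip->flags,
				       HAMMER2_INODE_SYNCQ_WAKEUP);
			TAILQ_REMOVE(&pmp->syncq, ip, entry);
			TAILQ_INSERT_HEAD(&pmp->syncq, ip, entry);
			hammer2_spin_unex(&pmp->list_spin);
			tsleep_interlock(&ip->flags, 0);
			hammer2_mtx_unlock(&ip->lock);
			tsleep(&ip->flags, PINTERLOCKED, "h2sync", 0);
			hammer2_mtx_ex(&ip->lock);
			continue;
		}
		hammer2_spin_unex(&pmp->list_spin);
		break;
	}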

* hammer2_inode_get() no longer needs to take 'dip', because
  regular inodes are now inserted under iroot (the PFS root
  inode) and are no longer inserted hierarchically.
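
  Call sites change from passing a dip to passing an explicit inum
  (-1 when the inum is taken from the xop), e.g.:

	-	ip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
	+	ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);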

* Separate hammer2_inode_create() into hammer2_inode_create_pfs()
  and hammer2_inode_create_normal().  The two forms are now
  distinct enough that keeping them combined made a mess of the
  code; the resulting call patterns are shown below.
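
  The two call patterns after the split, taken from the ioctl and
  vnops changes below:

	/* PFS creation under the super-root, in a flush transaction */
	nip = hammer2_inode_create_pfs(hmp->spmp, pfs->name,
				       strlen(pfs->name), &error);

	/* normal file/directory creation in a frontend transaction */
	inum = hammer2_trans_newinum(dip->pmp);
	nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
					  inum, &error);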

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_synchro.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index 929aa1f..173ef70 100644
--- a/sys/vfs/hammer2/hammer2.h
+++ b/sys/vfs/hammer2/hammer2.h
@@ -700,7 +700,9 @@ hammer2_cluster_wrok(hammer2_cluster_t *cluster)
        return (cluster->flags & HAMMER2_CLUSTER_WROK);
 }
 
-RB_HEAD(hammer2_inode_tree, hammer2_inode);
+RB_HEAD(hammer2_inode_tree, hammer2_inode);    /* ip->rbnode */
+TAILQ_HEAD(syncq_head, hammer2_inode);         /* ip->entry */
+TAILQ_HEAD(sideq_head, hammer2_inode);         /* ip->entry */
 
 /*
  * A hammer2 inode.
@@ -711,6 +713,7 @@ RB_HEAD(hammer2_inode_tree, hammer2_inode);
  */
 struct hammer2_inode {
        RB_ENTRY(hammer2_inode) rbnode;         /* inumber lookup (HL) */
+       TAILQ_ENTRY(hammer2_inode) entry;       /* syncq, SYNCQ flag */
        hammer2_mtx_t           lock;           /* inode lock */
        hammer2_mtx_t           truncate_lock;  /* prevent truncates */
        struct hammer2_pfs      *pmp;           /* PFS mount */
@@ -731,6 +734,34 @@ typedef struct hammer2_inode hammer2_inode_t;
  * MODIFIED    - Inode is in a modified state, ip->meta may have changes.
  * RESIZED     - Inode truncated (any) or inode extended beyond
  *               EMBEDDED_BYTES.
+ *
+ * SYNCQ       - Inode is included in the current filesystem sync.  The
+ *               DELETING and CREATING flags will be acted upon.
+ *
+ * SIDEQ       - Inode has likely been disconnected from the vnode topology
+ *               and so is not visible to the vnode-based filesystem syncer
+ *               code, but is dirty and must be included in the next
+ *               filesystem sync.  These inodes are moved to the SYNCQ at
+ *               the time the sync occurs.
+ *
+ *               Inodes with an attached vnode are not placed on this
+ *               queue simply because they have become dirty.
+ *
+ * DELETING    - Inode is flagged for deletion during the next filesystem
+ *               sync.  That is, the inode's chain is currently connected
+ *               and must be deleted during the current or next fs sync.
+ *
+ * CREATING    - Inode is flagged for creation during the next filesystem
+ *               sync.  That is, the inode's chain topology exists (so
+ *               kernel buffer flushes can occur), but is currently
+ *               disconnected and must be inserted during the current or
+ *               next fs sync.  If the DELETING flag is also set, the
+ *               topology can be thrown away instead.
+ *
+ * If an inode that is already part of the current filesystem sync is
+ * modified by the frontend, including by buffer flushes, the inode lock
+ * code detects the SYNCQ flag and moves the inode to the head of the
+ * flush-in-progress, then blocks until the flush has gotten past it.
  */
 #define HAMMER2_INODE_MODIFIED         0x0001
 #define HAMMER2_INODE_SROOT            0x0002  /* kmalloc special case */
@@ -740,25 +771,18 @@ typedef struct hammer2_inode hammer2_inode_t;
 #define HAMMER2_INODE_ISDELETED                0x0020  /* deleted */
 #define HAMMER2_INODE_ISUNLINKED       0x0040
 #define HAMMER2_INODE_METAGOOD         0x0080  /* inode meta-data good */
-#define HAMMER2_INODE_ONSIDEQ          0x0100  /* on side processing queue */
+#define HAMMER2_INODE_SIDEQ            0x0100  /* on side processing queue */
 #define HAMMER2_INODE_NOSIDEQ          0x0200  /* disable sideq operation */
 #define HAMMER2_INODE_DIRTYDATA                0x0400  /* interlocks inode flush */
+#define HAMMER2_INODE_SYNCQ            0x0800  /* sync interlock, sequenced */
+#define HAMMER2_INODE_DELETING         0x1000  /* sync interlock, chain topo */
+#define HAMMER2_INODE_CREATING         0x2000  /* sync interlock, chain topo */
+#define HAMMER2_INODE_SYNCQ_WAKEUP     0x4000  /* sync interlock wakeup */
 
 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
                hammer2_tid_t);
 
-/*
- * inode-unlink side-structure
- */
-struct hammer2_inode_sideq {
-       TAILQ_ENTRY(hammer2_inode_sideq) entry;
-       hammer2_inode_t *ip;
-};
-TAILQ_HEAD(h2_sideq_list, hammer2_inode_sideq);
-
-typedef struct hammer2_inode_sideq hammer2_inode_sideq_t;
-
 /*
  * Transaction management sub-structure under hammer2_pfs
  */
@@ -1207,7 +1231,8 @@ struct hammer2_pfs {
        uint32_t                inmem_dirty_chains;
        int                     count_lwinprog; /* logical write in prog */
        struct spinlock         list_spin;
-       struct h2_sideq_list    sideq;          /* last-close dirty/unlink */
+       struct syncq_head       syncq;          /* SYNCQ flagged inodes */
+       struct sideq_head       sideq;          /* SIDEQ flagged inodes */
        long                    sideq_count;
        hammer2_thread_t        sync_thrs[HAMMER2_MAXCLUSTER];
        uint32_t                cluster_flags;  /* cached cluster flags */
@@ -1471,8 +1496,8 @@ void hammer2_adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
 struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
 hammer2_inode_t *hammer2_inode_lookup(hammer2_pfs_t *pmp,
                        hammer2_tid_t inum);
-hammer2_inode_t *hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
-                       hammer2_xop_head_t *xop, int idx);
+hammer2_inode_t *hammer2_inode_get(hammer2_pfs_t *pmp,
+                       hammer2_xop_head_t *xop, hammer2_tid_t inum, int idx);
 void hammer2_inode_free(hammer2_inode_t *ip);
 void hammer2_inode_ref(hammer2_inode_t *ip);
 void hammer2_inode_drop(hammer2_inode_t *ip);
@@ -1483,12 +1508,12 @@ void hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
 void hammer2_inode_modify(hammer2_inode_t *ip);
 void hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall);
 
-hammer2_inode_t *hammer2_inode_create(hammer2_inode_t *dip,
-                       hammer2_inode_t *pip,
+hammer2_inode_t *hammer2_inode_create_normal(hammer2_inode_t *pip,
                        struct vattr *vap, struct ucred *cred,
-                       const uint8_t *name, size_t name_len, hammer2_key_t lhc,
-                       hammer2_key_t inum, uint8_t type, uint8_t target_type,
-                       int flags, int *errorp);
+                       hammer2_key_t inum, int *errorp);
+hammer2_inode_t *hammer2_inode_create_pfs(hammer2_pfs_t *spmp,
+                       const uint8_t *name, size_t name_len,
+                       int *errorp);
 int hammer2_inode_chain_sync(hammer2_inode_t *ip);
 int hammer2_inode_chain_flush(hammer2_inode_t *ip);
 int hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen);
@@ -1789,6 +1814,7 @@ void hammer2_volconf_update(hammer2_dev_t *hmp, int index);
 void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
                                u_int flags);
 int hammer2_vfs_sync(struct mount *mp, int waitflags);
+int hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor);
 int hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred);
 
 hammer2_pfs_t *hammer2_pfsalloc(hammer2_chain_t *chain,
diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c
index 7fbdc0d..7e9a0dd 100644
--- a/sys/vfs/hammer2/hammer2_flush.c
+++ b/sys/vfs/hammer2/hammer2_flush.c
@@ -140,11 +140,19 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
 
                if (flags & HAMMER2_TRANS_ISFLUSH) {
                        /*
-                        * Requesting flush transaction.  Wait for all
-                        * currently running transactions to finish.
-                        * Afterwords, normal transactions will be
-                        * interlocked.
+                        * Requesting flush transaction.  This interlocks
+                        * only with other flush transactions.  Note that
+                        * non-flush modifying transactions can run
+                        * concurrently, but will interlock on any inodes
+                        * that are on the SYNCQ.
                         */
+                       if (oflags & HAMMER2_TRANS_ISFLUSH) {
+                               nflags = oflags | HAMMER2_TRANS_WAITING;
+                               dowait = 1;
+                       } else {
+                               nflags = (oflags | flags) + 1;
+                       }
+#if 0
                        if (oflags & HAMMER2_TRANS_MASK) {
                                nflags = oflags | HAMMER2_TRANS_FPENDING |
                                                  HAMMER2_TRANS_WAITING;
@@ -152,6 +160,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                        } else {
                                nflags = (oflags | flags) + 1;
                        }
+#endif
                } else if (flags & HAMMER2_TRANS_BUFCACHE) {
                        /*
                         * Requesting strategy transaction from buffer-cache,
@@ -160,26 +169,13 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                         * to avoid deadlocks.
                         */
                        nflags = (oflags | flags) + 1;
-#if 0
-                       /*
-                        * (old) previous code interlocked against the main
-                        *       flush pass.
-                        */
-                       if ((oflags & (HAMMER2_TRANS_ISFLUSH |
-                                      HAMMER2_TRANS_PREFLUSH)) ==
-                           HAMMER2_TRANS_ISFLUSH) {
-                               nflags = oflags | HAMMER2_TRANS_WAITING;
-                               dowait = 1;
-                       } else {
-                               nflags = (oflags | flags) + 1;
-                       }
-#endif
                } else {
                        /*
                         * Requesting a normal modifying transaction.
-                        * Waits for any flush to finish before allowing.
-                        * Multiple modifying transactions can run
-                        * concurrently.
+                        * Does not interlock with flushes.  Multiple
+                        * modifying transactions can run concurrently.
+                        * These do not mess with the on-media topology
+                        * above the inode.
                         *
                         * If a flush is pending for more than one second
                         * but can't run because many modifying transactions
@@ -189,6 +185,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                         *       such as read, stat, readdir, etc, do
                         *       not use transactions.
                         */
+#if 0
                        if ((oflags & HAMMER2_TRANS_FPENDING) &&
                            (u_int)(ticks - pmp->trans.fticks) >= (u_int)hz) {
                                nflags = oflags | HAMMER2_TRANS_WAITING;
@@ -196,7 +193,9 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                        } else if (oflags & HAMMER2_TRANS_ISFLUSH) {
                                nflags = oflags | HAMMER2_TRANS_WAITING;
                                dowait = 1;
-                       } else {
+                       } else
+#endif
+                       {
                                nflags = (oflags | flags) + 1;
                        }
                }
@@ -251,8 +250,11 @@ hammer2_trans_done(hammer2_pfs_t *pmp, int quicksideq)
         * due to potential deadlocks, so we have to deal with them from
         * inside other nominal modifying front-end transactions.
         */
-       if (quicksideq && pmp->sideq_count > (pmp->inum_count >> 3))
+       if (quicksideq && pmp->sideq_count > (pmp->inum_count >> 3) && pmp->mp)
+               speedup_syncer(pmp->mp);
+#if 0
                hammer2_inode_run_sideq(pmp, 0);
+#endif
 
        /*
         * Clean-up the transaction
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
index dc7eb0b..c4818bb 100644
--- a/sys/vfs/hammer2/hammer2_inode.c
+++ b/sys/vfs/hammer2/hammer2_inode.c
@@ -68,24 +68,19 @@ static
 void
 hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
 {
-       hammer2_inode_sideq_t *ipul;
        hammer2_pfs_t *pmp = ip->pmp;
 
-       if ((ip->flags & HAMMER2_INODE_ONSIDEQ) == 0) {
-               ipul = kmalloc(sizeof(*ipul), pmp->minode,
-                              M_WAITOK | M_ZERO);
-               ipul->ip = ip;
+       if ((ip->flags & (HAMMER2_INODE_SYNCQ | HAMMER2_INODE_SIDEQ)) == 0) {
                hammer2_spin_ex(&pmp->list_spin);
-               if ((ip->flags & HAMMER2_INODE_ONSIDEQ) == 0) {
+               if ((ip->flags & (HAMMER2_INODE_SYNCQ |
+                                 HAMMER2_INODE_SIDEQ)) == 0) {
                        hammer2_inode_ref(ip);
-                       atomic_set_int(&ip->flags,
-                                      HAMMER2_INODE_ONSIDEQ);
-                       TAILQ_INSERT_TAIL(&pmp->sideq, ipul, entry);
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
+                       TAILQ_INSERT_TAIL(&pmp->sideq, ip, entry);
                        ++pmp->sideq_count;
                        hammer2_spin_unex(&pmp->list_spin);
                } else {
                        hammer2_spin_unex(&pmp->list_spin);
-                       kfree(ipul, pmp->minode);
                }
        }
 }
@@ -108,6 +103,10 @@ hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
  *       vnode reclamation code to avoid unnecessary I/O (particularly when
  *       disposing of hundreds of thousands of cached vnodes).
  *
+ * When an exclusive lock is obtained on an inode that is on the SYNCQ,
+ * HAMMER2 will automatically move the inode to the front of the queue before
+ * blocking to avoid long stalls against filesystem sync operations.
+ *
  * The inode locking function locks the inode itself, resolves any stale
  * chains in the inode's cluster, and allocates a fresh copy of the
  * cluster with 1 ref and all the underlying chains locked.
@@ -130,19 +129,65 @@ hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
 void
 hammer2_inode_lock(hammer2_inode_t *ip, int how)
 {
+       hammer2_pfs_t *pmp;
+
        hammer2_inode_ref(ip);
+       pmp = ip->pmp;
 
        /* 
-        * Inode structure mutex
+        * Inode structure mutex - Shared lock
         */
        if (how & HAMMER2_RESOLVE_SHARED) {
                /*how |= HAMMER2_RESOLVE_RDONLY; not used */
                hammer2_mtx_sh(&ip->lock);
+               return;
+       }
+
+       /*
+        * Inode structure mutex - Exclusive lock
+        *
+        * The exclusive lock must wait for inodes on SYNCQ to flush
+        * first, to ensure that meta-data dependencies such as the
+        * nlink count and related directory entries are not split
+        * across flushes.
+        */
+       hammer2_mtx_ex(&ip->lock);
+       while ((ip->flags & HAMMER2_INODE_SYNCQ) && pmp) {
+               hammer2_spin_ex(&pmp->list_spin);
+               if (ip->flags & HAMMER2_INODE_SYNCQ) {
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
+                       TAILQ_REMOVE(&pmp->syncq, ip, entry);
+                       TAILQ_INSERT_HEAD(&pmp->syncq, ip, entry);
+                       hammer2_spin_unex(&pmp->list_spin);
+                       tsleep_interlock(&ip->flags, 0);
+                       hammer2_mtx_unlock(&ip->lock);
+                       tsleep(&ip->flags, PINTERLOCKED, "h2sync", 0);
+                       hammer2_mtx_ex(&ip->lock);
+                       continue;
+               }
+               hammer2_spin_unex(&pmp->list_spin);
+               break;
+       }
+}
+
+/*
+ * Release an inode lock.  If another thread is blocked on SYNCQ_WAKEUP
+ * we wake it up.
+ */
+void
+hammer2_inode_unlock(hammer2_inode_t *ip)
+{
+       if (ip->flags & HAMMER2_INODE_SYNCQ_WAKEUP) {
+               atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
+               hammer2_mtx_unlock(&ip->lock);
+               wakeup(&ip->flags);
        } else {
-               hammer2_mtx_ex(&ip->lock);
+               hammer2_mtx_unlock(&ip->lock);
        }
+       hammer2_inode_drop(ip);
 }
 
+
 /*
  * Select a chain out of an inode's cluster and lock it.
  *
@@ -221,13 +266,6 @@ hammer2_inode_chain_and_parent(hammer2_inode_t *ip, int clindex,
        return chain;
 }
 
-void
-hammer2_inode_unlock(hammer2_inode_t *ip)
-{
-       hammer2_mtx_unlock(&ip->lock);
-       hammer2_inode_drop(ip);
-}
-
 /*
  * Temporarily release a lock held shared or exclusive.  Caller must
  * hold the lock shared or exclusive on call and lock will be released
@@ -544,10 +582,17 @@ hammer2_igetv(hammer2_inode_t *ip, int *errorp)
 }
 
 /*
- * Returns the inode associated with the passed-in cluster, creating the
- * inode if necessary and synchronizing it to the passed-in cluster otherwise.
- * When synchronizing, if idx >= 0, only cluster index (idx) is synchronized.
- * Otherwise the whole cluster is synchronized.
+ * Returns the inode associated with the passed-in cluster, allocating a new
+ * hammer2_inode structure if necessary, then synchronizing it to the passed
+ * xop cluster.  When synchronizing, if idx >= 0, only cluster index (idx)
+ * is synchronized.  Otherwise the whole cluster is synchronized.  inum will
+ * be extracted from the passed-in xop and the inum argument will be ignored.
+ *
+ * If xop is passed as NULL then a new hammer2_inode is allocated with the
+ * specified inum, and returned.   For normal inodes, the inode will be
+ * indexed in memory and if it already exists the existing ip will be
+ * returned instead of allocating a new one.  The superroot and PFS inodes
+ * are not indexed in memory.
  *
  * The passed-in cluster must be locked and will remain locked on return.
  * The returned inode will be locked and the caller may dispose of both
@@ -560,8 +605,8 @@ hammer2_igetv(hammer2_inode_t *ip, int *errorp)
  * On return the inode is locked with the supplied cluster.
  */
 hammer2_inode_t *
-hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
-                 hammer2_xop_head_t *xop, int idx)
+hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_xop_head_t *xop,
+                 hammer2_tid_t inum, int idx)
 {
        hammer2_inode_t *nip;
        const hammer2_inode_data_t *iptmp;
@@ -579,36 +624,38 @@ hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
         *
         * Cluster can be NULL during the initial pfs allocation.
         */
-again:
-       while (xop) {
+       if (xop) {
                iptmp = &hammer2_xop_gdata(xop)->ipdata;
-               nip = hammer2_inode_lookup(pmp, iptmp->meta.inum);
+               inum = iptmp->meta.inum;
                hammer2_xop_pdata(xop);
-               if (nip == NULL)
-                       break;
-
-               hammer2_mtx_ex(&nip->lock);
-
+       }
+again:
+       nip = hammer2_inode_lookup(pmp, inum);
+       if (nip) {
                /*
                 * Handle SMP race (not applicable to the super-root spmp
                 * which can't index inodes due to duplicative inode numbers).
                 */
+               hammer2_mtx_ex(&nip->lock);
                if (pmp->spmp_hmp == NULL &&
                    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
                        hammer2_mtx_unlock(&nip->lock);
                        hammer2_inode_drop(nip);
-                       continue;
+                       goto again;
+               }
+               if (xop) {
+                       if (idx >= 0)
+                               hammer2_inode_repoint_one(nip, &xop->cluster,
+                                                         idx);
+                       else
+                               hammer2_inode_repoint(nip, NULL, &xop->cluster);
                }
-               if (idx >= 0)
-                       hammer2_inode_repoint_one(nip, &xop->cluster, idx);
-               else
-                       hammer2_inode_repoint(nip, NULL, &xop->cluster);
-
                return nip;
        }
 
        /*
-        * We couldn't find the inode number, create a new inode.
+        * We couldn't find the inode number.  Create a new inode, try to
+        * insert it, and handle any insertion races.
         */
        nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
        spin_init(&nip->cluster_spin, "h2clspin");
@@ -632,9 +679,9 @@ again:
                atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
                hammer2_inode_repoint(nip, NULL, &xop->cluster);
        } else {
-               nip->meta.inum = 1;             /* PFS inum is always 1 XXX */
+               nip->meta.inum = inum;          /* PFS inum is always 1 XXX */
                /* mtime will be updated when a cluster is available */
-               atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);/*XXX*/
+               atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);    /*XXX*/
        }
 
        nip->pmp = pmp;
@@ -664,61 +711,39 @@ again:
                ++pmp->inum_count;
                hammer2_spin_unex(&pmp->inum_spin);
        }
-
        return (nip);
 }
 
 /*
- * MESSY! CLEANUP!
- *
- * Create a new inode using the vattr to figure out the type.  A non-zero
- * type field overrides vattr.  We need the directory to set iparent or to
- * use when the inode is directly embedded in a directory (typically super-root
- * entries), but note that this really only applies OBJTYPE_DIRECTORY as
- * non-directory inodes can be hardlinked.
+ * Create a PFS inode under the superroot.  This function will create the
+ * inode, its media chains, and also insert it into the media.
  *
- * If no error occurs the new inode is returned, otherwise NULL is returned.
- * It is possible for an error to create a junk inode and then fail later.
- * It will attempt to delete the junk inode and return NULL in this situation.
- *
- * If vap and/or cred are NULL the related fields are not set and the
- * inode type defaults to a directory.  This is used when creating PFSs
- * under the super-root, so the inode number is set to 1 in this case.
- *
- * dip is not locked on entry.
- *
- * NOTE: This function is used to create all manners of inodes, including
- *      super-root entries for snapshots and PFSs.  When used to create a
- *      snapshot the inode will be temporarily associated with the spmp.
- *
- * NOTE: When creating a normal file or directory the name/name_len/lhc
- *      is optional, but is typically specified to make debugging and
- *      recovery easeier.
+ * Caller must be in a flush transaction because we are inserting the inode
+ * onto the media.
  */
 hammer2_inode_t *
-hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
-                    struct vattr *vap, struct ucred *cred,
-                    const uint8_t *name, size_t name_len, hammer2_key_t lhc,
-                    hammer2_key_t inum,
-                    uint8_t type, uint8_t target_type,
-                    int flags, int *errorp)
+hammer2_inode_create_pfs(hammer2_pfs_t *spmp,
+                    const uint8_t *name, size_t name_len,
+                    int *errorp)
 {
        hammer2_xop_create_t *xop;
+       hammer2_inode_t *pip;
        hammer2_inode_t *nip;
        int error;
-       uid_t xuid;
        uuid_t pip_uid;
        uuid_t pip_gid;
        uint32_t pip_mode;
        uint8_t pip_comp_algo;
        uint8_t pip_check_algo;
        hammer2_tid_t pip_inum;
+       hammer2_key_t lhc;
 
-       if (name)
-               lhc = hammer2_dirhash(name, name_len);
-       *errorp = 0;
+       pip = spmp->iroot;
        nip = NULL;
 
+       lhc = hammer2_dirhash(name, name_len);
+       *errorp = 0;
+
        /*
         * Locate the inode or indirect block to create the new
         * entry in.  At the same time check for key collisions
@@ -729,7 +754,7 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
         * two different creates can end up with the same lhc so we
         * cannot depend on the OS to prevent the collision.
         */
-       hammer2_inode_lock(dip, 0);
+       hammer2_inode_lock(pip, 0);
 
        pip_uid = pip->meta.uid;
        pip_gid = pip->meta.gid;
@@ -739,15 +764,14 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
        pip_inum = (pip == pip->pmp->iroot) ? 1 : pip->meta.inum;
 
        /*
-        * If name specified, locate an unused key in the collision space.
-        * Otherwise use the passed-in lhc directly.
+        * Locate an unused key in the collision space.
         */
-       if (name) {
+       {
                hammer2_xop_scanlhc_t *sxop;
                hammer2_key_t lhcbase;
 
                lhcbase = lhc;
-               sxop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
+               sxop = hammer2_xop_alloc(pip, HAMMER2_XOP_MODIFYING);
                sxop->lhc = lhc;
                hammer2_xop_start(&sxop->head, &hammer2_scanlhc_desc);
                while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
@@ -772,28 +796,130 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
        /*
         * Create the inode with the lhc as the key.
         */
-       xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
+       xop = hammer2_xop_alloc(pip, HAMMER2_XOP_MODIFYING);
        xop->lhc = lhc;
-       xop->flags = flags;
+       xop->flags = HAMMER2_INSERT_PFSROOT;
        bzero(&xop->meta, sizeof(xop->meta));
 
-       if (vap) {
-               xop->meta.type = hammer2_get_obj_type(vap->va_type);
+       xop->meta.type = HAMMER2_OBJTYPE_DIRECTORY;
+       xop->meta.inum = 1;
+       xop->meta.iparent = pip_inum;
 
-               switch (xop->meta.type) {
-               case HAMMER2_OBJTYPE_CDEV:
-               case HAMMER2_OBJTYPE_BDEV:
-                       xop->meta.rmajor = vap->va_rmajor;
-                       xop->meta.rminor = vap->va_rminor;
-                       break;
-               default:
-                       break;
-               }
-               type = xop->meta.type;
-       } else {
-               xop->meta.type = type;
-               xop->meta.target_type = target_type;
+       /* Inherit parent's inode compression mode. */
+       xop->meta.comp_algo = pip_comp_algo;
+       xop->meta.check_algo = pip_check_algo;
+       xop->meta.version = HAMMER2_INODE_VERSION_ONE;
+       hammer2_update_time(&xop->meta.ctime);
+       xop->meta.mtime = xop->meta.ctime;
+       xop->meta.mode = 0755;
+       xop->meta.nlinks = 1;
+
+       /*
+        * Regular files and softlinks allow a small amount of data to be
+        * directly embedded in the inode.  This flag will be cleared if
+        * the size is extended past the embedded limit.
+        */
+       if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
+           xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
+               xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
+       }
+       hammer2_xop_setname(&xop->head, name, name_len);
+       xop->meta.name_len = name_len;
+       xop->meta.name_key = lhc;
+       KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
+
+       hammer2_xop_start(&xop->head, &hammer2_inode_create_desc);
+
+       error = hammer2_xop_collect(&xop->head, 0);
+#if INODE_DEBUG
+       kprintf("CREATE INODE %*.*s\n",
+               (int)name_len, (int)name_len, name);
+#endif
+
+       if (error) {
+               *errorp = error;
+               goto done;
        }
+
+       /*
+        * Set up the new inode if not a hardlink pointer.
+        *
+        * NOTE: *_get() integrates chain's lock into the inode lock.
+        *
+        * NOTE: Only one new inode can currently be created per
+        *       transaction.  If the need arises we can adjust
+        *       hammer2_trans_init() to allow more.
+        *
+        * NOTE: nipdata will have chain's blockset data.
+        */
+       nip = hammer2_inode_get(pip->pmp, &xop->head, -1, -1);
+       nip->comp_heuristic = 0;
+done:
+       hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+done2:
+       hammer2_inode_unlock(pip);
+
+       return (nip);
+}
+
+hammer2_inode_t *
+hammer2_inode_create_normal(hammer2_inode_t *pip,
+                           struct vattr *vap, struct ucred *cred,
+                           hammer2_key_t inum, int *errorp)
+{
+       hammer2_xop_create_t *xop;
+       hammer2_inode_t *dip;
+       hammer2_inode_t *nip;
+       int error;
+       uid_t xuid;
+       uuid_t pip_uid;
+       uuid_t pip_gid;
+       uint32_t pip_mode;
+       uint8_t pip_comp_algo;
+       uint8_t pip_check_algo;
+       hammer2_tid_t pip_inum;
+       uint8_t type;
+
+       dip = pip->pmp->iroot;
+       KKASSERT(dip != NULL);
+       nip = NULL;
+
+       *errorp = 0;
+
+       hammer2_inode_lock(dip, 0);
+
+       pip_uid = pip->meta.uid;
+       pip_gid = pip->meta.gid;
+       pip_mode = pip->meta.mode;
+       pip_comp_algo = pip->meta.comp_algo;
+       pip_check_algo = pip->meta.check_algo;
+       pip_inum = (pip == pip->pmp->iroot) ? 1 : pip->meta.inum;
+
+       /*
+        * Create the inode using (inum) as the key.
+        */
+       xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
+       xop->lhc = inum;
+       xop->flags = 0;
+       bzero(&xop->meta, sizeof(xop->meta));
+       KKASSERT(vap);
+
+       /*
+        * Setup the inode meta-data
+        */
+       xop->meta.type = hammer2_get_obj_type(vap->va_type);
+
+       switch (xop->meta.type) {
+       case HAMMER2_OBJTYPE_CDEV:
+       case HAMMER2_OBJTYPE_BDEV:
+               xop->meta.rmajor = vap->va_rmajor;
+               xop->meta.rminor = vap->va_rminor;
+               break;
+       default:
+               break;
+       }
+       type = xop->meta.type;
+
        xop->meta.inum = inum;
        xop->meta.iparent = pip_inum;
        
@@ -803,35 +929,26 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
        xop->meta.version = HAMMER2_INODE_VERSION_ONE;
        hammer2_update_time(&xop->meta.ctime);
        xop->meta.mtime = xop->meta.ctime;
-       if (vap)
-               xop->meta.mode = vap->va_mode;
+       xop->meta.mode = vap->va_mode;
        xop->meta.nlinks = 1;
-       if (vap) {
-               if (dip->pmp) {
-                       xuid = hammer2_to_unix_xid(&pip_uid);
-                       xuid = vop_helper_create_uid(dip->pmp->mp,
-                                                    pip_mode,
-                                                    xuid,
-                                                    cred,
-                                                    &vap->va_mode);
-               } else {
-                       /* super-root has no dip and/or pmp */
-                       xuid = 0;
-               }
-               if (vap->va_vaflags & VA_UID_UUID_VALID)
-                       xop->meta.uid = vap->va_uid_uuid;
-               else if (vap->va_uid != (uid_t)VNOVAL)
-                       hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
-               else
-                       hammer2_guid_to_uuid(&xop->meta.uid, xuid);
 
-               if (vap->va_vaflags & VA_GID_UUID_VALID)
-                       xop->meta.gid = vap->va_gid_uuid;
-               else if (vap->va_gid != (gid_t)VNOVAL)
-                       hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
-               else
-                       xop->meta.gid = pip_gid;
-       }
+       xuid = hammer2_to_unix_xid(&pip_uid);
+       xuid = vop_helper_create_uid(dip->pmp->mp, pip_mode,
+                                    xuid, cred,
+                                    &vap->va_mode);
+       if (vap->va_vaflags & VA_UID_UUID_VALID)
+               xop->meta.uid = vap->va_uid_uuid;
+       else if (vap->va_uid != (uid_t)VNOVAL)
+               hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
+       else
+               hammer2_guid_to_uuid(&xop->meta.uid, xuid);
+
+       if (vap->va_vaflags & VA_GID_UUID_VALID)
+               xop->meta.gid = vap->va_gid_uuid;
+       else if (vap->va_gid != (gid_t)VNOVAL)
+               hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
+       else
+               xop->meta.gid = pip_gid;
 
        /*
         * Regular files and softlinks allow a small amount of data to be
@@ -842,16 +959,13 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
            xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
                xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
        }
-       if (name) {
-               hammer2_xop_setname(&xop->head, name, name_len);
-       } else {
-               name_len = hammer2_xop_setname_inum(&xop->head, inum);
-               KKASSERT(lhc == inum);
-       }
-       xop->meta.name_len = name_len;
-       xop->meta.name_key = lhc;
-       KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
 
+       xop->meta.name_len = hammer2_xop_setname_inum(&xop->head, inum);
+       xop->meta.name_key = inum;
+
+       /*
+        * Create the inode media chains
+        */
        hammer2_xop_start(&xop->head, &hammer2_inode_create_desc);
 
        error = hammer2_xop_collect(&xop->head, 0);
@@ -876,11 +990,10 @@ hammer2_inode_create(hammer2_inode_t *dip, hammer2_inode_t *pip,
         *
         * NOTE: nipdata will have chain's blockset data.
         */
-       nip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
+       nip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
        nip->comp_heuristic = 0;
 done:
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
-done2:
        hammer2_inode_unlock(dip);
 
        return (nip);
@@ -1199,6 +1312,10 @@ killit:
  * sync will also handle synchronizing the inode meta-data.  If no vnode
  * is present we must ensure that the inode is on pmp->sideq.
  *
+ * NOTE: We must always queue the inode to the sideq.  This allows H2 to
+ *      shortcut vsyncscan() and flush inodes and their related vnodes
+ *      in two stages.  H2 still calls vfsync() for each vnode.
+ *
  * NOTE: No mtid (modify_tid) is passed into this routine.  The caller is
  *      only modifying the in-memory inode.  A modify_tid is synchronized
  *      later when the inode gets flushed.
@@ -1210,11 +1327,10 @@ void
 hammer2_inode_modify(hammer2_inode_t *ip)
 {
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-       if (ip->vp) {
+       if (ip->vp)
                vsetisdirty(ip->vp);
-       } else if (ip->pmp && (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0) {
+       if (ip->pmp && (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0)
                hammer2_inode_delayed_sideq(ip);
-       }
 }
 
 /*
@@ -1297,6 +1413,7 @@ hammer2_inode_chain_flush(hammer2_inode_t *ip)
        return error;
 }
 
+#if 0
 /*
  * The normal filesystem sync no longer has visibility to an inode structure
  * after its vnode has been reclaimed.  In this situation a dirty inode may
@@ -1395,3 +1512,4 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
        }
        hammer2_spin_unex(&pmp->list_spin);
 }
+#endif
diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c
index 7c41bcd..8e78a9a 100644
--- a/sys/vfs/hammer2/hammer2_ioctl.c
+++ b/sys/vfs/hammer2/hammer2_ioctl.c
@@ -616,15 +616,12 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
        if (hammer2_ioctl_pfs_lookup(ip, pfs) == 0)
                return(EEXIST);
 
-       hammer2_trans_init(hmp->spmp, 0);
+       hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
        mtid = hammer2_trans_sub(hmp->spmp);
-       nip = hammer2_inode_create(hmp->spmp->iroot, hmp->spmp->iroot,
-                                  NULL, NULL,
-                                  pfs->name, strlen(pfs->name), 0,
-                                  1, HAMMER2_OBJTYPE_DIRECTORY, 0,
-                                  HAMMER2_INSERT_PFSROOT, &error);
+       nip = hammer2_inode_create_pfs(hmp->spmp, pfs->name, strlen(pfs->name),
+                                      &error);
        if (error == 0) {
-               nip->flags |= HAMMER2_INODE_NOSIDEQ;
+               /* nip->flags |= HAMMER2_INODE_NOSIDEQ; */
                hammer2_inode_modify(nip);
                nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
                error = hammer2_chain_modify(nchain, mtid, 0, 0);
@@ -766,7 +763,7 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
 
 #if 0
         if (error == 0) {
-                ip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
+                ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                 if (ip) {
                         hammer2_inode_unlink_finisher(ip, 0);
@@ -795,7 +792,6 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
        hammer2_tid_t   mtid;
        size_t name_len;
        hammer2_key_t lhc;
-       struct vattr vat;
        int error;
 #if 0
        uuid_t opfs_clid;
@@ -850,15 +846,8 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
         * chain_duplicate() but it becomes difficult to disentangle
         * the shared core so for now just brute-force it.
         */
-       VATTR_NULL(&vat);
-       vat.va_type = VDIR;
-       vat.va_mode = 0755;
        hammer2_chain_unlock(chain);
-       nip = hammer2_inode_create(hmp->spmp->iroot, hmp->spmp->iroot,
-                                  &vat, proc0.p_ucred,
-                                  pfs->name, name_len, 0,
-                                  1, 0, 0,
-                                  HAMMER2_INSERT_PFSROOT, &error);
+       nip = hammer2_inode_create_pfs(hmp->spmp, pfs->name, name_len, &error);
        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
        ripdata = &chain->data->ipdata;
 
@@ -868,7 +857,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
                hammer2_inode_data_t *wipdata;
                hammer2_key_t   starting_inum;
 
-               nip->flags |= HAMMER2_INODE_NOSIDEQ;
+               /* nip->flags |= HAMMER2_INODE_NOSIDEQ; */
                hammer2_inode_modify(nip);
                nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
                error = hammer2_chain_modify(nchain, mtid, 0, 0);
diff --git a/sys/vfs/hammer2/hammer2_synchro.c b/sys/vfs/hammer2/hammer2_synchro.c
index c79480c..e75d33d 100644
--- a/sys/vfs/hammer2/hammer2_synchro.c
+++ b/sys/vfs/hammer2/hammer2_synchro.c
@@ -601,7 +601,7 @@ hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
                        xop->head.cluster.array[idx].flags =
                                                        HAMMER2_CITEM_INVALID;
                        xop->head.cluster.array[idx].chain = chain;
-                       nip = hammer2_inode_get(pmp, ip, &xop->head, idx);
+                       nip = hammer2_inode_get(pmp, &xop->head, -1, idx);
                        xop->head.cluster.array[idx].chain = NULL;
 
                        hammer2_inode_ref(nip);
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index 7e87edc..76950e2 100644
--- a/sys/vfs/hammer2/hammer2_vfsops.c
+++ b/sys/vfs/hammer2/hammer2_vfsops.c
@@ -213,7 +213,9 @@ static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);
 
 static int hammer2_install_volume_header(hammer2_dev_t *hmp);
+#if 0
 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
+#endif
 
 static void hammer2_update_pmps(hammer2_dev_t *hmp);
 
@@ -406,6 +408,7 @@ hammer2_pfsalloc(hammer2_chain_t *chain,
                spin_init(&pmp->lru_spin, "h2lru");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->sideq);
+               TAILQ_INIT(&pmp->syncq);
                TAILQ_INIT(&pmp->lru_list);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");
 
@@ -442,7 +445,7 @@ hammer2_pfsalloc(hammer2_chain_t *chain,
         * Create the PFS's root inode and any missing XOP helper threads.
         */
        if ((iroot = pmp->iroot) == NULL) {
-               iroot = hammer2_inode_get(pmp, NULL, NULL, -1);
+               iroot = hammer2_inode_get(pmp, NULL, 1, -1);
                if (ripdata)
                        iroot->meta = ripdata->meta;
                pmp->iroot = iroot;
@@ -738,10 +741,7 @@ again:
        TAILQ_FOREACH(pmp, wlist, mntentry) {
                if ((iroot = pmp->iroot) == NULL)
                        continue;
-               hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
-               hammer2_inode_run_sideq(pmp, 1);
-               hammer2_bioq_sync(pmp);
-               hammer2_trans_done(pmp, 0);
+               hammer2_vfs_sync_pmp(pmp, MNT_WAIT);
 
                /*
                 * Determine if this PFS is affected.  If it is we must
@@ -850,6 +850,7 @@ again:
                        /*
                         * Free the pmp and restart the loop
                         */
+                       KKASSERT(TAILQ_EMPTY(&pmp->sideq));
                        hammer2_pfsfree(pmp);
                        goto again;
                }
@@ -1291,7 +1292,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hammer2_dummy_xop_from_chain(&xop, schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
-               spmp->iroot = hammer2_inode_get(spmp, NULL, &xop, -1);
+               spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->meta.pfs_type;
                spmp->pfs_hmps[0] = hmp;
@@ -1884,7 +1885,7 @@ hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
        error = hammer2_xop_collect(&xop->head, 0);
 
        if (error == 0)
-               ip = hammer2_inode_get(pmp, NULL, &xop->head, -1);
+               ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
 
        if (ip) {
@@ -2389,14 +2390,26 @@ hammer2_fixup_pfses(hammer2_dev_t *hmp)
 int
 hammer2_vfs_sync(struct mount *mp, int waitfor)
 {
+       int error;
+
+       error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor);
+
+       return error;
+}
+
+int
+hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
+{
+       struct mount *mp;
        hammer2_xop_flush_t *xop;
-       struct hammer2_sync_info info;
+       /*struct hammer2_sync_info info;*/
        hammer2_inode_t *iroot;
-       hammer2_pfs_t *pmp;
+       hammer2_inode_t *ip;
+       struct vnode *vp;
        int flags;
        int error;
 
-       pmp = MPTOPMP(mp);
+       mp = pmp->mp;
        iroot = pmp->iroot;
        KKASSERT(iroot);
        KKASSERT(iroot->pmp == pmp);
@@ -2418,59 +2431,106 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                flags |= VMSC_ONEPASS;
 
        /*
-        * Flush vnodes individually using a normal transaction to avoid
-        * stalling any concurrent operations.  This will flush the related
-        * buffer cache buffers and inodes to the media.
-        *
-        * For efficiency do an async pass before making sure with a
-        * synchronous pass on all related buffer cache buffers.
+        * Move all inodes on sideq to syncq.  This will clear sideq.
+        * This should represent all flushable inodes.  These inodes
+        * will already have refs due to being on syncq or sideq.
         */
-       hammer2_trans_init(pmp, 0);
-
-       info.error = 0;
-
-       info.waitfor = MNT_NOWAIT;
-       info.pass = 1;
-       vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
-
-       /*
-        * Now do two passes making sure we get everything.  The first pass
-        * vfsync()s dirty vnodes.  The second pass waits for their I/O's
-        * to finish and cleans up the dirty flag on the vnode.
-        */
-       info.pass = 1;
-       info.waitfor = MNT_WAIT;
-       vsyncscan(mp, flags, hammer2_sync_scan2, &info);
-
-       info.pass = 2;
-       info.waitfor = MNT_WAIT;
-       vsyncscan(mp, flags, hammer2_sync_scan2, &info);
-
-       /*
-        * We must also run the sideq to handle any disconnected inodes
-        * as the vnode scan will not see these.
-        */
-       hammer2_inode_run_sideq(pmp, 1);
-       hammer2_trans_done(pmp, 0);
+       hammer2_spin_ex(&pmp->list_spin);
+       TAILQ_FOREACH(ip, &pmp->sideq, entry) {
+               KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
+               atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
+               atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ);
+       }
+       TAILQ_CONCAT(&pmp->syncq, &pmp->sideq, entry);
+       pmp->sideq_count = 0;
+       hammer2_spin_unex(&pmp->list_spin);
 
        /*
-        * Start our flush transaction and flush the root topology down to
-        * the inodes, but not the inodes themselves (which we already flushed
-        * above).  Any concurrent activity effecting inode contents will not
+        * Flush transactions only interlock with other flush transactions.
+        * Any concurrent frontend operations will block when obtaining an
+        * exclusive inode lock on any inode on SYNCQ, and we will block here
+        * when we ourselves obtain the exclusive lock.
         *
-        * The flush sequence will
-        *
-        * NOTE! It is still possible for the paging code to push pages
-        *       out via a UIO_NOCOPY hammer2_vop_write() during the main
-        *       flush.
+        * Now run through all inodes on syncq.
         */
        hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
+       ip = NULL;
+       for (;;) {
+               if (ip == NULL) {
+                       hammer2_spin_ex(&pmp->list_spin);
+                       ip = TAILQ_FIRST(&pmp->syncq);
+                       if (ip == NULL) {
+                               hammer2_spin_unex(&pmp->list_spin);
+                               break;
+                       }
+                       TAILQ_REMOVE(&pmp->syncq, ip, entry);
+                       atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ);
+                       hammer2_spin_unex(&pmp->list_spin);
+                       /* leaves ip with a ref from being on SYNCQ */
+               }
 
-       /*
-        * sync dirty vnodes again while in the flush transaction.  This is
-        * currently an expensive shim to makre sure the logical topology is
-        * completely consistent before we flush the volume header.
-        */
+               /*
+                * We hold a ref on ip, SYNCQ flag has been cleared, and
+                * since we own the flush transaction it cannot get set
+                * again (though the ip can be put on SIDEQ again).
+                *
+                * Acquire the vnode and inode exclusively.  Be careful
+                * of order.
+                */
+               if ((vp = ip->vp) != NULL) {
+                       vhold(vp);
+                       if (vget(vp, LK_EXCLUSIVE)) {
+                                vdrop(vp);
+                               hammer2_inode_drop(ip);
+                               continue;
+                       }
+                       vdrop(vp);
+                       hammer2_mtx_ex(&ip->lock);
+                       if (ip->vp != vp) {
+                               hammer2_mtx_unlock(&ip->lock);  /* unlock */
+                               vput(vp);
+                               continue;                       /* retry w/ip */
+                       }
+               } else {
+                       hammer2_mtx_ex(&ip->lock);
+                       if (ip->vp != NULL) {
+                               hammer2_mtx_unlock(&ip->lock);  /* unlock */
+                               continue;                       /* retry w/ip */
+                       }
+               }
+
+               /*
+                * Ok, we hold the inode and vnode exclusively locked,
+                * inside a flush transaction, and can now flush them.
+                *
+                * vp token needed for v_rbdirty_tree check / vclrisdirty
+                * sequencing.  Though since we hold the vnode exclusively
+                * we should not actually need the token in this case.
+                */
+               if (vp) {
+                       vfsync(vp, MNT_WAIT, 1, NULL, NULL);
+                       bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */
+                       lwkt_gettoken(&vp->v_token);
+               }
+               hammer2_inode_chain_sync(ip);
+               hammer2_inode_chain_flush(ip);
+               if (vp) {
+                       if ((ip->flags & (HAMMER2_INODE_MODIFIED |
+                                         HAMMER2_INODE_RESIZED |
+                                         HAMMER2_INODE_DIRTYDATA)) == 0 &&
+                           RB_EMPTY(&vp->v_rbdirty_tree) &&
+                           !bio_track_active(&vp->v_track_write)) {
+                               vclrisdirty(vp);
+                       }
+                       lwkt_reltoken(&vp->v_token);
+                       vput(vp);
+               }
+               hammer2_inode_unlock(ip);       /* unlock+drop */
+               ip = NULL;                      /* next ip */
+       }
+       hammer2_bioq_sync(pmp);
+
+#if 0
        info.pass = 1;
        info.waitfor = MNT_WAIT;
        vsyncscan(mp, flags, hammer2_sync_scan2, &info);
@@ -2478,8 +2538,15 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        info.pass = 2;
        info.waitfor = MNT_WAIT;
        vsyncscan(mp, flags, hammer2_sync_scan2, &info);
+#endif
 
        /*
+        * Generally speaking we now want to flush the media topology from
+        * the iroot through to the inodes.  The flush stops at any inode
+        * boundary, which allows the frontend to continue running concurrent
+        * modifying operations on inodes (including kernel flushes of
+        * buffers) without interfering with the main sync.
+        *
         * Use the XOP interface to concurrently flush all nodes to
         * synchronize the PFSROOT subtopology to the media.  A standard
         * end-of-scan ENOENT error indicates cluster sufficiency.
@@ -2489,7 +2556,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         *
         * XXX For now wait for all flushes to complete.
         */
-       if (iroot) {
+       if (mp && iroot) {
                /*
                 * If unmounting try to flush everything including any
                 * sub-trees under inodes, just in case there is dangling
@@ -2520,6 +2587,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        return (error);
 }
 
+#if 0
 /*
  * Sync passes.
  *
@@ -2594,6 +2662,7 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
 #endif
        return(0);
 }
+#endif
 
 static
 int
diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c
index 9968ba0..c0afe80 100644
--- a/sys/vfs/hammer2/hammer2_vnops.c
+++ b/sys/vfs/hammer2/hammer2_vnops.c
@@ -167,34 +167,34 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         * when vfsync() is called.  However, that requires a vnode.
         *
         * When the vnode is disassociated we must keep track of any modified
-        * inode via the sideq so that it is properly flushed.  We cannot
-        * safely synchronize the inode from inside the reclaim due to
-        * potentially deep locks held as-of when the reclaim occurs.
+        * inode to be flushed in a later filesystem sync.  We cannot safely
+        * synchronize the inode from inside the reclaim due to potentially
+        * deep locks held as-of when the reclaim occurs.
         * Interactions and potential deadlocks abound.
+        *
+        * Place the inode on SIDEQ, unless it is already on the SIDEQ or
+        * SYNCQ.  It will be transferred to the SYNCQ in the next filesystem
+        * sync.  It is not safe to try to shoehorn it into the current fs
+        * sync.
         */
        if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
                          HAMMER2_INODE_MODIFIED |
                          HAMMER2_INODE_RESIZED |
                          HAMMER2_INODE_DIRTYDATA)) &&
            (ip->flags & HAMMER2_INODE_ISDELETED) == 0) {
-               hammer2_inode_sideq_t *ipul;
-
-               ipul = kmalloc(sizeof(*ipul), pmp->minode, M_WAITOK | M_ZERO);
-               ipul->ip = ip;
-
                hammer2_spin_ex(&pmp->list_spin);
-               if ((ip->flags & HAMMER2_INODE_ONSIDEQ) == 0) {
+               if ((ip->flags & (HAMMER2_INODE_SYNCQ |
+                                 HAMMER2_INODE_SIDEQ)) == 0) {
                        /* ref -> sideq */
-                       atomic_set_int(&ip->flags, HAMMER2_INODE_ONSIDEQ);
-                       TAILQ_INSERT_TAIL(&pmp->sideq, ipul, entry);
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
+                       TAILQ_INSERT_TAIL(&pmp->sideq, ip, entry);
                        ++pmp->sideq_count;
                        hammer2_spin_unex(&pmp->list_spin);
+                       /* retain ip ref for SIDEQ linkage */
                } else {
                        hammer2_spin_unex(&pmp->list_spin);
-                       kfree(ipul, pmp->minode);
                        hammer2_inode_drop(ip);         /* vp ref */
                }
-               /* retain ref from vp for ipul */
        } else {
                hammer2_inode_drop(ip);                 /* vp ref */
        }
@@ -1291,7 +1291,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
        if (error) {
                ip = NULL;
        } else {
-               ip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
+               ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
        }
        hammer2_inode_unlock(dip);
 
@@ -1393,10 +1393,8 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
         * create the directory entry.  The creation of the actual inode
         * sets its nlinks to 1 which is the value we desire.
         */
-       nip = hammer2_inode_create(dip->pmp->iroot, dip, ap->a_vap, ap->a_cred,
-                                  NULL, 0, inum,
-                                  inum, 0, 0,
-                                  0, &error);
+       nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
+                                         inum, &error);
        if (error) {
                error = hammer2_error_to_errno(error);
        } else {
@@ -1599,10 +1597,8 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
         * create the directory entry.  The creation of the actual inode
         * sets its nlinks to 1 which is the value we desire.
         */
-       nip = hammer2_inode_create(dip->pmp->iroot, dip, ap->a_vap, ap->a_cred,
-                                  NULL, 0, inum,
-                                  inum, 0, 0,
-                                  0, &error);
+       nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
+                                         inum, &error);
 
        if (error) {
                error = hammer2_error_to_errno(error);
@@ -1676,10 +1672,8 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
         * Create the device inode and then create the directory entry.
         */
        inum = hammer2_trans_newinum(dip->pmp);
-       nip = hammer2_inode_create(dip->pmp->iroot, dip, ap->a_vap, ap->a_cred,
-                                  NULL, 0, inum,
-                                  inum, 0, 0,
-                                  0, &error);
+       nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
+                                         inum, &error);
        if (error == 0) {
                error = hammer2_dirent_create(dip, name, name_len,
                                              nip->meta.inum, nip->meta.type);
@@ -1754,10 +1748,8 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
         */
        inum = hammer2_trans_newinum(dip->pmp);
 
-       nip = hammer2_inode_create(dip->pmp->iroot, dip, ap->a_vap, ap->a_cred,
-                                  NULL, 0, inum,
-                                  inum, 0, 0,
-                                  0, &error);
+       nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
+                                         inum, &error);
        if (error == 0) {
                error = hammer2_dirent_create(dip, name, name_len,
                                              nip->meta.inum, nip->meta.type);
@@ -1888,7 +1880,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
        hammer2_inode_unlock(dip);
 
        if (error == 0) {
-               ip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
+               ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                if (ip) {
                        hammer2_inode_unlink_finisher(ip, isopen);
@@ -1965,7 +1957,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
        hammer2_inode_unlock(dip);
 
        if (error == 0) {
-               ip = hammer2_inode_get(dip->pmp, dip, &xop->head, -1);
+               ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                if (ip) {
                        hammer2_inode_unlink_finisher(ip, isopen);