hammer2 - refactor filesystem sync 6/N
author     Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 2 Dec 2018 20:39:48 +0000 (12:39 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 5 Dec 2018 18:28:39 +0000 (10:28 -0800)
* Dependency tracking.  Add modest cross-dependency grouping.  This code
  does not track dependencies in a graph.  Instead, it simply groups
  dependent inodes together.  This means that dependency groups can get
  rather large when, for example, lots of files are being created or
  deleted in the same directory.
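
  As a rough illustration, a minimal userland sketch of the grouping
  idea (depend_merge() and the simplified types here are hypothetical
  stand-ins, not the kernel structures; the real logic lives in
  hammer2_inode_setdepend_locked() in the hammer2_inode.c hunk below):

        #include <sys/queue.h>
        #include <stdio.h>

        struct inode;
        TAILQ_HEAD(inoq_head, inode);

        struct depend {
                struct inoq_head sideq;         /* inodes in this group */
                long count;
        };

        struct inode {
                TAILQ_ENTRY(inode) entry;
                struct depend *depend;          /* current group, if any */
                int inum;
        };

        /* Fold group src into dst; afterwards src is empty. */
        static void
        depend_merge(struct depend *dst, struct depend *src)
        {
                struct inode *ip;

                while ((ip = TAILQ_FIRST(&src->sideq)) != NULL) {
                        TAILQ_REMOVE(&src->sideq, ip, entry);
                        TAILQ_INSERT_TAIL(&dst->sideq, ip, entry);
                        ip->depend = dst;
                }
                dst->count += src->count;
                src->count = 0;
        }

        int
        main(void)
        {
                struct depend da = { TAILQ_HEAD_INITIALIZER(da.sideq), 0 };
                struct depend db = { TAILQ_HEAD_INITIALIZER(db.sideq), 0 };
                struct inode i1 = { .inum = 1 };
                struct inode i2 = { .inum = 2 };

                TAILQ_INSERT_TAIL(&da.sideq, &i1, entry);
                i1.depend = &da;
                da.count = 1;
                TAILQ_INSERT_TAIL(&db.sideq, &i2, entry);
                i2.depend = &db;
                db.count = 1;

                /* i1 and i2 become dependent: both now share one group */
                depend_merge(&da, &db);
                printf("group size %ld\n", da.count);   /* prints 2 */
                return 0;
        }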

* We retain the excellent dynamic inode reordering code for the syncq.
  When the frontend blocks on an inode that is in the syncq, the inode
  will be reordered to the front of the queue to reduce the frontend
  stall time as much as possible.
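
  The reordering is a constant-time list operation performed under
  pmp->list_spin (see the hammer2_inode_lock4() hunk below); a sketch
  using the hypothetical types from the previous example:

        /*
         * Move an inode the frontend is blocked on to the head of
         * the sync queue so the syncer flushes (and releases) it
         * first.
         */
        static void
        syncq_prioritize(struct inoq_head *syncq, struct inode *ip)
        {
                TAILQ_REMOVE(syncq, ip, entry);
                TAILQ_INSERT_HEAD(syncq, ip, entry);
        }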

* Remove the COPYQ transaction flag and related sequencing.

* Fix flush sequencing for pmp->iroot.  We must flush iroot's chains with
  HAMMER2_XOP_FSSYNC last.  When iroot is dirty, the out-of-order flush
  of iroot that occurs before the final stage must be run without FSSYNC
  set; otherwise iroot's pmp->pfs_iroot_blocksets[] will not be consistent
  because the remaining inodes in the syncq haven't been flushed yet.
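
  This is the corresponding logic in the hammer2_vfs_sync_pmp() hunk
  below; only the final flush of iroot carries FSSYNC:

        if (ip == pmp->iroot) {
                hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
        } else {
                hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
                                              HAMMER2_XOP_FSSYNC);
        }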

* Fix a broken syncer speedup conditional.
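
  From the hammer2_pfs_moderate() hunk below: the old test compared the
  vnode syncer's count against hammer2's dirty-inode limit; the new test
  compares hammer2's own sideq (dirty inode) count:

        /* before: wrong counter compared against the inode limit */
        if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes)
                speedup_syncer(mp);

        /* after */
        if (mp && pmp->sideq_count > hammer2_limit_dirty_inodes)
                speedup_syncer(mp);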

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index f32b8e8..6a25307 100644
@@ -109,6 +109,7 @@ struct hammer2_io;
 struct hammer2_chain;
 struct hammer2_cluster;
 struct hammer2_inode;
+struct hammer2_depend;
 struct hammer2_dev;
 struct hammer2_pfs;
 struct hammer2_span;
@@ -701,8 +702,18 @@ hammer2_cluster_wrok(hammer2_cluster_t *cluster)
 }
 
 RB_HEAD(hammer2_inode_tree, hammer2_inode);    /* ip->rbnode */
-TAILQ_HEAD(syncq_head, hammer2_inode);         /* ip->entry */
-TAILQ_HEAD(sideq_head, hammer2_inode);         /* ip->entry */
+TAILQ_HEAD(inoq_head, hammer2_inode);          /* ip->entry */
+TAILQ_HEAD(depq_head, hammer2_depend);         /* depend->entry */
+
+struct hammer2_depend {
+       TAILQ_ENTRY(hammer2_depend) entry;
+       struct inoq_head        sideq;
+       long                    count;
+       int                     pass2;
+       int                     unused01;
+};
+
+typedef struct hammer2_depend hammer2_depend_t;
 
 /*
  * A hammer2 inode.
@@ -713,7 +724,9 @@ TAILQ_HEAD(sideq_head, hammer2_inode);              /* ip->entry */
  */
 struct hammer2_inode {
        RB_ENTRY(hammer2_inode) rbnode;         /* inumber lookup (HL) */
-       TAILQ_ENTRY(hammer2_inode) entry;       /* syncq, SYNCQ flag */
+       TAILQ_ENTRY(hammer2_inode) entry;       /* SYNCQ/SIDEQ */
+       hammer2_depend_t        *depend;        /* non-NULL if SIDEQ */
+       hammer2_depend_t        depend_static;  /* (in-place allocation) */
        hammer2_mtx_t           lock;           /* inode lock */
        hammer2_mtx_t           truncate_lock;  /* prevent truncates */
        struct hammer2_pfs      *pmp;           /* PFS mount */
@@ -802,7 +815,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_TRANS_ISFLUSH          0x80000000      /* flush code */
 #define HAMMER2_TRANS_BUFCACHE         0x40000000      /* bio strategy */
 #define HAMMER2_TRANS_SIDEQ            0x20000000      /* run sideq */
-#define HAMMER2_TRANS_COPYQ            0x10000000      /* sideq->syncq */
+#define HAMMER2_TRANS_UNUSED10         0x10000000
 #define HAMMER2_TRANS_WAITING          0x08000000      /* someone waiting */
 #define HAMMER2_TRANS_RESCAN           0x04000000      /* rescan sideq */
 #define HAMMER2_TRANS_MASK             0x00FFFFFF      /* count mask */
@@ -1240,9 +1253,9 @@ struct hammer2_pfs {
        uint32_t                inmem_dirty_chains;
        int                     count_lwinprog; /* logical write in prog */
        struct spinlock         list_spin;
-       struct syncq_head       syncq;          /* SYNCQ flagged inodes */
-       struct sideq_head       sideq;          /* SIDEQ flagged inodes */
-       long                    sideq_count;
+       struct inoq_head        syncq;          /* SYNCQ flagged inodes */
+       struct depq_head        depq;           /* SIDEQ flagged inodes */
+       long                    sideq_count;    /* total inodes on depq */
        hammer2_thread_t        sync_thrs[HAMMER2_MAXCLUSTER];
        uint32_t                cluster_flags;  /* cached cluster flags */
        int                     has_xop_threads;
@@ -1434,6 +1447,7 @@ extern int hammer2_bulkfree_tps;
 extern long hammer2_chain_allocs;
 extern long hammer2_chain_frees;
 extern long hammer2_limit_dirty_chains;
+extern long hammer2_limit_dirty_inodes;
 extern long hammer2_count_modified_chains;
 extern long hammer2_iod_invals;
 extern long hammer2_iod_file_read;
@@ -1466,6 +1480,7 @@ extern struct objcache *cache_xops;
 int hammer2_signal_check(time_t *timep);
 const char *hammer2_error_str(int error);
 
+void hammer2_inode_delayed_sideq(hammer2_inode_t *ip);
 void hammer2_inode_lock(hammer2_inode_t *ip, int how);
 void hammer2_inode_lock4(hammer2_inode_t *ip1, hammer2_inode_t *ip2,
                        hammer2_inode_t *ip3, hammer2_inode_t *ip4);
diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c
index 8628510..67d112d 100644
@@ -148,10 +148,8 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                        nflags = (oflags | flags) + 1;
                } else {
                        /*
-                        * Normal transaction.  We currently only interlock
-                        * against COPYQ.  We do not interlock against
-                        * BUFCACHE or ISFLUSH.  COPYQ is used to interlock
-                        * the transfer of SIDEQ into SYNCQ.
+                        * Normal transaction.  We do not interlock against
+                        * BUFCACHE or ISFLUSH.
                         *
                         * Note that vnode locks may be held going into
                         * this call.
@@ -160,12 +158,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                         *       such as read, stat, readdir, etc, do
                         *       not use transactions.
                         */
-                       if (oflags & HAMMER2_TRANS_COPYQ) {
-                               nflags = oflags | HAMMER2_TRANS_WAITING;
-                               dowait = 1;
-                       } else {
-                               nflags = (oflags | flags) + 1;
-                       }
+                       nflags = (oflags | flags) + 1;
                }
                if (dowait)
                        tsleep_interlock(&pmp->trans.sync_wait, 0);
@@ -182,6 +175,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                /* retry */
        }
 
+#if 0
        /*
         * When entering a FLUSH transaction with COPYQ set, wait for the
         * transaction count to drop to 1 (our flush transaction only)
@@ -205,6 +199,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                               "h2trans2", hz);
                }
        }
+#endif
 }
 
 /*
@@ -272,6 +267,7 @@ hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
         * inside other nominal modifying front-end transactions.
         */
        if ((flags & HAMMER2_TRANS_SIDEQ) &&
+           pmp->sideq_count > hammer2_limit_dirty_inodes / 2 &&
            pmp->sideq_count > (pmp->inum_count >> 3) &&
            pmp->mp) {
                speedup_syncer(pmp->mp);
@@ -523,14 +519,11 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
        /*
         * Downward search recursion
         *
-        * We must be careful on cold stops.  If CHAIN_UPDATE is set and
-        * we stop cold, the update can wind up never being applied.  This
-        * situation most typically occurs on inode boundaries due to the way
-        * hammer2_vfs_sync() breaks-up the flush.  As a safety, we
-        * flush-through such situations. XXX removed
+        * We must be careful on cold stops, which often occur on inode
+        * boundaries due to the way hammer2_vfs_sync() sequences the flush.
+        * Be sure to issue an appropriate chain_setflush().
         */
        if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
-           /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
            (flags & HAMMER2_FLUSH_ALL) == 0 &&
            (flags & HAMMER2_FLUSH_TOP) == 0 &&
            chain->pmp && chain->pmp->mp) {
@@ -562,7 +555,6 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                }
                goto done;
        } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
-                  /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
                   (flags & HAMMER2_FLUSH_INODE_STOP) &&
                   (flags & HAMMER2_FLUSH_ALL) == 0 &&
                   (flags & HAMMER2_FLUSH_TOP) == 0 &&
@@ -572,6 +564,10 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                 * to include any inode changes for inodes we encounter,
                 * with the exception of the inode that the flush began with.
                 * So: INODE, INODE_STOP, and TOP==0 basically.
+                *
+                * Dirty inodes are flushed based on the hammer2_inode
+                * in-memory structure; issuing a chain_setflush() here
+                * would only cause unnecessary traversals of the topology.
                 */
                goto done;
 #if 0
@@ -1326,11 +1322,6 @@ done:
  *                             Inodes belonging to the same flush are flushed
  *                             separately.
  *
- * HAMMER2_XOP_PARENTONFLUSH   After flushing if the starting chain indicates
- *                             a parent update is needed, we setflush the
- *                             parent to propogate the flush request across
- *                             the inode.
- *
  * chain->parent can be NULL, usually due to destroy races or detached inodes.
  *
  * Primarily called from vfs_sync().
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
index 0943786..dcf4cdc 100644
@@ -64,21 +64,131 @@ hammer2_knote(struct vnode *vp, int flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
 }
 
-static
+/*
+ * Caller holds pmp->list_spin and the inode should be locked.  Merge ip
+ * with the specified depend.
+ *
+ * If the ip is on SYNCQ it stays there and (void *)-1 is returned, indicating
+ * that successive calls must ensure the ip is on a pass2 depend (or they are
+ * all SYNCQ).  If the passed-in depend is not NULL and not (void *)-1 then
+ * we can set pass2 on it and return.
+ *
+ * If the ip is not on SYNCQ it is merged with the passed-in depend, creating
+ * a self-depend if necessary, and depend->pass2 is set according
+ * to the PASS2 flag.  SIDEQ is set.
+ */
+static __noinline
+hammer2_depend_t *
+hammer2_inode_setdepend_locked(hammer2_inode_t *ip, hammer2_depend_t *depend)
+{
+       hammer2_pfs_t *pmp = ip->pmp;
+       hammer2_depend_t *dtmp;
+       hammer2_inode_t *iptmp;
+
+       /*
+        * If ip is SYNCQ its entry is used for the syncq list and it will
+        * no longer be associated with a dependency.  Merging this status
+        * with a passed-in depend implies PASS2.
+        */
+       if (ip->flags & HAMMER2_INODE_SYNCQ) {
+               if (depend == (void *)-1 ||
+                   depend == NULL) {
+                       return ((void *)-1);
+               }
+               depend->pass2 = 1;
+               hammer2_trans_setflags(pmp, HAMMER2_TRANS_RESCAN);
+
+               return depend;
+       }
+
+       /*
+        * If ip is already SIDEQ, merge ip->depend into the passed-in depend.
+        * If it is not, associate the ip with the passed-in depend, creating
+        * a single-entry dependency using depend_static if necessary.
+        *
+        * NOTE: The use of ip->depend_static always requires that the
+        *       specific ip containing the structure is part of that
+        *       particular depend_static's dependency group.
+        */
+       if (ip->flags & HAMMER2_INODE_SIDEQ) {
+               /*
+                * Merge ip->depend with the passed-in depend.  If the
+                * passed-in depend is not a special case, all ips associated
+                * with ip->depend (including the original ip) must be moved
+                * to the passed-in depend.
+                */
+               if (depend == NULL) {
+                       depend = ip->depend;
+               } else if (depend == (void *)-1) {
+                       depend = ip->depend;
+                       depend->pass2 = 1;
+               } else if (depend != ip->depend) {
+#ifdef INVARIANTS
+                       int sanitychk = 0;
+#endif
+                       dtmp = ip->depend;
+                       while ((iptmp = TAILQ_FIRST(&dtmp->sideq)) != NULL) {
+#ifdef INVARIANTS
+                               if (iptmp == ip)
+                                       sanitychk = 1;
+#endif
+                               TAILQ_REMOVE(&dtmp->sideq, iptmp, entry);
+                               TAILQ_INSERT_TAIL(&depend->sideq, iptmp, entry);
+                               iptmp->depend = depend;
+                       }
+                       KKASSERT(sanitychk == 1);
+                       depend->count += dtmp->count;
+                       depend->pass2 |= dtmp->pass2;
+                       TAILQ_REMOVE(&pmp->depq, dtmp, entry);
+                       dtmp->count = 0;
+                       dtmp->pass2 = 0;
+               }
+       } else {
+               /*
+                * Add ip to the sideq, creating a self-dependency if
+                * necessary.
+                */
+               hammer2_inode_ref(ip);
+               atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
+               if (depend == NULL) {
+                       depend = &ip->depend_static;
+                       TAILQ_INSERT_TAIL(&pmp->depq, depend, entry);
+               } else if (depend == (void *)-1) {
+                       depend = &ip->depend_static;
+                       depend->pass2 = 1;
+                       TAILQ_INSERT_TAIL(&pmp->depq, depend, entry);
+               } /* else add ip to passed-in depend */
+               TAILQ_INSERT_TAIL(&depend->sideq, ip, entry);
+               ip->depend = depend;
+               ++depend->count;
+               ++pmp->sideq_count;
+       }
+
+       if (ip->flags & HAMMER2_INODE_SYNCQ_PASS2)
+               depend->pass2 = 1;
+       if (depend->pass2)
+               hammer2_trans_setflags(pmp, HAMMER2_TRANS_RESCAN);
+
+       return depend;
+}
+
+/*
+ * Put a solo inode on the SIDEQ (meaning that it is dirty).  This can also
+ * occur from inode_lock4() and inode_depend().
+ *
+ * Caller must pass-in a locked inode.
+ */
 void
 hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
 {
        hammer2_pfs_t *pmp = ip->pmp;
 
+       /*
+        * Optimize case to avoid pmp spinlock.
+        */
        if ((ip->flags & (HAMMER2_INODE_SYNCQ | HAMMER2_INODE_SIDEQ)) == 0) {
                hammer2_spin_ex(&pmp->list_spin);
-               if ((ip->flags & (HAMMER2_INODE_SYNCQ |
-                                 HAMMER2_INODE_SIDEQ)) == 0) {
-                       hammer2_inode_ref(ip);
-                       atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
-                       TAILQ_INSERT_TAIL(&pmp->sideq, ip, entry);
-                       ++pmp->sideq_count;
-               }
+               hammer2_inode_setdepend_locked(ip, NULL);
                hammer2_spin_unex(&pmp->list_spin);
        }
 }
@@ -180,7 +290,7 @@ hammer2_inode_lock(hammer2_inode_t *ip, int how)
  * ip1 and ip2 must not be NULL.  ip3 and ip4 may be NULL, but if ip3 is
  * NULL then ip4 must also be NULL.
  *
- * This function will also ensure that if any
+ * This creates a dependency between up to four inodes.
  */
 void
 hammer2_inode_lock4(hammer2_inode_t *ip1, hammer2_inode_t *ip2,
@@ -188,10 +298,11 @@ hammer2_inode_lock4(hammer2_inode_t *ip1, hammer2_inode_t *ip2,
 {
        hammer2_inode_t *ips[4];
        hammer2_inode_t *iptmp;
+       hammer2_inode_t *ipslp;
+       hammer2_depend_t *depend;
        hammer2_pfs_t *pmp;
        size_t count;
        size_t i;
-       int dosyncq;
 
        pmp = ip1->pmp;                 /* may be NULL */
        KKASSERT(pmp == ip2->pmp);
@@ -220,49 +331,29 @@ restart:
         * Lock the inodes in order
         */
        for (i = 0; i < count; ++i) {
-               iptmp = ips[i];
-               hammer2_mtx_ex(&iptmp->lock);
+               hammer2_mtx_ex(&ips[i]->lock);
        }
 
        /*
-        * If any of the inodes are part of a filesystem sync then we
-        * have to make sure they ALL are, because their modifications
-        * depend on each other (e.g. inode vs dirent).
+        * Associate dependencies, and record the first inode found on
+        * SYNCQ (the operation is allowed to proceed for inodes on PASS2)
+        * for our sleep operation.  This inode is theoretically the last
+        * one sync'd in the sequence.
         *
-        * All PASS2 flags must be set atomically with the spinlock held
-        * to ensure that they are flushed together.
+        * All inodes found on SYNCQ are moved to the head of the syncq
+        * to reduce stalls.
         */
        hammer2_spin_ex(&pmp->list_spin);
-       dosyncq = 0;
+       depend = NULL;
+       ipslp = NULL;
        for (i = 0; i < count; ++i) {
                iptmp = ips[i];
-               if (iptmp->flags & HAMMER2_INODE_SYNCQ)
-                       dosyncq |= 1;
-               if (iptmp->flags & HAMMER2_INODE_SYNCQ_PASS2)
-                       dosyncq |= 2;
-       }
-       if (dosyncq & 3) {
-               for (i = 0; i < count; ++i) {
-                       iptmp = ips[i];
-                       atomic_set_int(&iptmp->flags,
-                                      HAMMER2_INODE_SYNCQ_WAKEUP);
-                       if (iptmp->flags & HAMMER2_INODE_SYNCQ) {
-                               TAILQ_REMOVE(&pmp->syncq, iptmp, entry);
-                               TAILQ_INSERT_HEAD(&pmp->syncq, iptmp, entry);
-                       } else if (iptmp->flags & HAMMER2_INODE_SIDEQ) {
-                               atomic_set_int(&iptmp->flags,
-                                              HAMMER2_INODE_SYNCQ_PASS2);
-                               hammer2_trans_setflags(pmp,
-                                                      HAMMER2_TRANS_RESCAN);
-                       } else {
-                               atomic_set_int(&iptmp->flags,
-                                              HAMMER2_INODE_SIDEQ |
-                                              HAMMER2_INODE_SYNCQ_PASS2);
-                               TAILQ_INSERT_TAIL(&pmp->sideq, iptmp, entry);
-                               hammer2_inode_ref(iptmp);
-                               hammer2_trans_setflags(pmp,
-                                                      HAMMER2_TRANS_RESCAN);
-                       }
+               depend = hammer2_inode_setdepend_locked(iptmp, depend);
+               if (iptmp->flags & HAMMER2_INODE_SYNCQ) {
+                       TAILQ_REMOVE(&pmp->syncq, iptmp, entry);
+                       TAILQ_INSERT_HEAD(&pmp->syncq, iptmp, entry);
+                       if (ipslp == NULL)
+                               ipslp = iptmp;
                }
        }
        hammer2_spin_unex(&pmp->list_spin);
@@ -272,43 +363,12 @@ restart:
         * important that we allow the operation to proceed in the
         * PASS2 case, to avoid deadlocking against the vnode.
         */
-       if (dosyncq & 1) {
+       if (ipslp) {
                for (i = 0; i < count; ++i)
                        hammer2_mtx_unlock(&ips[i]->lock);
-               tsleep(&iptmp->flags, 0, "h2sync", 2);
-               goto restart;
-       }
-#if 0
-               if (pmp == NULL ||
-                   ((iptmp->flags & (HAMMER2_INODE_SYNCQ |
-                                     HAMMER2_INODE_SYNCQ_PASS2)) == 0 &&
-                    dosyncq == 0)) {
-                       continue;
-               }
-               dosyncq = 1;
-               tsleep_interlock(&iptmp->flags, 0);
-
-               /*
-                * We have to accept the inode if it's got more than one
-                * exclusive count because we can't safely unlock it.
-                */
-               if (hammer2_mtx_refs(&iptmp->lock) > 1) {
-                       kprintf("hammer2: exclcount > 1: %p %ld\n", iptmp, hammer2_mtx_refs(&iptmp->lock));
-                       continue;
-               }
-
-               /*
-                * Unlock everything (including the current index) and wait
-                * for our wakeup.
-                */
-               for (j = 0; j <= i; ++j)
-                       hammer2_mtx_unlock(&ips[j]->lock);
-               tsleep(&iptmp->flags, PINTERLOCKED, "h2sync", 0);
-               /*tsleep(&iptmp->flags, 0, "h2sync2", 1);*/
-
+               tsleep(&ipslp->flags, 0, "h2sync", 2);
                goto restart;
        }
-#endif
 }
 
 /*
@@ -330,12 +390,12 @@ hammer2_inode_unlock(hammer2_inode_t *ip)
 
 /*
  * If either ip1 or ip2 have been tapped by the syncer, make sure that both
- * are.  This ensure that dependencies (e.g. inode-vs-dirent) are synced
- * together.
+ * are.  This ensures that dependencies (e.g. dirent-v-inode) are synced
+ * together.  For dirent-v-inode depends, pass the dirent as ip1.
  *
- * We must also check SYNCQ_PASS2, which occurs when the syncer cannot
- * immediately lock the inode on SYNCQ and must temporarily move it to
- * SIDEQ to retry again in another pass (but part of the same flush).
+ * If neither ip1 nor ip2 has been tapped by the syncer, merge them into a
+ * single dependency.  Dependencies are entered into pmp->depq.  This
+ * effectively flags the inodes SIDEQ.
  *
  * Both ip1 and ip2 must be locked by the caller.  This also ensures
  * that we can't race the end of the syncer's queue run.
@@ -344,53 +404,15 @@ void
 hammer2_inode_depend(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
 {
        hammer2_pfs_t *pmp;
+       hammer2_depend_t *depend;
 
        pmp = ip1->pmp;
-
        hammer2_spin_ex(&pmp->list_spin);
-       if (((ip1->flags | ip2->flags) & (HAMMER2_INODE_SYNCQ |
-                                         HAMMER2_INODE_SYNCQ_PASS2)) == 0) {
-               hammer2_spin_unex(&pmp->list_spin);
-               return;
-       }
-       if ((ip1->flags & (HAMMER2_INODE_SYNCQ |
-                          HAMMER2_INODE_SYNCQ_PASS2)) &&
-           (ip2->flags & (HAMMER2_INODE_SYNCQ |
-                          HAMMER2_INODE_SYNCQ_PASS2))) {
-               hammer2_spin_unex(&pmp->list_spin);
-               return;
-       }
-       KKASSERT(pmp == ip2->pmp);
-       if ((ip1->flags & (HAMMER2_INODE_SYNCQ |
-                          HAMMER2_INODE_SYNCQ_PASS2)) == 0) {
-               if (ip1->flags & HAMMER2_INODE_SIDEQ) {
-                       atomic_set_int(&ip1->flags,
-                                      HAMMER2_INODE_SYNCQ_PASS2);
-               } else {
-                       atomic_set_int(&ip1->flags, HAMMER2_INODE_SIDEQ |
-                                                   HAMMER2_INODE_SYNCQ_PASS2);
-                       hammer2_inode_ref(ip1);
-                       TAILQ_INSERT_TAIL(&pmp->sideq, ip1, entry);
-               }
-               hammer2_trans_setflags(pmp, HAMMER2_TRANS_RESCAN);
-       }
-       if ((ip2->flags & (HAMMER2_INODE_SYNCQ |
-                          HAMMER2_INODE_SYNCQ_PASS2)) == 0) {
-               if (ip2->flags & HAMMER2_INODE_SIDEQ) {
-                       atomic_set_int(&ip2->flags,
-                                      HAMMER2_INODE_SYNCQ_PASS2);
-               } else {
-                       atomic_set_int(&ip2->flags, HAMMER2_INODE_SIDEQ |
-                                                   HAMMER2_INODE_SYNCQ_PASS2);
-                       hammer2_inode_ref(ip2);
-                       TAILQ_INSERT_TAIL(&pmp->sideq, ip2, entry);
-               }
-               hammer2_trans_setflags(pmp, HAMMER2_TRANS_RESCAN);
-       }
+       depend = hammer2_inode_setdepend_locked(ip1, NULL);
+       depend = hammer2_inode_setdepend_locked(ip2, depend);
        hammer2_spin_unex(&pmp->list_spin);
 }
 
-
 /*
  * Select a chain out of an inode's cluster and lock it.
  *
@@ -895,7 +917,9 @@ again:
         */
        nip->refs = 1;
        hammer2_mtx_init(&nip->lock, "h2inode");
+       hammer2_mtx_init(&nip->truncate_lock, "h2trunc");
        hammer2_mtx_ex(&nip->lock);
+       TAILQ_INIT(&nip->depend_static.sideq);
        /* combination of thread lock and chain lock == inode lock */
 
        /*
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index 449cc86..f4ab208 100644
@@ -413,10 +413,10 @@ hammer2_pfsalloc(hammer2_chain_t *chain,
                spin_init(&pmp->xop_spin, "h2xop");
                spin_init(&pmp->lru_spin, "h2lru");
                RB_INIT(&pmp->inum_tree);
-               TAILQ_INIT(&pmp->sideq);
                TAILQ_INIT(&pmp->syncq);
+               TAILQ_INIT(&pmp->depq);
                TAILQ_INIT(&pmp->lru_list);
-               spin_init(&pmp->list_spin, "hm2pfsalloc_list");
+               spin_init(&pmp->list_spin, "h2pfsalloc_list");
 
                /*
                 * Distribute backend operations to threads
@@ -870,8 +870,8 @@ again:
                        /*
                         * Free the pmp and restart the loop
                         */
-                       KKASSERT(TAILQ_EMPTY(&pmp->sideq));
                        KKASSERT(TAILQ_EMPTY(&pmp->syncq));
+                       KKASSERT(TAILQ_EMPTY(&pmp->depq));
                        hammer2_pfsfree(pmp);
                        goto again;
                }
@@ -1176,8 +1176,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
-               spin_init(&hmp->io_spin, "hm2mount_io");
-               spin_init(&hmp->list_spin, "hm2mount_list");
+               spin_init(&hmp->io_spin, "h2mount_io");
+               spin_init(&hmp->list_spin, "h2mount_list");
 
                lockinit(&hmp->vollk, "h2vol", 0, 0);
                lockinit(&hmp->bulklk, "h2bulk", 0, 0);
@@ -2435,7 +2435,8 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
        /*hammer2_xop_flush_t *xop;*/
        /*struct hammer2_sync_info info;*/
        hammer2_inode_t *ip;
-       hammer2_inode_t *ipdrop;
+       hammer2_depend_t *depend;
+       hammer2_depend_t *depend_next;
        struct vnode *vp;
        uint32_t pass2;
        int error;
@@ -2466,30 +2467,44 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
        kprintf("FILESYSTEM SYNC BOUNDARY\n");
 #endif
        dorestart = 0;
+
+       /*
+        * Move inodes from depq to syncq, releasing the related
+        * depend structures.
+        */
 restart:
 #ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart);
 #endif
-       hammer2_trans_setflags(pmp, HAMMER2_TRANS_COPYQ);
+       hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/);
        hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN);
 
+       /*
+        * Move inodes from depq to syncq.  When restarting, only depq
+        * entries marked pass2 are moved.
+        */
        hammer2_spin_ex(&pmp->list_spin);
-       ipdrop = TAILQ_FIRST(&pmp->sideq);
-       while ((ip = ipdrop) != NULL) {
-               ipdrop = TAILQ_NEXT(ip, entry);
-               KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
-               if (dorestart == 0 ||
-                   (ip->flags & HAMMER2_INODE_SYNCQ_PASS2)) {
-                       TAILQ_REMOVE(&pmp->sideq, ip, entry);
-                       TAILQ_INSERT_TAIL(&pmp->syncq, ip, entry);
+       depend_next = TAILQ_FIRST(&pmp->depq);
+
+       while ((depend = depend_next) != NULL) {
+               depend_next = TAILQ_NEXT(depend, entry);
+               if (dorestart && depend->pass2 == 0)
+                       continue;
+               TAILQ_FOREACH(ip, &depend->sideq, entry) {
+                       KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
-                       atomic_clear_int(&ip->flags,
-                                        HAMMER2_INODE_SIDEQ);
-                       --pmp->sideq_count;
+                       atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ);
+                       ip->depend = NULL;
                }
+               TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry);
+               pmp->sideq_count -= depend->count;
+               depend->count = 0;
+               depend->pass2 = 0;
+               TAILQ_REMOVE(&pmp->depq, depend, entry);
        }
+
        hammer2_spin_unex(&pmp->list_spin);
-       hammer2_trans_clearflags(pmp, HAMMER2_TRANS_COPYQ |
+       hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/
                                      HAMMER2_TRANS_WAITING);
        dorestart = 0;
 
@@ -2500,32 +2515,32 @@ restart:
         * Any conflicting frontend operations will block on the inode, but
         * may hold a vnode lock while doing so.
         */
-       ipdrop = NULL;
-
        hammer2_spin_ex(&pmp->list_spin);
        while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
                /*
                 * Remove the inode from the SYNCQ, transfer the syncq ref
                 * to us.  We must clear SYNCQ to allow any potential
-                * front-end deadlock to proceed.
+                * front-end deadlock to proceed.  We must set PASS2 so
+                * the dependency code knows what to do.
                 */
                pass2 = ip->flags;
                cpu_ccfence();
                if (atomic_cmpset_int(&ip->flags,
                              pass2,
                              (pass2 & ~(HAMMER2_INODE_SYNCQ |
-                                       HAMMER2_INODE_SYNCQ_WAKEUP)) |
-                                       HAMMER2_INODE_SYNCQ_PASS2) == 0) {
+                                        HAMMER2_INODE_SYNCQ_WAKEUP)) |
+                             HAMMER2_INODE_SYNCQ_PASS2) == 0) {
                        continue;
                }
-               if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
-                       wakeup(&ip->flags);
                TAILQ_REMOVE(&pmp->syncq, ip, entry);
                hammer2_spin_unex(&pmp->list_spin);
-               if (ipdrop) {
-                       hammer2_inode_drop(ipdrop);
-                       ipdrop = NULL;
-               }
+               if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
+                       wakeup(&ip->flags);
+
+               /*
+                * Relock the inode, and we inherit a ref from the above.
+                * We will check for a race after we acquire the vnode.
+                */
                hammer2_mtx_ex(&ip->lock);
 
                /*
@@ -2555,10 +2570,13 @@ restart:
                if (vp) {
                        if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
                                /*
-                                * Failed to get the vnode, we have to
-                                * make sure the inode is on SYNCQ or SIDEQ.
-                                * It is already flagged PASS2. Then unlock,
-                                * possibly sleep, and retry later.
+                                * Failed to get the vnode, requeue the inode
+                                * (PASS2 is already set so it will be found
+                                * again on the restart).
+                                *
+                                * Then unlock, possibly sleep, and retry
+                                * later.  We sleep if PASS2 was *previously*
+                                * set, before we set it again above.
                                 */
                                vp = NULL;
                                dorestart = 1;
@@ -2566,20 +2584,11 @@ restart:
                                kprintf("inum %ld (sync delayed by vnode)\n",
                                        (long)ip->meta.inum);
 #endif
-                               hammer2_spin_ex(&pmp->list_spin);
-                               if ((ip->flags & (HAMMER2_INODE_SYNCQ |
-                                                 HAMMER2_INODE_SIDEQ)) == 0) {
-                                       atomic_set_int(&ip->flags,
-                                                  HAMMER2_INODE_SIDEQ);
-                                       TAILQ_INSERT_TAIL(&pmp->sideq, ip,
-                                                         entry);
-                                       hammer2_spin_unex(&pmp->list_spin);
-                                       hammer2_mtx_unlock(&ip->lock);
-                               } else {
-                                       hammer2_spin_unex(&pmp->list_spin);
-                                       hammer2_mtx_unlock(&ip->lock);
-                                       hammer2_inode_drop(ip);
-                               }
+                               hammer2_inode_delayed_sideq(ip);
+
+                               hammer2_mtx_unlock(&ip->lock);
+                               hammer2_inode_drop(ip);
+
                                if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
                                        tsleep(&dorestart, 0, "h2syndel", 2);
                                }
@@ -2590,6 +2599,22 @@ restart:
                        vp = NULL;
                }
 
+               /*
+                * If the inode wound up on a SIDEQ again it will already be
+                * prepped for another PASS2.  In this situation if we flush
+                * it now we will just wind up flushing it again in the same
+                * syncer run, so we might as well not flush it now.
+                */
+               if (ip->flags & HAMMER2_INODE_SIDEQ) {
+                       hammer2_mtx_unlock(&ip->lock);
+                       hammer2_inode_drop(ip);
+                       if (vp)
+                               vput(vp);
+                       dorestart = 1;
+                       hammer2_spin_ex(&pmp->list_spin);
+                       continue;
+               }
+
                /*
                 * Ok we have the inode exclusively locked and if vp is
                 * not NULL that will also be exclusively locked.  Do the
@@ -2625,9 +2650,29 @@ restart:
 #ifdef HAMMER2_DEBUG_SYNC
                kprintf("inum %ld chain-sync\n", (long)ip->meta.inum);
 #endif
+
+               /*
+                * Because I kinda messed up the design and indexed the inodes
+                * under the root inode, alongside the directory entries,
+                * we can't flush the inode index under the iroot until the
+                * end.  If we do it now we might miss effects created by
+                * other inodes on the SYNCQ.
+                *
+                * Do a normal (non-FSSYNC) flush instead, which allows the
+                * vnode code to work the same.  We don't want to force iroot
+                * back onto the SIDEQ, and we also don't want the flush code
+                * to update pfs_iroot_blocksets until the final flush later.
+                *
+                * XXX at the moment this will likely result in a double-flush
+                * of the iroot chain.
+                */
                hammer2_inode_chain_sync(ip);
-               hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
-                                             HAMMER2_XOP_FSSYNC);
+               if (ip == pmp->iroot) {
+                       hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
+               } else {
+                       hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
+                                                     HAMMER2_XOP_FSSYNC);
+               }
                if (vp) {
                        lwkt_gettoken(&vp->v_token);
                        if ((ip->flags & (HAMMER2_INODE_MODIFIED |
@@ -2636,9 +2681,12 @@ restart:
                            RB_EMPTY(&vp->v_rbdirty_tree) &&
                            !bio_track_active(&vp->v_track_write)) {
                                vclrisdirty(vp);
+                       } else {
+                               hammer2_inode_delayed_sideq(ip);
                        }
                        lwkt_reltoken(&vp->v_token);
                        vput(vp);
+                       vp = NULL;      /* safety */
                }
                atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2);
                hammer2_inode_unlock(ip);       /* unlock+drop */
@@ -2651,21 +2699,17 @@ restart:
                hammer2_spin_ex(&pmp->list_spin);
        }
        hammer2_spin_unex(&pmp->list_spin);
-       if (ipdrop) {
-               hammer2_inode_drop(ipdrop);
-               ipdrop = NULL;
-       }
        if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) {
 #ifdef HAMMER2_DEBUG_SYNC
                kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n");
-               tsleep(&dorestart, 0, "h2STG1-R", hz*20);
+               /*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/
 #endif
                dorestart = 1;
                goto restart;
        }
 #ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n");
-       tsleep(&dorestart, 0, "h2STG2", hz*20);
+       /*tsleep(&dorestart, 0, "h2STG2", hz*20);*/
 #endif
 
        /*
@@ -3005,7 +3049,7 @@ hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
        hammer2_pfs_t *pmp = ip->pmp;
        struct mount *mp = pmp->mp;
 
-       if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) {
+       if (mp && pmp->sideq_count > hammer2_limit_dirty_inodes) {
                speedup_syncer(mp);
                /*vn_syncer_one(mp);*/
        }
diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c
index 083e586..dbf5b5c 100644
@@ -251,9 +251,9 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        /*
         * Flush dirty chains related to the inode.
         *
-        * NOTE! We are not in a flush transaction, so we should not use the
-        *       PARENTONFLUSH flag.  The inode remains on the sideq so the
-        *       filesystem syncer can synchronize it to the volume root.
+        * NOTE! We are not in a flush transaction.  The inode remains on
+        *       the sideq so the filesystem syncer can synchronize it to
+        *       the volume root.
         */
        error2 = hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
        if (error2)