hammer2 - refactor filesystem sync 7/N
author     Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 5 Dec 2018 18:19:31 +0000 (10:19 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 5 Dec 2018 19:10:04 +0000 (11:10 -0800)
* Increase the default caps for the dirty chain and dirty inode counts.
  The new SYNCQ semantics allow these counts to become arbitrarily
  large, but it is still a good idea not to let them get out of control
  (the clamping is sketched below).

  NOTE: One advantage of the higher caps is that they give the frontend
  more time to delete temporary files.
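
  For reference, a condensed sketch of how the caps bound the dynamic,
  maxvnodes-based calculation at init time (this mirrors the
  hammer2_vfs_init() change in the diff below):

      /* derive defaults from maxvnodes, then clamp to [floor, cap] */
      hammer2_limit_dirty_chains = maxvnodes / 10;
      if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
              hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
      if (hammer2_limit_dirty_chains < 1000)
              hammer2_limit_dirty_chains = 1000;

      hammer2_limit_dirty_inodes = maxvnodes / 25;
      if (hammer2_limit_dirty_inodes < 100)
              hammer2_limit_dirty_inodes = 100;
      if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
              hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;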

* Get rid of the old syncer speedup / write moderation mechanisms and
  replace them with a new VFS_MODIFYING() hook that allows the
  filesystem to implement moderation before any vnode locks are held
  (sketched below).

  Remove the hammer2_pfs_memory_wait() calls from the VOP bodies and
  issue the call via VFS_MODIFYING() instead.
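
  The filesystem half of the hook is a one-liner (taken from the
  hammer2_vfsops.c change below); the kernel half is a sketch, assuming
  the kernel dispatches through the new vfs_modifying vfsops entry via
  VFS_MODIFYING() before taking any vnode locks:

      /* hammer2's hook (see hammer2_vfsops.c below) */
      static void
      hammer2_vfs_modifying(struct mount *mp)
      {
              hammer2_pfs_memory_wait(MPTOPMP(mp));
      }

      /*
       * kernel side (assumed dispatch): called before any vnode locks
       * are taken for a modifying VOP, so the hook may block safely.
       */
      VFS_MODIFYING(mp);

  Because the hook runs before any vnode locks are acquired, the stall
  in hammer2_pfs_memory_wait() can block safely; the old call sites sat
  inside VOP bodies, where blocking risked holding locks the syncer
  needs.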

* Move the moderation wakeup for the inode count to the syncer, and
  change the parameter to use pmp->sideq_count (the wakeup hysteresis
  is condensed below).
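
  Condensed from the hammer2_pfs_memory_wakeup() change below, the
  wakeup hysteresis only releases stalled frontend threads once both
  counts have dropped to 2/3 of their respective limits:

      if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) &&
          (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <=
           hammer2_limit_dirty_chains * 2 / 3 &&
          pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) {
              atomic_clear_int(&pmp->inmem_dirty_chains,
                               HAMMER2_DIRTYCHAIN_WAITING);
              wakeup(&pmp->inmem_dirty_chains);
      }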

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_cluster.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index 6a25307..2b1c9e5 100644
@@ -175,8 +175,12 @@ typedef uint32_t hammer2_xid_t;
 #define HAMMER2_XID_MIN                        0x00000000U
 #define HAMMER2_XID_MAX                        0x7FFFFFFFU
 
-#define HAMMER2_LIMIT_DIRTY_CHAINS     (65536)
-#define HAMMER2_LIMIT_DIRTY_INODES     (16384)
+/*
+ * Cap the dynamic calculation for the maximum number of dirty
+ * chains and dirty inodes allowed.
+ */
+#define HAMMER2_LIMIT_DIRTY_CHAINS     (1024*1024)
+#define HAMMER2_LIMIT_DIRTY_INODES     (65536)
 
 /*
  * The chain structure tracks a portion of the media topology from the
@@ -493,7 +497,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 
 #define HAMMER2_RESOLVE_SHARED         0x10    /* request shared lock */
 #define HAMMER2_RESOLVE_LOCKAGAIN      0x20    /* another shared lock */
-#define HAMMER2_RESOLVE_RDONLY         0x40    /* higher level op flag */
+#define HAMMER2_RESOLVE_UNUSED40       0x40
 #define HAMMER2_RESOLVE_NONBLOCK       0x80    /* non-blocking */
 
 /*
@@ -1228,7 +1232,7 @@ struct hammer2_pfs {
        struct lock             lock;           /* PFS lock for certain ops */
        struct lock             lock_nlink;     /* rename and nlink lock */
        struct netexport        export;         /* nfs export */
-       int                     speedup_ticks;  /* speedup_syncer() helper */
+       int                     unused00;
        int                     ronly;          /* read-only mount */
        int                     hflags;         /* pfs-specific mount flags */
        struct malloc_type      *minode;
@@ -1626,7 +1630,7 @@ int hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata);
 int hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
                                size_t name_len);
 
-void hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate);
+void hammer2_pfs_memory_wait(hammer2_pfs_t *pmp);
 void hammer2_pfs_memory_inc(hammer2_pfs_t *pmp);
 void hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp);
 
diff --git a/sys/vfs/hammer2/hammer2_cluster.c b/sys/vfs/hammer2/hammer2_cluster.c
index 3ba39c5..5a323a3 100644
@@ -275,15 +275,7 @@ hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
  * that create/modify/delete elements use it.
  *
  * The chains making up the cluster may be narrowed down based on quorum
- * acceptability, and if RESOLVE_RDONLY is specified the chains can be
- * narrowed down to a single chain as long as the entire subtopology is known
- * to be intact.  So, for example, we can narrow a read-only op to a single
- * fast SLAVE but if we focus a CACHE chain we must still retain at least
- * a SLAVE to ensure that the subtopology can be accessed.
- *
- * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
- * to be maintained once the topology is validated as-of the top level of
- * the operation.
+ * acceptability.
  *
  * If a failure occurs the operation must be aborted by higher-level code and
  * retried. XXX
@@ -517,16 +509,6 @@ hammer2_cluster_resolve(hammer2_cluster_t *cluster)
                                              chain->bref.modify_tid))) {
                                ++nslaves;
                                nflags |= HAMMER2_CLUSTER_RDHARD;
-#if 0
-                               /* XXX optimize for RESOLVE_RDONLY */
-                               if (cluster->focus == NULL) {
-                                       focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
-                                       cluster->focus_index = i;
-                                       cluster->focus = chain; /* NULL ok */
-                                       cluster->error = chain ? chain->error :
-                                                                0;
-                               }
-#endif
                        } else if (chain == NULL || chain->error == 0) {
                                nflags |= HAMMER2_CLUSTER_UNSOFT;
                        }
diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c
index 67d112d..aeacd7d 100644
@@ -260,6 +260,7 @@ hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
        uint32_t oflags;
        uint32_t nflags;
 
+#if 0
        /*
         * Modifying ops on the front-end can cause dirty inodes to
         * build up in the sideq.  We don't flush these on inactive/reclaim
@@ -272,6 +273,7 @@ hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
            pmp->mp) {
                speedup_syncer(pmp->mp);
        }
+#endif
 
        /*
         * Clean-up the transaction.  Wakeup any waiters when finishing
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
index dcf4cdc..6686b20 100644
@@ -213,9 +213,12 @@ hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
  *       vnode reclamation code to avoid unnecessary I/O (particularly when
  *       disposing of hundreds of thousands of cached vnodes).
  *
- * When an exclusive lock is obtained on an inode that is on the SYNCQ,
- * HAMMER2 will automatically move the inode to the front of the queue before
- * blocking to avoid long stalls against filesystem sync operations.
+ * This function, along with lock4, has SYNCQ semantics.  If the inode being
+ * locked is on the SYNCQ, that is, it has been staged by the syncer, we must
+ * block until the operation is complete (even if we can lock the inode).  In
+ * order to reduce the stall time, we re-order the inode to the front of the
+ * pmp->syncq prior to blocking.  This reordering VERY significantly improves
+ * performance.
  *
  * The inode locking function locks the inode itself, resolves any stale
  * chains in the inode's cluster, and allocates a fresh copy of the
@@ -225,16 +228,6 @@ hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
  *
  * NOTE: We don't combine the inode/chain lock because putting away an
  *       inode would otherwise confuse multiple lock holders of the inode.
- *
- * NOTE: In-memory inodes always point to hardlink targets (the actual file),
- *      and never point to a hardlink pointer.
- *
- * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
- *      will feel free to reduce the chain set in the cluster as an
- *      optimization.  It will still be validated against the quorum if
- *      appropriate, but the optimization might be able to reduce data
- *      accesses to one node.  This flag is automatically set if the inode
- *      is locked with HAMMER2_RESOLVE_SHARED.
  */
 void
 hammer2_inode_lock(hammer2_inode_t *ip, int how)
@@ -248,7 +241,6 @@ hammer2_inode_lock(hammer2_inode_t *ip, int how)
         * Inode structure mutex - Shared lock
         */
        if (how & HAMMER2_RESOLVE_SHARED) {
-               /*how |= HAMMER2_RESOLVE_RDONLY; not used */
                hammer2_mtx_sh(&ip->lock);
                return;
        }
@@ -885,8 +877,6 @@ again:
        nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
        spin_init(&nip->cluster_spin, "h2clspin");
        atomic_add_long(&pmp->inmem_inodes, 1);
-       hammer2_pfs_memory_inc(pmp);
-       hammer2_pfs_memory_wakeup(pmp);
        if (pmp->spmp_hmp)
                nip->flags = HAMMER2_INODE_SROOT;
 
diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c
index f7213c3..77ac83b 100644
@@ -1169,7 +1169,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                        error = EINVAL;
                        break;
                }
-               hammer2_pfs_memory_wait(ip, 0);
+               hammer2_pfs_memory_wait(pmp);
                hammer2_trans_init(pmp, 0);
                hammer2_inode_lock(ip, 0);
 
@@ -1199,7 +1199,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                        error = EINVAL;
                        break;
                }
-               hammer2_pfs_memory_wait(ip, 0);
+               hammer2_pfs_memory_wait(pmp);
                hammer2_trans_init(pmp, 0);
 
                xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING);
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index f4ab208..95ea1fb 100644
@@ -217,6 +217,7 @@ static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);
+static void hammer2_vfs_modifying(struct mount *mp);
 
 static int hammer2_install_volume_header(hammer2_dev_t *hmp);
 #if 0
@@ -245,7 +246,8 @@ static struct vfsops hammer2_vfsops = {
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
-       .vfs_checkexp   = hammer2_vfs_checkexp
+       .vfs_checkexp   = hammer2_vfs_checkexp,
+       .vfs_modifying  = hammer2_vfs_modifying
 };
 
 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
@@ -333,8 +335,10 @@ hammer2_vfs_init(struct vfsconf *conf)
        hammer2_limit_dirty_chains = maxvnodes / 10;
        if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
                hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
+       if (hammer2_limit_dirty_chains < 1000)
+               hammer2_limit_dirty_chains = 1000;
 
-       hammer2_limit_dirty_inodes = maxvnodes / 100;
+       hammer2_limit_dirty_inodes = maxvnodes / 25;
        if (hammer2_limit_dirty_inodes < 100)
                hammer2_limit_dirty_inodes = 100;
        if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
@@ -2508,6 +2512,13 @@ restart:
                                      HAMMER2_TRANS_WAITING);
        dorestart = 0;
 
+       /*
+        * sideq_count may have dropped enough to allow us to unstall
+        * the frontend.
+        */
+       hammer2_pfs_memory_inc(pmp);
+       hammer2_pfs_memory_wakeup(pmp);
+
        /*
         * Now run through all inodes on syncq.
         *
@@ -3028,109 +3039,72 @@ hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
 }
 
 /*
- * Attempt to proactively fsync dirty vnodes if we have too many.  This
- * solves an issue where the kernel syncer thread can get seriously behind
- * when multiple user processes/threads are furiously modifying inodes.
- * This situation can occur on slow storage and is only limited by
- * kern.maxvnodes without the moderation code below.  It is made worse
- * when the device buffers underlying the modified inodes (which are clean)
- * get evicted before the flush can occur, forcing a re-read.
+ * It is possible for an excessive number of dirty chains or dirty inodes
+ * to build up.  When this occurs we start an asynchronous filesystem sync.
+ * If the level continues to build up, we stall, waiting for it to drop,
+ * with some hysteresis.
  *
- * We do not want sysads to feel that they have to torpedo kern.maxvnodes
- * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes
- * (per-mount-basis) and default it to something reasonable.
+ * We limit the stall to two seconds per call.
  *
- * XXX we cannot safely block here because we might be holding locks that
- * the syncer needs.
+ * This relies on the kernel calling hammer2_vfs_modifying() prior to
+ * obtaining any vnode locks and before making a modifying VOP call.
  */
 static void
-hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
+hammer2_vfs_modifying(struct mount *mp)
 {
-       hammer2_pfs_t *pmp = ip->pmp;
-       struct mount *mp = pmp->mp;
-
-       if (mp && pmp->sideq_count > hammer2_limit_dirty_inodes) {
-               speedup_syncer(mp);
-               /*vn_syncer_one(mp);*/
-       }
+       hammer2_pfs_memory_wait(MPTOPMP(mp));
 }
 
 /*
- * Manage excessive memory resource use for chain and related
- * structures.
- *
- * Called without any inode locks or transaction locks.  VNodes
- * might be locked by the kernel in the call stack.
+ * Initiate an asynchronous filesystem sync and, with hysteresis,
+ * stall if the internal data structure count becomes too bloated.
  */
 void
-hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate)
+hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
 {
-       hammer2_pfs_t *pmp = ip->pmp;
        uint32_t waiting;
-       uint32_t count;
-       uint32_t limit;
-#if 0
-       static int zzticks;
-#endif
+       int loops;
 
-       return; /* XXX */
-
-       /*
-        * Moderate the number of dirty inodes
-        */
-       hammer2_pfs_moderate(ip, always_moderate);
+       if (pmp == NULL || pmp->mp == NULL)
+               return;
 
-       /*
-        * Atomic check condition and wait.  Also do an early speedup of
-        * the syncer to try to avoid hitting the wait.
-        */
-       for (;;) {
-               waiting = pmp->inmem_dirty_chains;
+       for (loops = 0; loops < 2; ++loops) {
+               waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK;
                cpu_ccfence();
-               count = waiting & HAMMER2_DIRTYCHAIN_MASK;
-
-               limit = pmp->mp->mnt_nvnodelistsize / 10;
-               if (limit < hammer2_limit_dirty_chains)
-                       limit = hammer2_limit_dirty_chains;
-               if (limit < 1000)
-                       limit = 1000;
-
-#if 0
-               if ((int)(ticks - zzticks) > hz) {
-                       zzticks = ticks;
-                       kprintf("count %ld %ld\n", count, limit);
-               }
-#endif
 
                /*
-                * Block if there are too many dirty chains present, wait
-                * for the flush to clean some out.
+                * Start the syncer running at 1/2 the limit
                 */
-               if (count > limit) {
-                       hammer2_pfs_moderate(ip, always_moderate);
-                       tsleep_interlock(&pmp->inmem_dirty_chains, 0);
-                       if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
-                                              waiting,
-                                      waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
-                               if (ticks != pmp->speedup_ticks) {
-                                       pmp->speedup_ticks = ticks;
-                                       speedup_syncer(pmp->mp);
-                               }
-                               tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
-                                      "chnmem", hz);
-                       }
-                       continue;       /* loop on success or fail */
+               if (waiting > hammer2_limit_dirty_chains / 2 ||
+                   pmp->sideq_count > hammer2_limit_dirty_inodes / 2) {
+                       trigger_syncer(pmp->mp);
                }
 
                /*
-                * Try to start an early flush before we are forced to block.
+                * Stall at the limit waiting for the counts to drop.
+                * This code will typically be woken up once the count
+                * drops to 2/3 of the limit, or in one second.
                 */
-               if (count > limit * 5 / 10 &&
-                   ticks != pmp->speedup_ticks) {
-                       pmp->speedup_ticks = ticks;
-                       speedup_syncer(pmp->mp);
+               if (waiting < hammer2_limit_dirty_chains &&
+                   pmp->sideq_count < hammer2_limit_dirty_inodes) {
+                       break;
                }
-               break;
+               tsleep_interlock(&pmp->inmem_dirty_chains, 0);
+               atomic_set_int(&pmp->inmem_dirty_chains,
+                              HAMMER2_DIRTYCHAIN_WAITING);
+               if (waiting < hammer2_limit_dirty_chains &&
+                   pmp->sideq_count < hammer2_limit_dirty_inodes) {
+                       break;
+               }
+               trigger_syncer(pmp->mp);
+               tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, "h2memw", hz);
+#if 0
+               limit = pmp->mp->mnt_nvnodelistsize / 10;
+               if (limit < hammer2_limit_dirty_chains)
+                       limit = hammer2_limit_dirty_chains;
+               if (limit < 1000)
+                       limit = 1000;
+#endif
        }
 }
 
@@ -3150,7 +3124,11 @@ hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
        if (pmp) {
                waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, -1);
                /* don't need --waiting to test flag */
-               if (waiting & HAMMER2_DIRTYCHAIN_WAITING) {
+
+               if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) &&
+                   (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <=
+                   hammer2_limit_dirty_chains * 2 / 3 &&
+                   pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) {
                        atomic_clear_int(&pmp->inmem_dirty_chains,
                                         HAMMER2_DIRTYCHAIN_WAITING);
                        wakeup(&pmp->inmem_dirty_chains);
diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c
index dbf5b5c..08ddaec 100644
@@ -380,7 +380,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        if (hammer2_vfs_enospace(ip, 0, ap->a_cred) > 1)
                return (ENOSPC);
 
-       hammer2_pfs_memory_wait(ip, 0);
+       /*hammer2_pfs_memory_wait(ip->pmp);*/
        hammer2_trans_init(ip->pmp, 0);
        hammer2_inode_lock(ip, 0);
        error = 0;
@@ -823,7 +823,7 @@ hammer2_vop_write(struct vop_write_args *ap)
        if (uio->uio_segflg == UIO_NOCOPY) {
                hammer2_trans_init(ip->pmp, HAMMER2_TRANS_BUFCACHE);
        } else {
-               hammer2_pfs_memory_wait(ip, 0);
+               /*hammer2_pfs_memory_wait(ip->pmp);*/
                hammer2_trans_init(ip->pmp, 0);
        }
        error = hammer2_write_file(ip, uio, ioflag, seqcount);
@@ -1385,7 +1385,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
 
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
 
        inum = hammer2_trans_newinum(dip->pmp);
@@ -1521,7 +1521,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        ip = VTOI(ap->a_vp);
        KASSERT(ip->pmp, ("ip->pmp is NULL %p %p", ip, ip->pmp));
-       hammer2_pfs_memory_wait(ip, 0);
+       /*hammer2_pfs_memory_wait(ip->pmp);*/
        hammer2_trans_init(ip->pmp, 0);
 
        /*
@@ -1597,7 +1597,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
 
        inum = hammer2_trans_newinum(dip->pmp);
@@ -1680,7 +1680,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
 
        /*
@@ -1758,7 +1758,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
 
        ap->a_vap->va_type = VLNK;      /* enforce type */
@@ -1872,7 +1872,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
 
        ncp = ap->a_nch->ncp;
 
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
        hammer2_inode_lock(dip, 0);
 
@@ -1962,7 +1962,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
                return (ENOSPC);
 #endif
 
-       hammer2_pfs_memory_wait(dip, 1);
+       /*hammer2_pfs_memory_wait(dip->pmp);*/
        hammer2_trans_init(dip->pmp, 0);
        hammer2_inode_lock(dip, 0);
 
@@ -2060,7 +2060,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
        tname = tncp->nc_name;
        tname_len = tncp->nc_nlen;
 
-       hammer2_pfs_memory_wait(tdip, 0);
+       /*hammer2_pfs_memory_wait(tdip->pmp);*/
        hammer2_trans_init(tdip->pmp, 0);
 
        update_tdip = 0;