hammer2 - Add vfs.hammer2.limit_dirty_inodes
author: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 18 Apr 2018 06:10:02 +0000 (23:10 -0700)
committer: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 18 Apr 2018 06:14:52 +0000 (23:14 -0700)
* Add vfs.hammer2.limit_dirty_inodes which causes hammer2 to
  immediately begin fsync()ing dirty inodes when the count exceeds
  the limit.  Set a reasonable limit.

* Fixes issues on slower storage when the syncer is unable to
  keep up with the userland frontend, causing the number of
  dirty inodes to increase almost unbounded (limited only by
  kern.maxvnodes).

  Allowing a large number of dirty inodes to accumulate can
  result in a situation where the clean device buffer
  underpinning the dirty inode is discarded by the kernel before
  the filesystem is able to flush it, forcing additional disk reads
  and slowing things down even more.

* Improve the operation of speedup_syncer() by limiting the
  rate at which we call the function.  It is now called a maximum
  of approximately once per tick (each call speeding up a sync
  by one second).

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index 1164397..3d070dc 100644 (file)
@@ -175,6 +175,7 @@ typedef uint32_t hammer2_xid_t;
 #define HAMMER2_XID_MAX                        0x7FFFFFFFU
 
 #define HAMMER2_LIMIT_DIRTY_CHAINS     (65536)
+#define HAMMER2_LIMIT_DIRTY_INODES     (16384)
 
 /*
  * The chain structure tracks a portion of the media topology from the
@@ -1170,6 +1171,7 @@ struct hammer2_pfs {
        struct lock             lock;           /* PFS lock for certain ops */
        struct lock             lock_nlink;     /* rename and nlink lock */
        struct netexport        export;         /* nfs export */
+       int                     speedup_ticks;  /* speedup_syncer() helper */
        int                     ronly;          /* read-only mount */
        int                     hflags;         /* pfs-specific mount flags */
        struct malloc_type      *minode;
@@ -1557,7 +1559,7 @@ int hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata);
 int hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
                                size_t name_len);
 
-void hammer2_pfs_memory_wait(hammer2_pfs_t *pmp);
+void hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate);
 void hammer2_pfs_memory_inc(hammer2_pfs_t *pmp);
 void hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp);
 
index 638baeb..07c4caa 100644 (file)
@@ -1165,7 +1165,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                        error = EINVAL;
                        break;
                }
-               hammer2_pfs_memory_wait(pmp);
+               hammer2_pfs_memory_wait(ip, 0);
                hammer2_trans_init(pmp, 0);
                hammer2_inode_lock(ip, 0);
 
@@ -1195,7 +1195,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                        error = EINVAL;
                        break;
                }
-               hammer2_pfs_memory_wait(pmp);
+               hammer2_pfs_memory_wait(ip, 0);
                hammer2_trans_init(pmp, 0);
 
                xop = hammer2_xop_alloc(pmp->iroot, 0);
index a20fb96..afd1246 100644 (file)
@@ -93,6 +93,7 @@ int hammer2_bulkfree_tps = 5000;
 long hammer2_chain_allocs;
 long hammer2_chain_frees;
 long hammer2_limit_dirty_chains;
+long hammer2_limit_dirty_inodes;
 long hammer2_count_modified_chains;
 long hammer2_iod_invals;
 long hammer2_iod_file_read;
@@ -143,6 +144,8 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW,
           &hammer2_chain_frees, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
+          &hammer2_limit_dirty_inodes, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
           &hammer2_count_modified_chains, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
@@ -320,6 +323,12 @@ hammer2_vfs_init(struct vfsconf *conf)
        if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
                hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
 
+       hammer2_limit_dirty_inodes = maxvnodes / 100;
+       if (hammer2_limit_dirty_inodes < 100)
+               hammer2_limit_dirty_inodes = 100;
+       if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
+               hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;
+
        return (error);
 }
 
@@ -2797,13 +2806,41 @@ hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
        }
 }
 
+/*
+ * Attempt to proactively fsync dirty vnodes if we have too many.  This
+ * solves an issue where the kernel syncer thread can get seriously behind
+ * when multiple user processes/threads are furiously modifying inodes.
+ * This situation can occur on slow storage and is only limited by
+ * kern.maxvnodes without the moderation code below.  It is made worse
+ * when the device buffers underlying the modified inodes (which are clean)
+ * get evicted before the flush can occur, forcing a re-read.
+ *
+ * We do not want sysads to feel that they have to torpedo kern.maxvnodes
+ * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes
+ * (per-mount-basis) and default it to something reasonable.
+ */
+static void
+hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
+{
+       hammer2_pfs_t *pmp = ip->pmp;
+       struct mount *mp = pmp->mp;
+
+       if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) {
+               vn_syncer_one(mp);
+       }
+}
+
 /*
  * Manage excessive memory resource use for chain and related
  * structures.
+ *
+ * Called without any inode locks or transaction locks.  VNodes
+ * might be locked by the kernel in the call stack.
  */
 void
-hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
+hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate)
 {
+       hammer2_pfs_t *pmp = ip->pmp;
        uint32_t waiting;
        uint32_t count;
        uint32_t limit;
@@ -2811,6 +2848,11 @@ hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
        static int zzticks;
 #endif
 
+       /*
+        * Moderate the number of dirty inodes
+        */
+       hammer2_pfs_moderate(ip, always_moderate);
+
        /*
         * Atomic check condition and wait.  Also do an early speedup of
         * the syncer to try to avoid hitting the wait.
@@ -2838,11 +2880,15 @@ hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
                 * for the flush to clean some out.
                 */
                if (count > limit) {
+                       hammer2_pfs_moderate(ip, always_moderate);
                        tsleep_interlock(&pmp->inmem_dirty_chains, 0);
                        if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
                                               waiting,
                                       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
-                               speedup_syncer(pmp->mp);
+                               if (ticks != pmp->speedup_ticks) {
+                                       pmp->speedup_ticks = ticks;
+                                       speedup_syncer(pmp->mp);
+                               }
                                tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
                                       "chnmem", hz);
                        }
@@ -2852,8 +2898,11 @@ hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
                /*
                 * Try to start an early flush before we are forced to block.
                 */
-               if (count > limit * 5 / 10)
+               if (count > limit * 5 / 10 &&
+                   ticks != pmp->speedup_ticks) {
+                       pmp->speedup_ticks = ticks;
                        speedup_syncer(pmp->mp);
+               }
                break;
        }
 }
index bb7c302..cd720d3 100644 (file)
@@ -230,7 +230,15 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        hammer2_trans_init(ip->pmp, 0);
 
        /*
-        * Clean out buffer cache, wait for I/O's to complete.
+        * Flush dirty buffers in the file's logical buffer cache.
+        * It is best to wait for the strategy code to commit the
+        * buffers to the device's backing buffer cache before
+        * then trying to flush the inode.
+        *
+        * This should be quick, but certain inode modifications cached
+        * entirely in the hammer2_inode structure may not trigger a
+        * buffer read until the flush so the fsync can wind up also
+        * doing scattered reads.
         */
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
        bio_track_wait(&vp->v_track_write, 0, 0);
@@ -252,6 +260,18 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        error2 = hammer2_inode_chain_flush(ip);
        if (error2)
                error1 = error2;
+
+       /*
+        * We may be able to clear the vnode dirty flag.  The
+        * hammer2_pfs_moderate() code depends on this usually working.
+        */
+       if ((ip->flags & (HAMMER2_INODE_MODIFIED |
+                         HAMMER2_INODE_RESIZED |
+                         HAMMER2_INODE_DIRTYDATA)) == 0 &&
+           RB_EMPTY(&vp->v_rbdirty_tree) &&
+           !bio_track_active(&vp->v_track_write)) {
+               vclrisdirty(vp);
+       }
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp);
 
@@ -362,7 +382,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        if (hammer2_vfs_enospace(ip, 0, ap->a_cred) > 1)
                return (ENOSPC);
 
-       hammer2_pfs_memory_wait(ip->pmp);
+       hammer2_pfs_memory_wait(ip, 0);
        hammer2_trans_init(ip->pmp, 0);
        hammer2_inode_lock(ip, 0);
        error = 0;
@@ -801,10 +821,12 @@ hammer2_vop_write(struct vop_write_args *ap)
         * transaction related to the buffer cache or other direct
         * VM page manipulation.
         */
-       if (uio->uio_segflg == UIO_NOCOPY)
+       if (uio->uio_segflg == UIO_NOCOPY) {
                hammer2_trans_init(ip->pmp, HAMMER2_TRANS_BUFCACHE);
-       else
+       } else {
+               hammer2_pfs_memory_wait(ip, 0);
                hammer2_trans_init(ip->pmp, 0);
+       }
        error = hammer2_write_file(ip, uio, ioflag, seqcount);
        hammer2_trans_done(ip->pmp);
 
@@ -1360,7 +1382,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
 
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
 
        inum = hammer2_trans_newinum(dip->pmp);
@@ -1489,7 +1511,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        ip = VTOI(ap->a_vp);
        KASSERT(ip->pmp, ("ip->pmp is NULL %p %p", ip, ip->pmp));
-       hammer2_pfs_memory_wait(ip->pmp);
+       hammer2_pfs_memory_wait(ip, 0);
        hammer2_trans_init(ip->pmp, 0);
 
        /*
@@ -1566,7 +1588,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
 
        inum = hammer2_trans_newinum(dip->pmp);
@@ -1646,7 +1668,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
 
        /*
@@ -1720,7 +1742,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
 
        ap->a_vap->va_type = VLNK;      /* enforce type */
@@ -1830,7 +1852,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
 
        ncp = ap->a_nch->ncp;
 
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
        hammer2_inode_lock(dip, 0);
 
@@ -1920,7 +1942,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
                return (ENOSPC);
 #endif
 
-       hammer2_pfs_memory_wait(dip->pmp);
+       hammer2_pfs_memory_wait(dip, 1);
        hammer2_trans_init(dip->pmp, 0);
        hammer2_inode_lock(dip, 0);
 
@@ -2018,7 +2040,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
        tname = tncp->nc_name;
        tname_len = tncp->nc_nlen;
 
-       hammer2_pfs_memory_wait(tdip->pmp);
+       hammer2_pfs_memory_wait(tdip, 0);
        hammer2_trans_init(tdip->pmp, 0);
 
        update_tdip = 0;