hammer2 - Performance work
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 12 Apr 2019 06:16:32 +0000 (23:16 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 12 Apr 2019 06:16:32 +0000 (23:16 -0700)
* Implement immediate (async) flush follow-through when OS requests
  it, with write-clustering support at the device level if enabled.
  Write-clustering support at the device level is disabled by default
  because H2 already uses a fairly large and efficient 64K block size.

* Change the XOP dispatch for strategy functions.  Further partition
  strategy functions by read or write to prevent reads from getting
  stuck behind long write chains.

  Also do not spread I/O related to a single inode across too many
  XOP threads, as doing so creates a massive amount of unnecessary
  lock contention.

* Significantly improves concurrent filesystem ops and I/O and
  significantly improves sequential filesystem write I/O.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_admin.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_vfsops.c

index f741212..0436d39 100644 (file)
@@ -323,6 +323,7 @@ typedef struct hammer2_io hammer2_io_t;
 #define HAMMER2_DIO_GOOD       0x4000000000000000LLU   /* dio->bp is stable */
 #define HAMMER2_DIO_WAITING    0x2000000000000000LLU   /* wait on INPROG */
 #define HAMMER2_DIO_DIRTY      0x1000000000000000LLU   /* flush last drop */
+#define HAMMER2_DIO_FLUSH      0x0800000000000000LLU   /* immediate flush */
 
 #define HAMMER2_DIO_MASK       0x00FFFFFFFFFFFFFFLLU
 
@@ -1470,6 +1471,7 @@ extern int hammer2_flush_pipe;
 extern int hammer2_dio_count;
 extern int hammer2_dio_limit;
 extern int hammer2_bulkfree_tps;
+extern int hammer2_worker_rmask;
 extern long hammer2_chain_allocs;
 extern long hammer2_chain_frees;
 extern long hammer2_limit_dirty_chains;
index 24063db..62594a1 100644 (file)
@@ -484,20 +484,49 @@ hammer2_xop_start_except(hammer2_xop_head_t *xop, hammer2_xop_desc_t *desc,
         * get behind and the frontend is allowed to complete the moment a
         * quorum of targets succeed.
         *
-        * Strategy operations must be segregated from non-strategy operations
-        * to avoid a deadlock.  For example, if a vfsync and a bread/bwrite
-        * were queued to the same worker thread, the locked buffer in the
-        * strategy operation can deadlock the vfsync's buffer list scan.
+        * Strategy operations:
+        *
+        *      (1) Must be segregated from non-strategy operations to
+        *          avoid a deadlock.  A vfsync and a bread/bwrite can
+        *          deadlock the vfsync's buffer list scan.
+        *
+        *      (2) Reads are separated from writes to avoid write stalls
+        *          from excessively interfering with reads.  Reads are allowed
+        *          to wander across multiple worker threads for potential
+        *          single-file concurrency improvements.
+        *
+        *      (3) Writes are serialized to a single worker thread (for any
+        *          given inode) in order to try to improve block allocation
+        *          sequentiality and to reduce lock contention.
         *
         * TODO - RENAME fails here because it is potentially modifying
-        *        three different inodes.
+        *        three different inodes, but we triple-lock the inodes
+        *        involved so it shouldn't create a sequencing schism.
         */
        if (xop->flags & HAMMER2_XOP_STRATEGY) {
                hammer2_xop_strategy_t *xopst;
+               hammer2_off_t off;
+               int cdr;
 
                xopst = &((hammer2_xop_t *)xop)->xop_strategy;
-               ng = (int)(hammer2_icrc32(&xop->ip1, sizeof(xop->ip1)) ^
-                          hammer2_icrc32(&xopst->lbase, sizeof(xopst->lbase)));
+               ng = (int)(hammer2_icrc32(&xop->ip1, sizeof(xop->ip1)));
+               if (desc == &hammer2_strategy_read_desc) {
+                       off = xopst->lbase / HAMMER2_PBUFSIZE;
+                       cdr = hammer2_cluster_data_read;
+                       /* sysctl race, load into var */
+                       cpu_ccfence();
+                       if (cdr)
+                               off /= cdr;
+                       ng ^= hammer2_icrc32(&off, sizeof(off)) &
+                             (hammer2_worker_rmask << 1);
+                       ng |= 1;
+               } else {
+#if 0
+                       off = xopst->lbase >> 21;
+                       ng ^= hammer2_icrc32(&off, sizeof(off)) & 3;
+#endif
+                       ng &= ~1;
+               }
                ng = ng & (HAMMER2_XOPGROUPS_MASK >> 1);
                ng += HAMMER2_XOPGROUPS / 2;
        } else {
index 65ca38f..b08127f 100644 (file)
@@ -498,40 +498,34 @@ _hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
                        /*
                         * Allows dirty buffers to accumulate and
                         * possibly be canceled (e.g. by a 'rm'),
-                        * will burst-write later.
+                        * by default we will burst-write later.
                         *
-                        * We normally do not allow the kernel to
-                        * cluster dirty buffers because H2 already
-                        * uses a large block size.
+                        * We generally do NOT want to issue an actual
+                        * b[a]write() or cluster_write() here.  Due to
+                        * the way chains are locked, buffers may be cycled
+                        * in and out quite often and disposal here can cause
+                        * multiple writes or write-read stalls.
                         *
-                        * NOTE: Do not use cluster_write() here.  The
-                        *       problem is that due to the way chains
-                        *       are locked, buffers are cycled in and out
-                        *       quite often so the disposal here is not
-                        *       necessarily the final disposal.  Avoid
-                        *       excessive rewriting of the same blocks
-                        *       by using bdwrite().
+                        * If FLUSH is set we do want to issue the actual
+                        * write.  This typically occurs in the write-behind
+                        * case when writing to large files.
                         */
-#if 0
                        off_t peof;
                        int hce;
-
-                       if ((hce = hammer2_cluster_write) > 0) {
-                               /*
-                                * Allows write-behind to keep the buffer
-                                * cache sane.
-                                */
-                               peof = (pbase + HAMMER2_SEGMASK64) &
-                                      ~HAMMER2_SEGMASK64;
-                               bp->b_flags |= B_CLUSTEROK;
-                               cluster_write(bp, peof, psize, hce);
-                       } else
-#endif
-                       if (hammer2_cluster_write)
-                               bp->b_flags |= B_CLUSTEROK;
-                       else
+                       if (dio->refs & HAMMER2_DIO_FLUSH) {
+                               if ((hce = hammer2_cluster_write) != 0) {
+                                       peof = (pbase + HAMMER2_SEGMASK64) &
+                                              ~HAMMER2_SEGMASK64;
+                                       bp->b_flags |= B_CLUSTEROK;
+                                       cluster_write(bp, peof, psize, hce);
+                               } else {
+                                       bp->b_flags &= ~B_CLUSTEROK;
+                                       bawrite(bp);
+                               }
+                       } else {
                                bp->b_flags &= ~B_CLUSTEROK;
-                       bdwrite(bp);
+                               bdwrite(bp);
+                       }
                } else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
                        brelse(bp);
                } else {
@@ -552,12 +546,14 @@ _hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
 
        /*
         * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
+        *
+        * Also clear FLUSH as it was handled above.
         */
        for (;;) {
                orefs = dio->refs;
                cpu_ccfence();
                nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
-                                 HAMMER2_DIO_WAITING);
+                                 HAMMER2_DIO_WAITING | HAMMER2_DIO_FLUSH);
                if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
                        if (orefs & HAMMER2_DIO_WAITING)
                                wakeup(dio);
@@ -711,7 +707,8 @@ _hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
 void
 _hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
 {
-       atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
+       atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
+                                     HAMMER2_DIO_FLUSH);
        _hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
 }
 
@@ -725,7 +722,8 @@ _hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
 int
 _hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
 {
-       atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
+       atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
+                                     HAMMER2_DIO_FLUSH);
        _hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
        return (0);     /* XXX */
 }
index 70bf42d..dfe17f8 100644 (file)
@@ -92,6 +92,7 @@ int hammer2_flush_pipe = 100;
 int hammer2_dio_count;
 int hammer2_dio_limit = 256;
 int hammer2_bulkfree_tps = 5000;
+int hammer2_worker_rmask = 3;
 long hammer2_chain_allocs;
 long hammer2_chain_frees;
 long hammer2_limit_dirty_chains;
@@ -144,6 +145,8 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
           &hammer2_inval_enable, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, worker_rmask, CTLFLAG_RW,
+          &hammer2_worker_rmask, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
           &hammer2_bulkfree_tps, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,