hammer2 - Refactor dedup, fixes, optimizations
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 24 Aug 2017 10:38:37 +0000 (03:38 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 24 Aug 2017 10:38:37 +0000 (03:38 -0700)
* Refactor the dedup code, reducing tracking complexity.  Also note that
  we cannot really depend on VM page caching for dedup tests.  Document why
  using bread() is better.

* Use a larger dedup heuristic table to improve dedup matching.

* Improve hammer2_io_getquick() and fix a bug where the dedup_delete code
  was improperly using it and could sometimes miss a dio structure because
  the underlying buffer was not fully cached.

* Cap indirect blocks at 16KB instead of 64KB.  This significantly reduces
  meta-data overhead.

* For now, remove I/O invalidation; it was causing corruption due to bugs.
  This means that deleted meta-data will be flushed.  However, a certain
  amount of meta-data does not get immediately instantiated, and file data
  chains are not instantiated unless the buffer cache gets flushed, so
  temporary files are still pretty cheap.

* Try to improve DIO's LRU recycling.

* Fix a brelse() that was supposed to be a bqrelse().  This improves
  meta-data caching which is desirable for dedup.

* Implement the 'always_compress' sysctl, which disables the H2 compression
  heuristic that tries to detect uncompressible data.  If set, H2 will always
  try to compress.
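
The refactor in the first item above replaces the per-chain DEDUP flag with a
per-DIO fragment mask (dedup_ok_mask in hammer2.h, hammer2_dedup_mask() in
hammer2_strategy.c below).  The following is a minimal stand-alone sketch of
the idea only, not the committed code; the names frag_mask, FRAG_SIZE, and
dio_base are illustrative:

    /*
     * Each 64KB DIO carries a 64-bit mask; every bit covers one 1KB
     * fragment (PBUFSIZE / 64).  A data range remains a dedup candidate
     * only while all of its fragment bits are set.
     */
    #include <stdint.h>

    #define PBUFSIZE        65536
    #define FRAG_SIZE       (PBUFSIZE / 64)         /* 1KB per mask bit */

    static uint64_t
    frag_mask(uint64_t dio_base, uint64_t data_off, unsigned bytes)
    {
            unsigned bbeg = (unsigned)((data_off - dio_base) / FRAG_SIZE);
            unsigned bits = (bytes + FRAG_SIZE - 1) / FRAG_SIZE;
            uint64_t mask;

            if (bbeg + bits >= 64)
                    mask = (uint64_t)-1;
            else
                    mask = ((uint64_t)1 << (bbeg + bits)) - 1;
            mask &= ~(((uint64_t)1 << bbeg) - 1);
            return mask;
    }

Recording a block sets these bits (hammer2_dedup_record), bulkfree and chain
destruction clear them (hammer2_dedup_delete), and a dedup lookup only
proceeds while the candidate range's bits are all still set.  For example, a
16KB block starting 16KB into its 64KB DIO gives bbeg = 16 and bits = 16,
i.e. a mask of 0x00000000ffff0000.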

sbin/hammer2/hammer2.8
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_bulkfree.c
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_strategy.c
sys/vfs/hammer2/hammer2_vfsops.c

index e17057d..da1498a 100644 (file)
@@ -286,6 +286,14 @@ data.
 Hammer2 compression is only effective when it can reduce the size of dataset
 (typically a 64KB block) by one or more powers of 2.  A 64K block which
 only compresses to 40K will not yield any storage improvement.
+.Pp
+Generally speaking, you do not want to set the compression mode to 'none',
+as this will cause blocks of all-zeros to be written as all-zero blocks,
+instead of holes.  The 'autozero' compression mode detects blocks of all-zeros
+and writes them as holes.  However, HAMMER2 will rewrite data in-place if
+the compression mode is set to 'none' and the check code is set to
+'disabled'.  Formal snapshots will still snapshot such files, but
+de-duplication will no longer function on the data blocks.
 .\" ==== setcheck ====
 .It Cm setcheck Ar check Op path...
 Set the check code as specified for any newly created elements at or under
@@ -313,6 +321,26 @@ You can specify any PFS for the mount, the bulkfree pass is run on the
 entire partition.
 Note that it takes two passes to actually free space.
 .El
+.Sh SYSCTLS
+.Bl -tag -width indent
+.It Va vfs.hammer2.dedup_enable (default on)
+Enables live de-duplication.  Any recently read data that is on-media
+(already synchronized to media) is compared against pending writes.
+If a match is found, the write will reference the
+existing on-media data instead of writing new data.
+.It Va vfs.hammer2.always_compress (default off)
+This disables the H2 compression heuristic and forces H2 to always
+try to compress data blocks, even if they look uncompressible.
+Enabling this option reduces performance but improves de-duplication
+repeatability.
+.It Va vfs.hammer2.cluster_read (default 4)
+Set the amount of read-ahead clustering to perform.
+.It Va vfs.hammer2.cluster_write (default 0)
+Set the amount of write-behind clustering to perform.  This is disabled by
+default in order to give temporary files a chance to be deleted before
+media writes are committed.  Enabling this reduces buffer cache stress
+but causes file writes to flush to media more quickly.
+.El
 .Sh SETTING UP /etc/hammer2
 The 'rsainit' directive will create the
 .Pa /etc/hammer2
index 3357686..92a92ad 100644 (file)
@@ -292,8 +292,9 @@ typedef struct hammer2_iocb hammer2_iocb_t;
 /*
  * DIO - Management structure wrapping system buffer cache.
  *
- *      Used for multiple purposes including concurrent management
- *      if small requests by chains into larger DIOs.
+ * HAMMER2 uses an I/O abstraction that allows it to cache and manipulate
+ * fixed-sized filesystem buffers fronted by variable-sized hammer2_chain
+ * structures.
  */
 struct hammer2_io {
        RB_ENTRY(hammer2_io) rbnode;    /* indexed by device offset */
@@ -306,7 +307,9 @@ struct hammer2_io {
        int             psize;
        int             act;            /* activity */
        int             btype;          /* approximate BREF_TYPE_* */
-       int             unused01;
+       int             ticks;
+       uint64_t        invalid_mask;   /* area that is invalid on-disk */
+       uint64_t        dedup_ok_mask;  /* ok to dedup */
 };
 
 typedef struct hammer2_io hammer2_io_t;
@@ -315,13 +318,9 @@ typedef struct hammer2_io hammer2_io_t;
 #define HAMMER2_DIO_GOOD       0x4000000000000000LLU   /* dio->bp is stable */
 #define HAMMER2_DIO_WAITING    0x2000000000000000LLU   /* wait on INPROG */
 #define HAMMER2_DIO_DIRTY      0x1000000000000000LLU   /* flush last drop */
-#define HAMMER2_DIO_INVALOK    0x0800000000000000LLU   /* ok to inval */
-#define HAMMER2_DIO_INVAL      0x0400000000000000LLU   /* inval request */
 
 #define HAMMER2_DIO_MASK       0x00FFFFFFFFFFFFFFLLU
 
-#define HAMMER2_DIO_INVALBITS  (HAMMER2_DIO_INVAL | HAMMER2_DIO_INVALOK)
-
 /*
  * Primary chain structure keeps track of the topology in-memory.
  */
@@ -379,7 +378,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_MODIFIED         0x00000001      /* dirty chain data */
 #define HAMMER2_CHAIN_ALLOCATED                0x00000002      /* kmalloc'd chain */
 #define HAMMER2_CHAIN_DESTROY          0x00000004
-#define HAMMER2_CHAIN_DEDUP            0x00000008      /* recorded for dedup */
+#define HAMMER2_CHAIN_UNUSED0008       0x00000008
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
 #define HAMMER2_CHAIN_UPDATE           0x00000040      /* need parent update */
@@ -783,7 +782,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_FREEMAP_HEUR_SIZE      (HAMMER2_FREEMAP_HEUR_NRADIX * \
                                         HAMMER2_FREEMAP_HEUR_TYPES)
 
-#define HAMMER2_DEDUP_HEUR_SIZE                65536
+#define HAMMER2_DEDUP_HEUR_SIZE                (65536 * 4)
 #define HAMMER2_DEDUP_HEUR_MASK                (HAMMER2_DEDUP_HEUR_SIZE - 1)
 
 #define HAMMER2_FLUSH_TOP              0x0001
@@ -1076,7 +1075,7 @@ struct hammer2_dev {
        int             nipstacks;
        int             maxipstacks;
        kdmsg_iocom_t   iocom;          /* volume-level dmsg interface */
-       struct spinlock io_spin;        /* iotree access */
+       struct spinlock io_spin;        /* iotree, iolruq access */
        struct hammer2_io_tree iotree;
        int             iofree_count;
        hammer2_chain_t vchain;         /* anchor chain (topology) */
@@ -1287,10 +1286,12 @@ extern int hammer2_debug;
 extern int hammer2_cluster_read;
 extern int hammer2_cluster_write;
 extern int hammer2_dedup_enable;
+extern int hammer2_always_compress;
 extern int hammer2_inval_enable;
 extern int hammer2_flush_pipe;
 extern int hammer2_synchronous_flush;
 extern int hammer2_dio_count;
+extern int hammer2_limit_dio;
 extern long hammer2_chain_allocs;
 extern long hammer2_chain_frees;
 extern long hammer2_limit_dirty_chains;
@@ -1494,7 +1495,12 @@ hammer2_tid_t hammer2_trans_sub(hammer2_pfs_t *pmp);
 void hammer2_trans_done(hammer2_pfs_t *pmp);
 hammer2_tid_t hammer2_trans_newinum(hammer2_pfs_t *pmp);
 void hammer2_trans_assert_strategy(hammer2_pfs_t *pmp);
-void hammer2_dedup_record(hammer2_chain_t *chain, char *data);
+void hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio,
+                               char *data);
+void hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off,
+                               u_int bytes);
+void hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off,
+                               u_int bytes);
 
 /*
  * hammer2_ioctl.c
@@ -1506,10 +1512,11 @@ int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
  * hammer2_io.c
  */
 void hammer2_io_putblk(hammer2_io_t **diop);
+void hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes);
 void hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree);
 char *hammer2_io_data(hammer2_io_t *dio, off_t lbase);
-hammer2_io_t *hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize);
-void hammer2_io_resetinval(hammer2_dev_t *hmp, off_t lbase);
+hammer2_io_t *hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize,
+                               int notgood);
 void hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
                                hammer2_iocb_t *iocb);
 void hammer2_io_complete(hammer2_iocb_t *iocb);
@@ -1527,7 +1534,6 @@ void hammer2_io_bdwrite(hammer2_io_t **diop);
 int hammer2_io_bwrite(hammer2_io_t **diop);
 int hammer2_io_isdirty(hammer2_io_t *dio);
 void hammer2_io_setdirty(hammer2_io_t *dio);
-void hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes);
 void hammer2_io_brelse(hammer2_io_t **diop);
 void hammer2_io_bqrelse(hammer2_io_t **diop);
 int hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp);
index e6aca81..b5132e6 100644 (file)
@@ -792,21 +792,6 @@ next:
        }
 }
 
-/*
- * When bulkfree is finally able to free a block it must make sure that
- * the INVALOK bit in any cached DIO is cleared prior to the block being
- * reused.
- */
-static
-void
-fixup_dio(hammer2_dev_t *hmp, hammer2_off_t data_off, int bindex, int scount)
-{
-       data_off += (scount >> 1) * HAMMER2_FREEMAP_BLOCK_SIZE;
-       data_off += bindex *
-               (HAMMER2_FREEMAP_BLOCK_SIZE * HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
-       hammer2_io_resetinval(hmp, data_off);
-}
-
 /*
  * Merge the bulkfree bitmap against the existing bitmap.
  *
@@ -821,14 +806,19 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
 {
        int bindex;
        int scount;
+       hammer2_off_t tmp_off;
        hammer2_bitmap_t lmask;
        hammer2_bitmap_t mmask;
 
+       tmp_off = data_off;
+
        for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
                lmask = live->bitmapq[bindex];  /* live */
                mmask = bmap->bitmapq[bindex];  /* snapshotted bulkfree */
-               if (lmask == mmask)
+               if (lmask == mmask) {
+                       tmp_off += HAMMER2_BMAP_INDEX_SIZE;
                        continue;
+               }
 
                for (scount = 0;
                     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
@@ -864,15 +854,21 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                                        cbinfo->adj_free +=
                                                HAMMER2_FREEMAP_BLOCK_SIZE;
                                        ++cbinfo->count_10_00;
-                                       fixup_dio(cbinfo->hmp, data_off,
-                                                 bindex, scount);
+                                       hammer2_dedup_assert(
+                                               cbinfo->hmp,
+                                               tmp_off |
+                                               HAMMER2_FREEMAP_BLOCK_RADIX,
+                                               HAMMER2_FREEMAP_BLOCK_SIZE);
                                        break;
                                case 3: /* 11 -> 10 */
                                        live->bitmapq[bindex] &=
                                            ~((hammer2_bitmap_t)1 << scount);
                                        ++cbinfo->count_11_10;
-                                       fixup_dio(cbinfo->hmp, data_off,
-                                                 bindex, scount);
+                                       hammer2_dedup_delete(
+                                               cbinfo->hmp,
+                                               tmp_off |
+                                               HAMMER2_FREEMAP_BLOCK_RADIX,
+                                               HAMMER2_FREEMAP_BLOCK_SIZE);
                                        break;
                                }
                        } else if ((mmask & 3) == 3) {
@@ -905,11 +901,10 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                                }
                                live->bitmapq[bindex] |=
                                        ((hammer2_bitmap_t)3 << scount);
-                               fixup_dio(cbinfo->hmp, data_off,
-                                         bindex, scount);
                        }
                        mmask >>= 2;
                        lmask >>= 2;
+                       tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE;
                }
        }
 
@@ -929,10 +924,25 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                live->class = 0;
                live->linear = 0;
                ++cbinfo->count_l0cleans;
+#if 0
+               hammer2_dedup_assert(cbinfo->hmp,
+                                    data_off |
+                                    HAMMER2_FREEMAP_LEVEL0_RADIX,
+                                    HAMMER2_FREEMAP_LEVEL0_SIZE);
+#endif
        } else if (bindex < 7) {
+               int32_t nlinear;
+
                ++bindex;
+
                if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
-                       live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
+                       nlinear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
+#if 0
+                       hammer2_dedup_assert(cbinfo->hmp,
+                                            data_off + nlinear,
+                                            live->linear - nlinear);
+#endif
+                       live->linear = nlinear;
                        ++cbinfo->count_linadjusts;
                }
 
index 644544c..d601113 100644 (file)
@@ -488,18 +488,16 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain)
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 
                /*
-                * If the chain has children or if it has been MODIFIED and
-                * also recorded for DEDUP, we must still flush the chain.
+                * If the chain has children we must still flush the chain.
+                * Any dedup is already handled by the underlying DIO, so
+                * we do not have to specifically flush it here.
                 *
                 * In the case where it has children, the DESTROY flag test
                 * in the flush code will prevent unnecessary flushes of
                 * MODIFIED chains that are not flagged DEDUP so don't worry
                 * about that here.
                 */
-               if (chain->core.chain_count ||
-                   (chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                                    HAMMER2_CHAIN_DEDUP)) ==
-                   (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_DEDUP)) {
+               if (chain->core.chain_count) {
                        /*
                         * Put on flushq (should ensure refs > 1), retry
                         * the drop.
@@ -1425,8 +1423,6 @@ static __inline
 int
 modified_needs_new_allocation(hammer2_chain_t *chain)
 {
-       hammer2_io_t *dio;
-
        /*
         * We only live-dedup data, we do not live-dedup meta-data.
         */
@@ -1441,6 +1437,10 @@ modified_needs_new_allocation(hammer2_chain_t *chain)
        if (chain->bytes == 0)
                return 0;
 
+       return 0;
+
+#if 0
+       hammer2_io_t *dio;
        /*
         * If this flag is not set the current modification has not been
         * recorded for dedup so a new allocation is not needed.  The
@@ -1474,6 +1474,7 @@ modified_needs_new_allocation(hammer2_chain_t *chain)
                }
        }
        return 1;
+#endif
 }
 
 /*
@@ -1592,7 +1593,9 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
         * containing the caller's desired data.  The dedup offset is
         * allowed to be in a partially free state and we must be sure
         * to reset it to a fully allocated state to force two bulkfree
-        * passes to free it again.
+        * passes to free it again.  The chain will not be marked MODIFIED
+        * in the dedup case, as the dedup data cannot be changed without
+        * a new allocation.
         *
         * NOTE: Only applicable when chain->bytes != 0.
         *
@@ -1608,14 +1611,16 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
                                chain->bref.data_off = dedup_off;
                                chain->bytes = 1 << (dedup_off &
                                                     HAMMER2_OFF_MASK_RADIX);
-                               atomic_set_int(&chain->flags,
-                                              HAMMER2_CHAIN_DEDUP);
+                               atomic_clear_int(&chain->flags,
+                                                HAMMER2_CHAIN_MODIFIED);
+                               atomic_add_long(&hammer2_count_modified_chains,
+                                               -1);
+                               if (chain->pmp)
+                                       hammer2_pfs_memory_wakeup(chain->pmp);
                                hammer2_freemap_adjust(hmp, &chain->bref,
                                                HAMMER2_FREEMAP_DORECOVER);
                        } else {
                                hammer2_freemap_alloc(chain, chain->bytes);
-                               atomic_clear_int(&chain->flags,
-                                                HAMMER2_CHAIN_DEDUP);
                        }
                        /* XXX failed allocation */
                }
@@ -3440,7 +3445,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
 
        /*
         * How big should our new indirect block be?  It has to be at least
-        * as large as its parent.
+        * as large as its parent for splits to work properly.
         *
         * The freemap uses a specific indirect block size.  The number of
         * levels are built dynamically and ultimately depend on the size
@@ -3449,16 +3454,22 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
         * much to save disk space.
         *
         * The first indirect block level for a directory usually uses
-        * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).
-        * (the 4 entries built-into the inode can handle 4 directory
-        *  entries)
+        * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
+        * the hash mechanism, this typically gives us a nominal
+        * 32 * 4 entries with one level of indirection.
         *
-        * The first indirect block level for a file usually uses
-        * HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs = ~8MB file).
-        * (the 4 entries built-into the inode can handle a 256KB file).
+        * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
+        * indirect blocks.  The initial 4 entries in the inode gives us
+        * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
+        * of indirection gives us 137GB, and so forth.  H2 can support
+        * huge file sizes but they are not typical, so we try to stick
+        * with compactness and do not use a larger indirect block size.
         *
-        * The first indirect block level down from an inode typically
-        * uses LBUFSIZE (16384), else it uses PBUFSIZE (65536).
+        * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
+        * due to the way indirect blocks are created this usually winds
+        * up being extremely inefficient for small files.  Even though
+        * 16KB requires more levels of indirection for very large files,
+        * the 16KB records can be ganged together into 64KB DIOs.
         */
        if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
            for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
@@ -3471,7 +3482,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                        nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */
 
        } else {
-               nbytes = HAMMER2_IND_BYTES_MAX;
+               nbytes = HAMMER2_IND_BYTES_NOM;
        }
        if (nbytes < count * sizeof(hammer2_blockref_t)) {
                KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
index 7924647..f096b6f 100644 (file)
@@ -868,25 +868,31 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                }
 
                /*
-                * If the chain was destroyed try to avoid unnecessary I/O.
-                * The DIO system buffer may silently disallow the
-                * invalidation.
+                * If the chain was destroyed try to avoid unnecessary I/O
+                * that might not have yet occurred.  Remove the data range
+                * from dedup candidacy and attempt to invalidate the
+                * potentially dirty portion of the I/O buffer.
                 */
                if (chain->flags & HAMMER2_CHAIN_DESTROY) {
+                       hammer2_dedup_delete(hmp,
+                                            chain->bref.data_off,
+                                            chain->bytes);
+#if 0
                        hammer2_io_t *dio;
-
                        if (chain->dio) {
-                               hammer2_io_setinval(chain->dio,
-                                                   chain->bref.data_off,
-                                                   chain->bytes);
+                               hammer2_io_inval(chain->dio,
+                                                chain->bref.data_off,
+                                                chain->bytes);
                        } else if ((dio = hammer2_io_getquick(hmp,
                                                  chain->bref.data_off,
-                                                 chain->bytes)) != NULL) {
-                               hammer2_io_setinval(dio,
-                                                   chain->bref.data_off,
-                                                   chain->bytes);
+                                                 chain->bytes,
+                                                 1)) != NULL) {
+                               hammer2_io_inval(dio,
+                                                chain->bref.data_off,
+                                                chain->bytes);
                                hammer2_io_putblk(&dio);
                        }
+#endif
                }
        }
 
index c547e4e..46c0540 100644 (file)
 
 /*
  * Implements an abstraction layer for synchronous and asynchronous
- * buffered device I/O.  Can be used for OS-abstraction but the main
+ * buffered device I/O.  Can be used as an OS-abstraction but the main
  * purpose is to allow larger buffers to be used against hammer2_chain's
  * using smaller allocations, without causing deadlocks.
  *
+ * The DIOs also record temporary state with limited persistence.  This
+ * feature is used to keep track of dedupable blocks.
  */
 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
-static void dio_write_stats_update(hammer2_io_t *dio);
+static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);
 
 static int
 hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
@@ -101,6 +103,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
        hammer2_io_t *xio;
        off_t pbase;
        off_t pmask;
+
        /*
         * XXX after free, buffer reuse case w/ different size can clash
         * with dio cache.  Lets avoid it for now.  Ultimate we need to
@@ -141,6 +144,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
                dio->psize = psize;
                dio->btype = iocb->btype;
                dio->refs = 1;
+               dio->act = 5;
                hammer2_spin_init(&dio->spin, "h2dio");
                TAILQ_INIT(&dio->iocbq);
                hammer2_spin_ex(&hmp->io_spin);
@@ -164,8 +168,9 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
         */
        iocb->dio = dio;
 
-       if (dio->act < 5)       /* SMP race ok */
-               ++dio->act;
+       dio->ticks = ticks;
+       if (dio->act < 10)
+               ++dio->act;             /* SMP race ok */
 
        for (;;) {
                refs = dio->refs;
@@ -226,7 +231,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
  * caches the data.
  */
 hammer2_io_t *
-hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
+hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, int notgood)
 {
        hammer2_iocb_t iocb;
        hammer2_io_t *dio;
@@ -262,8 +267,9 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
                atomic_add_int(&dio->hmp->iofree_count, -1);
        hammer2_spin_unsh(&hmp->io_spin);
 
-       if (dio->act < 5)       /* SMP race ok */
-               ++dio->act;
+       dio->ticks = ticks;
+       if (dio->act < 10)
+               ++dio->act;             /* SMP race ok */
 
        /*
         * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
@@ -300,15 +306,23 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
                /*
                 * We own DIO_INPROG, try to set DIO_GOOD.
                 *
-                * For now do not use GETBLK_NOWAIT because 
+                * If (notgood) is specified the caller just wants the dio and
+                * doesn't care much about the buffer.  However, if the buffer
+                * is good (or dirty), we still want to return it.
+                *
+                * Otherwise we are trying to resolve a dedup and bread()
+                * is expected to always be better than building a new buffer
+                * that will be written.  Use bread() for better determinism
+                * than getblk().
                 */
                bp = dio->bp;
                dio->bp = NULL;
                if (bp == NULL) {
-#if 0
-                       bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
-#endif
-                       bread(hmp->devvp, dio->pbase, dio->psize, &bp);
+                       if (notgood)
+                               bp = getblk(hmp->devvp, dio->pbase,
+                                           dio->psize, 0, 0);
+                       else
+                               bread(hmp->devvp, dio->pbase, dio->psize, &bp);
                }
 
                /*
@@ -338,38 +352,21 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
        }
 
        /*
-        * Only return the dio if its buffer is good.  If the buffer is not
-        * good be sure to clear INVALOK, meaning that invalidation is no
-        * longer acceptable
+        * Only return the dio if its buffer is good.  If notgood != 0,
+        * we return the dio regardless (so ephemeral dedup bits can be
+        * cleared).
         */
-       if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
+       if (notgood == 0 && (dio->refs & HAMMER2_DIO_GOOD) == 0) {
                hammer2_io_putblk(&dio);
        }
        return dio;
 }
 
-/*
- * Make sure that all invalidation flags are cleared on the dio associated
- * with the specified data offset, if the dio exists.
- *
- * Called from bulkfree when a block becomes reusable to ensure that new
- * allocations do not accidently discard the buffer later on.
- */
-void
-hammer2_io_resetinval(hammer2_dev_t *hmp, off_t data_off)
-{
-       hammer2_io_t *dio;
-
-       data_off &= ~HAMMER2_PBUFMASK64;
-       hammer2_spin_sh(&hmp->io_spin);
-       dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, data_off);
-       if (dio)
-               atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALBITS);
-       hammer2_spin_unsh(&hmp->io_spin);
-}
-
 /*
  * The originator of the iocb is finished with it.
+ *
+ * WARNING: iocb may be partially initialized with only iocb->dio and
+ *         iocb->flags.
  */
 void
 hammer2_io_complete(hammer2_iocb_t *iocb)
@@ -518,6 +515,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
        off_t peof;
        off_t pbase;
        int psize;
+       int limit_dio;
        uint64_t orefs;
        uint64_t nrefs;
 
@@ -525,9 +523,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
        *diop = NULL;
        hmp = dio->hmp;
 
-       while (dio->unused01) {
-               tsleep(&dio->unused01, 0, "h2DEBUG", hz);
-       }
+       KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);
 
        /*
         * Drop refs.
@@ -550,7 +546,6 @@ hammer2_io_putblk(hammer2_io_t **diop)
                         * Lastdrop case, INPROG can be set.
                         */
                        nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
-                       nrefs &= ~(HAMMER2_DIO_INVAL);
                        nrefs |= HAMMER2_DIO_INPROG;
                        if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
                                break;
@@ -575,7 +570,9 @@ hammer2_io_putblk(hammer2_io_t **diop)
 
        /*
         * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
-        * have been cleared.
+        * have been cleared.  iofree_count has not yet been incremented;
+        * note that a racing accessor will decrement iofree_count, so
+        * we have to increment it regardless.
         *
         * We can now dispose of the buffer, and should do it before calling
         * io_complete() in case there's a race against a new reference
@@ -588,7 +585,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
 
        if (orefs & HAMMER2_DIO_GOOD) {
                KKASSERT(bp != NULL);
-#if 1
+#if 0
                if (hammer2_inval_enable &&
                    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
                        ++hammer2_iod_invals;
@@ -599,7 +596,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
                if (orefs & HAMMER2_DIO_DIRTY) {
                        int hce;
 
-                       dio_write_stats_update(dio);
+                       dio_write_stats_update(dio, bp);
                        if ((hce = hammer2_cluster_write) > 0) {
                                /*
                                 * Allows write-behind to keep the buffer
@@ -624,7 +621,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
                        bqrelse(bp);
                }
        } else if (bp) {
-#if 1
+#if 0
                if (hammer2_inval_enable &&
                    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
                        ++hammer2_iod_invals;
@@ -633,10 +630,10 @@ hammer2_io_putblk(hammer2_io_t **diop)
                } else
 #endif
                if (orefs & HAMMER2_DIO_DIRTY) {
-                       dio_write_stats_update(dio);
+                       dio_write_stats_update(dio, bp);
                        bdwrite(bp);
                } else {
-                       brelse(bp);
+                       bqrelse(bp);
                }
        }
 
@@ -660,13 +657,19 @@ hammer2_io_putblk(hammer2_io_t **diop)
         * We cache free buffers so re-use cases can use a shared lock, but
         * if too many build up we have to clean them out.
         */
-       if (hmp->iofree_count > 65536) {
+       limit_dio = hammer2_limit_dio;
+       if (limit_dio < 256)
+               limit_dio = 256;
+       if (limit_dio > 1024*1024)
+               limit_dio = 1024*1024;
+       if (hmp->iofree_count > limit_dio) {
                struct hammer2_cleanupcb_info info;
 
+               kprintf("x");
                RB_INIT(&info.tmptree);
                hammer2_spin_ex(&hmp->io_spin);
-               if (hmp->iofree_count > 65536) {
-                       info.count = hmp->iofree_count / 4;
+               if (hmp->iofree_count > limit_dio) {
+                       info.count = hmp->iofree_count / 5;
                        RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
                                hammer2_io_cleanup_callback, &info);
                }
@@ -690,15 +693,22 @@ hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
 
        if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
                if (dio->act > 0) {
-                       --dio->act;
-                       return 0;
+                       int act;
+
+                       act = dio->act - (ticks - dio->ticks) / hz - 1;
+                       if (act > 0) {
+                               dio->act = act;
+                               return 0;
+                       }
+                       dio->act = 0;
                }
                KKASSERT(dio->bp == NULL);
-               RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
-               xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
-               KKASSERT(xio == NULL);
-               if (--info->count <= 0) /* limit scan */
-                       return(-1);
+               if (info->count > 0) {
+                       RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
+                       xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
+                       KKASSERT(xio == NULL);
+                       --info->count;
+               }
        }
        return 0;
 }
@@ -837,25 +847,6 @@ hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
                                        vfs_bio_clrbuf(dio->bp);
                                        dio->bp->b_flags |= B_CACHE;
                                }
-
-                               /*
-                                * Invalidation is ok on newly allocated
-                                * buffers which cover the entire buffer.
-                                * Flag will be cleared on use by the de-dup
-                                * code.
-                                *
-                                * hammer2_chain_modify() also checks this flag.
-                                *
-                                * QUICK mode is used by the freemap code to
-                                * pre-validate a junk buffer to prevent an
-                                * unnecessary read I/O.  We do NOT want
-                                * to set INVALOK in that situation as the
-                                * underlying allocations may be smaller.
-                                */
-                               if ((iocb->flags & HAMMER2_IOCB_QUICK) == 0) {
-                                       atomic_set_64(&dio->refs,
-                                                     HAMMER2_DIO_INVALOK);
-                               }
                        } else if (iocb->flags & HAMMER2_IOCB_QUICK) {
                                /*
                                 * Partial buffer, quick mode.  Do nothing.
@@ -879,7 +870,8 @@ hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
                                 */
                                if (dio->bp) {
                                        if (dio->refs & HAMMER2_DIO_DIRTY) {
-                                               dio_write_stats_update(dio);
+                                               dio_write_stats_update(dio,
+                                                                      dio->bp);
                                                bdwrite(dio->bp);
                                        } else {
                                                bqrelse(dio->bp);
@@ -1081,15 +1073,24 @@ hammer2_io_setdirty(hammer2_io_t *dio)
 }
 
 /*
- * Request an invalidation.  The hammer2_io code will oblige only if
- * DIO_INVALOK is also set.  INVALOK is cleared if the dio is used
- * in a dedup lookup and prevents invalidation of the dirty buffer.
+ * This routine is called when a MODIFIED chain is being DESTROYED,
+ * in an attempt to allow the related buffer cache buffer to be
+ * invalidated and discarded instead of flushing it to disk.
+ *
+ * At the moment this case is only really useful for file meta-data.
+ * File data is already handled via the logical buffer cache associated
+ * with the vnode, and will be discarded if it was never flushed to disk.
+ * File meta-data may include inodes, directory entries, and indirect blocks.
+ *
+ * XXX
+ * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
+ * invalidated might be smaller.  Most of the meta-data structures above
+ * are in the 'smaller' category.  For now, don't try to invalidate the
+ * data areas.
  */
 void
-hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
+hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
 {
-       if ((u_int)dio->psize == bytes)
-               atomic_set_64(&dio->refs, HAMMER2_DIO_INVAL);
 }
 
 void
@@ -1112,10 +1113,13 @@ hammer2_io_isdirty(hammer2_io_t *dio)
 
 static
 void
-dio_write_stats_update(hammer2_io_t *dio)
+dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
 {
        long *counterp;
 
+       if (bp->b_flags & B_DELWRI)
+               return;
+
        switch(dio->btype) {
        case 0:
                return;
index 7a96e97..39dce35 100644 (file)
@@ -451,8 +451,11 @@ hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
                 * block device behind us.  This leaves more room in the
                 * LRU chain cache for meta-data chains which we really
                 * want to retain.
+                *
+                * NOTE: Deduplication cannot be safely recorded for
+                *       records without a check code.
                 */
-               hammer2_dedup_record(chain, data);
+               hammer2_dedup_record(chain, NULL, data);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
 
                /*
@@ -919,13 +922,19 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip,
        /*
         * Compression requested.  Try to compress the block.  We store
         * the data normally if we cannot sufficiently compress it.
+        *
+        * We have a heuristic to detect files which are mostly
+        * uncompressible and avoid the compression attempt in that
+        * case.  If the compression heuristic is turned off, we always
+        * try to compress.
         */
        comp_size = 0;
        comp_buffer = NULL;
 
        KKASSERT(pblksize / 2 <= 32768);
                
-       if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
+       if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0 ||
+           hammer2_always_compress) {
                z_stream strm_compress;
                int comp_level;
                int ret;
@@ -1116,7 +1125,6 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip,
                         * so we do it here.
                         */
                        hammer2_chain_setcheck(chain, bdata);
-                       hammer2_dedup_record(chain, bdata);
 
                        /*
                         * Device buffer is now valid, chain is no longer in
@@ -1125,6 +1133,7 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip,
                         * (No blockref table worries with file data)
                         */
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+                       hammer2_dedup_record(chain, dio, bdata);
 
                        /* Now write the related bdp. */
                        if (ioflag & IO_SYNC) {
@@ -1325,7 +1334,6 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag,
                 * so we do it here.
                 */
                hammer2_chain_setcheck(chain, bdata);
-               hammer2_dedup_record(chain, bdata);
 
                /*
                 * Device buffer is now valid, chain is no longer in
@@ -1334,6 +1342,7 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag,
                 * (No blockref table worries with file data)
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+               hammer2_dedup_record(chain, dio, bdata);
 
                if (ioflag & IO_SYNC) {
                        /*
@@ -1362,19 +1371,50 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag,
        *errorp = error;
 }
 
+#define HAMMER2_DEDUP_FRAG     (HAMMER2_PBUFSIZE / 64)
+#define HAMMER2_DEDUP_FRAGRADIX        (HAMMER2_PBUFRADIX - 6)
+
+static __inline
+uint64_t
+hammer2_dedup_mask(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
+{
+       int bbeg;
+       int bits;
+       uint64_t mask;
+
+       bbeg = (int)((data_off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) >>
+              HAMMER2_DEDUP_FRAGRADIX;
+       bits = (int)((bytes + (HAMMER2_DEDUP_FRAG - 1)) >>
+              HAMMER2_DEDUP_FRAGRADIX);
+       mask = ((uint64_t)1 << bbeg) - 1;
+       if (bbeg + bits == 64)
+               mask = (uint64_t)-1;
+       else
+               mask = ((uint64_t)1 << (bbeg + bits)) - 1;
+
+       mask &= ~(((uint64_t)1 << bbeg) - 1);
+
+       return mask;
+}
+
 /*
- * LIVE DEDUP HEURISTIC
+ * LIVE DEDUP HEURISTICS
+ *
+ * Record that the media data area is available for dedup operation.  This
+ * will set the appropriate dedup bits in the DIO.  These bits will be cleared
+ * if the dedup area becomes unavailable.
  *
  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
  *         All fields must be loaded into locals and validated.
  *
- * WARNING! Should only be used for file data, hammer2_chain_modify() only
- *         checks for the dedup case on data chains.  Also, dedup data can
- *         only be recorded for committed chains (so NOT strategy writes
- *         which can undergo further modification after the fact!).
+ * WARNING! Should only be used for file data and directory entries,
+ *         hammer2_chain_modify() only checks for the dedup case on data
+ *         chains.  Also, dedup data can only be recorded for committed
+ *         chains (so NOT strategy writes which can undergo further
+ *         modification after the fact!).
  */
 void
-hammer2_dedup_record(hammer2_chain_t *chain, char *data)
+hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, char *data)
 {
        hammer2_dev_t *hmp;
        hammer2_dedup_t *dedup;
@@ -1383,9 +1423,21 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data)
        int i;
        int dticks;
 
+       /*
+        * We can only record a dedup if we have media data to test against.
+        * If dedup is not enabled, return early, which allows a chain to
+        * remain marked MODIFIED (which might have benefits in special
+        * situations, though typically it does not).
+        */
        if (hammer2_dedup_enable == 0)
                return;
+       if (dio == NULL) {
+               dio = chain->dio;
+               if (dio == NULL)
+                       return;
+       }
 
+#if 0
        /*
         * Only committed data can be recorded for de-duplication, otherwise
         * the contents may change out from under us.  So, on read if the
@@ -1395,7 +1447,7 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data)
            (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_INITIAL)) == 0) {
                return;
        }
-
+#endif
 
        hmp = chain->hmp;
 
@@ -1458,7 +1510,65 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data)
        dedup->ticks = ticks;
        dedup->data_off = chain->bref.data_off;
        dedup->data_crc = crc;
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
+
+       atomic_set_64(&dio->dedup_ok_mask,
+                     hammer2_dedup_mask(dio, chain->bref.data_off,
+                                        chain->bytes));
+
+       /*
+        * Once we record the dedup the chain must be marked clean to
+        * prevent reuse of the underlying block.   Remember that this
+        * write occurs when the buffer cache is flushed (i.e. on sync(),
+        * fsync(), filesystem periodic sync, or when the kernel needs to
+        * flush a buffer), and not whenever the user write()s.
+        */
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+               atomic_add_long(&hammer2_count_modified_chains, -1);
+               if (chain->pmp)
+                       hammer2_pfs_memory_wakeup(chain->pmp);
+       }
+}
+
+/*
+ * Remove the data range from dedup consideration.  This has no effect on
+ * any dedups which have already occurred.  We do not need a valid buffer
+ * for this operation and must clean out dedup_ok_mask even if the dio is
+ * cached without any buffer available.
+ */
+void
+hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
+{
+       hammer2_io_t *dio;
+
+       dio = hammer2_io_getquick(hmp, data_off, bytes, 1);
+       if (dio) {
+               if (data_off < dio->pbase ||
+                   (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
+                   dio->pbase + dio->psize) {
+                       panic("DATAOFF BAD %016jx/%d %016jx\n",
+                               data_off, bytes, dio->pbase);
+               }
+               atomic_clear_64(&dio->dedup_ok_mask,
+                               hammer2_dedup_mask(dio, data_off, bytes));
+               hammer2_io_putblk(&dio);
+       }
+}
+
+/*
+ * Assert that the data range is not considered for dedup operation.
+ */
+void
+hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
+{
+       hammer2_io_t *dio;
+
+       dio = hammer2_io_getquick(hmp, data_off, bytes, 1);
+       if (dio) {
+               KKASSERT((dio->dedup_ok_mask &
+                         hammer2_dedup_mask(dio, data_off, bytes)) == 0);
+               hammer2_io_putblk(&dio);
+       }
 }
 
 static
@@ -1469,7 +1579,9 @@ hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
        hammer2_io_t *dio;
        hammer2_off_t off;
        uint64_t crc;
+       uint64_t mask;
        char *data;
+       char *dtmp;
        int i;
 
        if (hammer2_dedup_enable == 0)
@@ -1499,28 +1611,26 @@ hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
                        continue;
                if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize)
                        continue;
-               dio = hammer2_io_getquick(hmp, off, pblksize);
-               if (dio &&
-                   bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) {
-                       /*
-                        * Make sure the INVALOK flag is cleared to prevent
-                        * the possibly-dirty bp from being invalidated now
-                        * that we are using it as part of a de-dup operation.
-                        */
-                       if (hammer2_debug & 0x40000) {
-                               kprintf("DEDUP SUCCESS %016jx\n",
-                                       (intmax_t)off);
+               dio = hammer2_io_getquick(hmp, off, pblksize, 0);
+               if (dio) {
+                       dtmp = hammer2_io_data(dio, off);
+                       mask = hammer2_dedup_mask(dio, off, pblksize);
+                       if ((dio->dedup_ok_mask & mask) == mask &&
+                           bcmp(data, dtmp, pblksize) == 0) {
+                               if (hammer2_debug & 0x40000) {
+                                       kprintf("DEDUP SUCCESS %016jx\n",
+                                               (intmax_t)off);
+                               }
+                               hammer2_io_putblk(&dio);
+                               *datap = NULL;
+                               dedup[i].ticks = ticks;   /* update use */
+                               atomic_add_long(&hammer2_iod_file_wdedup,
+                                               pblksize);
+
+                               return off;             /* RETURN */
                        }
-                       atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALOK);
                        hammer2_io_putblk(&dio);
-                       *datap = NULL;
-                       dedup[i].ticks = ticks; /* update use */
-                       ++hammer2_iod_file_wdedup;
-
-                       return off;             /* RETURN */
                }
-               if (dio)
-                       hammer2_io_putblk(&dio);
        }
        return 0;
 }
index c0a834c..6561413 100644 (file)
@@ -81,10 +81,12 @@ int hammer2_debug;
 int hammer2_cluster_read = 4;          /* physical read-ahead */
 int hammer2_cluster_write = 0;         /* bdwrite() so later inval works */
 int hammer2_dedup_enable = 1;
+int hammer2_always_compress = 0;       /* always try to compress */
 int hammer2_inval_enable = 0;
 int hammer2_flush_pipe = 100;
 int hammer2_synchronous_flush = 1;
 int hammer2_dio_count;
+int hammer2_limit_dio = 256;
 long hammer2_chain_allocs;
 long hammer2_chain_frees;
 long hammer2_limit_dirty_chains;
@@ -122,6 +124,8 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW,
           &hammer2_cluster_write, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
           &hammer2_dedup_enable, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
+          &hammer2_always_compress, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
           &hammer2_inval_enable, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
@@ -138,6 +142,8 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
           &hammer2_count_modified_chains, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, limit_dio, CTLFLAG_RW,
+          &hammer2_limit_dio, 0, "");
 
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
           &hammer2_iod_invals, 0, "");
@@ -239,6 +245,8 @@ hammer2_vfs_init(struct vfsconf *conf)
 
        error = 0;
 
+       hammer2_limit_dio = nbuf * 2;
+
        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
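
As a usage illustration only (not part of this commit), the new
vfs.hammer2.always_compress and vfs.hammer2.limit_dio knobs registered above
can be driven from userland through the standard sysctlbyname(3) interface
(root is required for the set); sysctl(8) from the shell is equivalent:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            int one = 1;
            int limit;
            size_t len = sizeof(limit);

            /* Bypass the heuristic and always attempt compression. */
            if (sysctlbyname("vfs.hammer2.always_compress",
                             NULL, NULL, &one, sizeof(one)) < 0)
                    perror("vfs.hammer2.always_compress");

            /* Read back the DIO cache limit (seeded from nbuf at init). */
            if (sysctlbyname("vfs.hammer2.limit_dio",
                             &limit, &len, NULL, 0) == 0)
                    printf("limit_dio = %d\n", limit);
            return 0;
    }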