HAMMER 56A/Many: Performance tuning - MEDIA STRUCTURES CHANGED!
author     Matthew Dillon <dillon@dragonflybsd.org>
           Tue, 17 Jun 2008 04:02:38 +0000 (04:02 +0000)
committer  Matthew Dillon <dillon@dragonflybsd.org>
           Tue, 17 Jun 2008 04:02:38 +0000 (04:02 +0000)
* MEDIA CHANGE: The blockmaps have been folded into the freemap.  Allocations
  are now made directly out of the freemap.  More work is expected here.

  The blockmaps are still used to sequence allocations, but no block
  number translation is required any more.  This didn't improve performance
  much but it will make it easier for future optimizations to localize
  allocations.
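
  The practical effect (mirrored in hammer_blockmap_lookup() in the diff
  below) is that a zone-X blockmap offset now translates to its zone-2
  raw-buffer address with simple bit arithmetic instead of a layer1/layer2
  lookup.  A minimal sketch of that mapping:

        /*
         * Sketch only: zone-X offsets are now direct-mapped onto zone-2.
         * HAMMER_OFF_ZONE_MASK and HAMMER_ZONE_RAW_BUFFER are the existing
         * on-disk format macros used by the new lookup fast path.
         */
        static __inline hammer_off_t
        zoneX_to_zone2(hammer_off_t zone_offset)
        {
                return ((zone_offset & ~HAMMER_OFF_ZONE_MASK) |
                        HAMMER_ZONE_RAW_BUFFER);
        }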

* PERFORMANCE: Removed the holes recording code.  A follow-up commit will
  soon take over its functionality.

* PERFORMANCE: The flusher's slave threads now collect a number of inodes
  into a batch before starting their work, in an attempt to reduce
  deadlocks between slave threads working on adjacent inodes.
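
  A rough sketch of the batching idea follows; the actual loop lives in
  hammer_flusher.c, and the queue/field names used here (flush_list,
  flush_entry, and the flush_one_inode() helper) are illustrative only:

        /*
         * Sketch, not the real flusher code: a slave thread pulls up to
         * HAMMER_FLUSH_GROUP_SIZE inodes off the shared queue in one pass
         * and only then starts flushing, so two slaves are less likely to
         * end up interleaved on adjacent inodes.
         */
        hammer_inode_t batch[HAMMER_FLUSH_GROUP_SIZE];
        hammer_inode_t ip;
        int count = 0;
        int i;

        while (count < HAMMER_FLUSH_GROUP_SIZE &&
               (ip = TAILQ_FIRST(&flush_list)) != NULL) {
                TAILQ_REMOVE(&flush_list, ip, flush_entry);
                batch[count++] = ip;
        }
        for (i = 0; i < count; ++i)
                flush_one_inode(batch[i]);      /* illustrative helper */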

* PERFORMANCE: B-Tree positional caching now works much better, greatly
  reducing the CPU overhead when accessing the filesystem.

* PERFORMANCE: Added a write-append optimization.  The write path no
  longer does a lookup/iteration to locate records being overwritten when
  no such records can exist.  This cuts the CPU overhead of write-append
  flushes in half.
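
  A sketch of the test, assuming this era's record/inode field names
  (leaf.base.key as the record's ending offset, sync_ino_data.size as the
  synchronized file size) and intended only to illustrate the idea:

        /*
         * Sketch: if the record being flushed starts at or beyond the
         * media end-of-file there is nothing it could overwrite, so the
         * overlap lookup/iteration can be skipped and the record inserted
         * directly.
         */
        if (record->leaf.base.key - record->leaf.data_len >=
            ip->sync_ino_data.size) {
                /* pure write-append: insert without scanning for overlaps */
        } else {
                /* possible overwrite: do the normal lookup/iteration */
        }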

* PERFORMANCE: Added a vfs.hammer.write_mode sysctl to test two different
  ways of queueing write I/Os.
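
  The knob is exported as a normal integer sysctl; a likely-shaped
  declaration (hammer_write_mode itself is visible in the hammer.h diff
  below) is:

        int hammer_write_mode;          /* 0 and 1 select the two strategies */
        SYSCTL_INT(_vfs_hammer, OID_AUTO, write_mode, CTLFLAG_RW,
                   &hammer_write_mode, 0, "");

  It can then be flipped at run time with "sysctl vfs.hammer.write_mode=1"
  to compare the two I/O queueing strategies.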

* Added B-Tree statistics (hammer bstats 1).

14 files changed:
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_freemap.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

index 872ccba..6406e5a 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.84 2008/06/14 01:42:12 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.85 2008/06/17 04:02:38 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -265,7 +265,7 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_MODMASK_NOXDIRTY \
                                (HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
 
-#define HAMMER_MAX_INODE_CURSORS       4
+#define HAMMER_FLUSH_GROUP_SIZE        64
 
 #define HAMMER_FLUSH_SIGNAL    0x0001
 #define HAMMER_FLUSH_RECURSION 0x0002
@@ -514,34 +514,6 @@ union hammer_io_structure {
 
 typedef union hammer_io_structure *hammer_io_structure_t;
 
-/*
- * Allocation holes are recorded when an allocation does not fit within a
- * buffer.  Later allocations which might fit may then be satisfied from
- * a recorded hole.  The resv reference prevents the big block from being
- * allocated out of via the normal blockmap mechanism.
- *
- * This is strictly a heuristic.
- */
-#define HAMMER_MAX_HOLES       8
-
-struct hammer_hole;
-
-struct hammer_holes {
-       TAILQ_HEAD(, hammer_hole) list;
-       int     count;
-};
-
-typedef struct hammer_holes *hammer_holes_t;
-
-struct hammer_hole {
-       TAILQ_ENTRY(hammer_hole) entry;
-       struct hammer_reserve *resv;
-       hammer_off_t    zone_offset;
-       int             bytes;
-};
-
-typedef struct hammer_hole *hammer_hole_t;
-
 /*
  * The reserve structure prevents the blockmap from allocating
  * out of a reserved bigblock.  Such reservations are used by
@@ -555,12 +527,16 @@ struct hammer_reserve {
        RB_ENTRY(hammer_reserve) rb_node;
        TAILQ_ENTRY(hammer_reserve) delay_entry;
        int             flush_group;
+       int             flags;
        int             refs;
+       int             zone;
        hammer_off_t    zone_offset;
 };
 
 typedef struct hammer_reserve *hammer_reserve_t;
 
+#define HAMMER_RESF_ONDELAY    0x0001
+
 #include "hammer_cursor.h"
 
 /*
@@ -615,7 +591,6 @@ struct hammer_mount {
        struct hammer_volume *rootvol;
        struct hammer_base_elm root_btree_beg;
        struct hammer_base_elm root_btree_end;
-       char    *zbuf;  /* HAMMER_BUFSIZE bytes worth of all-zeros */
        int     flags;
        int     hflags;
        int     ronly;
@@ -649,14 +624,12 @@ struct hammer_mount {
        int64_t copy_stat_freebigblocks;        /* number of free bigblocks */
 
        u_int32_t namekey_iterator;
-       hammer_off_t zone_limits[HAMMER_MAX_ZONES];
        struct netexport export;
        struct hammer_lock sync_lock;
        struct hammer_lock free_lock;
        struct hammer_lock undo_lock;
        struct hammer_lock blkmap_lock;
        struct hammer_blockmap  blockmap[HAMMER_MAX_ZONES];
-       struct hammer_holes     holes[HAMMER_MAX_ZONES];
        struct hammer_undo      undos[HAMMER_MAX_UNDOS];
        int                     undo_alloc;
        TAILQ_HEAD(, hammer_undo)  undo_lru_list;
@@ -702,6 +675,14 @@ extern int hammer_count_record_datas;
 extern int hammer_count_volumes;
 extern int hammer_count_buffers;
 extern int hammer_count_nodes;
+extern int64_t hammer_stats_btree_lookups;
+extern int64_t hammer_stats_btree_searches;
+extern int64_t hammer_stats_btree_inserts;
+extern int64_t hammer_stats_btree_deletes;
+extern int64_t hammer_stats_btree_elements;
+extern int64_t hammer_stats_btree_splits;
+extern int64_t hammer_stats_btree_iterations;
+extern int64_t hammer_stats_record_iterations;
 extern int hammer_count_dirtybufs;
 extern int hammer_count_refedbufs;
 extern int hammer_count_reservations;
@@ -713,8 +694,8 @@ extern int hammer_limit_iqueued;
 extern int hammer_limit_irecs;
 extern int hammer_limit_recs;
 extern int hammer_bio_count;
-extern int hammer_stats_btree_iterations;
-extern int hammer_stats_record_iterations;
+extern int hammer_verify_zone;
+extern int hammer_write_mode;
 extern int64_t hammer_contention_count;
 
 int    hammer_vop_inactive(struct vop_inactive_args *);
@@ -885,6 +866,9 @@ hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone,
                        int bytes, hammer_off_t *zone_offp, int *errorp);
 void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
                        hammer_reserve_t resv);
+void hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv,
+                        hammer_off_t zone2_offset);
+void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
 void hammer_blockmap_free(hammer_transaction_t trans,
                        hammer_off_t bmap_off, int bytes);
 int hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off,
@@ -975,8 +959,6 @@ int hammer_ioc_reblock(hammer_transaction_t trans, hammer_inode_t ip,
 int hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip,
                        struct hammer_ioc_prune *prune);
 
-void hammer_init_holes(hammer_mount_t hmp, hammer_holes_t holes);
-void hammer_free_holes(hammer_mount_t hmp, hammer_holes_t holes);
 int hammer_signal_check(hammer_mount_t hmp);
 
 void hammer_flusher_create(hammer_mount_t hmp);
index 579c6e4..e401035 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.18 2008/06/12 00:16:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.19 2008/06/17 04:02:38 dillon Exp $
  */
 
 /*
  */
 #include "hammer.h"
 
-static hammer_off_t hammer_find_hole(hammer_mount_t hmp,
-                                  hammer_holes_t holes, int bytes);
-static void hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes,
-                                  hammer_off_t zone_offset, int bytes);
-static void hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes,
-                                  hammer_off_t base_offset);
 static int hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2);
 
 /*
@@ -63,35 +57,6 @@ hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2)
        return(0);
 }
 
-/*
- * Allocate a big-block from the freemap and stuff it into the blockmap
- * at layer1/layer2.
- */
-static void
-hammer_blockmap_llalloc(hammer_transaction_t trans,
-               hammer_off_t zone_offset, int *errorp,
-               hammer_buffer_t buffer1, hammer_blockmap_layer1_t layer1,
-               hammer_buffer_t buffer2, hammer_blockmap_layer2_t layer2)
-{
-       hammer_off_t zone2_offset;
-
-       zone2_offset = hammer_freemap_alloc(trans, zone_offset, errorp);
-       if (*errorp)
-               return;
-       hammer_modify_buffer(trans, buffer1, layer1, sizeof(*layer1));
-       KKASSERT(layer1->blocks_free);
-       --layer1->blocks_free;
-       layer1->layer1_crc = crc32(layer1, HAMMER_LAYER1_CRCSIZE);
-       hammer_modify_buffer_done(buffer1);
-       hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
-       bzero(layer2, sizeof(*layer2));
-       layer2->u.phys_offset = zone2_offset;
-       layer2->bytes_free = HAMMER_LARGEBLOCK_SIZE;
-       layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
-       hammer_modify_buffer_done(buffer2);
-}
-
-
 /*
  * Allocate bytes from a zone
  */
@@ -101,7 +66,8 @@ hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
 {
        hammer_mount_t hmp;
        hammer_volume_t root_volume;
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t blockmap;
+       hammer_blockmap_t freemap;
        hammer_reserve_t resv;
        struct hammer_blockmap_layer1 *layer1;
        struct hammer_blockmap_layer2 *layer2;
@@ -113,8 +79,8 @@ hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
        hammer_off_t result_offset;
        hammer_off_t layer1_offset;
        hammer_off_t layer2_offset;
+       hammer_off_t base_off;
        int loops = 0;
-       int skip_amount;
 
        hmp = trans->hmp;
 
@@ -129,104 +95,66 @@ hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
        KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
 
        /*
-        * Try to use a known-free hole.
+        * Setup
         */
-       result_offset = hammer_find_hole(hmp, &trans->hmp->holes[zone], bytes);
-       if (result_offset) {
-               *errorp = 0;
-               hammer_blockmap_free(trans, result_offset, -bytes);
-               return(result_offset);
-       }
+       root_volume = trans->rootvol;
+       *errorp = 0;
+       blockmap = &hmp->blockmap[zone];
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+       KKASSERT(HAMMER_ZONE_DECODE(blockmap->next_offset) == zone);
+
+       hammer_lock_ex(&hmp->blkmap_lock);
+       next_offset = blockmap->next_offset;
 
+again:
        /*
-        * Otherwise scan for space
+        * Check for wrap
         */
-       root_volume = hammer_get_root_volume(hmp, errorp);
-       if (*errorp)
-               return(0);
-       rootmap = &hmp->blockmap[zone];
-       KKASSERT(rootmap->phys_offset != 0);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
-                HAMMER_ZONE_RAW_BUFFER_INDEX);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone);
+       if (next_offset == 0) {
+               if (++loops == 2) {
+                       result_offset = 0;
+                       *errorp = ENOSPC;
+                       goto done;
+               }
+               next_offset = HAMMER_ZONE_ENCODE(zone, 0);
+       }
 
-       hammer_lock_ex(&hmp->blkmap_lock);
-       next_offset = rootmap->next_offset;
-again:
        /*
         * The allocation request may not cross a buffer boundary.
         */
        tmp_offset = next_offset + bytes - 1;
        if ((next_offset ^ tmp_offset) & ~HAMMER_BUFMASK64) {
-               skip_amount = HAMMER_BUFSIZE - 
-                             ((int)next_offset & HAMMER_BUFMASK);
-               hammer_add_hole(hmp, &hmp->holes[zone],
-                               next_offset, skip_amount);
                next_offset = tmp_offset & ~HAMMER_BUFMASK64;
+               goto again;
        }
 
        /*
-        * Dive layer 1.  If we are starting a new layer 1 entry,
-        * allocate a layer 2 block for it.
+        * Dive layer 1.
         */
-       layer1_offset = rootmap->phys_offset +
+       layer1_offset = freemap->phys_offset +
                        HAMMER_BLOCKMAP_LAYER1_OFFSET(next_offset);
        layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
        KKASSERT(*errorp == 0);
-       KKASSERT(next_offset <= rootmap->alloc_offset);
-
-       /*
-        * Check CRC if not allocating into uninitialized space
-        */
-       if ((next_offset != rootmap->alloc_offset) ||
-           (next_offset & HAMMER_BLOCKMAP_LAYER2_MASK)) {
-               if (layer1->layer1_crc != crc32(layer1,
-                                               HAMMER_LAYER1_CRCSIZE)) {
-                       Debugger("CRC FAILED: LAYER1");
-               }
-       }
 
        /*
-        * Allocate layer2 backing store in layer1 if necessary.  next_offset
-        * can skip to a bigblock boundary but alloc_offset is at least
-        * bigblock-aligned so that's ok.
+        * Check CRC.
         */
-       if ((next_offset == rootmap->alloc_offset &&
-           (next_offset & HAMMER_BLOCKMAP_LAYER2_MASK) == 0) ||
-           layer1->phys_offset == HAMMER_BLOCKMAP_FREE
-       ) {
-               KKASSERT((next_offset & HAMMER_BLOCKMAP_LAYER2_MASK) == 0);
-               hammer_modify_buffer(trans, buffer1, layer1, sizeof(*layer1));
-               bzero(layer1, sizeof(*layer1));
-               layer1->phys_offset =
-                       hammer_freemap_alloc(trans, next_offset, errorp);
-               layer1->blocks_free = HAMMER_BLOCKMAP_RADIX2;
-               layer1->layer1_crc = crc32(layer1, HAMMER_LAYER1_CRCSIZE);
-               hammer_modify_buffer_done(buffer1);
-               KKASSERT(*errorp == 0);
+       if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
+               Debugger("CRC FAILED: LAYER1");
        }
-       KKASSERT(layer1->phys_offset);
 
        /*
-        * If layer1 indicates no free blocks in layer2 and our alloc_offset
-        * is not in layer2, skip layer2 entirely.
+        * If we are at a big-block boundary and layer1 indicates no 
+        * free big-blocks, then we cannot allocate a new bigblock in
+        * layer2, skip to the next layer1 entry.
         */
-       if (layer1->blocks_free == 0 &&
-           ((next_offset ^ rootmap->alloc_offset) & ~HAMMER_BLOCKMAP_LAYER2_MASK) != 0) {
-               next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2_MASK) &
+       if ((next_offset & HAMMER_LARGEBLOCK_MASK) == 0 &&
+           layer1->blocks_free == 0) {
+               next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2) &
                              ~HAMMER_BLOCKMAP_LAYER2_MASK;
-               if (next_offset >= hmp->zone_limits[zone]) {
-                       hkprintf("blockmap wrap1\n");
-                       next_offset = HAMMER_ZONE_ENCODE(zone, 0);
-                       if (++loops == 2) {     /* XXX poor-man's */
-                               result_offset = 0;
-                               *errorp = ENOSPC;
-                               goto done;
-                       }
-               }
                goto again;
        }
+       KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
 
        /*
         * Dive layer 2, each entry represents a large-block.
@@ -237,68 +165,64 @@ again:
        KKASSERT(*errorp == 0);
 
        /*
-        * Check CRC if not allocating into uninitialized space
+        * Check CRC.
         */
-       if (next_offset != rootmap->alloc_offset ||
-           (next_offset & HAMMER_LARGEBLOCK_MASK64)) {
-               if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
-                       Debugger("CRC FAILED: LAYER2");
-               }
+       if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
+               Debugger("CRC FAILED: LAYER2");
        }
 
-       if ((next_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
-               /*
-                * We are at the beginning of a new bigblock
-                */
+       /*
+        * This is a bit complex.  If we are at the beginning of a bigblock
+        * we have to check for reservations.  If we aren't we may still have
+        * to assign ownership of the bigblock in layer2.
+        */
+       if ((next_offset & HAMMER_LARGEBLOCK_MASK) == 0) {
+               if (layer2->zone != 0) {
+                       next_offset = (next_offset + HAMMER_LARGEBLOCK_SIZE) &
+                                     ~HAMMER_LARGEBLOCK_MASK64;
+                       goto again;
+               }
+               base_off = (next_offset & (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | HAMMER_ZONE_RAW_BUFFER;
                resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
-                                next_offset & ~HAMMER_LARGEBLOCK_MASK64);
-
+                                base_off);
                if (resv) {
-                       goto skip;
-               } else if (next_offset == rootmap->alloc_offset ||
-                          layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
-                       /*
-                        * Allocate the bigblock in layer2 if diving into
-                        * uninitialized space or if the block was previously
-                        * freed.
-                        */
-                       hammer_blockmap_llalloc(trans,
-                                               next_offset, errorp,
-                                               buffer1, layer1,
-                                               buffer2, layer2);
-                       KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
-               } else if (layer2->bytes_free != HAMMER_LARGEBLOCK_SIZE) {
-                       /*
-                        * We have encountered a block that is already
-                        * partially allocated.  We must skip this block.
-                        */
-skip:
-                       next_offset += HAMMER_LARGEBLOCK_SIZE;
-                       if (next_offset >= trans->hmp->zone_limits[zone]) {
-                               next_offset = HAMMER_ZONE_ENCODE(zone, 0);
-                               hkprintf("blockmap wrap2\n");
-                               if (++loops == 2) {     /* XXX poor-man's */
-                                       result_offset = 0;
-                                       *errorp = ENOSPC;
-                                       goto done;
-                               }
-                       }
+                       next_offset = (next_offset + HAMMER_LARGEBLOCK_SIZE) &
+                                     ~HAMMER_LARGEBLOCK_MASK64;
                        goto again;
                }
-       } else {
+       }
+
+       if (layer2->zone == 0) {
                /*
-                * We are appending within a bigblock.  It is possible that
-                * the blockmap has been marked completely free via a prior
-                * pruning operation.  We no longer reset the append index
-                * for that case because it compromises the UNDO by allowing
-                * data overwrites.
+                * Assign the bigblock to our zone
                 */
-               /*
-               KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
-               */
+               hammer_modify_buffer(trans, buffer1,
+                                    layer1, sizeof(*layer1));
+               --layer1->blocks_free;
+               layer1->layer1_crc = crc32(layer1,
+                                          HAMMER_LAYER1_CRCSIZE);
+               hammer_modify_buffer_done(buffer1);
+               hammer_modify_buffer(trans, buffer2,
+                                    layer2, sizeof(*layer2));
+               layer2->zone = zone;
+               KKASSERT(layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE);
+               KKASSERT(layer2->append_off == 0);
+               hammer_modify_volume_field(trans, trans->rootvol,
+                                          vol0_stat_freebigblocks);
+               --root_volume->ondisk->vol0_stat_freebigblocks;
+               hmp->copy_stat_freebigblocks =
+                       root_volume->ondisk->vol0_stat_freebigblocks;
+               hammer_modify_volume_done(trans->rootvol);
+
+       } else {
+               hammer_modify_buffer(trans, buffer2,
+                                    layer2, sizeof(*layer2));
        }
+       KKASSERT(layer2->zone == zone);
 
-       hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
+       /*
+        * XXX append_off
+        */
        layer2->bytes_free -= bytes;
        layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
        hammer_modify_buffer_done(buffer2);
@@ -320,17 +244,12 @@ done:
        hammer_modify_volume(NULL, root_volume, NULL, 0);
        if (result_offset) {
                if (result_offset == next_offset) {
-                       rootmap->next_offset = next_offset + bytes;
+                       blockmap->next_offset = next_offset + bytes;
                } else {
-                       rootmap->next_offset = next_offset;
+                       blockmap->next_offset = next_offset;
                }
        } else {
-               rootmap->next_offset = next_offset;
-       }
-       if (rootmap->alloc_offset < rootmap->next_offset) {
-               rootmap->alloc_offset =
-                   (rootmap->next_offset + HAMMER_LARGEBLOCK_MASK) &
-                   ~HAMMER_LARGEBLOCK_MASK64;
+               blockmap->next_offset = next_offset;
        }
        hammer_modify_volume_done(root_volume);
        hammer_unlock(&hmp->blkmap_lock);
@@ -344,7 +263,6 @@ done:
                hammer_rel_buffer(buffer2, 0);
        if (buffer3)
                hammer_rel_buffer(buffer3, 0);
-       hammer_rel_volume(root_volume, 0);
 
        return(result_offset);
 }
@@ -353,23 +271,16 @@ done:
  * Front-end blockmap reservation
  *
  * This code reserves bytes out of a blockmap without committing to any
- * meta-data modifications, allowing the front-end to issue disk write I/O
- * for large blocks of data without having to queue the BIOs to the back-end.
- * If the reservation winds up not being used, for example due to a crash,
- * the reblocker should eventually come along and clean it up.
- *
- * This code will attempt to assign free big-blocks to the blockmap to
- * accomodate the request.
- *
- * If we return 0 a reservation was not possible and the caller must queue
- * the I/O to the backend.
+ * meta-data modifications, allowing the front-end to directly issue disk
+ * write I/O for large blocks of data
  */
 hammer_reserve_t
 hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
                        hammer_off_t *zone_offp, int *errorp)
 {
        hammer_volume_t root_volume;
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t blockmap;
+       hammer_blockmap_t freemap;
        struct hammer_blockmap_layer1 *layer1;
        struct hammer_blockmap_layer2 *layer2;
        hammer_buffer_t buffer1 = NULL;
@@ -379,9 +290,10 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
        hammer_off_t next_offset;
        hammer_off_t layer1_offset;
        hammer_off_t layer2_offset;
+       hammer_off_t base_off;
        hammer_reserve_t resv;
+       hammer_reserve_t resx;
        int loops = 0;
-       int skip_amount;
 
        /*
         * Setup
@@ -390,12 +302,9 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
        root_volume = hammer_get_root_volume(hmp, errorp);
        if (*errorp)
                return(NULL);
-       rootmap = &hmp->blockmap[zone];
-       KKASSERT(rootmap->phys_offset != 0);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
-                HAMMER_ZONE_RAW_BUFFER_INDEX);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone);
+       blockmap = &hmp->blockmap[zone];
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+       KKASSERT(HAMMER_ZONE_DECODE(blockmap->next_offset) == zone);
 
        /*
         * Deal with alignment and buffer-boundary issues.
@@ -407,17 +316,15 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
        KKASSERT(bytes > 0 && bytes <= HAMMER_BUFSIZE);
 
        hammer_lock_ex(&hmp->blkmap_lock);
+       next_offset = blockmap->next_offset;
+again:
+       resv = NULL;
 
        /*
-        * Starting zoneX offset.  The reservation code always wraps at the
-        * alloc_offset (the allocation code is allowed to go through to the
-        * limit).
+        * Check for wrap
         */
-       next_offset = rootmap->next_offset;
-again:
-       resv = NULL;
-       if (next_offset >= rootmap->alloc_offset) {
-               if (++loops == 2) {     /* XXX poor-man's */
+       if (next_offset == 0) {
+               if (++loops == 2) {
                        *errorp = ENOSPC;
                        goto done;
                }
@@ -429,40 +336,37 @@ again:
         */
        tmp_offset = next_offset + bytes - 1;
        if ((next_offset ^ tmp_offset) & ~HAMMER_BUFMASK64) {
-               skip_amount = HAMMER_BUFSIZE - 
-                             ((int)next_offset & HAMMER_BUFMASK);
-               hammer_add_hole(hmp, &hmp->holes[zone],
-                               next_offset, skip_amount);
                next_offset = tmp_offset & ~HAMMER_BUFMASK64;
+               goto again;
        }
 
        /*
         * Dive layer 1.
         */
-       layer1_offset = rootmap->phys_offset +
+       layer1_offset = freemap->phys_offset +
                        HAMMER_BLOCKMAP_LAYER1_OFFSET(next_offset);
        layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
        KKASSERT(*errorp == 0);
-       KKASSERT(next_offset <= rootmap->alloc_offset);
 
        /*
-        * Check CRC if not allocating into uninitialized space
+        * Check CRC.
         */
        if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER1");
        }
-       KKASSERT(layer1->phys_offset);
 
        /*
-        * If layer1 indicates no free blocks in layer2 and our alloc_offset
-        * is not in layer2, skip layer2 entirely.
+        * If we are at a big-block boundary and layer1 indicates no 
+        * free big-blocks, then we cannot allocate a new bigblock in
+        * layer2, skip to the next layer1 entry.
         */
-       if (layer1->blocks_free == 0 &&
-           ((next_offset ^ rootmap->alloc_offset) & ~HAMMER_BLOCKMAP_LAYER2_MASK) != 0) {
-               next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2_MASK) &
+       if ((next_offset & HAMMER_LARGEBLOCK_MASK) == 0 &&
+           layer1->blocks_free == 0) {
+               next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2) &
                              ~HAMMER_BLOCKMAP_LAYER2_MASK;
                goto again;
        }
+       KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
 
        /*
         * Dive layer 2, each entry represents a large-block.
@@ -481,73 +385,27 @@ again:
        }
 
        /*
-        * Acquire the related reservation structure.  If it exists we can
-        * only use the bigblock if our current next_offset is already in
-        * it.
+        * Shortcut to avoid unnecessary reservation lookups.  If we are at
+        * the beginning of a new big block determine whether we can use it
+        * or not.
         */
-       resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
-                        next_offset & ~HAMMER_LARGEBLOCK_MASK64);
-
+       base_off = (next_offset & (~HAMMER_LARGEBLOCK_MASK64 &
+                                  ~HAMMER_OFF_ZONE_MASK)) |
+                  HAMMER_ZONE_RAW_BUFFER;
        if ((next_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
-               /*
-                * We are at the beginning of a new bigblock.
-                *
-                * (1) If the bigblock has already been reserved do not
-                *     try to use it, skip it.
-                *
-                * (2) If the bigblock has not been allocated then allocate
-                *     it.
-                *
-                * (3) If the bigblock is not completely free we have no
-                *     visibility into what portions may have been allocated,
-                *     so skip it.
-                */
-
-               if (resv) {
+               if (layer2->zone != 0) {
                        next_offset += HAMMER_LARGEBLOCK_SIZE;
                        goto again;
-               } else if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
-                       struct hammer_transaction trans;
-
-                       hammer_start_transaction(&trans, hmp);
-                       if (hammer_sync_lock_sh_try(&trans) == 0) {
-                               hammer_blockmap_llalloc(&trans,
-                                                       next_offset, errorp,
-                                                       buffer1, layer1,
-                                                       buffer2, layer2);
-                               hammer_sync_unlock(&trans);
-                       } else {
-                               hammer_sync_lock_sh(&trans);
-                               hammer_blockmap_llalloc(&trans,
-                                                       next_offset, errorp,
-                                                       buffer1, layer1,
-                                                       buffer2, layer2);
-                               hammer_sync_unlock(&trans);
-                               /* *errorp = EDEADLK; */
-                       }
-                       hammer_done_transaction(&trans);
-                       if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
-                               resv = NULL;
-                               goto done;
-                       }
-               } else if (layer2->bytes_free != HAMMER_LARGEBLOCK_SIZE) {
-                       /*
-                        * We have encountered a block that is already
-                        * partially allocated.  We must skip this block.
-                        */
+               }
+               resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
+                                base_off);
+               if (resv) {
                        next_offset += HAMMER_LARGEBLOCK_SIZE;
                        goto again;
                }
        } else {
-               /*
-                * We are appending within a bigblock.  It is possible that
-                * the blockmap has been marked completely free via a prior
-                * pruning operation.  We no longer reset the append index
-                * for that case because it compromises the UNDO by allowing
-                * data overwrites.
-                */
-               KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
-               KKASSERT(layer2->bytes_free >= HAMMER_LARGEBLOCK_SIZE - (int)(next_offset & HAMMER_LARGEBLOCK_MASK64));
+               resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
+                                base_off);
        }
 
        /*
@@ -556,6 +414,23 @@ again:
         */
        KKASSERT(layer2->bytes_free >= 0);
 
+       /*
+        * Make the zone-2 reservation.
+        */
+       if (resv) {
+               ++resv->refs;
+               KKASSERT(resv->zone == zone);
+       } else {
+               base_off = (next_offset & (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | HAMMER_ZONE_RAW_BUFFER;
+               resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
+               resv->refs = 1;
+               resv->zone = zone;
+               resv->zone_offset = base_off;
+               resx = RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
+               KKASSERT(resx == NULL);
+               ++hammer_count_reservations;
+       }
+
        /*
         * If we are not reserving a whole buffer but are at the start of
         * a new block, call hammer_bnew() to avoid a disk read.
@@ -567,18 +442,6 @@ again:
                hammer_bnew(hmp, next_offset, errorp, &buffer3);
        }
 
-       /*
-        * Make the reservation
-        */
-       if (resv) {
-               ++resv->refs;
-       } else {
-               resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
-               resv->refs = 1;
-               resv->zone_offset = next_offset & ~HAMMER_LARGEBLOCK_MASK64;
-               RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
-               ++hammer_count_reservations;
-       }
 
        /*
         * Adjust our iterator and alloc_offset.  The layer1 and layer2
@@ -588,11 +451,11 @@ again:
 done:
        if (resv) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
-               rootmap->next_offset = next_offset + bytes;
+               blockmap->next_offset = next_offset + bytes;
                hammer_modify_volume_done(root_volume);
-       } else if (rootmap->next_offset != next_offset) {
+       } else if (blockmap->next_offset != next_offset) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
-               rootmap->next_offset = next_offset;
+               blockmap->next_offset = next_offset;
                hammer_modify_volume_done(root_volume);
        }
 
@@ -618,12 +481,50 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
 {
        KKASSERT(resv->refs > 0);
        if (--resv->refs == 0) {
+               KKASSERT((resv->flags & HAMMER_RESF_ONDELAY) == 0);
                RB_REMOVE(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
                kfree(resv, M_HAMMER);
                --hammer_count_reservations;
        }
 }
 
+/*
+ * This ensures that no data reallocations will take place at the specified
+ * zone2_offset (pointing to the base of a bigblock) for 2 flush cycles,
+ * preventing deleted data space, which has no UNDO, from being reallocated 
+ * too fast.
+ */
+void
+hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv,
+                       hammer_off_t zone2_offset)
+{
+       if (resv == NULL) {
+               resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
+               resv->refs = 1; /* ref for on-delay list */
+               resv->zone_offset = zone2_offset;
+               RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
+               ++hammer_count_reservations;
+       } else if (resv->flags & HAMMER_RESF_ONDELAY) {
+               TAILQ_REMOVE(&hmp->delay_list, resv, delay_entry);
+               resv->flush_group = hmp->flusher.next + 1;
+       } else {
+               ++resv->refs;   /* ref for on-delay list */
+       }
+       resv->flags |= HAMMER_RESF_ONDELAY;
+       resv->flush_group = hmp->flusher.next + 1;
+       TAILQ_INSERT_TAIL(&hmp->delay_list, resv, delay_entry);
+}
+
+void
+hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv)
+{
+       KKASSERT(resv->flags & HAMMER_RESF_ONDELAY);
+       resv->flags &= ~HAMMER_RESF_ONDELAY;
+       TAILQ_REMOVE(&hmp->delay_list, resv, delay_entry);
+       hammer_blockmap_reserve_complete(hmp, resv);
+}
+
+
 /*
  * Free (offset,bytes) in a zone.
  *
@@ -632,60 +533,61 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
  */
 void
 hammer_blockmap_free(hammer_transaction_t trans,
-                    hammer_off_t bmap_off, int bytes)
+                    hammer_off_t zone_offset, int bytes)
 {
        hammer_mount_t hmp;
        hammer_volume_t root_volume;
        hammer_reserve_t resv;
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t blockmap;
+       hammer_blockmap_t freemap;
        struct hammer_blockmap_layer1 *layer1;
        struct hammer_blockmap_layer2 *layer2;
        hammer_buffer_t buffer1 = NULL;
        hammer_buffer_t buffer2 = NULL;
        hammer_off_t layer1_offset;
        hammer_off_t layer2_offset;
+       hammer_off_t base_off;
        int error;
        int zone;
 
+       if (bytes == 0)
+               return;
        hmp = trans->hmp;
 
-       if (bytes >= 0) {
+       /*
+        * Alignment
+        */
+       if (bytes > 0) {
                bytes = (bytes + 15) & ~15;
                KKASSERT(bytes <= HAMMER_BUFSIZE);
-               KKASSERT(((bmap_off ^ (bmap_off + (bytes - 1))) & 
+               KKASSERT(((zone_offset ^ (zone_offset + (bytes - 1))) & 
                          ~HAMMER_LARGEBLOCK_MASK64) == 0);
        } else {
                bytes = -((-bytes + 15) & ~15);
                KKASSERT(bytes >= -HAMMER_BUFSIZE);
        }
-       zone = HAMMER_ZONE_DECODE(bmap_off);
-       KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
-       root_volume = hammer_get_root_volume(hmp, &error);
-       if (error)
-               return;
 
+       /*
+        * Basic zone validation & locking
+        */
+       zone = HAMMER_ZONE_DECODE(zone_offset);
+       KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+       root_volume = trans->rootvol;
+       error = 0;
        hammer_lock_ex(&hmp->blkmap_lock);
 
-       rootmap = &hmp->blockmap[zone];
-       KKASSERT(rootmap->phys_offset != 0);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
-                HAMMER_ZONE_RAW_BUFFER_INDEX);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
-
-       if (bmap_off >= rootmap->alloc_offset) {
-               panic("hammer_blockmap_lookup: %016llx beyond EOF %016llx",
-                     bmap_off, rootmap->alloc_offset);
-               goto done;
-       }
+       blockmap = &hmp->blockmap[zone];
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
 
        /*
         * Dive layer 1.
         */
-       layer1_offset = rootmap->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
+       layer1_offset = freemap->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
        layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1);
        KKASSERT(error == 0);
-       KKASSERT(layer1->phys_offset);
+       KKASSERT(layer1->phys_offset &&
+                layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
        if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER1");
        }
@@ -694,102 +596,103 @@ hammer_blockmap_free(hammer_transaction_t trans,
         * Dive layer 2, each entry represents a large-block.
         */
        layer2_offset = layer1->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
+                       HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
        layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2);
        KKASSERT(error == 0);
-       KKASSERT(layer2->u.phys_offset);
        if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER2");
        }
 
        hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
-       layer2->bytes_free += bytes;
-       KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE);
-
-       /*
-        * If the big-block is free, return it to the free pool.  The layer2
-        * infrastructure is left intact even if the entire layer2 becomes
-        * free.
-        *
-        * At the moment if our iterator is in a bigblock that becomes
-        * wholely free, we have to leave the block allocated and we cannot
-        * reset the iterator because there may be UNDOs on-disk that
-        * reference areas of that block and we cannot overwrite those areas.
-        */
-       if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) {
-               hammer_off_t base_off;
-
-               base_off = bmap_off & ~HAMMER_LARGEBLOCK_MASK64;
-               resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
-                                base_off);
-
-               if (resv) {
-                       /*
-                        * Portions of this block have been reserved, do
-                        * not free it.
-                        */
-               } else if ((rootmap->next_offset ^ bmap_off) &
-                           ~HAMMER_LARGEBLOCK_MASK64) {
-                       /*
-                        * Our iterator is not in the now-free big-block
-                        * and we can release it.
-                        */
-                       hammer_clean_holes(hmp, &trans->hmp->holes[zone],
-                                          base_off);
-                       hammer_del_buffers(hmp, base_off,
-                                          layer2->u.phys_offset,
-                                          HAMMER_LARGEBLOCK_SIZE);
-                       hammer_freemap_free(trans, layer2->u.phys_offset,
-                                           bmap_off, &error);
-                       layer2->u.phys_offset = HAMMER_BLOCKMAP_FREE;
-
+       if (bytes > 0) {
+               /*
+                * Freeing previously allocated space
+                */
+               KKASSERT(layer2->zone == zone);
+               layer2->bytes_free += bytes;
+               KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE);
+               if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) {
+                       base_off = (zone_offset & (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | HAMMER_ZONE_RAW_BUFFER;
+                       resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
+                                        base_off);
+                       if (resv) {
+                               /*
+                                * Portions of this block have been reserved, do
+                                * not free it.
+                                *
+                                * Make sure the reservation remains through
+                                * the next flush cycle so potentially undoable
+                                * data is not overwritten.
+                                */
+                               KKASSERT(resv->zone == zone);
+                               hammer_reserve_setdelay(hmp, resv, base_off);
+                       } else if ((blockmap->next_offset ^ zone_offset) &
+                                   ~HAMMER_LARGEBLOCK_MASK64) {
+                               /*
+                                * Our iterator is not in the now-free big-block
+                                * and we can release it.
+                                *
+                                * Make sure the reservation remains through
+                                * the next flush cycle so potentially undoable
+                                * data is not overwritten.
+                                */
+                               hammer_reserve_setdelay(hmp, resv, base_off);
+                               KKASSERT(layer2->zone == zone);
+                               hammer_del_buffers(hmp,
+                                                  zone_offset &
+                                                     ~HAMMER_LARGEBLOCK_MASK64,
+                                                  base_off,
+                                                  HAMMER_LARGEBLOCK_SIZE);
+                               layer2->zone = 0;
+                               layer2->append_off = 0;
+                               hammer_modify_buffer(trans, buffer1,
+                                                    layer1, sizeof(*layer1));
+                               ++layer1->blocks_free;
+                               layer1->layer1_crc = crc32(layer1,
+                                                          HAMMER_LAYER1_CRCSIZE);
+                               hammer_modify_buffer_done(buffer1);
+                               hammer_modify_volume_field(trans,
+                                               trans->rootvol,
+                                               vol0_stat_freebigblocks);
+                               ++root_volume->ondisk->vol0_stat_freebigblocks;
+                               hmp->copy_stat_freebigblocks =
+                                  root_volume->ondisk->vol0_stat_freebigblocks;
+                               hammer_modify_volume_done(trans->rootvol);
+                       }
+               }
+       } else {
+               /*
+                * Allocating previously reserved space
+                */
+               if (layer2->zone == 0) {
+                       layer2->zone = zone;
                        hammer_modify_buffer(trans, buffer1,
                                             layer1, sizeof(*layer1));
-                       ++layer1->blocks_free;
-#if 0
-                       /*
-                        * This commented out code would release the layer2
-                        * bigblock.  We do not want to do this, at least
-                        * not right now.
-                        *
-                        * This also may be incomplete.
-                        */
-                       if (layer1->blocks_free == HAMMER_BLOCKMAP_RADIX2) {
-                               hammer_freemap_free(
-                                       trans, layer1->phys_offset,
-                                       bmap_off & ~HAMMER_BLOCKMAP_LAYER2_MASK,
-                                       &error);
-                               layer1->phys_offset = HAMMER_BLOCKMAP_FREE;
-                       }
-#endif
+                       --layer1->blocks_free;
                        layer1->layer1_crc = crc32(layer1,
                                                   HAMMER_LAYER1_CRCSIZE);
                        hammer_modify_buffer_done(buffer1);
-               } else {
-#if 0
-                       /*
-                        * This commented out code would reset the iterator,
-                        * which we cannot do at the moment as it could cause
-                        * new allocations to overwrite deleted data still
-                        * subject to undo on reboot.
-                        */
-                       hammer_modify_volume(trans, root_volume,
-                                            NULL, 0);
-                       rootmap->next_offset &= ~HAMMER_LARGEBLOCK_MASK64;
-                       hammer_modify_volume_done(root_volume);
-#endif
+                       hammer_modify_volume_field(trans,
+                                       trans->rootvol,
+                                       vol0_stat_freebigblocks);
+                       --root_volume->ondisk->vol0_stat_freebigblocks;
+                       hmp->copy_stat_freebigblocks =
+                          root_volume->ondisk->vol0_stat_freebigblocks;
+                       hammer_modify_volume_done(trans->rootvol);
                }
+               if (layer2->zone != zone)
+                       kprintf("layer2 zone mismatch %d %d\n", layer2->zone, zone);
+               KKASSERT(layer2->zone == zone);
+               layer2->bytes_free += bytes;
        }
        layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
        hammer_modify_buffer_done(buffer2);
-done:
        hammer_unlock(&hmp->blkmap_lock);
 
        if (buffer1)
                hammer_rel_buffer(buffer1, 0);
        if (buffer2)
                hammer_rel_buffer(buffer2, 0);
-       hammer_rel_volume(root_volume, 0);
 }
 
 /*
@@ -797,11 +700,12 @@ done:
  * specified blockmap offset.
  */
 int
-hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off,
+hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t zone_offset,
                        int *curp, int *errorp)
 {
        hammer_volume_t root_volume;
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t blockmap;
+       hammer_blockmap_t freemap;
        struct hammer_blockmap_layer1 *layer1;
        struct hammer_blockmap_layer2 *layer2;
        hammer_buffer_t buffer = NULL;
@@ -810,32 +714,21 @@ hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off,
        int bytes;
        int zone;
 
-       zone = HAMMER_ZONE_DECODE(bmap_off);
+       zone = HAMMER_ZONE_DECODE(zone_offset);
        KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
        root_volume = hammer_get_root_volume(hmp, errorp);
        if (*errorp) {
                *curp = 0;
                return(0);
        }
-       rootmap = &hmp->blockmap[zone];
-       KKASSERT(rootmap->phys_offset != 0);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
-                HAMMER_ZONE_RAW_BUFFER_INDEX);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
-
-       if (bmap_off >= rootmap->alloc_offset) {
-               panic("hammer_blockmap_lookup: %016llx beyond EOF %016llx",
-                     bmap_off, rootmap->alloc_offset);
-               bytes = 0;
-               *curp = 0;
-               goto done;
-       }
+       blockmap = &hmp->blockmap[zone];
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
 
        /*
         * Dive layer 1.
         */
-       layer1_offset = rootmap->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
+       layer1_offset = freemap->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
        layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer);
        KKASSERT(*errorp == 0);
        KKASSERT(layer1->phys_offset);
@@ -847,27 +740,26 @@ hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off,
         * Dive layer 2, each entry represents a large-block.
         */
        layer2_offset = layer1->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
+                       HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
        layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer);
        KKASSERT(*errorp == 0);
-       KKASSERT(layer2->u.phys_offset);
        if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER2");
        }
+       KKASSERT(layer2->zone == zone);
 
        bytes = layer2->bytes_free;
 
-       if ((rootmap->next_offset ^ bmap_off) & ~HAMMER_LARGEBLOCK_MASK64)
+       if ((blockmap->next_offset ^ zone_offset) & ~HAMMER_LARGEBLOCK_MASK64)
                *curp = 0;
        else
                *curp = 1;
-done:
        if (buffer)
                hammer_rel_buffer(buffer, 0);
        hammer_rel_volume(root_volume, 0);
        if (hammer_debug_general & 0x0800) {
                kprintf("hammer_blockmap_getfree: %016llx -> %d\n",
-                       bmap_off, bytes);
+                       zone_offset, bytes);
        }
        return(bytes);
 }
@@ -877,44 +769,56 @@ done:
  * Lookup a blockmap offset.
  */
 hammer_off_t
-hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off, int *errorp)
+hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t zone_offset,
+                      int *errorp)
 {
        hammer_volume_t root_volume;
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t freemap;
        struct hammer_blockmap_layer1 *layer1;
        struct hammer_blockmap_layer2 *layer2;
        hammer_buffer_t buffer = NULL;
        hammer_off_t layer1_offset;
        hammer_off_t layer2_offset;
        hammer_off_t result_offset;
+       hammer_off_t base_off;
+       hammer_reserve_t resv;
        int zone;
 
-       zone = HAMMER_ZONE_DECODE(bmap_off);
+       /*
+        * Calculate the zone-2 offset.
+        */
+       zone = HAMMER_ZONE_DECODE(zone_offset);
        KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
+
+       result_offset = (zone_offset & ~HAMMER_OFF_ZONE_MASK) |
+                       HAMMER_ZONE_RAW_BUFFER;
+
+       /*
+        * We can actually stop here, normal blockmaps are now direct-mapped
+        * onto the freemap and so represent zone-2 addresses.
+        */
+       if (hammer_verify_zone == 0) {
+               *errorp = 0;
+               return(result_offset);
+       }
+
+       /*
+        * Validate the allocation zone
+        */
        root_volume = hammer_get_root_volume(hmp, errorp);
        if (*errorp)
                return(0);
-       rootmap = &hmp->blockmap[zone];
-       KKASSERT(rootmap->phys_offset != 0);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
-                HAMMER_ZONE_RAW_BUFFER_INDEX);
-       KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
-
-       if (bmap_off >= rootmap->alloc_offset) {
-               panic("hammer_blockmap_lookup: %016llx beyond EOF %016llx",
-                     bmap_off, rootmap->alloc_offset);
-               result_offset = 0;
-               goto done;
-       }
+       freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX];
+       KKASSERT(freemap->phys_offset != 0);
 
        /*
         * Dive layer 1.
         */
-       layer1_offset = rootmap->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
+       layer1_offset = freemap->phys_offset +
+                       HAMMER_BLOCKMAP_LAYER1_OFFSET(zone_offset);
        layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer);
        KKASSERT(*errorp == 0);
-       KKASSERT(layer1->phys_offset);
+       KKASSERT(layer1->phys_offset != HAMMER_BLOCKMAP_UNAVAIL);
        if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER1");
        }
@@ -923,166 +827,68 @@ hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off, int *errorp)
         * Dive layer 2, each entry represents a large-block.
         */
        layer2_offset = layer1->phys_offset +
-                       HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
+                       HAMMER_BLOCKMAP_LAYER2_OFFSET(zone_offset);
        layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer);
 
        KKASSERT(*errorp == 0);
-       KKASSERT(layer2->u.phys_offset);
+       if (layer2->zone == 0) {
+               base_off = (zone_offset & (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | HAMMER_ZONE_RAW_BUFFER;
+               resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
+                                base_off);
+               KKASSERT(resv && resv->zone == zone);
+
+       } else if (layer2->zone != zone) {
+               panic("hammer_blockmap_lookup: bad zone %d/%d\n",
+                       layer2->zone, zone);
+       }
        if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
                Debugger("CRC FAILED: LAYER2");
        }
 
-       result_offset = layer2->u.phys_offset +
-                       (bmap_off & HAMMER_LARGEBLOCK_MASK64);
-done:
        if (buffer)
                hammer_rel_buffer(buffer, 0);
        hammer_rel_volume(root_volume, 0);
        if (hammer_debug_general & 0x0800) {
                kprintf("hammer_blockmap_lookup: %016llx -> %016llx\n",
-                       bmap_off, result_offset);
+                       zone_offset, result_offset);
        }
        return(result_offset);
 }
 
-/************************************************************************
- *                 IN-CORE TRACKING OF ALLOCATION HOLES                *
- ************************************************************************
- *
- * This is a temporary shim in need of a more permanent solution.
- *
- * As we allocate space holes are created due to having to align to a new
- * 16K buffer when an allocation would otherwise cross the buffer boundary.
- * These holes are recorded here and used to fullfill smaller requests as
- * much as possible.  Only a limited number of holes are recorded and these
- * functions operate somewhat like a heuristic, where information is allowed
- * to be thrown away.
- */
-
-void
-hammer_init_holes(hammer_mount_t hmp, hammer_holes_t holes)
-{
-       TAILQ_INIT(&holes->list);
-       holes->count = 0;
-}
-
-void
-hammer_free_holes(hammer_mount_t hmp, hammer_holes_t holes)
-{
-       hammer_hole_t hole;
-
-       while ((hole = TAILQ_FIRST(&holes->list)) != NULL) {
-               TAILQ_REMOVE(&holes->list, hole, entry);
-               if (hole->resv) {
-                       hammer_blockmap_reserve_complete(hmp, hole->resv);
-                       hole->resv = NULL;
-               }
-               kfree(hole, M_HAMMER);
-               --holes->count;
-       }
-}
-
-/*
- * Attempt to locate a hole with sufficient free space to accomodate the
- * requested allocation.  Return the offset or 0 if no hole could be found.
- */
-static hammer_off_t
-hammer_find_hole(hammer_mount_t hmp, hammer_holes_t holes, int bytes)
-{
-       hammer_hole_t hole;
-       hammer_off_t result_off = 0;
-
-       TAILQ_FOREACH(hole, &holes->list, entry) {
-               if (bytes <= hole->bytes) {
-                       result_off = hole->zone_offset;
-                       hole->zone_offset += bytes;
-                       hole->bytes -= bytes;
-                       break;
-               }
-       }
-       return(result_off);
-}
 
 /*
- * If a newly created hole is reasonably sized then record it.  We only
- * keep track of a limited number of holes.  Lost holes are recovered by
- * reblocking.
- *
- * offset is a zone-N offset.
+ * Check space availability
  */
-static void
-hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes,
-               hammer_off_t zone_offset, int bytes)
+int
+hammer_checkspace(hammer_mount_t hmp)
 {
-       hammer_hole_t hole;
-       hammer_reserve_t resv;
-
-       if (bytes <= 128)
-               return;
-
-       /*
-        * Allocate or reuse a hole structure
-        */
-       if (holes->count < HAMMER_MAX_HOLES) {
-               hole = kmalloc(sizeof(*hole), M_HAMMER, M_WAITOK);
-               ++holes->count;
-       } else {
-               hole = TAILQ_FIRST(&holes->list);
-               TAILQ_REMOVE(&holes->list, hole, entry);
-               if (hole->resv) {
-                       hammer_blockmap_reserve_complete(hmp, hole->resv);
-                       hole->resv = NULL;
-               }
+       const int in_size = sizeof(struct hammer_inode_data) +
+                           sizeof(union hammer_btree_elm);
+       const int rec_size = (sizeof(union hammer_btree_elm) * 2);
+       const int blkconv = HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE;
+       const int limit_inodes = HAMMER_LARGEBLOCK_SIZE / in_size;
+       const int limit_recs = HAMMER_LARGEBLOCK_SIZE / rec_size;
+       int usedbigblocks;
+
+       /*
+        * Quick and very dirty, not even using the right units (bigblocks
+        * vs 16K buffers), but this catches almost everything.
+        */
+       if (hmp->copy_stat_freebigblocks >= hmp->rsv_databufs + 8 &&
+           hmp->rsv_inodes < limit_inodes &&
+           hmp->rsv_recs < limit_recs &&
+           hmp->rsv_databytes < HAMMER_LARGEBLOCK_SIZE) {
+               return(0);
        }
 
        /*
-        * Associate the structure with the appropriate reservation so the
-        * bigblock does not get freed or reused while we have cached holes,
-        * and install.
+        * Do a more involved check
         */
-       hole->zone_offset = zone_offset;
-       hole->bytes = bytes;
-
-       zone_offset &= ~HAMMER_LARGEBLOCK_MASK64;
-
-       resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, zone_offset);
-       if (resv == NULL) {
-               resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
-               resv->zone_offset = zone_offset;
-               resv->refs = 1;
-               RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
-               ++hammer_count_reservations;
-       } else {
-               ++resv->refs;
-       }
-       hole->resv = resv;
-       TAILQ_INSERT_TAIL(&holes->list, hole, entry);
-}
-
-/*
- * Clean out any holes cached for the bigblock we are about to release back
- * to the free pool.
- */
-static void
-hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes,
-                  hammer_off_t base_offset)
-{
-       hammer_hole_t hole;
-
-restart:
-       TAILQ_FOREACH(hole, &holes->list, entry) {
-               if ((hole->zone_offset & ~HAMMER_LARGEBLOCK_MASK64) == 
-                   base_offset) {
-                       TAILQ_REMOVE(&holes->list, hole, entry);
-                       if (hole->resv) {
-                               hammer_blockmap_reserve_complete(hmp,
-                                                                hole->resv);
-                               hole->resv = NULL;
-                       }
-                       --holes->count;
-                       kfree(hole, M_HAMMER);
-                       goto restart;
-               }
-       }
+       usedbigblocks = (hmp->rsv_inodes * in_size / HAMMER_LARGEBLOCK_SIZE) +
+                       (hmp->rsv_recs * rec_size / HAMMER_LARGEBLOCK_SIZE) +
+                       hmp->rsv_databufs / blkconv + 6;
+       if (hmp->copy_stat_freebigblocks >= usedbigblocks)
+               return(0);
+       return (ENOSPC);
 }
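
The fallback test above simply converts each reservation category into an
approximate big-block count.  Below is a minimal stand-alone sketch of the
same arithmetic; the reservation counts and structure sizes are made up for
illustration, and only the 8MB big-block / 16K buffer constants reflect the
on-media layout:

    #include <stdio.h>

    #define LARGEBLOCK_SIZE     (8192 * 1024)   /* 8MB big-block */
    #define BUFSIZE             16384           /* 16K buffer */

    int
    main(void)
    {
            const int in_size = 128 + 64;       /* assumed inode-data + B-Tree elm */
            const int rec_size = 64 * 2;        /* assumed: two B-Tree elements */
            const int blkconv = LARGEBLOCK_SIZE / BUFSIZE;
            int rsv_inodes = 10000, rsv_recs = 50000, rsv_databufs = 2048;
            int usedbigblocks;

            usedbigblocks = (rsv_inodes * in_size / LARGEBLOCK_SIZE) +
                            (rsv_recs * rec_size / LARGEBLOCK_SIZE) +
                            rsv_databufs / blkconv + 6;
            printf("estimated big-blocks needed: %d\n", usedbigblocks);
            return(0);
    }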
 
index 2940523..9fdd1ec 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.53 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.54 2008/06/17 04:02:38 dillon Exp $
  */
 
 /*
@@ -503,6 +503,7 @@ hammer_btree_lookup(hammer_cursor_t cursor)
 {
        int error;
 
+       ++hammer_stats_btree_lookups;
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
                cursor->key_beg.create_tid = cursor->asof;
@@ -661,6 +662,7 @@ hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
 
        if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
                return(error);
+       ++hammer_stats_btree_inserts;
 
        /*
         * Insert the element at the leaf node and update the count in the
@@ -728,6 +730,7 @@ hammer_btree_delete(hammer_cursor_t cursor)
 
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                return(error);
+       ++hammer_stats_btree_deletes;
 
        /*
         * Delete the element from the leaf node. 
@@ -831,6 +834,7 @@ btree_search(hammer_cursor_t cursor, int flags)
        int s;
 
        flags |= cursor->flags;
+       ++hammer_stats_btree_searches;
 
        if (hammer_debug_btree) {
                kprintf("SEARCH   %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n",
@@ -870,6 +874,7 @@ btree_search(hammer_cursor_t cursor, int flags)
                if (r >= 0 && s < 0)
                        break;
                KKASSERT(cursor->parent);
+               ++hammer_stats_btree_iterations;
                error = hammer_cursor_up(cursor);
                if (error)
                        goto done;
@@ -911,6 +916,7 @@ btree_search(hammer_cursor_t cursor, int flags)
                    cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
                        break;
                }
+               ++hammer_stats_btree_iterations;
                error = hammer_cursor_up(cursor);
                /* node may have become stale */
                if (error)
@@ -950,6 +956,7 @@ btree_search(hammer_cursor_t cursor, int flags)
                 */
                i = hammer_btree_search_node(&cursor->key_beg, node);
                while (i <= node->count) {
+                       ++hammer_stats_btree_elements;
                        elm = &node->elms[i];
                        r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
                        if (hammer_debug_btree > 2) {
@@ -1154,6 +1161,7 @@ btree_search(hammer_cursor_t cursor, int flags)
         */
        i = hammer_btree_search_node(&cursor->key_beg, node);
        while (i < node->count) {
+               ++hammer_stats_btree_elements;
                elm = &node->elms[i];
 
                r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);
@@ -1266,6 +1274,7 @@ hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
        s = node->count;
        while (s - b > 4) {
                i = b + (s - b) / 2;
+               ++hammer_stats_btree_elements;
                r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
                if (r <= 1) {
                        s = i;
@@ -1320,6 +1329,7 @@ btree_split_internal(hammer_cursor_t cursor)
                goto done;
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                goto done;
+       ++hammer_stats_btree_splits;
 
        /* 
         * We are splitting but elms[split] will be promoted to the parent,
@@ -1535,6 +1545,7 @@ btree_split_leaf(hammer_cursor_t cursor)
 
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                return(error);
+       ++hammer_stats_btree_splits;
 
        KKASSERT(hammer_btree_cmp(cursor->left_bound,
                 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
@@ -2121,6 +2132,7 @@ hammer_btree_lock_children(hammer_cursor_t cursor,
         * pre-get the children before trying to lock the mess.
         */
        for (i = 0; i < ondisk->count; ++i) {
+               ++hammer_stats_btree_elements;
                elm = &ondisk->elms[i];
                if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF &&
                    elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
@@ -2137,6 +2149,7 @@ hammer_btree_lock_children(hammer_cursor_t cursor,
         * Do it for real
         */
        for (i = 0; error == 0 && i < ondisk->count; ++i) {
+               ++hammer_stats_btree_elements;
                elm = &ondisk->elms[i];
 
                switch(elm->base.btype) {
index 2252a4f..fb152ec 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.29 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.30 2008/06/17 04:02:38 dillon Exp $
  */
 
 /*
@@ -281,6 +281,10 @@ hammer_cursor_downgrade(hammer_cursor_t cursor)
 
 /*
  * Seek the cursor to the specified node and index.
+ *
+ * The caller must ref the node prior to calling this routine and release
+ * it after it returns.  If the seek succeeds the cursor will gain its own
+ * ref on the node.
  */
 int
 hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
index eff5a99..802d0ba 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.36 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.37 2008/06/17 04:02:38 dillon Exp $
  */
 
 #ifndef VFS_HAMMER_DISK_H_
@@ -112,16 +112,15 @@ typedef u_int32_t hammer_crc_t;
  * hammer_off_t has several different encodings.  Note that not all zones
  * encode a vol_no.
  *
- * zone 0 (z,v,o):     reserved (for sanity)
+ * zone 0:             reserved for sanity
  * zone 1 (z,v,o):     raw volume relative (offset 0 is the volume header)
  * zone 2 (z,v,o):     raw buffer relative (offset 0 is the first buffer)
- * zone 3 (z,o):       undo fifo       - fixed layer2 array in root vol hdr
- * zone 4 (z,v,o):     freemap         - freemap-backed self-mapping special
- *                                       cased layering.
- *
- * zone 8 (z,o):       B-Tree          - blkmap-backed
- * zone 9 (z,o):       Record          - blkmap-backed
- * zone 10 (z,o):      Large-data      - blkmap-backed
+ * zone 3 (z,o):       undo fifo       - actually fixed phys array in vol hdr
+ * zone 4 (z,v,o):     freemap         - only real blockmap
+ * zone 8 (z,v,o):     B-Tree          - actually zone-2 address
+ * zone 9 (z,v,o):     Record          - actually zone-2 address
+ * zone 10 (z,v,o):    Large-data      - actually zone-2 address
+ * zone 15:            reserved for sanity
  */
 
 #define HAMMER_ZONE_RAW_VOLUME         0x1000000000000000ULL
@@ -138,7 +137,7 @@ typedef u_int32_t hammer_crc_t;
 #define HAMMER_ZONE_RESERVED0C         0xC000000000000000ULL
 #define HAMMER_ZONE_RESERVED0D         0xD000000000000000ULL
 #define HAMMER_ZONE_RESERVED0E         0xE000000000000000ULL
-#define HAMMER_ZONE_RESERVED0F         0xF000000000000000ULL
+#define HAMMER_ZONE_UNAVAIL            0xF000000000000000ULL
 
 #define HAMMER_ZONE_RAW_VOLUME_INDEX   1
 #define HAMMER_ZONE_RAW_BUFFER_INDEX   2
@@ -148,13 +147,7 @@ typedef u_int32_t hammer_crc_t;
 #define HAMMER_ZONE_META_INDEX         9
 #define HAMMER_ZONE_LARGE_DATA_INDEX   10
 #define HAMMER_ZONE_SMALL_DATA_INDEX   11
-
-/*
- * Per-zone size limitation.  This just makes the iterator easier
- * to deal with by preventing an iterator overflow.
- */
-#define HAMMER_ZONE_LIMIT              \
-       (0x1000000000000000ULL - HAMMER_BLOCKMAP_LAYER2 * 2)
+#define HAMMER_ZONE_UNAVAIL_INDEX      15      /* unavailable */
 
 #define HAMMER_MAX_ZONES               16
 
@@ -211,10 +204,15 @@ typedef u_int32_t hammer_crc_t;
        ((hammer_off_t)HAMMER_BUFFERS_PER_LARGEBLOCK_MASK)
 
 /*
- * Every blockmap has this root structure in the root volume header.
+ * The blockmap is somewhat of a degenerate structure.  HAMMER only actually
+ * uses it in its original incarnation to implement the free-map.
  *
- * NOTE: zone 3 (the undo FIFO) does not use phys_offset.  first and next
- * offsets represent the FIFO.
+ * zone:1      raw volume (no blockmap)
+ * zone:2      raw buffer (no blockmap)
+ * zone:3      undo-map   (direct layer2 array in volume header)
+ * zone:4      free-map   (the only real blockmap)
+ * zone:8-15   zone id used to classify big-block only, address is actually
+ *             a zone-2 address.
  */
 struct hammer_blockmap {
        hammer_off_t    phys_offset;    /* zone-2 physical offset */
@@ -236,13 +234,15 @@ typedef struct hammer_blockmap *hammer_blockmap_t;
  * 524288 16-byte entries (19 bits), representing 8MB (23 bit) blockmaps.
  * 18+19+23 = 60 bits.  The top four bits are the zone id.
  *
- * Layer 2 encodes the physical bigblock mapping for a blockmap.  The freemap
- * uses this field to encode the virtual blockmap offset that allocated the
- * physical block.
+ * Currently only the freemap utilizes both layers in all their glory.
+ * All primary data/meta-data zones actually encode a zone-2 address
+ * requiring no real blockmap translation.
  *
- * NOTE:  The freemap maps the vol_no in the upper 8 bits of layer1.
+ * The freemap uses the upper 8 bits of layer-1 to identify the volume,
+ * thus any space allocated via the freemap can be directly translated
+ * to a zone:2 (or zone:8-15) address.
  *
- * zone-4 blockmap offset: [z:4][layer1:18][layer2:19][bigblock:23]
+ * zone-X blockmap offset: [z:4][layer1:18][layer2:19][bigblock:23]
  */
 struct hammer_blockmap_layer1 {
        hammer_off_t    blocks_free;    /* big-blocks free */
@@ -259,10 +259,10 @@ typedef struct hammer_blockmap_layer1 *hammer_blockmap_layer1_t;
        offsetof(struct hammer_blockmap_layer1, layer1_crc)
 
 struct hammer_blockmap_layer2 {
-       union {
-               hammer_off_t    owner;          /* used by freemap */
-               hammer_off_t    phys_offset;    /* used by blockmap */
-       } u;
+       u_int8_t        zone;           /* typed allocation zone */
+       u_int8_t        unused01;
+       u_int16_t       unused02;
+       u_int32_t       append_off;     /* allocatable space index */
        u_int32_t       bytes_free;     /* bytes free within this bigblock */
        hammer_crc_t    entry_crc;
 };
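
As a quick aid to reading the [z:4][layer1:18][layer2:19][bigblock:23] layout
described above, the following stand-alone sketch decodes such an offset into
its raw field values.  The shift widths come straight from the comment; the
function and macro names are invented for illustration and are not the
kernel's HAMMER_BLOCKMAP_LAYER*_OFFSET macros (which, roughly speaking, also
scale the indices into byte offsets within the layer arrays):

    #include <stdio.h>
    #include <stdint.h>

    /* Bit widths from the layout comment: [z:4][layer1:18][layer2:19][bigblock:23] */
    #define ZONEX_BIGBLOCK_BITS 23      /* 8MB big-block */
    #define ZONEX_LAYER2_BITS   19      /* 524288 layer-2 entries */
    #define ZONEX_LAYER1_BITS   18      /* 262144 layer-1 entries */

    static void
    decode_zonex(uint64_t off)
    {
            int zone    = (int)(off >> 60);
            uint64_t l1 = (off >> (ZONEX_BIGBLOCK_BITS + ZONEX_LAYER2_BITS)) &
                          ((1ULL << ZONEX_LAYER1_BITS) - 1);
            uint64_t l2 = (off >> ZONEX_BIGBLOCK_BITS) &
                          ((1ULL << ZONEX_LAYER2_BITS) - 1);
            uint64_t bb = off & ((1ULL << ZONEX_BIGBLOCK_BITS) - 1);

            printf("zone=%d layer1=%llu layer2=%llu bigblock-offset=%llu\n",
                   zone, (unsigned long long)l1, (unsigned long long)l2,
                   (unsigned long long)bb);
    }

    int
    main(void)
    {
            decode_zonex(0x4000000000800123ULL);        /* an arbitrary zone-4 offset */
            return(0);
    }
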
@@ -307,12 +307,12 @@ typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t;
 
 /*
  * HAMMER UNDO parameters.  The UNDO fifo is mapped directly in the volume
- * header with an array of layer2 structures.  A maximum of (64x8MB) = 512MB
+ * header with an array of layer2 structures.  A maximum of (128x8MB) = 1GB
  * may be reserved.  The size of the undo fifo is usually set at newfs time
  * but can be adjusted if the filesystem is taken offline.
  */
 
-#define HAMMER_UNDO_LAYER2     64      /* max layer2 undo mapping entries */
+#define HAMMER_UNDO_LAYER2     128     /* max layer2 undo mapping entries */
 
 /*
  * All on-disk HAMMER structures which make up elements of the UNDO FIFO
@@ -484,7 +484,7 @@ struct hammer_volume_ondisk {
        int64_t vol0_stat_records;      /* total records in filesystem */
        hammer_off_t vol0_btree_root;   /* B-Tree root */
        hammer_tid_t vol0_next_tid;     /* highest synchronized TID */
-       hammer_off_t vol0_zone_limit;   /* limit the zone size */
+       hammer_off_t vol0_unused03;     /* limit the zone size */
 
        /*
         * Blockmaps for zones.  Not all zones use a blockmap.  Note that
@@ -493,9 +493,9 @@ struct hammer_volume_ondisk {
        struct hammer_blockmap  vol0_blockmap[HAMMER_MAX_ZONES];
 
        /*
-        * Layer-2 array for undo fifo
+        * Array of zone-2 addresses for undo FIFO.
         */
-       struct hammer_blockmap_layer2 vol0_undo_array[HAMMER_UNDO_LAYER2];
+       hammer_off_t            vol0_undo_array[HAMMER_UNDO_LAYER2];
 
 };
 
index 268bc8d..49436f6 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.27 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.28 2008/06/17 04:02:38 dillon Exp $
  */
 /*
  * HAMMER dependency flusher thread
@@ -58,7 +58,7 @@ struct hammer_flusher_info {
        struct hammer_mount *hmp;
        thread_t        td;
        int             startit;
-       TAILQ_HEAD(,hammer_inode) work_list;
+       hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
 };
 
 typedef struct hammer_flusher_info *hammer_flusher_info_t;
@@ -111,7 +111,6 @@ hammer_flusher_create(hammer_mount_t hmp)
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
                info->hmp = hmp;
-               TAILQ_INIT(&info->work_list);
                ++hmp->flusher.count;
                hmp->flusher.info[i] = info;
                lwkt_create(hammer_flusher_slave_thread, info,
@@ -207,6 +206,9 @@ hammer_flusher_slave_thread(void *arg)
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
        hammer_inode_t ip;
+       int c;
+       int i;
+       int n;
 
        info = arg;
        hmp = info->hmp;
@@ -217,11 +219,31 @@ hammer_flusher_slave_thread(void *arg)
                if (info->startit < 0)
                        break;
                info->startit = 0;
+
+               /*
+                * Try to pull out ~64 inodes at a time to flush.
+                * The idea is to avoid deadlocks between the slave threads.
+                */
+               n = c = 0;
                while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
                        if (ip->flush_group != hmp->flusher.act)
                                break;
                        TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
-                       hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
+                       info->work_array[n++] = ip;
+                       c += ip->rsv_recs;
+                       if (n < HAMMER_FLUSH_GROUP_SIZE &&
+                           c < HAMMER_FLUSH_GROUP_SIZE * 8) {
+                               continue;
+                       }
+                       for (i = 0; i < n; ++i) {
+                               hammer_flusher_flush_inode(info->work_array[i],
+                                                       &hmp->flusher.trans);
+                       }
+                       n = c = 0;
+               }
+               for (i = 0; i < n; ++i) {
+                       hammer_flusher_flush_inode(info->work_array[i],
+                                                  &hmp->flusher.trans);
                }
                if (--hmp->flusher.running == 0)
                        wakeup(&hmp->flusher.running);
@@ -275,9 +297,9 @@ hammer_flusher_flush(hammer_mount_t hmp)
         * Start work threads.
         */
        i = 0;
-       n = hmp->count_iqueued / 64;
+       n = hmp->count_iqueued / HAMMER_FLUSH_GROUP_SIZE;
        if (TAILQ_FIRST(&hmp->flush_list)) {
-               for (i = 0; i <= hmp->count_iqueued / 64; ++i) {
+               for (i = 0; i <= n; ++i) {
                        if (i == HAMMER_MAX_FLUSHERS ||
                            hmp->flusher.info[i] == NULL) {
                                break;
@@ -305,8 +327,7 @@ hammer_flusher_flush(hammer_mount_t hmp)
        while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
                if (resv->flush_group != hmp->flusher.act)
                        break;
-               TAILQ_REMOVE(&hmp->delay_list, resv, delay_entry);
-               hammer_blockmap_reserve_complete(hmp, resv);
+               hammer_reserve_clrdelay(hmp, resv);
        }
        hammer_done_transaction(&hmp->flusher.trans);
 }
index 178b4de..75e74c8 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_freemap.c,v 1.16 2008/06/10 08:51:01 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_freemap.c,v 1.17 2008/06/17 04:02:38 dillon Exp $
  */
 
+/* REMOVED - bigblock allocations are now handled by the blockmap allocator */
+#if 0
+
 /*
  * HAMMER freemap - bigblock allocator.  The freemap is a 2-layer blockmap
  * with one layer2 entry for each big-block in the filesystem.  Big blocks
@@ -269,3 +272,4 @@ hammer_checkspace(hammer_mount_t hmp)
        return (ENOSPC);
 }
 
+#endif
index bdbc00b..ea69063 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.75 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.76 2008/06/17 04:02:38 dillon Exp $
  */
 
 #include "hammer.h"
@@ -293,7 +293,7 @@ loop:
        ip->flags = flags & HAMMER_INODE_RO;
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
-       ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
+       ip->sync_trunc_off = ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);
 
@@ -331,15 +331,21 @@ retry:
                hammer_cache_node(cursor.node, &ip->cache[0]);
                if (cache)
                        hammer_cache_node(cursor.node, cache);
+
+               /*
+                * The file should not contain any data past the file size
+                * stored in the inode.  Setting sync_trunc_off to the
+                * file size instead of max reduces B-Tree lookup overheads
+                * on append by allowing the flusher to avoid checking for
+                * record overwrites.
+                */
+               ip->sync_trunc_off = ip->ino_data.size;
        }
 
        /*
-        * On success load the inode's record and data and insert the
-        * inode into the B-Tree.  It is possible to race another lookup
-        * insertion of the same inode so deal with that condition too.
-        *
-        * The cursor's locked node interlocks against others creating and
-        * destroying ip while we were blocked.
+        * The inode is placed on the red-black tree and will be synced to
+        * the media when flushed or by the filesystem sync.  If this races
+        * another instantiation/lookup the insertion will fail.
         */
        if (*errorp == 0) {
                hammer_ref(&ip->lock);
@@ -1166,12 +1172,17 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
         * The truncation must be retained in the frontend until after
         * we've actually performed the record deletion.
         *
+        * We continue to retain sync_trunc_off even when all truncations
+        * have been resolved as an optimization to determine if we can
+        * skip the B-Tree lookup for overwrite deletions.
+        *
         * NOTE: The DELETING flag is a mod flag, but it is also sticky,
         * and stays in ip->flags.  Once set, it stays set until the
         * inode is destroyed.
         */
        ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
-       ip->sync_trunc_off = ip->trunc_off;
+       if (ip->sync_flags & HAMMER_INODE_TRUNCATED)
+               ip->sync_trunc_off = ip->trunc_off;
        ip->sync_ino_leaf = ip->ino_leaf;
        ip->sync_ino_data = ip->ino_data;
        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
@@ -1611,6 +1622,7 @@ hammer_sync_inode(hammer_inode_t ip)
 {
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
+       hammer_node_t tmp_node;
        hammer_record_t depend;
        hammer_record_t next;
        int error, tmp_error;
@@ -1620,7 +1632,7 @@ hammer_sync_inode(hammer_inode_t ip)
                return(0);
 
        hammer_start_transaction_fls(&trans, ip->hmp);
-       error = hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
+       error = hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
        if (error)
                goto done;
 
@@ -1692,9 +1704,7 @@ hammer_sync_inode(hammer_inode_t ip)
         * containing the truncation point for us.
         *
         * We don't flush pending frontend data buffers until after we've
-        * dealth with the truncation.
-        *
-        * Don't bother if the inode is or has been deleted.
+        * dealt with the truncation.
         */
        if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
                /*
@@ -1724,9 +1734,17 @@ hammer_sync_inode(hammer_inode_t ip)
                 * Clear the truncation flag on the backend after we have
                 * complete the deletions.  Backend data is now good again
                 * (including new records we are about to sync, below).
+                *
+                * Leave sync_trunc_off intact.  As we write additional
+                * records the backend will update sync_trunc_off.  This
+                * tells the backend whether it can skip the overwrite
+                * test.  This should work properly even when the backend
+                * writes a full block that straddles the truncation point,
+                * because the comparison is against the record's base
+                * offset.
                 */
                ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
-               ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL;
+               /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
        } else {
                error = 0;
        }
@@ -1747,6 +1765,19 @@ hammer_sync_inode(hammer_inode_t ip)
                if (tmp_error)
                        error = tmp_error;
        }
+       hammer_cache_node(cursor.node, &ip->cache[1]);
+
+       /*
+        * Re-seek for inode update.
+        */
+       if (error == 0) {
+               tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
+               if (tmp_node) {
+                       hammer_cursor_seek(&cursor, tmp_node, 0);
+                       hammer_rel_node(tmp_node);
+               }
+               error = 0;
+       }
 
        /*
         * If we are deleting the inode the frontend had better not have
index d25783f..69efc3f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.41 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.42 2008/06/17 04:02:38 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -380,11 +380,15 @@ hammer_io_release(struct hammer_io *io, int flush)
                 * Leaving the buffer passively associated allows us to
                 * use the kernel's LRU buffer flushing mechanisms rather
                 * then rolling our own.
+                *
+                * XXX there are two ways of doing this.  We can re-acquire
+                * and passively release to reset the LRU, or not.
                 */
                crit_enter();
                if (io->running == 0) {
                        regetblk(bp);
                        if ((bp->b_flags & B_LOCKED) || io->reclaim) {
+                               /*regetblk(bp);*/
                                io->released = 0;
                                hammer_io_disassociate(iou, 1);
                        } else {
index 0795b22..b54d305 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.68 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.69 2008/06/17 04:02:38 dillon Exp $
  */
 
 #include "hammer.h"
@@ -41,6 +41,7 @@ static int hammer_mem_lookup(hammer_cursor_t cursor);
 static int hammer_mem_first(hammer_cursor_t cursor);
 static int hammer_rec_trunc_callback(hammer_record_t record,
                                void *data __unused);
+static int hammer_record_needs_overwrite_delete(hammer_record_t record);
 
 struct rec_trunc_info {
        u_int16_t       rec_type;
@@ -887,6 +888,35 @@ hammer_rec_trunc_callback(hammer_record_t record, void *data __unused)
        return(0);
 }
 
+/*
+ * Return 1 if the caller must check for and delete existing records
+ * before writing out a new data record.
+ *
+ * Return 0 if the caller can just insert the record into the B-Tree without
+ * checking.
+ */
+static int
+hammer_record_needs_overwrite_delete(hammer_record_t record)
+{
+       hammer_inode_t ip = record->ip;
+       int64_t file_offset;
+       int r;
+
+       if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
+               file_offset = record->leaf.base.key;
+       else
+               file_offset = record->leaf.base.key - record->leaf.data_len;
+       r = (file_offset < ip->sync_trunc_off);
+       if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
+               if (ip->sync_trunc_off <= record->leaf.base.key)
+                       ip->sync_trunc_off = record->leaf.base.key + 1;
+       } else {
+               if (ip->sync_trunc_off < record->leaf.base.key)
+                       ip->sync_trunc_off = record->leaf.base.key;
+       }
+       return(r);
+}
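
To see how this check ratchets forward on a pure append stream, here is a
small self-contained model of the non-database case.  The record sizes and
the initial truncation offset are made up, and the helper is a simplification
of the routine above, not a copy of it:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Toy model: a record's base.key is its ending file offset, so its start
     * is key - data_len.  Records starting at or beyond trunc_off cannot
     * overwrite anything already on the media.
     */
    static int
    needs_overwrite_delete(int64_t *trunc_off, int64_t key, int64_t data_len)
    {
            int64_t file_offset = key - data_len;
            int r = (file_offset < *trunc_off);

            if (*trunc_off < key)
                    *trunc_off = key;       /* ratchet forward as we write */
            return(r);
    }

    int
    main(void)
    {
            int64_t trunc_off = 32768;      /* assume a pre-existing 32K file */
            int64_t key;

            for (key = 16384; key <= 65536; key += 16384) {
                    printf("record ending at %6lld: overwrite check %s\n",
                           (long long)key,
                           needs_overwrite_delete(&trunc_off, key, 16384) ?
                           "needed" : "skipped");
            }
            return(0);
    }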
+
 /*
  * Backend code.  Sync a record to the media.
  */
@@ -917,8 +947,13 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
         * it skips in-memory records.
         *
         * It is ok for the lookup to return ENOENT.
+        *
+        * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
+        * to call hammer_ip_delete_range() or not.  This also means we must
+        * update sync_trunc_off as we write.
         */
-       if (record->type == HAMMER_MEM_RECORD_DATA) {
+       if (record->type == HAMMER_MEM_RECORD_DATA &&
+           hammer_record_needs_overwrite_delete(record)) {
                file_offset = record->leaf.base.key - record->leaf.data_len;
                KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
                error = hammer_ip_delete_range(
@@ -1280,11 +1315,14 @@ next_btree:
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        error = hammer_btree_iterate(cursor);
                        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
-                       if (error == 0)
+                       if (error == 0) {
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
-                       else
+                               hammer_cache_node(cursor->node,
+                                                 &cursor->ip->cache[1]);
+                       } else {
                                cursor->flags |= HAMMER_CURSOR_DISKEOF |
                                                 HAMMER_CURSOR_ATEDISK;
+                       }
                }
        }
 
@@ -1615,6 +1653,9 @@ retry:
                        break;
                error = hammer_ip_next(cursor);
        }
+       if (cursor->node)
+               hammer_cache_node(cursor->node, &ip->cache[1]);
+
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
@@ -1690,6 +1731,8 @@ retry:
                        break;
                error = hammer_ip_next(cursor);
        }
+       if (cursor->node)
+               hammer_cache_node(cursor->node, &ip->cache[1]);
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
index 7c696bd..8bf3858 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.57 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.58 2008/06/17 04:02:38 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -791,14 +791,15 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush)
                hammer_lock_ex(&buffer->io.lock);
                if (buffer->io.lock.refs == 1) {
                        hammer_io_release(&buffer->io, flush);
-                       hammer_flush_buffer_nodes(buffer);
-                       KKASSERT(TAILQ_EMPTY(&buffer->clist));
-                       --hammer_count_refedbufs;
 
                        if (buffer->io.bp == NULL &&
                            buffer->io.lock.refs == 1) {
                                /*
                                 * Final cleanup
+                                *
+                                * NOTE: It is impossible for any associated
+                                * B-Tree nodes to have refs if the buffer
+                                * has no additional refs.
                                 */
                                RB_REMOVE(hammer_buf_rb_tree,
                                          &buffer->io.hmp->rb_bufs_root,
@@ -807,6 +808,10 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush)
                                buffer->volume = NULL; /* sanity */
                                hammer_rel_volume(volume, 0);
                                hammer_io_clear_modlist(&buffer->io);
+                               hammer_flush_buffer_nodes(buffer);
+                               KKASSERT(TAILQ_EMPTY(&buffer->clist));
+                               if (buffer->io.lock.refs == 1)
+                                       --hammer_count_refedbufs;
                                freeme = 1;
                        }
                }
@@ -1170,8 +1175,9 @@ hammer_uncache_node(struct hammer_node **cache)
                } else {
                        panic("hammer_uncache_node: missing cache linkage");
                }
-               if (node->cache1 == NULL && node->cache2 == NULL)
+               if (node->cache1 == NULL && node->cache2 == NULL) {
                        hammer_flush_node(node);
+               }
        }
 }
 
index 0a8a5bc..8041822 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.16 2008/06/12 00:16:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.17 2008/06/17 04:02:38 dillon Exp $
  */
 
 /*
@@ -53,7 +53,6 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp)
 {
        hammer_volume_t root_volume;
        hammer_blockmap_t undomap;
-       struct hammer_blockmap_layer2 *layer2;
        hammer_off_t result_offset;
        int i;
 
@@ -66,8 +65,7 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp)
        KKASSERT (zone3_off < undomap->alloc_offset);
 
        i = (zone3_off & HAMMER_OFF_SHORT_MASK) / HAMMER_LARGEBLOCK_SIZE;
-       layer2 = &root_volume->ondisk->vol0_undo_array[i];
-       result_offset = layer2->u.phys_offset +
+       result_offset = root_volume->ondisk->vol0_undo_array[i] +
                        (zone3_off & HAMMER_LARGEBLOCK_MASK64);
 
        hammer_rel_volume(root_volume, 0);
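
With the layer2 structures gone, each vol0_undo_array[] entry covers one 8MB
big-block and the translation is a plain indexed add.  A toy illustration
follows; the in-memory array, its zone-2 offsets, and the short-masked input
value are fabricated for the example:

    #include <stdio.h>
    #include <stdint.h>

    #define LARGEBLOCK_SIZE     (8192 * 1024)               /* 8MB per entry */
    #define LARGEBLOCK_MASK64   ((uint64_t)LARGEBLOCK_SIZE - 1)

    int
    main(void)
    {
            /* Pretend vol0_undo_array[]: zone-2 offsets of the undo big-blocks. */
            uint64_t undo_array[4] = {
                    0x2000000000000000ULL, 0x2000000000800000ULL,
                    0x2000000001000000ULL, 0x2000000001800000ULL
            };
            uint64_t zone3_off = 0x0000000000C00010ULL;     /* short offset only */
            int i = (int)(zone3_off / LARGEBLOCK_SIZE);
            uint64_t result = undo_array[i] + (zone3_off & LARGEBLOCK_MASK64);

            printf("undo idx=%d -> %016llx\n", i, (unsigned long long)result);
            return(0);
    }
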
index 911490d..2f5ab4e 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.47 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.48 2008/06/17 04:02:38 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -65,19 +65,27 @@ int hammer_count_record_datas;
 int hammer_count_volumes;
 int hammer_count_buffers;
 int hammer_count_nodes;
+int64_t hammer_stats_btree_lookups;
+int64_t hammer_stats_btree_searches;
+int64_t hammer_stats_btree_inserts;
+int64_t hammer_stats_btree_deletes;
+int64_t hammer_stats_btree_elements;
+int64_t hammer_stats_btree_splits;
+int64_t hammer_stats_btree_iterations;
+int64_t hammer_stats_record_iterations;
 int hammer_count_dirtybufs;            /* global */
 int hammer_count_refedbufs;            /* global */
 int hammer_count_reservations;
 int hammer_count_io_running_read;
 int hammer_count_io_running_write;
 int hammer_count_io_locked;
-int hammer_stats_btree_iterations;
-int hammer_stats_record_iterations;
 int hammer_limit_dirtybufs;            /* per-mount */
 int hammer_limit_irecs;                        /* per-inode */
 int hammer_limit_recs;                 /* as a whole XXX */
 int hammer_limit_iqueued;              /* per-mount */
 int hammer_bio_count;
+int hammer_verify_zone;
+int hammer_write_mode;
 int64_t hammer_contention_count;
 int64_t hammer_zone_limit;
 
@@ -128,6 +136,22 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, count_buffers, CTLFLAG_RD,
           &hammer_count_buffers, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_nodes, CTLFLAG_RD,
           &hammer_count_nodes, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_searches, CTLFLAG_RD,
+          &hammer_stats_btree_searches, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_lookups, CTLFLAG_RD,
+          &hammer_stats_btree_lookups, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_inserts, CTLFLAG_RD,
+          &hammer_stats_btree_inserts, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_deletes, CTLFLAG_RD,
+          &hammer_stats_btree_deletes, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_elements, CTLFLAG_RD,
+          &hammer_stats_btree_elements, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_splits, CTLFLAG_RD,
+          &hammer_stats_btree_splits, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_btree_iterations, CTLFLAG_RD,
+          &hammer_stats_btree_iterations, 0, "");
+SYSCTL_QUAD(_vfs_hammer, OID_AUTO, stats_record_iterations, CTLFLAG_RD,
+          &hammer_stats_record_iterations, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_dirtybufs, CTLFLAG_RD,
           &hammer_count_dirtybufs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_refedbufs, CTLFLAG_RD,
@@ -144,6 +168,10 @@ SYSCTL_QUAD(_vfs_hammer, OID_AUTO, zone_limit, CTLFLAG_RW,
           &hammer_zone_limit, 0, "");
 SYSCTL_QUAD(_vfs_hammer, OID_AUTO, contention_count, CTLFLAG_RW,
           &hammer_contention_count, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, verify_zone, CTLFLAG_RW,
+          &hammer_verify_zone, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, write_mode, CTLFLAG_RW,
+          &hammer_write_mode, 0, "");
 
 /*
  * VFS ABI
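
The new counters and knobs all surface under vfs.hammer.*.  As a usage
illustration, a small userland program can sample one of the 64-bit B-Tree
counters with sysctlbyname(); this sketch is not part of the commit and the
output formatting is arbitrary:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            int64_t lookups;
            size_t len = sizeof(lookups);

            /* Read one of the new B-Tree statistics counters. */
            if (sysctlbyname("vfs.hammer.stats_btree_lookups",
                             &lookups, &len, NULL, 0) < 0) {
                    perror("sysctlbyname");
                    return(1);
            }
            printf("btree lookups since boot: %lld\n", (long long)lookups);
            return(0);
    }
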
@@ -235,7 +263,6 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                hmp = kmalloc(sizeof(*hmp), M_HAMMER, M_WAITOK | M_ZERO);
                mp->mnt_data = (qaddr_t)hmp;
                hmp->mp = mp;
-               hmp->zbuf = kmalloc(HAMMER_BUFSIZE, M_HAMMER, M_WAITOK|M_ZERO);
                hmp->namekey_iterator = mycpu->gd_time_seconds;
                /*TAILQ_INIT(&hmp->recycle_list);*/
 
@@ -265,24 +292,6 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                TAILQ_INIT(&hmp->objid_cache_list);
                TAILQ_INIT(&hmp->undo_lru_list);
                TAILQ_INIT(&hmp->reclaim_list);
-
-               /*
-                * Set default zone limits.  This value can be reduced
-                * further by the zone limit specified in the root volume.
-                *
-                * The sysctl can force a small zone limit for debugging
-                * purposes.
-                */
-               for (i = 0; i < HAMMER_MAX_ZONES; ++i) {
-                       hmp->zone_limits[i] =
-                               HAMMER_ZONE_ENCODE(i, HAMMER_ZONE_LIMIT);
-
-                       if (hammer_zone_limit) {
-                               hmp->zone_limits[i] =
-                                   HAMMER_ZONE_ENCODE(i, hammer_zone_limit);
-                       }
-                       hammer_init_holes(hmp, &hmp->holes[i]);
-               }
        }
        hmp->hflags &= ~HMNT_USERFLAGS;
        hmp->hflags |= info.hflags & HMNT_USERFLAGS;
@@ -454,20 +463,6 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
              sizeof(hmp->blockmap));
        hmp->copy_stat_freebigblocks = rootvol->ondisk->vol0_stat_freebigblocks;
 
-       /*
-        * Use the zone limit set by newfs_hammer, or the zone limit set by
-        * sysctl (for debugging), whichever is smaller.
-        */
-       if (rootvol->ondisk->vol0_zone_limit) {
-               hammer_off_t vol0_zone_limit;
-
-               vol0_zone_limit = rootvol->ondisk->vol0_zone_limit;
-               for (i = 0; i < HAMMER_MAX_ZONES; ++i) {
-                       if (hmp->zone_limits[i] > vol0_zone_limit)
-                               hmp->zone_limits[i] = vol0_zone_limit;
-               }
-       }
-
        hammer_flusher_create(hmp);
 
        /*
@@ -528,7 +523,6 @@ static void
 hammer_free_hmp(struct mount *mp)
 {
        struct hammer_mount *hmp = (void *)mp->mnt_data;
-       int i;
 
 #if 0
        /*
@@ -539,13 +533,9 @@ hammer_free_hmp(struct mount *mp)
                hmp->rootvp = NULL;
        }
 #endif
-       kprintf("X1");
        hammer_flusher_sync(hmp);
-       kprintf("X2");
        hammer_flusher_sync(hmp);
-       kprintf("X3");
        hammer_flusher_destroy(hmp);
-       kprintf("X4");
 
        KKASSERT(RB_EMPTY(&hmp->rb_inos_root));
 
@@ -568,27 +558,14 @@ hammer_free_hmp(struct mount *mp)
         */
         RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_unload_buffer, NULL);
-       kprintf("X5");
        RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                hammer_unload_volume, NULL);
-       kprintf("X6");
 
        mp->mnt_data = NULL;
        mp->mnt_flag &= ~MNT_LOCAL;
        hmp->mp = NULL;
-       kprintf("X7");
        hammer_destroy_objid_cache(hmp);
-       kprintf("X8");
-       kfree(hmp->zbuf, M_HAMMER);
-       kprintf("X9");
-       kprintf("X10");
-
-       for (i = 0; i < HAMMER_MAX_ZONES; ++i)
-               hammer_free_holes(hmp, &hmp->holes[i]);
-       kprintf("X11");
-
        kfree(hmp, M_HAMMER);
-       kprintf("X12");
 }
 
 /*
index 5a995b6..842d710 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.70 2008/06/14 01:42:13 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.71 2008/06/17 04:02:38 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -271,11 +271,13 @@ hammer_vop_write(struct vop_write_args *ap)
        int n;
        int flags;
        int count;
+       int seqcount;
 
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
+       seqcount = ap->a_ioflag >> 16;
 
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
@@ -437,12 +439,25 @@ hammer_vop_write(struct vop_write_args *ap)
 
                /*
                 * Final buffer disposition.
+                *
+                * If write_mode is non-zero we call bawrite()
+                * unconditionally.  Otherwise we only use bawrite()
+                * if the writes are clearly sequential.
                 */
+               bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
+               } else if (hammer_write_mode &&
+                          (uio->uio_offset & HAMMER_BUFMASK) == 0) {
 #if 1
+                       /* strategy write cannot handle clustered writes */
+                       bp->b_flags |= B_CLUSTEROK;
+                       cluster_write(bp, ip->ino_data.size, seqcount);
+#else
+                       bawrite(bp);
+#endif
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
@@ -453,7 +468,6 @@ hammer_vop_write(struct vop_write_args *ap)
                         * flushes.
                         */
                        bawrite(bp);
-#endif
                } else {
                        bdwrite(bp);
                }
@@ -2103,6 +2117,9 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
         * to raw addresses.
         */
        hammer_simple_transaction(&trans, ip->hmp);
+#if 0
+       kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
+#endif
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 
        /*
@@ -2202,8 +2219,12 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
                "", base_disk_offset, last_disk_offset);
 #endif
 
-       if (cursor.node)
+       if (cursor.node) {
                hammer_cache_node(cursor.node, &ip->cache[1]);
+#if 0
+               kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
+#endif
+       }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);