hammer2 - freemap part 3 - group by allocation size
authorMatthew Dillon <dillon@apollo.backplane.com>
Tue, 4 Jun 2013 21:29:20 +0000 (14:29 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Tue, 4 Jun 2013 21:29:20 +0000 (14:29 -0700)
* Each freemap leaf represents ~2MB worth of storage.  Assign a radix to
  each leaf, limiting allocations from that leaf to that radix.

  This primarily results in inodes being grouped together, improving
  the performance for find, ls or other topological scans.  We could
  improve this but for now we'll stick with it as-is.

  This mechanism also allows us to use cluster_read().  This function is
  used for everything except volume-header and freemap elements.

* More formally handle logical sizes vs allocation sizes vs device I/O
  sizes.  For example, a 1KB inode allocates 1KB using 16KB device I/O's.

* Beef up the sysctl I/O counters.

sys/vfs/hammer2/DESIGN
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_vfsops.c

index 47ff7bd..5d9aab5 100644 (file)
@@ -184,6 +184,17 @@ but doesn't complicate anything else.  The inode number is stored in the
 inode itself, an absolutely necessary feature in order to support the
 hugely flexible snapshots that we want to have in HAMMER2.
 
+                           DISK I/O OPTIMIZATIONS
+
+The freemap implements a 1KB allocation resolution.  The minimum I/O size
+is 16KB.  HAMMER2 typically implements 16KB and 64KB physical I/O sizes
+and will cluster larger I/O's.
+
+Each 2MB segment managed by the freemap handles just one particular
+physical I/O size.  Typically this means that inodes, small data, and
+initial (small) indirect blocks get clustered together.  Also large 64KB
+file-data and indirect blocks get clustered together.
+
                                  HARDLINKS
 
 Hardlinks are a particularly sticky problem for HAMMER2 due to the lack of
index 707aec9..55a1438 100644 (file)
@@ -415,7 +415,7 @@ struct hammer2_mount {
        hammer2_trans_t *curflush;      /* current flush in progress */
        hammer2_tid_t   topo_flush_tid; /* currently synchronizing flush pt */
        hammer2_tid_t   free_flush_tid; /* currently synchronizing flush pt */
-       hammer2_off_t   heur_last_alloc;
+       hammer2_off_t   heur_freemap[HAMMER2_MAX_RADIX+1];
        int             flushcnt;       /* #of flush trans on the list */
 
        int             volhdrno;       /* last volhdrno written */
@@ -454,6 +454,30 @@ MALLOC_DECLARE(M_HAMMER2);
 #define VTOI(vp)       ((hammer2_inode_t *)(vp)->v_data)
 #define ITOV(ip)       ((ip)->vp)
 
+static __inline
+int
+hammer2_devblkradix(int radix)
+{
+       int cluster_radix;
+
+       if (radix <= HAMMER2_LBUFRADIX)
+               cluster_radix = HAMMER2_LBUFRADIX;
+       else
+               cluster_radix = HAMMER2_PBUFRADIX;
+       return(cluster_radix);
+}
+
+static __inline
+size_t
+hammer2_devblksize(size_t bytes)
+{
+       if (bytes <= HAMMER2_LBUFSIZE)
+               return(HAMMER2_LBUFSIZE);
+       else
+               return(HAMMER2_PBUFSIZE);
+}
+
+
 static __inline
 hammer2_pfsmount_t *
 MPTOPMP(struct mount *mp)
@@ -498,6 +522,8 @@ extern int hammer2_hardlink_enable;
 extern long hammer2_iod_file_read;
 extern long hammer2_iod_meta_read;
 extern long hammer2_iod_indr_read;
+extern long hammer2_iod_fmap_read;
+extern long hammer2_iod_volu_read;
 extern long hammer2_iod_file_write;
 extern long hammer2_iod_meta_write;
 extern long hammer2_iod_indr_write;
@@ -506,6 +532,8 @@ extern long hammer2_iod_volu_write;
 extern long hammer2_ioa_file_read;
 extern long hammer2_ioa_meta_read;
 extern long hammer2_ioa_indr_read;
+extern long hammer2_ioa_fmap_read;
+extern long hammer2_ioa_volu_read;
 extern long hammer2_ioa_file_write;
 extern long hammer2_ioa_meta_write;
 extern long hammer2_ioa_indr_write;
index a479fad..3686388 100644 (file)
@@ -72,6 +72,7 @@ static int hammer2_indirect_optimize; /* XXX SYSCTL */
 static hammer2_chain_t *hammer2_chain_create_indirect(
                hammer2_trans_t *trans, hammer2_chain_t *parent,
                hammer2_key_t key, int keybits, int for_type, int *errorp);
+static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
 
 /*
  * We use a red-black tree to guarantee safe lookups under shared locks.
@@ -99,6 +100,20 @@ hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
        return(0);
 }
 
+static __inline
+int
+hammer2_isclusterable(hammer2_chain_t *chain)
+{
+       if (hammer2_cluster_enable) {
+               if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                   chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
+                   chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+                       return(1);
+               }
+       }
+       return(0);
+}
+
 /*
  * Recursively set the SUBMODIFIED flag up to the root starting at chain's
  * parent.  SUBMODIFIED is not set in chain itself.
@@ -493,10 +508,11 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
        hammer2_chain_core_t *core;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
+       hammer2_off_t pmask;
        hammer2_off_t peof;
        ccms_state_t ostate;
        size_t boff;
-       size_t bbytes;
+       size_t psize;
        int error;
        char *bdata;
 
@@ -574,27 +590,29 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
         */
        bref = &chain->bref;
 
-       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
-               bbytes = HAMMER2_MINIOSIZE;
-       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
-       peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
-       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+       psize = hammer2_devblksize(chain->bytes);
+       pmask = (hammer2_off_t)psize - 1;
+       pbase = bref->data_off & ~pmask;
+       boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
        KKASSERT(pbase != 0);
+       peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
 
        /*
         * The getblk() optimization can only be used on newly created
         * elements if the physical block size matches the request.
         */
        if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
-           chain->bytes == bbytes) {
-               chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+           chain->bytes == psize) {
+               chain->bp = getblk(hmp->devvp, pbase, psize, 0, 0);
                error = 0;
-       } else if (hammer2_cluster_enable) {
-               error = cluster_read(hmp->devvp, peof, pbase, bbytes,
-                                    HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
+       } else if (hammer2_isclusterable(chain)) {
+               error = cluster_read(hmp->devvp, peof, pbase, psize,
+                                    psize, HAMMER2_PBUFSIZE*4,
                                     &chain->bp);
+               adjreadcounter(&chain->bref, chain->bytes);
        } else {
-               error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
+               error = bread(hmp->devvp, pbase, psize, &chain->bp);
+               adjreadcounter(&chain->bref, chain->bytes);
        }
 
        if (error) {
@@ -788,7 +806,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
                        counterp = &hammer2_ioa_volu_write;
                        break;
                }
-               ++*counterp;
+               *counterp += chain->bytes;
        } else {
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_DATA:
@@ -808,7 +826,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
                        counterp = &hammer2_iod_volu_write;
                        break;
                }
-               ++*counterp;
+               *counterp += chain->bytes;
        }
 
        /*
@@ -1006,11 +1024,13 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        hammer2_mount_t *hmp = trans->hmp;
        hammer2_chain_t *chain;
        hammer2_off_t pbase;
+       hammer2_off_t pmask;
+       hammer2_off_t peof;
        hammer2_tid_t flush_tid;
        struct buf *nbp;
        int error;
        int wasinitial;
-       size_t bbytes;
+       size_t psize;
        size_t boff;
        void *bdata;
 
@@ -1177,29 +1197,33 @@ skipxx: /* XXX */
                 */
                KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);
 
-               /*
-                * The device buffer may be larger than the allocation size.
-                */
-               if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
-                       bbytes = HAMMER2_MINIOSIZE;
-               pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
-               boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+               psize = hammer2_devblksize(chain->bytes);
+               pmask = (hammer2_off_t)psize - 1;
+               pbase = chain->bref.data_off & ~pmask;
+               boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
+               KKASSERT(pbase != 0);
+               peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
 
                /*
-                * Buffer aliasing is possible, check for the case.
-                *
                 * The getblk() optimization can only be used if the
-                * physical block size matches the request.
+                * chain element size matches the physical block size.
                 */
                if (chain->bp && chain->bp->b_loffset == pbase) {
                        nbp = chain->bp;
-               } else if (chain->bytes == bbytes) {
-                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
                        error = 0;
+               } else if (chain->bytes == psize) {
+                       nbp = getblk(hmp->devvp, pbase, psize, 0, 0);
+                       error = 0;
+               } else if (hammer2_isclusterable(chain)) {
+                       error = cluster_read(hmp->devvp, peof, pbase, psize,
+                                            psize, HAMMER2_PBUFSIZE*4,
+                                            &nbp);
+                       adjreadcounter(&chain->bref, chain->bytes);
                } else {
-                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
-                       KKASSERT(error == 0);
+                       error = bread(hmp->devvp, pbase, psize, &nbp);
+                       adjreadcounter(&chain->bref, chain->bytes);
                }
+               KKASSERT(error == 0);
                bdata = (char *)nbp->b_data + boff;
 
                /*
@@ -1226,6 +1250,7 @@ skipxx: /* XXX */
                if (chain->bp != nbp) {
                        if (chain->bp) {
                                if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
+                                       chain->bp->b_flags |= B_CLUSTEROK;
                                        bdwrite(chain->bp);
                                } else {
                                        chain->bp->b_flags |= B_RELBUF;
@@ -3353,3 +3378,30 @@ hammer2_chain_wait(hammer2_chain_t *chain)
 {
        tsleep(chain, 0, "chnflw", 1);
 }
+
+static
+void
+adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
+{
+       long *counterp;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_DATA:
+               counterp = &hammer2_iod_file_read;
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               counterp = &hammer2_iod_meta_read;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               counterp = &hammer2_iod_indr_read;
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+       case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
+               counterp = &hammer2_iod_fmap_read;
+               break;
+       default:
+               counterp = &hammer2_iod_volu_read;
+               break;
+       }
+       *counterp += bytes;
+}
index faf8fe6..cbe2ae8 100644 (file)
  *                       blocks except the block straddling EOF.
  *
  * HAMMER2_SEGSIZE     - Allocation map segment size, typically 2MB
+ *                       (space represented by a level0 bitmap).
  */
 
-#define HAMMER2_SEGSIZE                (65536 * 8)
+#define HAMMER2_SEGSIZE                (1 << HAMMER2_FREEMAP_LEVEL0_RADIX)
 
 #define HAMMER2_PBUFRADIX      16      /* physical buf (1<<16) bytes */
 #define HAMMER2_PBUFSIZE       65536
 /*
  * Generally speaking we want to use 16K and 64K I/Os
  */
-#if 1
 #define HAMMER2_MINIORADIX     HAMMER2_LBUFRADIX
 #define HAMMER2_MINIOSIZE      HAMMER2_LBUFSIZE
-#else
-#define HAMMER2_MINIORADIX     10
-#define HAMMER2_MINIOSIZE      1024
-#endif
 
 #define HAMMER2_IND_BYTES_MIN  HAMMER2_LBUFSIZE
 #define HAMMER2_IND_BYTES_MAX  HAMMER2_PBUFSIZE
@@ -403,7 +399,7 @@ struct hammer2_blockref {           /* MUST BE EXACTLY 64 BYTES */
                /*
                 * Freemap hints are embedded in addition to the icrc32.
                 *
-                * biggest - largest possible allocation 2^N within sub-tree.
+                * biggest - Largest possible allocation 2^N within sub-tree.
                 *           typically initialized to 64 in freemap_blockref
                 *           and reduced as-needed when a request fails.
                 *
@@ -412,11 +408,15 @@ struct hammer2_blockref {         /* MUST BE EXACTLY 64 BYTES */
                 *           biggest hint will be adjusted downward.
                 *
                 *           Used when allocating space.
+                *
+                * radix   - (Leaf only) once assigned, radix for clustering.
+                *           All device I/O can cluster within the 2MB
+                *           segment.
                 */
                struct {
                        uint32_t icrc32;
                        uint8_t biggest;
-                       uint8_t reserved05;
+                       uint8_t radix;          /* 0, LBUFRADIX, PBUFRADIX */
                        uint8_t reserved06;
                        uint8_t reserved07;
                        uint64_t avail;         /* total available bytes */
index 8f1a1fe..805480e 100644 (file)
@@ -327,10 +327,11 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
+       hammer2_off_t pmask;
        hammer2_tid_t saved_sync;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_core_t *core;
-       size_t bbytes;
+       size_t psize;
        size_t boff;
        char *bdata;
        struct buf *bp;
@@ -660,11 +661,12 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                 * Make sure any device buffer(s) have been flushed out here.
                 * (there aren't usually any to flush).
                 */
-               bbytes = chain->bytes;
-               pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
-               boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+               psize = hammer2_devblksize(chain->bytes);
+               pmask = (hammer2_off_t)psize - 1;
+               pbase = chain->bref.data_off & ~pmask;
+               boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
 
-               bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
+               bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
@@ -730,22 +732,18 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
-               if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
-                       bbytes = HAMMER2_MINIOSIZE;
-               pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
-               boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+               psize = hammer2_devblksize(chain->bytes);
+               pmask = (hammer2_off_t)psize - 1;
+               pbase = bref->data_off & ~pmask;
+               boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);
 
                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
-               if (chain->bytes == bbytes) {
-                       bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
-                       error = 0;
-               } else {
-                       error = bread(hmp->devvp, pbase, bbytes, &bp);
-                       KKASSERT(error == 0);
-               }
+               error = bread(hmp->devvp, pbase, psize, &bp);
+               KKASSERT(error == 0);
+
                bdata = (char *)bp->b_data + boff;
 
                /*
@@ -756,6 +754,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;
+
                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
index b55a158..d14205c 100644 (file)
@@ -58,6 +58,13 @@ static int hammer2_freemap_iterate(hammer2_trans_t *trans,
 
 #endif
 
+static __inline
+int
+hammer2_freemapradix(int radix)
+{
+       return(radix);
+}
+
 /*
  * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
  * bref.  Return a combined media offset and physical size radix.  Freemap
@@ -90,13 +97,11 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
 
        /*
         * Adjust by HAMMER2_ZONE_FREEMAP_{A,B,C,D} using the existing
-        * offset as a basis.
+        * offset as a basis.  Start in zone A if previously unallocated.
         */
        if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
                off = HAMMER2_ZONE_FREEMAP_A;
        } else {
-               off = HAMMER2_ZONE_FREEMAP_A;
-#if 0
                off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
                      (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
                off = off / HAMMER2_PBUFSIZE;
@@ -111,7 +116,6 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
                        off = HAMMER2_ZONE_FREEMAP_C;
                else
                        off = HAMMER2_ZONE_FREEMAP_B;
-#endif
        }
        off = off * HAMMER2_PBUFSIZE;
 
@@ -293,6 +297,7 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
        hammer2_chain_t *parent;
        hammer2_off_t bpref;
        hammer2_off_t bnext;
+       int freemap_radix;
        int radix;
        int error;
 
@@ -315,17 +320,25 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
        return (hammer2_freemap_simple_alloc(hmp, bref, radix));
 #else
 
-       /*
-        * Calculate actual allocation in bytes, and radix.  This ensures
-        * a minimum 1KB allocation.
-        */
        KKASSERT(bytes >= HAMMER2_MIN_ALLOC &&
                 bytes <= HAMMER2_MAX_ALLOC);
 
-#if 0
        /*
-        * Calculate starting point
+        * Calculate the starting point for our allocation search.
+        *
+        * Each freemap leaf is dedicated to a specific freemap_radix.
+        * The freemap_radix can be more fine-grained than the device buffer
+        * radix which results in inodes being grouped together in their
+        * own segment, terminal-data (16K or less) and initial indirect
+        * block being grouped together, and then full-indirect and full-data
+        * blocks (64K) being grouped together.
+        *
+        * The single most important aspect of this is the inode grouping
+        * because that is what allows 'find' and 'ls' and other filesystem
+        * topology operations to run fast.
         */
+       freemap_radix = hammer2_freemapradix(radix);
+#if 0
        if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
                bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
        else if (trans->tmp_bpref)
@@ -334,7 +347,8 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
                bpref = trans->tmp_ip->chain->bref.data_off;
        else
 #endif
-               bpref = hmp->heur_last_alloc;   /* SMP race ok, heuristic */
+       KKASSERT(radix >= 0 && radix <= HAMMER2_MAX_RADIX);
+       bpref = hmp->heur_freemap[freemap_radix];
 
        /*
         * Make sure bpref is in-bounds.  It's ok if bpref covers a zone's
@@ -355,7 +369,7 @@ hammer2_freemap_alloc(hammer2_trans_t *trans,
                error = hammer2_freemap_try_alloc(trans, &parent, bref,
                                                  radix, bpref, &bnext);
        }
-       hmp->heur_last_alloc = bnext;   /* XXX */
+       hmp->heur_freemap[freemap_radix] = bnext;
        hammer2_chain_unlock(parent);
 
        return (error);
@@ -412,6 +426,8 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        int index;
        int count;
        int subindex;
+       int freemap_radix;
+       int devblk_radix;
 
        /*
         * Calculate the number of bytes being allocated, the number
@@ -425,6 +441,9 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        bits = 1 << (radix - HAMMER2_MIN_RADIX);
        mask = (bits == 64) ? (uint64_t)-1 : (((uint64_t)1 << bits) - 1);
 
+       devblk_radix = hammer2_devblkradix(radix);
+       freemap_radix = hammer2_freemapradix(radix);
+
        /*
         * Lookup the level0 freemap chain, creating and initializing one
         * if necessary.  Intermediate levels will be created automatically
@@ -459,6 +478,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                        chain->bref.check.freemap.biggest =
                                        HAMMER2_FREEMAP_LEVEL0_RADIX;
                        chain->bref.check.freemap.avail = l0size;
+                       chain->bref.check.freemap.radix = freemap_radix;
 
                        /*
                         * Preset bitmap for existing static allocations.
@@ -508,6 +528,11 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 * Already flagged as not having enough space
                 */
                error = ENOSPC;
+       } else if (chain->bref.check.freemap.radix != freemap_radix) {
+               /*
+                * Wrong cluster radix, cannot allocate from this leaf.
+                */
+               error = ENOSPC;
        } else {
                /*
                 * Modify existing chain to setup for adjustment.
@@ -532,45 +557,23 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
         * Allocate data and meta-data from the beginning and inodes
         * from the end.
         */
-       if (bref->type != HAMMER2_BREF_TYPE_INODE) {
-               for (index = 0; index < count; ++index) {
-                       if (data[index] == (uint64_t)-1) /* all allocated */
-                               continue;
-                       tmp_mask = mask;                 /* iterate */
-                       for (subindex = 0; subindex < 64; subindex += bits) {
-                               if ((data[index] & tmp_mask) == 0)
-                                       break;
-                               tmp_mask <<= bits;
-                       }
-                       if (subindex != 64) {
-                               key += HAMMER2_MIN_ALLOC * 64 * index;
-                               key += HAMMER2_MIN_ALLOC * subindex;
+       for (index = 0; index < count; ++index) {
+               if (data[index] == (uint64_t)-1) /* all allocated */
+                       continue;
+               tmp_mask = mask;                 /* iterate */
+               for (subindex = 0; subindex < 64; subindex += bits) {
+                       if ((data[index] & tmp_mask) == 0)
                                break;
-                       }
+                       tmp_mask <<= bits;
                }
-               if (index == count)
-                       error = ENOSPC;
-       } else {
-               for (index = count - 1; index >= 0; --index) {
-                       if (data[index] == (uint64_t)-1) /* all allocated */
-                               continue;
-                       tmp_mask = mask << (64 - bits);
-                       for (subindex = 64 - bits;
-                            subindex >= 0;
-                            subindex -= bits) {
-                               if ((data[index] & tmp_mask) == 0)
-                                       break;
-                               tmp_mask >>= bits;
-                       }
-                       if (subindex != -bits) {
-                               key += HAMMER2_MIN_ALLOC * 64 * index;
-                               key += HAMMER2_MIN_ALLOC * subindex;
-                               break;
-                       }
+               if (subindex != 64) {
+                       key += HAMMER2_MIN_ALLOC * 64 * index;
+                       key += HAMMER2_MIN_ALLOC * subindex;
+                       break;
                }
-               if (index == -1)
-                       error = ENOSPC;
        }
+       if (index == count)
+               error = ENOSPC;
 
 skip:
        if (error == 0) {
@@ -589,20 +592,15 @@ skip:
                /*
                 * Modify the chain and set the bitmap appropriately.
                 *
-                * Determine if we can massage the buffer cache buffer
-                * to avoid a read.  If the allocation is smaller than
-                * the minimum IO size we look at the bitmap mask covering
-                * the allocation at the minimum IO size.  If it is
-                * unallocated we instantiate and clear the buffer which
-                * marks it B_CACHE and validates it without issuing a read.
-                *
-                * For allocation requests >= MINIOSIZE other code will deal
-                * with the read-avoidance when the chain is locked.
+                * For smaller allocations try to avoid a read-before-write
+                * by priming the buffer cache buffer.  The caller handles
+                * read-avoidance for larger allocations (or more properly,
+                * when the chain is locked).
                 */
                prebuf = 0;
                hammer2_chain_modify(trans, &chain, 0);
                data = &chain->data->bmdata.array[0];
-               if (radix < HAMMER2_MINIORADIX) {
+               if (radix != devblk_radix) {
                        uint64_t iomask;
                        int iobmradix = HAMMER2_MINIORADIX - HAMMER2_MIN_RADIX;
                        int ioindex;
@@ -631,11 +629,15 @@ skip:
                if (prebuf) {
                        struct buf *bp;
                        hammer2_off_t pbase;
+                       hammer2_off_t csize;
+                       hammer2_off_t cmask;
 
-                       pbase = key & ~(hammer2_off_t)(HAMMER2_MINIOSIZE - 1);
+                       csize = (hammer2_off_t)1 << devblk_radix;
+                       cmask = csize - 1;
+                       pbase = key & ~cmask;
 
-                       bp = getblk(hmp->devvp, pbase,
-                                   HAMMER2_MINIOSIZE, GETBLK_NOWAIT, 0);
+                       bp = getblk(hmp->devvp, pbase, csize,
+                                   GETBLK_NOWAIT, 0);
                        if (bp) {
                                if ((bp->b_flags & B_CACHE) == 0)
                                        vfs_bio_clrbuf(bp);
@@ -668,24 +670,6 @@ skip:
        return (error);
 }
 
-#if 0
-       /*
-        * When making meta-data allocations smaller than LBUFSIZE we will
-        * use a LBUFSIZE'd buffer.  The first chunk allocated from such a
-        * buffer instantiates a device buffer and marks it clean to avoid
-        * unnecessary read-before-write ops.  XXX buffer cache buffer
-        * sharing.  XXX mixed data/meta-data issues.
-        */
-       if (bytes < HAMMER2_MINIOSIZE &&
-           (data_off & (HAMMER2_MINIOSIZE - 1)) == 0 &&
-           (bitmap shows this is the initial allocation)) {
-               bp = getblk(hmp->devvp, data_off, HAMMER2_MINIOSIZE, 0, 0);
-               bp->b_flags |= B_CACHE;
-               bp->b_resid = 0;
-               bqrelse(bp);
-       }
-#endif
-
 static int
 hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                        hammer2_chain_t **chainp,
index 4117579..074ee2e 100644 (file)
@@ -61,11 +61,13 @@ static struct hammer2_mntlist hammer2_mntlist;
 static struct lock hammer2_mntlk;
 
 int hammer2_debug;
-int hammer2_cluster_enable = 0;        /* XXX temporary until layout ironed out */
+int hammer2_cluster_enable = 1;
 int hammer2_hardlink_enable = 1;
 long hammer2_iod_file_read;
 long hammer2_iod_meta_read;
 long hammer2_iod_indr_read;
+long hammer2_iod_fmap_read;
+long hammer2_iod_volu_read;
 long hammer2_iod_file_write;
 long hammer2_iod_meta_write;
 long hammer2_iod_indr_write;
@@ -74,6 +76,8 @@ long hammer2_iod_volu_write;
 long hammer2_ioa_file_read;
 long hammer2_ioa_meta_read;
 long hammer2_ioa_indr_read;
+long hammer2_ioa_fmap_read;
+long hammer2_ioa_volu_read;
 long hammer2_ioa_fmap_write;
 long hammer2_ioa_file_write;
 long hammer2_ioa_meta_write;
@@ -88,32 +92,48 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
+
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
+          &hammer2_iod_fmap_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
+          &hammer2_iod_volu_read, 0, "");
+
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
+          &hammer2_iod_fmap_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");
+
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
+          &hammer2_ioa_fmap_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
+          &hammer2_ioa_volu_read, 0, "");
+
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
+          &hammer2_ioa_fmap_write, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");