author     Matthew Dillon <dillon@dragonflybsd.org>
           Fri, 12 Oct 2007 18:57:45 +0000 (18:57 +0000)
committer  Matthew Dillon <dillon@dragonflybsd.org>
           Fri, 12 Oct 2007 18:57:45 +0000 (18:57 +0000)

Clean up the structural organization.  Separate out A-lists and make
the header filenames conform to a single standard.  Add a super-cluster
header and reorganize the A-list topology used in the cluster header.

Augment A-lists to support hinting across A-list layers.  This allows
A-lists to be glued together hierarchically and to pass hinting information
between the layers.  This support will be used by HAMMER to manage cluster
allocation via the A-lists embedded in volume and super-cluster headers,
and to manage B-Tree, record, and data allocation via the A-lists embedded
in the cluster header and in individual filesystem buffers.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_alist.c
sys/vfs/hammer/hammer_alist.h [new file with mode: 0644]
sys/vfs/hammer/hammer_btree.h
sys/vfs/hammer/hammer_disk.h [moved from sys/vfs/hammer/hammerfs.h with 65% similarity]
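
As an illustrative sketch of how the layered A-list interface described
above is meant to be wired up (modeled on the stand-alone debug harness at
the end of hammer_alist.c in this commit; the my_radix_* callbacks are
hypothetical placeholders for whatever implements the lower allocation
layer, and the NULL malloc type mirrors the stand-alone build):

    /* 1024 total blocks; each group of 64 is handled by a lower layer */
    hammer_alist_t live = hammer_alist_create(1024, 64, NULL);

    /* base_radix != 1, so the recursion callbacks must be supplied */
    live->config->bl_radix_init      = my_radix_init;
    live->config->bl_radix_alloc_fwd = my_radix_alloc_fwd;
    live->config->bl_radix_alloc_rev = my_radix_alloc_rev;
    live->config->bl_radix_free      = my_radix_free;
    live->config->bl_radix_print     = my_radix_print;

    /* an A-list starts all-allocated; free the range it is to manage */
    hammer_alist_free(live, 0, 1024);

    /* allocations are restricted to powers of 2 */
    int32_t blk = hammer_alist_alloc(live, 4);
    if (blk != HAMMER_ALIST_BLOCK_NONE)
            hammer_alist_free(live, blk, 4);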

index 4599e46..d289986 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.1 2007/10/10 19:37:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.2 2007/10/12 18:57:45 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
- * implementation.  See hammerfs.h for on-disk structures.
+ * implementation.  See hammer_disk.h for on-disk structures.
  */
 
 #include <sys/tree.h>
 #include <sys/malloc.h>
-#include "hammerfs.h"
+#include <sys/buf.h>
+#include "hammer_disk.h"
+#include "hammer_alist.h"
 #include "hammer_mount.h"
 
 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
@@ -61,8 +63,21 @@ typedef struct hammer_inode_info {
  */
 struct hammer_btree_info {
        struct hammer_base_elm key;
+       struct hammer_cluster *cluster;
+       struct buf *bp_node;
+       struct buf *bp2;
+       struct buf *bp3;
+       struct hammer_btree_node *node; /* marker for insertion/deletion */
+       int index;                      /* marker for insertion/deletion */
+       union hammer_btree_elm *elm;    /* marker for insertion/deletion */
+       union hammer_record_ondisk *rec;/* returned record pointer */
+       union hammer_data_ondisk *data; /* returned data pointer */
 };
 
+#define HAMMER_BTREE_GET_RECORD                0x0001
+#define HAMMER_BTREE_GET_DATA          0x0002
+#define HAMMER_BTREE_CLUSTER_TAG       0x0004  /* stop at the cluster tag */
+
 /*
  * Structures used to internally represent an inode
  */
@@ -76,7 +91,9 @@ struct hammer_inode {
        RB_ENTRY(hammer_inode) rb_node;
        u_int64_t       obj_id;         /* (key) object identifier */
        hammer_tid_t    obj_asof;       /* (key) snapshot transid or 0 */
-       struct hammer_mount *hmp;
+       struct vnode    *vp;
+       struct hammer_inode_record ino_rec;
+       struct hammer_inode_data ino_data;
 };
 
 /*
@@ -96,20 +113,29 @@ RB_PROTOTYPE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
 struct hammer_volume {
        RB_ENTRY(hammer_volume) rb_node;
        struct hammer_clu_rb_tree rb_clus_root;
+       struct buf *bp;
        struct hammer_volume_ondisk *ondisk;
        int32_t vol_no;
        int32_t vol_clsize;
        int64_t cluster_base;   /* base offset of cluster 0 */
-       char *vol_name;
+       char    *vol_name;
        struct vnode *devvp;
        struct hammer_mount *hmp;
+       int     lock_count;
+       int     ref_count;
+       int     flags;
 };
 
+#define HAMFS_CLUSTER_DIRTY    0x0001
+
 struct hammer_cluster {
        RB_ENTRY(hammer_cluster) rb_node;
+       struct buf *bp;
        struct hammer_cluster_ondisk *ondisk;
        struct hammer_volume *volume;
        int32_t clu_no;
+       int     lock_count;
+       int     ref_count;
 };
 
 /*
@@ -138,13 +164,14 @@ int       hammer_vfs_vget(struct mount *mp, ino_t ino, struct  vnode **vpp);
 int    hammer_unload_inode(struct hammer_inode *inode, void *data __unused);
 int    hammer_unload_volume(struct hammer_volume *volume, void *data __unused);
 int    hammer_load_volume(struct hammer_mount *hmp, const char *volname);
-struct hammer_cluster *hammer_load_cluster(struct hammer_mount *hmp,
-                               struct hammer_volume *volume, int clu_no,
-                               int *errorp);
+struct hammer_cluster *hammer_load_cluster(struct hammer_volume *volume,
+                                          int clu_no, int *errorp);
 
-int    hammer_btree_lookup(struct hammer_mount *hmp,
-                               struct hammer_btree_info *info);
+int    hammer_btree_lookup(struct hammer_btree_info *info, int flags);
+void   hammer_btree_lookup_done(struct hammer_btree_info *info);
 
+void   *hammer_bread(struct hammer_cluster *cluster, int32_t cloff,
+                     int *errorp, struct buf **bufp);
 
 #endif
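
A hypothetical usage sketch of the reworked B-Tree lookup interface above,
inferred from the declarations added in this hunk; the surrounding setup
(where the cluster comes from, the exact key fields) is an assumption and
not code from the commit:

    struct hammer_btree_info info;
    int error;

    bzero(&info, sizeof(info));
    info.cluster = cluster;     /* assumption: caller supplies the cluster to search */
    /* fill in info.key (struct hammer_base_elm) for the object being looked up */

    error = hammer_btree_lookup(&info,
                                HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA);
    if (error == 0) {
            /* info.rec and info.data now point at the located record/data */
    }
    hammer_btree_lookup_done(&info);    /* presumably releases bp_node/bp2/bp3 */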
 
index d469c1b..5c88447 100644 (file)
@@ -38,7 +38,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.1 2007/10/10 19:37:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.c,v 1.2 2007/10/12 18:57:45 dillon Exp $
  */
 /*
  * This module implements a generic allocator through the use of a hinted
  * arbitrarily without adding much in the way of additional meta-storage
  * for the allocator.
  *
+ * The radix tree supports allocator layering. By supplying a base_radix > 1
+ * the allocator will issue callbacks to recurse into lower layers once 
+ * higher layers have been exhausted.  Allocations in multiples of base_radix
+ * will be entirely retained in the higher level allocator and never recurse.
+ *
  * This code can be compiled stand-alone for debugging.
  */
 
@@ -76,7 +81,7 @@
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 
-#include "hammerfs.h"
+#include "hammer_disk.h"
 
 #else
 
 #define KKASSERT(exp)  assert(exp)
 struct malloc_type;
 
-#include "hammerfs.h"
+#include "hammer_alist.h"
 
 void panic(const char *ctl, ...);
 
@@ -107,75 +112,107 @@ void panic(const char *ctl, ...);
  * static support functions
  */
 
-static int32_t hammer_alst_leaf_alloc_fwd(hammer_almeta_t *scan,
+static int32_t hammer_alst_leaf_alloc_fwd(hammer_almeta_t scan,
                                        int32_t blk, int count);
-static int32_t hammer_alst_meta_alloc_fwd(hammer_almeta_t *scan,
+static int32_t hammer_alst_meta_alloc_fwd(hammer_alist_t live,
+                                       hammer_almeta_t scan,
                                        int32_t blk, int32_t count,
                                        int32_t radix, int skip);
-static int32_t hammer_alst_leaf_alloc_rev(hammer_almeta_t *scan,
+static int32_t hammer_alst_leaf_alloc_rev(hammer_almeta_t scan,
                                        int32_t blk, int count);
-static int32_t hammer_alst_meta_alloc_rev(hammer_almeta_t *scan,
+static int32_t hammer_alst_meta_alloc_rev(hammer_alist_t live,
+                                       hammer_almeta_t scan,
                                        int32_t blk, int32_t count,
                                        int32_t radix, int skip);
-static void hammer_alst_leaf_free(hammer_almeta_t *scan,
+static void hammer_alst_leaf_free(hammer_almeta_t scan,
                                        int32_t relblk, int count);
-static void hammer_alst_meta_free(hammer_almeta_t *scan,
+static void hammer_alst_meta_free(hammer_alist_t live, hammer_almeta_t scan,
                                        int32_t freeBlk, int32_t count, 
                                        int32_t radix, int skip, int32_t blk);
-static int32_t hammer_alst_radix_init(hammer_almeta_t *scan,
+static int32_t hammer_alst_radix_init(hammer_almeta_t scan,
                                        int32_t radix, int skip, int32_t count);
 #ifdef ALIST_DEBUG
-static void    hammer_alst_radix_print(hammer_almeta_t *scan,
-                                       int32_t blk,
+static void    hammer_alst_radix_print(hammer_alist_t live,
+                                       hammer_almeta_t scan, int32_t blk,
                                        int32_t radix, int skip, int tab);
 #endif
 
 /*
- * Initialize an alist for use.  The alist will initially be marked
- * all-allocated so the caller must free the portion it wishes to manage.
+ * Initialize an a-list config structure for use.  The config structure
+ * describes the basic structure of an a-list's topology and may be
+ * shared by any number of a-lists which have the same topology.
+ *
+ * blocks is the total number of blocks, that is the number of blocks
+ * handled at this layer multiplied by the base radix.
+ *
+ * When base_radix != 1 the A-list has only meta elements and does not have
+ * any leaves, in order to be able to track partial allocations.
  */
 void
-hammer_alist_template(hammer_alist_t bl, int blocks, int maxmeta)
+hammer_alist_template(hammer_alist_config_t bl, int32_t blocks,
+                     int32_t base_radix, int32_t maxmeta)
 {
        int radix;
-       int skip = 0;
+       int skip;
 
        /*
-        * Calculate radix and skip field used for scanning.
+        * Calculate radix and skip field used for scanning.  The leaf nodes
+        * in our tree are either BMAP or META nodes depending on whether
+        * we chain to a lower level allocation layer or not.
         */
-       radix = HAMMER_ALIST_BMAP_RADIX;
+       if (base_radix == 1)
+               radix = HAMMER_ALIST_BMAP_RADIX;
+       else
+               radix = HAMMER_ALIST_META_RADIX;
+       skip = 1;
 
-       while (radix < blocks) {
+       while (radix < blocks / base_radix) {
                radix *= HAMMER_ALIST_META_RADIX;
-               skip = (skip + 1) * HAMMER_ALIST_META_RADIX;
+               skip = skip * HAMMER_ALIST_META_RADIX + 1;
        }
 
+       /*
+        * Increase the radix based on the number of blocks a lower level
+        * allocator is able to handle at the 'base' of our allocator.
+        * If base_radix != 1 the caller will have to initialize the callback
+        * fields to implement the lower level allocator.
+        */
+       radix *= base_radix;
+
        bzero(bl, sizeof(*bl));
 
        bl->bl_blocks = blocks;
+       bl->bl_base_radix = base_radix;
        bl->bl_radix = radix;
        bl->bl_skip = skip;
-       bl->bl_rootblks = 1 +
-           hammer_alst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
-       KKASSERT(bl->bl_rootblks <= maxmeta);
+       bl->bl_rootblks = hammer_alst_radix_init(NULL, bl->bl_radix,
+                                                bl->bl_skip, blocks);
+       ++bl->bl_rootblks;      /* one more for freeblks header */
+       if (base_radix == 1)
+               bl->bl_terminal = 1;
+       KKASSERT(maxmeta == 0 || bl->bl_rootblks <= maxmeta);
 
 #if defined(ALIST_DEBUG)
        kprintf(
-               "ALIST representing %d blocks (%d MB of swap)"
+               "PRIMARY ALIST LAYER manages %d blocks"
                ", requiring %dK (%d bytes) of ram\n",
-               bl->bl_blocks,
-               bl->bl_blocks * 4 / 1024,
-               (bl->bl_rootblks * sizeof(hammer_almeta_t) + 1023) / 1024,
-               (bl->bl_rootblks * sizeof(hammer_almeta_t))
+               bl->bl_blocks / bl->bl_base_radix,
+               (bl->bl_rootblks * sizeof(struct hammer_almeta) + 1023) / 1024,
+               (bl->bl_rootblks * sizeof(struct hammer_almeta))
        );
        kprintf("ALIST raw radix tree contains %d records\n", bl->bl_rootblks);
 #endif
 }
 
 void
-hammer_alist_init(hammer_alist_t bl, hammer_almeta_t *meta)
+hammer_alist_init(hammer_alist_t live)
 {
-       hammer_alst_radix_init(meta, bl->bl_radix, bl->bl_skip, bl->bl_blocks);
+       hammer_alist_config_t bl = live->config;
+
+       live->meta->bm_bighint = 0;
+       live->meta->bm_alist_freeblks = 0;
+       hammer_alst_radix_init(live->meta + 1, bl->bl_radix,
+                              bl->bl_skip, bl->bl_blocks);
 }
 
 #if !defined(_KERNEL) && defined(ALIST_DEBUG)
@@ -187,42 +224,34 @@ hammer_alist_init(hammer_alist_t bl, hammer_almeta_t *meta)
  *     blocks.  blocks must be greater than 0
  *
  *     The smallest alist consists of a single leaf node capable of 
- *     managing HAMMER_ALIST_BMAP_RADIX blocks.
+ *     managing HAMMER_ALIST_BMAP_RADIX blocks, or (if base_radix != 1)
+ *     a single meta node capable of managing HAMMER_ALIST_META_RADIX
+ *     blocks which recurses into other storage layers for a total
+ *     handling capability of (HAMMER_ALIST_META_RADIX * base_radix) blocks.
+ *
+ *     Larger a-lists increase their capability exponentially by
+ *     HAMMER_ALIST_META_RADIX.
+ *
+ *     The block count is the total number of blocks inclusive of any
+ *     layering.  blocks can be less than base_radix and will result in
+ *     a radix tree with a single leaf node which then chains down.
  */
 
 hammer_alist_t 
-hammer_alist_create(int32_t blocks, struct malloc_type *mtype,
-                   hammer_almeta_t **metap)
+hammer_alist_create(int32_t blocks, int32_t base_radix,
+                   struct malloc_type *mtype)
 {
-       hammer_alist_t bl;
-       hammer_almeta_t *meta;
-       int radix;
-       int skip = 0;
-       int rootblks;
+       hammer_alist_t live;
+       hammer_alist_config_t bl;
        size_t metasize;
 
-       /*
-        * Calculate radix and skip field used for scanning.
-        */
-       radix = HAMMER_ALIST_BMAP_RADIX;
-
-       while (radix < blocks) {
-               radix *= HAMMER_ALIST_META_RADIX;
-               skip = (skip + 1) * HAMMER_ALIST_META_RADIX;
-       }
+       live = kmalloc(sizeof(*live), mtype, M_WAITOK);
+       live->config = bl = kmalloc(sizeof(*bl), mtype, M_WAITOK);
+       hammer_alist_template(bl, blocks, base_radix, 0);
 
-       rootblks = 1 + hammer_alst_radix_init(NULL, radix, skip, blocks);
-       metasize = sizeof(struct hammer_almeta) * rootblks;
-       bl = kmalloc(sizeof(struct hammer_alist), mtype, M_WAITOK);
-       meta = kmalloc(metasize, mtype, M_WAITOK);
-
-       bzero(bl, sizeof(*bl));
-       bzero(meta, metasize);
-
-       bl->bl_blocks = blocks;
-       bl->bl_radix = radix;
-       bl->bl_skip = skip;
-       bl->bl_rootblks = rootblks;
+       metasize = sizeof(*live->meta) * bl->bl_rootblks;
+       live->meta = kmalloc(metasize, mtype, M_WAITOK);
+       bzero(live->meta, metasize);
 
 #if defined(ALIST_DEBUG)
        kprintf(
@@ -230,21 +259,27 @@ hammer_alist_create(int32_t blocks, struct malloc_type *mtype,
                ", requiring %dK (%d bytes) of ram\n",
                bl->bl_blocks,
                bl->bl_blocks * 4 / 1024,
-               (bl->bl_rootblks * sizeof(hammer_almeta_t) + 1023) / 1024,
-               (bl->bl_rootblks * sizeof(hammer_almeta_t))
+               (bl->bl_rootblks * sizeof(*live->meta) + 1023) / 1024,
+               (bl->bl_rootblks * sizeof(*live->meta))
        );
+       if (base_radix != 1) {
+               kprintf("ALIST recurses when it reaches a base_radix of %d\n",
+                       base_radix);
+       }
        kprintf("ALIST raw radix tree contains %d records\n", bl->bl_rootblks);
 #endif
-       hammer_alst_radix_init(meta, bl->bl_radix, bl->bl_skip, blocks);
-
-       *metap = meta;
-       return(bl);
+       hammer_alist_init(live);
+       return(live);
 }
 
 void
-hammer_alist_destroy(hammer_alist_t bl, struct malloc_type *mtype)
+hammer_alist_destroy(hammer_alist_t live, struct malloc_type *mtype)
 {
-       kfree(bl, mtype);
+       kfree(live->config, mtype);
+       kfree(live->meta, mtype);
+       live->config = NULL;
+       live->meta = NULL;
+       kfree(live, mtype);
 }
 
 #endif
@@ -257,41 +292,59 @@ hammer_alist_destroy(hammer_alist_t bl, struct malloc_type *mtype)
  */
 
 int32_t 
-hammer_alist_alloc(hammer_alist_t bl, hammer_almeta_t *meta, int32_t count)
+hammer_alist_alloc(hammer_alist_t live, int32_t count)
 {
        int32_t blk = HAMMER_ALIST_BLOCK_NONE;
+       hammer_alist_config_t bl = live->config;
 
        KKASSERT((count | (count - 1)) == (count << 1) - 1);
 
-       if (bl && count < bl->bl_radix) {
-               if (bl->bl_radix == HAMMER_ALIST_BMAP_RADIX) {
-                       blk = hammer_alst_leaf_alloc_fwd(meta, 0, count);
+       if (bl && count <= bl->bl_radix) {
+               /*
+                * When skip is 1 we are at a leaf.  If we are the terminal
+                * allocator we use our native leaf functions and radix will
+                * be HAMMER_ALIST_BMAP_RADIX.  Otherwise we are a meta node
+                * which will chain to another allocator.
+                */
+               if (bl->bl_skip == 1 && bl->bl_terminal) {
+#ifndef _KERNEL
+                       KKASSERT(bl->bl_radix == HAMMER_ALIST_BMAP_RADIX);
+#endif
+                       blk = hammer_alst_leaf_alloc_fwd(
+                                   live->meta + 1, 0, count);
                } else {
                        blk = hammer_alst_meta_alloc_fwd(
-                                   meta, 0, count, bl->bl_radix, bl->bl_skip);
+                                   live, live->meta + 1,
+                                   0, count, bl->bl_radix, bl->bl_skip);
                }
                if (blk != HAMMER_ALIST_BLOCK_NONE)
-                       bl->bl_free -= count;
+                       live->meta->bm_alist_freeblks -= count;
        }
        return(blk);
 }
 
 int32_t 
-hammer_alist_alloc_rev(hammer_alist_t bl, hammer_almeta_t *meta, int32_t count)
+hammer_alist_alloc_rev(hammer_alist_t live, int32_t count)
 {
+       hammer_alist_config_t bl = live->config;
        int32_t blk = HAMMER_ALIST_BLOCK_NONE;
 
        KKASSERT((count | (count - 1)) == (count << 1) - 1);
 
        if (bl && count < bl->bl_radix) {
-               if (bl->bl_radix == HAMMER_ALIST_BMAP_RADIX) {
-                       blk = hammer_alst_leaf_alloc_rev(meta, 0, count);
+               if (bl->bl_skip == 1 && bl->bl_terminal) {
+#ifndef _KERNEL
+                       KKASSERT(bl->bl_radix == HAMMER_ALIST_BMAP_RADIX);
+#endif
+                       blk = hammer_alst_leaf_alloc_rev(
+                                   live->meta + 1, 0, count);
                } else {
                        blk = hammer_alst_meta_alloc_rev(
-                                   meta, 0, count, bl->bl_radix, bl->bl_skip);
+                                   live, live->meta + 1,
+                                   0, count, bl->bl_radix, bl->bl_skip);
                }
                if (blk != HAMMER_ALIST_BLOCK_NONE)
-                       bl->bl_free -= count;
+                       live->meta->bm_alist_freeblks -= count;
        }
        return(blk);
 }
@@ -309,9 +362,7 @@ hammer_alist_alloc_rev(hammer_alist_t bl, hammer_almeta_t *meta, int32_t count)
  *     allocated behind that start point, and similarly when going backwards.
  */
 int32_t 
-hammer_alist_alloc_from(hammer_alist_t bl, hammer_almeta_t *meta,
-                       int32_t count, int32_t start, int flags)
-
+hammer_alist_alloc_from(hammer_alist_t live, int32_t count, int32_t start)
 {
 }
 
@@ -328,18 +379,22 @@ hammer_alist_alloc_from(hammer_alist_t bl, hammer_almeta_t *meta,
  */
 
 void 
-hammer_alist_free(hammer_alist_t bl, hammer_almeta_t *meta,
-                 int32_t blkno, int32_t count)
+hammer_alist_free(hammer_alist_t live, int32_t blkno, int32_t count)
 {
-       if (bl) {
-               KKASSERT(blkno + count <= bl->bl_blocks);
-               if (bl->bl_radix == HAMMER_ALIST_BMAP_RADIX)
-                       hammer_alst_leaf_free(meta, blkno, count);
-               else
-                       hammer_alst_meta_free(meta, blkno, count,
-                                             bl->bl_radix, bl->bl_skip, 0);
-               bl->bl_free += count;
+       hammer_alist_config_t bl = live->config;
+
+       KKASSERT(blkno + count <= bl->bl_blocks);
+       if (bl->bl_skip == 1 && bl->bl_terminal) {
+#ifndef _KERNEL
+               KKASSERT(bl->bl_radix == HAMMER_ALIST_BMAP_RADIX);
+#endif
+               hammer_alst_leaf_free(live->meta + 1, blkno, count);
+       } else {
+               hammer_alst_meta_free(live, live->meta + 1,
+                                     blkno, count,
+                                     bl->bl_radix, bl->bl_skip, 0);
        }
+       live->meta->bm_alist_freeblks += count;
 }
 
 #ifdef ALIST_DEBUG
@@ -349,11 +404,15 @@ hammer_alist_free(hammer_alist_t bl, hammer_almeta_t *meta,
  */
 
 void
-hammer_alist_print(hammer_alist_t bl, hammer_almeta_t *meta)
+hammer_alist_print(hammer_alist_t live, int tab)
 {
-       kprintf("ALIST {\n");
-       hammer_alst_radix_print(meta, 0, bl->bl_radix, bl->bl_skip, 4);
-       kprintf("}\n");
+       hammer_alist_config_t bl = live->config;
+
+       kprintf("%*.*sALIST (%d free blocks) {\n",
+               tab, tab, "", live->meta->bm_alist_freeblks);
+       hammer_alst_radix_print(live, live->meta + 1, 0,
+                               bl->bl_radix, bl->bl_skip, tab + 4);
+       kprintf("%*.*s}\n", tab, tab, "");
 }
 
 #endif
@@ -380,7 +439,7 @@ hammer_alist_print(hammer_alist_t bl, hammer_almeta_t *meta)
  */
 
 static int32_t
-hammer_alst_leaf_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int count)
+hammer_alst_leaf_alloc_fwd(hammer_almeta_t scan, int32_t blk, int count)
 {
        u_int32_t orig = scan->bm_bitmap;
 
@@ -454,7 +513,7 @@ hammer_alst_leaf_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int count)
  * This version allocates blocks in the reverse direction.
  */
 static int32_t
-hammer_alst_leaf_alloc_rev(hammer_almeta_t *scan, int32_t blk, int count)
+hammer_alst_leaf_alloc_rev(hammer_almeta_t scan, int32_t blk, int count)
 {
        u_int32_t orig = scan->bm_bitmap;
 
@@ -537,13 +596,15 @@ hammer_alst_leaf_alloc_rev(hammer_almeta_t *scan, int32_t blk, int count)
  *     and we have a few optimizations strewn in as well.
  */
 static int32_t
-hammer_alst_meta_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int32_t count,
+hammer_alst_meta_alloc_fwd(hammer_alist_t live, hammer_almeta_t scan,
+                          int32_t blk, int32_t count,
                           int32_t radix, int skip
 ) {
+       hammer_alist_config_t bl;
        int i;
        u_int32_t mask;
        u_int32_t pmask;
-       int next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
+       int next_skip;
 
        /*
         * ALL-ALLOCATED special case
@@ -554,6 +615,7 @@ hammer_alst_meta_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int32_t count,
        }
 
        radix /= HAMMER_ALIST_META_RADIX;
+       bl = live->config;
 
        /*
         * Radix now represents each bitmap entry for this meta node.  If
@@ -584,57 +646,121 @@ hammer_alst_meta_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int32_t count,
                return(HAMMER_ALIST_BLOCK_NONE);
        }
 
+       /*
+        * If the count is too big we couldn't allocate anything from a
+        * recursion even if the sub-tree were entirely free.
+        */
+       if (count > radix)
+               goto failed;
+
        /*
         * If not we have to recurse.
         */
        mask = 0x00000003;
        pmask = 0x00000001;
-       for (i = 1; i <= skip; i += next_skip) {
-               if (scan[i].bm_bighint == (int32_t)-1) {
-                       /* 
-                        * Terminator
-                        */
-                       break;
-               }
-               if ((scan->bm_bitmap & mask) == mask) {
-                       scan[i].bm_bitmap = (u_int32_t)-1;
-                       scan[i].bm_bighint = radix;
-               }
 
-               if (count <= scan[i].bm_bighint) {
+       if (skip == 1) {
+               /*
+                * If skip is 1 we are a meta leaf node, which means there
+                * is another allocation layer we have to dive down into.
+                */
+               for (i = 0; i < HAMMER_ALIST_META_RADIX; ++i) {
                        /*
-                        * count fits in object
+                        * If the next layer is completely free then call
+                        * its init function to initialize it and mark it
+                        * partially free.
+                        *
+                        * bl_radix_init returns an error code or 0 on
+                        * success.
                         */
-                       int32_t r;
-                       if (next_skip == 1) {
-                               r = hammer_alst_leaf_alloc_fwd(
-                                       &scan[i], blk, count);
-                       } else {
-                               r = hammer_alst_meta_alloc_fwd(
-                                       &scan[i], blk, count,
-                                       radix, next_skip - 1);
-                       }
-                       if (r != HAMMER_ALIST_BLOCK_NONE) {
-                               if (scan[i].bm_bitmap == 0) {
-                                       scan->bm_bitmap &= ~mask;
-                               } else {
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               int32_t v;
+
+                               v = bl->bl_blocks - blk;
+                               if (v > radix)
+                                       v = radix;
+                               if (bl->bl_radix_init(live->info, blk, radix, v) == 0) {
                                        scan->bm_bitmap &= ~mask;
                                        scan->bm_bitmap |= pmask;
                                }
-                               return(r);
                        }
-               } else if (count > radix) {
+
                        /*
-                        * count does not fit in object even if it were
-                        * completely free.
+                        * If there may be some free blocks try to allocate
+                        * out of the layer.  If the layer indicates that
+                        * it is completely full then clear both bits to
+                        * propagate the condition.
                         */
-                       break;
+                       if ((scan->bm_bitmap & mask) == pmask) {
+                               int32_t r;
+                               int32_t full;
+
+                               r = bl->bl_radix_alloc_fwd(live->info, blk,
+                                                          radix, count, &full);
+                               if (r != HAMMER_ALIST_BLOCK_NONE) {
+                                       if (full)
+                                               scan->bm_bitmap &= ~mask;
+                                       return(r);
+                               }
+                       }
+                       blk += radix;
+                       mask <<= 2;
+                       pmask <<= 2;
+               }
+       } else {
+               /*
+                * Otherwise there are sub-records in the current a-list
+                * layer.  We can also peek into the sub-layers to get
+                * more accurate size hints.
+                */
+               next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
+               for (i = 1; i <= skip; i += next_skip) {
+                       if (scan[i].bm_bighint == (int32_t)-1) {
+                               /* 
+                                * Terminator
+                                */
+                               break;
+                       }
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               scan[i].bm_bitmap = (u_int32_t)-1;
+                               scan[i].bm_bighint = radix;
+                       }
+
+                       if (count <= scan[i].bm_bighint) {
+                               /*
+                                * count fits in object, recurse into the
+                                * next layer.  If the next_skip is 1 it
+                                * will be either a normal leaf or a meta
+                                * leaf.
+                                */
+                               int32_t r;
+
+                               if (next_skip == 1 && bl->bl_terminal) {
+                                       r = hammer_alst_leaf_alloc_fwd(
+                                               &scan[i], blk, count);
+                               } else {
+                                       r = hammer_alst_meta_alloc_fwd(
+                                               live, &scan[i],
+                                               blk, count,
+                                               radix, next_skip);
+                               }
+                               if (r != HAMMER_ALIST_BLOCK_NONE) {
+                                       if (scan[i].bm_bitmap == 0) {
+                                               scan->bm_bitmap &= ~mask;
+                                       } else {
+                                               scan->bm_bitmap &= ~mask;
+                                               scan->bm_bitmap |= pmask;
+                                       }
+                                       return(r);
+                               }
+                       }
+                       blk += radix;
+                       mask <<= 2;
+                       pmask <<= 2;
                }
-               blk += radix;
-               mask <<= 2;
-               pmask <<= 2;
        }
 
+failed:
        /*
         * We couldn't allocate count in this subtree, update bighint.
         */
@@ -647,14 +773,16 @@ hammer_alst_meta_alloc_fwd(hammer_almeta_t *scan, int32_t blk, int32_t count,
  * This version allocates blocks in the reverse direction.
  */
 static int32_t
-hammer_alst_meta_alloc_rev(hammer_almeta_t *scan, int32_t blk, int32_t count,
+hammer_alst_meta_alloc_rev(hammer_alist_t live, hammer_almeta_t scan,
+                          int32_t blk, int32_t count,
                           int32_t radix, int skip
 ) {
+       hammer_alist_config_t bl;
        int i;
        int j;
        u_int32_t mask;
        u_int32_t pmask;
-       int next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
+       int next_skip;
 
        /*
         * ALL-ALLOCATED special case
@@ -665,6 +793,7 @@ hammer_alst_meta_alloc_rev(hammer_almeta_t *scan, int32_t blk, int32_t count,
        }
 
        radix /= HAMMER_ALIST_META_RADIX;
+       bl = live->config;
 
        /*
         * Radix now represents each bitmap entry for this meta node.  If
@@ -700,77 +829,140 @@ hammer_alst_meta_alloc_rev(hammer_almeta_t *scan, int32_t blk, int32_t count,
        }
 
        /*
-        * If not we have to recurse.  Since we are going in the reverse
-        * direction we need an extra loop to determine if there is a
-        * terminator, then run backwards.
-        *
-        * This is a little weird but we do not want to overflow the
-        * mask/pmask in the loop.
+        * If the count is too big we couldn't allocate anything from a
+        * recursion even if the sub-tree were entirely free.
         */
-       j = 0;
-       for (i = 1; i <= skip; i += next_skip) {
-               if (scan[i].bm_bighint == (int32_t)-1)
-                       break;
-               blk += radix;
-               j += 2;
-       }
-       blk -= radix;
-       j -= 2;
-       mask = 0x00000003 << j;
-       pmask = 0x00000001 << j;
-       i -= next_skip;
+       if (count > radix)
+               goto failed;
 
-       while (i >= 1) {
+       if (skip == 1) {
                /*
-                * Initialize the bitmap in the child if allocating from
-                * the all-free case.
+                * We need to recurse but we are at a meta node leaf, which
+                * means there is another layer under us.
                 */
-               if ((scan->bm_bitmap & mask) == mask) {
-                       scan[i].bm_bitmap = (u_int32_t)-1;
-                       scan[i].bm_bighint = radix;
-               }
+               mask = 0xC0000000;
+               pmask = 0x40000000;
+               blk += radix * HAMMER_ALIST_META_RADIX - radix;
 
-               /*
-                * Handle various count cases.  Bighint may be too large but
-                * is never too small.
-                */
-               if (count <= scan[i].bm_bighint) {
+               for (i = 0; i < HAMMER_ALIST_META_RADIX; ++i) {
                        /*
-                        * count fits in object
+                        * If the next layer is completely free then call
+                        * its init function to initialize it and mark it
+                        * partially free.
                         */
-                       int32_t r;
-                       if (next_skip == 1) {
-                               r = hammer_alst_leaf_alloc_rev(
-                                       &scan[i], blk, count);
-                       } else {
-                               r = hammer_alst_meta_alloc_rev(
-                                       &scan[i], blk, count,
-                                       radix, next_skip - 1);
-                       }
-                       if (r != HAMMER_ALIST_BLOCK_NONE) {
-                               if (scan[i].bm_bitmap == 0) {
-                                       scan->bm_bitmap &= ~mask;
-                               } else {
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               int32_t v;
+
+                               v = bl->bl_blocks - blk;
+                               if (v > radix)
+                                       v = radix;
+                               if (bl->bl_radix_init(live->info, blk, radix, v) == 0) {
                                        scan->bm_bitmap &= ~mask;
                                        scan->bm_bitmap |= pmask;
                                }
-                               return(r);
                        }
-               } else if (count > radix) {
+
                        /*
-                        * count does not fit in object even if it were
-                        * completely free.
+                        * If there may be some free blocks try to allocate
+                        * out of the layer.  If the layer indicates that
+                        * it is completely full then clear both bits to
+                        * propagate the condition.
                         */
-                       break;
+                       if ((scan->bm_bitmap & mask) == pmask) {
+                               int32_t r;
+                               int32_t full;
+
+                               r = bl->bl_radix_alloc_rev(live->info, blk,
+                                                          radix, count, &full);
+                               if (r != HAMMER_ALIST_BLOCK_NONE) {
+                                       if (full)
+                                               scan->bm_bitmap &= ~mask;
+                                       return(r);
+                               }
+                       }
+                       mask >>= 2;
+                       pmask >>= 2;
+                       blk -= radix;
+               }
+       } else {
+               /*
+                * Since we are going in the reverse direction we need an
+                * extra loop to determine if there is a terminator, then
+                * run backwards.
+                *
+                * This is a little weird but we do not want to overflow the
+                * mask/pmask in the loop.
+                */
+               next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
+               j = 0;
+               for (i = 1; i <= skip; i += next_skip) {
+                       if (scan[i].bm_bighint == (int32_t)-1)
+                               break;
+                       blk += radix;
+                       j += 2;
                }
                blk -= radix;
-               mask >>= 2;
-               pmask >>= 2;
+               j -= 2;
+               mask = 0x00000003 << j;
+               pmask = 0x00000001 << j;
                i -= next_skip;
+
+               while (i >= 1) {
+                       /*
+                        * Initialize the bitmap in the child if allocating
+                        * from the all-free case.
+                        */
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               scan[i].bm_bitmap = (u_int32_t)-1;
+                               scan[i].bm_bighint = radix;
+                       }
+
+                       /*
+                        * Handle various count cases.  Bighint may be too
+                        * large but is never too small.
+                        */
+                       if (count <= scan[i].bm_bighint) {
+                               /*
+                                * count fits in object
+                                */
+                               int32_t r;
+                               if (next_skip == 1 && bl->bl_terminal) {
+                                       r = hammer_alst_leaf_alloc_rev(
+                                               &scan[i], blk, count);
+                               } else {
+                                       r = hammer_alst_meta_alloc_rev(
+                                               live, &scan[i],
+                                               blk, count,
+                                               radix, next_skip);
+                               }
+                               if (r != HAMMER_ALIST_BLOCK_NONE) {
+                                       if (scan[i].bm_bitmap == 0) {
+                                               scan->bm_bitmap &= ~mask;
+                                       } else {
+                                               scan->bm_bitmap &= ~mask;
+                                               scan->bm_bitmap |= pmask;
+                                       }
+                                       return(r);
+                               }
+                       } else if (count > radix) {
+                               /*
+                                * count does not fit in object even if it were
+                                * completely free.
+                                */
+                               break;
+                       }
+                       blk -= radix;
+                       mask >>= 2;
+                       pmask >>= 2;
+                       i -= next_skip;
+               }
        }
 
+failed:
        /*
         * We couldn't allocate count in this subtree, update bighint.
+        * Since we are restricted to powers of 2, the next highest count
+        * we might be able to allocate is (count >> 1).
         */
        if (scan->bm_bighint >= count)
                scan->bm_bighint = count >> 1;
@@ -784,11 +976,7 @@ hammer_alst_meta_alloc_rev(hammer_almeta_t *scan, int32_t blk, int32_t count,
  *     restricted to powers of 2, the freeing code is not.
  */
 static void
-hammer_alst_leaf_free(
-       hammer_almeta_t *scan,
-       int32_t blk,
-       int count
-) {
+hammer_alst_leaf_free(hammer_almeta_t scan, int32_t blk, int count) {
        /*
         * free some data in this bitmap
         *
@@ -830,15 +1018,12 @@ hammer_alst_leaf_free(
  */
 
 static void 
-hammer_alst_meta_free(
-       hammer_almeta_t *scan, 
-       int32_t freeBlk,
-       int32_t count,
-       int32_t radix, 
-       int skip,
-       int32_t blk
+hammer_alst_meta_free(hammer_alist_t live, hammer_almeta_t scan, 
+                     int32_t freeBlk, int32_t count,
+                     int32_t radix, int skip, int32_t blk
 ) {
-       int next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
+       hammer_alist_config_t bl;
+       int next_skip;
        u_int32_t mask;
        u_int32_t pmask;
        int i;
@@ -850,110 +1035,167 @@ hammer_alst_meta_free(
         * Each block in a meta-node bitmap takes two bits.
         */
        radix /= HAMMER_ALIST_META_RADIX;
+       bl = live->config;
 
        i = (freeBlk - blk) / radix;
        blk += i * radix;
        mask = 0x00000003 << (i * 2);
        pmask = 0x00000001 << (i * 2);
 
-       i = i * next_skip + 1;
+       if (skip == 1) {
+               /*
+                * Our meta node is a leaf node, which means it must recurse
+                * into another allocator.
+                */
+               while (i < HAMMER_ALIST_META_RADIX && blk < freeBlk + count) {
+                       int32_t v;
 
-       while (i <= skip && blk < freeBlk + count) {
-               int32_t v;
+                       v = blk + radix - freeBlk;
+                       if (v > count)
+                               v = count;
 
-               v = blk + radix - freeBlk;
-               if (v > count)
-                       v = count;
+                       if (scan->bm_bighint == (int32_t)-1)
+                               panic("hammer_alst_meta_free: freeing unexpected range");
 
-               if (scan->bm_bighint == (int32_t)-1)
-                       panic("hammer_alst_meta_free: freeing unexpected range");
+                       if (freeBlk == blk && count >= radix) {
+                               /*
+                                * All-free case, no need to update sub-tree
+                                */
+                               scan->bm_bitmap |= mask;
+                               scan->bm_bighint = radix * HAMMER_ALIST_META_RADIX;
+                               /* XXX bighint not being set properly */
+                       } else {
+                               /*
+                                * Recursion case
+                                */
+                               int32_t empty;
+
+                               bl->bl_radix_free(live->info, freeBlk, v,
+                                                 radix, blk, &empty);
+                               if (empty) {
+                                       scan->bm_bitmap |= mask;
+                                       scan->bm_bighint = radix * HAMMER_ALIST_META_RADIX;
+                                       /* XXX bighint not being set properly */
+                               } else {
+                                       scan->bm_bitmap |= pmask;
+                                       if (scan->bm_bighint < radix / 2)
+                                               scan->bm_bighint = radix / 2;
+                                       /* XXX bighint not being set properly */
+                               }
+                       }
+                       ++i;
+                       mask <<= 2;
+                       pmask <<= 2;
+                       count -= v;
+                       freeBlk += v;
+                       blk += radix;
+               }
+       } else {
+               next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
+               i = 1 + i * next_skip;
 
-               if (freeBlk == blk && count >= radix) {
-                       /*
-                        * All-free case, no need to update sub-tree
-                        */
-                       scan->bm_bitmap |= mask;
-                       scan->bm_bighint = radix * HAMMER_ALIST_META_RADIX;
-                       /*XXX*/
-               } else {
-                       /*
-                        * Recursion case
-                        */
-                       if (next_skip == 1)
-                               hammer_alst_leaf_free(&scan[i], freeBlk, v);
-                       else
-                               hammer_alst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
-                       if (scan[i].bm_bitmap == (u_int32_t)-1)
+               while (i <= skip && blk < freeBlk + count) {
+                       int32_t v;
+
+                       v = blk + radix - freeBlk;
+                       if (v > count)
+                               v = count;
+
+                       if (scan->bm_bighint == (int32_t)-1)
+                               panic("hammer_alst_meta_free: freeing unexpected range");
+
+                       if (freeBlk == blk && count >= radix) {
+                               /*
+                                * All-free case, no need to update sub-tree
+                                */
                                scan->bm_bitmap |= mask;
-                       else
-                               scan->bm_bitmap |= pmask;
-                       if (scan->bm_bighint < scan[i].bm_bighint)
-                           scan->bm_bighint = scan[i].bm_bighint;
+                               scan->bm_bighint = radix * HAMMER_ALIST_META_RADIX;
+                               /* XXX bighint not being set properly */
+                       } else {
+                               /*
+                                * Recursion case
+                                */
+                               if (next_skip == 1 && bl->bl_terminal) {
+                                       hammer_alst_leaf_free(&scan[i], freeBlk, v);
+                               } else {
+                                       hammer_alst_meta_free(live, &scan[i],
+                                                             freeBlk, v,
+                                                             radix, next_skip,
+                                                             blk);
+                               }
+                               if (scan[i].bm_bitmap == (u_int32_t)-1)
+                                       scan->bm_bitmap |= mask;
+                               else
+                                       scan->bm_bitmap |= pmask;
+                               if (scan->bm_bighint < scan[i].bm_bighint)
+                                       scan->bm_bighint = scan[i].bm_bighint;
+                       }
+                       mask <<= 2;
+                       pmask <<= 2;
+                       count -= v;
+                       freeBlk += v;
+                       blk += radix;
+                       i += next_skip;
                }
-               mask <<= 2;
-               pmask <<= 2;
-               count -= v;
-               freeBlk += v;
-               blk += radix;
-               i += next_skip;
        }
 }
 
 /*
- * BLST_RADIX_INIT()
+ * hammer_alst_radix_init()
  *
  *     Initialize our meta structures and bitmaps and calculate the exact
- *     amount of space required to manage 'count' blocks - this space may
- *     be considerably less then the calculated radix due to the large
- *     RADIX values we use.
+ *     number of meta-nodes required to manage 'count' blocks.  
+ *
+ *     The required space may be truncated due to early termination records.
  */
-
 static int32_t 
-hammer_alst_radix_init(hammer_almeta_t *scan, int32_t radix,
+hammer_alst_radix_init(hammer_almeta_t scan, int32_t radix,
                       int skip, int32_t count)
 {
        int i;
        int next_skip;
-       int32_t memindex = 0;
+       int32_t memindex = 1;
        u_int32_t mask;
        u_int32_t pmask;
 
        /*
-        * Leaf node
+        * Basic initialization of the almeta for meta or leaf nodes
         */
-       if (radix == HAMMER_ALIST_BMAP_RADIX) {
-               if (scan) {
-                       scan->bm_bighint = 0;
-                       scan->bm_bitmap = 0;
-               }
-               return(memindex);
+       if (scan) {
+               scan->bm_bighint = 0;
+               scan->bm_bitmap = 0;
        }
 
+       /*
+        * We are at a leaf, we only eat one meta element. 
+        */
+       if (skip == 1)
+               return(memindex);
+
        /*
         * Meta node.  If allocating the entire object we can special
         * case it.  However, we need to figure out how much memory
         * is required to manage 'count' blocks, so we continue on anyway.
         */
-
-       if (scan) {
-               scan->bm_bighint = 0;
-               scan->bm_bitmap = 0;
-       }
-
        radix /= HAMMER_ALIST_META_RADIX;
-       next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
+       next_skip = (skip - 1) / HAMMER_ALIST_META_RADIX;
        mask = 0x00000003;
        pmask = 0x00000001;
 
        for (i = 1; i <= skip; i += next_skip) {
+               /*
+                * We eat up to this record
+                */
+               memindex = i;
+
                if (count >= radix) {
                        /*
                         * Allocate the entire object
                         */
-                       memindex = i + hammer_alst_radix_init(
+                       memindex += hammer_alst_radix_init(
                            ((scan) ? &scan[i] : NULL),
                            radix,
-                           next_skip - 1,
+                           next_skip,
                            radix
                        );
                        count -= radix;
@@ -962,10 +1204,10 @@ hammer_alst_radix_init(hammer_almeta_t *scan, int32_t radix,
                        /*
                         * Allocate a partial object
                         */
-                       memindex = i + hammer_alst_radix_init(
+                       memindex += hammer_alst_radix_init(
                            ((scan) ? &scan[i] : NULL),
                            radix,
-                           next_skip - 1,
+                           next_skip,
                            count
                        );
                        count = 0;
@@ -977,8 +1219,10 @@ hammer_alst_radix_init(hammer_almeta_t *scan, int32_t radix,
                                scan->bm_bitmap |= pmask;
                } else {
                        /*
-                        * Add terminator and break out
+                        * Add terminator and break out.  The terminator
+                        * eats the meta node at scan[i].
                         */
+                       ++memindex;
                        if (scan)
                                scan[i].bm_bighint = (int32_t)-1;
                        /* already marked as wholly allocated */
@@ -987,23 +1231,21 @@ hammer_alst_radix_init(hammer_almeta_t *scan, int32_t radix,
                mask <<= 2;
                pmask <<= 2;
        }
-       if (memindex < i)
-               memindex = i;
        return(memindex);
 }
 
 #ifdef ALIST_DEBUG
 
 static void    
-hammer_alst_radix_print(hammer_almeta_t *scan, int32_t blk,
-                       int32_t radix, int skip, int tab)
+hammer_alst_radix_print(hammer_alist_t live, hammer_almeta_t scan,
+                       int32_t blk, int32_t radix, int skip, int tab)
 {
        int i;
        int next_skip;
        int lastState = 0;
        u_int32_t mask;
 
-       if (radix == HAMMER_ALIST_BMAP_RADIX) {
+       if (skip == 1 && live->config->bl_terminal) {
                kprintf(
                    "%*.*s(%04x,%d): bitmap %08x big=%d\n", 
                    tab, tab, "",
@@ -1034,52 +1276,78 @@ hammer_alst_radix_print(hammer_almeta_t *scan, int32_t blk,
        }
 
        kprintf(
-           "%*.*s(%04x,%d): subtree (%d) bitmap=%08x big=%d {\n",
+           "%*.*s(%04x,%d): %s (%d) bitmap=%08x big=%d {\n",
            tab, tab, "",
            blk, radix,
+           (skip == 1 ? "LAYER" : "subtree"),
            radix,
            scan->bm_bitmap,
            scan->bm_bighint
        );
 
        radix /= HAMMER_ALIST_META_RADIX;
-       next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
        tab += 4;
        mask = 0x00000003;
 
-       for (i = 1; i <= skip; i += next_skip) {
-               if (scan[i].bm_bighint == (int32_t)-1) {
-                       kprintf(
-                           "%*.*s(%04x,%d): Terminator\n",
-                           tab, tab, "",
-                           blk, radix
-                       );
-                       lastState = 0;
-                       break;
+       if (skip == 1) {
+               for (i = 0; i < HAMMER_ALIST_META_RADIX; ++i) {
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               kprintf(
+                                   "%*.*s(%04x,%d): ALL FREE\n",
+                                   tab, tab, "",
+                                   blk, radix
+                               );
+                       } else if ((scan->bm_bitmap & mask) == 0) {
+                               kprintf(
+                                   "%*.*s(%04x,%d): ALL ALLOCATED\n",
+                                   tab, tab, "",
+                                   blk, radix
+                               );
+                       } else {
+                               live->config->bl_radix_print(
+                                               live->info, blk, radix, tab);
+                       }
+                       blk += radix;
+                       mask <<= 2;
                }
-               if ((scan->bm_bitmap & mask) == mask) {
-                       kprintf(
-                           "%*.*s(%04x,%d): ALL FREE\n",
-                           tab, tab, "",
-                           blk, radix
-                       );
-               } else if ((scan->bm_bitmap & mask) == 0) {
-                       kprintf(
-                           "%*.*s(%04x,%d): ALL ALLOCATED\n",
-                           tab, tab, "",
-                           blk, radix
-                       );
-               } else {
-                       hammer_alst_radix_print(
-                           &scan[i],
-                           blk,
-                           radix,
-                           next_skip - 1,
-                           tab
-                       );
+       } else {
+               next_skip = ((u_int)skip / HAMMER_ALIST_META_RADIX);
+
+               for (i = 1; i <= skip; i += next_skip) {
+                       if (scan[i].bm_bighint == (int32_t)-1) {
+                               kprintf(
+                                   "%*.*s(%04x,%d): Terminator\n",
+                                   tab, tab, "",
+                                   blk, radix
+                               );
+                               lastState = 0;
+                               break;
+                       }
+                       if ((scan->bm_bitmap & mask) == mask) {
+                               kprintf(
+                                   "%*.*s(%04x,%d): ALL FREE\n",
+                                   tab, tab, "",
+                                   blk, radix
+                               );
+                       } else if ((scan->bm_bitmap & mask) == 0) {
+                               kprintf(
+                                   "%*.*s(%04x,%d): ALL ALLOCATED\n",
+                                   tab, tab, "",
+                                   blk, radix
+                               );
+                       } else {
+                               hammer_alst_radix_print(
+                                   live,
+                                   &scan[i],
+                                   blk,
+                                   radix,
+                                   next_skip - 1,
+                                   tab
+                               );
+                       }
+                       blk += radix;
+                       mask <<= 2;
                }
-               blk += radix;
-               mask <<= 2;
        }
        tab -= 4;
 
@@ -1093,26 +1361,129 @@ hammer_alst_radix_print(hammer_almeta_t *scan, int32_t blk,
 
 #ifdef ALIST_DEBUG
 
+static struct hammer_alist_live **layers;      /* initialized by main */
+static int32_t layer_radix = -1;
+
+static
+int
+debug_radix_init(void *info, int32_t blk, int32_t radix, int32_t count)
+{
+       hammer_alist_t layer;
+       int layer_no = blk / layer_radix;
+
+       printf("lower layer: init (%04x,%d) for %d blks\n", blk, radix, count);
+       KKASSERT(layers[layer_no] == NULL);
+       layer = layers[layer_no] = hammer_alist_create(count, 1, NULL); 
+       hammer_alist_free(layer, 0, count);
+       return(0);
+}
+
+static
+int32_t
+debug_radix_alloc_fwd(void *info, int32_t blk, int32_t radix,
+                     int32_t count, int32_t *fullp)
+{
+       hammer_alist_t layer = layers[blk / layer_radix];
+
+       return(hammer_alist_alloc(layer, count));
+}
+
+static
+int32_t
+debug_radix_alloc_rev(void *info, int32_t blk, int32_t radix,
+                     int32_t count, int32_t *fullp)
+{
+       int layer_no = blk / layer_radix;
+       hammer_alist_t layer = layers[layer_no];
+
+       blk = hammer_alist_alloc_rev(layer, count);
+       /* report whether this layer is now fully allocated */
+       *fullp = (layer->meta->bm_alist_freeblks == 0);
+       if (*fullp) {
+               hammer_alist_destroy(layer, NULL);
+               layers[layer_no] = NULL;
+       }
+       return(blk);
+}
+
+static
+void
+debug_radix_free(void *info, int32_t freeBlk, int32_t count,
+                int32_t radix, int32_t blk, int32_t *emptyp)
+{
+       int layer_no = blk / layer_radix;
+       hammer_alist_t layer = layers[layer_no];
+
+       if (layer == NULL) {
+               /*
+                * XXX layer_radix isn't correct if the total number
+                * of blocks only partially fills this layer.  Don't
+                * worry about it.
+                */
+               layer = hammer_alist_create(layer_radix, 1, NULL); 
+               layers[layer_no] = layer;
+       }
+       hammer_alist_free(layer, freeBlk - blk, count);
+       *emptyp = (layer->meta->bm_alist_freeblks == layer_radix);
+}
+
+static
+void
+debug_radix_print(void *info, int32_t blk, int32_t radix, int tab)
+{
+       hammer_alist_t layer = layers[blk / layer_radix];
+
+       hammer_alist_print(layer, tab);
+}
+
 int
 main(int ac, char **av)
 {
-       int size = 1024;
+       int32_t size = -1;
        int i;
-       hammer_alist_t bl;
-       hammer_almeta_t *meta = NULL;
+       hammer_alist_t live;
+       hammer_almeta_t meta = NULL;
 
        for (i = 1; i < ac; ++i) {
                const char *ptr = av[i];
                if (*ptr != '-') {
-                       size = strtol(ptr, NULL, 0);
+                       if (size == -1)
+                               size = strtol(ptr, NULL, 0);
+                       else if (layer_radix == -1)
+                               layer_radix = strtol(ptr, NULL, 0);
+                       else
+                               ;
                        continue;
                }
                ptr += 2;
                fprintf(stderr, "Bad option: %s\n", ptr - 2);
                exit(1);
        }
-       bl = hammer_alist_create(size, NULL, &meta);
-       hammer_alist_free(bl, meta, 0, size);
+       if (size == -1)
+               size = 1024;
+       if (layer_radix == -1)
+               layer_radix = 1;        /* no second storage layer */
+       if ((size ^ (size - 1)) != (size << 1) - 1) {
+               fprintf(stderr, "size must be a power of 2\n");
+               exit(1);
+       }
+       if ((layer_radix ^ (layer_radix - 1)) != (layer_radix << 1) - 1) {
+               fprintf(stderr, "the second layer radix must be a power of 2\n");
+               exit(1);
+       }
+
+       live = hammer_alist_create(size, layer_radix, NULL);
+       layers = calloc(size, sizeof(hammer_alist_t));
+
+       printf("A-LIST TEST %d blocks, first-layer radix %d, "
+              "second-layer radix %d\n",
+               size, live->config->bl_radix / layer_radix, layer_radix);
+
+       live->config->bl_radix_init = debug_radix_init;
+       live->config->bl_radix_alloc_fwd = debug_radix_alloc_fwd;
+       live->config->bl_radix_alloc_rev = debug_radix_alloc_rev;
+       live->config->bl_radix_free = debug_radix_free;
+       live->config->bl_radix_print = debug_radix_print;
+
+       hammer_alist_free(live, 0, size);
 
        for (;;) {
                char buf[1024];
@@ -1120,17 +1491,18 @@ main(int ac, char **av)
                int32_t count = 0;
                int32_t blk;
 
-               kprintf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+               kprintf("%d/%d> ",
+                       live->meta->bm_alist_freeblks, size);
                fflush(stdout);
                if (fgets(buf, sizeof(buf), stdin) == NULL)
                        break;
                switch(buf[0]) {
                case 'p':
-                       hammer_alist_print(bl, meta);
+                       hammer_alist_print(live, 0);
                        break;
                case 'a':
                        if (sscanf(buf + 1, "%d", &count) == 1) {
-                               blk = hammer_alist_alloc(bl, meta, count);
+                               blk = hammer_alist_alloc(live, count);
                                kprintf("    R=%04x\n", blk);
                        } else {
                                kprintf("?\n");
@@ -1138,7 +1510,7 @@ main(int ac, char **av)
                        break;
                case 'r':
                        if (sscanf(buf + 1, "%d", &count) == 1) {
-                               blk = hammer_alist_alloc_rev(bl, meta, count);
+                               blk = hammer_alist_alloc_rev(live, count);
                                kprintf("    R=%04x\n", blk);
                        } else {
                                kprintf("?\n");
@@ -1146,7 +1518,7 @@ main(int ac, char **av)
                        break;
                case 'f':
                        if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
-                               hammer_alist_free(bl, meta, da, count);
+                               hammer_alist_free(live, da, count);
                        } else {
                                kprintf("?\n");
                        }
diff --git a/sys/vfs/hammer/hammer_alist.h b/sys/vfs/hammer/hammer_alist.h
new file mode 100644 (file)
index 0000000..95fb544
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
+ * 
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * $DragonFly: src/sys/vfs/hammer/Attic/hammer_alist.h,v 1.1 2007/10/12 18:57:45 dillon Exp $
+ */
+
+/*
+ * The structures below represent HAMMER's on-disk and in-memory A-List
+ * support.  An A-list is a radix tree bitmap allocator with hinting.  It
+ * is very fast and efficient.
+ *
+ * A-lists can be single-layered or multi-layered.  A single-layered
+ * A-list uses a radix 32 leaf and radix 16 internal nodes.  A multi-layered
+ * A-list uses a radix 16 leaf and radix 16 internal nodes.
+ *
+ * Multi-layered A-lists allow otherwise independent A-lists to be stacked
+ * to form a single A-list.
+ */
+
+/*
+ * This on-disk structure represents the actual information an A-list needs
+ * to store to disk.  Each A-list layer is made up of a linear array of
+ * meta-elements.  The number of elements needed is calculated recursively.
+ * A-lists contain an early-termination feature which allows an A-list's
+ * storage to scale to the actual number of blocks being managed, whether
+ * or not that count falls on a radix boundary.
+ *
+ * The first almeta structure in the array is used for housekeeping and is
+ * not part of the topology proper.  The second almeta structure is the
+ * 'root' meta.
+ */
+typedef struct hammer_almeta {
+       u_int32_t       bm_bitmap;
+       int32_t         bm_bighint;
+} *hammer_almeta_t;
+
+#define bm_alist_freeblks      bm_bitmap
+
+#define HAMMER_ALMETA_SIZE     8
+
+/*
+ * This in-memory structure specifies how an a-list is configured and
+ * can be shared by multiple live alists.
+ */
+typedef struct hammer_alist_config {
+       int32_t bl_blocks;      /* area of coverage */
+       int32_t bl_radix;       /* coverage radix */
+       int32_t bl_base_radix;  /* chain to other allocators */
+       int32_t bl_skip;        /* starting skip for linear layout */
+       int32_t bl_rootblks;    /* meta-blocks allocated for tree */
+       int32_t bl_terminal;    /* terminal alist, else layer recursion */
+       int     (*bl_radix_init)(void *info, int32_t blk, int32_t radix,
+                                       int32_t count);
+       int32_t (*bl_radix_alloc_fwd)(void *info, int32_t blk, int32_t radix,
+                                       int32_t count, int32_t *fullp);
+       int32_t (*bl_radix_alloc_rev)(void *info, int32_t blk, int32_t radix,
+                                       int32_t count, int32_t *fullp);
+       void    (*bl_radix_free)(void *info, int32_t freeBlk, int32_t count,
+                                       int32_t radix, int32_t blk,
+                                       int32_t *emptyp);
+       void    (*bl_radix_print)(void *info, int32_t blk, int32_t radix,
+                                       int tab);
+} *hammer_alist_config_t;
+
+/*
+ * This in-memory structure is needed for each live alist.
+ */
+typedef struct hammer_alist_live {
+       hammer_alist_config_t config;   /* a-list configuration */
+       hammer_almeta_t meta;           /* location of meta array */
+       void *info;                     /* chaining call info argument */
+} *hammer_alist_t;
+
+#define HAMMER_ALIST_META_RADIX        (sizeof(int32_t) * 4)   /* 16 */
+#define HAMMER_ALIST_BMAP_RADIX        (sizeof(int32_t) * 8)   /* 32 */
+#define HAMMER_ALIST_BLOCK_NONE        ((int32_t)-1)
+
+/*
+ * Hard-code some pre-calculated constants for managing varying numbers
+ * of blocks.  These are the number of meta-elements required.
+ */
+#define HAMMER_ALIST_METAELMS_256_1LYR 11
+#define HAMMER_ALIST_METAELMS_32K_1LYR 1095
+#define HAMMER_ALIST_METAELMS_16K_2LYR HAMMER_ALIST_METAELMS_32K_1LYR
+
+#define HAMMER_ALIST_METAELMS_4K_1LYR  139
+#define HAMMER_ALIST_METAELMS_4K_2LYR  275
+
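A minimal sketch of how the config/live/meta split above fits together, using
the hammer_alist_template()/hammer_alist_init() prototypes this commit adds.
One config is templated once and shared; each live a-list couples it to the
meta array it manages, here the buf_almeta[] array embedded in a
hammer_fsbuf_head (declared in hammer_disk.h below).  The all-allocated state
after init is assumed from the test harness, which frees the full range
immediately after creation; the helper itself is illustrative only:

	static int32_t
	fsbuf_alist_example(hammer_fsbuf_head_t head)
	{
		static struct hammer_alist_config fsbuf_cfg;
		struct hammer_alist_live live;
		int32_t blk;

		if (fsbuf_cfg.bl_blocks == 0) {
			/* base_radix 1: terminal a-list, leaves are plain bitmaps */
			hammer_alist_template(&fsbuf_cfg, HAMMER_FSBUF_MAXBLKS, 1,
					      HAMMER_FSBUF_METAELMS);
		}

		/* couple the shared config to this buffer's embedded meta array */
		live.config = &fsbuf_cfg;
		live.meta = head->buf_almeta;
		live.info = NULL;

		hammer_alist_init(live.config, live.meta);	/* assumed all-allocated */
		hammer_alist_free(&live, 0, HAMMER_FSBUF_MAXBLKS);
		blk = hammer_alist_alloc(&live, 4);	/* power-of-2 sized and aligned */
		return(blk);
	}
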
index 4da07e4..be6aebe 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.1 2007/10/10 19:37:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.2 2007/10/12 18:57:45 dillon Exp $
  */
 
 /*
  */
 
 /*
- * Common base for all B-Tree element types (40 bytes)
+ * Common base for all B+Tree element types (40 bytes)
+ *
+ * Note that obj_type is set to the object type when the record represents
+ * an inode or a cluster range.  A cluster range is special in that its
+ * B-Tree elements represent a range within the B-Tree, inclusive of the
+ * rec_type field, so obj_type must be used to detect cluster range entries.
  */
 struct hammer_base_elm {
        int64_t obj_id;         /* 00 object record is associated with */
@@ -56,27 +61,15 @@ struct hammer_base_elm {
        hammer_tid_t delete_tid; /* 18 transaction id for record update/del */
 
        u_int16_t rec_type;     /* 20 */
-       u_int16_t obj_type;     /* 22 (only if rec_type is an inode) */
-       u_int32_t reserved01;   /* 24 */
+       u_int16_t obj_type;     /* 22 (special) */
+       int32_t subtree_offset; /* 24 (B+Tree recursion) */
                                /* 28 */
 };
 
-/*
- * Internal B-Tree element (40 + 16 = 56 bytes)
- */
-struct hammer_internal_elm {
-       struct hammer_base_elm base;
-       int32_t subtree_offset;
-       u_int32_t reserved01;
-       u_int32_t reserved02;
-       u_int32_t reserved03;
-
-};
-
 /*
  * Leaf B-Tree element (40 + 16 = 56 bytes)
  */
-struct hammer_leaf_elm {
+struct hammer_record_elm {
        struct hammer_base_elm base;
        int32_t rec_offset;
        int32_t data_offset;
@@ -89,9 +82,10 @@ struct hammer_leaf_elm {
  */
 struct hammer_cluster_elm {
        struct hammer_base_elm base;
-       u_int64_t clu_id;               /* cluster id (sanity check) */
-       u_int32_t vol_no;
-       u_int32_t cluster_no;
+       int32_t rec_offset;             /* cluster recursion record */
+       u_int32_t verifier;             /* low 32 bits of target clu_id */
+       int32_t vol_no;
+       int32_t cluster_no;
 };
 
 /*
@@ -99,8 +93,7 @@ struct hammer_cluster_elm {
  */
 union hammer_btree_elm {
        struct hammer_base_elm base;
-       struct hammer_internal_elm internal;
-       struct hammer_leaf_elm leaf;
+       struct hammer_record_elm record;
        struct hammer_cluster_elm cluster;
 };
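A minimal sketch of the obj_type test implied by the note above the base
element: cluster push-down elements carry ordinary rec_type values, so only
obj_type identifies them.  It assumes the HAMMER_OBJTYPE_CLUSTER_BEG/END
values this commit adds to hammer_disk.h:

	static __inline int
	hammer_btree_elm_is_cluster(union hammer_btree_elm *elm)
	{
		/* cluster range entries are flagged via obj_type, not rec_type */
		return (elm->base.obj_type == HAMMER_OBJTYPE_CLUSTER_BEG ||
			elm->base.obj_type == HAMMER_OBJTYPE_CLUSTER_END);
	}
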
 
@@ -118,7 +111,7 @@ struct hammer_btree_node {
         * B-Tree node header (56 bytes)
         */
        int32_t         count;  /* number of elements in B-Tree node */
-       u_int32_t       parent; /* parent B-Tree node in current cluster */
+       int32_t         parent; /* parent B-Tree node in current cluster */
        u_int32_t       reserved[12];
 
        /*
similarity index 65%
rename from sys/vfs/hammer/hammerfs.h
rename to sys/vfs/hammer/hammer_disk.h
index abc3f86..b9eb214 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/Attic/hammerfs.h,v 1.1 2007/10/10 19:37:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.1 2007/10/12 18:57:45 dillon Exp $
  */
 
 #ifndef _SYS_UUID_H_
@@ -53,7 +53,9 @@
  * A HAMMER filesystem may span multiple volumes.
  *
  * A HAMMER filesystem uses a 16K filesystem buffer size.  All filesystem
- * I/O is done in multiples of 16K.
+ * I/O is done in multiples of 16K.  Most buffer-sized headers such as those
+ * used by volumes, super-clusters, clusters, and basic filesystem buffers
+ * use fixed-size A-lists which are heavily dependent on HAMMER_BUFSIZE.
  */
 #define HAMMER_BUFSIZE 16384
 #define HAMMER_BUFMASK (HAMMER_BUFSIZE - 1)
  */
 typedef u_int64_t hammer_tid_t;
 
-/*
- * Storage allocations are managed in powers of 2 with a hinted radix tree
- * based in volume and cluster headers.  The tree is not necessarily
- * contained within the header and may recurse into other storage elements.
- *
- * The allocator's basic storage element is the hammer_almeta structure
- * which is laid out recursively in a buffer.  Allocations are driven using
- * a template called hammer_alist which is constructed in memory and NOT
- * stored in the filesystem.
- */
-struct hammer_almeta {
-       u_int32_t       bm_bitmap;
-       int32_t         bm_bighint;
-};
-
-#define HAMMER_ALMETA_SIZE     8
-
-struct hammer_alist {
-       int32_t bl_blocks;      /* area of coverage */
-       int32_t bl_radix;       /* coverage radix */
-       int32_t bl_skip;        /* starting skip for linear layout */
-       int32_t bl_free;        /* number of free blocks */
-       int32_t bl_rootblks;    /* meta-blocks allocated for tree */
-};
-
-typedef struct hammer_almeta   hammer_almeta_t;
-typedef struct hammer_alist    *hammer_alist_t;
-
-#define HAMMER_ALIST_META_RADIX        (sizeof(u_int32_t) * 4)   /* 16 */
-#define HAMMER_ALIST_BMAP_RADIX        (sizeof(u_int32_t) * 8)   /* 32 */
-#define HAMMER_ALIST_BLOCK_NONE        ((int32_t)-1)
-#define HAMMER_ALIST_FORWARDS  0x0001
-#define HAMMER_ALIST_BACKWARDS 0x0002
-
 /*
  * Most HAMMER data structures are embedded in 16K filesystem buffers.
  * All filesystem buffers except those designated as pure-data buffers
@@ -111,19 +79,24 @@ typedef struct hammer_alist        *hammer_alist_t;
 
 #define HAMMER_FSBUF_HEAD_SIZE 128
 #define HAMMER_FSBUF_MAXBLKS   256
-#define HAMMER_FSBUF_METAELMS  10      /* 10 elements needed for 256 blks */
+#define HAMMER_FSBUF_METAELMS  HAMMER_ALIST_METAELMS_256_1LYR  /* 11 */
 
 struct hammer_fsbuf_head {
        u_int64_t buf_type;
        u_int32_t buf_crc;
        u_int32_t buf_reserved07;
-       u_int32_t reserved[8];
+       u_int32_t reserved[6];
        struct hammer_almeta buf_almeta[HAMMER_FSBUF_METAELMS];
 };
 
 typedef struct hammer_fsbuf_head *hammer_fsbuf_head_t;
 
+/*
+ * Note: Pure-data buffers contain pure data and have no buf_type.
+ * Piecemeal data buffers do have a header and use HAMMER_FSBUF_DATA.
+ */
 #define HAMMER_FSBUF_VOLUME    0xC8414D4DC5523031ULL   /* HAMMER01 */
+#define HAMMER_FSBUF_SUPERCL   0xC8414D52C3555052ULL   /* HAMRSUPR */
 #define HAMMER_FSBUF_CLUSTER   0xC8414D52C34C5553ULL   /* HAMRCLUS */
 #define HAMMER_FSBUF_RECORDS   0xC8414D52D2454353ULL   /* HAMRRECS */
 #define HAMMER_FSBUF_BTREE     0xC8414D52C2545245ULL   /* HAMRBTRE */
@@ -140,20 +113,38 @@ typedef struct hammer_fsbuf_head *hammer_fsbuf_head_t;
  * HAMMER Volume header
  *
  * A HAMMER filesystem is built from any number of block devices.  Each block
- * device contains a volume header followed by however many clusters
- * fit in the volume.  Clusters cannot be migrated but the data they contain
- * can, so HAMMER can use a truncated cluster for any extra space at the
- * end of the volume.
+ * device contains a volume header followed by however many super-clusters
+ * and clusters fit into the volume.  Clusters cannot be migrated but the
+ * data they contain can, so HAMMER can use a truncated cluster for any
+ * extra space at the end of the volume.
  *
  * The volume containing the root cluster is designated as the master volume.
  * The root cluster designation can be moved to any volume.
  *
  * The volume header takes up an entire 16K filesystem buffer and includes
- * an A-list to manage the clusters contained within the volume (up to 32768).
- * With 512M clusters a volume will be limited to 16TB.
+ * a one- or two-layer A-list to manage the clusters making up the volume.
+ * A volume containing up to 32768 clusters (2TB) can be managed with a
+ * single-layer A-list.  A two-layer A-list is capable of managing up
+ * to 16384 super-clusters, with each super-cluster containing 32768
+ * clusters (32768 TB per volume total).  The number of volumes is limited
+ * to 32768, but it only takes 512 to fill out a 64-bit address space, so
+ * for all intents and purposes the filesystem has no limits.
+ *
+ * Cluster addressing within a volume depends on whether a single- or
+ * dual-layer A-list is used.  If a dual-layer A-list is used, a 16K
+ * super-cluster buffer is needed for every 32768 clusters in the volume.
+ * However, because the A-list's hinting is grouped in multiples of 16
+ * we group 16 super-cluster buffers together (starting just after the
+ * volume header), followed by 32768x16 clusters, and repeat.
+ *
+ * NOTE: A 32768-element single-layer A-list and a 16384-element dual-layer
+ * A-list are the same size.
  */
-#define HAMMER_VOL_MAXCLUSTERS 32768
-#define HAMMER_VOL_METAELMS    1094
+#define HAMMER_VOL_MAXCLUSTERS         32768   /* 1-layer */
+#define HAMMER_VOL_MAXSUPERCLUSTERS    16384   /* 2-layer */
+#define HAMMER_VOL_SUPERCLUSTER_GROUP  16
+#define HAMMER_VOL_METAELMS_1LYR       HAMMER_ALIST_METAELMS_32K_1LYR
+#define HAMMER_VOL_METAELMS_2LYR       HAMMER_ALIST_METAELMS_16K_2LYR
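A small sketch of the indexing the grouped layout above implies.  It only
derives indices from a cluster number and deliberately ignores byte offsets,
which additionally depend on vol_clsize:

	static void
	hammer_clu_to_supercl(int32_t cluster_no,
			      int32_t *scl_nop, int32_t *scl_groupp)
	{
		/* each super-cluster buffer manages HAMMER_VOL_MAXCLUSTERS clusters */
		*scl_nop = cluster_no / HAMMER_VOL_MAXCLUSTERS;
		/* super-cluster buffers are laid out in groups of 16 */
		*scl_groupp = *scl_nop / HAMMER_VOL_SUPERCLUSTER_GROUP;
	}
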
 
 struct hammer_volume_ondisk {
        struct hammer_fsbuf_head head;
@@ -191,24 +182,82 @@ struct hammer_volume_ondisk {
 
        char    reserved[1024];
 
-       hammer_almeta_t vol_almeta[HAMMER_VOL_METAELMS];
+       /*
+        * Meta elements for the volume header's A-list, which is either a
+        * 1-layer A-list capable of managing 32768 clusters, or a 2-layer
+        * A-list capable of managing 16384 super-clusters (each of which
+        * can handle 32768 clusters).
+        */
+       union {
+               struct hammer_almeta super[HAMMER_VOL_METAELMS_2LYR];
+               struct hammer_almeta normal[HAMMER_VOL_METAELMS_1LYR];
+       } vol_almeta;
        u_int32_t       vol0_bitmap[1024];
 };
 
-#define HAMMER_VOLF_VALID      0x0001  /* valid entry */
-#define HAMMER_VOLF_OPEN       0x0002  /* volume is open */
+#define HAMMER_VOLF_VALID              0x0001  /* valid entry */
+#define HAMMER_VOLF_OPEN               0x0002  /* volume is open */
+#define HAMMER_VOLF_SUPERCL_ENABLE     0x0004  /* enable supercluster layer */
+#define HAMMER_VOLF_SUPERCL_RESERVE    0x0008  /* supercluster layout */
+
+/*
+ * HAMMER Super-cluster header
+ *
+ * A super-cluster is used to increase the maximum size of a volume.
+ * HAMMER's volume header can manage up to 32768 direct clusters or
+ * 16384 super-clusters.  Each super-cluster (which is basically just
+ * a 16K filesystem buffer) can manage up to 32768 clusters.  So adding
+ * a super-cluster layer allows a HAMMER volume to be sized upwards of
+ * around 32768TB instead of 2TB.
+ *
+ * Any volume initially formatted to be over 32G reserves space for the layer
+ * but the layer is only enabled if the volume exceeds 2TB.
+ */
+#define HAMMER_SUPERCL_METAELMS                HAMMER_ALIST_METAELMS_32K_1LYR
+
+struct hammer_supercl_ondisk {
+       struct hammer_fsbuf_head head;
+       uuid_t  vol_fsid;       /* identify filesystem - sanity check */
+       uuid_t  vol_fstype;     /* identify filesystem type - sanity check */
+       int32_t reserved[1024];
+
+       struct hammer_almeta scl_meta[HAMMER_SUPERCL_METAELMS];
+};
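For reference, the capacity figures quoted in these comments work out as
follows; the 64MB maximum cluster size is taken from the cluster header
comment below, and the constants are purely illustrative:

	#define EXAMPLE_MAX_CLUSTER_BYTES	(64ULL << 20)	/* 64MB */

	/* 1-layer volume: 32768 clusters x 64MB = 2TB */
	static const u_int64_t example_vol_1lyr =
		32768ULL * EXAMPLE_MAX_CLUSTER_BYTES;

	/* 2-layer volume: 16384 super-clusters x 32768 clusters x 64MB = 32768TB */
	static const u_int64_t example_vol_2lyr =
		16384ULL * 32768ULL * EXAMPLE_MAX_CLUSTER_BYTES;
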
 
 /*
  * HAMMER Cluster header
  *
- * The cluster header contains all the information required to identify a
- * cluster, locate critical information areas within the cluster, and
- * to manage space within the cluster.
+ * A cluster is limited to 64MB and is made up of 4096 16K filesystem
+ * buffers.  The cluster header contains four A-lists to manage these
+ * buffers.
+ *
+ * master_alist - This is a non-layered A-list which manages pure-data
+ *               allocations and allocations on behalf of other A-lists.
+ *
+ * btree_alist  - This is a layered A-list which manages filesystem buffers
+ *               containing B-Tree nodes.
  *
- * A Cluster contains pure data, incremental data, b-tree nodes, and records.
+ * record_alist - This is a layered A-list which manages filesystem buffers
+ *               containing records.
+ *
+ * mdata_alist  - This is a layered A-list which manages filesystem buffers
+ *               containing piecemeal record data.
+ * 
+ * General storage management works like this:  All the A-lists except the
+ * master start in an all-allocated state.  Now let's say you wish to
+ * allocate a B-Tree node out of the btree_alist.  If the allocation fails
+ * you allocate a pure data block out of master_alist and then free that
+ * block in btree_alist, thereby assigning more space to the btree_alist,
+ * and then retry your allocation out of the btree_alist.  In the reverse
+ * direction, filesystem buffers can be garbage collected back to
+ * master_alist simply by doing whole-buffer allocations in btree_alist
+ * and then freeing the space in master_alist.  The whole-buffer-allocation
+ * approach to garbage collection works because A-list allocations are
+ * always power-of-2 sized and aligned.  (The pattern is sketched below.)
  */
-#define HAMMER_CLU_MAXBUFFERS  32768
-#define HAMMER_CLU_METAELMS    1094
+#define HAMMER_CLU_MAXBUFFERS          4096
+#define HAMMER_CLU_MASTER_METAELMS     HAMMER_ALIST_METAELMS_4K_1LYR
+#define HAMMER_CLU_SLAVE_METAELMS      HAMMER_ALIST_METAELMS_4K_2LYR
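A minimal sketch of the borrow-and-retry pattern described above.  The
blks_per_buffer parameter stands in for the slave list's per-buffer radix
(how many B-Tree nodes, records, or data blocks one 16K buffer holds) and is
an assumption, as is driving both lists through the generic
hammer_alist_alloc()/hammer_alist_free() interfaces:

	static int32_t
	slave_alloc_example(hammer_alist_t master, hammer_alist_t slave,
			    int32_t count, int32_t blks_per_buffer)
	{
		int32_t elm_no;
		int32_t buf_no;

		elm_no = hammer_alist_alloc(slave, count);
		if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
			/* borrow one whole 16K buffer from the master a-list */
			buf_no = hammer_alist_alloc(master, 1);
			if (buf_no == HAMMER_ALIST_BLOCK_NONE)
				return(HAMMER_ALIST_BLOCK_NONE);
			/* donate the buffer's block range to the slave and retry */
			hammer_alist_free(slave, buf_no * blks_per_buffer,
					  blks_per_buffer);
			elm_no = hammer_alist_alloc(slave, count);
		}
		return(elm_no);
	}

The garbage-collection direction is symmetric: allocate the buffer's whole
range back out of the slave list and free the corresponding single block in
master_alist.
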
 
 struct hammer_cluster_ondisk {
        struct hammer_fsbuf_head head;
@@ -238,37 +287,32 @@ struct hammer_cluster_ondisk {
        u_int32_t idx_reserved03;
 
        /* 
-        * Specify the range of information stored in this cluster.  These
-        * structures match the B-Tree elements in our parent cluster
-        * (if any) that point to us.  Note that clu_objend is
-        * range-inclusive, not range-exclusive so e.g. 0-1023 instead
-        * of 0-1024.
+        * Specify the range of information stored in this cluster as two
+        * B-Tree elements.  These elements exist as separate records that
+        * point to us in the parent cluster's B-Tree.
+        *
+        * Note that clu_btree_end is range-inclusive, not range-exclusive,
+        * i.e. 0-1023 instead of 0-1024.
         */
-       int64_t clu_parent;             /* parent vol & cluster */
-       struct hammer_base_elm clu_objstart;
-       struct hammer_base_elm clu_objend;
+       struct hammer_base_elm clu_btree_beg;
+       struct hammer_base_elm clu_btree_end;
 
        /*
-        * The root node of the cluster's B-Tree is embedded in the
-        * cluster header.  The node is 504 bytes.
+        * The cluster's B-Tree root can change as a side effect of insertion
+        * and deletion operations so store an offset instead of embedding
+        * the root node.
         */
-       struct hammer_btree_node clu_btree_root;
+       int32_t         clu_btree_root;
+       int32_t         clu_btree_parent_vol_no;
+       int32_t         clu_btree_parent_clu_no;
+       hammer_tid_t    clu_btree_parent_clu_id;
 
-       /*
-        * HAMMER needs a separate bitmap to indicate which buffers are
-        * managed (contain a hammer_fsbuf_head).  Any buffers not so
-        * designated are either unused or contain pure data.
-        *
-        * synchronized_rec_id is the synchronization point for the
-        * cluster.  Any records with a greater or equal rec_id found
-        * when recovering a cluster are likely incomplete and will be
-        * ignored.
-        */
        u_int64_t synchronized_rec_id;
-       u_int32_t managed_buffers_bitmap[HAMMER_CLU_MAXBUFFERS/32];
 
-       char    reserved[1024];
-       hammer_almeta_t clu_almeta[HAMMER_CLU_METAELMS];
+       struct hammer_almeta clu_master_meta[HAMMER_CLU_MASTER_METAELMS];
+       struct hammer_almeta clu_btree_meta[HAMMER_CLU_SLAVE_METAELMS];
+       struct hammer_almeta clu_record_meta[HAMMER_CLU_SLAVE_METAELMS];
+       struct hammer_almeta clu_mdata_meta[HAMMER_CLU_SLAVE_METAELMS];
 };
 
 /*
@@ -308,10 +352,20 @@ struct hammer_base_record {
                                /* 40 */
 };
 
+/*
+ * Record types are fairly straightforward.  The B-Tree includes the record
+ * type in its index sort.
+ *
+ * In particular please note that it is possible to create a pseudo-
+ * filesystem within a HAMMER filesystem by creating a special object
+ * type within a directory.  Pseudo-filesystems are used as replication
+ * targets; even though they are built within a HAMMER filesystem they
+ * get their own obj_id space, which is what lets them serve as replication
+ * targets, and they look like a mount point to the system.
+ */
 #define HAMMER_RECTYPE_UNKNOWN         0
 #define HAMMER_RECTYPE_INODE           1       /* inode in obj_id space */
-#define HAMMER_RECTYPE_SLAVE           2       /* slave inode */
-#define HAMMER_RECTYPE_OBJZONE         3       /* subdivide obj_id space */
+#define HAMMER_RECTYPE_PSEUDO_INODE    2       /* pseudo filesystem */
 #define HAMMER_RECTYPE_DATA_CREATE     0x10
 #define HAMMER_RECTYPE_DATA_ZEROFILL   0x11
 #define HAMMER_RECTYPE_DATA_DELETE     0x12
@@ -330,8 +384,13 @@ struct hammer_base_record {
 #define HAMMER_OBJTYPE_REGFILE         2
 #define HAMMER_OBJTYPE_DBFILE          3
 #define HAMMER_OBJTYPE_FIFO            4
-#define HAMMER_OBJTYPE_DEVNODE         5
-#define HAMMER_OBJTYPE_SOFTLINK                6
+#define HAMMER_OBJTYPE_CDEV            5
+#define HAMMER_OBJTYPE_BDEV            6
+#define HAMMER_OBJTYPE_SOFTLINK                7
+#define HAMMER_OBJTYPE_PSEUDOFS                8       /* pseudo filesystem obj */
+
+#define HAMMER_OBJTYPE_CLUSTER_BEG     0x10
+#define HAMMER_OBJTYPE_CLUSTER_END     0x11
 
 /*
  * Generic full-sized record
@@ -421,7 +480,7 @@ struct hammer_entry_record {
 /*
  * Hammer rollup record
  */
-union hammer_record {
+union hammer_record_ondisk {
        struct hammer_base_record       base;
        struct hammer_generic_record    generic;
        struct hammer_inode_record      inode;
@@ -429,19 +488,19 @@ union hammer_record {
        struct hammer_entry_record      entry;
 };
 
-typedef union hammer_record *hammer_record_t;
+typedef union hammer_record_ondisk *hammer_record_ondisk_t;
 
 /*
  * Filesystem buffer for records
  */
 #define HAMMER_RECORD_NODES    \
        ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \
-       sizeof(union hammer_record))
+       sizeof(union hammer_record_ondisk))
 
 struct hammer_fsbuf_recs {
        struct hammer_fsbuf_head        head;
        char                            unused[32];
-       union hammer_record             recs[HAMMER_RECORD_NODES];
+       union hammer_record_ondisk      recs[HAMMER_RECORD_NODES];
 };
 
 /*
@@ -483,19 +542,24 @@ struct hammer_inode_data {
 
 #define HAMMER_INODE_DATA_VERSION      1
 
+/*
+ * Rollup various structures embedded as record data
+ */
+union hammer_data {
+       struct hammer_inode_data inode;
+};
+
+
 /*
  * Function library support available to kernel and userland
  */
-void hammer_alist_template(hammer_alist_t, int blocks, int maxmeta);
-void hammer_alist_init(hammer_alist_t bl, hammer_almeta_t *meta);
-int32_t hammer_alist_alloc(hammer_alist_t bl, hammer_almeta_t *meta,
-                       int32_t count);
-int32_t hammer_alist_alloc_rev(hammer_alist_t bl, hammer_almeta_t *meta,
-                       int32_t count);
+void hammer_alist_template(hammer_alist_config_t bl, int32_t blocks,
+                          int32_t base_radix, int32_t maxmeta);
+void hammer_alist_init(hammer_alist_config_t bl, hammer_almeta_t meta);
+int32_t hammer_alist_alloc(hammer_alist_t live, int32_t count);
+int32_t hammer_alist_alloc_rev(hammer_alist_t live, int32_t count);
 #if 0
-int32_t hammer_alist_alloc_from(hammer_alist_t bl, hammer_almeta_t *meta,
-                       int32_t count, int32_t start, int flags);
+int32_t hammer_alist_alloc_from(hammer_alist_t live, int32_t cnt, int32_t beg);
 #endif
-void hammer_alist_free(hammer_alist_t bl, hammer_almeta_t *meta,
-                       int32_t blkno, int32_t count);
+void hammer_alist_free(hammer_alist_t live, int32_t blkno, int32_t count);