From 8cd0a023d85e93ba91304273c892a533202c863b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 19 Nov 2007 00:53:40 +0000 Subject: [PATCH] HAMMER 3/many - more core infrastructure. * Add an in-memory B-Tree node abstraction * Add an in-memory record abstraction. * Put the B-Tree cursor code in its own source file. * Fill in more of the VOP code. * Do a major clean-up of all in-memory structures and some on-disk structures. All the major in-memory structures now use similarly named functions. * Move inter-cluster link from a B-Tree leaf node to a B-Tree internal node, giving us a left and right boundary to play with. This simplifies the algorithms by quite a bit. * Allow the B-Tree to be unbalanced by moving the sub-type from the B-Tree node header to the B-Tree element structure. * Revamp the I/O infrastructure, in particular allow B-Tree nodes to be held passively. * Implement a flexible B-Tree node cache. References into the B-Tree can be cached by inodes. If the related buffer is flushed by the system, the related cache pointers will be cleared. --- sbin/newfs_hammer/newfs_hammer.c | 31 +- sys/conf/files | 3 +- sys/vfs/hammer/Makefile | 6 +- sys/vfs/hammer/hammer.h | 290 +++- sys/vfs/hammer/hammer_btree.c | 2012 +++++++++++++-------------- sys/vfs/hammer/hammer_btree.h | 285 ++-- sys/vfs/hammer/hammer_cursor.c | 406 ++++++ sys/vfs/hammer/hammer_cursor.h | 106 ++ sys/vfs/hammer/hammer_disk.h | 47 +- sys/vfs/hammer/hammer_inode.c | 238 +++- sys/vfs/hammer/hammer_io.c | 63 +- sys/vfs/hammer/hammer_object.c | 163 ++- sys/vfs/hammer/hammer_ondisk.c | 1055 ++++++++++---- sys/vfs/hammer/hammer_subs.c | 105 +- sys/vfs/hammer/hammer_transaction.c | 6 +- sys/vfs/hammer/hammer_vfsops.c | 4 +- sys/vfs/hammer/hammer_vnops.c | 638 +++++++-- 17 files changed, 3614 insertions(+), 1844 deletions(-) create mode 100644 sys/vfs/hammer/hammer_cursor.c create mode 100644 sys/vfs/hammer/hammer_cursor.h diff --git a/sbin/newfs_hammer/newfs_hammer.c b/sbin/newfs_hammer/newfs_hammer.c index 970eff5925..56102c82bd 100644 --- a/sbin/newfs_hammer/newfs_hammer.c +++ b/sbin/newfs_hammer/newfs_hammer.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.5 2007/11/02 00:54:26 dillon Exp $ + * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.6 2007/11/19 00:53:39 dillon Exp $ */ #include "newfs_hammer.h" @@ -551,7 +551,8 @@ format_cluster(struct volume_info *vol, int isroot) */ ondisk->clu_btree_parent_vol_no = -1; ondisk->clu_btree_parent_clu_no = -1; - ondisk->clu_btree_parent_clu_id = -1; + ondisk->clu_btree_parent_offset = -1; + ondisk->clu_btree_parent_clu_gen = -1; /* * Cluster 0 is the root cluster. Set the B-Tree range for this @@ -560,6 +561,11 @@ format_cluster(struct volume_info *vol, int isroot) * Note that delete_tid for the ending range must be set to 0, * 0 indicates 'not deleted', aka 'the most recent'. See * hammer_btree_cmp() in sys/vfs/hammer/hammer_btree.c. + * + * The root cluster's key space represents the entire key space for + * the filesystem. The btree_end element appears to be inclusive + * only because we can't overflow our variables. It's actually + * non-inclusive... that is, it is a right-side boundary element. 
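+ *
+ * Illustrative note (a sketch based on the btree_search() code in
+ * sys/vfs/hammer/hammer_btree.c, not something newfs itself executes):
+ * a key is considered to fall within a cluster only when
+ *
+ *	hammer_btree_cmp(key, &clu_btree_beg) >= 0 &&
+ *	hammer_btree_cmp(key, &clu_btree_end) < 0
+ *
+ * which is why clu_btree_end behaves as a non-inclusive right-side
+ * boundary even though its fields are filled in with maximal values.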
*/ if (isroot) { ondisk->clu_btree_beg.obj_id = -0x8000000000000000LL; @@ -574,7 +580,7 @@ format_cluster(struct volume_info *vol, int isroot) ondisk->clu_btree_end.create_tid = 0xFFFFFFFFFFFFFFFFULL; ondisk->clu_btree_end.delete_tid = 0; /* special case */ ondisk->clu_btree_end.rec_type = 0xFFFFU; - ondisk->clu_btree_end.obj_type = 0xFFFFU; + ondisk->clu_btree_end.obj_type = 0; format_root(cluster); } @@ -595,10 +601,10 @@ format_root(struct cluster_info *cluster) int32_t btree_off; int32_t rec_off; int32_t data_off; - hammer_btree_node_t bnode; + hammer_node_ondisk_t bnode; union hammer_record_ondisk *rec; struct hammer_inode_data *idata; - hammer_btree_leaf_elm_t elm; + hammer_btree_elm_t elm; bnode = alloc_btree_element(cluster, &btree_off); rec = alloc_record_element(cluster, &rec_off); @@ -634,16 +640,15 @@ format_root(struct cluster_info *cluster) * Create the root of the B-Tree. The root is a leaf node so we * do not have to worry about boundary elements. */ - bnode->base.count = 1; - bnode->base.type = HAMMER_BTREE_LEAF_NODE; - bnode->base.subtype = 0; + bnode->count = 1; + bnode->type = HAMMER_BTREE_TYPE_LEAF; - elm = &bnode->leaf.elms[0]; + elm = &bnode->elms[0]; elm->base = rec->base.base; - elm->record.rec_offset = rec_off; - elm->record.data_offset = rec->base.data_offset; - elm->record.data_len = rec->base.data_len; - elm->record.data_crc = rec->base.data_crc; + elm->leaf.rec_offset = rec_off; + elm->leaf.data_offset = rec->base.data_offset; + elm->leaf.data_len = rec->base.data_len; + elm->leaf.data_crc = rec->base.data_crc; } void diff --git a/sys/conf/files b/sys/conf/files index 1aa013552b..9b40baa4ab 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,5 +1,5 @@ # $FreeBSD: src/sys/conf/files,v 1.340.2.137 2003/06/04 17:10:30 sam Exp $ -# $DragonFly: src/sys/conf/files,v 1.194 2007/11/18 17:53:01 pavalos Exp $ +# $DragonFly: src/sys/conf/files,v 1.195 2007/11/19 00:53:38 dillon Exp $ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and @@ -1144,6 +1144,7 @@ vfs/hammer/hammer_vfsops.c optional hammer vfs/hammer/hammer_vnops.c optional hammer vfs/hammer/hammer_subs.c optional hammer vfs/hammer/hammer_ondisk.c optional hammer +vfs/hammer/hammer_cursor.c optional hammer vfs/hammer/hammer_btree.c optional hammer vfs/hammer/hammer_io.c optional hammer vfs/hammer/hammer_transaction.c optional hammer diff --git a/sys/vfs/hammer/Makefile b/sys/vfs/hammer/Makefile index 9d0068c0d5..aac8b9d94e 100644 --- a/sys/vfs/hammer/Makefile +++ b/sys/vfs/hammer/Makefile @@ -1,11 +1,11 @@ # -# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.3 2007/11/07 00:43:24 dillon Exp $ +# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.4 2007/11/19 00:53:40 dillon Exp $ KMOD= hammer SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \ hammer_subs.c hammer_ondisk.c hammer_io.c \ - hammer_btree.c hammer_transaction.c hammer_alist.c \ - hammer_object.c hammer_transaction.c hammer_spike.c + hammer_cursor.c hammer_btree.c hammer_transaction.c \ + hammer_alist.c hammer_object.c hammer_spike.c NOMAN= diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 9921d8fbd8..2fffec3582 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.5 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.6 2007/11/19 00:53:40 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -49,10 +49,10 @@ #include #include #include +#include #include #include - #include "hammer_alist.h" #include "hammer_disk.h" #include "hammer_mount.h" @@ -80,11 +80,14 @@ struct hammer_transaction { hammer_tid_t tid; }; +typedef struct hammer_transaction *hammer_transaction_t; + /* * HAMMER locks */ struct hammer_lock { int refs; + int lockcount; int wanted; struct thread *locktd; }; @@ -92,7 +95,7 @@ struct hammer_lock { static __inline int hammer_islocked(struct hammer_lock *lock) { - return(lock->refs > 0); + return(lock->lockcount != 0); } static __inline int @@ -102,46 +105,101 @@ hammer_islastref(struct hammer_lock *lock) } /* - * Structures used to internally represent an inode + * Structure used to represent an inode in-memory. + * + * The record and data associated with an inode may be out of sync with + * the disk (xDIRTY flags), or not even on the disk at all (ONDISK flag + * clear). + * + * An inode may also hold a cache of unsynchronized records, used for + * database and directories only. Unsynchronized regular file data is + * stored in the buffer cache. + * + * NOTE: A file which is created and destroyed within the initial + * synchronization period can wind up not doing any disk I/O at all. + * + * Finally, an inode may cache numerous disk-referencing B-Tree cursors. */ struct hammer_ino_rb_tree; struct hammer_inode; RB_HEAD(hammer_ino_rb_tree, hammer_inode); RB_PROTOTYPEX(hammer_ino_rb_tree, INFO, hammer_inode, rb_node, - hammer_ino_rb_compare, hammer_inode_info_t); + hammer_ino_rb_compare, hammer_inode_info_t); + +struct hammer_rec_rb_tree; +struct hammer_record; +RB_HEAD(hammer_rec_rb_tree, hammer_record); +RB_PROTOTYPEX(hammer_rec_rb_tree, INFO, hammer_record, rb_node, + hammer_rec_rb_compare, hammer_base_elm_t); + +TAILQ_HEAD(hammer_node_list, hammer_node); struct hammer_inode { RB_ENTRY(hammer_inode) rb_node; u_int64_t obj_id; /* (key) object identifier */ hammer_tid_t obj_asof; /* (key) snapshot transid or 0 */ - hammer_tid_t obj_lasttid; /* last modified tid (for fsync) */ + hammer_tid_t last_tid; /* last modified tid (for fsync) */ struct hammer_mount *hmp; int flags; struct vnode *vp; struct lockf advlock; + struct hammer_lock lock; struct hammer_inode_record ino_rec; struct hammer_inode_data ino_data; - struct hammer_lock lock; + struct hammer_rec_rb_tree rec_tree; /* red-black record tree */ + struct hammer_node *cache; /* cached B-Tree node shortcut */ }; +typedef struct hammer_inode *hammer_inode_t; + #define VTOI(vp) ((struct hammer_inode *)(vp)->v_data) -#define HAMMER_INODE_DDIRTY 0x0001 -#define HAMMER_INODE_RDIRTY 0x0002 -#define HAMMER_INODE_ITIMES 0x0004 /* mtime or atime modified */ +#define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */ +#define HAMMER_INODE_RDIRTY 0x0002 /* in-memory ino_rec is dirty */ +#define HAMMER_INODE_ITIMES 0x0004 /* in-memory mtime/atime modified */ +#define HAMMER_INODE_ONDISK 0x0010 /* inode is on-disk (else not yet) */ + +#define HAMMER_MAX_INODE_CURSORS 4 /* - * Structures used to internally represent a volume and a cluster + * Structure used to represent an unsynchronized record in-memory. This + * structure is orgranized in a per-inode RB-tree. 
If the inode is not + * on disk then neither are any records and the in-memory record tree + * represents the entire contents of the inode. If the inode is on disk + * then the on-disk B-Tree is scanned in parallel with the in-memory + * RB-Tree to synthesize the current state of the file. + * + * Only current (delete_tid == 0) unsynchronized records are kept in-memory. */ +struct hammer_record { + RB_ENTRY(hammer_record) rb_node; + hammer_tid_t last_tid; + struct hammer_inode *ip; + union hammer_record_ondisk rec; + union hammer_data_ondisk *data; + int32_t data_len; + int flags; +}; +typedef struct hammer_record *hammer_record_t; + +#define HAMMER_RECF_ALLOCDATA 0x0001 +#define HAMMER_RECF_ONRBTREE 0x0002 +#define HAMMER_RECF_DELETION 0x0004 /* placemark a deletion */ + +/* + * Structures used to internally represent a volume and a cluster + */ struct hammer_volume; struct hammer_cluster; struct hammer_supercl; struct hammer_buffer; +struct hammer_node; RB_HEAD(hammer_vol_rb_tree, hammer_volume); RB_HEAD(hammer_clu_rb_tree, hammer_cluster); RB_HEAD(hammer_scl_rb_tree, hammer_supercl); RB_HEAD(hammer_buf_rb_tree, hammer_buffer); +RB_HEAD(hammer_nod_rb_tree, hammer_node); RB_PROTOTYPE2(hammer_vol_rb_tree, hammer_volume, rb_node, hammer_vol_rb_compare, int32_t); @@ -151,6 +209,8 @@ RB_PROTOTYPE2(hammer_scl_rb_tree, hammer_supercl, rb_node, hammer_scl_rb_compare, int32_t); RB_PROTOTYPE2(hammer_buf_rb_tree, hammer_buffer, rb_node, hammer_buf_rb_compare, int32_t); +RB_PROTOTYPE2(hammer_nod_rb_tree, hammer_node, rb_node, + hammer_nod_rb_compare, int32_t); /* * IO management - embedded at the head of various in-memory structures @@ -176,8 +236,10 @@ struct hammer_io { u_int released : 1; /* bp released (w/ B_LOCKED set) */ }; +typedef struct hammer_io *hammer_io_t; + /* - * In-memory volume + * In-memory volume representing on-disk buffer */ struct hammer_volume { struct hammer_io io; @@ -195,8 +257,10 @@ struct hammer_volume { int vol_flags; }; +typedef struct hammer_volume *hammer_volume_t; + /* - * In-memory super-cluster + * In-memory super-cluster representing on-disk buffer */ struct hammer_supercl { struct hammer_io io; @@ -207,8 +271,13 @@ struct hammer_supercl { int32_t scl_no; }; +typedef struct hammer_supercl *hammer_supercl_t; + /* - * In-memory cluster + * In-memory cluster representing on-disk buffer + * + * The cluster's indexing range is cached in hammer_cluster, separate + * from the ondisk info in order to allow cursors to point to it. */ struct hammer_cluster { struct hammer_io io; @@ -220,22 +289,64 @@ struct hammer_cluster { struct hammer_alist_live alist_btree; struct hammer_alist_live alist_record; struct hammer_alist_live alist_mdata; + struct hammer_nod_rb_tree rb_nods_root; /* cursors in cluster */ + struct hammer_base_elm clu_btree_beg; /* copy of on-disk info */ + struct hammer_base_elm clu_btree_end; /* copy of on-disk info */ int32_t clu_no; }; +typedef struct hammer_cluster *hammer_cluster_t; + /* - * In-memory buffer (other then volume, super-cluster, or cluster) + * In-memory buffer (other then volume, super-cluster, or cluster), + * representing an on-disk buffer. 
*/ struct hammer_buffer { struct hammer_io io; RB_ENTRY(hammer_buffer) rb_node; hammer_fsbuf_ondisk_t ondisk; - struct hammer_cluster *cluster; struct hammer_volume *volume; - struct hammer_alist_live alist; + struct hammer_cluster *cluster; int32_t buf_no; + u_int64_t buf_type; + struct hammer_alist_live alist; + struct hammer_node_list clist; + struct hammer_node *save_scan; }; +typedef struct hammer_buffer *hammer_buffer_t; + +/* + * In-memory B-Tree node, representing an on-disk B-Tree node. + * + * This is a hang-on structure which is backed by a hammer_buffer, + * indexed by a hammer_cluster, and used for fine-grained locking of + * B-Tree nodes in order to properly control lock ordering. A hammer_buffer + * can contain multiple nodes representing wildly disassociated portions + * of the B-Tree so locking cannot be done on a buffer-by-buffer basis. + * + * This structure uses a cluster-relative index to reduce the number + * of layers required to access it, and also because all on-disk B-Tree + * references are cluster-relative offsets. + */ +struct hammer_node { + struct hammer_lock lock; /* node-by-node lock */ + TAILQ_ENTRY(hammer_node) entry; /* per-buffer linkage */ + RB_ENTRY(hammer_node) rb_node; /* per-cluster linkage */ + int32_t node_offset; /* cluster-rel offset */ + struct hammer_cluster *cluster; + struct hammer_buffer *buffer; /* backing buffer */ + hammer_node_ondisk_t ondisk; /* ptr to on-disk structure */ + struct hammer_node **cache1; /* passive cache(s) */ + struct hammer_node **cache2; +}; + +typedef struct hammer_node *hammer_node_t; + +/* + * Common I/O management structure - embedded in in-memory structures + * which are backed by filesystem buffers. + */ union hammer_io_structure { struct hammer_io io; struct hammer_volume volume; @@ -246,6 +357,8 @@ union hammer_io_structure { #define HAMFS_CLUSTER_DIRTY 0x0001 +#include "hammer_cursor.h" + /* * Internal hammer mount data structure */ @@ -256,16 +369,15 @@ struct hammer_mount { struct hammer_vol_rb_tree rb_vols_root; struct hammer_volume *rootvol; struct hammer_cluster *rootcl; - /* struct hammer_volume *cache_volume */ - /* struct hammer_cluster *cache_cluster */ - /* struct hammer_buffer *cache_buffer */ char *zbuf; /* HAMMER_BUFSIZE bytes worth of all-zeros */ uuid_t fsid; udev_t fsid_udev; u_int32_t namekey_iterator; - hammer_tid_t last_tid; + hammer_tid_t last_tid; /* tid for transaction id */ + hammer_tid_t last_ino; /* inode creation iterator */ }; +typedef struct hammer_mount *hammer_mount_t; #endif @@ -286,68 +398,104 @@ int hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp); int hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp); -struct hammer_inode *hammer_get_inode(struct hammer_mount *hmp, +struct hammer_inode *hammer_get_inode(hammer_mount_t hmp, u_int64_t obj_id, int *errorp); -void hammer_lock_inode(struct hammer_inode *ip); void hammer_put_inode(struct hammer_inode *ip); void hammer_put_inode_ref(struct hammer_inode *ip); -int hammer_unload_inode(struct hammer_inode *ip, void *data __unused); -int hammer_unload_volume(struct hammer_volume *volume, void *data __unused); -int hammer_load_volume(struct hammer_mount *hmp, const char *volname); - -void hammer_lock(struct hammer_lock *lock); +int hammer_unload_inode(hammer_inode_t ip, void *data __unused); +int hammer_unload_volume(hammer_volume_t volume, void *data __unused); +int hammer_install_volume(hammer_mount_t hmp, const char *volname); + +hammer_record_ondisk_t + hammer_ip_first(hammer_cursor_t cursor, 
hammer_inode_t ip); +hammer_record_ondisk_t + hammer_ip_next(hammer_cursor_t cursor); +int hammer_ip_resolve_data(hammer_cursor_t cursor); +hammer_record_t + hammer_alloc_ip_record(struct hammer_transaction *trans, + hammer_inode_t ip); +void hammer_free_ip_record(hammer_record_t record); + +int hammer_cursor_up(hammer_cursor_t cursor); +int hammer_cursor_toroot(hammer_cursor_t cursor); +int hammer_cursor_down(hammer_cursor_t cursor); + +void hammer_lock_ex(struct hammer_lock *lock); +int hammer_lock_ex_try(struct hammer_lock *lock); +void hammer_lock_sh(struct hammer_lock *lock); void hammer_unlock(struct hammer_lock *lock); void hammer_ref(struct hammer_lock *lock); void hammer_unref(struct hammer_lock *lock); -void hammer_ref_to_lock(struct hammer_lock *lock); -void hammer_lock_to_ref(struct hammer_lock *lock); +void hammer_downgrade(struct hammer_lock *lock); + u_int32_t hammer_to_unix_xid(uuid_t *uuid); -void hammer_to_timespec(u_int64_t hammerts, struct timespec *ts); +void hammer_guid_to_uuid(uuid_t *uuid, u_int32_t guid); +void hammer_to_timespec(hammer_tid_t tid, struct timespec *ts); +hammer_tid_t hammer_timespec_to_transid(struct timespec *ts); + enum vtype hammer_get_vnode_type(u_int8_t obj_type); u_int8_t hammer_get_obj_type(enum vtype vtype); int64_t hammer_directory_namekey(void *name, int len); -int hammer_btree_lookup(hammer_btree_info_t info, - hammer_base_elm_t key, int flags); -int hammer_btree_extract(hammer_btree_info_t info, int flags); -int hammer_btree_iterate(hammer_btree_cursor_t cursor, - hammer_base_elm_t key); -int hammer_btree_insert(hammer_btree_info_t info, - hammer_btree_leaf_elm_t elm); -int hammer_btree_delete(hammer_btree_info_t info, hammer_base_elm_t key); -int hammer_btree_cursor_init(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster); -void hammer_btree_cursor_done(hammer_btree_cursor_t cusor); -int hammer_btree_info_init(hammer_btree_info_t info, - struct hammer_cluster *cluster); -void hammer_btree_info_done(hammer_btree_info_t info); +int hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp); +int hammer_init_cursor_ip(hammer_cursor_t cursor, hammer_inode_t ip); +void hammer_done_cursor(hammer_cursor_t cursor); + +int hammer_btree_lookup(hammer_cursor_t cursor); +int hammer_btree_extract(hammer_cursor_t cursor, int flags); +int hammer_btree_iterate(hammer_cursor_t cursor); +int hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm); +int hammer_btree_delete(hammer_cursor_t cursor); +int hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2); void *hammer_bread(struct hammer_cluster *cluster, int32_t cloff, - u_int64_t buf_type, - int *errorp, struct hammer_buffer **bufferp); + u_int64_t buf_type, int *errorp, + struct hammer_buffer **bufferp); + +hammer_volume_t hammer_get_root_volume(hammer_mount_t hmp, int *errorp); +hammer_cluster_t hammer_get_root_cluster(hammer_mount_t hmp, int *errorp); -struct hammer_volume *hammer_get_volume(struct hammer_mount *hmp, +hammer_volume_t hammer_get_volume(hammer_mount_t hmp, int32_t vol_no, int *errorp); -struct hammer_supercl *hammer_get_supercl(struct hammer_volume *volume, +hammer_supercl_t hammer_get_supercl(hammer_volume_t volume, int32_t scl_no, int *errorp, int isnew); -struct hammer_cluster *hammer_get_cluster(struct hammer_volume *volume, +hammer_cluster_t hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, int *errorp, int isnew); -struct hammer_buffer *hammer_get_buffer(struct hammer_cluster *cluster, - int32_t buf_no, int64_t buf_type, int 
*errorp); +hammer_buffer_t hammer_get_buffer(hammer_cluster_t cluster, + int32_t buf_no, u_int64_t buf_type, int *errorp); + +int hammer_ref_cluster(hammer_cluster_t cluster); +int hammer_ref_buffer(hammer_buffer_t buffer); +void hammer_flush_buffer_nodes(hammer_buffer_t buffer); + + +void hammer_rel_volume(hammer_volume_t volume, int flush); +void hammer_rel_supercl(hammer_supercl_t supercl, int flush); +void hammer_rel_cluster(hammer_cluster_t cluster, int flush); +void hammer_rel_buffer(hammer_buffer_t buffer, int flush); + +hammer_node_t hammer_get_node(hammer_cluster_t cluster, + int32_t node_offset, int *errorp); +int hammer_ref_node(hammer_node_t node); +void hammer_rel_node(hammer_node_t node); +void hammer_cache_node(hammer_node_t node, + struct hammer_node **cache); +void hammer_uncache_node(struct hammer_node **cache); +void hammer_flush_node(hammer_node_t node); + struct hammer_cluster *hammer_get_rootcl(struct hammer_mount *hmp); void hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer); void hammer_dup_cluster(struct hammer_cluster **clusterp, struct hammer_cluster *cluster); -void *hammer_alloc_btree(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp); +hammer_node_t hammer_alloc_btree(struct hammer_cluster *cluster, int *errorp); void *hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, int *errorp, struct hammer_buffer **bufferp); void *hammer_alloc_record(struct hammer_cluster *cluster, int *errorp, struct hammer_buffer **bufferp); void hammer_free_btree_ptr(struct hammer_buffer *buffer, - hammer_btree_node_t node); + hammer_node_ondisk_t node); void hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes); void hammer_free_record_ptr(struct hammer_buffer *buffer, @@ -364,22 +512,34 @@ void hammer_put_buffer(struct hammer_buffer *buffer, int flush); void hammer_init_alist_config(void); -void hammer_start_transaction(struct hammer_mount *hmp, - struct hammer_transaction *trans); +void hammer_start_transaction(struct hammer_transaction *trans, + struct hammer_mount *hmp); void hammer_commit_transaction(struct hammer_transaction *trans); void hammer_abort_transaction(struct hammer_transaction *trans); void hammer_modify_inode(struct hammer_transaction *trans, - struct hammer_inode *ip, int flags); -int hammer_alloc_inode(struct hammer_transaction *trans, struct vattr *vap, - struct ucred *cred, struct hammer_inode **ipp); + hammer_inode_t ip, int flags); +int hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap, + struct ucred *cred, struct hammer_inode *dip, + struct hammer_inode **ipp); +void hammer_rel_inode(hammer_inode_t ip); + int hammer_add_directory(struct hammer_transaction *trans, - struct hammer_inode *dip, struct namecache *ncp, - struct hammer_inode *nip); + hammer_inode_t dip, struct namecache *ncp, + hammer_inode_t nip); +int hammer_del_directory(struct hammer_transaction *trans, + hammer_cursor_t cursor, hammer_inode_t dip, + hammer_inode_t ip); +int hammer_delete_range(struct hammer_transaction *trans, + hammer_inode_t ip, int64_t off_beg, int64_t off_end); +int hammer_add_data(struct hammer_transaction *trans, + hammer_inode_t ip, int64_t offset, + void *data, int bytes); int hammer_io_read(struct vnode *devvp, struct hammer_io *io); int hammer_io_new(struct vnode *devvp, struct hammer_io *io); void hammer_io_release(struct hammer_io *io, int flush); +int hammer_io_checkflush(hammer_io_t io); #endif @@ -410,6 +570,12 @@ hammer_modify_buffer(struct 
hammer_buffer *buffer) buffer->io.modified = 1; } +static __inline void +hammer_modify_node(struct hammer_node *node) +{ + node->buffer->io.modified = 1; +} + /* * Return the cluster-relative byte offset of an element within a buffer */ diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 45e2cf64a4..b7ad8b7272 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,618 +31,136 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ /* - * HAMMER BH-Tree index + * HAMMER B-Tree index * * HAMMER implements a modified B+Tree. In documentation this will - * simply be refered to as the HAMMER BH-Tree. Basically a BH-Tree + * simply be refered to as the HAMMER B-Tree. Basically a B-Tree * looks like a B+Tree (A B-Tree which stores its records only at the leafs * of the tree), but adds two additional boundary elements which describe * the left-most and right-most element a node is able to represent. In - * otherwords, we have boundary elements at the two ends of a BH-Tree node + * otherwords, we have boundary elements at the two ends of a B-Tree node * instead of sub-tree pointers. * - * A BH-Tree internal node looks like this: + * A B-Tree internal node looks like this: * * B N N N N N N B <-- boundary and internal elements * S S S S S S S <-- subtree pointers * - * A BH-Tree leaf node basically looks like this: + * A B-Tree leaf node basically looks like this: * * L L L L L L L L <-- leaf elemenets * - * The recursion radix is reduced by 2 relative to a normal B-Tree but - * we get a number of significant benefits for our troubles. + * The radix for an internal node is 1 less then a leaf but we get a + * number of significant benefits for our troubles. * - * The big benefit to using a BH-Tree is that it is possible to cache - * pointers into the middle of the tree and not have to start searches, - * insertions, OR deletions at the root node. In particular, searches are - * able to progress in a definitive direction from any point in the tree - * without revisting nodes. This greatly improves the efficiency of many - * operations, most especially record appends. + * The big benefit to using a B-Tree containing boundary information + * is that it is possible to cache pointers into the middle of the tree + * and not have to start searches, insertions, OR deletions at the root + * node. In particular, searches are able to progress in a definitive + * direction from any point in the tree without revisting nodes. This + * greatly improves the efficiency of many operations, most especially + * record appends. * - * BH-Trees also make the stacking of trees fairly straightforward. 
- */ -#include "hammer.h" -#include -#include - -static int btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, - int flags); -static int btree_cursor_set_cluster(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster); -static int btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor, - int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id); -static int btree_cursor_up(hammer_btree_cursor_t cursor); -static int btree_cursor_down(hammer_btree_cursor_t cursor); -static int btree_split_internal(hammer_btree_cursor_t cursor); -static int btree_split_leaf(hammer_btree_cursor_t cursor); -static int btree_rebalance_node(hammer_btree_cursor_t cursor); -static int btree_collapse(hammer_btree_cursor_t cursor); - -/* - * Compare two BH-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp). - */ -static int -hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) -{ -#if 0 - kprintf("compare obj_id %016llx %016llx\n", - key1->obj_id, key2->obj_id); - kprintf("compare rec_type %04x %04x\n", - key1->rec_type, key2->rec_type); - kprintf("compare key %016llx %016llx\n", - key1->key, key2->key); -#endif - - /* - * A key1->obj_id of 0 matches any object id - */ - if (key1->obj_id) { - if (key1->obj_id < key2->obj_id) - return(-4); - if (key1->obj_id > key2->obj_id) - return(4); - } - - /* - * A key1->rec_type of 0 matches any record type. - */ - if (key1->rec_type) { - if (key1->rec_type < key2->rec_type) - return(-3); - if (key1->rec_type > key2->rec_type) - return(3); - } - - /* - * There is no special case for key. 0 means 0. - */ - if (key1->key < key2->key) - return(-2); - if (key1->key > key2->key) - return(2); - - /* - * This test has a number of special cases. create_tid in key1 is - * the as-of transction id, and delete_tid in key1 is NOT USED. - * - * A key1->create_tid of 0 matches any record regardles of when - * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be - * used to search for the most current state of the object. - * - * key2->create_tid is a HAMMER record and will never be - * 0. key2->delete_tid is the deletion transaction id or 0 if - * the record has not yet been deleted. - */ - if (key1->create_tid) { - if (key1->create_tid < key2->create_tid) - return(-1); - if (key2->delete_tid && key1->create_tid >= key2->delete_tid) - return(1); - } - - return(0); -} - -/* - * Create a separator half way inbetween key1 and key2. For fields just - * one unit apart, the separator will match key2. 
- */ -#define MAKE_SEPARATOR(key1, key2, dest, field) \ - dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); - -static void -hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, - hammer_base_elm_t dest) -{ - MAKE_SEPARATOR(key1, key2, dest, obj_id); - MAKE_SEPARATOR(key1, key2, dest, rec_type); - MAKE_SEPARATOR(key1, key2, dest, key); - MAKE_SEPARATOR(key1, key2, dest, create_tid); - dest->delete_tid = 0; - dest->obj_type = 0; - dest->reserved07 = 0; -} - -#undef MAKE_SEPARATOR - -/* - * Return whether a generic internal or leaf node is full - */ -static int -btree_node_is_full(struct hammer_base_node *node) -{ - switch(node->type) { - case HAMMER_BTREE_INTERNAL_NODE: - if (node->count == HAMMER_BTREE_INT_ELMS) - return(1); - break; - case HAMMER_BTREE_LEAF_NODE: - if (node->count == HAMMER_BTREE_LEAF_ELMS) - return(1); - break; - default: - panic("illegal btree subtype"); - } - return(0); -} - -static int -btree_max_elements(u_int8_t type) -{ - if (type == HAMMER_BTREE_LEAF_NODE) - return(HAMMER_BTREE_LEAF_ELMS); - if (type == HAMMER_BTREE_INTERNAL_NODE) - return(HAMMER_BTREE_INT_ELMS); - panic("btree_max_elements: bad type %d\n", type); -} - -/* - * Initialize a cursor, setting the starting point for a BH-Tree search. - * - * The passed cluster must be locked. This function will add a reference - * to it. - */ -int -hammer_btree_cursor_init(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster) -{ - int error; - - cursor->cluster = NULL; - cursor->node_buffer = NULL; - cursor->parent_buffer = NULL; - cursor->node = NULL; - cursor->parent = NULL; - cursor->index = 0; - cursor->parent_index = 0; - error = btree_cursor_set_cluster(cursor, cluster); - return(error); -} - -/* - * Clean up a HAMMER BH-Tree cursor after we are finished using it. - */ -void -hammer_btree_cursor_done(hammer_btree_cursor_t cursor) -{ - if (cursor->node_buffer) { - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = NULL; - } - if (cursor->parent_buffer) { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent_buffer = NULL; - } - if (cursor->cluster) { - hammer_put_cluster(cursor->cluster, 0); - cursor->cluster = NULL; - } - cursor->node = NULL; - cursor->parent = NULL; - cursor->left_bound = NULL; - cursor->right_bound = NULL; - cursor->index = 0; - cursor->parent_index = 0; -} - -/* - * Initialize a btree info structure and its associated cursor prior to - * running a BH-Tree operation. - */ -int -hammer_btree_info_init(hammer_btree_info_t info, struct hammer_cluster *cluster) -{ - int error; - - error = hammer_btree_cursor_init(&info->cursor, cluster); - info->data_buffer = NULL; - info->record_buffer = NULL; - info->data = NULL; - info->rec = NULL; - return (error); -} - -/* - * Clean up a BH-Tree info structure after we are finished using it. - */ -void -hammer_btree_info_done(hammer_btree_info_t info) -{ - hammer_btree_cursor_done(&info->cursor); - if (info->data_buffer) { - hammer_put_buffer(info->data_buffer, 0); - info->data_buffer = NULL; - } - if (info->record_buffer) { - hammer_put_buffer(info->record_buffer, 0); - info->record_buffer = NULL; - } - info->data = NULL; - info->rec = NULL; -} - -/* - * Search a cluster's BH-Tree. Return the matching node. The search - * normally traverses clusters but will terminate at a cluster entry - * in a leaf if HAMMER_BTREE_CLUSTER_TAG is specified. If this flag - * is specified EIO is returned if the search would otherwise have to - * cursor up into a parent cluster. 
- * - * The cursor must be completely initialized on entry. If the cursor is - * at the root of a cluster, the parent pointer & buffer may be NULL (in - * that case the bounds point to the bounds in the cluster header). The - * node_buffer and node must always be valid. - * - * The search code may be forced to iterate up the tree if the conditions - * required for an insertion or deletion are not met. This does not occur - * very often. - * - * INSERTIONS: The search will split full nodes and leaves on its way down - * and guarentee that the leaf it ends up on is not full. - * - * DELETIONS: The search will rebalance the tree on its way down. - */ -static -int -btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, int flags) -{ - struct hammer_btree_leaf_node *leaf; - int error; - int i; - int r; - - /* - * Move our cursor up the tree until we find a node whos range covers - * the key we are trying to locate. This may move us between - * clusters. - * - * Since the root cluster always has a root BH-Tree node, info->node - * is always non-NULL if no error occured. The parent field will be - * non-NULL unless we are at the root of a cluster. - */ - while (hammer_btree_cmp(key, cursor->left_bound) < 0 || - hammer_btree_cmp(key, cursor->right_bound) >= 0) { - /* - * Must stay in current cluster if flagged, code should never - * use the flag if it wants us to traverse to the parent - * cluster. - */ - if (cursor->parent == NULL && - (flags & HAMMER_BTREE_CLUSTER_TAG)) { - error = EIO; - goto done; - } - error = btree_cursor_up(cursor); - if (error) - goto done; - } - KKASSERT(cursor->node != NULL); - - /* - * If we are inserting we can't start at a full node if the parent - * is also full (because there is no way to split the node), - * continue running up the tree until we hit the root of the - * current cluster or until the requirement is satisfied. - */ - while (flags & HAMMER_BTREE_INSERT) { - if (btree_node_is_full(&cursor->node->base) == 0) - break; - if (cursor->parent == NULL) - break; - if (cursor->parent->base.count != HAMMER_BTREE_INT_ELMS) - break; - error = btree_cursor_up(cursor); - if (error) - goto done; - } - - /* - * If we are deleting we can't start at a node with only one element - * unless it is root, because all of our code assumes that nodes - * will never be empty. - * - * This handles the case where the cursor is sitting at a leaf and - * either the leaf or parent contain an insufficient number of - * elements. - */ - while (flags & HAMMER_BTREE_DELETE) { - if (cursor->node->base.count > 1) - break; - if (cursor->parent == NULL) - break; - KKASSERT(cursor->node->base.count != 0); - error = btree_cursor_up(cursor); - if (error) - goto done; - } - -new_cluster: - /* - * Push down through internal nodes to locate the requested key. - */ - while (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - struct hammer_btree_internal_node *node; - - /* - * If we are a the root node and deleting, try to collapse - * all of the root's children into the root. This is the - * only point where tree depth is reduced. - */ - if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) { - error = btree_collapse(cursor); - if (error) - goto done; - } - - /* - * Scan the node to find the subtree index to push down into. - * We go one-past, then back-up. The key should never be - * less then the left-hand boundary so I should never wind - * up 0. 
- */ - node = &cursor->node->internal; - for (i = 0; i < node->base.count; ++i) { - r = hammer_btree_cmp(key, &node->elms[i].base); - if (r < 0) - break; - } - KKASSERT(i != 0); - - /* - * The push-down index is now i - 1. - */ - --i; - cursor->index = i; - - /* - * Handle insertion and deletion requirements. - * - * If inserting split full nodes. The split code will - * adjust cursor->node and cursor->index if the current - * index winds up in the new node. - */ - if (flags & HAMMER_BTREE_INSERT) { - if (node->base.count == HAMMER_BTREE_INT_ELMS) { - error = btree_split_internal(cursor); - if (error) - goto done; - /* - * reload stale pointers - */ - i = cursor->index; - node = &cursor->node->internal; - } - } - - /* - * If deleting rebalance - do not allow the child to have - * just one element or we will not be able to delete it. - * - * Neither internal or leaf nodes (except a root-leaf) are - * allowed to drop to 0 elements. - * - * Our separators may have been reorganized after rebalancing, - * so we have to pop back up and rescan. - */ - if (flags & HAMMER_BTREE_DELETE) { - if (node->elms[i].subtree_count <= 1) { - error = btree_rebalance_node(cursor); - if (error) - goto done; - /* cursor->index is invalid after call */ - goto new_cluster; - } - } - - /* - * Push down (push into new node, existing node becomes - * the parent). - */ - error = btree_cursor_down(cursor); - if (error) - goto done; - } - - - /* - * We are at a leaf, do a linear search of the key array. - * (XXX do a binary search). On success the index is set to the - * matching element, on failure the index is set to the insertion - * point. - * - * Boundaries are not stored in leaf nodes, so the index can wind - * up to the left of element 0 (index == 0) or past the end of - * the array (index == leaf->base.count). - */ - leaf = &cursor->node->leaf; - KKASSERT(leaf->base.count <= HAMMER_BTREE_LEAF_ELMS); - - for (i = 0; i < leaf->base.count; ++i) { - r = hammer_btree_cmp(key, &leaf->elms[i].base); - if (r < 0) - break; - if (r == 0) { - /* - * Return an exact match unless this is a cluster - * element. If it is, and the cluster tag flag has - * not been set, push into the new cluster and - * continue the search. - */ - cursor->index = i; - if ((leaf->elms[i].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) && - (flags & HAMMER_BTREE_CLUSTER_TAG) == 0) { - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - error = 0; - goto done; - } - } - - /* - * We could not find an exact match. Check for a cluster - * recursion. The cluster's range is bracketed by two - * leaf elements. One of the two must be in this node. - */ - if ((flags & HAMMER_BTREE_CLUSTER_TAG) == 0) { - if (i == leaf->base.count) { - if (leaf->elms[i-1].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) { - cursor->index = i - 1; - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - } else { - if (leaf->elms[i].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) { - cursor->index = i; - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - } - } - - /* - * If inserting split a full leaf before returning. This - * may have the side effect of adjusting cursor->node and - * cursor->index. - * - * We delayed the split in order to avoid any unnecessary splits. - * - * XXX parent's parent's subtree_count will be wrong after - * this (keep parent of parent around too? ugh). 
- */ - cursor->index = i; - if ((flags & HAMMER_BTREE_INSERT) && - leaf->base.count == HAMMER_BTREE_LEAF_ELMS) { - error = btree_split_leaf(cursor); - /* NOT USED - i = cursor->index; - node = &cursor->node->internal; - */ - if (error) - goto done; - } - error = ENOENT; + * B-Trees also make the stacking of trees fairly straightforward. + * + * INTER-CLUSTER ELEMENTS: An element of an internal node may reference + * the root of another cluster rather then a node in the current cluster. + * This is known as an inter-cluster references. Only B-Tree searches + * will cross cluster boundaries. The rebalancing and collapse code does + * not attempt to move children between clusters. A major effect of this + * is that we have to relax minimum element count requirements and allow + * trees to become somewhat unabalanced. + * + * INSERTIONS AND DELETIONS: When inserting we split full nodes on our + * way down as an optimization. I originally experimented with rebalancing + * nodes on the way down for deletions but it created a huge mess due to + * the way inter-cluster linkages work. Instead, now I simply allow + * the tree to become unbalanced and allow leaf nodes to become empty. + * The delete code will try to clean things up from the bottom-up but + * will stop if related elements are not in-core or if it cannot get a node + * lock. + */ +#include "hammer.h" +#include +#include - /* - * Set the cursor's last_error. This is used primarily by - * btree_search_fwd() to determine whether it needs to skip - * the current element or not. - */ -done: - cursor->last_error = error; - return(error); -} +static int btree_search(hammer_cursor_t cursor, int flags); +static int btree_split_internal(hammer_cursor_t cursor); +static int btree_split_leaf(hammer_cursor_t cursor); +static int btree_remove(hammer_node_t node, int index); +#if 0 +static int btree_rebalance(hammer_cursor_t cursor); +static int btree_collapse(hammer_cursor_t cursor); +#endif +static int btree_node_is_full(hammer_node_ondisk_t node); +static void hammer_make_separator(hammer_base_elm_t key1, + hammer_base_elm_t key2, hammer_base_elm_t dest); /* - * Ignoring key->key, iterate records after a search. The next record which - * matches the key (except for key->key) is returned. To synchronize with - * btree_search() we only check the record at the current cursor index if - * cursor->last_error is ENOENT, indicating that it is at an insertion point - * (so if we are not at the end of the node's array, the current record - * will be the 'next' record to check). + * Iterate records after a search. The cursor is iterated forwards past + * the current record until a record matching the key-range requirements + * is found. ENOENT is returned if the iteration goes past the ending + * key. * - * Records associated with regular files use the ending offset of the data - * block (non inclusive) as their key. E.g. if you write 20 bytes to a - * file at offset 10, the HAMMER record will use a key of 30 for that record. - * This way the cursor is properly positioned after a search so the first - * record returned by btree_iterate() is (likely) the one containing the - * desired data. This also means the results of the initial search are - * ignored. + * key_beg/key_end is an INCLUSVE range. i.e. if you are scanning to load + * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an + * offset of 4095. 
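+ *
+ * A minimal iteration sketch (illustrative only, error handling and
+ * cursor setup elided; assumes the cursor was initialized with
+ * hammer_init_cursor_ip() or hammer_init_cursor_hmp() and that
+ * key_beg/key_end have been filled in as described above):
+ *
+ *	error = hammer_btree_lookup(&cursor);
+ *	while (error == 0) {
+ *		(process the element under the cursor, e.g. via
+ *		 hammer_btree_extract())
+ *		error = hammer_btree_iterate(&cursor);
+ *	}
+ *
+ * where an ENOENT returned by the iterate step indicates the scan has
+ * gone past key_end.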
* - * This function is also used to iterate through database records, though - * in that case the caller utilizes any exact match located by btree_search() - * before diving into the iteration. + * cursor->key_beg may or may not be modified by this function during + * the iteration. */ int -hammer_btree_iterate(hammer_btree_cursor_t cursor, hammer_base_elm_t key) +hammer_btree_iterate(hammer_cursor_t cursor) { - hammer_btree_node_t node; - struct hammer_base_elm save_base; + hammer_node_ondisk_t node; + hammer_btree_elm_t elm; int64_t save_key; int error; int r; int s; /* - * If last_error is not zero we are at the insertion point and must - * start at the current element. If last_error is zero the caller - * has already processed the current cursor (or doesn't care about - * it) and we must iterate forwards. + * Skip past the current record */ - node = cursor->node; - if (cursor->index < node->base.count && cursor->last_error == 0) + node = cursor->node->ondisk; + if (node == NULL) { + KKASSERT(cursor->last_error != 0); + return(cursor->last_error); + } + if (cursor->index < node->count) ++cursor->index; + /* + * Loop until an element is found or we are done. + */ for (;;) { /* - * We iterate up the tree while we are at the last element. + * We iterate up the tree and then index over one element + * while we are at the last element in the current node. + * + * NOTE: This can pop us up to another cluster. * - * If we hit the root of the cluster we have to cursor up - * into the parent cluster, but then search to position - * the cursor at the next logical element in the iteration. - * We ignore an ENOENT error in this case. + * If we are at the root of the root cluster, cursor_up + * returns ENOENT. + * + * NOTE: hammer_cursor_up() will adjust cursor->key_beg + * when told to re-search for the cluster tag. * * XXX this could be optimized by storing the information in * the parent reference. */ -goupagain: - while (cursor->index == node->base.count) { - if (cursor->parent == NULL) { - save_base = *cursor->right_bound; - error = btree_cursor_up(cursor); - if (error) - goto done; - error = btree_search(cursor, &save_base, 0); - if (error == ENOENT) - error = 0; - if (error) - goto done; - node = cursor->node; - /* node cannot be empty. cursor->index is 0 */ - KKASSERT(cursor->index != node->base.count); - /* do not further increment the index */ - } else { - error = btree_cursor_up(cursor); - if (error) - goto done; - node = cursor->node; - KKASSERT(cursor->index != node->base.count); - ++cursor->index; - } + if (cursor->index == node->count) { + error = hammer_cursor_up(cursor); + if (error) + break; + node = cursor->node->ondisk; + KKASSERT(cursor->index != node->count); + ++cursor->index; + continue; } /* @@ -655,138 +173,161 @@ goupagain: * except for their transaction spaces. We can then skip * the node if the left and right transaction spaces are the * same sign. This directly optimized accesses to files with - * HUGE transactional histories, such as database files. + * HUGE transactional histories, such as database files, + * allowing us to avoid having to iterate through the entire + * history. 
*/ - while (node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - struct hammer_btree_internal_elm *elm; - - elm = &node->internal.elms[cursor->index]; + if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { + elm = &node->elms[cursor->index]; if (elm[0].base.obj_id == elm[1].base.obj_id && elm[0].base.rec_type == elm[1].base.rec_type && elm[0].base.key == elm[1].base.key) { - save_key = key->key; - key->key = elm[0].base.key; - r = hammer_btree_cmp(key, &elm[0].base); - key->key = elm[1].base.key; - s = hammer_btree_cmp(key, &elm[1].base); - key->key = save_key; + /* + * Left side transaction space + */ + save_key = cursor->key_beg.key; + cursor->key_beg.key = elm[0].base.key; + r = hammer_btree_cmp(&cursor->key_beg, + &elm[0].base); + cursor->key_beg.key = save_key; + + /* + * Right side transaction space + */ + save_key = cursor->key_end.key; + cursor->key_end.key = elm[1].base.key; + s = hammer_btree_cmp(&cursor->key_end, + &elm[1].base); + cursor->key_end.key = save_key; + + /* + * If our range is entirely on one side or + * the other side we can skip the sub-tree. + */ if ((r < 0 && s < 0) || (r > 0 && s > 0)) { ++cursor->index; - goto goupagain; + continue; } } - error = btree_cursor_down(cursor); + error = hammer_cursor_down(cursor); if (error) - goto done; - KKASSERT(cursor->index != node->base.count); - node = cursor->node; + break; + KKASSERT(cursor->index == 0); + KKASSERT(cursor->index != node->count); + node = cursor->node->ondisk; + continue; } /* - * Determine if the record at the cursor matches, sans - * key->key (which is what we are iterating). + * We are at a leaf. + * + * Determine if the record at the cursor has gone beyond the + * end of our range. Remember that our key range is inclusive. + * + * When iterating we may have to 'pick out' records matching + * our transaction requirements. A comparison return of + * +1 or -1 indicates a transactional record that is too + * old or too new but does not terminate the search. */ - { - union hammer_btree_leaf_elm *elm; - - elm = &node->leaf.elms[cursor->index]; - save_key = key->key; - key->key = elm->base.key; - r = hammer_btree_cmp(key, &elm->base); - key->key = save_key; + elm = &node->elms[cursor->index]; + r = hammer_btree_cmp(&cursor->key_end, &elm->base); + if (r == -1 || r == 1) { + ++cursor->index; + continue; } /* - * The iteration stops if the obj_id or rec_type no longer - * match (unless the original key set those to 0, meaning the - * caller WANTS to iterate through those as well), denoted - * by -3,-2 or +2,+3 return values. Otherwise the mismatch - * is due to the transaction id and we can get both negative - * and positive values... we have to skip those records and - * continue searching. + * The search ends if the element goes beyond our key_end + * (after checking transactional return values above). + * Otherwise we have a successful match. */ - if (r == 0) { - error = 0; - break; - } - if (r < -2 || r > 2) { - error = ENOENT; - break; - } - ++cursor->index; + error = (r < 0) ? ENOENT : 0; + break; } -done: cursor->last_error = error; return(error); } /* - * Look up the key in the HAMMER BH-Tree and fill in the rest of the - * info structure with the results according to flags. 0 is returned on - * success, non-zero on error. + * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry + * could not be found, and a fatal error otherwise. + * + * The cursor is suitably positioned for a deletion on success, and suitably + * positioned for an insertion on ENOENT. 
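+ *
+ * A minimal insertion sketch (illustrative only; per the comments on
+ * hammer_btree_insert() below, the HAMMER_BTREE_INSERT flag is assumed
+ * to be carried in cursor->flags):
+ *
+ *	cursor.flags |= HAMMER_BTREE_INSERT;
+ *	error = hammer_btree_lookup(&cursor);
+ *	if (error == ENOENT)
+ *		error = hammer_btree_insert(&cursor, &elm);
+ *	else if (error == 0)
+ *		error = EEXIST;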
* - * The caller initializes (key, cluster) and makes the call. The cluster - * must be referenced and locked on call. The function can chain through - * multiple clusters and will replace the passed cluster as it does so, - * dereferencing and unlocking it, and referencing and locking the chain. - * On return info->cluster will still be referenced and locked but may - * represent a different cluster. + * The cursor may begin anywhere, the search will traverse clusters in + * either direction to locate the requested element. */ int -hammer_btree_lookup(hammer_btree_info_t info, hammer_base_elm_t key, int flags) +hammer_btree_lookup(hammer_cursor_t cursor) { int error; - error = btree_search(&info->cursor, key, flags); - if (error == 0 && - (flags & (HAMMER_BTREE_GET_RECORD|HAMMER_BTREE_GET_DATA))) { - error = hammer_btree_extract(info, flags); - } + error = btree_search(cursor, 0); + if (error == 0 && cursor->flags) + error = hammer_btree_extract(cursor, cursor->flags); return(error); } +/* + * Extract the record and/or data associated with the cursor's current + * position. Any prior record or data stored in the cursor is replaced. + * The cursor must be positioned at a leaf node. + * + * NOTE: Only records can be extracted from internal B-Tree nodes, and + * only for inter-cluster references. At the moment we only support + * extractions from leaf nodes. + */ int -hammer_btree_extract(hammer_btree_info_t info, int flags) +hammer_btree_extract(hammer_cursor_t cursor, int flags) { - struct hammer_cluster *cluster; - hammer_btree_leaf_elm_t elm; + hammer_node_ondisk_t node; + hammer_btree_elm_t elm; + hammer_cluster_t cluster; int32_t cloff; int error; - int iscl; - /* - * Extract the record and data reference if requested. - * + /* * A cluster record type has no data reference, the information - * is stored directly in the record and BH-Tree element. + * is stored directly in the record and B-Tree element. * * The case where the data reference resolves to the same buffer * as the record reference must be handled. */ - elm = &info->cursor.node->leaf.elms[info->cursor.index]; - iscl = (elm->base.obj_type & HAMMER_OBJTYPE_CLUSTER_FLAG) != 0; - cluster = info->cursor.cluster; + node = cursor->node->ondisk; + KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); + elm = &node->elms[cursor->index]; + cluster = cursor->node->cluster; error = 0; if ((flags & HAMMER_BTREE_GET_RECORD) && error == 0) { - cloff = iscl ? elm->cluster.rec_offset : elm->record.rec_offset; - - - info->rec = hammer_bread(cluster, cloff, HAMMER_FSBUF_RECORDS, - &error, &info->record_buffer); + cloff = elm->leaf.rec_offset; + cursor->record = hammer_bread(cluster, cloff, + HAMMER_FSBUF_RECORDS, &error, + &cursor->record_buffer); } else { cloff = 0; } - if ((flags & HAMMER_BTREE_GET_DATA) && iscl == 0 && error == 0) { - if ((cloff ^ elm->record.data_offset) & ~HAMMER_BUFMASK) { - info->data = hammer_bread(cluster, - elm->record.data_offset, - HAMMER_FSBUF_DATA, - &error, &info->record_buffer); + if ((flags & HAMMER_BTREE_GET_DATA) && error == 0) { + if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) { + /* + * Data in different buffer than record + */ + cursor->data = hammer_bread(cluster, + elm->leaf.data_offset, + HAMMER_FSBUF_DATA, &error, + &cursor->data_buffer); } else { - info->data = (void *) - ((char *)info->data_buffer->ondisk + - (elm->record.data_offset & HAMMER_BUFMASK)); + /* + * Data in same buffer as record. 
Note that we + * leave any existing data_buffer intact, even + * though we don't use it in this case, in case + * other records extracted during an iteration + * go back to it. + */ + cursor->data = (void *) + ((char *)cursor->record_buffer->ondisk + + (elm->leaf.data_offset & HAMMER_BUFMASK)); } } return(error); @@ -794,16 +335,21 @@ hammer_btree_extract(hammer_btree_info_t info, int flags) /* - * Insert a record into a BH-Tree's cluster. The caller has already - * reserved space for the record and data and must handle a ENOSPC - * return. + * Insert a leaf element into the B-Tree at the current cursor position. + * The cursor is positioned such that the element at and beyond the cursor + * are shifted to make room for the new record. + * + * The caller must call hammer_btree_lookup() with the HAMMER_BTREE_INSERT + * flag set and that call must return ENOENT before this function can be + * called. + * + * ENOSPC is returned if there is no room to insert a new record. */ int -hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) +hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm) { - struct hammer_btree_cursor *cursor; - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; + hammer_node_ondisk_t parent; + hammer_node_ondisk_t node; int i; #if 0 @@ -815,7 +361,7 @@ hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) * The search also does some setup for our insert, so there is always * room in the leaf. */ - error = btree_search(&info->cursor, &elm->base, HAMMER_BTREE_INSERT); + error = btree_search(cursor, HAMMER_BTREE_INSERT); if (error != ENOENT) { if (error == 0) error = EEXIST; @@ -826,40 +372,55 @@ hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) /* * Insert the element at the leaf node and update the count in the * parent. It is possible for parent to be NULL, indicating that - * the root of the BH-Tree in the cluster is a leaf. It is also + * the root of the B-Tree in the cluster is a leaf. It is also * possible for the leaf to be empty. * * Remember that the right-hand boundary is not included in the * count. */ - cursor = &info->cursor; - leaf = &cursor->node->leaf; + node = cursor->node->ondisk; i = cursor->index; - KKASSERT(leaf->base.count < HAMMER_BTREE_LEAF_ELMS); - bcopy(&leaf->elms[i], &leaf->elms[i+1], - (leaf->base.count - i + 1) * sizeof(leaf->elms[0])); - leaf->elms[i] = *elm; - ++leaf->base.count; + KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); + KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); + if (i != node->count) { + bcopy(&node->elms[i], &node->elms[i+1], + (node->count - i) * sizeof(*elm)); + } + node->elms[i] = *elm; + ++node->count; + hammer_modify_node(cursor->node); - if ((parent = cursor->parent) != NULL) { + if ((parent = cursor->parent->ondisk) != NULL) { i = cursor->parent_index; - ++parent->elms[i].subtree_count; - KKASSERT(parent->elms[i].subtree_count <= leaf->base.count); - hammer_modify_buffer(cursor->parent_buffer); + ++parent->elms[i].internal.subtree_count; + KKASSERT(parent->elms[i].internal.subtree_count <= node->count); + hammer_modify_node(cursor->parent); } - hammer_modify_buffer(cursor->node_buffer); return(0); } /* - * Delete a record from the BH-Tree. + * Delete a record from the B-Tree's at the current cursor position. + * The cursor is positioned such that the current element is the one + * to be deleted. 
+ * + * The caller must call hammer_btree_lookup() with the HAMMER_BTREE_DELETE + * flag set and that call must return 0 before this function can be + * called. + * + * It is possible that we will be asked to delete the last element in a + * leaf. This case only occurs if the downward search was unable to + * rebalance us, which in turn can occur if our parent has inter-cluster + * elements. So the 0-element case for a leaf is allowed. */ int -hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key) +hammer_btree_delete(hammer_cursor_t cursor) { - struct hammer_btree_cursor *cursor; - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int error; int i; #if 0 @@ -868,313 +429,393 @@ hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key) * Locate the leaf element to delete. The search is also responsible * for doing some of the rebalancing work on its way down. */ - error = btree_search(&info->cursor, key, HAMMER_BTREE_DELETE); + error = btree_search(cursor, HAMMER_BTREE_DELETE); if (error) return (error); #endif /* - * Delete the element from the leaf node. We leave empty leafs alone - * and instead depend on a future search to locate and destroy them. - * Otherwise we would have to recurse back up the tree to adjust - * the parent's subtree_count and we do not want to do that. + * Delete the element from the leaf node. * - * Remember that the right-hand boundary is not included in the - * count. + * Remember that leaf nodes do not have boundaries. */ - cursor = &info->cursor; - leaf = &cursor->node->leaf; + node = cursor->node; + ondisk = node->ondisk; i = cursor->index; - KKASSERT(cursor->node->base.type == HAMMER_BTREE_LEAF_NODE); - bcopy(&leaf->elms[i+1], &leaf->elms[i], - (leaf->base.count - i) * sizeof(leaf->elms[0])); - --leaf->base.count; - if ((parent = cursor->parent) != NULL) { + KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); + if (i + 1 != ondisk->count) { + bcopy(&ondisk->elms[i+1], &ondisk->elms[i], + (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); + } + --ondisk->count; + if (cursor->parent != NULL) { /* * Adjust parent's notion of the leaf's count. subtree_count - * is only approximately, it is allowed to be too small but + * is only approximate, it is allowed to be too small but * never allowed to be too large. Make sure we don't drop * the count below 0. */ - i = cursor->parent_index; - if (parent->elms[i].subtree_count) - --parent->elms[i].subtree_count; - KKASSERT(parent->elms[i].subtree_count <= leaf->base.count); - hammer_modify_buffer(cursor->parent_buffer); + parent = cursor->parent; + elm = &parent->ondisk->elms[cursor->parent_index]; + if (elm->internal.subtree_count) + --elm->internal.subtree_count; + KKASSERT(elm->internal.subtree_count <= ondisk->count); + hammer_modify_node(parent); } - hammer_modify_buffer(cursor->node_buffer); - return(0); -} -/************************************************************************ - * CURSOR SUPPORT * - ************************************************************************ - * - * These routines do basic cursor operations. This support will probably - * be expanded in the future to add link fields for linear scans. - */ + /* + * If the leaf is empty try to remove the subtree reference + * in at (parent, parent_index). This will unbalance the + * tree. 
+ * + * Note that internal nodes must have at least one element + * so their boundary information is properly laid out. If + * we would cause our parent to become empty we try to + * recurse up the tree, but if that doesn't work we just + * leave the tree with an empty leaf. + */ + if (ondisk->count == 0) { + error = btree_remove(cursor->parent, cursor->parent_index); + if (error == 0) { + hammer_free_btree(node->cluster, node->node_offset); + } else if (error == EAGAIN) { + hammer_modify_node(node); + error = 0; + } /* else a real error occured XXX */ + } else { + hammer_modify_node(node); + error = 0; + } + return(error); +} /* - * Unconditionally set the cursor to the root of the specified cluster. - * The current cursor node is set to the root node of the cluster (which - * can be an internal node or a degenerate leaf), and the parent info - * is cleaned up and cleared. + * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE + * + * Search a cluster's B-Tree for cursor->key_beg, return the matching node. + * + * The search begins at the current node and will instantiate a NULL + * parent if necessary and if not at the root of the cluster. On return + * parent will be non-NULL unless the cursor is sitting at a root-leaf. + * + * The search code may be forced to iterate up the tree if the conditions + * required for an insertion or deletion are not met. This does not occur + * very often. + * + * INSERTIONS: The search will split full nodes and leaves on its way down + * and guarentee that the leaf it ends up on is not full. * - * The passed cluster must be locked. This function will add a reference - * to it. The cursor must already have a cluster assigned to it, which we - * will replace. + * DELETIONS: The search will rebalance the tree on its way down. */ -static +static int -btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor, - int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id) +btree_search(hammer_cursor_t cursor, int flags) { - struct hammer_cluster *cluster; - struct hammer_volume *volume; - int error = 0; - - if (vol_no < 0) - return(EIO); - cluster = cursor->cluster; - KKASSERT(cluster != NULL); - if (vol_no == cluster->volume->vol_no) { - cluster = hammer_get_cluster(cluster->volume, clu_no, - &error, 0); - } else { - volume = hammer_get_volume(cluster->volume->hmp, - vol_no, &error); - if (volume) { - cluster = hammer_get_cluster(volume, clu_no, - &error, 0); - hammer_put_volume(volume, 0); - } else { - cluster = NULL; - } + hammer_node_ondisk_t node; + hammer_cluster_t cluster; + int error; + int i; + int r; + + flags |= cursor->flags; + + /* + * Move our cursor up the tree until we find a node whos range covers + * the key we are trying to locate. This may move us between + * clusters. + * + * The left bound is inclusive, the right bound is non-inclusive. + * It is ok to cursor up too far so when cursoring across a cluster + * boundary. + * + * First see if we can skip the whole cluster. hammer_cursor_up() + * handles both cases but this way we don't check the cluster + * bounds when going up the tree within a cluster. + */ + cluster = cursor->node->cluster; + while ( + hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 || + hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) { + error = hammer_cursor_toroot(cursor); + if (error) + goto done; + error = hammer_cursor_up(cursor); + if (error) + goto done; + cluster = cursor->node->cluster; } /* - * Make sure the cluster id matches. 
XXX At the moment the - * clu_id in the btree cluster element is only 32 bits, so only - * compare the low 32 bits. + * Deal with normal cursoring within a cluster. The right bound + * is non-inclusive. That is, the bounds form a separator. */ - if (cluster) { - if ((int32_t)cluster->ondisk->clu_id == (int32_t)clu_id) { - btree_cursor_set_cluster(cursor, cluster); - } else { - error = EIO; - } - hammer_put_cluster(cluster, 0); + while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 || + hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) { + error = hammer_cursor_up(cursor); + if (error) + goto done; } - return (error); -} -static -int -btree_cursor_set_cluster(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster) -{ - int error = 0; - - hammer_dup_cluster(&cursor->cluster, cluster); - cursor->node = hammer_bread(cluster, - cluster->ondisk->clu_btree_root, - HAMMER_FSBUF_BTREE, - &error, - &cursor->node_buffer); - cursor->index = 0; - if (cursor->parent) { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent_buffer = NULL; - cursor->parent = NULL; - cursor->parent_index = 0; + /* + * We better have ended up with a node somewhere, and our second + * while loop had better not have traversed up a cluster. + */ + KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster); + + /* + * If we are inserting we can't start at a full node if the parent + * is also full (because there is no way to split the node), + * continue running up the tree until we hit the root of the + * root cluster or until the requirement is satisfied. + * + * NOTE: These cursor-up's CAN continue to cross cluster boundaries. + * + * XXX as an optimization it should be possible to unbalance the tree + * and stop at the root of the current cluster. + */ + while (flags & HAMMER_BTREE_INSERT) { + if (btree_node_is_full(cursor->node->ondisk) == 0) + break; + if (cursor->parent == NULL) + break; + if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) + break; + error = hammer_cursor_up(cursor); + /* cluster and node are now may become stale */ + if (error) + goto done; } - cursor->left_bound = &cluster->ondisk->clu_btree_beg; - cursor->right_bound = &cluster->ondisk->clu_btree_end; - return (error); -} + /* cluster = cursor->node->cluster; not needed until next cluster = */ -/* - * Cursor the node up to the parent node. If at the root of a cluster, - * cursor to the root of the cluster's parent cluster. Note carefully - * that we do NOT scan the parent cluster to find the leaf that dropped - * into our current cluster. - * - * This function is primarily used by the search code to avoid having - * to search from the root of the filesystem BH-Tree. - */ -static -int -btree_cursor_up(hammer_btree_cursor_t cursor) -{ - struct hammer_cluster_ondisk *ondisk; - struct hammer_btree_internal_node *parent; - int error; - int i; - int r; +#if 0 + /* + * If we are deleting we can't start at an internal node with only + * one element unless it is root, because all of our code assumes + * that internal nodes will never be empty. Just do this generally + * for both leaf and internal nodes to get better balance. + * + * This handles the case where the cursor is sitting at a leaf and + * either the leaf or parent contain an insufficient number of + * elements. + * + * NOTE: These cursor-up's CAN continue to cross cluster boundaries. 
+ */ + while (flags & HAMMER_BTREE_DELETE) { + if (cursor->node->ondisk->count > 1) + break; + if (cursor->parent == NULL) + break; + KKASSERT(cursor->node->ondisk->count != 0); + error = hammer_cursor_up(cursor); + /* cluster and node are now may become stale */ + if (error) + goto done; + } +#endif - error = 0; - if (cursor->parent == NULL) { +/*new_cluster:*/ + /* + * Push down through internal nodes to locate the requested key. + */ + cluster = cursor->node->cluster; + node = cursor->node->ondisk; + while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { +#if 0 /* - * We are at the root of the cluster, pop up to the root - * of the parent cluster. Return ENOENT if we are at the - * root cluster of the filesystem. + * If we are a the root node and deleting, try to collapse + * all of the root's children into the root. This is the + * only point where tree depth is reduced. */ - ondisk = cursor->cluster->ondisk; - if (ondisk->clu_btree_parent_vol_no < 0) { - error = ENOENT; - } else { - error = btree_cursor_set_cluster_by_value( - cursor, - ondisk->clu_btree_parent_vol_no, - ondisk->clu_btree_parent_clu_no, - ondisk->clu_btree_parent_clu_id); + if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) { + error = btree_collapse(cursor); + /* node becomes stale after call */ + if (error) + goto done; } - } else { + node = cursor->node->ondisk; +#endif + + /* + * Scan the node to find the subtree index to push down into. + * We go one-past, then back-up. The key should never be + * less then the left-hand boundary so I should never wind + * up 0. + */ + for (i = 0; i < node->count; ++i) { + r = hammer_btree_cmp(&cursor->key_beg, + &node->elms[i].base); + if (r < 0) + break; + } + KKASSERT(i != 0); + + /* + * The push-down index is now i - 1. + */ + --i; + cursor->index = i; + + /* + * Handle insertion and deletion requirements. + * + * If inserting split full nodes. The split code will + * adjust cursor->node and cursor->index if the current + * index winds up in the new node. + */ + if (flags & HAMMER_BTREE_INSERT) { + if (node->count == HAMMER_BTREE_INT_ELMS) { + error = btree_split_internal(cursor); + if (error) + goto done; + /* + * reload stale pointers + */ + i = cursor->index; + node = cursor->node->ondisk; + } + } + +#if 0 /* - * Copy the current node's parent info into the node and load - * a new parent. + * If deleting rebalance - do not allow the child to have + * just one element or we will not be able to delete it. + * + * Neither internal or leaf nodes (except a root-leaf) are + * allowed to drop to 0 elements. (XXX - well, leaf nodes + * can at the moment). + * + * Our separators may have been reorganized after rebalancing, + * so we have to pop back up and rescan. + * + * XXX test for subtree_count < maxelms / 2, minus 1 or 2 + * for hysteresis? */ - cursor->index = cursor->parent_index; - cursor->node = (hammer_btree_node_t)cursor->parent; - hammer_dup_buffer(&cursor->node_buffer, cursor->parent_buffer); + if (flags & HAMMER_BTREE_DELETE) { + if (node->elms[i].internal.subtree_count <= 1) { + error = btree_rebalance(cursor); + if (error) + goto done; + /* cursor->index is invalid after call */ + goto new_cluster; + } + } +#endif /* - * Load the parent's parent into parent and figure out the - * parent's element index for its child. Just NULL it out - * if we hit the root of the cluster. + * Push down (push into new node, existing node becomes + * the parent). 
*/ - if (cursor->parent->base.parent) { - parent = hammer_bread(cursor->cluster, - cursor->node->base.parent, - HAMMER_FSBUF_BTREE, - &error, - &cursor->parent_buffer); - for (i = 0; i < parent->base.count; ++i) { - r = hammer_btree_cmp( - &cursor->node->internal.elms[0].base, - &parent->elms[i].base); - if (r < 0) - break; - } - cursor->parent = parent; - cursor->parent_index = i - 1; - KKASSERT(parent->elms[i].subtree_offset == - hammer_bclu_offset(cursor->node_buffer, - cursor->node)); - } else { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent = NULL; - cursor->parent_buffer = NULL; - cursor->parent_index = 0; - } + error = hammer_cursor_down(cursor); + /* node and cluster become stale */ + if (error) + goto done; + node = cursor->node->ondisk; + cluster = cursor->node->cluster; } - return(error); -} -/* - * Cursor down into (node, index) - * - * Push down into the current cursor. The current cursor becomes the parent. - * If the current cursor represents a leaf cluster element this function will - * push into the root of a new cluster and clear the parent fields. - * - * Pushin down at a leaf which is not a cluster element returns EIO. - */ -static -int -btree_cursor_down(hammer_btree_cursor_t cursor) -{ - hammer_btree_node_t node; - int error; + /* + * We are at a leaf, do a linear search of the key array. + * (XXX do a binary search). On success the index is set to the + * matching element, on failure the index is set to the insertion + * point. + * + * Boundaries are not stored in leaf nodes, so the index can wind + * up to the left of element 0 (index == 0) or past the end of + * the array (index == node->count). + */ + KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); + + for (i = 0; i < node->count; ++i) { + r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base); - node = cursor->node; - if (node->base.type == HAMMER_BTREE_LEAF_NODE) { - /* - * Push into another cluster - */ - hammer_btree_leaf_elm_t elm; - - elm = &node->leaf.elms[cursor->index]; - if (elm->base.rec_type == HAMMER_RECTYPE_CLUSTER) { - error = btree_cursor_set_cluster_by_value( - cursor, - elm->cluster.vol_no, - elm->cluster.clu_no, - elm->cluster.verifier); - } else { - error = EIO; - } - } else { /* - * Push into another BH-Tree node (internal or leaf) + * Stop if we've flipped past key_beg */ - struct hammer_btree_internal_elm *elm; + if (r < 0) + break; - KKASSERT(node->base.type == HAMMER_BTREE_INTERNAL_NODE); - elm = &node->internal.elms[cursor->index]; - KKASSERT(elm->subtree_offset != 0); - cursor->parent_index = cursor->index; - cursor->parent = &cursor->node->internal; - hammer_dup_buffer(&cursor->parent_buffer, cursor->node_buffer); - - cursor->node = hammer_bread(cursor->cluster, - elm->subtree_offset, - HAMMER_FSBUF_BTREE, - &error, - &cursor->node_buffer); - cursor->index = 0; - cursor->left_bound = &elm[0].base; - cursor->right_bound = &elm[1].base; - KKASSERT(cursor->node == NULL || - cursor->node->base.parent == elm->subtree_offset); -#ifdef INVARIANTS /* - * The bounds of an internal node must match the parent's - * partitioning values. Leaf nodes do not store bounds. 
+ * Return an exact match */ - if (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->node->internal.elms[0].base) == 0); - KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->node->internal.elms[cursor->node->base.count].base) == 0); + if (r == 0) { + cursor->index = i; + error = 0; + goto done; } -#endif } + + /* + * No exact match was found, i is now at the insertion point. + * + * If inserting split a full leaf before returning. This + * may have the side effect of adjusting cursor->node and + * cursor->index. + */ + cursor->index = i; + if ((flags & HAMMER_BTREE_INSERT) && + node->count == HAMMER_BTREE_LEAF_ELMS) { + error = btree_split_leaf(cursor); + /* NOT USED + i = cursor->index; + node = &cursor->node->internal; + */ + if (error) + goto done; + } + error = ENOENT; + + /* + * Set the cursor's last_error. + */ +done: + cursor->last_error = error; return(error); } + /************************************************************************ - * SPLITTING AND MERGING * + * SPLITTING AND MERGING * ************************************************************************ * * These routines do all the dirty work required to split and merge nodes. */ /* - * Split an internal into two nodes and move the separator at the split + * Split an internal node into two nodes and move the separator at the split * point to the parent. Note that the parent's parent's element pointing * to our parent will have an incorrect subtree_count (we don't update it). * It will be low, which is ok. * - * Cursor->index indicates the element the caller intends to push into. - * We will adjust cursor->node and cursor->index if that element winds + * (cursor->node, cursor->index) indicates the element the caller intends + * to push into. We will adjust node and index if that element winds * up in the split node. + * + * If we are at the root of a cluster a new root must be created with two + * elements, one pointing to the original root and one pointing to the + * newly allocated split node. + * + * NOTE! Being at the root of a cluster is different from being at the + * root of the root cluster. cursor->parent will not be NULL and + * cursor->node->ondisk.parent must be tested against 0. Theoretically + * we could propogate the algorithm into the parent and deal with multiple + * 'roots' in the cluster header, but it's easier not to. */ static int -btree_split_internal(hammer_btree_cursor_t cursor) +btree_split_internal(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *parent; - struct hammer_btree_internal_node *node; - struct hammer_btree_internal_node *new_node; - struct hammer_btree_internal_elm *elm; - struct hammer_btree_internal_elm *parent_elm; - struct hammer_buffer *new_buffer; - int32_t parent_offset; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t parent; + hammer_node_t new_node; + hammer_btree_elm_t elm; + hammer_btree_elm_t parent_elm; int parent_index; int made_root; int split; int error; - const size_t esize = sizeof(struct hammer_btree_internal_elm); + const int esize = sizeof(*elm); /* * We are splitting but elms[split] will be promoted to the parent, @@ -1182,37 +823,42 @@ btree_split_internal(hammer_btree_cursor_t cursor) * insertion point will be on the left-hand side adjust the split * point to give the right hand side one additional node. 
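The split-point bias described above is worth isolating. This is a hypothetical standalone helper, not code from the patch; it just restates the arithmetic so the direction of the bias is easy to check.

static int
pick_split(int count, int insert_index)
{
        int split = (count + 1) / 2;    /* nominal midpoint */

        if (insert_index <= split)      /* insertion lands on the left side, */
                --split;                /* so leave the spare room there */
        return (split);
}
/* pick_split(13, 10) == 7, pick_split(13, 3) == 6 */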
*/ - node = &cursor->node->internal; - split = (node->base.count + 1) / 2; + node = cursor->node; + ondisk = node->ondisk; + split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; error = 0; /* - * If we are at the root of the tree, create a new root node with + * If we are at the root of the cluster, create a new root node with * 1 element and split normally. Avoid making major modifications * until we know the whole operation will work. + * + * The root of the cluster is different from the root of the root + * cluster. Use the node's on-disk structure's parent offset to + * detect the case. */ - parent = cursor->parent; - if (parent == NULL) { - parent = hammer_alloc_btree(cursor->cluster, &error, - &cursor->parent_buffer); + if (ondisk->parent == 0) { + parent = hammer_alloc_btree(node->cluster, &error); if (parent == NULL) return(error); - parent->base.count = 1; - parent->base.parent = 0; - parent->base.type = HAMMER_BTREE_INTERNAL_NODE; - parent->base.subtype = node->base.type; - parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg; - parent->elms[0].subtree_offset = - hammer_bclu_offset(cursor->node_buffer, node); - parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end; + hammer_lock_ex(&parent->lock); + ondisk = parent->ondisk; + ondisk->count = 1; + ondisk->parent = 0; + ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + ondisk->elms[0].base = node->cluster->clu_btree_beg; + ondisk->elms[0].internal.subtree_type = node->ondisk->type; + ondisk->elms[0].internal.subtree_offset = node->node_offset; + ondisk->elms[1].base = node->cluster->clu_btree_end; made_root = 1; - cursor->parent_index = 0; /* insertion point in parent */ + parent_index = 0; /* index of current node in parent */ } else { made_root = 0; + parent = cursor->parent; + parent_index = cursor->parent_index; } - parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent); /* * Split node into new_node at the split point. @@ -1227,36 +873,38 @@ btree_split_internal(hammer_btree_cursor_t cursor) * 0 1 2 3 4 5 6 * */ - new_buffer = NULL; - new_node = hammer_alloc_btree(cursor->cluster, &error, &new_buffer); + new_node = hammer_alloc_btree(node->cluster, &error); if (new_node == NULL) { - if (made_root) - hammer_free_btree_ptr(cursor->parent_buffer, - (hammer_btree_node_t)parent); + if (made_root) { + hammer_unlock(&parent->lock); + hammer_free_btree(node->cluster, parent->node_offset); + hammer_rel_node(parent); + } return(error); } + hammer_lock_ex(&new_node->lock); /* - * Create the new node. P become the left-hand boundary in the + * Create the new node. P becomes the left-hand boundary in the * new node. Copy the right-hand boundary as well. * * elm is the new separator. */ - elm = &node->elms[split]; - bcopy(elm, &new_node->elms[0], (node->base.count - split + 1) * esize); - new_node->base.count = node->base.count - split; - new_node->base.parent = parent_offset; - new_node->base.type = HAMMER_BTREE_INTERNAL_NODE; - new_node->base.subtype = node->base.subtype; - KKASSERT(node->base.type == new_node->base.type); + ondisk = node->ondisk; + elm = &ondisk->elms[split]; + bcopy(elm, &new_node->ondisk->elms[0], + (ondisk->count - split + 1) * esize); + new_node->ondisk->count = ondisk->count - split; + new_node->ondisk->parent = parent->node_offset; + new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + KKASSERT(ondisk->type == new_node->ondisk->type); /* * Cleanup the original node. P becomes the new boundary, its * subtree_offset was moved to the new node. 
If we had created * a new root its parent pointer may have changed. */ - node->base.parent = parent_offset; - elm->subtree_offset = 0; + elm->internal.subtree_offset = 0; /* * Insert the separator into the parent, fixup the parent's @@ -1265,27 +913,33 @@ btree_split_internal(hammer_btree_cursor_t cursor) * * Remember that base.count does not include the right-hand boundary. */ - parent_index = cursor->parent_index; - parent->elms[parent_index].subtree_count = split; - parent_elm = &parent->elms[parent_index+1]; + ondisk = parent->ondisk; + ondisk->elms[parent_index].internal.subtree_count = split; + parent_elm = &ondisk->elms[parent_index+1]; bcopy(parent_elm, parent_elm + 1, - (parent->base.count - parent_index) * esize); - parent_elm->base = elm->base; /* separator P */ - parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_node); - parent_elm->subtree_count = new_node->base.count; - - hammer_modify_buffer(new_buffer); - hammer_modify_buffer(cursor->parent_buffer); - hammer_modify_buffer(cursor->node_buffer); + (ondisk->count - parent_index) * esize); + parent_elm->internal.base = elm->base; /* separator P */ + parent_elm->internal.subtree_offset = new_node->node_offset; + parent_elm->internal.subtree_count = new_node->ondisk->count; /* * The cluster's root pointer may have to be updated. */ if (made_root) { - cursor->cluster->ondisk->clu_btree_root = node->base.parent; - hammer_modify_cluster(cursor->cluster); + node->cluster->ondisk->clu_btree_root = parent->node_offset; + hammer_modify_cluster(node->cluster); + node->ondisk->parent = parent->node_offset; + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = parent; /* lock'd and ref'd */ } + hammer_modify_node(node); + hammer_modify_node(new_node); + hammer_modify_node(parent); + /* * Ok, now adjust the cursor depending on which element the original * index was pointing at. If we are >= the split point the push node @@ -1294,14 +948,22 @@ btree_split_internal(hammer_btree_cursor_t cursor) * NOTE: If we are at the split point itself we cannot stay with the * original node because the push index will point at the right-hand * boundary, which is illegal. + * + * NOTE: The cursor's parent or parent_index must be adjusted for + * the case where a new parent (new root) was created, and the case + * where the cursor is now pointing at the split node. 
*/ if (cursor->index >= split) { + cursor->parent_index = parent_index + 1; cursor->index -= split; - cursor->node = (hammer_btree_node_t)new_node; - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = new_buffer; + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = new_node; /* locked and ref'd */ + } else { + cursor->parent_index = parent_index; + hammer_unlock(&new_node->lock); + hammer_rel_node(new_node); } - return (0); } @@ -1310,29 +972,28 @@ btree_split_internal(hammer_btree_cursor_t cursor) */ static int -btree_split_leaf(hammer_btree_cursor_t cursor) +btree_split_leaf(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; - struct hammer_btree_leaf_node *new_leaf; - union hammer_btree_leaf_elm *elm; - struct hammer_btree_internal_elm *parent_elm; - struct hammer_buffer *new_buffer; - int32_t parent_offset; + hammer_node_ondisk_t ondisk; + hammer_node_t parent; + hammer_node_t leaf; + hammer_node_t new_leaf; + hammer_btree_elm_t elm; + hammer_btree_elm_t parent_elm; int parent_index; int made_root; int split; int error; - const size_t esize = sizeof(struct hammer_btree_internal_elm); + const size_t esize = sizeof(*elm); /* - * We are splitting but elms[split] will be promoted to the parent, - * leaving the right hand node with one less element. If the - * insertion point will be on the left-hand side adjust the split - * point to give the right hand side one additional node. + * Calculate the split point. If the insertion point will be on + * the left-hand side adjust the split point to give the right + * hand side one additional node. */ - leaf = &cursor->node->leaf; - split = (leaf->base.count + 1) / 2; + leaf = cursor->node; + ondisk = leaf->ondisk; + split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; error = 0; @@ -1342,26 +1003,26 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * 1 element and split normally. Avoid making major modifications * until we know the whole operation will work. */ - parent = cursor->parent; - if (parent == NULL) { - parent = hammer_alloc_btree(cursor->cluster, &error, - &cursor->parent_buffer); + if (ondisk->parent == 0) { + parent = hammer_alloc_btree(leaf->cluster, &error); if (parent == NULL) return(error); - parent->base.count = 1; - parent->base.parent = 0; - parent->base.type = HAMMER_BTREE_INTERNAL_NODE; - parent->base.subtype = leaf->base.type; - parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg; - parent->elms[0].subtree_offset = - hammer_bclu_offset(cursor->node_buffer, leaf); - parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end; + hammer_lock_ex(&parent->lock); + ondisk = parent->ondisk; + ondisk->count = 1; + ondisk->parent = 0; + ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + ondisk->elms[0].base = leaf->cluster->clu_btree_beg; + ondisk->elms[0].internal.subtree_type = leaf->ondisk->type; + ondisk->elms[0].internal.subtree_offset = leaf->node_offset; + ondisk->elms[1].base = leaf->cluster->clu_btree_end; made_root = 1; - cursor->parent_index = 0; /* insertion point in parent */ + parent_index = 0; /* insertion point in parent */ } else { made_root = 0; + parent = cursor->parent; + parent_index = cursor->parent_index; } - parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent); /* * Split leaf into new_leaf at the split point. 
Select a separator @@ -1375,33 +1036,35 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * / \ * L L L L L L L L */ - new_buffer = NULL; - new_leaf = hammer_alloc_btree(cursor->cluster, &error, &new_buffer); + new_leaf = hammer_alloc_btree(leaf->cluster, &error); if (new_leaf == NULL) { - if (made_root) - hammer_free_btree_ptr(cursor->parent_buffer, - (hammer_btree_node_t)parent); + if (made_root) { + hammer_unlock(&parent->lock); + hammer_free_btree(leaf->cluster, parent->node_offset); + hammer_rel_node(parent); + } return(error); } + hammer_lock_ex(&new_leaf->lock); /* * Create the new node. P become the left-hand boundary in the * new node. Copy the right-hand boundary as well. */ - elm = &leaf->elms[split]; - bcopy(elm, &new_leaf->elms[0], (leaf->base.count - split) * esize); - new_leaf->base.count = leaf->base.count - split; - new_leaf->base.parent = parent_offset; - new_leaf->base.type = HAMMER_BTREE_LEAF_NODE; - new_leaf->base.subtype = 0; - KKASSERT(leaf->base.type == new_leaf->base.type); + ondisk = leaf->ondisk; + elm = &ondisk->elms[split]; + bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); + new_leaf->ondisk->count = ondisk->count - split; + new_leaf->ondisk->parent = parent->node_offset; + new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; + KKASSERT(ondisk->type == new_leaf->ondisk->type); /* - * Cleanup the original node. P becomes the new boundary, its - * subtree_offset was moved to the new node. If we had created - * a new root its parent pointer may have changed. + * Cleanup the original node. Because this is a leaf node and + * leaf nodes do not have a right-hand boundary, there + * aren't any special edge cases to clean up. */ - leaf->base.parent = parent_offset; + /* nothing to do */ /* * Insert the separator into the parent, fixup the parent's @@ -1411,29 +1074,35 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * Remember that base.count does not include the right-hand boundary. * We are copying parent_index+1 to parent_index+2, not +0 to +1. */ - parent_index = cursor->parent_index; - parent->elms[parent_index].subtree_count = split; - parent_elm = &parent->elms[parent_index+1]; - if (parent_index + 1 != parent->base.count) { + ondisk = parent->ondisk; + ondisk->elms[parent_index].internal.subtree_count = split; + parent_elm = &ondisk->elms[parent_index+1]; + if (parent_index + 1 != ondisk->count) { bcopy(parent_elm, parent_elm + 1, - (parent->base.count - parent_index - 1) * esize); + (ondisk->count - parent_index - 1) * esize); } hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); - parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_leaf); - parent_elm->subtree_count = new_leaf->base.count; - - hammer_modify_buffer(new_buffer); - hammer_modify_buffer(cursor->parent_buffer); - hammer_modify_buffer(cursor->node_buffer); + parent_elm->internal.subtree_offset = new_leaf->node_offset; + parent_elm->internal.subtree_count = new_leaf->ondisk->count; /* * The cluster's root pointer may have to be updated. 
*/ if (made_root) { - cursor->cluster->ondisk->clu_btree_root = leaf->base.parent; - hammer_modify_cluster(cursor->cluster); + leaf->cluster->ondisk->clu_btree_root = parent->node_offset; + hammer_modify_cluster(leaf->cluster); + leaf->ondisk->parent = parent->node_offset; + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = parent; /* lock'd and ref'd */ } + hammer_modify_node(leaf); + hammer_modify_node(new_leaf); + hammer_modify_node(parent); + /* * Ok, now adjust the cursor depending on which element the original * index was pointing at. If we are >= the split point the push node @@ -1444,58 +1113,149 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * boundary, which is illegal. */ if (cursor->index >= split) { + cursor->parent_index = parent_index + 1; cursor->index -= split; - cursor->node = (hammer_btree_node_t)new_leaf; - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = new_buffer; + cursor->node = new_leaf; + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = new_leaf; + } else { + cursor->parent_index = parent_index; + hammer_unlock(&new_leaf->lock); + hammer_rel_node(new_leaf); } - return (0); } /* - * This routine is called on an internal node prior to recursing down - * through (node, index) when (node, index) MIGHT have too few elements for - * the caller to perform a deletion. + * Remove the element at (node, index). If the internal node would become + * empty passively recurse up the tree. + * + * A locked internal node is passed to this function, the node remains + * locked on return. Leaf nodes cannot be passed to this function. + * + * Returns EAGAIN if we were unable to acquire the needed locks. The caller + * does not deal with the empty leaf until determines whether this recursion + * has succeeded or not. + */ +int +btree_remove(hammer_node_t node, int index) +{ + hammer_node_ondisk_t ondisk; + hammer_node_t parent; + int error; + + ondisk = node->ondisk; + KKASSERT(ondisk->count > 0); + + /* + * Remove the element, shifting remaining elements left one. + * Note that our move must include the right-boundary element. + */ + if (ondisk->count != 1) { + bcopy(&ondisk->elms[index+1], &ondisk->elms[index], + (ondisk->count - index) * sizeof(ondisk->elms[0])); + --ondisk->count; + hammer_modify_node(node); + return(0); + } + + /* + * Internal nodes cannot drop to 0 elements, so remove the node + * from ITS parent. If the node is the root node, convert it to + * an empty leaf node (which can drop to 0 elements). + */ + if (ondisk->parent == 0) { + ondisk->count = 0; + ondisk->type = HAMMER_BTREE_TYPE_LEAF; + hammer_modify_node(node); + return(0); + } + + /* + * Try to remove the node from its parent. Return EAGAIN if we + * cannot. 
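The EAGAIN convention above is a classic deadlock-avoidance pattern: the child is already locked, so the parent may only be acquired with a non-blocking attempt, and failure is reported upward instead of waited on. A generic sketch of the pattern using POSIX mutexes follows; the names are illustrative, not HAMMER's lock API.

#include <errno.h>
#include <pthread.h>

struct tnode {
        pthread_mutex_t lock;
        struct tnode *parent;
};

/*
 * Called with 'node' locked. Blocking on the parent here could deadlock
 * against a top-down traversal that already holds the parent and is
 * waiting on this child, so only a try-lock is permitted.
 */
static int
lock_parent_try(struct tnode *node, struct tnode **parentp)
{
        struct tnode *parent = node->parent;

        if (parent == NULL)
                return (ENOENT);
        if (pthread_mutex_trylock(&parent->lock) != 0)
                return (EAGAIN);        /* caller backs off and retries */
        *parentp = parent;
        return (0);
}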
+ */ + parent = hammer_get_node(node->cluster, ondisk->parent, &error); + if (hammer_lock_ex_try(&parent->lock)) { + hammer_rel_node(parent); + return(EAGAIN); + } + ondisk = parent->ondisk; + for (index = 0; index < ondisk->count; ++index) { + if (ondisk->elms[index].internal.subtree_offset == + node->node_offset) { + break; + } + } + if (index == ondisk->count) { + kprintf("btree_remove: lost parent linkage to node\n"); + error = EIO; + } else { + error = btree_remove(parent, index); + if (error == 0) { + hammer_free_btree(node->cluster, node->node_offset); + /* NOTE: node can be reallocated at any time now */ + } + } + hammer_unlock(&parent->lock); + hammer_rel_node(parent); + return (error); +} + +#if 0 + +/* + * This routine is called on the internal node (node) prior to recursing down + * through (node, index) when the node referenced by (node, index) MIGHT + * have too few elements for the caller to perform a deletion. * * cursor->index is invalid on return because the separators may have gotten * adjusted, the caller must rescan the node's elements. The caller may set * cursor->index to -1 if it wants us to do a general rebalancing. + * + * This routine rebalances the children of the (node), collapsing children + * together if possible. On return each child will have at least L/2-1 + * elements unless the node only has one child. * * NOTE: Because we do not update the parent's parent in the split code, * the subtree_count used by the caller may be incorrect. We correct it * here. Also note that we cannot change the depth of the tree's leaf * nodes here (see btree_collapse()). * - * This routine rebalances the children of the node, collapsing children - * together if possible. On return each child will have at least L/2-1 - * elements unless the node only has one child. + * NOTE: We make no attempt to rebalance inter-cluster elements. */ static int -btree_rebalance_node(hammer_btree_cursor_t cursor) +btree_rebalance(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *node; - hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS]; - struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS]; - hammer_btree_node_t child; - hammer_btree_elm_inmemory_t elms; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t children[HAMMER_BTREE_INT_ELMS]; + hammer_node_t child; + hammer_btree_elm_t elm; + hammer_btree_elm_t elms; int i, j, n, nelms, goal; int maxelms, halfelms; int error; /* - * Basic setup + * If the elm being recursed through is an inter-cluster reference, + * don't worry about it. */ - node = &cursor->node->internal; - KKASSERT(node->elms[cursor->index].subtree_offset != 0); + ondisk = cursor->node->ondisk; + elm = &ondisk->elms[cursor->index]; + if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER) + return(0); + + KKASSERT(elm->internal.subtree_offset != 0); error = 0; /* * Load the children of node and do any necessary corrections - * to subtree_count. subtree_count may be incorrect due to the + * to subtree_count. subtree_count may be too low due to the * way insertions split nodes. Get a count of the total number - * of elements held by our children. + * of actual elements held by our children. 
*/ error = 0; @@ -1509,10 +1269,11 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) continue; child = hammer_bread(cursor->cluster, elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &child_buffer[i]); + &child_buffer[i], XXX); children[i] = child; if (child == NULL) continue; + XXX KKASSERT(node->base.subtype == child->base.type); /* @@ -1536,7 +1297,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) child = children[i]; for (j = 0; j < child->base.count; ++j) { elms[n].owner = child; - if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) + if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) elms[n].u.leaf = child->leaf.elms[j]; else elms[n].u.internal = child->internal.elms[j]; @@ -1572,7 +1333,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) child = children[i]; for (j = 0; j < goal && n < nelms; ++j) { - if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) { + if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) { child->leaf.elms[j] = elms[n].u.leaf; } else { child->internal.elms[j] = elms[n].u.internal; @@ -1584,12 +1345,12 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) * expensive. */ if (elms[n].owner != child && - node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) { + node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) { subchild = hammer_bread(cursor->cluster, elms[n].u.internal.subtree_offset, HAMMER_FSBUF_BTREE, &error, - &subchild_buffer); + &subchild_buffer, XXX); if (subchild) { subchild->base.parent = hammer_bclu_offset(child_buffer[i], @@ -1603,7 +1364,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) /* * Set right boundary if the children are internal nodes. */ - if (node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) + if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) child->internal.elms[j] = elms[n].u.internal; child->base.count = j; hammer_modify_buffer(child_buffer[i]); @@ -1665,10 +1426,10 @@ failed: */ static int -btree_collapse(hammer_btree_cursor_t cursor) +btree_collapse(hammer_cursor_t cursor) { - hammer_btree_node_t root, child; - hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS]; + hammer_btree_node_ondisk_t root, child; + hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS]; struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS]; int count; int i, j, n; @@ -1690,7 +1451,7 @@ btree_collapse(hammer_btree_cursor_t cursor) * children. */ KKASSERT(root->base.parent == 0); /* must be root node */ - KKASSERT(root->base.type == HAMMER_BTREE_INTERNAL_NODE); + KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL); KKASSERT(count <= HAMMER_BTREE_INT_ELMS); for (i = n = 0; i < count; ++i) { @@ -1716,7 +1477,7 @@ btree_collapse(hammer_btree_cursor_t cursor) continue; child = hammer_bread(cursor->cluster, elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &child_buffer[i]); + &child_buffer[i], XXX); children[i] = child; if (child == NULL) continue; @@ -1745,7 +1506,7 @@ btree_collapse(hammer_btree_cursor_t cursor) * element's boundaries should match the root's left and right * boundaries. 
*/ - if (root->base.subtype == HAMMER_BTREE_LEAF_NODE) { + if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) { for (i = n = 0; i < count; ++i) { child = children[i]; for (j = 0; j < child->base.count; ++j) { @@ -1787,7 +1548,8 @@ btree_collapse(hammer_btree_cursor_t cursor) elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &subchild_buffer); + &subchild_buffer, + XXX); if (subchild) { subchild->base.parent = root_offset; hammer_modify_buffer(subchild_buffer); @@ -1799,7 +1561,7 @@ btree_collapse(hammer_btree_cursor_t cursor) /* XXX generate a new separator? */ root->internal.elms[n] = child->internal.elms[j]; } - root->base.type = HAMMER_BTREE_INTERNAL_NODE; + root->base.type = HAMMER_BTREE_TYPE_INTERNAL; root->base.subtype = subsubtype; if (subchild_buffer) hammer_put_buffer(subchild_buffer, 0); @@ -1819,3 +1581,143 @@ done: return(error); } +#endif + +/************************************************************************ + * MISCELLANIOUS SUPPORT * + ************************************************************************/ + +/* + * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp). + * + * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. + * + * Note that key1 and key2 are treated differently. key1 is allowed to + * wildcard some of its fields by setting them to 0, while key2 is expected + * to be in an on-disk form (no wildcards). + */ +int +hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) +{ +#if 0 + kprintf("compare obj_id %016llx %016llx\n", + key1->obj_id, key2->obj_id); + kprintf("compare rec_type %04x %04x\n", + key1->rec_type, key2->rec_type); + kprintf("compare key %016llx %016llx\n", + key1->key, key2->key); +#endif + + /* + * A key1->obj_id of 0 matches any object id + */ + if (key1->obj_id) { + if (key1->obj_id < key2->obj_id) + return(-4); + if (key1->obj_id > key2->obj_id) + return(4); + } + + /* + * A key1->rec_type of 0 matches any record type. + */ + if (key1->rec_type) { + if (key1->rec_type < key2->rec_type) + return(-3); + if (key1->rec_type > key2->rec_type) + return(3); + } + + /* + * There is no special case for key. 0 means 0. + */ + if (key1->key < key2->key) + return(-2); + if (key1->key > key2->key) + return(2); + + /* + * This test has a number of special cases. create_tid in key1 is + * the as-of transction id, and delete_tid in key1 is NOT USED. + * + * A key1->create_tid of 0 matches any record regardles of when + * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be + * used to search for the most current state of the object. + * + * key2->create_tid is a HAMMER record and will never be + * 0. key2->delete_tid is the deletion transaction id or 0 if + * the record has not yet been deleted. + */ + if (key1->create_tid) { + if (key1->create_tid < key2->create_tid) + return(-1); + if (key2->delete_tid && key1->create_tid >= key2->delete_tid) + return(1); + } + + return(0); +} + +/* + * Create a separator half way inbetween key1 and key2. For fields just + * one unit apart, the separator will match key2. + * + * The handling of delete_tid is a little confusing. It is only possible + * to have one record in the B-Tree where all fields match except delete_tid. + * This means, worse case, two adjacent elements may have a create_tid that + * is one-apart and cause the separator to choose the right-hand element's + * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x). 
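Concretely, each separator field is the midpoint rounded up toward the right-hand key, which is what the MAKE_SEPARATOR macro below computes. A standalone restatement, for illustration only:

static long long
separator_field(long long k1, long long k2)
{
        /* midpoint between k1 and k2, rounded up toward k2 */
        return (k1 + ((k2 - k1 + 1) >> 1));
}
/*
 * separator_field(10, 20) == 15; separator_field(1, 2) == 2, i.e. fields
 * one unit apart yield the right-hand value, as noted above.
 */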
+ * + * So all we have to do is set delete_tid to the right-hand element to + * guarentee that the separator is properly between the two elements. + */ +#define MAKE_SEPARATOR(key1, key2, dest, field) \ + dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); + +static void +hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, + hammer_base_elm_t dest) +{ + bzero(dest, sizeof(*dest)); + MAKE_SEPARATOR(key1, key2, dest, obj_id); + MAKE_SEPARATOR(key1, key2, dest, rec_type); + MAKE_SEPARATOR(key1, key2, dest, key); + MAKE_SEPARATOR(key1, key2, dest, create_tid); + dest->delete_tid = key2->delete_tid; +} + +#undef MAKE_SEPARATOR + +/* + * Return whether a generic internal or leaf node is full + */ +static int +btree_node_is_full(hammer_node_ondisk_t node) +{ + switch(node->type) { + case HAMMER_BTREE_TYPE_INTERNAL: + if (node->count == HAMMER_BTREE_INT_ELMS) + return(1); + break; + case HAMMER_BTREE_TYPE_LEAF: + if (node->count == HAMMER_BTREE_LEAF_ELMS) + return(1); + break; + default: + panic("illegal btree subtype"); + } + return(0); +} + +#if 0 +static int +btree_max_elements(u_int8_t type) +{ + if (type == HAMMER_BTREE_TYPE_LEAF) + return(HAMMER_BTREE_LEAF_ELMS); + if (type == HAMMER_BTREE_TYPE_INTERNAL) + return(HAMMER_BTREE_INT_ELMS); + panic("btree_max_elements: bad type %d\n", type); +} +#endif + diff --git a/sys/vfs/hammer/hammer_btree.h b/sys/vfs/hammer/hammer_btree.h index 2120d678d4..8c39a21823 100644 --- a/sys/vfs/hammer/hammer_btree.h +++ b/sys/vfs/hammer/hammer_btree.h @@ -31,57 +31,65 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.4 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.5 2007/11/19 00:53:40 dillon Exp $ */ /* - * HAMMER BH-Tree index + * HAMMER B-Tree index * - * HAMMER implements a modified B+Tree. In all documentation this will - * simply be refered to as the HAMMER BH-Tree. Basically la BH-Tree - * looks like a B+Tree (A B-Tree which stores its records only at the leafs - * of the tree), but adds two additional boundary elements which describe - * the left-most and right-most element a node is able to represent. In - * otherwords, we have boundary elements at the two ends of a BH-Tree node - * instead of sub-tree pointers. + * HAMMER implements a modified B+Tree. B+Trees store records only + * at their leaves and HAMMER's modification is to adjust the internal + * elements so there is a boundary element on each side instead of sub-tree + * pointers. * - * A BH-Tree internal node looks like this: + * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to + * reduce confusion. + * + * A B-Tree internal node looks like this: * * B N N N N N N B <-- boundary and internal elements * S S S S S S S <-- subtree pointers * - * A BH-Tree leaf node looks like this: + * A B-Tree leaf node looks like this: * * L L L L L L L L <-- leaf elemenets * (there is also a previous and next-leaf pointer) * - * The recursion radix is reduced by 2 relative to a normal B-Tree but - * as a bonus we treat internal nodes completely differently from leaf nodes, - * which allows us to pack in more elements. + * The recursion radix of an internal node is reduced by 1 relative to + * a normal B-Tree in order to accomodate the right-hand boundary. 
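Because every internal node carries an explicit left and right boundary, a node can decide locally whether a key can possibly live underneath it; this is what lets searches start from a cached node rather than the root, as the next paragraph describes. A minimal sketch of that containment test, assuming a simplified node with integer bounds rather than HAMMER's on-disk layout:

struct bounds_node {
        long long left_key;             /* inclusive left boundary */
        long long right_key;            /* non-inclusive right boundary */
};

/*
 * A search started at a cached node only has to cursor up toward the
 * root while this predicate is false for its starting key.
 */
static int
node_covers(const struct bounds_node *node, long long key)
{
        return (key >= node->left_key && key < node->right_key);
}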
+ * + * The big benefit to using a B-Tree with built-in bounds information is + * that it makes it possible to cache pointers into the middle of the tree + * and not have to start searches, insertions, OR deletions at the root node. + * The boundary elements allow searches to progress in a definitive direction + * from any point in the tree without revisting nodes. This greatly improves + * the efficiency of many operations, most especially record appends. * - * The big benefit to using a BH-Tree is that it is possible to cache - * pointers into the middle of the tree and not have to start searches, - * insertions, OR deletions at the root node. In particular, searches are - * able to progress in a definitive direction from any point in the tree - * without revisting nodes. This greatly improves the efficiency of many - * operations, most especially record appends. + * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is + * constructed by allowing internal nodes to link to the roots of other + * clusters. Fields in the cluster header then reference back to its + * parent and use the cluster generation number to detect stale linkages. * - * BH-Trees also make stacking of trees fairly straightforward. + * The B-Tree balancing code can operate within a cluster or across the + * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root + * can be a leaf node in the worse case. A cluster is guarenteed to have + * sufficient free space to hold a single completely full leaf in the + * degenerate case. * - * Most of the structures below are on-disk structures. + * All of the structures below are on-disk structures. */ /* - * Common base for all B+Tree element types (40 bytes) + * Common base for all B-Tree element types (40 bytes) * - * Note that obj_type is set to the object type the record represents - * if an inode, directory entry, or a cluster range. A cluster range is + * obj_type is set to the object type the record represents if an inode, + * directory entry, or an inter-cluster reference. A cluster range is * special in that the B-Tree nodes represent a range within the B-Tree * inclusive of rec_type field, so obj_type must be used to detect the * cluster range entries. * - * The special field is used as a subtree pointer for internal nodes, - * and reserved for leaf nodes. + * subtree_type is only used by internal B-Tree elements and indicates + * whether we are referencing a cluster, a leaf, or an internal node. */ struct hammer_base_elm { int64_t obj_id; /* 00 object record is associated with */ @@ -90,9 +98,9 @@ struct hammer_base_elm { hammer_tid_t create_tid; /* 10 transaction id for record creation */ hammer_tid_t delete_tid; /* 18 transaction id for record update/del */ - u_int16_t rec_type; /* 20 */ - u_int8_t obj_type; /* 22 (special) */ - u_int8_t reserved06; /* 23 (special) */ + u_int16_t rec_type; /* 20 _RECTYPE_ */ + u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */ + u_int8_t subtree_type; /* 23 B-Tree element type */ int32_t reserved07; /* 24 (future) */ /* 28 */ }; @@ -100,23 +108,37 @@ struct hammer_base_elm { typedef struct hammer_base_elm *hammer_base_elm_t; /* - * Internal element - 48 bytes. Internal elements are independantly - * organized. Note that internal elements are a different size than - * leaf elements. + * Internal element (40 + 16 = 56 bytes). + * + * An internal element contains the left-hand boundary and a recursion to + * another B-Tree node. 
If rec_offset is 0 it points to another node in the + * current cluster at subtree_offset. Otherwise it points to the root + * of another cluster at volno/cluid. + * + * sub-cluster references have an associated record while intra-cluster + * references do not. + * + * subtree_count is the number of elements in an intra-cluster reference. + * For inter-cluster references subtree_count is chaining depth heuristic + * used to help balance the B-Tree globally. */ struct hammer_btree_internal_elm { struct hammer_base_elm base; - int32_t subtree_offset; - int8_t subtree_count; /* hint: can be too small, but not too big */ - int8_t reserved01; - int8_t reserved02; - int8_t reserved03; + int32_t rec_offset; /* 0 indicates internal reference */ + int32_t subtree_offset; /* offset or cluster id */ + int32_t subtree_volno; /* unused or volume number */ + int32_t subtree_count; /* hint: can be too small, but not too big */ }; +#define subtree_cluid subtree_offset +#define subtree_type base.subtree_type + /* - * Leaf B-Tree element (40 + 16 = 56 bytes) + * Leaf B-Tree element (40 + 16 = 56 bytes). + * + * A leaf */ -struct hammer_btree_record_elm { +struct hammer_btree_leaf_elm { struct hammer_base_elm base; int32_t rec_offset; int32_t data_offset; @@ -124,48 +146,39 @@ struct hammer_btree_record_elm { u_int32_t data_crc; }; -/* - * Leaf Cluster-reference B-Tree element (40 + 16 = 56 bytes) - */ -struct hammer_btree_cluster_elm { - struct hammer_base_elm base; - int32_t rec_offset; /* cluster recursion record */ - u_int32_t verifier; /* low 32 bits of target clu_id */ - int32_t vol_no; - int32_t clu_no; -}; - /* * Rollup btree leaf element types - 56 byte structure */ -union hammer_btree_leaf_elm { +union hammer_btree_elm { struct hammer_base_elm base; - struct hammer_btree_record_elm record; - struct hammer_btree_cluster_elm cluster; + struct hammer_btree_leaf_elm leaf; + struct hammer_btree_internal_elm internal; }; -typedef union hammer_btree_leaf_elm *hammer_btree_leaf_elm_t; +typedef union hammer_btree_elm *hammer_btree_elm_t; /* - * B-Tree node (normal or meta) - 56 + 56 * 8 = 504 bytes + * B-Tree node (normal or meta) - 24 + 56 * 14 = 808 bytes (8-byte aligned) * - * 32 B-Tree nodes fit in a 16K filesystem buffer. Remember, the filesystem - * buffer has a 128 byte header so (16384 - 128) / 32 = 508, but we want - * our structures to be 8-byte aligned so we use 504. + * 20 B-Tree nodes fit in a 16K filesystem buffer, leaving us room for + * the 128 byte filesystem buffer header and another 96 bytes of filler. * - * We store B-Tree records as elements in internal nodes, which means we - * must have a separate index of sub-tree pointers. The sub-tree pointers - * interlace the elements (that is, the elements act as separators). Thus - * we have to have one more pointer then we do elements so we can represent - * the gamut - the sub-tree to the left of the first element AND the sub-tree - * to the right of the last element. + * Each node contains 14 elements. The last element for an internal node + * is the right-boundary so internal nodes have one fewer logical elements + * then leaf nodes. * - * subcount represents the number of elements in the subnode (1-8) + * 'count' always refers to the number of elements and is non-inclusive of + * the right-hand boundary for an internal node. * - * 'count' always refers to the number of elements. + * NOTE: The node head for an internal does not contain the subtype + * (The B-Tree node type for the nodes referenced by its elements). 
+ * Instead, each element specifies the subtype (elm->base.subtype). + * This allows us to maintain an unbalanced B-Tree and to easily identify + * special inter-cluster link elements. */ -#define HAMMER_BTREE_LEAF_ELMS 8 -#define HAMMER_BTREE_INT_ELMS 9 /* +1 more for the right boundary */ +#define HAMMER_BTREE_LEAF_ELMS 14 +#define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) +#define HAMMER_BTREE_NODES 20 /* * It is safe to combine two adjacent nodes if the total number of elements @@ -174,126 +187,44 @@ typedef union hammer_btree_leaf_elm *hammer_btree_leaf_elm_t; #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3) #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3) -#define HAMMER_BTREE_INTERNAL_NODE ((u_int32_t)'I') -#define HAMMER_BTREE_LEAF_NODE ((u_int32_t)'L') +#define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I') +#define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L') +#define HAMMER_BTREE_TYPE_CLUSTER ((u_int8_t)'C') -struct hammer_base_node { +struct hammer_node_ondisk { + /* + * B-Tree node header (24 bytes) + */ int32_t count; - int32_t parent; + int32_t parent; /* 0 if at root of cluster */ u_int8_t type; - u_int8_t subtype; + u_int8_t reserved01; u_int16_t reserved02; - int32_t reserved03; -}; - -struct hammer_btree_internal_node { - /* - * B-Tree internal node header (24 bytes) - */ - struct hammer_base_node base; - int32_t reserved[2]; + int32_t reserved03; /* future heuristic */ + int32_t reserved04; /* future link_left */ + int32_t reserved05; /* future link_right */ /* - * Internal element array. The subtree fields are logically to - * the right of the element key. These fields are unused and - * must be 0 for the right-hand boundary element. - * - * The left-hand boundary element is at elms[0], the right-hand - * boundary element is at elms[count]. count does not include - * the right-hand boundary (so count at the termination of a - * standard for() loop will point at the right-hand boundary). + * Element array. Internal nodes have one less logical element + * (meaning: the same number of physical elements) in order to + * accomodate the right-hand boundary. The left-hand boundary + * is integrated into the first element. Leaf nodes have no + * boundary elements. */ - struct hammer_btree_internal_elm elms[HAMMER_BTREE_INT_ELMS+1]; + union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS]; }; -/* - * The hammer leaf node contains a smaller header - */ -struct hammer_btree_leaf_node { - /* - * B-Tree leaf node header - */ - struct hammer_base_node base; - int32_t link_left; - int32_t link_right; - int32_t reserved[8]; - - /* - * Leaf element array - */ - union hammer_btree_leaf_elm elms[HAMMER_BTREE_LEAF_ELMS]; -}; - -union hammer_btree_node { - struct hammer_base_node base; - struct hammer_btree_internal_node internal; - struct hammer_btree_leaf_node leaf; -}; - -typedef union hammer_btree_node *hammer_btree_node_t; - -/* - * This in-memory structure is used by btree_rebalance_node(). Note - * that internal elements are a different size than record and cluster - * elements. 
- */ -struct hammer_btree_elm_inmemory { - hammer_btree_node_t owner; - union { - struct hammer_base_elm base; - struct hammer_btree_internal_elm internal; - union hammer_btree_leaf_elm leaf; - } u; -}; - -typedef struct hammer_btree_elm_inmemory *hammer_btree_elm_inmemory_t; - +typedef struct hammer_node_ondisk *hammer_node_ondisk_t; /* - * B-Tree filesystem buffer + * B-Tree filesystem buffer (16K exactly) */ -#define HAMMER_BTREE_NODES \ - ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \ - sizeof(union hammer_btree_node)) - struct hammer_fsbuf_btree { - struct hammer_fsbuf_head head; - char unused[128]; - union hammer_btree_node nodes[HAMMER_BTREE_NODES]; -}; - -/* - * Key and caching structures used for HAMMER B-Tree lookups. These are - * in-memory structures. - */ -struct hammer_btree_cursor { - hammer_base_elm_t left_bound; - hammer_base_elm_t right_bound; - struct hammer_cluster *cluster; - struct hammer_buffer *parent_buffer; - struct hammer_btree_internal_node *parent; - struct hammer_buffer *node_buffer; - hammer_btree_node_t node; /* internal or leaf node */ - int index; - int parent_index; - int last_error; + struct hammer_fsbuf_head head; /* 128 */ + char filler[96]; + struct hammer_node_ondisk nodes[HAMMER_BTREE_NODES]; }; -typedef struct hammer_btree_cursor *hammer_btree_cursor_t; - -struct hammer_btree_info { - struct hammer_btree_cursor cursor; - struct hammer_buffer *data_buffer; - struct hammer_buffer *record_buffer; - union hammer_data_ondisk *data; /* returned data pointer */ - union hammer_record_ondisk *rec;/* returned record pointer */ -}; - -typedef struct hammer_btree_info *hammer_btree_info_t; - -#define HAMMER_BTREE_GET_RECORD 0x0001 -#define HAMMER_BTREE_GET_DATA 0x0002 -#define HAMMER_BTREE_CLUSTER_TAG 0x0004 /* stop at the cluster tag */ -#define HAMMER_BTREE_INSERT 0x0008 /* adjust for insert */ -#define HAMMER_BTREE_DELETE 0x0010 /* adjust for delete */ - +#if HAMMER_BTREE_NODES * (HAMMER_BTREE_LEAF_ELMS * 56 + 24) + 128 + 96 != 16384 +#error "Sanity check hammer_fsbuf_btree" +#endif diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c new file mode 100644 index 0000000000..861be29fe2 --- /dev/null +++ b/sys/vfs/hammer/hammer_cursor.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2007 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.1 2007/11/19 00:53:40 dillon Exp $ + */ + +/* + * HAMMER B-Tree index - cursor support routines + */ +#include "hammer.h" + +static int hammer_load_cursor_parent(hammer_cursor_t cursor); +static int hammer_load_cursor_parent_local(hammer_cursor_t cursor); +static int hammer_load_cursor_parent_cluster(hammer_cursor_t cursor); + +/* + * Initialize a fresh cursor at the root of the filesystem hierarchy. + * + * cursor->parent will be NULL on return since we are loading the root + * node of the root cluster. + */ +int +hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp) +{ + hammer_cluster_t cluster; + int error; + + bzero(cursor, sizeof(*cursor)); + cluster = hammer_get_root_cluster(hmp, &error); + if (error == 0) { + cursor->node = hammer_get_node(cluster, + cluster->ondisk->clu_btree_root, + &error); + hammer_rel_cluster(cluster, 0); + if (error == 0) + hammer_lock_ex(&cursor->node->lock); + } + if (error == 0) + error = hammer_load_cursor_parent(cursor); + return(error); +} + +/* + * Initialize a fresh cursor using the B-Tree node cached by the + * in-memory inode. + */ +int +hammer_init_cursor_ip(hammer_cursor_t cursor, hammer_inode_t ip) +{ + int error; + + if (ip->cache) { + bzero(cursor, sizeof(*cursor)); + cursor->node = ip->cache; + error = hammer_ref_node(cursor->node); + if (error == 0) { + hammer_lock_ex(&cursor->node->lock); + error = hammer_load_cursor_parent(cursor); + } else { + hammer_rel_node(cursor->node); + cursor->node = NULL; + } + } else { + error = hammer_init_cursor_hmp(cursor, ip->hmp); + } + return(error); +} + +/* + * We are finished with a cursor. We NULL out various fields as sanity + * check, in case the structure is inappropriately used afterwords. + */ +void +hammer_done_cursor(hammer_cursor_t cursor) +{ + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + cursor->parent = NULL; + } + if (cursor->node) { + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = NULL; + } + if (cursor->data_buffer) { + hammer_rel_buffer(cursor->data_buffer, 0); + cursor->data_buffer = NULL; + } + if (cursor->record_buffer) { + hammer_rel_buffer(cursor->record_buffer, 0); + cursor->record_buffer = NULL; + } + cursor->data = NULL; + cursor->record = NULL; + cursor->left_bound = NULL; + cursor->right_bound = NULL; +} + +/* + * Load the parent of cursor->node into cursor->parent. There are several + * cases. (1) The parent is in the current cluster. (2) We are at the + * root of the cluster and the parent is in another cluster. (3) We are at + * the root of the root cluster. 
+ */ +static +int +hammer_load_cursor_parent(hammer_cursor_t cursor) +{ + hammer_cluster_t cluster; + int error; + + cluster = cursor->node->cluster; + + if (cursor->node->ondisk->parent) { + error = hammer_load_cursor_parent_local(cursor); + } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0) { + error = hammer_load_cursor_parent_cluster(cursor); + } else { + cursor->parent = NULL; + cursor->parent_index = 0; + cursor->left_bound = &cluster->ondisk->clu_btree_beg; + cursor->right_bound = &cluster->ondisk->clu_btree_end; + error = 0; + } + return(error); +} + +static +int +hammer_load_cursor_parent_local(hammer_cursor_t cursor) +{ + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int error; + int i; + + node = cursor->node; + parent = hammer_get_node(node->cluster, node->ondisk->parent, &error); + if (error) + return(error); + elm = NULL; + for (i = 0; i < parent->ondisk->count; ++i) { + elm = &parent->ondisk->elms[i]; + if (parent->ondisk->elms[i].internal.subtree_offset == + node->node_offset) { + break; + } + } + KKASSERT(i != parent->ondisk->count); + KKASSERT(parent->ondisk->elms[i].internal.rec_offset == 0); + cursor->parent = parent; + cursor->parent_index = i; + cursor->left_bound = &elm[0].internal.base; + cursor->right_bound = &elm[1].internal.base; + + if (hammer_lock_ex_try(&parent->lock) != 0) { + hammer_unlock(&cursor->node->lock); + hammer_lock_ex(&parent->lock); + hammer_lock_ex(&cursor->node->lock); + /* XXX check node generation count */ + } + return(error); +} + +static +int +hammer_load_cursor_parent_cluster(hammer_cursor_t cursor) +{ + hammer_cluster_ondisk_t ondisk; + hammer_cluster_t cluster; + hammer_volume_t volume; + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int32_t clu_no; + int32_t vol_no; + int error; + int i; + + node = cursor->node; + ondisk = node->cluster->ondisk; + vol_no = ondisk->clu_btree_parent_vol_no; + clu_no = ondisk->clu_btree_parent_clu_no; + + /* + * Acquire the node from (volume, cluster, offset) + */ + volume = hammer_get_volume(node->cluster->volume->hmp, vol_no, &error); + if (error) + return (error); + cluster = hammer_get_cluster(volume, clu_no, &error, 0); + hammer_rel_volume(volume, 0); + if (error) + return (error); + parent = hammer_get_node(cluster, ondisk->clu_btree_parent_offset, + &error); + hammer_rel_cluster(cluster, 0); + if (error) + return (error); + + /* + * Ok, we have the node. Locate the inter-cluster element + */ + elm = NULL; + for (i = 0; i < parent->ondisk->count; ++i) { + elm = &parent->ondisk->elms[i]; + if (elm->internal.rec_offset != 0 && + elm->internal.subtree_cluid == clu_no) { + break; + } + } + KKASSERT(i != parent->ondisk->count); + cursor->parent = parent; + cursor->parent_index = i; + cursor->left_bound = &elm[0].internal.base; + cursor->right_bound = &elm[1].internal.base; + + KKASSERT(hammer_btree_cmp(cursor->left_bound, + &parent->cluster->ondisk->clu_btree_beg) == 0); + KKASSERT(hammer_btree_cmp(cursor->right_bound, + &parent->cluster->ondisk->clu_btree_end) == 0); + + if (hammer_lock_ex_try(&parent->lock) != 0) { + hammer_unlock(&cursor->node->lock); + hammer_lock_ex(&parent->lock); + hammer_lock_ex(&cursor->node->lock); + /* XXX check node generation count */ + } + return(0); +} + + +/* + * Cursor up to our parent node. Return ENOENT if we are at the root of + * the root cluster. + */ +int +hammer_cursor_up(hammer_cursor_t cursor) +{ + int error; + + /* + * Set the node to its parent. 
If the parent is NULL we are at + * the root of the root cluster and return ENOENT. + */ + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = cursor->parent; + cursor->index = cursor->parent_index; + cursor->parent = NULL; + cursor->parent_index = 0; + + if (cursor->node == NULL) { + error = ENOENT; + } else { + error = hammer_load_cursor_parent(cursor); + } + return(error); +} + +/* + * Set the cursor to the root of the current cursor's cluster. + */ +int +hammer_cursor_toroot(hammer_cursor_t cursor) +{ + hammer_cluster_t cluster; + int error; + + /* + * Already at root of cluster? + */ + if (cursor->node->ondisk->parent == 0) + return (0); + + /* + * Parent is root of cluster? + */ + if (cursor->parent->ondisk->parent == 0) + return (hammer_cursor_up(cursor)); + + /* + * Ok, reload the cursor with the root of the cluster, then + * locate its parent. + */ + cluster = cursor->node->cluster; + error = hammer_ref_cluster(cluster); + if (error) + return (error); + + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->parent = NULL; + cursor->parent_index = 0; + + cursor->node = hammer_get_node(cluster, cluster->ondisk->clu_btree_root, + &error); + cursor->index = 0; + hammer_rel_cluster(cluster, 0); + if (error == 0) + error = hammer_load_cursor_parent(cursor); + return(error); +} + +/* + * Cursor down through the current node, which must be an internal node. + * + * This routine adjusts the cursor and sets index to 0. + */ +int +hammer_cursor_down(hammer_cursor_t cursor) +{ + hammer_node_t node; + hammer_btree_elm_t elm; + hammer_volume_t volume; + hammer_cluster_t cluster; + int32_t vol_no; + int32_t clu_no; + int error; + + /* + * The current node becomes the current parent + */ + node = cursor->node; + KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); + KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count); + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = node; + cursor->parent_index = cursor->index; + cursor->node = NULL; + cursor->index = 0; + + /* + * Extract element to push into at (node,index) + */ + elm = &node->ondisk->elms[cursor->parent_index]; + + /* + * Ok, push down into elm. If rec_offset is non-zero this is + * an inter-cluster push, otherwise it is a intra-cluster push. + */ + if (elm->internal.rec_offset) { + vol_no = elm->internal.subtree_volno; + clu_no = elm->internal.subtree_cluid; + volume = hammer_get_volume(node->cluster->volume->hmp, + vol_no, &error); + if (error) + return(error); + cluster = hammer_get_cluster(volume, clu_no, &error, 0); + hammer_rel_volume(volume, 0); + if (error) + return(error); + node = hammer_get_node(cluster, + cluster->ondisk->clu_btree_root, + &error); + hammer_rel_cluster(cluster, 0); + } else { + node = hammer_get_node(node->cluster, + elm->internal.subtree_offset, + &error); + } + if (error == 0) { + hammer_lock_ex(&node->lock); + cursor->node = node; + cursor->index = 0; + } + return(error); +} + diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h new file mode 100644 index 0000000000..db84c61f7a --- /dev/null +++ b/sys/vfs/hammer/hammer_cursor.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2007 The DragonFly Project. All rights reserved. 
+ * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.1 2007/11/19 00:53:40 dillon Exp $ + */ + +/* + * The hammer_cursor structure is the primary in-memory management structure + * for B-Tree operations. + * + * The most important issue to make note of is that a hammer_cursor is a + * tracking structure. Any active hammer_cursor structure will be linked into + * hammer_node based lists and B-Tree operations executed by unrelated + * treads MAY MODIFY YOUR CURSOR when you are not holding an exclusive + * lock on the cursor's nodes. + * + * Most operations maintain an exclusive lock on their node and + * parent node, but certain cursor operations may temporarily release + * one or both locks. In particular, a cursor_up operation may have + * to temporarily release underlying locks to avoid a deadlock. + */ +struct hammer_cursor { + /* + * Parent B-Tree node, current B-Tree node, and related element + * indices. + */ + hammer_node_t parent; + int parent_index; + hammer_node_t node; + int index; + + /* + * Pointer to the current node's bounds. Typically points to the + * appropriate boundary elements in the parent or points to bounds + * stored in the cluster. The right-boundary is range-exclusive. + */ + hammer_base_elm_t left_bound; + hammer_base_elm_t right_bound; + + /* + * Key or key range governing search + */ + struct hammer_base_elm key_beg; + struct hammer_base_elm key_end; + + /* + * Related data and record references. Note that the related buffers + * can be NULL when data and/or record is not, typically indicating + * information referenced via an in-memory record. 
+ */ + struct hammer_buffer *data_buffer; /* extracted data & record */ + union hammer_data_ondisk *data; + struct hammer_buffer *record_buffer; + union hammer_record_ondisk *record; + + /* + * Iteration and extraction control variables + */ + int flags; + int last_error; + + /* + * Merged in-memory/on-disk iterations also use these fields. + */ + struct hammer_inode *ip; + struct hammer_record *iprec; +}; + +typedef struct hammer_cursor *hammer_cursor_t; + +#define HAMMER_BTREE_GET_RECORD 0x0001 +#define HAMMER_BTREE_GET_DATA 0x0002 +#define HAMMER_BTREE_CLUSTER_TAG 0x0004 /* stop at the cluster tag */ +#define HAMMER_BTREE_INSERT 0x0008 /* adjust for insert */ +#define HAMMER_BTREE_DELETE 0x0010 /* adjust for delete */ + diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 6663428bfe..58a5c6b07c 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.5 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.6 2007/11/19 00:53:40 dillon Exp $ */ #ifndef _SYS_UUID_H_ @@ -202,6 +202,8 @@ struct hammer_volume_ondisk { u_int32_t vol0_bitmap[1024]; }; +typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; + #define HAMMER_VOLF_VALID 0x0001 /* valid entry */ #define HAMMER_VOLF_OPEN 0x0002 /* volume is open */ #define HAMMER_VOLF_USINGSUPERCL 0x0004 /* using superclusters */ @@ -231,6 +233,8 @@ struct hammer_supercl_ondisk { struct hammer_almeta scl_meta[HAMMER_SUPERCL_METAELMS]; }; +typedef struct hammer_supercl_ondisk *hammer_supercl_ondisk_t; + /* * HAMMER Cluster header * @@ -272,10 +276,8 @@ struct hammer_cluster_ondisk { uuid_t vol_fsid; /* identify filesystem - sanity check */ uuid_t vol_fstype; /* identify filesystem type - sanity check */ - u_int64_t clu_gen; /* identify generation number of cluster */ - u_int64_t clu_unused01; - hammer_tid_t clu_id; /* unique cluster self identification */ + hammer_tid_t clu_gen; /* generation number */ int32_t vol_no; /* cluster contained in volume (sanity) */ u_int32_t clu_flags; /* cluster flags */ @@ -296,11 +298,10 @@ struct hammer_cluster_ondisk { /* * Specify the range of information stored in this cluster as two - * btree elements. These elements exist as separate records that - * point to us in the parent cluster's B-Tree. - * - * Note that clu_btree_end is range-inclusive, not range-exclusive. - * i.e. 0-1023 instead of 0,1024. + * btree elements. These elements match the left and right + * boundary elements in the internal B-Tree node of the parent + * cluster that points to the root of our cluster. Because these + * are boundary elements, the right boundary is range-NONinclusive. */ struct hammer_base_elm clu_btree_beg; struct hammer_base_elm clu_btree_end; @@ -308,12 +309,16 @@ struct hammer_cluster_ondisk { /* * The cluster's B-Tree root can change as a side effect of insertion * and deletion operations so store an offset instead of embedding - * the root node. + * the root node. The parent_offset is stale if the generation number + * does not match. + * + * Parent linkages are explicit. 
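Taken together with hammer_cursor.c above, the intended calling pattern for
the new cursor is short.  The sketch below mirrors what hammer_get_inode()
does later in this patch; hmp, obj_id and asof_tid stand in for values the
caller already has, and error handling is abbreviated:

	struct hammer_cursor cursor;
	int error;

	/*
	 * Point the cursor at the root of the filesystem hierarchy and
	 * describe the key we are looking for.
	 */
	error = hammer_init_cursor_hmp(&cursor, hmp);
	cursor.key_beg.obj_id = obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = asof_tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;
	cursor.flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA;

	if (error == 0)
		error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/* cursor.record and cursor.data now point at the result */
	}

	/*
	 * Always release the cursor; it drops whatever node locks, node
	 * refs and buffer refs it is still holding.
	 */
	hammer_done_cursor(&cursor);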
*/ int32_t clu_btree_root; int32_t clu_btree_parent_vol_no; int32_t clu_btree_parent_clu_no; - hammer_tid_t clu_btree_parent_clu_id; + int32_t clu_btree_parent_offset; + hammer_tid_t clu_btree_parent_clu_gen; u_int64_t synchronized_rec_id; @@ -323,6 +328,8 @@ struct hammer_cluster_ondisk { struct hammer_almeta clu_mdata_meta[HAMMER_CLU_SLAVE_METAELMS]; }; +typedef struct hammer_cluster_ondisk *hammer_cluster_ondisk_t; + /* * HAMMER records are 96 byte entities encoded into 16K filesystem buffers. * Each record has a 64 byte header and a 32 byte extension. 170 records @@ -370,12 +377,23 @@ struct hammer_base_record { * targets and even though they are built within a HAMMER filesystem they * get their own obj_id space (and thus can serve as a replication target) * and look like a mount point to the system. + * + * Inter-cluster records are special-cased in the B-Tree. These records + * are referenced from a B-Tree INTERNAL node, NOT A LEAF. This means + * that the element in the B-Tree node is actually a boundary element whos + * base element fields, including rec_type, reflect the boundary, NOT + * the inter-cluster record type. + * + * HAMMER_RECTYPE_CLUSTER - only set in the actual inter-cluster record, + * not set in the left or right boundary elements around the inter-cluster + * reference of an internal node in the B-Tree (because doing so would + * interfere with the boundary tests). */ #define HAMMER_RECTYPE_UNKNOWN 0 #define HAMMER_RECTYPE_LOWEST 1 /* lowest record type avail */ #define HAMMER_RECTYPE_INODE 1 /* inode in obj_id space */ #define HAMMER_RECTYPE_PSEUDO_INODE 2 /* pseudo filesysem */ -#define HAMMER_RECTYPE_CLUSTER 3 /* cluster reference */ +#define HAMMER_RECTYPE_CLUSTER 3 /* inter-cluster reference */ #define HAMMER_RECTYPE_DATA 0x10 #define HAMMER_RECTYPE_DIRENTRY 0x11 #define HAMMER_RECTYPE_DB 0x12 @@ -391,10 +409,6 @@ struct hammer_base_record { #define HAMMER_OBJTYPE_SOFTLINK 7 #define HAMMER_OBJTYPE_PSEUDOFS 8 /* pseudo filesystem obj */ -#define HAMMER_OBJTYPE_CLUSTER_FLAG 0x20 -#define HAMMER_OBJTYPE_CLUSTER_BEG 0x20 -#define HAMMER_OBJTYPE_CLUSTER_END 0x21 - /* * Generic full-sized record */ @@ -570,6 +584,7 @@ struct hammer_inode_data { u_int64_t parent_obj_id;/* parent directory obj_id */ uuid_t uid; uuid_t gid; + /* XXX device, softlink extension */ }; #define HAMMER_INODE_DATA_VERSION 1 diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index adfdc7eb39..0ac3bd062e 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" @@ -63,10 +63,6 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap) /* * Obtain a vnode for the specified inode number. An exclusively locked * vnode is returned. - * - * To avoid deadlocks we cannot hold the inode lock while we are creating - * a new vnode. We can prevent the inode from going away, however. If - * we race another vget we just throw away our newly created vnode. 
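The parent linkage in a child cluster now carries both the parent node offset
and the parent cluster's generation, so a stale back-pointer can be detected
instead of trusted.  This patch does not implement the test yet (note the XXX
generation-check comments in hammer_cursor.c); the helper below only sketches
the intended comparison and its name is invented:

/*
 * Hypothetical staleness test, not part of this patch.  The child recorded
 * clu_btree_parent_clu_gen when the linkage was made; if the parent cluster
 * has since been regenerated, clu_btree_parent_offset can no longer be
 * trusted and the linkage must be re-resolved.
 */
static __inline int
hammer_parent_link_stale(hammer_cluster_ondisk_t child,
			 hammer_cluster_ondisk_t parent)
{
	return (child->clu_btree_parent_clu_gen != parent->clu_gen);
}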
*/ int hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) @@ -85,9 +81,8 @@ hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) *vpp = NULL; return(error); } - hammer_lock_to_ref(&ip->lock); error = hammer_get_vnode(ip, LK_EXCLUSIVE, vpp); - hammer_put_inode_ref(ip); + hammer_rel_inode(ip); return (error); } @@ -107,42 +102,48 @@ hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp) error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0); if (error) break; - if (ip->vp == NULL) { - vp = *vpp; - ip->vp = vp; - vp->v_type = hammer_get_vnode_type( - ip->ino_rec.base.base.obj_type); - vp->v_data = (void *)ip; - /* vnode locked by getnewvnode() */ - break; - } - vp->v_type = VBAD; - vx_put(vp); - } else { - /* - * loop if the vget fails (aka races), or if the vp - * no longer matches ip->vp. - */ - if (vget(vp, LK_EXCLUSIVE) == 0) { - if (vp == ip->vp) - break; - vput(vp); + hammer_lock_ex(&ip->lock); + if (ip->vp != NULL) { + hammer_unlock(&ip->lock); + vp->v_type = VBAD; + vx_put(vp); + continue; } + hammer_ref(&ip->lock); + vp = *vpp; + ip->vp = vp; + vp->v_type = hammer_get_vnode_type( + ip->ino_rec.base.base.obj_type); + vp->v_data = (void *)ip; + /* vnode locked by getnewvnode() */ + /* make related vnode dirty if inode dirty? */ + hammer_unlock(&ip->lock); + break; + } + + /* + * loop if the vget fails (aka races), or if the vp + * no longer matches ip->vp. + */ + if (vget(vp, LK_EXCLUSIVE) == 0) { + if (vp == ip->vp) + break; + vput(vp); } } return(error); } /* - * Get and lock a HAMMER inode. These functions do not attach or detach - * the related vnode. + * Acquire a HAMMER inode. The returned inode is not locked. These functions + * do not attach or detach the related vnode (use hammer_get_vnode() for + * that). */ struct hammer_inode * hammer_get_inode(struct hammer_mount *hmp, u_int64_t obj_id, int *errorp) { - struct hammer_btree_info binfo; struct hammer_inode_info iinfo; - struct hammer_base_elm key; + struct hammer_cursor cursor; struct hammer_inode *ip; /* @@ -154,7 +155,7 @@ hammer_get_inode(struct hammer_mount *hmp, u_int64_t obj_id, int *errorp) loop: ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo); if (ip) { - hammer_lock(&ip->lock); + hammer_ref(&ip->lock); *errorp = 0; return(ip); } @@ -163,21 +164,24 @@ loop: ip->obj_id = obj_id; ip->obj_asof = iinfo.obj_asof; ip->hmp = hmp; + RB_INIT(&ip->rec_tree); /* + * Locate the on-disk inode. * If we do not have an inode cached search the HAMMER on-disk B-Tree * for it. */ - hammer_btree_info_init(&binfo, hmp->rootcl); - key.obj_id = ip->obj_id; - key.key = 0; - key.create_tid = iinfo.obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_INODE; - key.obj_type = 0; - *errorp = hammer_btree_lookup(&binfo, &key, HAMMER_BTREE_GET_RECORD | - HAMMER_BTREE_GET_DATA); + hammer_init_cursor_hmp(&cursor, hmp); + cursor.key_beg.obj_id = ip->obj_id; + cursor.key_beg.key = 0; + cursor.key_beg.create_tid = iinfo.obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE; + cursor.key_beg.obj_type = 0; + cursor.flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; + + *errorp = hammer_btree_lookup(&cursor); /* * On success the B-Tree lookup will hold the appropriate @@ -185,10 +189,11 @@ loop: * information. Copy the information to the in-memory inode. 
*/ if (*errorp == 0) { - ip->ino_rec = binfo.rec->inode; - ip->ino_data = binfo.data->inode; + ip->ino_rec = cursor.record->inode; + ip->ino_data = cursor.data->inode; } - hammer_btree_info_done(&binfo); + hammer_cache_node(cursor.node, &ip->cache); + hammer_done_cursor(&cursor); /* * On success load the inode's record and data and insert the @@ -196,7 +201,10 @@ loop: * insertion of the same inode so deal with that condition too. */ if (*errorp == 0) { + hammer_ref(&ip->lock); if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { + hammer_uncache_node(&ip->cache); + hammer_unref(&ip->lock); kfree(ip, M_HAMMER); goto loop; } @@ -207,39 +215,93 @@ loop: return (ip); } -void -hammer_lock_inode(struct hammer_inode *ip) +/* + * Create a new filesystem object, returning the inode in *ipp. The + * returned inode will be referenced but not locked. + * + * The inode is created in-memory and will be delay-synchronized to the + * disk. + */ +int +hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap, + struct ucred *cred, struct hammer_inode *dip, + struct hammer_inode **ipp) { - hammer_lock(&ip->lock); -} + struct hammer_mount *hmp; + struct hammer_inode *ip; -void -hammer_put_inode(struct hammer_inode *ip) -{ - hammer_unlock(&ip->lock); + hmp = trans->hmp; + ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO); + ip->obj_id = ++hmp->last_ino; + KKASSERT(ip->obj_id != 0); + ip->obj_asof = HAMMER_MAX_TID; /* XXX */ + ip->hmp = hmp; + ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY | + HAMMER_INODE_ITIMES; + ip->last_tid = trans->tid; + + RB_INIT(&ip->rec_tree); + + ip->ino_rec.ino_atime = trans->tid; + ip->ino_rec.ino_mtime = trans->tid; + ip->ino_rec.ino_size = 0; + ip->ino_rec.ino_nlinks = 0; + /* XXX */ + ip->ino_rec.base.rec_id = ++hmp->rootvol->ondisk->vol0_recid; + hammer_modify_volume(hmp->rootvol); + KKASSERT(ip->ino_rec.base.rec_id != 0); + ip->ino_rec.base.base.obj_id = ip->obj_id; + ip->ino_rec.base.base.key = 0; + ip->ino_rec.base.base.create_tid = trans->tid; + ip->ino_rec.base.base.delete_tid = 0; + ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE; + ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type); + + ip->ino_data.version = HAMMER_INODE_DATA_VERSION; + ip->ino_data.mode = vap->va_mode; + ip->ino_data.ctime = trans->tid; + ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0; + if (vap->va_vaflags & VA_UID_UUID_VALID) + ip->ino_data.uid = vap->va_uid_uuid; + else + hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid); + if (vap->va_vaflags & VA_GID_UUID_VALID) + ip->ino_data.gid = vap->va_gid_uuid; + else + hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid); + + hammer_ref(&ip->lock); + if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { + hammer_unref(&ip->lock); + panic("hammer_create_inode: duplicate obj_id"); + } + *ipp = ip; + return(0); } void -hammer_put_inode_ref(struct hammer_inode *ip) +hammer_rel_inode(struct hammer_inode *ip) { + /* XXX check last ref */ hammer_unref(&ip->lock); } /* + * Unload and destroy the specified inode. 
+ * * (called via RB_SCAN) */ int hammer_unload_inode(struct hammer_inode *ip, void *data __unused) { - struct vnode *vp; - KKASSERT(ip->lock.refs == 0); - if ((vp = ip->vp) != NULL) { - ip->vp = NULL; - vp->v_data = NULL; - /* XXX */ - } + KKASSERT(ip->vp == NULL); + hammer_ref(&ip->lock); RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); + + hammer_uncache_node(&ip->cache); + + /* XXX flush */ kfree(ip, M_HAMMER); return(0); } @@ -253,25 +315,61 @@ hammer_modify_inode(struct hammer_transaction *trans, struct hammer_inode *ip, int flags) { ip->flags |= flags; + ip->last_tid = trans->tid; +} + +/************************************************************************ + * HAMMER INODE MERGED-RECORD FUNCTIONS * + ************************************************************************ + * + * These functions augment the B-Tree scanning functions in hammer_btree.c + * by merging in-memory records with on-disk records. + */ + +hammer_record_ondisk_t +hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) +{ + KKASSERT(0); + return(NULL); +} + +hammer_record_ondisk_t +hammer_ip_next(hammer_cursor_t cursor) +{ + KKASSERT(0); + return(NULL); +} + +int +hammer_ip_resolve_data(hammer_cursor_t cursor) +{ KKASSERT(0); + return(NULL); } /* * Access the filesystem buffer containing the cluster-relative byte * offset, validate the buffer type, load *bufferp and return a - * pointer to the requested data. + * pointer to the requested data. The buffer is reference and locked on + * return. * * If buf_type is 0 the buffer is assumed to be a pure-data buffer and * no type or crc check is performed. * + * If *bufferp is not NULL on entry it is assumed to contain a locked + * and referenced buffer which will then be replaced. + * + * If the caller is holding another unrelated buffer locked it must be + * passed in reorderbuf so we can properly order buffer locks. + * * XXX add a flag for the buffer type and check the CRC here XXX */ void * -hammer_bread(struct hammer_cluster *cluster, int32_t cloff, - u_int64_t buf_type, - int *errorp, struct hammer_buffer **bufferp) +hammer_bread(hammer_cluster_t cluster, int32_t cloff, + u_int64_t buf_type, int *errorp, + struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; int32_t buf_no; int32_t buf_off; @@ -282,16 +380,19 @@ hammer_bread(struct hammer_cluster *cluster, int32_t cloff, buffer = *bufferp; if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { - if (buffer) - hammer_put_buffer(buffer, 0); + if (buffer) { + hammer_unlock(&buffer->io.lock); + hammer_rel_buffer(buffer, 0); + } buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; if (buffer == NULL) return(NULL); + hammer_lock_ex(&buffer->io.lock); } /* - * Validate the buffer type and crc XXX + * Validate the buffer type */ buf_off = cloff & HAMMER_BUFMASK; if (buf_type) { @@ -306,7 +407,6 @@ hammer_bread(struct hammer_cluster *cluster, int32_t cloff, *errorp = EIO; return(NULL); } - /* XXX crc */ } /* diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 48db524450..5addc7f4a6 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
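The merged-record functions above are left as KKASSERT(0) stubs for now.  For
orientation only, the eventual iteration will presumably interleave the
inode's in-memory rec_tree with the on-disk B-Tree scan along these lines;
this is a sketch, not the implementation, and it glosses over locking,
deletions and iteration state:

/*
 * Illustrative sketch of an eventual hammer_ip_first(): pick whichever of
 * the first in-memory record and the first on-disk record sorts lower, so
 * the caller sees a single merged, key-ordered stream.
 */
static hammer_record_ondisk_t
hammer_ip_first_sketch(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	hammer_record_t memrec;
	hammer_record_ondisk_t diskrec;

	memrec = RB_MIN(hammer_rec_rb_tree, &ip->rec_tree);
	diskrec = (hammer_btree_lookup(cursor) == 0) ? cursor->record : NULL;

	if (memrec == NULL)
		return (diskrec);
	if (diskrec == NULL ||
	    hammer_btree_cmp(&memrec->rec.base.base, &diskrec->base.base) < 0)
		return (&memrec->rec);
	return (diskrec);
}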
* - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -268,7 +268,8 @@ hammer_io_complete(struct buf *bp) /* * Callback from kernel when it wishes to deallocate a passively * associated structure. This can only occur if the buffer is - * passively associated with the structure. + * passively associated with the structure. The kernel has locked + * the buffer. * * If we cannot disassociate we set B_LOCKED to prevent the buffer * from getting reused. @@ -282,26 +283,55 @@ hammer_io_deallocate(struct buf *bp) KKASSERT(io->io.released); crit_enter(); - if (io->io.lock.refs) { - bp->b_flags |= B_LOCKED; - } else { - hammer_lock(&io->io.lock); - KKASSERT(io->io.lock.refs == 1); + + /* + * First, ref the structure to prevent either the buffer or the + * structure from going away. + */ + hammer_ref(&io->io.lock); + + /* + * Buffers can have active references from cached hammer_node's, + * even if those nodes are themselves passively cached. Attempt + * to clean them out. This may not succeed. + */ + if (io->io.type == HAMMER_STRUCTURE_BUFFER && + hammer_lock_ex_try(&io->io.lock) == 0) { + hammer_flush_buffer_nodes(&io->buffer); + hammer_unlock(&io->io.lock); + } + + if (hammer_islastref(&io->io.lock)) { + /* + * If we are the only ref left we can disassociate the + * I/O. It had better still be in a released state because + * the kernel is holding a lock on the buffer. + */ + KKASSERT(io->io.released); + hammer_io_disassociate(io); + bp->b_flags &= ~B_LOCKED; + switch(io->io.type) { case HAMMER_STRUCTURE_VOLUME: - hammer_put_volume(&io->volume, 1); + hammer_rel_volume(&io->volume, 1); break; case HAMMER_STRUCTURE_SUPERCL: - hammer_put_supercl(&io->supercl, 1); + hammer_rel_supercl(&io->supercl, 1); break; case HAMMER_STRUCTURE_CLUSTER: - hammer_put_cluster(&io->cluster, 1); + hammer_rel_cluster(&io->cluster, 1); break; case HAMMER_STRUCTURE_BUFFER: - hammer_put_buffer(&io->buffer, 1); + hammer_rel_buffer(&io->buffer, 1); break; } /* NOTE: io may be invalid (kfree'd) here */ + } else { + /* + * Otherwise tell the kernel not to destroy the buffer + */ + bp->b_flags |= B_LOCKED; + hammer_unlock(&io->io.lock); } crit_exit(); } @@ -356,6 +386,17 @@ hammer_io_checkwrite(struct buf *bp) } } +/* + * Return non-zero if the caller should flush the structure associated + * with this io sub-structure. + */ +int +hammer_io_checkflush(struct hammer_io *io) +{ + if (io->bp == NULL || (io->bp->b_flags & B_LOCKED)) + return(1); + return(0); +} /* * Return non-zero if we wish to delay the kernel's attempt to flush diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 9ee4c33876..6f8e303dbf 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,41 +31,180 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" +static int hammer_add_record(hammer_transaction_t trans, + hammer_record_t record); + /* - * Allocate a new filesystem object + * Red-black tree support. 
*/ -int -hammer_alloc_inode(struct hammer_transaction *trans, struct vattr *vap, - struct ucred *cred, struct hammer_inode **ipp) +static int +hammer_rec_rb_compare(struct hammer_record *rec1, struct hammer_record *rec2) +{ + if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type) + return(-1); + if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type) + return(1); + + if (rec1->rec.base.base.key < rec2->rec.base.base.key) + return(-1); + if (rec1->rec.base.base.key > rec2->rec.base.base.key) + return(1); + + if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid) + return(-1); + if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid) + return(1); + return(0); +} + +static int +hammer_rec_compare(struct hammer_base_elm *info, struct hammer_record *rec) { - return(EINVAL); + /* + * A key1->rec_type of 0 matches any record type. + */ + if (info->rec_type) { + if (info->rec_type < rec->rec.base.base.rec_type) + return(-3); + if (info->rec_type > rec->rec.base.base.rec_type) + return(3); + } + + /* + * There is no special case for key. 0 means 0. + */ + if (info->key < rec->rec.base.base.key) + return(-2); + if (info->key > rec->rec.base.base.key) + return(2); + + /* + * This test has a number of special cases. create_tid in key1 is + * the as-of transction id, and delete_tid in key1 is NOT USED. + * + * A key1->create_tid of 0 matches any record regardles of when + * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be + * used to search for the most current state of the object. + * + * key2->create_tid is a HAMMER record and will never be + * 0. key2->delete_tid is the deletion transaction id or 0 if + * the record has not yet been deleted. + */ + if (info->create_tid) { + if (info->create_tid < rec->rec.base.base.create_tid) + return(-1); + if (rec->rec.base.base.delete_tid && + info->create_tid >= rec->rec.base.base.delete_tid) { + return(1); + } + } + return(0); } +RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); +RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node, + hammer_rec_compare, hammer_base_elm_t); + +/* + * Add a directory entry (dip,ncp) which references inode (ip). + * + * Note that the low 32 bits of the namekey are set temporarily to create + * a unique in-memory record, and may be modified a second time when the + * record is synchronized to disk. In particular, the low 32 bits cannot be + * all 0's when synching to disk, which is not handled here. 
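The create_tid handling in hammer_rec_compare() gives in-memory records the
same as-of visibility rule the on-disk B-Tree uses: once rec_type and key have
matched, a non-zero as-of tid selects the record when it falls in the
half-open interval [create_tid, delete_tid), with delete_tid == 0 meaning the
record has not been deleted yet.  With invented numbers, a record created at
tid 100 and deleted at tid 200 behaves like this:

/*
 * rec.create_tid = 100, rec.delete_tid = 200
 *
 *	as-of  50 -> -1  (record did not exist yet at tid 50)
 *	as-of 100 ->  0  (match: 100 >= create_tid, 100 < delete_tid)
 *	as-of 199 ->  0  (match)
 *	as-of 200 ->  1  (record was already deleted as of tid 200)
 *	as-of   0 ->  0  (0 is a wildcard and matches any lifetime)
 *
 * Equivalent visibility predicate for a non-zero as-of tid:
 *
 *	visible = asof >= rec.create_tid &&
 *		  (rec.delete_tid == 0 || asof < rec.delete_tid);
 */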
+ */ int hammer_add_directory(struct hammer_transaction *trans, struct hammer_inode *dip, struct namecache *ncp, struct hammer_inode *ip) { - return(EINVAL); + struct hammer_record *record; + int error; + int bytes; + + record = hammer_alloc_ip_record(trans, dip); + + bytes = ncp->nc_nlen + 1; + + record->rec.entry.base.base.obj_id = dip->obj_id; + record->rec.entry.base.base.key = hammer_directory_namekey(ncp->nc_name, bytes - 1); + record->rec.entry.base.base.key += trans->hmp->namekey_iterator++; + record->rec.entry.base.base.create_tid = trans->tid; + record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY; + record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type; + record->rec.entry.base.base.obj_id = ip->obj_id; + if (bytes <= sizeof(record->rec.entry.den_name)) { + record->data = (void *)record->rec.entry.den_name; + } else { + record->data = kmalloc(bytes, M_HAMMER, M_WAITOK); + record->flags |= HAMMER_RECF_ALLOCDATA; + bcopy(ncp->nc_name, record->data, bytes); + } + record->data_len = bytes; + ++dip->ino_rec.ino_nlinks; + hammer_modify_inode(trans, dip, HAMMER_INODE_RDIRTY); + error = hammer_add_record(trans, record); + return(error); +} + +/* + * Allocate a record for the caller to finish filling in + */ +struct hammer_record * +hammer_alloc_ip_record(struct hammer_transaction *trans, hammer_inode_t ip) +{ + hammer_record_t record; + + record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO); + record->last_tid = trans->tid; + record->ip = ip; + return (record); } -#if 0 +/* + * Free a record. Clean the structure up even though we are throwing it + * away as a sanity check. + */ +void +hammer_free_ip_record(struct hammer_record *record) +{ + if (record->flags & HAMMER_RECF_ONRBTREE) { + RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record); + record->flags &= ~HAMMER_RECF_ONRBTREE; + } + if (record->flags & HAMMER_RECF_ALLOCDATA) { + kfree(record->data, M_HAMMER); + record->flags &= ~HAMMER_RECF_ALLOCDATA; + } + record->data = NULL; + kfree(record, M_HAMMER); +} /* - * + * Add the record to the inode's rec_tree. Directory entries */ +static int -hammer_add_record(struct hammer_transaction *trans, - struct hammer_inode *ip, hammer_record_ondisk_t rec, - void *data, int bytes) +hammer_add_record(struct hammer_transaction *trans, hammer_record_t record) { + while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) { + if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){ + hammer_free_ip_record(record); + return (EEXIST); + } + record->rec.base.base.key &= ~(0xFFFFFFFFLL); + record->rec.base.base.key |= trans->hmp->namekey_iterator++; + } + record->flags |= HAMMER_RECF_ONRBTREE; + return(0); } +#if 0 /* * Delete records belonging to the specified range. Deal with edge and * overlap cases. This function sets the delete tid and breaks adds diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index e4e8633416..740cde76b1 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. 
These routines are primarily @@ -45,27 +45,27 @@ #include #include -static void hammer_free_volume(struct hammer_volume *volume); -static struct hammer_cluster * - hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, - int new); +static void hammer_free_volume(hammer_volume_t volume); +static int hammer_load_volume(hammer_volume_t volume); +static int hammer_load_supercl(hammer_supercl_t supercl, int isnew); +static int hammer_load_cluster(hammer_cluster_t cluster, int isnew); +static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type); +static void hammer_remove_node_clist(hammer_buffer_t buffer, + hammer_node_t node); static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type); -static void alloc_new_buffer(struct hammer_cluster *cluster, +static void alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live, u_int64_t type, int32_t nelements, int32_t start, int *errorp, struct hammer_buffer **bufferp); #if 0 -static void readhammerbuf(struct hammer_volume *vol, void *data, +static void readhammerbuf(hammer_volume_t vol, void *data, int64_t offset); -static void writehammerbuf(struct hammer_volume *vol, const void *data, +static void writehammerbuf(hammer_volume_t vol, const void *data, int64_t offset); #endif -static int64_t calculate_cluster_offset(struct hammer_volume *vol, - int32_t clu_no); -static int64_t calculate_supercl_offset(struct hammer_volume *vol, - int32_t scl_no); - +static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no); +static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no); struct hammer_alist_config Buf_alist_config; struct hammer_alist_config Vol_normal_alist_config; @@ -78,7 +78,7 @@ struct hammer_alist_config Clu_slave_alist_config; * Red-Black tree support for various structures */ static int -hammer_ino_rb_compare(struct hammer_inode *ip1, struct hammer_inode *ip2) +hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2) { if (ip1->obj_id < ip2->obj_id) return(-1); @@ -92,7 +92,7 @@ hammer_ino_rb_compare(struct hammer_inode *ip1, struct hammer_inode *ip2) } static int -hammer_inode_info_cmp(hammer_inode_info_t info, struct hammer_inode *ip) +hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip) { if (info->obj_id < ip->obj_id) return(-1); @@ -106,7 +106,7 @@ hammer_inode_info_cmp(hammer_inode_info_t info, struct hammer_inode *ip) } static int -hammer_vol_rb_compare(struct hammer_volume *vol1, struct hammer_volume *vol2) +hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2) { if (vol1->vol_no < vol2->vol_no) return(-1); @@ -116,7 +116,7 @@ hammer_vol_rb_compare(struct hammer_volume *vol1, struct hammer_volume *vol2) } static int -hammer_scl_rb_compare(struct hammer_supercl *cl1, struct hammer_supercl *cl2) +hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2) { if (cl1->scl_no < cl2->scl_no) return(-1); @@ -126,7 +126,7 @@ hammer_scl_rb_compare(struct hammer_supercl *cl1, struct hammer_supercl *cl2) } static int -hammer_clu_rb_compare(struct hammer_cluster *cl1, struct hammer_cluster *cl2) +hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2) { if (cl1->clu_no < cl2->clu_no) return(-1); @@ -136,7 +136,7 @@ hammer_clu_rb_compare(struct hammer_cluster *cl1, struct hammer_cluster *cl2) } static int -hammer_buf_rb_compare(struct hammer_buffer *buf1, struct hammer_buffer *buf2) +hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2) { if (buf1->buf_no < buf2->buf_no) return(-1); @@ -145,6 +145,16 @@ 
hammer_buf_rb_compare(struct hammer_buffer *buf1, struct hammer_buffer *buf2) return(0); } +static int +hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2) +{ + if (node1->node_offset < node2->node_offset) + return(-1); + if (node1->node_offset < node2->node_offset) + return(1); + return(0); +} + /* * Note: The lookup function for hammer_ino_rb_tree winds up being named * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info). The other lookup @@ -161,8 +171,13 @@ RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node, hammer_clu_rb_compare, int32_t, clu_no); RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node, hammer_buf_rb_compare, int32_t, buf_no); +RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node, + hammer_nod_rb_compare, int32_t, node_offset); -/* +/************************************************************************ + * VOLUMES * + ************************************************************************ + * * Load a HAMMER volume by name. Returns 0 on success or a positive error * code on failure. Volumes must be loaded at mount time, get_volume() will * not load a new volume. @@ -170,10 +185,10 @@ RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node, * Calls made to hammer_load_volume() or single-threaded */ int -hammer_load_volume(struct hammer_mount *hmp, const char *volname) +hammer_install_volume(struct hammer_mount *hmp, const char *volname) { struct mount *mp; - struct hammer_volume *volume; + hammer_volume_t volume; struct hammer_volume_ondisk *ondisk; struct nlookupdata nd; struct buf *bp = NULL; @@ -257,13 +272,15 @@ hammer_load_volume(struct hammer_mount *hmp, const char *volname) /* * Set the root volume and load the root cluster. HAMMER special * cases rootvol and rootcl and will not deallocate the structures. + * We do not hold a ref because this would prevent related I/O + * from being flushed. */ if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) { hmp->rootvol = volume; hmp->rootcl = hammer_get_cluster(volume, ondisk->vol0_root_clu_no, &error, 0); - hammer_put_cluster(hmp->rootcl, 0); + hammer_rel_cluster(hmp->rootcl, 0); hmp->fsid_udev = dev2udev(vn_todev(volume->devvp)); } late_failure: @@ -282,10 +299,10 @@ late_failure: * so returns -1 on failure. */ int -hammer_unload_volume(struct hammer_volume *volume, void *data __unused) +hammer_unload_volume(hammer_volume_t volume, void *data __unused) { struct hammer_mount *hmp = volume->hmp; - struct hammer_cluster *rootcl; + hammer_cluster_t rootcl; int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0); /* @@ -296,18 +313,17 @@ hammer_unload_volume(struct hammer_volume *volume, void *data __unused) * Clean up the root cluster, which is held unlocked in the root * volume. */ + hammer_ref(&volume->io.lock); if (hmp->rootvol == volume) { - if ((rootcl = hmp->rootcl) != NULL) { - hammer_lock(&rootcl->io.lock); + if ((rootcl = hmp->rootcl) != NULL) hmp->rootcl = NULL; - hammer_put_cluster(rootcl, 1); - } hmp->rootvol = NULL; } /* * Flush the volume */ + KKASSERT(volume->io.lock.refs == 1); hammer_io_release(&volume->io, 1); volume->ondisk = NULL; if (volume->devvp) { @@ -330,7 +346,7 @@ hammer_unload_volume(struct hammer_volume *volume, void *data __unused) static void -hammer_free_volume(struct hammer_volume *volume) +hammer_free_volume(hammer_volume_t volume) { if (volume->vol_name) { kfree(volume->vol_name, M_HAMMER); @@ -346,11 +362,10 @@ hammer_free_volume(struct hammer_volume *volume) /* * Get a HAMMER volume. The volume must already exist. 
*/ -struct hammer_volume * +hammer_volume_t hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) { struct hammer_volume *volume; - struct hammer_volume_ondisk *ondisk; /* * Locate the volume structure @@ -360,18 +375,64 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) *errorp = ENOENT; return(NULL); } + hammer_ref(&volume->io.lock); /* - * Load the ondisk buffer if necessary. Create a b_dep dependancy - * for the buffer that allows us to manage our structure with the - * buffer (mostly) left in a released state. + * Deal with on-disk info */ - hammer_lock(&volume->io.lock); if (volume->ondisk == NULL) { - *errorp = hammer_io_read(volume->devvp, &volume->io); + *errorp = hammer_load_volume(volume); if (*errorp) { + hammer_rel_volume(volume, 1); + volume = NULL; + } + } else { + *errorp = 0; + } + return(volume); +} + +hammer_volume_t +hammer_get_root_volume(struct hammer_mount *hmp, int *errorp) +{ + hammer_volume_t volume; + + volume = hmp->rootvol; + KKASSERT(volume != NULL); + hammer_ref(&volume->io.lock); + + /* + * Deal with on-disk info + */ + if (volume->ondisk == NULL) { + *errorp = hammer_load_volume(volume); + if (*errorp) { + hammer_rel_volume(volume, 1); + volume = NULL; + } + } else { + *errorp = 0; + } + return (volume); +} + +/* + * Load a volume's on-disk information. The volume must be referenced and + * not locked. We temporarily acquire an exclusive lock to interlock + * against releases or multiple get's. + */ +static int +hammer_load_volume(hammer_volume_t volume) +{ + struct hammer_volume_ondisk *ondisk; + int error; + + hammer_lock_ex(&volume->io.lock); + if (volume->ondisk == NULL) { + error = hammer_io_read(volume->devvp, &volume->io); + if (error) { hammer_unlock(&volume->io.lock); - return(NULL); + return (error); } volume->ondisk = ondisk = (void *)volume->io.bp->b_data; @@ -389,34 +450,44 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) volume->alist.info = NULL; } hammer_alist_init(&volume->alist); + } else { + error = 0; } - *errorp = 0; - return(volume); + hammer_unlock(&volume->io.lock); + return(0); } /* - * Unlock and release a volume. - * - * Buffer cache interactions (applies to volumes, superclusters, clusters, - * and buffer structures): - * + * Release a volume. Call hammer_io_release on the last reference. We have + * to acquire an exclusive lock to interlock against volume->ondisk tests + * in hammer_load_volume(). */ void -hammer_put_volume(struct hammer_volume *volume, int flush) +hammer_rel_volume(hammer_volume_t volume, int flush) { if (hammer_islastref(&volume->io.lock)) { - hammer_io_release(&volume->io, flush); - } else { + hammer_lock_ex(&volume->io.lock); + if (hammer_islastref(&volume->io.lock)) { + volume->ondisk = NULL; + hammer_io_release(&volume->io, flush); + } hammer_unlock(&volume->io.lock); } + hammer_unref(&volume->io.lock); } -struct hammer_supercl * -hammer_get_supercl(struct hammer_volume *volume, int32_t scl_no, +/************************************************************************ + * SUPER-CLUSTERS * + ************************************************************************ + * + * Manage super-clusters. Note that a supercl holds a reference to its + * associated volume. 
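From here on the same acquisition shape repeats for volumes, super-clusters,
clusters and buffers: take a reference, and only if the ondisk pointer is
missing call the matching load routine, which grabs the exclusive lock,
re-tests ondisk and performs the I/O.  Reduced to its generic form (xxx is a
stand-in for volume/supercl/cluster/buffer, not a real function; the supercl,
cluster and buffer variants also force a load when creating a new object):

	/* generic shape of hammer_get_xxx() after this patch */
	hammer_ref(&obj->io.lock);
	if (obj->ondisk == NULL) {
		*errorp = hammer_load_xxx(obj);	/* lock_ex, re-test, read */
		if (*errorp) {
			hammer_rel_xxx(obj, 1);
			obj = NULL;
		}
	} else {
		*errorp = 0;
	}
	return (obj);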
+ */ +hammer_supercl_t +hammer_get_supercl(hammer_volume_t volume, int32_t scl_no, int *errorp, int isnew) { - struct hammer_supercl_ondisk *ondisk; - struct hammer_supercl *supercl; + hammer_supercl_t supercl; /* * Locate and lock the super-cluster structure, creating one @@ -430,34 +501,53 @@ again: supercl->volume = volume; supercl->io.offset = calculate_supercl_offset(volume, scl_no); supercl->io.type = HAMMER_STRUCTURE_SUPERCL; - hammer_lock(&supercl->io.lock); + hammer_ref(&supercl->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. */ if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) { - hammer_unlock(&supercl->io.lock); + hammer_unref(&supercl->io.lock); kfree(supercl, M_HAMMER); goto again; } hammer_ref(&volume->io.lock); } else { - hammer_lock(&supercl->io.lock); + hammer_ref(&supercl->io.lock); } /* - * Load the cluster's on-disk info + * Deal with on-disk info */ - *errorp = 0; + if (supercl->ondisk == NULL || isnew) { + *errorp = hammer_load_supercl(supercl, isnew); + if (*errorp) { + hammer_rel_supercl(supercl, 1); + supercl = NULL; + } + } else { + *errorp = 0; + } + return(supercl); +} + +static int +hammer_load_supercl(hammer_supercl_t supercl, int isnew) +{ + struct hammer_supercl_ondisk *ondisk; + hammer_volume_t volume = supercl->volume; + int error; + + hammer_lock_ex(&supercl->io.lock); if (supercl->ondisk == NULL) { if (isnew) - *errorp = hammer_io_new(volume->devvp, &supercl->io); + error = hammer_io_new(volume->devvp, &supercl->io); else - *errorp = hammer_io_read(volume->devvp, &supercl->io); - if (*errorp) { - hammer_put_supercl(supercl, 1); - return(NULL); + error = hammer_io_read(volume->devvp, &supercl->io); + if (error) { + hammer_unlock(&supercl->io.lock); + return (error); } supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data; @@ -480,47 +570,61 @@ again: hammer_alist_init(&supercl->alist); } } else if (isnew) { - *errorp = hammer_io_new(volume->devvp, &supercl->io); - if (*errorp) { - hammer_put_supercl(supercl, 1); - supercl = NULL; - } + error = hammer_io_new(volume->devvp, &supercl->io); + } else { + error = 0; } - return (supercl); + hammer_unlock(&supercl->io.lock); + return (error); } +/* + * Release a super-cluster. We have to deal with several places where + * another thread can ref the super-cluster. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. 
+ */ void -hammer_put_supercl(struct hammer_supercl *supercl, int flush) +hammer_rel_supercl(hammer_supercl_t supercl, int flush) { - struct hammer_volume *volume; + hammer_volume_t volume; if (hammer_islastref(&supercl->io.lock)) { - volume = supercl->volume; - hammer_io_release(&supercl->io, flush); - if (supercl->io.bp == NULL && - hammer_islastref(&supercl->io.lock)) { - RB_REMOVE(hammer_scl_rb_tree, - &volume->rb_scls_root, supercl); - kfree(supercl, M_HAMMER); - if (hammer_islastref(&volume->io.lock)) { - hammer_ref_to_lock(&volume->io.lock); - hammer_put_volume(volume, 0); + hammer_lock_ex(&supercl->io.lock); + if (hammer_islastref(&supercl->io.lock)) { + supercl->ondisk = NULL; + hammer_io_release(&supercl->io, flush); + if (supercl->io.bp == NULL && + hammer_islastref(&supercl->io.lock)) { + volume = supercl->volume; + RB_REMOVE(hammer_scl_rb_tree, + &volume->rb_scls_root, supercl); + supercl->volume = NULL; /* sanity */ + kfree(supercl, M_HAMMER); + hammer_rel_volume(volume, 0); + return; } } - } else { hammer_unlock(&supercl->io.lock); } + hammer_unref(&supercl->io.lock); } -struct hammer_cluster * -hammer_get_cluster(struct hammer_volume *volume, int32_t clu_no, +/************************************************************************ + * CLUSTERS * + ************************************************************************ + * + * Manage clusters. Note that a cluster holds a reference to its + * associated volume. + */ +hammer_cluster_t +hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, int *errorp, int isnew) { - struct hammer_cluster *cluster; + hammer_cluster_t cluster; - /* - * Locate and lock the cluster structure, creating one if necessary. - */ again: cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no); if (cluster == NULL) { @@ -529,55 +633,83 @@ again: cluster->volume = volume; cluster->io.offset = calculate_cluster_offset(volume, clu_no); RB_INIT(&cluster->rb_bufs_root); + RB_INIT(&cluster->rb_nods_root); cluster->io.type = HAMMER_STRUCTURE_CLUSTER; - hammer_lock(&cluster->io.lock); + hammer_ref(&cluster->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. 
*/ if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) { - hammer_unlock(&cluster->io.lock); + hammer_unref(&cluster->io.lock); kfree(cluster, M_HAMMER); goto again; } hammer_ref(&volume->io.lock); } else { - hammer_lock(&cluster->io.lock); + hammer_ref(&cluster->io.lock); } - return(hammer_load_cluster(cluster, errorp, isnew)); + + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL || isnew) { + *errorp = hammer_load_cluster(cluster, isnew); + if (*errorp) { + hammer_rel_cluster(cluster, 1); + cluster = NULL; + } + } else { + *errorp = 0; + } + return (cluster); } -struct hammer_cluster * -hammer_get_rootcl(struct hammer_mount *hmp) +hammer_cluster_t +hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp) { - struct hammer_cluster *cluster = hmp->rootcl; - int dummy_error; + hammer_cluster_t cluster; - hammer_lock(&cluster->io.lock); - return(hammer_load_cluster(cluster, &dummy_error, 0)); -} + cluster = hmp->rootcl; + KKASSERT(cluster != NULL); + hammer_ref(&cluster->io.lock); + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL) { + *errorp = hammer_load_cluster(cluster, 0); + if (*errorp) { + hammer_rel_cluster(cluster, 1); + cluster = NULL; + } + } else { + *errorp = 0; + } + return (cluster); +} static -struct hammer_cluster * -hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) +int +hammer_load_cluster(hammer_cluster_t cluster, int isnew) { - struct hammer_volume *volume = cluster->volume; + hammer_volume_t volume = cluster->volume; struct hammer_cluster_ondisk *ondisk; + int error; /* * Load the cluster's on-disk info */ - *errorp = 0; + hammer_lock_ex(&cluster->io.lock); if (cluster->ondisk == NULL) { if (isnew) - *errorp = hammer_io_new(volume->devvp, &cluster->io); + error = hammer_io_new(volume->devvp, &cluster->io); else - *errorp = hammer_io_read(volume->devvp, &cluster->io); - if (*errorp) { - hammer_put_cluster(cluster, 1); - return(NULL); + error = hammer_io_read(volume->devvp, &cluster->io); + if (error) { + hammer_unlock(&cluster->io.lock); + return (error); } cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data; @@ -593,6 +725,9 @@ hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) cluster->alist_mdata.meta = ondisk->clu_mdata_meta; cluster->alist_mdata.info = cluster; + cluster->clu_btree_beg = ondisk->clu_btree_beg; + cluster->clu_btree_end = ondisk->clu_btree_end; + /* * If this is a new cluster we have to initialize * various ondisk structural elements. The caller is @@ -612,52 +747,108 @@ hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) hammer_alist_init(&cluster->alist_mdata); } } else if (isnew) { - *errorp = hammer_io_new(volume->devvp, &cluster->io); - if (*errorp) { - hammer_put_cluster(cluster, 1); - cluster = NULL; - } + error = hammer_io_new(volume->devvp, &cluster->io); + } else { + error = 0; + } + hammer_unlock(&cluster->io.lock); + return (error); +} + +/* + * Reference a cluster that is either already referenced or via a specially + * handled pointer (aka rootcl). + */ +int +hammer_ref_cluster(hammer_cluster_t cluster) +{ + int error; + + KKASSERT(cluster != NULL); + hammer_ref(&cluster->io.lock); + + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL) { + error = hammer_load_cluster(cluster, 0); + if (error) + hammer_rel_cluster(cluster, 1); + } else { + error = 0; } - return(cluster); + return(error); } +/* + * Release a cluster. 
We have to deal with several places where + * another thread can ref the cluster. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. + */ void -hammer_put_cluster(struct hammer_cluster *cluster, int flush) +hammer_rel_cluster(hammer_cluster_t cluster, int flush) { - struct hammer_volume *volume; + hammer_node_t node; + hammer_volume_t volume; if (hammer_islastref(&cluster->io.lock)) { - volume = cluster->volume; - hammer_io_release(&cluster->io, flush); - if (cluster->io.bp == NULL && - volume->hmp->rootcl != cluster && - hammer_islastref(&cluster->io.lock)) { - RB_REMOVE(hammer_clu_rb_tree, - &volume->rb_clus_root, cluster); - kfree(cluster, M_HAMMER); - if (hammer_islastref(&volume->io.lock)) { - hammer_ref_to_lock(&volume->io.lock); - hammer_put_volume(volume, 0); + hammer_lock_ex(&cluster->io.lock); + if (hammer_islastref(&cluster->io.lock)) { + cluster->ondisk = NULL; + hammer_io_release(&cluster->io, flush); + + /* + * Clean out the B-Tree node cache, if any, then + * clean up the volume ref and free the cluster. + * + * If the cluster acquires a new reference while we + * are trying to clean it out, abort the cleaning. + * + * There really shouldn't be any nodes at this point + * but we allow a node with no buffer association + * so handle the case. + */ + while (cluster->io.bp == NULL && + hammer_islastref(&cluster->io.lock) && + (node = RB_ROOT(&cluster->rb_nods_root)) != NULL + ) { + KKASSERT(node->lock.refs == 0); + hammer_flush_node(node); + } + if (cluster->io.bp == NULL && + hammer_islastref(&cluster->io.lock)) { + volume = cluster->volume; + RB_REMOVE(hammer_clu_rb_tree, + &volume->rb_clus_root, cluster); + cluster->volume = NULL; /* sanity */ + kfree(cluster, M_HAMMER); + hammer_rel_volume(volume, 0); + return; } } - } else { hammer_unlock(&cluster->io.lock); } + hammer_unref(&cluster->io.lock); } -/* - * Get a buffer from a cluster. Note that buffer #0 is the cluster header - * itself and may not be retrieved with this function. +/************************************************************************ + * BUFFERS * + ************************************************************************ + * + * Manage buffers. Note that a buffer holds a reference to its associated + * cluster, and its cluster will hold a reference to the cluster's volume. * - * If buf_type is 0 the buffer already exists in-memory or on-disk. - * Otherwise a new buffer is initialized with the specified buffer type. + * A non-zero buf_type indicates that a new buffer should be created and + * zero'd. */ -struct hammer_buffer * -hammer_get_buffer(struct hammer_cluster *cluster, int32_t buf_no, - int64_t buf_type, int *errorp) +hammer_buffer_t +hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no, + u_int64_t buf_type, int *errorp) { - hammer_fsbuf_ondisk_t ondisk; - struct hammer_buffer *buffer; + hammer_buffer_t buffer; /* * Find the buffer. Note that buffer 0 corresponds to the cluster @@ -678,34 +869,59 @@ again: buffer->io.offset = cluster->io.offset + (buf_no * HAMMER_BUFSIZE); buffer->io.type = HAMMER_STRUCTURE_BUFFER; - hammer_lock(&buffer->io.lock); + TAILQ_INIT(&buffer->clist); + hammer_ref(&buffer->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. 
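The cluster and buffer routines above all follow one lifecycle: look the structure up in the parent's RB tree (creating it and retrying if another thread wins the RB_INSERT race), take a reference, lazily instantiate the on-disk image, and on the final release re-check the reference count before tearing the structure down. What follows is a minimal, single-threaded user-space sketch of that idiom; a linked list stands in for the RB tree, plain integers stand in for hammer_lock, and every name here is illustrative rather than part of the HAMMER API.

/*
 * Single-threaded sketch of the get/ref/rel lifecycle used by
 * hammer_get_cluster()/hammer_rel_cluster().  Illustrative names only.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        struct obj      *next;          /* stands in for the RB tree */
        int             key;
        int             refs;           /* hammer_ref()/hammer_unref() */
        void            *ondisk;        /* lazily loaded on-disk image */
};

static struct obj *obj_list;

static struct obj *
obj_get(int key, int *errorp)
{
        struct obj *obj;

        for (obj = obj_list; obj; obj = obj->next) {    /* lookup */
                if (obj->key == key)
                        break;
        }
        if (obj == NULL) {                              /* create */
                obj = calloc(1, sizeof(*obj));
                obj->key = key;
                obj->next = obj_list;
                obj_list = obj;
                /* the kernel version retries here if RB_INSERT collides */
        }
        ++obj->refs;                                    /* hammer_ref() */
        if (obj->ondisk == NULL) {                      /* lazy load */
                obj->ondisk = malloc(64);               /* "hammer_io_read()" */
                if (obj->ondisk == NULL) {
                        --obj->refs;                    /* release on error */
                        *errorp = ENOMEM;
                        return (NULL);
                }
        }
        *errorp = 0;
        return (obj);
}

static void
obj_rel(struct obj *obj)
{
        if (--obj->refs == 0) {
                /*
                 * Last reference: drop the backing store.  The kernel
                 * version re-checks under an exclusive lock before this.
                 */
                free(obj->ondisk);
                obj->ondisk = NULL;
        }
}

int
main(void)
{
        int error;
        struct obj *obj;

        if ((obj = obj_get(42, &error)) != NULL) {
                printf("key %d refs %d\n", obj->key, obj->refs);
                obj_rel(obj);
        }
        return (0);
}

The property this buys, and the one the release paths above are careful about, is that the in-memory structure can outlive its buffer cache backing; it is only destroyed once both the references and the backing buffer are gone.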
*/ if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) { - hammer_unlock(&buffer->io.lock); + hammer_unref(&buffer->io.lock); kfree(buffer, M_HAMMER); goto again; } hammer_ref(&cluster->io.lock); } else { - hammer_lock(&buffer->io.lock); + hammer_ref(&buffer->io.lock); } - *errorp = 0; + /* + * Deal with on-disk info + */ + if (buffer->ondisk == NULL || buf_type) { + *errorp = hammer_load_buffer(buffer, buf_type); + if (*errorp) { + hammer_rel_buffer(buffer, 1); + buffer = NULL; + } + } else { + *errorp = 0; + } + return(buffer); +} + +static int +hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) +{ + hammer_volume_t volume; + hammer_fsbuf_ondisk_t ondisk; + int error; + + /* + * Load the buffer's on-disk info + */ + volume = buffer->volume; + hammer_lock_ex(&buffer->io.lock); if (buffer->ondisk == NULL) { if (buf_type) { - *errorp = hammer_io_new(buffer->volume->devvp, - &buffer->io); + error = hammer_io_new(volume->devvp, &buffer->io); } else { - *errorp = hammer_io_read(buffer->volume->devvp, - &buffer->io); + error = hammer_io_read(volume->devvp, &buffer->io); } - if (*errorp) { - hammer_put_buffer(buffer, 1); - return(NULL); + if (error) { + hammer_unlock(&buffer->io.lock); + return (error); } buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data; buffer->alist.config = &Buf_alist_config; @@ -714,65 +930,389 @@ again: if (buf_type) { initbuffer(&buffer->alist, &ondisk->head, buf_type); } + buffer->buf_type = ondisk->head.buf_type; } else if (buf_type) { - *errorp = hammer_io_new(buffer->volume->devvp, - &buffer->io); - if (*errorp) { - hammer_put_buffer(buffer, 1); - buffer = NULL; + error = hammer_io_new(volume->devvp, &buffer->io); + } else { + error = 0; + } + hammer_unlock(&buffer->io.lock); + return (error); +} + +/* + * Reference a buffer that is either already referenced or via a specially + * handled pointer (aka cursor->buffer). + */ +int +hammer_ref_buffer(hammer_buffer_t buffer) +{ + int error; + + hammer_ref(&buffer->io.lock); + if (buffer->ondisk == NULL) { + error = hammer_load_buffer(buffer, 0); + if (error) { + hammer_rel_buffer(buffer, 1); + /* + * NOTE: buffer pointer can become stale after + * the above release. + */ + } else { + KKASSERT(buffer->buf_type == + buffer->ondisk->head.buf_type); } + } else { + error = 0; } - return (buffer); + return(error); } +/* + * Release a buffer. We have to deal with several places where + * another thread can ref the buffer. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. buffer->ondisk determines whether the + * embedded io is referenced or not. 
+ */ void -hammer_put_buffer(struct hammer_buffer *buffer, int flush) +hammer_rel_buffer(hammer_buffer_t buffer, int flush) { - struct hammer_cluster *cluster; + hammer_cluster_t cluster; + hammer_node_t node; if (hammer_islastref(&buffer->io.lock)) { - hammer_io_release(&buffer->io, flush); - if (buffer->io.bp == NULL && - hammer_islastref(&buffer->io.lock)) { - RB_REMOVE(hammer_buf_rb_tree, - &buffer->cluster->rb_bufs_root, buffer); - cluster = buffer->cluster; - kfree(buffer, M_HAMMER); - if (hammer_islastref(&cluster->io.lock)) { - hammer_ref_to_lock(&cluster->io.lock); - hammer_put_cluster(cluster, 0); + hammer_lock_ex(&buffer->io.lock); + if (hammer_islastref(&buffer->io.lock)) { + buffer->ondisk = NULL; + hammer_io_release(&buffer->io, flush); + + /* + * Clean out the B-Tree node cache, if any, then + * clean up the cluster ref and free the buffer. + * + * If the buffer acquires a new reference while we + * are trying to clean it out, abort the cleaning. + */ + while (buffer->io.bp == NULL && + hammer_islastref(&buffer->io.lock) && + (node = TAILQ_FIRST(&buffer->clist)) != NULL + ) { + KKASSERT(node->lock.refs == 0); + hammer_flush_node(node); + } + if (buffer->io.bp == NULL && + hammer_islastref(&buffer->io.lock)) { + cluster = buffer->cluster; + RB_REMOVE(hammer_buf_rb_tree, + &cluster->rb_bufs_root, buffer); + buffer->cluster = NULL; /* sanity */ + kfree(buffer, M_HAMMER); + hammer_rel_cluster(cluster, 0); + return; } } - } else { hammer_unlock(&buffer->io.lock); } + hammer_unref(&buffer->io.lock); } +/* + * Flush passively cached B-Tree nodes associated with this buffer. + * + * NOTE: The buffer is referenced and locked. + */ void -hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer) +hammer_flush_buffer_nodes(hammer_buffer_t buffer) { - if (buffer != *bufferp) { - if (buffer) - hammer_lock(&buffer->io.lock); - if (*bufferp) - hammer_put_buffer(*bufferp, 0); - *bufferp = buffer; + hammer_node_t node; + + node = TAILQ_FIRST(&buffer->clist); + while (node) { + buffer->save_scan = TAILQ_NEXT(node, entry); + if (node->lock.refs == 0) + hammer_flush_node(node); + node = buffer->save_scan; + } +} + +/************************************************************************ + * NODES * + ************************************************************************ + * + * Manage B-Tree nodes. B-Tree nodes represent the primary indexing + * method used by the HAMMER filesystem. + * + * Unlike other HAMMER structures, a hammer_node can be PASSIVELY + * associated with its buffer. It can have an active buffer reference + * even when the node itself has no references. The node also passively + * associates itself with its cluster without holding any cluster refs. + * The cluster ref is indirectly maintained by the active buffer ref when + * a node is acquired. + * + * A hammer_node can also be passively associated with other HAMMER + * structures, such as inodes, while retaining 0 references. These + * associations can be cleared backwards using a pointer-to-pointer in + * the hammer_node. + * + * This allows the HAMMER implementation to cache hammer_node's long-term + * and short-cut a great deal of the infrastructure's complexity. In + * most cases a cached node can be reacquired without having to dip into + * either the buffer or cluster management code. + * + * The caller must pass a referenced cluster on call and will retain + * ownership of the reference on return. The node will acquire its own + * additional references, if necessary. 
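The comment block above describes the passive cache used for B-Tree nodes: another structure (an inode, for instance) can hold a bare pointer to a node without holding a reference, and the node keeps a pointer back to that pointer so the association can be severed from either side. A reduced user-space sketch of the idea follows; the real hammer_node carries two such back-pointers (cache1/cache2), only one slot is modeled here, and the names are illustrative only.

/*
 * Minimal sketch of the pointer-to-pointer passive cache described above.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
        struct node     **cache;        /* back-pointer to whoever caches us */
        int               refs;
};

struct inode {
        struct node      *cache;        /* passive pointer, no node ref held */
};

static void
node_cache(struct node *node, struct node **cache)
{
        if (node->cache && node->cache != cache)
                *node->cache = NULL;    /* steal the slot from the old owner */
        node->cache = cache;
        *cache = node;                  /* caller can re-reference via this */
}

static void
node_flush(struct node *node)
{
        /* Backing buffer went away: invalidate every passive pointer. */
        if (node->cache) {
                *node->cache = NULL;
                node->cache = NULL;
        }
        /* with refs == 0 and no cache left the node could be freed here */
}

int
main(void)
{
        struct node n = { NULL, 0 };
        struct inode ip = { NULL };

        node_cache(&n, &ip.cache);
        printf("cached: %p\n", (void *)ip.cache);
        node_flush(&n);
        printf("after flush: %p\n", (void *)ip.cache);  /* NULL */
        return (0);
}

The payoff is the one described in the patch header: an inode can remember its place in the B-Tree without pinning a node or its buffer, and if the kernel reclaims the buffer the cached pointer is simply cleared and the next lookup falls back to a full descent.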
+ */ +hammer_node_t +hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp) +{ + hammer_node_t node; + + /* + * Locate the structure, allocating one if necessary. + */ +again: + node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root, + node_offset); + if (node == NULL) { + node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO); + node->node_offset = node_offset; + node->cluster = cluster; + if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root, + node)) { + kfree(node, M_HAMMER); + goto again; + } + } + *errorp = hammer_ref_node(node); + if (*errorp) { + /* + * NOTE: The node pointer may be stale on error return. + * In fact, its probably been destroyed. + */ + node = NULL; + } + return(node); +} + +/* + * Reference the node to prevent disassociations, then associate and + * load the related buffer. This routine can also be called to reference + * a node from a cache pointer. + * + * NOTE: Because the caller does not have a ref on the node, the caller's + * node pointer will be stale if an error is returned. We may also wind + * up clearing the related cache pointers. + * + * NOTE: The cluster is indirectly referenced by our buffer ref. + */ +int +hammer_ref_node(hammer_node_t node) +{ + hammer_buffer_t buffer; + int32_t buf_no; + int error; + + hammer_ref(&node->lock); + if (node->ondisk == NULL) { + hammer_lock_ex(&node->lock); + if (node->ondisk == NULL) { + /* + * This is a little confusing but the jist is that + * node->buffer determines whether the node is on + * the buffer's clist and node->ondisk determines + * whether the buffer is referenced. + */ + if ((buffer = node->buffer) != NULL) { + error = hammer_ref_buffer(buffer); + } else { + buf_no = node->node_offset / HAMMER_BUFSIZE; + buffer = hammer_get_buffer(node->cluster, + buf_no, 0, &error); + if (buffer) { + KKASSERT(error == 0); + TAILQ_INSERT_TAIL(&buffer->clist, + node, entry); + node->buffer = buffer; + } + } + if (error == 0) { + node->ondisk = (void *)((char *)buffer->ondisk + + (node->node_offset & HAMMER_BUFMASK)); + } + } + hammer_unlock(&node->lock); } + if (error) + hammer_rel_node(node); + return (error); } +/* + * Release a hammer_node. The node retains a passive association with + * its cluster, buffer and caches. + * + * However, to avoid cluttering up kernel memory with tons of B-Tree + * node cache structures we destroy the node if no passive cache or + * (instantiated) buffer references exist. + */ void -hammer_dup_cluster(struct hammer_cluster **clusterp, - struct hammer_cluster *cluster) -{ - if (cluster != *clusterp) { - if (cluster) - hammer_lock(&cluster->io.lock); - if (*clusterp) - hammer_put_cluster(*clusterp, 0); - *clusterp = cluster; +hammer_rel_node(hammer_node_t node) +{ + hammer_cluster_t cluster; + hammer_buffer_t buffer; + + if (hammer_islastref(&node->lock)) { + cluster = node->cluster; + /* + * Clutter control, this case only occurs after a failed + * load since otherwise ondisk will be non-NULL. + */ + if (node->cache1 == NULL && node->cache2 == NULL && + node->ondisk == NULL) { + RB_REMOVE(hammer_nod_rb_tree, &cluster->rb_nods_root, + node); + if ((buffer = node->buffer) != NULL) { + node->buffer = NULL; + hammer_remove_node_clist(buffer, node); + } + kfree(node, M_HAMMER); + return; + } + + /* + * node->ondisk determines whether we have a buffer reference + * to get rid of or not. Only get rid of the reference if + * the kernel tried to flush the buffer. + * + * NOTE: Once unref'd the node can be physically destroyed, + * so our node is stale afterwords. 
+ * + * This case occurs if the node still has cache references. + * We could remove the references and free the structure + * but for now we allow them (and the node structure) to + * remain intact. + */ + if (node->ondisk && hammer_io_checkflush(&node->buffer->io)) { + buffer = node->buffer; + node->buffer = NULL; + node->ondisk = NULL; + hammer_remove_node_clist(buffer, node); + hammer_unref(&node->lock); + hammer_rel_buffer(buffer, 0); + } else { + hammer_unref(&node->lock); + } + } else { + hammer_unref(&node->lock); } } +/* + * Cache-and-release a hammer_node. Kinda like catching and releasing a + * fish, but keeping an eye on him. The node is passively cached in *cache. + * + * NOTE! HAMMER may NULL *cache at any time, even after you have + * referenced the node! + */ +void +hammer_cache_node(hammer_node_t node, struct hammer_node **cache) +{ + if (node->cache1 != cache) { + if (node->cache2 == cache) { + struct hammer_node **tmp; + tmp = node->cache1; + node->cache1 = node->cache2; + node->cache2 = tmp; + } else { + if (node->cache2) + *node->cache2 = NULL; + node->cache2 = node->cache1; + node->cache1 = cache; + *cache = node; + } + } + hammer_rel_node(node); +} + +void +hammer_uncache_node(struct hammer_node **cache) +{ + hammer_node_t node; + + if ((node = *cache) != NULL) { + *cache = NULL; + if (node->cache1 == cache) { + node->cache1 = node->cache2; + node->cache2 = NULL; + } else if (node->cache2 == cache) { + node->cache2 = NULL; + } else { + panic("hammer_uncache_node: missing cache linkage"); + } + if (node->cache1 == NULL && node->cache2 == NULL && + node->lock.refs == 0) { + hammer_flush_node(node); + } + } +} + +/* + * Remove a node's cache references and destroy the node if it has no + * references. This is typically called from the buffer handling code. + * + * The node may have an active buffer reference (ondisk != NULL) even + * if the node itself has no references. + * + * Note that a caller iterating through nodes via a buffer must have its + * own reference on the buffer or our hammer_rel_buffer() call below may + * rip it out from under the caller. + */ +void +hammer_flush_node(hammer_node_t node) +{ + hammer_buffer_t buffer; + + if (node->cache1) + *node->cache1 = NULL; + if (node->cache2) + *node->cache2 = NULL; + if (node->lock.refs == 0) { + RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root, + node); + if ((buffer = node->buffer) != NULL) { + node->buffer = NULL; + hammer_remove_node_clist(buffer, node); + if (node->ondisk) { + node->ondisk = NULL; + hammer_rel_buffer(buffer, 0); + } + } + kfree(node, M_HAMMER); + } +} + +/* + * Remove a node from the buffer's clist. Adjust save_scan as appropriate. + * This is in its own little routine to properly handle interactions with + * save_scan, so it is possible to block while scanning a buffer's node list. + */ +static +void +hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node) +{ + if (buffer->save_scan == node) + buffer->save_scan = TAILQ_NEXT(node, entry); + TAILQ_REMOVE(&buffer->clist, node, entry); +} + +/************************************************************************ + * A-LIST ALLOCATORS * + ************************************************************************/ + /* * Allocate HAMMER elements - btree nodes, data storage, and record elements * @@ -786,19 +1326,21 @@ hammer_dup_cluster(struct hammer_cluster **clusterp, * fails a new buffer is allocated and associated with the buffer type * A-list and the element is allocated out of the new buffer. 
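The allocators below share the fallback sequence spelled out in the comment above: try the cluster's type-specific a-list first (forward from a rotating hint), and if the existing buffers of that type are full, format a fresh buffer of the right type and allocate out of it, returning ENOSPC only when that also fails. A compact user-space model of that sequence is sketched here, with a flat bitmap standing in for the a-list; none of these names are the real API.

/*
 * Sketch of the allocate-with-fallback pattern used by hammer_alloc_btree()
 * and friends.  Illustrative names only.
 */
#include <errno.h>
#include <stdio.h>

#define BLKS_PER_BUF    4
#define MAX_BLKS        16

static unsigned char map[MAX_BLKS];     /* 1 = allocated, 0 = free */
static int           nblks;             /* blocks made available so far */

static int
alist_alloc(int atblk)
{
        int i;

        for (i = atblk; i < nblks; ++i) {
                if (map[i] == 0) {
                        map[i] = 1;
                        return (i);
                }
        }
        return (-1);                    /* HAMMER_ALIST_BLOCK_NONE */
}

static int
alloc_element(int hint, int *errorp)
{
        int elm_no;

        elm_no = alist_alloc(hint);             /* forward from the hint */
        if (elm_no < 0 && nblks + BLKS_PER_BUF <= MAX_BLKS) {
                nblks += BLKS_PER_BUF;          /* "alloc_new_buffer()" */
                elm_no = alist_alloc(hint);
        }
        if (elm_no < 0) {
                *errorp = ENOSPC;
                return (-1);
        }
        *errorp = 0;
        return (elm_no);
}

int
main(void)
{
        int error, i, elm;

        for (i = 0; i < 6; ++i) {
                elm = alloc_element(0, &error);
                printf("alloc -> %d (error %d)\n", elm, error);
        }
        return (0);
}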
*/ -void * -hammer_alloc_btree(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp) + +hammer_node_t +hammer_alloc_btree(hammer_cluster_t cluster, int *errorp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; + hammer_node_t node; int32_t elm_no; int32_t buf_no; - void *item; + int32_t node_offset; /* * Allocate a B-Tree element */ + buffer = NULL; live = &cluster->alist_btree; elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index); if (elm_no == HAMMER_ALIST_BLOCK_NONE) @@ -806,10 +1348,12 @@ hammer_alloc_btree(struct hammer_cluster *cluster, if (elm_no == HAMMER_ALIST_BLOCK_NONE) { alloc_new_buffer(cluster, live, HAMMER_FSBUF_BTREE, HAMMER_BTREE_NODES, - cluster->ondisk->idx_index, errorp, bufferp); + cluster->ondisk->idx_index, errorp, &buffer); elm_no = hammer_alist_alloc(live, 1); if (elm_no == HAMMER_ALIST_BLOCK_NONE) { *errorp = ENOSPC; + if (buffer) + hammer_rel_buffer(buffer, 0); return(NULL); } } @@ -819,26 +1363,27 @@ hammer_alloc_btree(struct hammer_cluster *cluster, * Load and return the B-Tree element */ buf_no = elm_no / HAMMER_FSBUF_MAXBLKS; - buffer = *bufferp; - if (buffer == NULL || buffer->cluster != cluster || - buffer->buf_no != buf_no) { - if (buffer) - hammer_put_buffer(buffer, 0); - buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); - *bufferp = buffer; + node_offset = buf_no * HAMMER_BUFSIZE + + offsetof(union hammer_fsbuf_ondisk, + btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]); + node = hammer_get_node(cluster, node_offset, errorp); + if (node) { + bzero(node->ondisk, sizeof(*node->ondisk)); + } else { + hammer_alist_free(live, elm_no, 1); + hammer_rel_node(node); + node = NULL; } - KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_BTREE); - item = &buffer->ondisk->btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]; - bzero(item, sizeof(union hammer_btree_node)); - *errorp = 0; - return(item); + if (buffer) + hammer_rel_buffer(buffer, 0); + return(node); } void * -hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, - int *errorp, struct hammer_buffer **bufferp) +hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes, + int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; int32_t elm_no; int32_t buf_no; @@ -873,7 +1418,7 @@ hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; } @@ -885,10 +1430,10 @@ hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, } void * -hammer_alloc_record(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp) +hammer_alloc_record(hammer_cluster_t cluster, + int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; int32_t elm_no; int32_t buf_no; @@ -921,7 +1466,7 @@ hammer_alloc_record(struct hammer_cluster *cluster, if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; } @@ -937,12 +1482,12 @@ hammer_alloc_record(struct hammer_cluster *cluster, * or a cluster-relative byte offset. 
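Note that hammer_alloc_btree() above no longer hands back a raw pointer: it converts the a-list element number into a cluster-relative node_offset and lets hammer_get_node() instantiate the node. The arithmetic is just a buffer number plus the element's offset inside the typed buffer, as the small example below shows. The constants and structure layout here are placeholders picked for the example, not the real HAMMER values.

#include <stdio.h>
#include <stddef.h>

#define BUFSIZE         16384   /* assumed filesystem buffer size */
#define FSBUF_MAXBLKS   64      /* assumed a-list blocks per buffer */
#define FSBUF_BLKMASK   (FSBUF_MAXBLKS - 1)

struct node_ondisk {            /* stand-in for the on-disk B-Tree node */
        char    pad[64];
};

struct fsbuf {                  /* stand-in for union hammer_fsbuf_ondisk */
        char                    head[128];
        struct node_ondisk      nodes[FSBUF_MAXBLKS];
};

int
main(void)
{
        int     elm_no = 300;   /* cluster-wide a-list element number */
        int     buf_no = elm_no / FSBUF_MAXBLKS;
        long    node_offset;

        /*
         * Cluster-relative byte offset of the node.  The kernel code
         * computes this with offsetof(..., btree.nodes[elm_no & MASK]).
         */
        node_offset = (long)buf_no * BUFSIZE +
            offsetof(struct fsbuf, nodes) +
            (long)(elm_no & FSBUF_BLKMASK) * sizeof(struct node_ondisk);

        printf("elm %d -> buffer %d, offset %ld\n", elm_no, buf_no, node_offset);
        return (0);
}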
*/ void -hammer_free_btree_ptr(struct hammer_buffer *buffer, hammer_btree_node_t node) +hammer_free_btree_ptr(hammer_buffer_t buffer, hammer_node_ondisk_t node) { int32_t elm_no; hammer_alist_t live; - elm_no = node - buffer->ondisk->btree.nodes; + elm_no = node - &buffer->ondisk->btree.nodes[0]; KKASSERT(elm_no >= 0 && elm_no < HAMMER_BTREE_NODES); elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS; live = &buffer->cluster->alist_btree; @@ -950,7 +1495,7 @@ hammer_free_btree_ptr(struct hammer_buffer *buffer, hammer_btree_node_t node) } void -hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes) +hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes) { int32_t elm_no; int32_t nblks; @@ -966,8 +1511,7 @@ hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes) } void -hammer_free_record_ptr(struct hammer_buffer *buffer, - union hammer_record_ondisk *rec) +hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec) { int32_t elm_no; hammer_alist_t live; @@ -980,9 +1524,9 @@ hammer_free_record_ptr(struct hammer_buffer *buffer, } void -hammer_free_btree(struct hammer_cluster *cluster, int32_t bclu_offset) +hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset) { - const int32_t blksize = sizeof(union hammer_btree_node); + const int32_t blksize = sizeof(struct hammer_node_ondisk); int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; hammer_alist_t live; int32_t elm_no; @@ -996,8 +1540,7 @@ hammer_free_btree(struct hammer_cluster *cluster, int32_t bclu_offset) } void -hammer_free_data(struct hammer_cluster *cluster, int32_t bclu_offset, - int32_t bytes) +hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes) { const int32_t blksize = HAMMER_DATA_BLKSIZE; int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; @@ -1015,7 +1558,7 @@ hammer_free_data(struct hammer_cluster *cluster, int32_t bclu_offset, } void -hammer_free_record(struct hammer_cluster *cluster, int32_t bclu_offset) +hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset) { const int32_t blksize = sizeof(union hammer_record_ondisk); int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; @@ -1037,11 +1580,11 @@ hammer_free_record(struct hammer_cluster *cluster, int32_t bclu_offset) * type-specific A-list and initialized. 
*/ static void -alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, +alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live, u_int64_t type, int32_t nelements, int start, int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; int32_t buf_no; start = start / HAMMER_FSBUF_MAXBLKS; /* convert to buf_no */ @@ -1070,7 +1613,7 @@ alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, */ buffer = hammer_get_buffer(cluster, buf_no, type, errorp); if (*bufferp) - hammer_put_buffer(*bufferp, 0); + hammer_rel_buffer(*bufferp, 0); *bufferp = buffer; } @@ -1086,17 +1629,17 @@ alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, void flush_all_volumes(void) { - struct hammer_volume *vol; + hammer_volume_t vol; for (vol = VolBase; vol; vol = vol->next) flush_volume(vol); } void -flush_volume(struct hammer_volume *vol) +flush_volume(hammer_volume_t vol) { - struct hammer_supercl *supercl; - struct hammer_cluster *cl; + hammer_supercl_t supercl; + hammer_cluster_t cl; for (supercl = vol->supercl_base; supercl; supercl = supercl->next) flush_supercl(supercl); @@ -1106,7 +1649,7 @@ flush_volume(struct hammer_volume *vol) } void -flush_supercl(struct hammer_supercl *supercl) +flush_supercl(hammer_supercl_t supercl) { int64_t supercl_offset; @@ -1115,9 +1658,9 @@ flush_supercl(struct hammer_supercl *supercl) } void -flush_cluster(struct hammer_cluster *cl) +flush_cluster(hammer_cluster_t cl) { - struct hammer_buffer *buf; + hammer_buffer_t buf; int64_t cluster_offset; for (buf = cl->buffer_base; buf; buf = buf->next) @@ -1127,7 +1670,7 @@ flush_cluster(struct hammer_cluster *cl) } void -flush_buffer(struct hammer_buffer *buf) +flush_buffer(hammer_buffer_t buf) { int64_t buffer_offset; @@ -1154,7 +1697,7 @@ initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type) * is the number of clusters a supercluster can manage. */ static int64_t -calculate_cluster_offset(struct hammer_volume *volume, int32_t clu_no) +calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no) { int32_t scl_group; int64_t scl_group_size; @@ -1188,7 +1731,7 @@ calculate_cluster_offset(struct hammer_volume *volume, int32_t clu_no) * Calculate a super-cluster's offset in the volume. 
*/ static int64_t -calculate_supercl_offset(struct hammer_volume *volume, int32_t scl_no) +calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no) { int64_t off; int32_t scl_group; @@ -1228,8 +1771,8 @@ calculate_supercl_offset(struct hammer_volume *volume, int32_t scl_no) static int buffer_alist_init(void *info, int32_t blk, int32_t radix) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int error = 0; @@ -1241,7 +1784,7 @@ buffer_alist_init(void *info, int32_t blk, int32_t radix) buf_no = blk / HAMMER_FSBUF_MAXBLKS; buffer = hammer_get_buffer(cluster, buf_no, 0, &error); if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); return (error); } @@ -1255,8 +1798,8 @@ static int buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int32_t r; int error = 0; @@ -1270,7 +1813,7 @@ buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; } @@ -1281,8 +1824,8 @@ static int buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int32_t r; int error = 0; @@ -1296,7 +1839,7 @@ buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1308,8 +1851,8 @@ static void buffer_alist_free(void *info, int32_t blk, int32_t radix, int32_t base_blk, int32_t count, int32_t *emptyp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int error = 0; @@ -1319,7 +1862,7 @@ buffer_alist_free(void *info, int32_t blk, int32_t radix, KKASSERT(buffer->ondisk->head.buf_type != 0); hammer_alist_free(&buffer->alist, base_blk, count); *emptyp = hammer_alist_isempty(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { *emptyp = 0; } @@ -1341,8 +1884,8 @@ buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab) static int super_alist_init(void *info, int32_t blk, int32_t radix) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int error = 0; @@ -1353,7 +1896,7 @@ super_alist_init(void *info, int32_t blk, int32_t radix) scl_no = blk / HAMMER_SCL_MAXCLUSTERS; supercl = hammer_get_supercl(volume, scl_no, &error, 1); if (supercl) - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); return (error); } @@ -1367,8 +1910,8 @@ static int super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int32_t r; int error = 0; @@ -1380,7 +1923,7 
@@ super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1392,8 +1935,8 @@ static int super_alist_alloc_rev(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int32_t r; int error = 0; @@ -1405,7 +1948,7 @@ super_alist_alloc_rev(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1417,8 +1960,8 @@ static void super_alist_free(void *info, int32_t blk, int32_t radix, int32_t base_blk, int32_t count, int32_t *emptyp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int error = 0; @@ -1427,7 +1970,7 @@ super_alist_free(void *info, int32_t blk, int32_t radix, if (supercl) { hammer_alist_free(&supercl->alist, base_blk, count); *emptyp = hammer_alist_isempty(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { *emptyp = 0; } diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 3b3502d71e..b9f8e812a1 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.2 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.3 2007/11/19 00:53:40 dillon Exp $ */ /* * HAMMER structural locking @@ -40,12 +40,12 @@ #include "hammer.h" void -hammer_lock(struct hammer_lock *lock) +hammer_lock_ex(struct hammer_lock *lock) { thread_t td = curthread; + KKASSERT(lock->refs > 0); crit_enter(); - ++lock->refs; if (lock->locktd != td) { while (lock->locktd != NULL) { lock->wanted = 1; @@ -53,55 +53,101 @@ hammer_lock(struct hammer_lock *lock) } lock->locktd = td; } + ++lock->lockcount; crit_exit(); } +/* + * Try to obtain an exclusive lock + */ +int +hammer_lock_ex_try(struct hammer_lock *lock) +{ + thread_t td = curthread; + + KKASSERT(lock->refs > 0); + crit_enter(); + if (lock->locktd != td) { + if (lock->locktd != NULL) + return(EAGAIN); + lock->locktd = td; + } + ++lock->lockcount; + crit_exit(); + return(0); +} + + void -hammer_unlock(struct hammer_lock *lock) +hammer_lock_sh(struct hammer_lock *lock) { - KKASSERT(lock->locktd == curthread); KKASSERT(lock->refs > 0); crit_enter(); - if (--lock->refs == 0) { - lock->locktd = NULL; - if (lock->wanted) { - lock->wanted = 0; - wakeup(lock); + while (lock->locktd != NULL) { + if (lock->locktd == curthread) { + ++lock->lockcount; + crit_exit(); + return; } + lock->wanted = 1; + tsleep(lock, 0, "hmrlck", 0); } + KKASSERT(lock->lockcount <= 0); + --lock->lockcount; crit_exit(); } void -hammer_ref(struct hammer_lock *lock) +hammer_downgrade(struct hammer_lock *lock) { + KKASSERT(lock->lockcount == 1); crit_enter(); - ++lock->refs; + lock->lockcount = -1; + lock->locktd = NULL; + if (lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } crit_exit(); + /* XXX memory barrier */ } void -hammer_unref(struct 
hammer_lock *lock) +hammer_unlock(struct hammer_lock *lock) { crit_enter(); - --lock->refs; + KKASSERT(lock->lockcount != 0); + if (lock->lockcount < 0) { + if (++lock->lockcount == 0 && lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } + } else { + KKASSERT(lock->locktd == curthread); + if (--lock->lockcount == 0) { + lock->locktd = NULL; + if (lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } + } + + } crit_exit(); } void -hammer_lock_to_ref(struct hammer_lock *lock) +hammer_ref(struct hammer_lock *lock) { crit_enter(); ++lock->refs; - hammer_unlock(lock); crit_exit(); } void -hammer_ref_to_lock(struct hammer_lock *lock) +hammer_unref(struct hammer_lock *lock) { crit_enter(); - hammer_lock(lock); --lock->refs; crit_exit(); } @@ -113,12 +159,29 @@ hammer_to_unix_xid(uuid_t *uuid) } void -hammer_to_timespec(u_int64_t hammerts, struct timespec *ts) +hammer_guid_to_uuid(uuid_t *uuid, u_int32_t guid) +{ + bzero(uuid, sizeof(*uuid)); + *(u_int32_t *)&uuid->node[2] = guid; +} + +void +hammer_to_timespec(hammer_tid_t tid, struct timespec *ts) { - ts->tv_sec = hammerts / 1000000000; - ts->tv_nsec = hammerts % 1000000000; + ts->tv_sec = tid / 1000000000; + ts->tv_nsec = tid % 1000000000; } +hammer_tid_t +hammer_timespec_to_transid(struct timespec *ts) +{ + hammer_tid_t tid; + + tid = ts->tv_nsec + (unsigned long)ts->tv_sec * 1000000000LL; + return(tid); +} + + /* * Convert a HAMMER filesystem object type to a vnode type */ diff --git a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c index 35d5fede48..3879b0baef 100644 --- a/sys/vfs/hammer/hammer_transaction.c +++ b/sys/vfs/hammer/hammer_transaction.c @@ -31,14 +31,14 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" void -hammer_start_transaction(struct hammer_mount *hmp, - struct hammer_transaction *trans) +hammer_start_transaction(struct hammer_transaction *trans, + struct hammer_mount *hmp) { struct timespec ts; diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index 9ac2cff487..41f2e9fcbb 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ #include @@ -118,7 +118,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, if (error == 0) error = copyinstr(upath, path, MAXPATHLEN, NULL); if (error == 0) - error = hammer_load_volume(hmp, path); + error = hammer_install_volume(hmp, path); if (error) break; } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 077605efce..21a048933e 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
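The reworked locking in hammer_subs.c above separates reference counting from locking and encodes the lock state in a single lockcount field: positive values are the exclusive recursion depth, negative values count shared holders, and zero means unlocked. The toy below only models that bookkeeping (the blocking and wakeup side is omitted); the field and function names are illustrative, not the kernel API.

#include <assert.h>
#include <stdio.h>

struct hlock {
        int     refs;           /* pins the structure in memory */
        int     lockcount;      /* >0 exclusive depth, <0 shared holders */
};

static void
lock_ex(struct hlock *l)
{
        assert(l->lockcount >= 0);      /* real code blocks on other holders */
        ++l->lockcount;
}

static void
lock_sh(struct hlock *l)
{
        assert(l->lockcount <= 0);      /* real code blocks on an exclusive */
        --l->lockcount;
}

static void
unlock(struct hlock *l)
{
        assert(l->lockcount != 0);
        if (l->lockcount < 0)
                ++l->lockcount;         /* drop one shared holder */
        else
                --l->lockcount;         /* unwind one exclusive level */
        /* lockcount reaching 0 is where the real code does wakeup() */
}

int
main(void)
{
        struct hlock l = { 1, 0 };      /* one ref, unlocked */

        lock_ex(&l); lock_ex(&l);       /* recursive exclusive: 2 */
        unlock(&l);  unlock(&l);        /* back to 0 */
        lock_sh(&l); lock_sh(&l);       /* two shared holders: -2 */
        unlock(&l);  unlock(&l);        /* back to 0 */
        printf("final lockcount=%d refs=%d\n", l.lockcount, l.refs);
        return (0);
}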
* - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.2 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.3 2007/11/19 00:53:40 dillon Exp $ */ #include @@ -106,6 +106,11 @@ struct vop_ops hammer_vnode_vops = { .vop_nwhiteout = hammer_vop_nwhiteout }; +static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp, + struct ucred *cred, int flags); +static int hammer_vop_strategy_read(struct vop_strategy_args *ap); +static int hammer_vop_strategy_write(struct vop_strategy_args *ap); + #if 0 static int @@ -139,13 +144,15 @@ hammer_vop_read(struct vop_read_args *ap) struct uio *uio; int error; int n; + int seqcount; if (ap->a_vp->v_type != VREG) return (EINVAL); ip = VTOI(ap->a_vp); error = 0; + seqcount = ap->a_ioflag >> 16; - hammer_start_transaction(ip->hmp, &trans); + hammer_start_transaction(&trans, ip->hmp); /* * Access the data in HAMMER_BUFSIZE blocks via the buffer cache. @@ -153,12 +160,14 @@ hammer_vop_read(struct vop_read_args *ap) uio = ap->a_uio; while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) { offset = uio->uio_offset & HAMMER_BUFMASK; - error = bread(ap->a_vp, uio->uio_offset - offset, - HAMMER_BUFSIZE, &bp); + error = cluster_read(ap->a_vp, ip->ino_rec.ino_size, + uio->uio_offset - offset, HAMMER_BUFSIZE, + MAXBSIZE, seqcount, &bp); if (error) { brelse(bp); break; } + bp->b_flags |= B_CLUSTEROK; n = HAMMER_BUFSIZE - offset; if (n > uio->uio_resid) n = uio->uio_resid; @@ -166,7 +175,7 @@ hammer_vop_read(struct vop_read_args *ap) n = (int)(ip->ino_rec.ino_size - uio->uio_offset); error = uiomove((char *)bp->b_data + offset, n, uio); if (error) { - brelse(bp); + bqrelse(bp); break; } ip->ino_rec.ino_atime = trans.tid; @@ -200,7 +209,7 @@ hammer_vop_write(struct vop_write_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(ip->hmp, &trans); + hammer_start_transaction(&trans, ip->hmp); uio = ap->a_uio; /* @@ -243,6 +252,7 @@ hammer_vop_write(struct vop_write_args *ap) brelse(bp); break; } + bp->b_flags |= B_CLUSTEROK; if (ip->ino_rec.ino_size < uio->uio_offset) { ip->ino_rec.ino_size = uio->uio_offset; ip->ino_rec.ino_mtime = trans.tid; @@ -252,10 +262,8 @@ hammer_vop_write(struct vop_write_args *ap) if (ap->a_ioflag & IO_SYNC) { bwrite(bp); } else if (ap->a_ioflag & IO_DIRECT) { - /* XXX B_CLUSTEROK SUPPORT */ bawrite(bp); } else { - /* XXX B_CLUSTEROK SUPPORT */ bdwrite(bp); } } @@ -330,20 +338,19 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenceds but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -355,13 +362,13 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) * Finish up. 
*/ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -423,58 +430,58 @@ static int hammer_vop_nresolve(struct vop_nresolve_args *ap) { - struct hammer_base_elm key; struct namecache *ncp; struct hammer_inode *dip; - struct hammer_btree_info info; + struct hammer_cursor cursor; + union hammer_record_ondisk *rec; struct vnode *vp; int64_t namekey; int error; - const int flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; + /* + * Calculate the namekey and setup the key range for the scan. This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ dip = VTOI(ap->a_dvp); ncp = ap->a_nch->ncp; namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); - hammer_btree_info_init(&info, dip->hmp->rootcl); - key.obj_id = dip->obj_id; - key.key = namekey; - key.create_tid = dip->obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_DIRENTRY; - key.obj_type = 0; + hammer_init_cursor_ip(&cursor, dip); + cursor.key_beg.obj_id = dip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = dip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; - /* - * Issue a lookup on the namekey. The entry should not be found - * since the low bits of the key are 0. This positions our cursor - * properly for the iteration. - */ - error = hammer_btree_lookup(&info, &key, 0); - if (error != ENOENT) { - if (error == 0) - error = EIO; - goto done; - } + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; /* - * Iterate through the keys as long as the upper 32 bits are - * the same. + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. */ - while ((error = hammer_btree_iterate(&info.cursor, &key)) == 0) { - if ((error = hammer_btree_extract(&info, flags)) != 0) - break; - if ((namekey ^ info.rec->base.base.key) & - (int64_t)0xFFFFFFFF00000000ULL) { - error = ENOENT; + rec = hammer_ip_first(&cursor, dip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) /* sets last_error */ break; - } - if (ncp->nc_nlen == info.rec->base.data_len && - bcmp(ncp->nc_name, (void *)info.data, ncp->nc_nlen) == 0) { + if (ncp->nc_nlen == rec->entry.base.data_len && + bcmp(ncp->nc_name, (void *)cursor.data, ncp->nc_nlen) == 0) { break; } + rec = hammer_ip_next(&cursor); } + error = cursor.last_error; if (error == 0) { - error = hammer_vfs_vget(dip->hmp->mp, info.rec->entry.obj_id, &vp); + error = hammer_vfs_vget(dip->hmp->mp, rec->entry.obj_id, &vp); if (error == 0) { vn_unlock(vp); cache_setvp(ap->a_nch, vp); @@ -483,8 +490,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } else if (error == ENOENT) { cache_setvp(ap->a_nch, NULL); } -done: - hammer_btree_info_done(&info); + hammer_done_cursor(&cursor); return (error); } @@ -533,7 +539,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) /* * Create a transaction to cover the operations we perform. 
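hammer_vop_nresolve() above, and hammer_vop_nrename() and hammer_dounlink() later in this patch, resolve directory entries with a 64-bit key that behaves like a chained hash table: the upper 32 bits hash the component name, the lower 32 bits distinguish colliding entries, and the lookup scans the inclusive range [namekey, namekey | 0xFFFFFFFF] while comparing names. The sketch below models that scheme in user space; the hash is a placeholder, since hammer_directory_namekey() itself is not part of this patch, and the record array merely stands in for the cursor iteration.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent_rec {
        uint64_t        key;
        const char      *name;
};

static uint64_t
namekey(const char *name)
{
        uint32_t hash = 2166136261u;            /* placeholder FNV-1a hash */

        while (*name)
                hash = (hash ^ (uint8_t)*name++) * 16777619u;
        return ((uint64_t)hash << 32);          /* low 32 bits = chain slot */
}

static const struct dirent_rec *
dir_lookup(const struct dirent_rec *recs, int nrecs, const char *name)
{
        uint64_t key_beg = namekey(name);
        uint64_t key_end = key_beg | 0xFFFFFFFFULL;
        int i;

        for (i = 0; i < nrecs; ++i) {           /* stands in for the B-Tree scan */
                if (recs[i].key < key_beg || recs[i].key > key_end)
                        continue;
                if (strcmp(recs[i].name, name) == 0)
                        return (&recs[i]);      /* chain hit */
        }
        return (NULL);
}

int
main(void)
{
        struct dirent_rec recs[2];

        recs[0].key = namekey("passwd");        /* first entry in its chain */
        recs[0].name = "passwd";
        recs[1].key = namekey("passwd") + 1;    /* hypothetical collision */
        recs[1].name = "not-passwd";

        printf("found: %s\n", dir_lookup(recs, 2, "passwd")->name);
        return (0);
}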
*/ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Add the filesystem object to the directory. Note that neither @@ -575,20 +581,18 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenced but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -600,13 +604,13 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) * Finish up. */ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -633,20 +637,18 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenced but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -658,13 +660,13 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) * Finish up. */ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -726,7 +728,7 @@ static int hammer_vop_nremove(struct vop_nremove_args *ap) { - return EOPNOTSUPP; + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0)); } /* @@ -736,7 +738,93 @@ static int hammer_vop_nrename(struct vop_nrename_args *ap) { - return EOPNOTSUPP; + struct hammer_transaction trans; + struct namecache *fncp; + struct namecache *tncp; + struct hammer_inode *fdip; + struct hammer_inode *tdip; + struct hammer_inode *ip; + struct hammer_cursor cursor; + union hammer_record_ondisk *rec; + int64_t namekey; + int error; + + fdip = VTOI(ap->a_fdvp); + tdip = VTOI(ap->a_tdvp); + fncp = ap->a_fnch->ncp; + tncp = ap->a_tnch->ncp; + hammer_start_transaction(&trans, fdip->hmp); + + /* + * Extract the hammer_inode from fncp and add link to the target + * directory. + */ + ip = VTOI(fncp->nc_vp); + KKASSERT(ip != NULL); + + error = hammer_add_directory(&trans, tdip, tncp, ip); + + /* + * Locate the record in the originating directory and remove it. + * + * Calculate the namekey and setup the key range for the scan. 
This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ + namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen); + + hammer_init_cursor_ip(&cursor, fdip); + cursor.key_beg.obj_id = fdip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = fdip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; + + /* + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. + */ + rec = hammer_ip_first(&cursor, fdip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) + break; + if (fncp->nc_nlen == rec->entry.base.data_len && + bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) { + break; + } + rec = hammer_ip_next(&cursor); + } + error = cursor.last_error; + + /* + * If all is ok we have to get the inode so we can adjust nlinks. + */ + if (error) + goto done; + error = hammer_del_directory(&trans, &cursor, fdip, ip); + if (error == 0) { + cache_rename(ap->a_fnch, ap->a_tnch); + cache_setvp(ap->a_tnch, ip->vp); + } +done: + if (error == 0) { + hammer_commit_transaction(&trans); + } else { + hammer_abort_transaction(&trans); + } + hammer_done_cursor(&cursor); + return (error); } /* @@ -746,7 +834,9 @@ static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap) { - return EOPNOTSUPP; + /* XXX check that directory is empty */ + + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0)); } /* @@ -756,7 +846,97 @@ static int hammer_vop_setattr(struct vop_setattr_args *ap) { - return EOPNOTSUPP; + struct hammer_transaction trans; + struct vattr *vap; + struct hammer_inode *ip; + int modflags; + int error; + u_int32_t flags; + uuid_t uuid; + + vap = ap->a_vap; + ip = ap->a_vp->v_data; + modflags = 0; + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return(EROFS); + + hammer_start_transaction(&trans, ip->hmp); + error = 0; + + if (vap->va_flags != VNOVAL) { + flags = ip->ino_data.uflags; + error = vop_helper_setattr_flags(&flags, vap->va_flags, + hammer_to_unix_xid(&ip->ino_data.uid), + ap->a_cred); + if (error == 0) { + if (ip->ino_data.uflags != flags) { + ip->ino_data.uflags = flags; + modflags |= HAMMER_INODE_DDIRTY; + } + if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { + error = 0; + goto done; + } + } + goto done; + } + if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto done; + } + if (vap->va_uid != (uid_t)VNOVAL) { + hammer_guid_to_uuid(&uuid, vap->va_uid); + if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) == 0) { + ip->ino_data.uid = uuid; + modflags |= HAMMER_INODE_DDIRTY; + } + } + if (vap->va_gid != (uid_t)VNOVAL) { + hammer_guid_to_uuid(&uuid, vap->va_uid); + if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) == 0) { + ip->ino_data.gid = uuid; + modflags |= HAMMER_INODE_DDIRTY; + } + } + if (vap->va_size != VNOVAL) { + switch(ap->a_vp->v_type) { + case VREG: + case VDATABASE: + error = hammer_delete_range(&trans, ip, + vap->va_size, + 0x7FFFFFFFFFFFFFFFLL); + break; + default: + error = EINVAL; + goto done; + } + } + if (vap->va_atime.tv_sec != VNOVAL) { + ip->ino_rec.ino_atime = + 
hammer_timespec_to_transid(&vap->va_atime); + modflags |= HAMMER_INODE_ITIMES; + } + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->ino_rec.ino_mtime = + hammer_timespec_to_transid(&vap->va_mtime); + modflags |= HAMMER_INODE_ITIMES; + } + if (vap->va_mode != (mode_t)VNOVAL) { + if (ip->ino_data.mode != vap->va_mode) { + ip->ino_data.mode = vap->va_mode; + modflags |= HAMMER_INODE_DDIRTY; + } + } +done: + if (error) { + hammer_abort_transaction(&trans); + } else { + if (modflags) + hammer_modify_inode(&trans, ip, modflags); + hammer_commit_transaction(&trans); + } + return (error); } /* @@ -776,115 +956,287 @@ static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) { - return EOPNOTSUPP; + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags)); } /* * hammer_vop_strategy { vp, bio } + * + * Strategy call, used for regular file read & write only. Note that the + * bp may represent a cluster. + * + * To simplify operation and allow better optimizations in the future, + * this code does not make any assumptions with regards to buffer alignment + * or size. */ static int hammer_vop_strategy(struct vop_strategy_args *ap) { - return EOPNOTSUPP; + struct buf *bp; + int error; + + bp = ap->a_bio->bio_buf; + + switch(bp->b_cmd) { + case BUF_CMD_READ: + error = hammer_vop_strategy_read(ap); + break; + case BUF_CMD_WRITE: + error = hammer_vop_strategy_write(ap); + break; + default: + error = EINVAL; + break; + } + bp->b_error = error; + if (error) + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); + return (error); } -#if 0 - struct hammer_data_record *data; - struct hammer_base_elm_t key; - hammer_btree_info info; - const int flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; - int64_t base_offset; - int didinit; - int o; - - hammer_btree_info_init(&info, ip->hmp->rootcl); - key.obj_id = ip->obj_id; - key.create_tid = ip->obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_DATA; - key.obj_type = 0; - key.key = uio->uio_offset; - - /* - * Iterate through matching records. Note that for data records - * the base offset is the key - data_len, NOT the key. This way - * we don't have to special case a ranged search. - */ - error = hammer_btree_lookup(&info, &key, 0); - if (error && error != ENOENT) - goto done; - while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) { - if ((error = hammer_btree_iterate(&info.cursor, &key)) != 0) +/* + * Read from a regular file. Iterate the related records and fill in the + * BIO/BUF. Gaps are zero-filled. + * + * The support code in hammer_object.c should be used to deal with mixed + * in-memory and on-disk records. + * + * XXX atime update + */ +static +int +hammer_vop_strategy_read(struct vop_strategy_args *ap) +{ + struct hammer_inode *ip = ap->a_vp->v_data; + struct hammer_cursor cursor; + hammer_record_ondisk_t rec; + hammer_base_elm_t base; + struct bio *bio; + struct buf *bp; + int64_t rec_offset; + int error; + int boff; + int roff; + int n; + + bio = ap->a_bio; + bp = bio->bio_buf; + + hammer_init_cursor_ip(&cursor, ip); + + /* + * Key range (begin and end inclusive) to scan. 
Note that the key's + * stored in the actual records represent the + */ + cursor.key_beg.obj_id = ip->obj_id; + cursor.key_beg.create_tid = ip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; + cursor.key_beg.obj_type = 0; + cursor.key_beg.key = bio->bio_offset; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key = bio->bio_offset + bp->b_bufsize - 1; + + rec = hammer_ip_first(&cursor, ip); + boff = 0; + + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) break; + base = &rec->base.base; + + rec_offset = base->key - rec->data.base.data_len; + /* - * XXX - possible to optimize the extract + * Zero-fill any gap */ - if ((error = hammer_btree_extract(&info, flags)) != 0) - break; - data = &info.rec->data; - base_offset = data->base.key - data->base.data_len; - if (uio->uio_offset < base_offset) { - if (base_offset - uio->uio_offset > HAMMER_BUFSIZE) - n = HAMMER_BUFSIZE; - else - n = (int)(base_offset - uio->uio_offset); - error = uiomove(ip->hmp->zbuf, n, uio); - } else { - o = (int)uio->uio_offset - base_offset; - if (o < data->base.data_len) { - n = data->base.data_len - o; - if (n > uio->uio_resid) - n = uio->uio_resid; - error = uiomove((char *)info.data + o, n, uio); - } + n = (int)(rec_offset - (bio->bio_offset + boff)); + if (n > 0) { + kprintf("zfill %d bytes\n", n); + bzero((char *)bp->b_data + boff, n); + boff += n; + n = 0; } - if (error) + + /* + * Calculate the data offset in the record and the number + * of bytes we can copy. + */ + roff = -n; + n = rec->data.base.data_len - roff; + KKASSERT(n > 0); + if (n > bp->b_bufsize - boff) + n = bp->b_bufsize - boff; + bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n); + boff += n; + if (boff == bp->b_bufsize) break; + rec = hammer_ip_next(&cursor); } + hammer_done_cursor(&cursor); /* - * Issue a lookup on the namekey. The entry should not be found - * since the low bits of the key are 0. This positions our cursor - * properly for the iteration. + * There may have been a gap after the last record */ - if (error != ENOENT) { - if (error == 0) - error = EIO; - goto done; + error = cursor.last_error; + if (error == ENOENT) + error = 0; + if (error == 0 && boff != bp->b_bufsize) { + bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); + /* boff = bp->b_bufsize; */ + } + bp->b_resid = 0; + return(error); +} + +/* + * Write to a regular file. Iterate the related records and mark for + * deletion. If existing edge records (left and right side) overlap our + * write they have to be marked deleted and new records created, usually + * referencing a portion of the original data. Then add a record to + * represent the buffer. + * + * The support code in hammer_object.c should be used to deal with mixed + * in-memory and on-disk records. + */ +static +int +hammer_vop_strategy_write(struct vop_strategy_args *ap) +{ + struct hammer_transaction trans; + hammer_inode_t ip; + struct bio *bio; + struct buf *bp; + int error; + + bio = ap->a_bio; + bp = bio->bio_buf; + ip = ap->a_vp->v_data; + hammer_start_transaction(&trans, ip->hmp); + + /* + * Delete any records overlapping our range. This function will + * properly + */ + error = hammer_delete_range(&trans, ip, bio->bio_offset, + bio->bio_offset + bp->b_bufsize - 1); + + /* + * Add a single record to cover the write + */ + if (error == 0) { + error = hammer_add_data(&trans, ip, bio->bio_offset, + bp->b_data, bp->b_bufsize); } /* - * Iterate through the keys as long as the upper 32 bits are - * the same. 
+ * If an error occured abort the transaction */ - while ((error = hammer_btree_iterate(&info, &key, flags)) == 0) { - if ((namekey ^ info.rec->base.base.key) & - (int64_t)0xFFFFFFFF00000000ULL) { - error = ENOENT; + if (error) { + /* XXX undo deletion */ + hammer_abort_transaction(&trans); + bp->b_resid = bp->b_bufsize; + } else { + hammer_commit_transaction(&trans); + bp->b_resid = 0; + } + return(error); +} + +/* + * dounlink - disconnect a directory entry + * + * XXX whiteout support not really in yet + */ +static int +hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, + int flags) +{ + struct hammer_transaction trans; + struct namecache *ncp; + hammer_inode_t dip; + hammer_inode_t ip; + hammer_record_ondisk_t rec; + struct hammer_cursor cursor; + struct vnode *vp; + int64_t namekey; + int error; + + /* + * Calculate the namekey and setup the key range for the scan. This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ + dip = VTOI(dvp); + ncp = nch->ncp; + namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); + + hammer_init_cursor_ip(&cursor, dip); + cursor.key_beg.obj_id = dip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = dip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; + + hammer_start_transaction(&trans, dip->hmp); + + /* + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. + */ + rec = hammer_ip_first(&cursor, dip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) break; - } - if (ncp->nc_nlen == info.rec->base.data_len && - bcmp(ncp->nc_name, (void *)info->data, ncp->nc_nlen) == 0) { + if (ncp->nc_nlen == rec->entry.base.data_len && + bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) { break; } + rec = hammer_ip_next(&cursor); } + error = cursor.last_error; + + /* + * If all is ok we have to get the inode so we can adjust nlinks. + */ if (error == 0) { - error = hammer_vfs_vget(dip->hmp->mp, info->rec->entry.obj_id, &vp); + ip = hammer_get_inode(dip->hmp, rec->entry.obj_id, &error); + if (error == 0) + error = hammer_del_directory(&trans, &cursor, dip, ip); + if (error == 0) { + cache_setunresolved(nch); + cache_setvp(nch, NULL); + /* XXX locking */ + if (ip->vp) + cache_inval_vp(ip->vp, CINV_DESTROY); + } + hammer_rel_inode(ip); + + error = hammer_vfs_vget(dip->hmp->mp, rec->entry.obj_id, &vp); if (error == 0) { vn_unlock(vp); cache_setvp(nch, vp); vrele(vp); + hammer_commit_transaction(&trans); + } else { + hammer_abort_transaction(&trans); } - } else if (error == ENOENT) { - cache_setvp(nch, NULL); } -done: - hammer_btree_info_done(&info); + hammer_done_cursor(&cursor); return (error); - - - return EOPNOTSUPP; } -#endif -- 2.41.0