Make fixes to the A-list initialization and properly allocate records
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.7 2007/11/26 21:38:37 dillon Exp $
35 */
36
37/*
38 * HAMMER B-Tree index
39 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER B-Tree.  Basically a B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leaves
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * instead of sub-tree pointers.
47 *
48 * A B-Tree internal node looks like this:
49 *
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
52 *
53 * A B-Tree leaf node basically looks like this:
54 *
 * L L L L L L L L  <-- leaf elements
56 *
 * The radix for an internal node is 1 less than a leaf but we get a
 * number of significant benefits for our troubles.
59 *
60 * The big benefit to using a B-Tree containing boundary information
61 * is that it is possible to cache pointers into the middle of the tree
62 * and not have to start searches, insertions, OR deletions at the root
63 * node. In particular, searches are able to progress in a definitive
64 * direction from any point in the tree without revisting nodes. This
65 * greatly improves the efficiency of many operations, most especially
66 * record appends.
67 *
68 * B-Trees also make the stacking of trees fairly straightforward.
69 *
 * INTER-CLUSTER ELEMENTS:  An element of an internal node may reference
 * the root of another cluster rather than a node in the current cluster.
 * This is known as an inter-cluster reference.  Only B-Tree searches
 * will cross cluster boundaries.  The rebalancing and collapse code does
 * not attempt to move children between clusters.  A major effect of this
 * is that we have to relax minimum element count requirements and allow
 * trees to become somewhat unbalanced.
77 *
78 * INSERTIONS AND DELETIONS: When inserting we split full nodes on our
79 * way down as an optimization. I originally experimented with rebalancing
80 * nodes on the way down for deletions but it created a huge mess due to
81 * the way inter-cluster linkages work. Instead, now I simply allow
82 * the tree to become unbalanced and allow leaf nodes to become empty.
83 * The delete code will try to clean things up from the bottom-up but
84 * will stop if related elements are not in-core or if it cannot get a node
85 * lock.
86 */
87#include "hammer.h"
88#include <sys/buf.h>
89#include <sys/buf2.h>
90
91static int btree_search(hammer_cursor_t cursor, int flags);
92static int btree_split_internal(hammer_cursor_t cursor);
93static int btree_split_leaf(hammer_cursor_t cursor);
94static int btree_remove(hammer_node_t node, int index);
95#if 0
96static int btree_rebalance(hammer_cursor_t cursor);
97static int btree_collapse(hammer_cursor_t cursor);
98#endif
99static int btree_node_is_full(hammer_node_ondisk_t node);
100static void hammer_make_separator(hammer_base_elm_t key1,
101 hammer_base_elm_t key2, hammer_base_elm_t dest);
102
/*
 * Iterate records after a search.  The cursor is iterated forwards past
 * the current record until a record matching the key-range requirements
 * is found.  ENOENT is returned if the iteration goes past the ending
 * key.
 *
 * key_beg/key_end is an INCLUSIVE range.  i.e. if you are scanning to load
 * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
 * offset of 4095.
 *
 * cursor->key_beg may or may not be modified by this function during
 * the iteration.
 */
int
hammer_btree_iterate(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	int error;
	int r;
#if 0
	int s;
	int64_t save_key;
#endif

	/*
	 * Skip past the current record if the caller has already consumed
	 * it (HAMMER_CURSOR_ATEDISK).
	 */
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index < node->count &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		++cursor->index;
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 *
		 * NOTE: This can pop us up to another cluster.
		 *
		 * If we are at the root of the root cluster, cursor_up
		 * returns ENOENT.
		 *
		 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
		 * when told to re-search for the cluster tag.
		 *
		 * XXX this could be optimized by storing the information in
		 * the parent reference.
		 */
		if (cursor->index == node->count) {
			error = hammer_cursor_up(cursor);
			if (error)
				break;
			node = cursor->node->ondisk;
			/* parent must not itself be exhausted */
			KKASSERT(cursor->index != node->count);
			++cursor->index;
			continue;
		}

		/*
		 * Iterate down the tree while we are at an internal node.
		 * Nodes cannot be empty, assert the case because if one is
		 * we will wind up in an infinite loop.
		 *
		 * We can avoid iterating through large swaths of transaction
		 * id space if the left and right separators are the same
		 * except for their transaction spaces.  We can then skip
		 * the node if the left and right transaction spaces are the
		 * same sign.  This directly optimizes accesses to files with
		 * HUGE transactional histories, such as database files,
		 * allowing us to avoid having to iterate through the entire
		 * history.
		 */
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			KKASSERT(node->count != 0);
			elm = &node->elms[cursor->index];
#if 0
			/*
			 * temporarily disable this optimization, it needs
			 * more of a theoretical review.
			 */
			if (elm[0].base.obj_id == elm[1].base.obj_id &&
			    elm[0].base.rec_type == elm[1].base.rec_type &&
			    elm[0].base.key == elm[1].base.key) {
				/*
				 * Left side transaction space
				 */
				save_key = cursor->key_beg.key;
				cursor->key_beg.key = elm[0].base.key;
				r = hammer_btree_cmp(&cursor->key_beg,
						     &elm[0].base);
				cursor->key_beg.key = save_key;

				/*
				 * Right side transaction space
				 */
				save_key = cursor->key_end.key;
				cursor->key_end.key = elm[1].base.key;
				s = hammer_btree_cmp(&cursor->key_end,
						     &elm[1].base);
				cursor->key_end.key = save_key;

				/*
				 * If our range is entirely on one side or
				 * the other side we can skip the sub-tree.
				 */
				if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
					++cursor->index;
					continue;
				}
			}
#endif
			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			node = cursor->node->ondisk;
			continue;
		}

		/*
		 * We are at a leaf.
		 *
		 * Determine if the record at the cursor has gone beyond the
		 * end of our range.  Remember that our key range is inclusive.
		 *
		 * When iterating we may have to 'pick out' records matching
		 * our transaction requirements.  A comparison return of
		 * +1 or -1 indicates a transactional record that is too
		 * old or too new but does not terminate the search.
		 */
		elm = &node->elms[cursor->index];
		r = hammer_btree_range_cmp(cursor, &elm->base);
		if (r == -1 || r == 1) {
			++cursor->index;
			continue;
		}

		/*
		 * We either found a match (r == 0) or are now out of
		 * bounds (any other r).
		 */
		error = r ? ENOENT : 0;
		break;
	}
	return(error);
}
255
256/*
257 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
258 * could not be found, and a fatal error otherwise.
259 *
260 * The cursor is suitably positioned for a deletion on success, and suitably
261 * positioned for an insertion on ENOENT.
262 *
263 * The cursor may begin anywhere, the search will traverse clusters in
264 * either direction to locate the requested element.
265 */
266int
267hammer_btree_lookup(hammer_cursor_t cursor)
268{
269 int error;
270
271 error = btree_search(cursor, 0);
272 if (error == 0 && cursor->flags)
273 error = hammer_btree_extract(cursor, cursor->flags);
274 return(error);
275}
276
/*
 * Extract the record and/or data associated with the cursor's current
 * position.  Any prior record or data stored in the cursor is replaced.
 * The cursor must be positioned at a leaf node.
 *
 * Returns 0 on success or an errno propagated from hammer_bread().
 *
 * NOTE: Only records can be extracted from internal B-Tree nodes, and
 *       only for inter-cluster references.  At the moment we only support
 *       extractions from leaf nodes.
 */
int
hammer_btree_extract(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_cluster_t cluster;
	u_int64_t buf_type;
	int32_t cloff;
	int error;

	/*
	 * A cluster record type has no data reference, the information
	 * is stored directly in the record and B-Tree element.
	 *
	 * The case where the data reference resolves to the same buffer
	 * as the record reference must be handled.
	 */
	node = cursor->node->ondisk;
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	elm = &node->elms[cursor->index];
	cluster = cursor->node->cluster;
	error = 0;

	/*
	 * Extract the record if requested.  cloff remembers the cluster
	 * offset of the record so the data case below can detect whether
	 * the data lives in the same filesystem buffer.
	 */
	if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
		cloff = elm->leaf.rec_offset;
		cursor->record = hammer_bread(cluster, cloff,
					      HAMMER_FSBUF_RECORDS, &error,
					      &cursor->record_buffer);
	} else {
		cloff = 0;
	}
	if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
		/*
		 * XORing the offsets and masking off the intra-buffer bits
		 * yields non-zero iff the two offsets fall in different
		 * buffers.
		 */
		if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
			/*
			 * The data is not in the same buffer as the last
			 * record we cached, but it could still be embedded
			 * in a record.  Note that we may not have loaded the
			 * record's buffer above, depending on flags.
			 */
			if ((elm->leaf.rec_offset ^ elm->leaf.data_offset) &
			    ~HAMMER_BUFMASK) {
				if (elm->leaf.data_len & HAMMER_BUFMASK)
					buf_type = HAMMER_FSBUF_DATA;
				else
					buf_type = 0;	/* pure data buffer */
			} else {
				buf_type = HAMMER_FSBUF_RECORDS;
			}
			cursor->data = hammer_bread(cluster,
						  elm->leaf.data_offset,
						  buf_type, &error,
						  &cursor->data_buffer);
		} else {
			/*
			 * Data in same buffer as record.  Note that we
			 * leave any existing data_buffer intact, even
			 * though we don't use it in this case, in case
			 * other records extracted during an iteration
			 * go back to it.
			 *
			 * Just assume the buffer type is correct.
			 */
			cursor->data = (void *)
				((char *)cursor->record_buffer->ondisk +
				 (elm->leaf.data_offset & HAMMER_BUFMASK));
		}
	}
	return(error);
}
355
356
357/*
358 * Insert a leaf element into the B-Tree at the current cursor position.
359 * The cursor is positioned such that the element at and beyond the cursor
360 * are shifted to make room for the new record.
361 *
362 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
363 * flag set and that call must return ENOENT before this function can be
364 * called.
365 *
366 * ENOSPC is returned if there is no room to insert a new record.
367 */
368int
369hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
370{
371 hammer_node_ondisk_t parent;
372 hammer_node_ondisk_t node;
373 int i;
374
375#if 0
376 /* HANDLED BY CALLER */
377 /*
378 * Issue a search to get our cursor at the right place. The search
379 * will get us to a leaf node.
380 *
381 * The search also does some setup for our insert, so there is always
382 * room in the leaf.
383 */
384 error = btree_search(cursor, HAMMER_CURSOR_INSERT);
385 if (error != ENOENT) {
386 if (error == 0)
387 error = EEXIST;
388 return (error);
389 }
390#endif
391
392 /*
393 * Insert the element at the leaf node and update the count in the
394 * parent. It is possible for parent to be NULL, indicating that
395 * the root of the B-Tree in the cluster is a leaf. It is also
396 * possible for the leaf to be empty.
397 *
398 * Remember that the right-hand boundary is not included in the
399 * count.
400 */
401 node = cursor->node->ondisk;
402 i = cursor->index;
403 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
404 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
405 if (i != node->count) {
406 bcopy(&node->elms[i], &node->elms[i+1],
407 (node->count - i) * sizeof(*elm));
408 }
409 node->elms[i] = *elm;
410 ++node->count;
411 hammer_modify_node(cursor->node);
412
413 /*
414 * Adjust the sub-tree count in the parent. note that the parent
415 * may be in a different cluster.
416 */
417 if (cursor->parent) {
418 parent = cursor->parent->ondisk;
419 i = cursor->parent_index;
420 ++parent->elms[i].internal.subtree_count;
421 KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
422 hammer_modify_node(cursor->parent);
423 }
424 return(0);
425}
426
/*
 * Delete a record from the B-Tree at the current cursor position.
 * The cursor is positioned such that the current element is the one
 * to be deleted.
 *
 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_DELETE
 * flag set and that call must return 0 before this function can be
 * called.
 *
 * It is possible that we will be asked to delete the last element in a
 * leaf.  This case only occurs if the downward search was unable to
 * rebalance us, which in turn can occur if our parent has inter-cluster
 * elements.  So the 0-element case for a leaf is allowed.
 */
int
hammer_btree_delete(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent;
	hammer_btree_elm_t elm;
	int error;
	int i;

#if 0
	/* HANDLED BY CALLER */
	/*
	 * Locate the leaf element to delete.  The search is also responsible
	 * for doing some of the rebalancing work on its way down.
	 */
	error = btree_search(cursor, HAMMER_CURSOR_DELETE);
	if (error)
		return (error);
#endif

	/*
	 * Delete the element from the leaf node by shifting the elements
	 * after it down one slot.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	i = cursor->index;

	KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
	if (i + 1 != ondisk->count) {
		bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
		      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
	}
	--ondisk->count;
	if (cursor->parent != NULL) {
		/*
		 * Adjust parent's notion of the leaf's count.  subtree_count
		 * is only approximate, it is allowed to be too small but
		 * never allowed to be too large.  Make sure we don't drop
		 * the count below 0.
		 */
		parent = cursor->parent;
		elm = &parent->ondisk->elms[cursor->parent_index];
		if (elm->internal.subtree_count)
			--elm->internal.subtree_count;
		KKASSERT(elm->internal.subtree_count <= ondisk->count);
		hammer_modify_node(parent);
	}

	/*
	 * If the leaf is now empty try to remove the subtree reference
	 * at (parent, parent_index).  This will unbalance the tree.
	 *
	 * Note that internal nodes must have at least one element
	 * so their boundary information is properly laid out.  If
	 * we would cause our parent to become empty we try to
	 * recurse up the tree, but if that doesn't work (EAGAIN) we
	 * just leave the tree with an empty leaf.
	 */
	if (ondisk->count == 0) {
		error = btree_remove(cursor->parent, cursor->parent_index);
		if (error == 0) {
			hammer_free_btree(node->cluster, node->node_offset);
		} else if (error == EAGAIN) {
			hammer_modify_node(node);
			error = 0;
		} /* else a real error occurred XXX */
	} else {
		hammer_modify_node(node);
		error = 0;
	}
	return(error);
}
517
/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
 *
 * The search begins at the current node and will instantiate a NULL
 * parent if necessary and if not at the root of the cluster.  On return
 * parent will be non-NULL unless the cursor is sitting at a root-leaf.
 *
 * The search code may be forced to iterate up the tree if the conditions
 * required for an insertion or deletion are not met.  This does not occur
 * very often.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.
 *
 * DELETIONS: The search will rebalance the tree on its way down.
 *
 * Returns 0 on an exact match, ENOENT with the cursor at the insertion
 * point when no match was found, or a fatal error.
 */
static
int
btree_search(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_cluster_t cluster;
	int error;
	int i;
	int r;

	flags |= cursor->flags;

	/*
	 * Move our cursor up the tree until we find a node whose range
	 * covers the key we are trying to locate.  This may move us
	 * between clusters.
	 *
	 * The left bound is inclusive, the right bound is non-inclusive.
	 * It is ok to cursor up too far when cursoring across a cluster
	 * boundary.
	 *
	 * First see if we can skip the whole cluster.  hammer_cursor_up()
	 * handles both cases but this way we don't check the cluster
	 * bounds when going up the tree within a cluster.
	 */
	cluster = cursor->node->cluster;
	while (
	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
		error = hammer_cursor_toroot(cursor);
		if (error)
			goto done;
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
		cluster = cursor->node->cluster;
	}

	/*
	 * Deal with normal cursoring within a cluster.  The right bound
	 * is non-inclusive.  That is, the bounds form a separator.
	 */
	while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
	       hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
	}

	/*
	 * We better have ended up with a node somewhere, and our second
	 * while loop had better not have traversed up a cluster.
	 */
	KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);

	/*
	 * If we are inserting we can't start at a full node if the parent
	 * is also full (because there is no way to split the node),
	 * continue running up the tree until we hit the root of the
	 * root cluster or until the requirement is satisfied.
	 *
	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
	 *
	 * XXX as an optimization it should be possible to unbalance the tree
	 * and stop at the root of the current cluster.
	 */
	while (flags & HAMMER_CURSOR_INSERT) {
		if (btree_node_is_full(cursor->node->ondisk) == 0)
			break;
		if (cursor->parent == NULL)
			break;
		if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
			break;
		error = hammer_cursor_up(cursor);
		/* cluster and node may now be stale */
		if (error)
			goto done;
	}
	/* cluster = cursor->node->cluster; not needed until next cluster = */

#if 0
	/*
	 * If we are deleting we can't start at an internal node with only
	 * one element unless it is root, because all of our code assumes
	 * that internal nodes will never be empty.  Just do this generally
	 * for both leaf and internal nodes to get better balance.
	 *
	 * This handles the case where the cursor is sitting at a leaf and
	 * either the leaf or parent contain an insufficient number of
	 * elements.
	 *
	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
	 *
	 * XXX NOTE: Iterations may not set this flag anyway.
	 */
	while (flags & HAMMER_CURSOR_DELETE) {
		if (cursor->node->ondisk->count > 1)
			break;
		if (cursor->parent == NULL)
			break;
		KKASSERT(cursor->node->ondisk->count != 0);
		error = hammer_cursor_up(cursor);
		/* cluster and node may now be stale */
		if (error)
			goto done;
	}
#endif

/*new_cluster:*/
	/*
	 * Push down through internal nodes to locate the requested key.
	 */
	cluster = cursor->node->cluster;
	node = cursor->node->ondisk;
	while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
#if 0
		/*
		 * If we are at the root node and deleting, try to collapse
		 * all of the root's children into the root.  This is the
		 * only point where tree depth is reduced.
		 *
		 * XXX NOTE: Iterations may not set this flag anyway.
		 */
		if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
			error = btree_collapse(cursor);
			/* node becomes stale after call */
			if (error)
				goto done;
		}
		node = cursor->node->ondisk;
#endif

		/*
		 * Scan the node to find the subtree index to push down into.
		 * We go one-past, then back-up.  The key should never be
		 * less than the left-hand boundary so i should never wind
		 * up 0.
		 */
		for (i = 0; i < node->count; ++i) {
			r = hammer_btree_cmp(&cursor->key_beg,
					     &node->elms[i].base);
			if (r < 0)
				break;
		}
		KKASSERT(i != 0);

		/*
		 * The push-down index is now i - 1.
		 */
		--i;
		cursor->index = i;

		/*
		 * Handle insertion and deletion requirements.
		 *
		 * If inserting split full nodes.  The split code will
		 * adjust cursor->node and cursor->index if the current
		 * index winds up in the new node.
		 */
		if (flags & HAMMER_CURSOR_INSERT) {
			if (node->count == HAMMER_BTREE_INT_ELMS) {
				error = btree_split_internal(cursor);
				if (error)
					goto done;
				/*
				 * reload stale pointers
				 */
				i = cursor->index;
				node = cursor->node->ondisk;
			}
		}

#if 0
		/*
		 * If deleting rebalance - do not allow the child to have
		 * just one element or we will not be able to delete it.
		 *
		 * Neither internal or leaf nodes (except a root-leaf) are
		 * allowed to drop to 0 elements.  (XXX - well, leaf nodes
		 * can at the moment).
		 *
		 * Our separators may have been reorganized after rebalancing,
		 * so we have to pop back up and rescan.
		 *
		 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
		 * for hysteresis?
		 *
		 * XXX NOTE: Iterations may not set this flag anyway.
		 */
		if (flags & HAMMER_CURSOR_DELETE) {
			if (node->elms[i].internal.subtree_count <= 1) {
				error = btree_rebalance(cursor);
				if (error)
					goto done;
				/* cursor->index is invalid after call */
				goto new_cluster;
			}
		}
#endif

		/*
		 * Push down (push into new node, existing node becomes
		 * the parent).
		 */
		error = hammer_cursor_down(cursor);
		/* node and cluster become stale */
		if (error)
			goto done;
		node = cursor->node->ondisk;
		cluster = cursor->node->cluster;
	}

	/*
	 * We are at a leaf, do a linear search of the key array.
	 * (XXX do a binary search).  On success the index is set to the
	 * matching element, on failure the index is set to the insertion
	 * point.
	 *
	 * Boundaries are not stored in leaf nodes, so the index can wind
	 * up to the left of element 0 (index == 0) or past the end of
	 * the array (index == node->count).
	 */
	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);

	for (i = 0; i < node->count; ++i) {
		r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);

		/*
		 * Stop if we've flipped past key_beg
		 */
		if (r < 0)
			break;

		/*
		 * Return an exact match
		 */
		if (r == 0) {
			cursor->index = i;
			error = 0;
			goto done;
		}
	}

	/*
	 * No exact match was found, i is now at the insertion point.
	 *
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
	 */
	cursor->index = i;
	if ((flags & HAMMER_CURSOR_INSERT) &&
	    node->count == HAMMER_BTREE_LEAF_ELMS) {
		error = btree_split_leaf(cursor);
		/* NOT USED
		i = cursor->index;
		node = &cursor->node->internal;
		*/
		if (error)
			goto done;
	}
	error = ENOENT;
done:
	return(error);
}
801
802
803/************************************************************************
804 * SPLITTING AND MERGING *
805 ************************************************************************
806 *
807 * These routines do all the dirty work required to split and merge nodes.
808 */
809
810/*
811 * Split an internal node into two nodes and move the separator at the split
812 * point to the parent. Note that the parent's parent's element pointing
813 * to our parent will have an incorrect subtree_count (we don't update it).
814 * It will be low, which is ok.
815 *
816 * (cursor->node, cursor->index) indicates the element the caller intends
817 * to push into. We will adjust node and index if that element winds
818 * up in the split node.
819 *
820 * If we are at the root of a cluster a new root must be created with two
821 * elements, one pointing to the original root and one pointing to the
822 * newly allocated split node.
823 *
824 * NOTE! Being at the root of a cluster is different from being at the
825 * root of the root cluster. cursor->parent will not be NULL and
826 * cursor->node->ondisk.parent must be tested against 0. Theoretically
827 * we could propogate the algorithm into the parent and deal with multiple
828 * 'roots' in the cluster header, but it's easier not to.
829 */
830static
831int
832btree_split_internal(hammer_cursor_t cursor)
833{
834 hammer_node_ondisk_t ondisk;
835 hammer_node_t node;
836 hammer_node_t parent;
837 hammer_node_t new_node;
838 hammer_btree_elm_t elm;
839 hammer_btree_elm_t parent_elm;
840 int parent_index;
841 int made_root;
842 int split;
843 int error;
844 const int esize = sizeof(*elm);
845
846 /*
847 * We are splitting but elms[split] will be promoted to the parent,
848 * leaving the right hand node with one less element. If the
849 * insertion point will be on the left-hand side adjust the split
850 * point to give the right hand side one additional node.
851 */
852 node = cursor->node;
853 ondisk = node->ondisk;
854 split = (ondisk->count + 1) / 2;
855 if (cursor->index <= split)
856 --split;
857 error = 0;
858
859 /*
860 * If we are at the root of the cluster, create a new root node with
861 * 1 element and split normally. Avoid making major modifications
862 * until we know the whole operation will work.
863 *
864 * The root of the cluster is different from the root of the root
865 * cluster. Use the node's on-disk structure's parent offset to
866 * detect the case.
867 */
868 if (ondisk->parent == 0) {
869 parent = hammer_alloc_btree(node->cluster, &error);
870 if (parent == NULL)
871 return(error);
872 hammer_lock_ex(&parent->lock);
873 ondisk = parent->ondisk;
874 ondisk->count = 1;
875 ondisk->parent = 0;
876 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
877 ondisk->elms[0].base = node->cluster->clu_btree_beg;
878 ondisk->elms[0].internal.subtree_type = node->ondisk->type;
879 ondisk->elms[0].internal.subtree_offset = node->node_offset;
880 ondisk->elms[1].base = node->cluster->clu_btree_end;
881 made_root = 1;
882 parent_index = 0; /* index of current node in parent */
883 } else {
884 made_root = 0;
885 parent = cursor->parent;
886 parent_index = cursor->parent_index;
887 }
888
889 /*
890 * Split node into new_node at the split point.
891 *
892 * B O O O P N N B <-- P = node->elms[split]
893 * 0 1 2 3 4 5 6 <-- subtree indices
894 *
895 * x x P x x
896 * s S S s
897 * / \
898 * B O O O B B N N B <--- inner boundary points are 'P'
899 * 0 1 2 3 4 5 6
900 *
901 */
902 new_node = hammer_alloc_btree(node->cluster, &error);
903 if (new_node == NULL) {
904 if (made_root) {
905 hammer_unlock(&parent->lock);
906 hammer_free_btree(node->cluster, parent->node_offset);
907 hammer_rel_node(parent);
908 }
909 return(error);
910 }
911 hammer_lock_ex(&new_node->lock);
912
913 /*
914 * Create the new node. P becomes the left-hand boundary in the
915 * new node. Copy the right-hand boundary as well.
916 *
917 * elm is the new separator.
918 */
919 ondisk = node->ondisk;
920 elm = &ondisk->elms[split];
921 bcopy(elm, &new_node->ondisk->elms[0],
922 (ondisk->count - split + 1) * esize);
923 new_node->ondisk->count = ondisk->count - split;
924 new_node->ondisk->parent = parent->node_offset;
925 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
926 KKASSERT(ondisk->type == new_node->ondisk->type);
927
928 /*
929 * Cleanup the original node. P becomes the new boundary, its
930 * subtree_offset was moved to the new node. If we had created
931 * a new root its parent pointer may have changed.
932 */
933 elm->internal.subtree_offset = 0;
934 ondisk->count = split;
935
936 /*
937 * Insert the separator into the parent, fixup the parent's
938 * reference to the original node, and reference the new node.
939 * The separator is P.
940 *
941 * Remember that base.count does not include the right-hand boundary.
942 */
943 ondisk = parent->ondisk;
944 ondisk->elms[parent_index].internal.subtree_count = split;
945 parent_elm = &ondisk->elms[parent_index+1];
946 bcopy(parent_elm, parent_elm + 1,
947 (ondisk->count - parent_index) * esize);
948 parent_elm->internal.base = elm->base; /* separator P */
949 parent_elm->internal.subtree_offset = new_node->node_offset;
950 parent_elm->internal.subtree_count = new_node->ondisk->count;
951 ++ondisk->count;
952
953 /*
954 * The cluster's root pointer may have to be updated.
955 */
956 if (made_root) {
957 node->cluster->ondisk->clu_btree_root = parent->node_offset;
958 hammer_modify_cluster(node->cluster);
959 node->ondisk->parent = parent->node_offset;
960 if (cursor->parent) {
961 hammer_unlock(&cursor->parent->lock);
962 hammer_rel_node(cursor->parent);
963 }
964 cursor->parent = parent; /* lock'd and ref'd */
965 }
966
967 hammer_modify_node(node);
968 hammer_modify_node(new_node);
969 hammer_modify_node(parent);
970
971 /*
972 * Ok, now adjust the cursor depending on which element the original
973 * index was pointing at. If we are >= the split point the push node
974 * is now in the new node.
975 *
976 * NOTE: If we are at the split point itself we cannot stay with the
977 * original node because the push index will point at the right-hand
978 * boundary, which is illegal.
979 *
980 * NOTE: The cursor's parent or parent_index must be adjusted for
981 * the case where a new parent (new root) was created, and the case
982 * where the cursor is now pointing at the split node.
983 */
984 if (cursor->index >= split) {
985 cursor->parent_index = parent_index + 1;
986 cursor->index -= split;
987 hammer_unlock(&cursor->node->lock);
988 hammer_rel_node(cursor->node);
989 cursor->node = new_node; /* locked and ref'd */
990 } else {
991 cursor->parent_index = parent_index;
992 hammer_unlock(&new_node->lock);
993 hammer_rel_node(new_node);
994 }
995
996 /*
997 * Fixup left and right bounds
998 */
999 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1000 cursor->left_bound = &elm[0].internal.base;
1001 cursor->right_bound = &elm[1].internal.base;
1002
1003 return (0);
1004}
1005
1006/*
1007 * Same as the above, but splits a full leaf node.
1008 */
1009static
1010int
1011btree_split_leaf(hammer_cursor_t cursor)
1012{
1013 hammer_node_ondisk_t ondisk;
1014 hammer_node_t parent;
1015 hammer_node_t leaf;
1016 hammer_node_t new_leaf;
1017 hammer_btree_elm_t elm;
1018 hammer_btree_elm_t parent_elm;
1019 int parent_index;
1020 int made_root;
1021 int split;
1022 int error;
1023 const size_t esize = sizeof(*elm);
1024
1025 /*
1026 * Calculate the split point. If the insertion point will be on
1027 * the left-hand side adjust the split point to give the right
1028 * hand side one additional node.
1029 */
1030 leaf = cursor->node;
1031 ondisk = leaf->ondisk;
1032 split = (ondisk->count + 1) / 2;
1033 if (cursor->index <= split)
1034 --split;
1035 error = 0;
1036
1037 /*
1038 * If we are at the root of the tree, create a new root node with
1039 * 1 element and split normally. Avoid making major modifications
1040 * until we know the whole operation will work.
1041 */
1042 if (ondisk->parent == 0) {
1043 parent = hammer_alloc_btree(leaf->cluster, &error);
1044 if (parent == NULL)
1045 return(error);
1046 hammer_lock_ex(&parent->lock);
1047 ondisk = parent->ondisk;
1048 ondisk->count = 1;
1049 ondisk->parent = 0;
1050 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1051 ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1052 ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1053 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1054 ondisk->elms[1].base = leaf->cluster->clu_btree_end;
1055 made_root = 1;
1056 parent_index = 0; /* insertion point in parent */
1057 } else {
1058 made_root = 0;
1059 parent = cursor->parent;
1060 parent_index = cursor->parent_index;
1061 }
1062
1063 /*
1064 * Split leaf into new_leaf at the split point. Select a separator
1065 * value in-between the two leafs but with a bent towards the right
1066 * leaf since comparisons use an 'elm >= separator' inequality.
1067 *
1068 * L L L L L L L L
1069 *
1070 * x x P x x
1071 * s S S s
1072 * / \
1073 * L L L L L L L L
1074 */
1075 new_leaf = hammer_alloc_btree(leaf->cluster, &error);
1076 if (new_leaf == NULL) {
1077 if (made_root) {
1078 hammer_unlock(&parent->lock);
1079 hammer_free_btree(leaf->cluster, parent->node_offset);
1080 hammer_rel_node(parent);
1081 }
1082 return(error);
1083 }
1084 hammer_lock_ex(&new_leaf->lock);
1085
1086 /*
1087 * Create the new node. P become the left-hand boundary in the
1088 * new node. Copy the right-hand boundary as well.
1089 */
1090 ondisk = leaf->ondisk;
1091 elm = &ondisk->elms[split];
1092 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1093 new_leaf->ondisk->count = ondisk->count - split;
1094 new_leaf->ondisk->parent = parent->node_offset;
1095 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1096 KKASSERT(ondisk->type == new_leaf->ondisk->type);
1097
1098 /*
1099 * Cleanup the original node. Because this is a leaf node and
1100 * leaf nodes do not have a right-hand boundary, there
1101 * aren't any special edge cases to clean up. We just fixup the
1102 * count.
1103 */
1104 ondisk->count = split;
1105
1106 /*
1107 * Insert the separator into the parent, fixup the parent's
1108 * reference to the original node, and reference the new node.
1109 * The separator is P.
1110 *
1111 * Remember that base.count does not include the right-hand boundary.
1112 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1113 */
1114 ondisk = parent->ondisk;
1115 ondisk->elms[parent_index].internal.subtree_count = split;
1116 parent_elm = &ondisk->elms[parent_index+1];
1117 if (parent_index + 1 != ondisk->count) {
1118 bcopy(parent_elm, parent_elm + 1,
1119 (ondisk->count - parent_index - 1) * esize);
1120 }
1121 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1122 parent_elm->internal.subtree_offset = new_leaf->node_offset;
1123 parent_elm->internal.subtree_count = new_leaf->ondisk->count;
1124 ++ondisk->count;
1125
1126 /*
1127 * The cluster's root pointer may have to be updated.
1128 */
1129 if (made_root) {
1130 leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1131 hammer_modify_cluster(leaf->cluster);
1132 leaf->ondisk->parent = parent->node_offset;
1133 if (cursor->parent) {
1134 hammer_unlock(&cursor->parent->lock);
1135 hammer_rel_node(cursor->parent);
1136 }
1137 cursor->parent = parent; /* lock'd and ref'd */
1138 }
1139
1140 hammer_modify_node(leaf);
1141 hammer_modify_node(new_leaf);
1142 hammer_modify_node(parent);
1143
1144 /*
1145 * Ok, now adjust the cursor depending on which element the original
1146 * index was pointing at. If we are >= the split point the push node
1147 * is now in the new node.
1148 *
1149 * NOTE: If we are at the split point itself we cannot stay with the
1150 * original node because the push index will point at the right-hand
1151 * boundary, which is illegal.
1152 */
1153 if (cursor->index >= split) {
1154 cursor->parent_index = parent_index + 1;
1155 cursor->index -= split;
1156 hammer_unlock(&cursor->node->lock);
1157 hammer_rel_node(cursor->node);
1158 cursor->node = new_leaf;
1159 } else {
1160 cursor->parent_index = parent_index;
1161 hammer_unlock(&new_leaf->lock);
1162 hammer_rel_node(new_leaf);
1163 }
1164
1165 /*
1166 * Fixup left and right bounds
1167 */
1168 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1169 cursor->left_bound = &elm[0].internal.base;
1170 cursor->right_bound = &elm[1].internal.base;
1171
1172 return (0);
1173}
1174
1175/*
1176 * Remove the element at (node, index). If the internal node would become
1177 * empty passively recurse up the tree.
1178 *
1179 * A locked internal node is passed to this function, the node remains
1180 * locked on return. Leaf nodes cannot be passed to this function.
1181 *
1182 * Returns EAGAIN if we were unable to acquire the needed locks. The caller
1183 * does not deal with the empty leaf until determines whether this recursion
1184 * has succeeded or not.
1185 */
1186int
1187btree_remove(hammer_node_t node, int index)
1188{
1189 hammer_node_ondisk_t ondisk;
1190 hammer_node_t parent;
1191 int error;
1192
1193 ondisk = node->ondisk;
1194 KKASSERT(ondisk->count > 0);
1195
1196 /*
1197 * Remove the element, shifting remaining elements left one.
1198 * Note that our move must include the right-boundary element.
1199 */
1200 if (ondisk->count != 1) {
1201 bcopy(&ondisk->elms[index+1], &ondisk->elms[index],
1202 (ondisk->count - index) * sizeof(ondisk->elms[0]));
1203 --ondisk->count;
1204 hammer_modify_node(node);
1205 return(0);
1206 }
1207
1208 /*
1209 * Internal nodes cannot drop to 0 elements, so remove the node
1210 * from ITS parent. If the node is the root node, convert it to
1211 * an empty leaf node (which can drop to 0 elements).
1212 */
1213 if (ondisk->parent == 0) {
1214 ondisk->count = 0;
1215 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1216 hammer_modify_node(node);
1217 return(0);
1218 }
1219
1220 /*
1221 * Try to remove the node from its parent. Return EAGAIN if we
1222 * cannot.
1223 */
1224 parent = hammer_get_node(node->cluster, ondisk->parent, &error);
1225 if (hammer_lock_ex_try(&parent->lock)) {
1226 hammer_rel_node(parent);
1227 return(EAGAIN);
1228 }
1229 ondisk = parent->ondisk;
1230 for (index = 0; index < ondisk->count; ++index) {
1231 if (ondisk->elms[index].internal.subtree_offset ==
1232 node->node_offset) {
1233 break;
1234 }
1235 }
1236 if (index == ondisk->count) {
1237 kprintf("btree_remove: lost parent linkage to node\n");
1238 error = EIO;
1239 } else {
1240 error = btree_remove(parent, index);
1241 if (error == 0) {
1242 hammer_free_btree(node->cluster, node->node_offset);
1243 /* NOTE: node can be reallocated at any time now */
1244 }
1245 }
1246 hammer_unlock(&parent->lock);
1247 hammer_rel_node(parent);
1248 return (error);
1249}
1250
1251#if 0
1252
1253/*
1254 * This routine is called on the internal node (node) prior to recursing down
1255 * through (node, index) when the node referenced by (node, index) MIGHT
1256 * have too few elements for the caller to perform a deletion.
1257 *
1258 * cursor->index is invalid on return because the separators may have gotten
1259 * adjusted, the caller must rescan the node's elements. The caller may set
1260 * cursor->index to -1 if it wants us to do a general rebalancing.
1261 *
1262 * This routine rebalances the children of the (node), collapsing children
1263 * together if possible. On return each child will have at least L/2-1
1264 * elements unless the node only has one child.
1265 *
1266 * NOTE: Because we do not update the parent's parent in the split code,
1267 * the subtree_count used by the caller may be incorrect. We correct it
1268 * here. Also note that we cannot change the depth of the tree's leaf
1269 * nodes here (see btree_collapse()).
1270 *
1271 * NOTE: We make no attempt to rebalance inter-cluster elements.
1272 */
static
int
btree_rebalance(hammer_cursor_t cursor)
{
	/*
	 * NOTE(review): this function is inside an #if 0 block and does not
	 * compile as written: it contains literal 'XXX' placeholders,
	 * references an undeclared child_buffer[] array, and uses 'node'
	 * without ever assigning it.  It is retained as design scaffolding
	 * only -- do not enable without finishing the port to the
	 * hammer_node API.
	 */
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t children[HAMMER_BTREE_INT_ELMS];
	hammer_node_t child;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t elms;
	int i, j, n, nelms, goal;
	int maxelms, halfelms;
	int error;

	/*
	 * If the elm being recursed through is an inter-cluster reference,
	 * don't worry about it.
	 */
	ondisk = cursor->node->ondisk;
	elm = &ondisk->elms[cursor->index];
	if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
		return(0);

	KKASSERT(elm->internal.subtree_offset != 0);
	error = 0;

	/*
	 * Load the children of node and do any necessary corrections
	 * to subtree_count.  subtree_count may be too low due to the
	 * way insertions split nodes.  Get a count of the total number
	 * of actual elements held by our children.
	 */
	error = 0;

	/* NOTE(review): 'node' is never initialized before this loop */
	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &node->elms[i];
		children[i] = NULL;
		child_buffer[i] = NULL;	/* must be preinitialized for bread */
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i], XXX);
		children[i] = child;
		if (child == NULL)
			continue;
		XXX
		KKASSERT(node->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the node's count
		 * if it was wrong.
		 */
		if (node->elms[i].subtree_count != child->base.count) {
			node->elms[i].subtree_count = child->base.count;
		}
		n += node->elms[i].subtree_count;
	}
	if (error)
		goto failed;

	/*
	 * Collect all the children's elements together
	 */
	nelms = n;
	elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
	for (i = n = 0; i < node->base.count; ++i) {
		child = children[i];
		for (j = 0; j < child->base.count; ++j) {
			elms[n].owner = child;
			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF)
				elms[n].u.leaf = child->leaf.elms[j];
			else
				elms[n].u.internal = child->internal.elms[j];
			++n;
		}
	}
	KKASSERT(n == nelms);

	/*
	 * Store a boundary in the elms array to ease the code below.  This
	 * is only used if the children are internal nodes.
	 */
	elms[n].u.internal = node->elms[i];

	/*
	 * Calculate the number of elements each child should have (goal) by
	 * reducing the number of elements until we achieve at least
	 * halfelms - 1 per child, unless we are a degenerate case.
	 */
	maxelms = btree_max_elements(node->base.subtype);
	halfelms = maxelms / 2;

	goal = halfelms - 1;
	while (i && n / i < goal)
		--i;

	/*
	 * Now rebalance using the specified goal
	 */
	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_buffer *subchild_buffer = NULL;
		struct hammer_btree_internal_node *subchild;

		child = children[i];
		for (j = 0; j < goal && n < nelms; ++j) {
			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
				child->leaf.elms[j] = elms[n].u.leaf;
			} else {
				child->internal.elms[j] = elms[n].u.internal;
			}

			/*
			 * If the element's parent has changed we have to
			 * update the parent pointer.  This is somewhat
			 * expensive.
			 */
			if (elms[n].owner != child &&
			    node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) {
				subchild = hammer_bread(cursor->cluster,
							elms[n].u.internal.subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer, XXX);
				if (subchild) {
					subchild->base.parent =
					    hammer_bclu_offset(child_buffer[i],
								child);
					hammer_modify_buffer(subchild_buffer);
				}
				/* XXX error */
			}
			++n;
		}
		/*
		 * Set right boundary if the children are internal nodes.
		 */
		if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL)
			child->internal.elms[j] = elms[n].u.internal;
		child->base.count = j;
		hammer_modify_buffer(child_buffer[i]);
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer, 0);

		/*
		 * If we have run out of elements, break out
		 */
		if (n == nelms)
			break;
	}

	/*
	 * Physically destroy any left-over children.  These children's
	 * elements have been packed into prior children.  The node's
	 * right hand boundary and count gets shifted to index i.
	 *
	 * The subtree count in the node's parent MUST be updated because
	 * we are removing elements.  The subtree_count field is allowed to
	 * be too small, but not too large!
	 */
	if (i != node->base.count) {
		n = i;
		node->elms[n] = node->elms[node->base.count];
		while (i < node->base.count) {
			hammer_free_btree_ptr(child_buffer[i], children[i]);
			hammer_put_buffer(child_buffer[i], 0);
			++i;
		}
		node->base.count = n;
		if (cursor->parent) {
			cursor->parent->elms[cursor->parent_index].subtree_count = n;
			hammer_modify_buffer(cursor->parent_buffer);
		}
	}

	kfree(elms, M_HAMMER);
failed:
	hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < node->base.count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i], 0);
	}
	return (error);
}
1459
1460/*
1461 * This routine is only called if the cursor is at the root node and the
1462 * root node is an internal node. We attempt to collapse the root node
1463 * by replacing it with all of its children, reducing tree depth by one.
1464 *
1465 * This is the only way to reduce tree depth in a HAMMER filesystem.
1466 * Note that all leaf nodes are at the same depth.
1467 *
1468 * This is a fairly expensive operation because we not only have to load
1469 * the root's children, we also have to scan each child and adjust the
1470 * parent offset for each element in each child. Nasty all around.
1471 */
static
int
btree_collapse(hammer_cursor_t cursor)
{
	/*
	 * NOTE(review): this function is inside an #if 0 block and does
	 * not compile as written (literal 'XXX' placeholders in the
	 * hammer_bread calls; types such as hammer_btree_node_ondisk_t
	 * predate the current hammer_node API).  Retained as design
	 * scaffolding only.
	 */
	hammer_btree_node_ondisk_t root, child;
	hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
	struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
	int count;
	int i, j, n;
	int root_modified;
	int error;
	int32_t root_offset;
	u_int8_t subsubtype;

	root = cursor->node;
	count = root->base.count;
	root_offset = hammer_bclu_offset(cursor->node_buffer, root);

	/*
	 * Sum up the number of children each element has.  This value is
	 * only approximate due to the way the insertion node works.  It
	 * may be too small but it will never be too large.
	 *
	 * Quickly terminate the collapse if the elements have too many
	 * children.
	 */
	KKASSERT(root->base.parent == 0);	/* must be root node */
	KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
	KKASSERT(count <= HAMMER_BTREE_INT_ELMS);

	for (i = n = 0; i < count; ++i) {
		n += root->internal.elms[i].subtree_count;
	}
	if (n > btree_max_elements(root->base.subtype))
		return(0);

	/*
	 * Iterate through the elements again and correct the subtree_count.
	 * Terminate the collapse if we wind up with too many.
	 */
	error = 0;
	root_modified = 0;

	for (i = n = 0; i < count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &root->internal.elms[i];
		child_buffer[i] = NULL;
		children[i] = NULL;
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i], XXX);
		children[i] = child;
		if (child == NULL)
			continue;
		KKASSERT(root->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the root's count
		 * if it was wrong.
		 */
		if (root->internal.elms[i].subtree_count != child->base.count) {
			root->internal.elms[i].subtree_count = child->base.count;
			root_modified = 1;
		}
		n += root->internal.elms[i].subtree_count;
	}
	if (error || n > btree_max_elements(root->base.subtype))
		goto done;

	/*
	 * Ok, we can collapse the root.  If the root's children are leafs
	 * the collapse is really simple.  If they are internal nodes the
	 * collapse is not so simple because we have to fixup the parent
	 * pointers for the root's children's children.
	 *
	 * When collapsing an internal node the far left and far right
	 * element's boundaries should match the root's left and right
	 * boundaries.
	 */
	if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			for (j = 0; j < child->base.count; ++j) {
				root->leaf.elms[n] = child->leaf.elms[j];
				++n;
			}
		}
		root->base.type = root->base.subtype;
		root->base.subtype = 0;
		root->base.count = n;
		root->leaf.link_left = 0;
		root->leaf.link_right = 0;
	} else {
		struct hammer_btree_internal_elm *elm;
		struct hammer_btree_internal_node *subchild;
		struct hammer_buffer *subchild_buffer = NULL;

		if (count) {
			child = children[0];
			subsubtype = child->base.subtype;
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[0].base.key ==
				 child->internal.elms[0].base.key);
			child = children[count-1];
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[count].base.key ==
				 child->internal.elms[child->base.count].base.key);
		} else {
			subsubtype = 0;
		}
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			KKASSERT(child->base.subtype == subsubtype);
			for (j = 0; j < child->base.count; ++j) {
				elm = &child->internal.elms[j];

				root->internal.elms[n] = *elm;
				subchild = hammer_bread(cursor->cluster,
							elm->subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer,
							XXX);
				if (subchild) {
					subchild->base.parent = root_offset;
					hammer_modify_buffer(subchild_buffer);
				}
				++n;
			}
			/* make sure the right boundary is correct */
			/* (this gets overwritten when the loop continues) */
			/* XXX generate a new separator? */
			root->internal.elms[n] = child->internal.elms[j];
		}
		root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
		root->base.subtype = subsubtype;
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer, 0);
	}
	root_modified = 1;

	/*
	 * Cleanup
	 */
done:
	if (root_modified)
		hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i], 0);
	}
	return(error);
}
1628
1629#endif
1630
1631/************************************************************************
1632 * MISCELLANIOUS SUPPORT *
1633 ************************************************************************/
1634
1635/*
1636 * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
1637 *
1638 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1639 *
1640 * Note that key1 and key2 are treated differently. key1 is allowed to
1641 * wildcard some of its fields by setting them to 0, while key2 is expected
1642 * to be in an on-disk form (no wildcards).
1643 */
1644int
1645hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1646{
1647#if 0
1648 kprintf("compare obj_id %016llx %016llx\n",
1649 key1->obj_id, key2->obj_id);
1650 kprintf("compare rec_type %04x %04x\n",
1651 key1->rec_type, key2->rec_type);
1652 kprintf("compare key %016llx %016llx\n",
1653 key1->key, key2->key);
1654#endif
1655
1656 /*
1657 * A key1->obj_id of 0 matches any object id
1658 */
1659 if (key1->obj_id) {
1660 if (key1->obj_id < key2->obj_id)
1661 return(-4);
1662 if (key1->obj_id > key2->obj_id)
1663 return(4);
1664 }
1665
1666 /*
1667 * A key1->rec_type of 0 matches any record type.
1668 */
1669 if (key1->rec_type) {
1670 if (key1->rec_type < key2->rec_type)
1671 return(-3);
1672 if (key1->rec_type > key2->rec_type)
1673 return(3);
1674 }
1675
1676 /*
1677 * There is no special case for key. 0 means 0.
1678 */
1679 if (key1->key < key2->key)
1680 return(-2);
1681 if (key1->key > key2->key)
1682 return(2);
1683
1684 /*
1685 * This test has a number of special cases. create_tid in key1 is
1686 * the as-of transction id, and delete_tid in key1 is NOT USED.
1687 *
1688 * A key1->create_tid of 0 matches any record regardles of when
1689 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1690 * used to search for the most current state of the object.
1691 *
1692 * key2->create_tid is a HAMMER record and will never be
1693 * 0. key2->delete_tid is the deletion transaction id or 0 if
1694 * the record has not yet been deleted.
1695 */
1696 if (key1->create_tid) {
1697 if (key1->create_tid < key2->create_tid)
1698 return(-1);
1699 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
1700 return(1);
1701 }
1702
1703 return(0);
1704}
1705
1706/*
1707 * Compare the element against the cursor's beginning and ending keys
1708 */
1709int
1710hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2)
1711{
1712 /*
1713 * A cursor->key_beg.obj_id of 0 matches any object id
1714 */
1715 if (cursor->key_beg.obj_id) {
1716 if (cursor->key_end.obj_id < key2->obj_id)
1717 return(-4);
1718 if (cursor->key_beg.obj_id > key2->obj_id)
1719 return(4);
1720 }
1721
1722 /*
1723 * A cursor->key_beg.rec_type of 0 matches any record type.
1724 */
1725 if (cursor->key_beg.rec_type) {
1726 if (cursor->key_end.rec_type < key2->rec_type)
1727 return(-3);
1728 if (cursor->key_beg.rec_type > key2->rec_type)
1729 return(3);
1730 }
1731
1732 /*
1733 * There is no special case for key. 0 means 0.
1734 */
1735 if (cursor->key_end.key < key2->key)
1736 return(-2);
1737 if (cursor->key_beg.key > key2->key)
1738 return(2);
1739
1740 /*
1741 * This test has a number of special cases. create_tid in key1 is
1742 * the as-of transction id, and delete_tid in key1 is NOT USED.
1743 *
1744 * A key1->create_tid of 0 matches any record regardles of when
1745 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1746 * used to search for the most current state of the object.
1747 *
1748 * key2->create_tid is a HAMMER record and will never be
1749 * 0. key2->delete_tid is the deletion transaction id or 0 if
1750 * the record has not yet been deleted.
1751 *
1752 * NOTE: only key_beg.create_tid is used for create_tid, we can only
1753 * do as-of scans at the moment.
1754 */
1755 if (cursor->key_beg.create_tid) {
1756 if (cursor->key_beg.create_tid < key2->create_tid)
1757 return(-1);
1758 if (key2->delete_tid && cursor->key_beg.create_tid >= key2->delete_tid)
1759 return(1);
1760 }
1761
1762 return(0);
1763}
1764
1765/*
1766 * Create a separator half way inbetween key1 and key2. For fields just
1767 * one unit apart, the separator will match key2.
1768 *
1769 * The handling of delete_tid is a little confusing. It is only possible
1770 * to have one record in the B-Tree where all fields match except delete_tid.
1771 * This means, worse case, two adjacent elements may have a create_tid that
1772 * is one-apart and cause the separator to choose the right-hand element's
1773 * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x).
1774 *
1775 * So all we have to do is set delete_tid to the right-hand element to
1776 * guarentee that the separator is properly between the two elements.
1777 */
1778#define MAKE_SEPARATOR(key1, key2, dest, field) \
1779 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1780
1781static void
1782hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1783 hammer_base_elm_t dest)
1784{
1785 bzero(dest, sizeof(*dest));
1786 MAKE_SEPARATOR(key1, key2, dest, obj_id);
1787 MAKE_SEPARATOR(key1, key2, dest, rec_type);
1788 MAKE_SEPARATOR(key1, key2, dest, key);
1789 MAKE_SEPARATOR(key1, key2, dest, create_tid);
1790 dest->delete_tid = key2->delete_tid;
1791}
1792
1793#undef MAKE_SEPARATOR
1794
1795/*
1796 * Return whether a generic internal or leaf node is full
1797 */
1798static int
1799btree_node_is_full(hammer_node_ondisk_t node)
1800{
1801 switch(node->type) {
1802 case HAMMER_BTREE_TYPE_INTERNAL:
1803 if (node->count == HAMMER_BTREE_INT_ELMS)
1804 return(1);
1805 break;
1806 case HAMMER_BTREE_TYPE_LEAF:
1807 if (node->count == HAMMER_BTREE_LEAF_ELMS)
1808 return(1);
1809 break;
1810 default:
1811 panic("illegal btree subtype");
1812 }
1813 return(0);
1814}
1815
1816#if 0
/*
 * Return the maximum number of elements a node of the given type can
 * hold.  Panics on an unknown type.  (Currently compiled out along with
 * its only callers, btree_rebalance()/btree_collapse().)
 */
static int
btree_max_elements(u_int8_t type)
{
	if (type == HAMMER_BTREE_TYPE_LEAF)
		return(HAMMER_BTREE_LEAF_ELMS);
	if (type == HAMMER_BTREE_TYPE_INTERNAL)
		return(HAMMER_BTREE_INT_ELMS);
	panic("btree_max_elements: bad type %d\n", type);
	/* NOT REACHED */
}
1826#endif
1827
1828void
1829hammer_print_btree_node(hammer_node_ondisk_t ondisk)
1830{
1831 hammer_btree_elm_t elm;
1832 int i;
1833
1834 kprintf("node %p count=%d parent=%d type=%c\n",
1835 ondisk, ondisk->count, ondisk->parent, ondisk->type);
1836
1837 /*
1838 * Dump both boundary elements if an internal node
1839 */
1840 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1841 for (i = 0; i <= ondisk->count; ++i) {
1842 elm = &ondisk->elms[i];
1843 hammer_print_btree_elm(elm, ondisk->type, i);
1844 }
1845 } else {
1846 for (i = 0; i < ondisk->count; ++i) {
1847 elm = &ondisk->elms[i];
1848 hammer_print_btree_elm(elm, ondisk->type, i);
1849 }
1850 }
1851}
1852
/*
 * Dump a single B-Tree element via kprintf for debugging.  The base
 * fields are always printed; the union portion is interpreted according
 * to the node type ('type'), and internal elements additionally
 * distinguish inter-cluster references (rec_offset != 0) from ordinary
 * subtree pointers.  'i' is the element's index within its node.
 */
void
hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
{
	kprintf(" %2d", i);
	kprintf("\tobjid = %016llx\n", elm->base.obj_id);
	kprintf("\tkey = %016llx\n", elm->base.key);
	kprintf("\tcreate_tid = %016llx\n", elm->base.create_tid);
	kprintf("\tdelete_tid = %016llx\n", elm->base.delete_tid);
	kprintf("\trec_type = %04x\n", elm->base.rec_type);
	kprintf("\tobj_type = %02x\n", elm->base.obj_type);
	kprintf("\tsubtree_type = %02x\n", elm->subtree_type);

	if (type == HAMMER_BTREE_TYPE_INTERNAL) {
		/* non-zero rec_offset marks an inter-cluster reference */
		if (elm->internal.rec_offset) {
			kprintf("\tcluster_rec = %08x\n",
				elm->internal.rec_offset);
			kprintf("\tcluster_id = %08x\n",
				elm->internal.subtree_cluid);
			kprintf("\tvolno = %08x\n",
				elm->internal.subtree_volno);
		} else {
			kprintf("\tsubtree_off = %08x\n",
				elm->internal.subtree_offset);
		}
		kprintf("\tsubtree_count= %d\n", elm->internal.subtree_count);
	} else {
		kprintf("\trec_offset = %08x\n", elm->leaf.rec_offset);
		kprintf("\tdata_offset = %08x\n", elm->leaf.data_offset);
		kprintf("\tdata_len = %08x\n", elm->leaf.data_len);
		kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc);
	}
}