2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.3 2007/11/07 00:43:24 dillon Exp $
38 * HAMMER BH-Tree index
40 * HAMMER implements a modified B+Tree. In documentation this will
41 * simply be referred to as the HAMMER BH-Tree. Basically a BH-Tree
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
45 * other words, we have boundary elements at the two ends of a BH-Tree node
46 * instead of sub-tree pointers.
48 * A BH-Tree internal node looks like this:
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
53 * A BH-Tree leaf node basically looks like this:
55 * L L L L L L L L <-- leaf elements
57 * The recursion radix is reduced by 2 relative to a normal B-Tree but
58 * we get a number of significant benefits for our troubles.
60 * The big benefit to using a BH-Tree is that it is possible to cache
61 * pointers into the middle of the tree and not have to start searches,
62 * insertions, OR deletions at the root node. In particular, searches are
63 * able to progress in a definitive direction from any point in the tree
64 * without revisiting nodes. This greatly improves the efficiency of many
65 * operations, most especially record appends.
67 * BH-Trees also make the stacking of trees fairly straightforward.
/*
 * Forward declarations of the file-local BH-Tree helpers defined below.
 * NOTE(review): this extract fuses original line numbers into each line and
 * has dropped some lines (e.g. the "int flags);" continuation of the
 * btree_search prototype); the code is kept verbatim.
 */
73 static int btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key,
75 static int btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
76 struct hammer_cluster *cluster);
77 static int btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
78 int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id);
79 static int btree_cursor_up(hammer_btree_cursor_t cursor);
80 static int btree_cursor_down(hammer_btree_cursor_t cursor);
81 static int btree_split_internal(hammer_btree_cursor_t cursor);
82 static int btree_split_leaf(hammer_btree_cursor_t cursor);
83 static int btree_rebalance_node(hammer_btree_cursor_t cursor);
84 static int btree_collapse(hammer_btree_cursor_t cursor);
/*
 * hammer_btree_cmp() - three-way comparison of two BH-Tree element keys.
 * key1 is the search key (wildcard fields of 0 match anything); key2 is an
 * on-media element. NOTE(review): the return statements between the
 * comparisons were dropped by this extraction; code kept verbatim.
 */
87 * Compare two BH-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
/* Debug dump of the two keys being compared (kprintf = kernel printf). */
90 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
93 kprintf("compare obj_id %016llx %016llx\n",
94 key1->obj_id, key2->obj_id);
95 kprintf("compare rec_type %04x %04x\n",
96 key1->rec_type, key2->rec_type);
97 kprintf("compare key %016llx %016llx\n",
98 key1->key, key2->key);
102 * A key1->obj_id of 0 matches any object id
105 if (key1->obj_id < key2->obj_id)
107 if (key1->obj_id > key2->obj_id)
112 * A key1->rec_type of 0 matches any record type.
114 if (key1->rec_type) {
115 if (key1->rec_type < key2->rec_type)
117 if (key1->rec_type > key2->rec_type)
122 * There is no special case for key. 0 means 0.
124 if (key1->key < key2->key)
126 if (key1->key > key2->key)
130 * This test has a number of special cases. create_tid in key1 is
131 * the as-of transaction id, and delete_tid in key1 is NOT USED.
133 * A key1->create_tid of 0 matches any record regardless of when
134 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
135 * used to search for the most current state of the object.
137 * key2->create_tid is a HAMMER record and will never be
138 * 0. key2->delete_tid is the deletion transaction id or 0 if
139 * the record has not yet been deleted.
141 if (key1->create_tid) {
142 if (key1->create_tid < key2->create_tid)
144 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
/*
 * hammer_make_separator() - build a separator key that lies midway between
 * key1 and key2, field by field, for insertion into a parent node.
 * The macro rounds up, so adjacent keys yield key2's value.
 */
152 * Create a separator half way in between key1 and key2. For fields just
153 * one unit apart, the separator will match key2.
155 #define MAKE_SEPARATOR(key1, key2, dest, field) \
156 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
159 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
160 hammer_base_elm_t dest)
162 MAKE_SEPARATOR(key1, key2, dest, obj_id);
163 MAKE_SEPARATOR(key1, key2, dest, rec_type);
164 MAKE_SEPARATOR(key1, key2, dest, key);
165 MAKE_SEPARATOR(key1, key2, dest, create_tid);
/* Non-separator fields are cleared rather than interpolated. */
166 dest->delete_tid = 0;
168 dest->reserved07 = 0;
171 #undef MAKE_SEPARATOR
/*
 * btree_node_is_full() - test a node's element count against the per-type
 * capacity (internal vs leaf). Panics on an unknown node type.
 */
174 * Return whether a generic internal or leaf node is full
177 btree_node_is_full(struct hammer_base_node *node)
180 case HAMMER_BTREE_INTERNAL_NODE:
181 if (node->count == HAMMER_BTREE_INT_ELMS)
184 case HAMMER_BTREE_LEAF_NODE:
185 if (node->count == HAMMER_BTREE_LEAF_ELMS)
189 panic("illegal btree subtype");
/*
 * btree_max_elements() - return the element capacity for a node of the
 * given type. Panics on an unknown type.
 */
195 btree_max_elements(u_int8_t type)
197 if (type == HAMMER_BTREE_LEAF_NODE)
198 return(HAMMER_BTREE_LEAF_ELMS);
199 if (type == HAMMER_BTREE_INTERNAL_NODE)
200 return(HAMMER_BTREE_INT_ELMS);
201 panic("btree_max_elements: bad type %d\n", type);
/*
 * hammer_btree_cursor_init() - zero out a cursor's node/parent state and
 * point it at the root of the given cluster via btree_cursor_set_cluster().
 */
205 * Initialize a cursor, setting the starting point for a BH-Tree search.
207 * The passed cluster must be locked. This function will add a reference
211 hammer_btree_cursor_init(hammer_btree_cursor_t cursor,
212 struct hammer_cluster *cluster)
216 cursor->cluster = NULL;
217 cursor->node_buffer = NULL;
218 cursor->parent_buffer = NULL;
220 cursor->parent = NULL;
222 cursor->parent_index = 0;
223 error = btree_cursor_set_cluster(cursor, cluster);
/*
 * hammer_btree_cursor_done() - release every reference the cursor holds
 * (node buffer, parent buffer, cluster) and clear all of its fields so it
 * cannot be used accidentally afterwards.
 */
228 * Clean up a HAMMER BH-Tree cursor after we are finished using it.
231 hammer_btree_cursor_done(hammer_btree_cursor_t cursor)
233 if (cursor->node_buffer) {
234 hammer_put_buffer(cursor->node_buffer, 0);
235 cursor->node_buffer = NULL;
237 if (cursor->parent_buffer) {
238 hammer_put_buffer(cursor->parent_buffer, 0);
239 cursor->parent_buffer = NULL;
241 if (cursor->cluster) {
242 hammer_put_cluster(cursor->cluster, 0);
243 cursor->cluster = NULL;
246 cursor->parent = NULL;
247 cursor->left_bound = NULL;
248 cursor->right_bound = NULL;
250 cursor->parent_index = 0;
/*
 * hammer_btree_info_init() - set up the info structure's embedded cursor
 * and clear its data/record buffer references prior to a BH-Tree operation.
 */
254 * Initialize a btree info structure and its associated cursor prior to
255 * running a BH-Tree operation.
258 hammer_btree_info_init(hammer_btree_info_t info, struct hammer_cluster *cluster)
262 error = hammer_btree_cursor_init(&info->cursor, cluster);
263 info->data_buffer = NULL;
264 info->record_buffer = NULL;
/*
 * hammer_btree_info_done() - tear down the embedded cursor and drop the
 * data/record buffer references acquired by hammer_btree_extract().
 */
271 * Clean up a BH-Tree info structure after we are finished using it.
274 hammer_btree_info_done(hammer_btree_info_t info)
276 hammer_btree_cursor_done(&info->cursor);
277 if (info->data_buffer) {
278 hammer_put_buffer(info->data_buffer, 0);
279 info->data_buffer = NULL;
281 if (info->record_buffer) {
282 hammer_put_buffer(info->record_buffer, 0);
283 info->record_buffer = NULL;
/*
 * btree_search() - core recursive-descent search. Positions the cursor at
 * the element matching `key` (or its insertion point), cursoring up first
 * until the node's bounds cover the key, then pushing down through internal
 * nodes and finally scanning the leaf. HAMMER_BTREE_INSERT/DELETE flags
 * make the descent split/rebalance nodes as a side effect.
 * NOTE(review): return statements and braces were dropped by this
 * extraction; code kept verbatim.
 */
290 * Search a cluster's BH-Tree. Return the matching node. The search
291 * normally traverses clusters but will terminate at a cluster entry
292 * in a leaf if HAMMER_BTREE_CLUSTER_TAG is specified. If this flag
293 * is specified EIO is returned if the search would otherwise have to
294 * cursor up into a parent cluster.
296 * The cursor must be completely initialized on entry. If the cursor is
297 * at the root of a cluster, the parent pointer & buffer may be NULL (in
298 * that case the bounds point to the bounds in the cluster header). The
299 * node_buffer and node must always be valid.
301 * The search code may be forced to iterate up the tree if the conditions
302 * required for an insertion or deletion are not met. This does not occur
305 * INSERTIONS: The search will split full nodes and leaves on its way down
306 * and guarantee that the leaf it ends up on is not full.
308 * DELETIONS: The search will rebalance the tree on its way down.
312 btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, int flags)
314 struct hammer_btree_leaf_node *leaf;
320 * Move our cursor up the tree until we find a node whose range covers
321 * the key we are trying to locate. This may move us between
324 * Since the root cluster always has a root BH-Tree node, info->node
325 * is always non-NULL if no error occurred. The parent field will be
326 * non-NULL unless we are at the root of a cluster.
328 while (hammer_btree_cmp(key, cursor->left_bound) < 0 ||
329 hammer_btree_cmp(key, cursor->right_bound) >= 0) {
331 * Must stay in current cluster if flagged, code should never
332 * use the flag if it wants us to traverse to the parent
335 if (cursor->parent == NULL &&
336 (flags & HAMMER_BTREE_CLUSTER_TAG)) {
340 error = btree_cursor_up(cursor);
344 KKASSERT(cursor->node != NULL);
347 * If we are inserting we can't start at a full node if the parent
348 * is also full (because there is no way to split the node),
349 * continue running up the tree until we hit the root of the
350 * current cluster or until the requirement is satisfied.
352 while (flags & HAMMER_BTREE_INSERT) {
353 if (btree_node_is_full(&cursor->node->base) == 0)
355 if (cursor->parent == NULL)
357 if (cursor->parent->base.count != HAMMER_BTREE_INT_ELMS)
359 error = btree_cursor_up(cursor);
365 * If we are deleting we can't start at a node with only one element
366 * unless it is root, because all of our code assumes that nodes
367 * will never be empty.
369 * This handles the case where the cursor is sitting at a leaf and
370 * either the leaf or parent contain an insufficient number of
373 while (flags & HAMMER_BTREE_DELETE) {
374 if (cursor->node->base.count > 1)
376 if (cursor->parent == NULL)
378 KKASSERT(cursor->node->base.count != 0);
379 error = btree_cursor_up(cursor);
386 * Push down through internal nodes to locate the requested key.
388 while (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) {
389 struct hammer_btree_internal_node *node;
392 * If we are at the root node and deleting, try to collapse
393 * all of the root's children into the root. This is the
394 * only point where tree depth is reduced.
396 if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) {
397 error = btree_collapse(cursor);
403 * Scan the node to find the subtree index to push down into.
404 * We go one-past, then back-up. The key should never be
405 * less than the left-hand boundary so I should never wind
408 node = &cursor->node->internal;
409 for (i = 0; i < node->base.count; ++i) {
410 r = hammer_btree_cmp(key, &node->elms[i].base);
417 * The push-down index is now i - 1.
423 * Handle insertion and deletion requirements.
425 * If inserting split full nodes. The split code will
426 * adjust cursor->node and cursor->index if the current
427 * index winds up in the new node.
429 if (flags & HAMMER_BTREE_INSERT) {
430 if (node->base.count == HAMMER_BTREE_INT_ELMS) {
431 error = btree_split_internal(cursor);
435 * reload stale pointers
438 node = &cursor->node->internal;
443 * If deleting rebalance - do not allow the child to have
444 * just one element or we will not be able to delete it.
446 * Neither internal or leaf nodes (except a root-leaf) are
447 * allowed to drop to 0 elements.
449 * Our separators may have been reorganized after rebalancing,
450 * so we have to pop back up and rescan.
452 if (flags & HAMMER_BTREE_DELETE) {
453 if (node->elms[i].subtree_count <= 1) {
454 error = btree_rebalance_node(cursor);
457 /* cursor->index is invalid after call */
463 * Push down (push into new node, existing node becomes
466 error = btree_cursor_down(cursor);
473 * We are at a leaf, do a linear search of the key array.
474 * (XXX do a binary search). On success the index is set to the
475 * matching element, on failure the index is set to the insertion
478 * Boundaries are not stored in leaf nodes, so the index can wind
479 * up to the left of element 0 (index == 0) or past the end of
480 * the array (index == leaf->base.count).
482 leaf = &cursor->node->leaf;
483 KKASSERT(leaf->base.count <= HAMMER_BTREE_LEAF_ELMS);
485 for (i = 0; i < leaf->base.count; ++i) {
486 r = hammer_btree_cmp(key, &leaf->elms[i].base);
491 * Return an exact match unless this is a cluster
492 * element. If it is, and the cluster tag flag has
493 * not been set, push into the new cluster and
494 * continue the search.
497 if ((leaf->elms[i].base.obj_type &
498 HAMMER_OBJTYPE_CLUSTER_FLAG) &&
499 (flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
500 error = btree_cursor_down(cursor);
511 * We could not find an exact match. Check for a cluster
512 * recursion. The cluster's range is bracketed by two
513 * leaf elements. One of the two must be in this node.
515 if ((flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
516 if (i == leaf->base.count) {
517 if (leaf->elms[i-1].base.obj_type &
518 HAMMER_OBJTYPE_CLUSTER_FLAG) {
519 cursor->index = i - 1;
520 error = btree_cursor_down(cursor);
526 if (leaf->elms[i].base.obj_type &
527 HAMMER_OBJTYPE_CLUSTER_FLAG) {
529 error = btree_cursor_down(cursor);
538 * If inserting split a full leaf before returning. This
539 * may have the side effect of adjusting cursor->node and
542 * We delayed the split in order to avoid any unnecessary splits.
544 * XXX parent's parent's subtree_count will be wrong after
545 * this (keep parent of parent around too? ugh).
548 if ((flags & HAMMER_BTREE_INSERT) &&
549 leaf->base.count == HAMMER_BTREE_LEAF_ELMS) {
550 error = btree_split_leaf(cursor);
553 node = &cursor->node->internal;
561 * Set the cursor's last_error. This is used primarily by
562 * btree_search_fwd() to determine whether it needs to skip
563 * the current element or not.
566 cursor->last_error = error;
/*
 * hammer_btree_iterate() - advance the cursor to the next record matching
 * `key` in every field except key->key. Walks up at node ends (crossing
 * cluster boundaries via a re-search) and back down through internal nodes,
 * skipping subtrees whose transaction-space bounds cannot match.
 * NOTE(review): dropped lines (braces/returns) leave this body incomplete
 * in the extract; code kept verbatim.
 */
571 * Ignoring key->key, iterate records after a search. The next record which
572 * matches the key (except for key->key) is returned. To synchronize with
573 * btree_search() we only check the record at the current cursor index if
574 * cursor->last_error is ENOENT, indicating that it is at an insertion point
575 * (so if we are not at the end of the node's array, the current record
576 * will be the 'next' record to check).
578 * Records associated with regular files use the ending offset of the data
579 * block (non inclusive) as their key. E.g. if you write 20 bytes to a
580 * file at offset 10, the HAMMER record will use a key of 30 for that record.
581 * This way the cursor is properly positioned after a search so the first
582 * record returned by btree_iterate() is (likely) the one containing the
583 * desired data. This also means the results of the initial search are
586 * This function is also used to iterate through database records, though
587 * in that case the caller utilizes any exact match located by btree_search()
588 * before diving into the iteration.
591 hammer_btree_iterate(hammer_btree_cursor_t cursor, hammer_base_elm_t key)
593 hammer_btree_node_t node;
594 struct hammer_base_elm save_base;
601 * If last_error is not zero we are at the insertion point and must
602 * start at the current element. If last_error is zero the caller
603 * has already processed the current cursor (or doesn't care about
604 * it) and we must iterate forwards.
607 if (cursor->index < node->base.count && cursor->last_error == 0)
612 * We iterate up the tree while we are at the last element.
614 * If we hit the root of the cluster we have to cursor up
615 * into the parent cluster, but then search to position
616 * the cursor at the next logical element in the iteration.
617 * We ignore an ENOENT error in this case.
619 * XXX this could be optimized by storing the information in
620 * the parent reference.
623 while (cursor->index == node->base.count) {
624 if (cursor->parent == NULL) {
625 save_base = *cursor->right_bound;
626 error = btree_cursor_up(cursor);
629 error = btree_search(cursor, &save_base, 0);
635 /* node cannot be empty. cursor->index is 0 */
636 KKASSERT(cursor->index != node->base.count);
637 /* do not further increment the index */
639 error = btree_cursor_up(cursor);
643 KKASSERT(cursor->index != node->base.count);
649 * Iterate down the tree while we are at an internal node.
650 * Nodes cannot be empty, assert the case because if one is
651 * we will wind up in an infinite loop.
653 * We can avoid iterating through large swaths of transaction
654 * id space if the left and right separators are the same
655 * except for their transaction spaces. We can then skip
656 * the node if the left and right transaction spaces are the
657 * same sign. This directly optimizes accesses to files with
658 * HUGE transactional histories, such as database files.
660 while (node->base.type == HAMMER_BTREE_INTERNAL_NODE) {
661 struct hammer_btree_internal_elm *elm;
663 elm = &node->internal.elms[cursor->index];
664 if (elm[0].base.obj_id == elm[1].base.obj_id &&
665 elm[0].base.rec_type == elm[1].base.rec_type &&
666 elm[0].base.key == elm[1].base.key) {
668 key->key = elm[0].base.key;
669 r = hammer_btree_cmp(key, &elm[0].base);
670 key->key = elm[1].base.key;
671 s = hammer_btree_cmp(key, &elm[1].base);
673 if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
678 error = btree_cursor_down(cursor);
681 KKASSERT(cursor->index != node->base.count);
686 * Determine if the record at the cursor matches, sans
687 * key->key (which is what we are iterating).
690 union hammer_btree_leaf_elm *elm;
692 elm = &node->leaf.elms[cursor->index];
694 key->key = elm->base.key;
695 r = hammer_btree_cmp(key, &elm->base);
700 * The iteration stops if the obj_id or rec_type no longer
701 * match (unless the original key set those to 0, meaning the
702 * caller WANTS to iterate through those as well), denoted
703 * by -3,-2 or +2,+3 return values. Otherwise the mismatch
704 * is due to the transaction id and we can get both negative
705 * and positive values... we have to skip those records and
706 * continue searching.
712 if (r < -2 || r > 2) {
719 cursor->last_error = error;
/*
 * hammer_btree_lookup() - search wrapper: run btree_search() on the info's
 * cursor, then optionally extract the record/data via hammer_btree_extract()
 * when the GET_RECORD/GET_DATA flags are set.
 */
724 * Look up the key in the HAMMER BH-Tree and fill in the rest of the
725 * info structure with the results according to flags. 0 is returned on
726 * success, non-zero on error.
728 * The caller initializes (key, cluster) and makes the call. The cluster
729 * must be referenced and locked on call. The function can chain through
730 * multiple clusters and will replace the passed cluster as it does so,
731 * dereferencing and unlocking it, and referencing and locking the chain.
732 * On return info->cluster will still be referenced and locked but may
733 * represent a different cluster.
736 hammer_btree_lookup(hammer_btree_info_t info, hammer_base_elm_t key, int flags)
740 error = btree_search(&info->cursor, key, flags);
742 (flags & (HAMMER_BTREE_GET_RECORD|HAMMER_BTREE_GET_DATA))) {
743 error = hammer_btree_extract(info, flags);
/*
 * hammer_btree_extract() - resolve the record (and optionally the data)
 * referenced by the leaf element under the cursor into info->rec/info->data,
 * buffering them via hammer_bread().
 */
749 hammer_btree_extract(hammer_btree_info_t info, int flags)
751 struct hammer_cluster *cluster;
752 hammer_btree_leaf_elm_t elm;
758 * Extract the record and data reference if requested.
760 * A cluster record type has no data reference, the information
761 * is stored directly in the record and BH-Tree element.
763 * The case where the data reference resolves to the same buffer
764 * as the record reference must be handled.
766 elm = &info->cursor.node->leaf.elms[info->cursor.index];
767 iscl = (elm->base.obj_type & HAMMER_OBJTYPE_CLUSTER_FLAG) != 0;
768 cluster = info->cursor.cluster;
771 if ((flags & HAMMER_BTREE_GET_RECORD) && error == 0) {
772 cloff = iscl ? elm->cluster.rec_offset : elm->record.rec_offset;
775 info->rec = hammer_bread(cluster, cloff, HAMMER_FSBUF_RECORDS,
776 &error, &info->record_buffer);
780 if ((flags & HAMMER_BTREE_GET_DATA) && iscl == 0 && error == 0) {
781 if ((cloff ^ elm->record.data_offset) & ~HAMMER_BUFMASK) {
/*
 * NOTE(review): this bread passes &info->record_buffer while the
 * pointer below is computed from info->data_buffer->ondisk —
 * looks like it should be &info->data_buffer; verify against the
 * upstream file.
 */
782 info->data = hammer_bread(cluster,
783 elm->record.data_offset,
785 &error, &info->record_buffer);
787 info->data = (void *)
788 ((char *)info->data_buffer->ondisk +
789 (elm->record.data_offset & HAMMER_BUFMASK));
/*
 * hammer_btree_insert() - insert leaf element *elm. btree_search() with
 * HAMMER_BTREE_INSERT positions the cursor (and pre-splits full nodes);
 * insertion only proceeds when the search returns ENOENT (no duplicate).
 * The element array is shifted up with bcopy() and the parent's
 * subtree_count hint is incremented; modified buffers are dirtied.
 */
797 * Insert a record into a BH-Tree's cluster. The caller has already
798 * reserved space for the record and data and must handle a ENOSPC
802 hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm)
804 struct hammer_btree_cursor *cursor;
805 struct hammer_btree_internal_node *parent;
806 struct hammer_btree_leaf_node *leaf;
810 /* HANDLED BY CALLER */
812 * Issue a search to get our cursor at the right place. The search
813 * will get us to a leaf node.
815 * The search also does some setup for our insert, so there is always
818 error = btree_search(&info->cursor, &elm->base, HAMMER_BTREE_INSERT);
819 if (error != ENOENT) {
827 * Insert the element at the leaf node and update the count in the
828 * parent. It is possible for parent to be NULL, indicating that
829 * the root of the BH-Tree in the cluster is a leaf. It is also
830 * possible for the leaf to be empty.
832 * Remember that the right-hand boundary is not included in the
835 cursor = &info->cursor;
836 leaf = &cursor->node->leaf;
838 KKASSERT(leaf->base.count < HAMMER_BTREE_LEAF_ELMS);
839 bcopy(&leaf->elms[i], &leaf->elms[i+1],
840 (leaf->base.count - i + 1) * sizeof(leaf->elms[0]));
841 leaf->elms[i] = *elm;
844 if ((parent = cursor->parent) != NULL) {
845 i = cursor->parent_index;
846 ++parent->elms[i].subtree_count;
847 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
848 hammer_modify_buffer(cursor->parent_buffer);
850 hammer_modify_buffer(cursor->node_buffer);
/*
 * hammer_btree_delete() - delete the leaf element matching `key`.
 * btree_search() with HAMMER_BTREE_DELETE positions the cursor and
 * rebalances on the way down; the element is then shifted out with bcopy()
 * and the parent's subtree_count hint decremented (never below zero).
 */
855 * Delete a record from the BH-Tree.
858 hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key)
860 struct hammer_btree_cursor *cursor;
861 struct hammer_btree_internal_node *parent;
862 struct hammer_btree_leaf_node *leaf;
866 /* HANDLED BY CALLER */
868 * Locate the leaf element to delete. The search is also responsible
869 * for doing some of the rebalancing work on its way down.
871 error = btree_search(&info->cursor, key, HAMMER_BTREE_DELETE);
877 * Delete the element from the leaf node. We leave empty leafs alone
878 * and instead depend on a future search to locate and destroy them.
879 * Otherwise we would have to recurse back up the tree to adjust
880 * the parent's subtree_count and we do not want to do that.
882 * Remember that the right-hand boundary is not included in the
885 cursor = &info->cursor;
886 leaf = &cursor->node->leaf;
889 KKASSERT(cursor->node->base.type == HAMMER_BTREE_LEAF_NODE);
890 bcopy(&leaf->elms[i+1], &leaf->elms[i],
891 (leaf->base.count - i) * sizeof(leaf->elms[0]));
893 if ((parent = cursor->parent) != NULL) {
895 * Adjust parent's notion of the leaf's count. subtree_count
896 * is only approximate, it is allowed to be too small but
897 * never allowed to be too large. Make sure we don't drop
900 i = cursor->parent_index;
901 if (parent->elms[i].subtree_count)
902 --parent->elms[i].subtree_count;
903 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
904 hammer_modify_buffer(cursor->parent_buffer);
906 hammer_modify_buffer(cursor->node_buffer);
/*
 * btree_cursor_set_cluster_by_value() - look up a cluster by
 * (vol_no, clu_no), verify its clu_id, and point the cursor at its root.
 * Releases the cluster again on an id mismatch.
 */
910 /************************************************************************
912 ************************************************************************
914 * These routines do basic cursor operations. This support will probably
915 * be expanded in the future to add link fields for linear scans.
919 * Unconditionally set the cursor to the root of the specified cluster.
920 * The current cursor node is set to the root node of the cluster (which
921 * can be an internal node or a degenerate leaf), and the parent info
922 * is cleaned up and cleared.
924 * The passed cluster must be locked. This function will add a reference
925 * to it. The cursor must already have a cluster assigned to it, which we
930 btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
931 int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id)
933 struct hammer_cluster *cluster;
934 struct hammer_volume *volume;
/* Reuse the current volume when possible, otherwise look it up. */
939 cluster = cursor->cluster;
940 KKASSERT(cluster != NULL);
941 if (vol_no == cluster->volume->vol_no) {
942 cluster = hammer_get_cluster(cluster->volume, clu_no,
945 volume = hammer_get_volume(cluster->volume->hmp,
948 cluster = hammer_get_cluster(volume, clu_no,
950 hammer_put_volume(volume, 0);
957 * Make sure the cluster id matches. XXX At the moment the
958 * clu_id in the btree cluster element is only 32 bits, so only
959 * compare the low 32 bits.
962 if ((int32_t)cluster->ondisk->clu_id == (int32_t)clu_id) {
963 btree_cursor_set_cluster(cursor, cluster);
967 hammer_put_cluster(cluster, 0);
/*
 * btree_cursor_set_cluster() - point the cursor at the root node of the
 * given cluster: swap the cluster reference, read the root node, clear any
 * parent state, and set the bounds from the cluster header.
 */
974 btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
975 struct hammer_cluster *cluster)
979 hammer_dup_cluster(&cursor->cluster, cluster);
980 cursor->node = hammer_bread(cluster,
981 cluster->ondisk->clu_btree_root,
984 &cursor->node_buffer);
986 if (cursor->parent) {
987 hammer_put_buffer(cursor->parent_buffer, 0);
988 cursor->parent_buffer = NULL;
989 cursor->parent = NULL;
990 cursor->parent_index = 0;
/* Root-of-cluster bounds come from the cluster header itself. */
992 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
993 cursor->right_bound = &cluster->ondisk->clu_btree_end;
/*
 * btree_cursor_up() - move the cursor to its parent. At the root of a
 * cluster this chases clu_btree_parent_* to the parent cluster's root;
 * otherwise the saved parent becomes the current node and the grandparent
 * is loaded and scanned to recompute parent_index.
 */
998 * Cursor the node up to the parent node. If at the root of a cluster,
999 * cursor to the root of the cluster's parent cluster. Note carefully
1000 * that we do NOT scan the parent cluster to find the leaf that dropped
1001 * into our current cluster.
1003 * This function is primarily used by the search code to avoid having
1004 * to search from the root of the filesystem BH-Tree.
1008 btree_cursor_up(hammer_btree_cursor_t cursor)
1010 struct hammer_cluster_ondisk *ondisk;
1011 struct hammer_btree_internal_node *parent;
1017 if (cursor->parent == NULL) {
1019 * We are at the root of the cluster, pop up to the root
1020 * of the parent cluster. Return ENOENT if we are at the
1021 * root cluster of the filesystem.
1023 ondisk = cursor->cluster->ondisk;
/* A negative parent volume number marks the filesystem root cluster. */
1024 if (ondisk->clu_btree_parent_vol_no < 0) {
1027 error = btree_cursor_set_cluster_by_value(
1029 ondisk->clu_btree_parent_vol_no,
1030 ondisk->clu_btree_parent_clu_no,
1031 ondisk->clu_btree_parent_clu_id);
1035 * Copy the current node's parent info into the node and load
1038 cursor->index = cursor->parent_index;
1039 cursor->node = (hammer_btree_node_t)cursor->parent;
1040 hammer_dup_buffer(&cursor->node_buffer, cursor->parent_buffer);
1043 * Load the parent's parent into parent and figure out the
1044 * parent's element index for its child. Just NULL it out
1045 * if we hit the root of the cluster.
1047 if (cursor->parent->base.parent) {
1048 parent = hammer_bread(cursor->cluster,
1049 cursor->node->base.parent,
1052 &cursor->parent_buffer);
/* Linear scan of the grandparent to find our element index. */
1053 for (i = 0; i < parent->base.count; ++i) {
1054 r = hammer_btree_cmp(
1055 &cursor->node->internal.elms[0].base,
1056 &parent->elms[i].base);
1060 cursor->parent = parent;
1061 cursor->parent_index = i - 1;
1062 KKASSERT(parent->elms[i].subtree_offset ==
1063 hammer_bclu_offset(cursor->node_buffer,
1066 hammer_put_buffer(cursor->parent_buffer, 0);
1067 cursor->parent = NULL;
1068 cursor->parent_buffer = NULL;
1069 cursor->parent_index = 0;
/*
 * btree_cursor_down() - push the cursor into the child selected by
 * cursor->index. From a leaf cluster element this jumps into the target
 * cluster's root; from an internal node it loads the subtree node and
 * inherits the element pair as the new left/right bounds.
 */
1076 * Cursor down into (node, index)
1078 * Push down into the current cursor. The current cursor becomes the parent.
1079 * If the current cursor represents a leaf cluster element this function will
1080 * push into the root of a new cluster and clear the parent fields.
1082 * Pushing down at a leaf which is not a cluster element returns EIO.
1086 btree_cursor_down(hammer_btree_cursor_t cursor)
1088 hammer_btree_node_t node;
1091 node = cursor->node;
1092 if (node->base.type == HAMMER_BTREE_LEAF_NODE) {
1094 * Push into another cluster
1096 hammer_btree_leaf_elm_t elm;
1098 elm = &node->leaf.elms[cursor->index];
1099 if (elm->base.rec_type == HAMMER_RECTYPE_CLUSTER) {
1100 error = btree_cursor_set_cluster_by_value(
1102 elm->cluster.vol_no,
1103 elm->cluster.clu_no,
1104 elm->cluster.verifier);
1110 * Push into another BH-Tree node (internal or leaf)
1112 struct hammer_btree_internal_elm *elm;
1114 KKASSERT(node->base.type == HAMMER_BTREE_INTERNAL_NODE);
1115 elm = &node->internal.elms[cursor->index];
1116 KKASSERT(elm->subtree_offset != 0);
1117 cursor->parent_index = cursor->index;
1118 cursor->parent = &cursor->node->internal;
1119 hammer_dup_buffer(&cursor->parent_buffer, cursor->node_buffer);
1121 cursor->node = hammer_bread(cursor->cluster,
1122 elm->subtree_offset,
1125 &cursor->node_buffer);
/* elm and elm+1 bracket the child: they become the new bounds. */
1127 cursor->left_bound = &elm[0].base;
1128 cursor->right_bound = &elm[1].base;
1129 KKASSERT(cursor->node == NULL ||
1130 cursor->node->base.parent == elm->subtree_offset);
1133 * The bounds of an internal node must match the parent's
1134 * partitioning values. Leaf nodes do not store bounds.
1136 if (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) {
1137 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->node->internal.elms[0].base) == 0);
1138 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->node->internal.elms[cursor->node->base.count].base) == 0);
1145 /************************************************************************
1146 * SPLITTING AND MERGING *
1147 ************************************************************************
1149 * These routines do all the dirty work required to split and merge nodes.
1153 * Split an internal into two nodes and move the separator at the split
1154 * point to the parent. Note that the parent's parent's element pointing
1155 * to our parent will have an incorrect subtree_count (we don't update it).
1156 * It will be low, which is ok.
1158 * Cursor->index indicates the element the caller intends to push into.
1159 * We will adjust cursor->node and cursor->index if that element winds
1160 * up in the split node.
1164 btree_split_internal(hammer_btree_cursor_t cursor)
1166 struct hammer_btree_internal_node *parent;
1167 struct hammer_btree_internal_node *node;
1168 struct hammer_btree_internal_node *new_node;
1169 struct hammer_btree_internal_elm *elm;
1170 struct hammer_btree_internal_elm *parent_elm;
1171 struct hammer_buffer *new_buffer;
1172 int32_t parent_offset;
1177 const size_t esize = sizeof(struct hammer_btree_internal_elm);
1180 * We are splitting but elms[split] will be promoted to the parent,
1181 * leaving the right hand node with one less element. If the
1182 * insertion point will be on the left-hand side adjust the split
1183 * point to give the right hand side one additional node.
1185 node = &cursor->node->internal;
1186 split = (node->base.count + 1) / 2;
1187 if (cursor->index <= split)
1192 * If we are at the root of the tree, create a new root node with
1193 * 1 element and split normally. Avoid making major modifications
1194 * until we know the whole operation will work.
1196 parent = cursor->parent;
1197 if (parent == NULL) {
1198 parent = hammer_alloc_btree(cursor->cluster, &error,
1199 &cursor->parent_buffer);
1202 parent->base.count = 1;
1203 parent->base.parent = 0;
1204 parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
1205 parent->base.subtype = node->base.type;
1206 parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
1207 parent->elms[0].subtree_offset =
1208 hammer_bclu_offset(cursor->node_buffer, node);
1209 parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
1211 cursor->parent_index = 0; /* insertion point in parent */
1215 parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);
1218 * Split node into new_node at the split point.
1220 * B O O O P N N B <-- P = node->elms[split]
1221 * 0 1 2 3 4 5 6 <-- subtree indices
1226 * B O O O B B N N B <--- inner boundary points are 'P'
1231 new_node = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
1232 if (new_node == NULL) {
1234 hammer_free_btree_ptr(cursor->parent_buffer,
1235 (hammer_btree_node_t)parent);
1240 * Create the new node. P become the left-hand boundary in the
1241 * new node. Copy the right-hand boundary as well.
1243 * elm is the new separator.
1245 elm = &node->elms[split];
1246 bcopy(elm, &new_node->elms[0], (node->base.count - split + 1) * esize);
1247 new_node->base.count = node->base.count - split;
1248 new_node->base.parent = parent_offset;
1249 new_node->base.type = HAMMER_BTREE_INTERNAL_NODE;
1250 new_node->base.subtype = node->base.subtype;
1251 KKASSERT(node->base.type == new_node->base.type);
1254 * Cleanup the original node. P becomes the new boundary, its
1255 * subtree_offset was moved to the new node. If we had created
1256 * a new root its parent pointer may have changed.
1258 node->base.parent = parent_offset;
1259 elm->subtree_offset = 0;
1262 * Insert the separator into the parent, fixup the parent's
1263 * reference to the original node, and reference the new node.
1264 * The separator is P.
1266 * Remember that base.count does not include the right-hand boundary.
1268 parent_index = cursor->parent_index;
1269 parent->elms[parent_index].subtree_count = split;
1270 parent_elm = &parent->elms[parent_index+1];
1271 bcopy(parent_elm, parent_elm + 1,
1272 (parent->base.count - parent_index) * esize);
1273 parent_elm->base = elm->base; /* separator P */
1274 parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_node);
1275 parent_elm->subtree_count = new_node->base.count;
1277 hammer_modify_buffer(new_buffer);
1278 hammer_modify_buffer(cursor->parent_buffer);
1279 hammer_modify_buffer(cursor->node_buffer);
1282 * The cluster's root pointer may have to be updated.
1285 cursor->cluster->ondisk->clu_btree_root = node->base.parent;
1286 hammer_modify_cluster(cursor->cluster);
1290 * Ok, now adjust the cursor depending on which element the original
1291 * index was pointing at. If we are >= the split point the push node
1292 * is now in the new node.
1294 * NOTE: If we are at the split point itself we cannot stay with the
1295 * original node because the push index will point at the right-hand
1296 * boundary, which is illegal.
1298 if (cursor->index >= split) {
1299 cursor->index -= split;
1300 cursor->node = (hammer_btree_node_t)new_node;
1301 hammer_put_buffer(cursor->node_buffer, 0);
1302 cursor->node_buffer = new_buffer;
1309 * Same as the above, but splits a full leaf node.
1313 btree_split_leaf(hammer_btree_cursor_t cursor)
1315 struct hammer_btree_internal_node *parent;
1316 struct hammer_btree_leaf_node *leaf;
1317 struct hammer_btree_leaf_node *new_leaf;
1318 union hammer_btree_leaf_elm *elm;
1319 struct hammer_btree_internal_elm *parent_elm;
1320 struct hammer_buffer *new_buffer;
1321 int32_t parent_offset;
1326 const size_t esize = sizeof(struct hammer_btree_internal_elm);
1329 * We are splitting but elms[split] will be promoted to the parent,
1330 * leaving the right hand node with one less element. If the
1331 * insertion point will be on the left-hand side adjust the split
1332 * point to give the right hand side one additional node.
1334 leaf = &cursor->node->leaf;
1335 split = (leaf->base.count + 1) / 2;
1336 if (cursor->index <= split)
1341 * If we are at the root of the tree, create a new root node with
1342 * 1 element and split normally. Avoid making major modifications
1343 * until we know the whole operation will work.
1345 parent = cursor->parent;
1346 if (parent == NULL) {
1347 parent = hammer_alloc_btree(cursor->cluster, &error,
1348 &cursor->parent_buffer);
1351 parent->base.count = 1;
1352 parent->base.parent = 0;
1353 parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
1354 parent->base.subtype = leaf->base.type;
1355 parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
1356 parent->elms[0].subtree_offset =
1357 hammer_bclu_offset(cursor->node_buffer, leaf);
1358 parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
1360 cursor->parent_index = 0; /* insertion point in parent */
1364 parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);
1367 * Split leaf into new_leaf at the split point. Select a separator
1368 * value in-between the two leafs but with a bent towards the right
1369 * leaf since comparisons use an 'elm >= separator' inequality.
1379 new_leaf = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
1380 if (new_leaf == NULL) {
1382 hammer_free_btree_ptr(cursor->parent_buffer,
1383 (hammer_btree_node_t)parent);
1388 * Create the new node. P become the left-hand boundary in the
1389 * new node. Copy the right-hand boundary as well.
1391 elm = &leaf->elms[split];
1392 bcopy(elm, &new_leaf->elms[0], (leaf->base.count - split) * esize);
1393 new_leaf->base.count = leaf->base.count - split;
1394 new_leaf->base.parent = parent_offset;
1395 new_leaf->base.type = HAMMER_BTREE_LEAF_NODE;
1396 new_leaf->base.subtype = 0;
1397 KKASSERT(leaf->base.type == new_leaf->base.type);
1400 * Cleanup the original node. P becomes the new boundary, its
1401 * subtree_offset was moved to the new node. If we had created
1402 * a new root its parent pointer may have changed.
1404 leaf->base.parent = parent_offset;
1407 * Insert the separator into the parent, fixup the parent's
1408 * reference to the original node, and reference the new node.
1409 * The separator is P.
1411 * Remember that base.count does not include the right-hand boundary.
1412 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1414 parent_index = cursor->parent_index;
1415 parent->elms[parent_index].subtree_count = split;
1416 parent_elm = &parent->elms[parent_index+1];
1417 if (parent_index + 1 != parent->base.count) {
1418 bcopy(parent_elm, parent_elm + 1,
1419 (parent->base.count - parent_index - 1) * esize);
1421 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1422 parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_leaf);
1423 parent_elm->subtree_count = new_leaf->base.count;
1425 hammer_modify_buffer(new_buffer);
1426 hammer_modify_buffer(cursor->parent_buffer);
1427 hammer_modify_buffer(cursor->node_buffer);
1430 * The cluster's root pointer may have to be updated.
1433 cursor->cluster->ondisk->clu_btree_root = leaf->base.parent;
1434 hammer_modify_cluster(cursor->cluster);
1438 * Ok, now adjust the cursor depending on which element the original
1439 * index was pointing at. If we are >= the split point the push node
1440 * is now in the new node.
1442 * NOTE: If we are at the split point itself we cannot stay with the
1443 * original node because the push index will point at the right-hand
1444 * boundary, which is illegal.
1446 if (cursor->index >= split) {
1447 cursor->index -= split;
1448 cursor->node = (hammer_btree_node_t)new_leaf;
1449 hammer_put_buffer(cursor->node_buffer, 0);
1450 cursor->node_buffer = new_buffer;
1457 * This routine is called on an internal node prior to recursing down
1458 * through (node, index) when (node, index) MIGHT have too few elements for
1459 * the caller to perform a deletion.
1461 * cursor->index is invalid on return because the separators may have gotten
1462 * adjusted, the caller must rescan the node's elements. The caller may set
1463 * cursor->index to -1 if it wants us to do a general rebalancing.
1465 * NOTE: Because we do not update the parent's parent in the split code,
1466 * the subtree_count used by the caller may be incorrect. We correct it
1467 * here. Also note that we cannot change the depth of the tree's leaf
1468 * nodes here (see btree_collapse()).
1470 * This routine rebalances the children of the node, collapsing children
1471 * together if possible. On return each child will have at least L/2-1
1472 * elements unless the node only has one child.
1476 btree_rebalance_node(hammer_btree_cursor_t cursor)
1478 struct hammer_btree_internal_node *node;
1479 hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
1480 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1481 hammer_btree_node_t child;
1482 hammer_btree_elm_inmemory_t elms;
1483 int i, j, n, nelms, goal;
1484 int maxelms, halfelms;
1490 node = &cursor->node->internal;
1491 KKASSERT(node->elms[cursor->index].subtree_offset != 0);
1495 * Load the children of node and do any necessary corrections
1496 * to subtree_count. subtree_count may be incorrect due to the
1497 * way insertions split nodes. Get a count of the total number
1498 * of elements held by our children.
1502 for (i = n = 0; i < node->base.count; ++i) {
1503 struct hammer_btree_internal_elm *elm;
1505 elm = &node->elms[i];
1507 child_buffer[i] = NULL; /* must be preinitialized for bread */
1508 if (elm->subtree_offset == 0)
1510 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1511 HAMMER_FSBUF_BTREE, &error,
1513 children[i] = child;
1516 KKASSERT(node->base.subtype == child->base.type);
1519 * Accumulate n for a good child, update the node's count
1522 if (node->elms[i].subtree_count != child->base.count) {
1523 node->elms[i].subtree_count = child->base.count;
1525 n += node->elms[i].subtree_count;
1531 * Collect all the children's elements together
1534 elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
1535 for (i = n = 0; i < node->base.count; ++i) {
1536 child = children[i];
1537 for (j = 0; j < child->base.count; ++j) {
1538 elms[n].owner = child;
1539 if (node->base.subtype == HAMMER_BTREE_LEAF_NODE)
1540 elms[n].u.leaf = child->leaf.elms[j];
1542 elms[n].u.internal = child->internal.elms[j];
1546 KKASSERT(n == nelms);
1549 * Store a boundary in the elms array to ease the code below. This
1550 * is only used if the children are internal nodes.
1552 elms[n].u.internal = node->elms[i];
1555 * Calculate the number of elements each child should have (goal) by
1556 * reducing the number of elements until we achieve at least
1557 * halfelms - 1 per child, unless we are a degenerate case.
1559 maxelms = btree_max_elements(node->base.subtype);
1560 halfelms = maxelms / 2;
1562 goal = halfelms - 1;
1563 while (i && n / i < goal)
1567 * Now rebalance using the specified goal
1569 for (i = n = 0; i < node->base.count; ++i) {
1570 struct hammer_buffer *subchild_buffer = NULL;
1571 struct hammer_btree_internal_node *subchild;
1573 child = children[i];
1574 for (j = 0; j < goal && n < nelms; ++j) {
1575 if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) {
1576 child->leaf.elms[j] = elms[n].u.leaf;
1578 child->internal.elms[j] = elms[n].u.internal;
1582 * If the element's parent has changed we have to
1583 * update the parent pointer. This is somewhat
1586 if (elms[n].owner != child &&
1587 node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) {
1588 subchild = hammer_bread(cursor->cluster,
1589 elms[n].u.internal.subtree_offset,
1594 subchild->base.parent =
1595 hammer_bclu_offset(child_buffer[i],
1597 hammer_modify_buffer(subchild_buffer);
1604 * Set right boundary if the children are internal nodes.
1606 if (node->base.subtype == HAMMER_BTREE_INTERNAL_NODE)
1607 child->internal.elms[j] = elms[n].u.internal;
1608 child->base.count = j;
1609 hammer_modify_buffer(child_buffer[i]);
1610 if (subchild_buffer)
1611 hammer_put_buffer(subchild_buffer, 0);
1614 * If we have run out of elements, break out
1621 * Physically destroy any left-over children. These children's
1622 * elements have been packed into prior children. The node's
1623 * right hand boundary and count gets shifted to index i.
1625 * The subtree count in the node's parent MUST be updated because
1626 * we are removing elements. The subtree_count field is allowed to
1627 * be too small, but not too large!
1629 if (i != node->base.count) {
1631 node->elms[n] = node->elms[node->base.count];
1632 while (i < node->base.count) {
1633 hammer_free_btree_ptr(child_buffer[i], children[i]);
1634 hammer_put_buffer(child_buffer[i], 0);
1637 node->base.count = n;
1638 if (cursor->parent) {
1639 cursor->parent->elms[cursor->parent_index].subtree_count = n;
1640 hammer_modify_buffer(cursor->parent_buffer);
1644 kfree(elms, M_HAMMER);
1646 hammer_modify_buffer(cursor->node_buffer);
1647 for (i = 0; i < node->base.count; ++i) {
1648 if (child_buffer[i])
1649 hammer_put_buffer(child_buffer[i], 0);
1655 * This routine is only called if the cursor is at the root node and the
1656 * root node is an internal node. We attempt to collapse the root node
1657 * by replacing it with all of its children, reducing tree depth by one.
1659 * This is the only way to reduce tree depth in a HAMMER filesystem.
1660 * Note that all leaf nodes are at the same depth.
1662 * This is a fairly expensive operation because we not only have to load
1663 * the root's children, we also have to scan each child and adjust the
1664 * parent offset for each element in each child. Nasty all around.
1668 btree_collapse(hammer_btree_cursor_t cursor)
1670 hammer_btree_node_t root, child;
1671 hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
1672 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1677 int32_t root_offset;
1678 u_int8_t subsubtype;
1680 root = cursor->node;
1681 count = root->base.count;
1682 root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1685 * Sum up the number of children each element has. This value is
1686 * only approximate due to the way the insertion node works. It
1687 * may be too small but it will never be too large.
1689 * Quickly terminate the collapse if the elements have too many
1692 KKASSERT(root->base.parent == 0); /* must be root node */
1693 KKASSERT(root->base.type == HAMMER_BTREE_INTERNAL_NODE);
1694 KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1696 for (i = n = 0; i < count; ++i) {
1697 n += root->internal.elms[i].subtree_count;
1699 if (n > btree_max_elements(root->base.subtype))
1703 * Iterate through the elements again and correct the subtree_count.
1704 * Terminate the collapse if we wind up with too many.
1709 for (i = n = 0; i < count; ++i) {
1710 struct hammer_btree_internal_elm *elm;
1712 elm = &root->internal.elms[i];
1713 child_buffer[i] = NULL;
1715 if (elm->subtree_offset == 0)
1717 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1718 HAMMER_FSBUF_BTREE, &error,
1720 children[i] = child;
1723 KKASSERT(root->base.subtype == child->base.type);
1726 * Accumulate n for a good child, update the root's count
1729 if (root->internal.elms[i].subtree_count != child->base.count) {
1730 root->internal.elms[i].subtree_count = child->base.count;
1733 n += root->internal.elms[i].subtree_count;
1735 if (error || n > btree_max_elements(root->base.subtype))
1739 * Ok, we can collapse the root. If the root's children are leafs
1740 * the collapse is really simple. If they are internal nodes the
1741 * collapse is not so simple because we have to fixup the parent
1742 * pointers for the root's children's children.
1744 * When collapsing an internal node the far left and far right
1745 * element's boundaries should match the root's left and right
1748 if (root->base.subtype == HAMMER_BTREE_LEAF_NODE) {
1749 for (i = n = 0; i < count; ++i) {
1750 child = children[i];
1751 for (j = 0; j < child->base.count; ++j) {
1752 root->leaf.elms[n] = child->leaf.elms[j];
1756 root->base.type = root->base.subtype;
1757 root->base.subtype = 0;
1758 root->base.count = n;
1759 root->leaf.link_left = 0;
1760 root->leaf.link_right = 0;
1762 struct hammer_btree_internal_elm *elm;
1763 struct hammer_btree_internal_node *subchild;
1764 struct hammer_buffer *subchild_buffer = NULL;
1767 child = children[0];
1768 subsubtype = child->base.subtype;
1769 KKASSERT(child->base.count > 0);
1770 KKASSERT(root->internal.elms[0].base.key ==
1771 child->internal.elms[0].base.key);
1772 child = children[count-1];
1773 KKASSERT(child->base.count > 0);
1774 KKASSERT(root->internal.elms[count].base.key ==
1775 child->internal.elms[child->base.count].base.key);
1779 for (i = n = 0; i < count; ++i) {
1780 child = children[i];
1781 KKASSERT(child->base.subtype == subsubtype);
1782 for (j = 0; j < child->base.count; ++j) {
1783 elm = &child->internal.elms[j];
1785 root->internal.elms[n] = *elm;
1786 subchild = hammer_bread(cursor->cluster,
1787 elm->subtree_offset,
1792 subchild->base.parent = root_offset;
1793 hammer_modify_buffer(subchild_buffer);
1797 /* make sure the right boundary is correct */
1798 /* (this gets overwritten when the loop continues) */
1799 /* XXX generate a new separator? */
1800 root->internal.elms[n] = child->internal.elms[j];
1802 root->base.type = HAMMER_BTREE_INTERNAL_NODE;
1803 root->base.subtype = subsubtype;
1804 if (subchild_buffer)
1805 hammer_put_buffer(subchild_buffer, 0);
1814 hammer_modify_buffer(cursor->node_buffer);
1815 for (i = 0; i < count; ++i) {
1816 if (child_buffer[i])
1817 hammer_put_buffer(child_buffer[i], 0);