2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.14 2007/12/31 05:33:12 dillon Exp $
40 * HAMMER implements a modified B+Tree. In documentation this will
41 * simply be refered to as the HAMMER B-Tree. Basically a B-Tree
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
45 * otherwords, we have boundary elements at the two ends of a B-Tree node
46 * instead of sub-tree pointers.
48 * A B-Tree internal node looks like this:
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
53 * A B-Tree leaf node basically looks like this:
55 * L L L L L L L L <-- leaf elemenets
57 * The radix for an internal node is 1 less then a leaf but we get a
58 * number of significant benefits for our troubles.
60 * The big benefit to using a B-Tree containing boundary information
61 * is that it is possible to cache pointers into the middle of the tree
62 * and not have to start searches, insertions, OR deletions at the root
63 * node. In particular, searches are able to progress in a definitive
64 * direction from any point in the tree without revisting nodes. This
65 * greatly improves the efficiency of many operations, most especially
68 * B-Trees also make the stacking of trees fairly straightforward.
70 * INTER-CLUSTER ELEMENTS: An element of an internal node may reference
71 * the root of another cluster rather then a node in the current cluster.
72 * This is known as an inter-cluster references. Only B-Tree searches
73 * will cross cluster boundaries. The rebalancing and collapse code does
74 * not attempt to move children between clusters. A major effect of this
75 * is that we have to relax minimum element count requirements and allow
76 * trees to become somewhat unabalanced.
78 * INSERTIONS AND DELETIONS: When inserting we split full nodes on our
79 * way down as an optimization. I originally experimented with rebalancing
80 * nodes on the way down for deletions but it created a huge mess due to
81 * the way inter-cluster linkages work. Instead, now I simply allow
82 * the tree to become unbalanced and allow leaf nodes to become empty.
83 * The delete code will try to clean things up from the bottom-up but
84 * will stop if related elements are not in-core or if it cannot get a node
91 static int btree_search(hammer_cursor_t cursor, int flags);
92 static int btree_split_internal(hammer_cursor_t cursor);
93 static int btree_split_leaf(hammer_cursor_t cursor);
94 static int btree_remove(hammer_cursor_t cursor);
95 static int btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm);
97 static int btree_rebalance(hammer_cursor_t cursor);
98 static int btree_collapse(hammer_cursor_t cursor);
100 static int btree_node_is_full(hammer_node_ondisk_t node);
101 static void hammer_make_separator(hammer_base_elm_t key1,
102 hammer_base_elm_t key2, hammer_base_elm_t dest);
105 * Iterate records after a search. The cursor is iterated forwards past
106 * the current record until a record matching the key-range requirements
107 * is found. ENOENT is returned if the iteration goes past the ending
110 * The iteration is inclusive of key_beg and can be inclusive or exclusive
111 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
113 * cursor->key_beg may or may not be modified by this function during
114 * the iteration. XXX future - in case of an inverted lock we may have
115 * to reinitiate the lookup and set key_beg to properly pick up where we
119 hammer_btree_iterate(hammer_cursor_t cursor)
121 hammer_node_ondisk_t node;
122 hammer_btree_elm_t elm;
128 * Skip past the current record
130 node = cursor->node->ondisk;
133 if (cursor->index < node->count &&
134 (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
139 * Loop until an element is found or we are done.
143 * We iterate up the tree and then index over one element
144 * while we are at the last element in the current node.
146 * NOTE: This can pop us up to another cluster.
148 * If we are at the root of the root cluster, cursor_up
151 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
152 * when told to re-search for the cluster tag.
154 * XXX this could be optimized by storing the information in
155 * the parent reference.
157 * XXX we can lose the node lock temporarily, this could mess
160 if (cursor->index == node->count) {
161 error = hammer_cursor_up(cursor, 0);
164 node = cursor->node->ondisk;
165 KKASSERT(cursor->index != node->count);
171 * Check internal or leaf element. Determine if the record
172 * at the cursor has gone beyond the end of our range.
173 * Remember that our key range is end-exclusive.
175 * Generally we recurse down through internal nodes. An
176 * internal node can only be returned if INCLUSTER is set
177 * and the node represents a cluster-push record. Internal
178 * elements do not contain create_tid/delete_tid information.
180 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
181 elm = &node->elms[cursor->index];
182 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
183 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
184 if (hammer_debug_btree) {
185 kprintf("BRACKETL %p:%d %016llx %02x %016llx %d\n",
186 cursor->node, cursor->index,
187 elm[0].internal.base.obj_id,
188 elm[0].internal.base.rec_type,
189 elm[0].internal.base.key,
192 kprintf("BRACKETR %p:%d %016llx %02x %016llx %d\n",
193 cursor->node, cursor->index + 1,
194 elm[1].internal.base.obj_id,
195 elm[1].internal.base.rec_type,
196 elm[1].internal.base.key,
205 if (r == 0 && (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
210 if ((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0 ||
211 elm->internal.rec_offset == 0) {
212 error = hammer_cursor_down(cursor);
215 KKASSERT(cursor->index == 0);
216 node = cursor->node->ondisk;
220 elm = &node->elms[cursor->index];
221 r = hammer_btree_cmp(&cursor->key_end, &elm->base);
222 if (hammer_debug_btree) {
223 kprintf("ELEMENT %p:%d %016llx %02x %016llx %d\n",
224 cursor->node, cursor->index,
225 elm[0].leaf.base.obj_id,
226 elm[0].leaf.base.rec_type,
227 elm[0].leaf.base.key,
235 if (r == 0 && (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
239 if ((cursor->flags & HAMMER_CURSOR_ALLHISTORY) == 0 &&
240 hammer_btree_chkts(cursor->key_beg.create_tid,
250 if (hammer_debug_btree) {
251 int i = cursor->index;
252 hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
253 kprintf("ITERATE %p:%d %016llx %02x %016llx\n",
255 elm->internal.base.obj_id,
256 elm->internal.base.rec_type,
257 elm->internal.base.key
266 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
267 * could not be found, and a fatal error otherwise.
269 * The cursor is suitably positioned for a deletion on success, and suitably
270 * positioned for an insertion on ENOENT.
272 * The cursor may begin anywhere, the search will traverse clusters in
273 * either direction to locate the requested element.
276 hammer_btree_lookup(hammer_cursor_t cursor)
280 error = btree_search(cursor, 0);
281 if (error == 0 && cursor->flags)
282 error = hammer_btree_extract(cursor, cursor->flags);
287 * Execute the logic required to start an iteration. The first record
288 * located within the specified range is returned and iteration control
289 * flags are adjusted for successive hammer_btree_iterate() calls.
292 hammer_btree_first(hammer_cursor_t cursor)
296 error = hammer_btree_lookup(cursor);
297 if (error == ENOENT) {
298 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
299 error = hammer_btree_iterate(cursor);
301 cursor->flags |= HAMMER_CURSOR_ATEDISK;
306 * Extract the record and/or data associated with the cursor's current
307 * position. Any prior record or data stored in the cursor is replaced.
308 * The cursor must be positioned at a leaf node.
310 * NOTE: Most extractions occur at the leaf of the B-Tree. The only
311 * extraction allowed at an internal element is at a cluster-push.
312 * Cluster-push elements have records but no data.
315 hammer_btree_extract(hammer_cursor_t cursor, int flags)
317 hammer_node_ondisk_t node;
318 hammer_btree_elm_t elm;
319 hammer_cluster_t cluster;
326 * A cluster record type has no data reference, the information
327 * is stored directly in the record and B-Tree element.
329 * The case where the data reference resolves to the same buffer
330 * as the record reference must be handled.
332 node = cursor->node->ondisk;
333 elm = &node->elms[cursor->index];
334 cluster = cursor->node->cluster;
335 cursor->flags &= ~HAMMER_CURSOR_DATA_EMBEDDED;
340 * Internal elements can only be cluster pushes. A cluster push has
343 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
344 cloff = elm->leaf.rec_offset;
345 KKASSERT(cloff != 0);
346 cursor->record = hammer_bread(cluster, cloff,
347 HAMMER_FSBUF_RECORDS, &error,
348 &cursor->record_buffer);
355 if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
356 cloff = elm->leaf.rec_offset;
357 cursor->record = hammer_bread(cluster, cloff,
358 HAMMER_FSBUF_RECORDS, &error,
359 &cursor->record_buffer);
363 if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
364 if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
366 * The data is not in the same buffer as the last
367 * record we cached, but it could still be embedded
368 * in a record. Note that we may not have loaded the
369 * record's buffer above, depending on flags.
371 if ((elm->leaf.rec_offset ^ elm->leaf.data_offset) &
373 if (elm->leaf.data_len & HAMMER_BUFMASK)
374 buf_type = HAMMER_FSBUF_DATA;
376 buf_type = 0; /* pure data buffer */
378 buf_type = HAMMER_FSBUF_RECORDS;
380 cursor->data = hammer_bread(cluster,
381 elm->leaf.data_offset,
383 &cursor->data_buffer);
386 * Data in same buffer as record. Note that we
387 * leave any existing data_buffer intact, even
388 * though we don't use it in this case, in case
389 * other records extracted during an iteration
392 * The data must be embedded in the record for this
395 * Just assume the buffer type is correct.
397 cursor->data = (void *)
398 ((char *)cursor->record_buffer->ondisk +
399 (elm->leaf.data_offset & HAMMER_BUFMASK));
400 roff = (char *)cursor->data - (char *)cursor->record;
401 KKASSERT (roff >= 0 && roff < HAMMER_RECORD_SIZE);
402 cursor->flags |= HAMMER_CURSOR_DATA_EMBEDDED;
410 * Insert a leaf element into the B-Tree at the current cursor position.
411 * The cursor is positioned such that the element at and beyond the cursor
412 * are shifted to make room for the new record.
414 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
415 * flag set and that call must return ENOENT before this function can be
418 * ENOSPC is returned if there is no room to insert a new record.
421 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
423 hammer_node_ondisk_t parent;
424 hammer_node_ondisk_t node;
428 /* HANDLED BY CALLER */
430 * Issue a search to get our cursor at the right place. The search
431 * will get us to a leaf node.
433 * The search also does some setup for our insert, so there is always
436 error = btree_search(cursor, HAMMER_CURSOR_INSERT);
437 if (error != ENOENT) {
445 * Insert the element at the leaf node and update the count in the
446 * parent. It is possible for parent to be NULL, indicating that
447 * the root of the B-Tree in the cluster is a leaf. It is also
448 * possible for the leaf to be empty.
450 * Remember that the right-hand boundary is not included in the
453 hammer_modify_node(cursor->node);
454 node = cursor->node->ondisk;
456 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
457 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
458 if (i != node->count) {
459 bcopy(&node->elms[i], &node->elms[i+1],
460 (node->count - i) * sizeof(*elm));
462 node->elms[i] = *elm;
464 hammer_modify_node_done(cursor->node);
466 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
467 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
469 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->leaf.base) < 0);
470 if (i != node->count - 1)
471 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->leaf.base) > 0);
474 * Adjust the sub-tree count in the parent. note that the parent
475 * may be in a different cluster.
477 if (cursor->parent) {
478 hammer_modify_node(cursor->parent);
479 parent = cursor->parent->ondisk;
480 i = cursor->parent_index;
481 ++parent->elms[i].internal.subtree_count;
482 hammer_modify_node_done(cursor->parent);
483 KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
489 * Delete a record from the B-Tree's at the current cursor position.
490 * The cursor is positioned such that the current element is the one
493 * On return the cursor will be positioned after the deleted element and
494 * MAY point to an internal node. It will be suitable for the continuation
495 * of an iteration but not for an insertion or deletion.
497 * Deletions will attempt to partially rebalance the B-Tree in an upward
498 * direction. It is possible to end up with empty leafs. An empty internal
499 * node is impossible (worst case: it has one element pointing to an empty
503 hammer_btree_delete(hammer_cursor_t cursor)
505 hammer_node_ondisk_t ondisk;
507 hammer_node_t parent;
508 hammer_btree_elm_t elm;
513 /* HANDLED BY CALLER */
515 * Locate the leaf element to delete. The search is also responsible
516 * for doing some of the rebalancing work on its way down.
518 error = btree_search(cursor, HAMMER_CURSOR_DELETE);
524 * Delete the element from the leaf node.
526 * Remember that leaf nodes do not have boundaries.
529 ondisk = node->ondisk;
532 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
533 hammer_modify_node(node);
534 if (i + 1 != ondisk->count) {
535 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
536 (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
539 hammer_modify_node_done(node);
540 if (cursor->parent != NULL) {
542 * Adjust parent's notion of the leaf's count. subtree_count
543 * is only approximate, it is allowed to be too small but
544 * never allowed to be too large. Make sure we don't drop
547 parent = cursor->parent;
548 hammer_modify_node(parent);
549 elm = &parent->ondisk->elms[cursor->parent_index];
550 if (elm->internal.subtree_count)
551 --elm->internal.subtree_count;
552 hammer_modify_node_done(parent);
553 KKASSERT(elm->internal.subtree_count <= ondisk->count);
557 * It is possible, but not desireable, to stop here. If the element
558 * count drops to 0 (which is allowed for a leaf), try recursively
559 * remove the B-Tree node.
561 * XXX rebalancing calls would go here too.
563 * This may reposition the cursor at one of the parent's of the
566 KKASSERT(cursor->index <= ondisk->count);
567 if (ondisk->count == 0) {
568 error = btree_remove(cursor);
578 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE
580 * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
582 * The search can begin ANYWHERE in the B-Tree. As a first step the search
583 * iterates up the tree as necessary to properly position itself prior to
584 * actually doing the sarch.
586 * INSERTIONS: The search will split full nodes and leaves on its way down
587 * and guarentee that the leaf it ends up on is not full. If we run out
588 * of space the search continues to the leaf (to position the cursor for
589 * the spike), but ENOSPC is returned.
591 * DELETIONS: The search will rebalance the tree on its way down. XXX
593 * The search is only guarenteed to end up on a leaf if an error code of 0
594 * is returned, or if inserting and an error code of ENOENT is returned.
595 * Otherwise it can stop at an internal node. On success a search returns
596 * a leaf node unless INCLUSTER is set and the search located a cluster push
597 * node (which is an internal node).
601 btree_search(hammer_cursor_t cursor, int flags)
603 hammer_node_ondisk_t node;
604 hammer_cluster_t cluster;
610 flags |= cursor->flags;
612 if (hammer_debug_btree) {
613 kprintf("SEARCH %p:%d %016llx %02x %016llx\n",
614 cursor->node, cursor->index,
615 cursor->key_beg.obj_id,
616 cursor->key_beg.rec_type,
622 * Move our cursor up the tree until we find a node whos range covers
623 * the key we are trying to locate. This may move us between
626 * The left bound is inclusive, the right bound is non-inclusive.
627 * It is ok to cursor up too far so when cursoring across a cluster
630 * First see if we can skip the whole cluster. hammer_cursor_up()
631 * handles both cases but this way we don't check the cluster
632 * bounds when going up the tree within a cluster.
634 * NOTE: If INCLUSTER is set and we are at the root of the cluster,
635 * hammer_cursor_up() will return ENOENT.
637 cluster = cursor->node->cluster;
639 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
640 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
641 error = hammer_cursor_toroot(cursor);
644 error = hammer_cursor_up(cursor, 0);
647 cluster = cursor->node->cluster;
651 * Deal with normal cursoring within a cluster. The right bound
652 * is non-inclusive. That is, the bounds form a separator.
654 while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
655 hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
656 error = hammer_cursor_up(cursor, 0);
662 * We better have ended up with a node somewhere, and our second
663 * while loop had better not have traversed up a cluster.
665 KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);
668 * If we are inserting we can't start at a full node if the parent
669 * is also full (because there is no way to split the node),
670 * continue running up the tree until we hit the root of the
671 * root cluster or until the requirement is satisfied.
673 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
675 * XXX as an optimization it should be possible to unbalance the tree
676 * and stop at the root of the current cluster.
678 while (flags & HAMMER_CURSOR_INSERT) {
679 if (btree_node_is_full(cursor->node->ondisk) == 0)
681 if (cursor->parent == NULL)
683 if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
685 error = hammer_cursor_up(cursor, 0);
686 /* cluster and node are now may become stale */
690 /* cluster = cursor->node->cluster; not needed until next cluster = */
694 * If we are deleting we can't start at an internal node with only
695 * one element unless it is root, because all of our code assumes
696 * that internal nodes will never be empty. Just do this generally
697 * for both leaf and internal nodes to get better balance.
699 * This handles the case where the cursor is sitting at a leaf and
700 * either the leaf or parent contain an insufficient number of
703 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
705 * XXX NOTE: Iterations may not set this flag anyway.
707 while (flags & HAMMER_CURSOR_DELETE) {
708 if (cursor->node->ondisk->count > 1)
710 if (cursor->parent == NULL)
712 KKASSERT(cursor->node->ondisk->count != 0);
713 error = hammer_cursor_up(cursor, 0);
714 /* cluster and node are now may become stale */
722 * Push down through internal nodes to locate the requested key.
724 cluster = cursor->node->cluster;
725 node = cursor->node->ondisk;
726 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
729 * If we are a the root node and deleting, try to collapse
730 * all of the root's children into the root. This is the
731 * only point where tree depth is reduced.
733 * XXX NOTE: Iterations may not set this flag anyway.
735 if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
736 error = btree_collapse(cursor);
737 /* node becomes stale after call */
742 node = cursor->node->ondisk;
745 * Scan the node to find the subtree index to push down into.
746 * We go one-past, then back-up.
748 for (i = 0; i < node->count; ++i) {
749 r = hammer_btree_cmp(&cursor->key_beg,
750 &node->elms[i].base);
756 * It is possible for the search to terminate at i == 0,
757 * which is to the LEFT of the LEFT boundary but the RIGHT
758 * of the parent's boundary on the left of the sub-tree
759 * element. This case can occur due to deletions (see
762 * When this case occurs an ENOENT return is guarenteed but
763 * if inserting we must still terminate at a leaf. The
764 * solution is to make the node's left boundary inherit the
765 * boundary stored in the parent.
767 * When doing this inheritance some fields in 'base' are
768 * actually related to the internal element's child
769 * specification and not to the key. These have to be
772 * If we terminate at i == count it is still possible to
773 * be to the RIGHT of the RIGHT boundary but still to the
774 * LEFT of the parent's RIGHT boundary. The solution is to
775 * adjust the RIGHT boundary to match the parent. This
776 * case can occur due to deletions (see btree_remove()).
781 if ((flags & HAMMER_CURSOR_INSERT) == 0) {
785 hammer_modify_node(cursor->node);
786 save = node->elms[0].subtree_type;
787 node->elms[0].base = *cursor->left_bound;
788 node->elms[0].subtree_type = save;
789 hammer_modify_node_done(cursor->node);
790 } else if (i == node->count) {
792 * Terminate early if not inserting and the key is
793 * beyond the uncorrected right hand boundary. The
794 * index must be PAST the last element to prevent an
795 * iteration from returning elements to the left of
798 if ((flags & HAMMER_CURSOR_INSERT) == 0 &&
799 hammer_btree_cmp(&cursor->key_beg,
800 &node->elms[i].base) >= 0
807 * Correct a right-hand boundary mismatch. The push
808 * index is the last element (i-1).
810 if (hammer_btree_cmp(&node->elms[i].base,
811 cursor->right_bound) != 0) {
812 hammer_modify_node(cursor->node);
813 node->elms[i].base = *cursor->right_bound;
814 hammer_modify_node_done(cursor->node);
819 * The push-down index is now i - 1.
825 if (hammer_debug_btree) {
826 hammer_btree_elm_t elm = &node->elms[i];
827 kprintf("SEARCH-I %p:%d %016llx %02x %016llx\n",
829 elm->internal.base.obj_id,
830 elm->internal.base.rec_type,
831 elm->internal.base.key
836 * Handle insertion and deletion requirements.
838 * If inserting split full nodes. The split code will
839 * adjust cursor->node and cursor->index if the current
840 * index winds up in the new node.
842 if (flags & HAMMER_CURSOR_INSERT) {
843 if (node->count == HAMMER_BTREE_INT_ELMS) {
844 error = btree_split_internal(cursor);
849 flags &= ~HAMMER_CURSOR_INSERT;
852 * reload stale pointers
855 node = cursor->node->ondisk;
861 * If deleting rebalance - do not allow the child to have
862 * just one element or we will not be able to delete it.
864 * Neither internal or leaf nodes (except a root-leaf) are
865 * allowed to drop to 0 elements. (XXX - well, leaf nodes
866 * can at the moment).
868 * Our separators may have been reorganized after rebalancing,
869 * so we have to pop back up and rescan.
871 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
874 * XXX NOTE: Iterations may not set this flag anyway.
876 if (flags & HAMMER_CURSOR_DELETE) {
877 if (node->elms[i].internal.subtree_count <= 1) {
878 error = btree_rebalance(cursor);
881 /* cursor->index is invalid after call */
887 * Cluster pushes are done with internal elements. If this
888 * is a cluster push (rec_offset != 0), and INCLUSTER is set,
891 * However, because this is an internal node we have to
892 * determine whether key_beg is within its range and return
893 * 0 or ENOENT appropriately.
895 if ((flags & HAMMER_CURSOR_INCLUSTER) &&
896 node->elms[i].internal.rec_offset) {
897 r = hammer_btree_cmp(&cursor->key_beg,
898 &node->elms[i+1].base);
899 error = (r < 0) ? 0 : (enospc ? ENOSPC : ENOENT);
904 * Push down (push into new node, existing node becomes
905 * the parent) and continue the search.
907 error = hammer_cursor_down(cursor);
908 /* node and cluster become stale */
911 node = cursor->node->ondisk;
912 cluster = cursor->node->cluster;
916 * We are at a leaf, do a linear search of the key array.
918 * On success the index is set to the matching element and 0
921 * On failure the index is set to the insertion point and ENOENT
924 * Boundaries are not stored in leaf nodes, so the index can wind
925 * up to the left of element 0 (index == 0) or past the end of
926 * the array (index == node->count).
928 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
930 for (i = 0; i < node->count; ++i) {
931 r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);
934 * Stop if we've flipped past key_beg
940 * Return an exact match
945 if (hammer_debug_btree) {
946 kprintf("SEARCH-L %p:%d (SUCCESS)\n",
953 if (hammer_debug_btree) {
954 kprintf("SEARCH-L %p:%d (FAILED)\n",
959 * No exact match was found, i is now at the insertion point.
961 * If inserting split a full leaf before returning. This
962 * may have the side effect of adjusting cursor->node and
966 if ((flags & HAMMER_CURSOR_INSERT) &&
967 node->count == HAMMER_BTREE_LEAF_ELMS) {
968 error = btree_split_leaf(cursor);
973 flags &= ~HAMMER_CURSOR_INSERT;
976 * reload stale pointers
980 node = &cursor->node->internal;
985 * We reached a leaf but did not find the key we were looking for.
986 * If this is an insert we will be properly positioned for an insert
987 * (ENOENT) or spike (ENOSPC) operation.
989 error = enospc ? ENOSPC : ENOENT;
995 /************************************************************************
996 * SPLITTING AND MERGING *
997 ************************************************************************
999 * These routines do all the dirty work required to split and merge nodes.
1003 * Split an internal node into two nodes and move the separator at the split
1004 * point to the parent. Note that the parent's parent's element pointing
1005 * to our parent will have an incorrect subtree_count (we don't update it).
1006 * It will be low, which is ok.
1008 * (cursor->node, cursor->index) indicates the element the caller intends
1009 * to push into. We will adjust node and index if that element winds
1010 * up in the split node.
1012 * If we are at the root of a cluster a new root must be created with two
1013 * elements, one pointing to the original root and one pointing to the
1014 * newly allocated split node.
1016 * NOTE! Being at the root of a cluster is different from being at the
1017 * root of the root cluster. cursor->parent will not be NULL and
1018 * cursor->node->ondisk.parent must be tested against 0. Theoretically
1019 * we could propogate the algorithm into the parent and deal with multiple
1020 * 'roots' in the cluster header, but it's easier not to.
1024 btree_split_internal(hammer_cursor_t cursor)
1026 hammer_node_ondisk_t ondisk;
1028 hammer_node_t parent;
1029 hammer_node_t new_node;
1030 hammer_btree_elm_t elm;
1031 hammer_btree_elm_t parent_elm;
1037 const int esize = sizeof(*elm);
1040 * We are splitting but elms[split] will be promoted to the parent,
1041 * leaving the right hand node with one less element. If the
1042 * insertion point will be on the left-hand side adjust the split
1043 * point to give the right hand side one additional node.
1045 node = cursor->node;
1046 ondisk = node->ondisk;
1047 split = (ondisk->count + 1) / 2;
1048 if (cursor->index <= split)
1053 * If we are at the root of the cluster, create a new root node with
1054 * 1 element and split normally. Avoid making major modifications
1055 * until we know the whole operation will work.
1057 * The root of the cluster is different from the root of the root
1058 * cluster. Use the node's on-disk structure's parent offset to
1061 if (ondisk->parent == 0) {
1062 parent = hammer_alloc_btree(node->cluster, &error);
1065 hammer_lock_ex(&parent->lock);
1066 hammer_modify_node(parent);
1067 ondisk = parent->ondisk;
1070 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1071 ondisk->elms[0].base = node->cluster->clu_btree_beg;
1072 ondisk->elms[0].internal.subtree_type = node->ondisk->type;
1073 ondisk->elms[0].internal.subtree_offset = node->node_offset;
1074 ondisk->elms[1].base = node->cluster->clu_btree_end;
1076 parent_index = 0; /* index of current node in parent */
1077 hammer_modify_node_done(parent);
1080 parent = cursor->parent;
1081 parent_index = cursor->parent_index;
1082 KKASSERT(parent->cluster == node->cluster);
1086 * Split node into new_node at the split point.
1088 * B O O O P N N B <-- P = node->elms[split]
1089 * 0 1 2 3 4 5 6 <-- subtree indices
1094 * B O O O B B N N B <--- inner boundary points are 'P'
1098 new_node = hammer_alloc_btree(node->cluster, &error);
1099 if (new_node == NULL) {
1101 hammer_unlock(&parent->lock);
1102 parent->flags |= HAMMER_NODE_DELETED;
1103 hammer_rel_node(parent);
1107 hammer_lock_ex(&new_node->lock);
1110 * Create the new node. P becomes the left-hand boundary in the
1111 * new node. Copy the right-hand boundary as well.
1113 * elm is the new separator.
1115 hammer_modify_node(new_node);
1116 hammer_modify_node(node);
1117 ondisk = node->ondisk;
1118 elm = &ondisk->elms[split];
1119 bcopy(elm, &new_node->ondisk->elms[0],
1120 (ondisk->count - split + 1) * esize);
1121 new_node->ondisk->count = ondisk->count - split;
1122 new_node->ondisk->parent = parent->node_offset;
1123 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1124 KKASSERT(ondisk->type == new_node->ondisk->type);
1127 * Cleanup the original node. P becomes the new boundary, its
1128 * subtree_offset was moved to the new node. If we had created
1129 * a new root its parent pointer may have changed.
1131 elm->internal.subtree_offset = 0;
1132 elm->internal.rec_offset = 0;
1133 ondisk->count = split;
1136 * Insert the separator into the parent, fixup the parent's
1137 * reference to the original node, and reference the new node.
1138 * The separator is P.
1140 * Remember that base.count does not include the right-hand boundary.
1142 hammer_modify_node(parent);
1143 ondisk = parent->ondisk;
1144 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1145 ondisk->elms[parent_index].internal.subtree_count = split;
1146 parent_elm = &ondisk->elms[parent_index+1];
1147 bcopy(parent_elm, parent_elm + 1,
1148 (ondisk->count - parent_index) * esize);
1149 parent_elm->internal.base = elm->base; /* separator P */
1150 parent_elm->internal.subtree_offset = new_node->node_offset;
1151 parent_elm->internal.subtree_count = new_node->ondisk->count;
1152 parent_elm->internal.subtree_type = new_node->ondisk->type;
1153 parent_elm->internal.subtree_vol_no = 0;
1154 parent_elm->internal.rec_offset = 0;
1156 hammer_modify_node_done(parent);
1159 * The children of new_node need their parent pointer set to new_node.
1161 for (i = 0; i < new_node->ondisk->count; ++i) {
1162 elm = &new_node->ondisk->elms[i];
1163 error = btree_set_parent(new_node, elm);
1165 panic("btree_split_internal: btree-fixup problem");
1170 * The cluster's root pointer may have to be updated.
1173 hammer_modify_cluster(node->cluster);
1174 node->cluster->ondisk->clu_btree_root = parent->node_offset;
1175 hammer_modify_cluster_done(node->cluster);
1176 node->ondisk->parent = parent->node_offset;
1177 if (cursor->parent) {
1178 hammer_unlock(&cursor->parent->lock);
1179 hammer_rel_node(cursor->parent);
1181 cursor->parent = parent; /* lock'd and ref'd */
1183 hammer_modify_node_done(new_node);
1184 hammer_modify_node_done(node);
1188 * Ok, now adjust the cursor depending on which element the original
1189 * index was pointing at. If we are >= the split point the push node
1190 * is now in the new node.
1192 * NOTE: If we are at the split point itself we cannot stay with the
1193 * original node because the push index will point at the right-hand
1194 * boundary, which is illegal.
1196 * NOTE: The cursor's parent or parent_index must be adjusted for
1197 * the case where a new parent (new root) was created, and the case
1198 * where the cursor is now pointing at the split node.
1200 if (cursor->index >= split) {
1201 cursor->parent_index = parent_index + 1;
1202 cursor->index -= split;
1203 hammer_unlock(&cursor->node->lock);
1204 hammer_rel_node(cursor->node);
1205 cursor->node = new_node; /* locked and ref'd */
1207 cursor->parent_index = parent_index;
1208 hammer_unlock(&new_node->lock);
1209 hammer_rel_node(new_node);
1213 * Fixup left and right bounds
1215 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1216 cursor->left_bound = &parent_elm[0].internal.base;
1217 cursor->right_bound = &parent_elm[1].internal.base;
1218 KKASSERT(hammer_btree_cmp(cursor->left_bound,
1219 &cursor->node->ondisk->elms[0].internal.base) <= 0);
1220 KKASSERT(hammer_btree_cmp(cursor->right_bound,
1221 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].internal.base) > 0);
1227 * Same as the above, but splits a full leaf node.
1231 btree_split_leaf(hammer_cursor_t cursor)
1233 hammer_node_ondisk_t ondisk;
1234 hammer_node_t parent;
1236 hammer_node_t new_leaf;
1237 hammer_btree_elm_t elm;
1238 hammer_btree_elm_t parent_elm;
1239 hammer_base_elm_t mid_boundary;
1244 const size_t esize = sizeof(*elm);
1247 * Calculate the split point. If the insertion point will be on
1248 * the left-hand side adjust the split point to give the right
1249 * hand side one additional node.
1251 leaf = cursor->node;
1252 ondisk = leaf->ondisk;
1253 split = (ondisk->count + 1) / 2;
1254 if (cursor->index <= split)
1259 * If we are at the root of the tree, create a new root node with
1260 * 1 element and split normally. Avoid making major modifications
1261 * until we know the whole operation will work.
1263 if (ondisk->parent == 0) {
1264 parent = hammer_alloc_btree(leaf->cluster, &error);
1267 hammer_lock_ex(&parent->lock);
1268 hammer_modify_node(parent);
1269 ondisk = parent->ondisk;
1272 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1273 ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1274 ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1275 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1276 ondisk->elms[1].base = leaf->cluster->clu_btree_end;
1277 hammer_modify_node_done(parent);
1279 parent_index = 0; /* insertion point in parent */
1282 parent = cursor->parent;
1283 parent_index = cursor->parent_index;
1284 KKASSERT(parent->cluster == leaf->cluster);
1288 * Split leaf into new_leaf at the split point. Select a separator
1289 * value in-between the two leafs but with a bent towards the right
1290 * leaf since comparisons use an 'elm >= separator' inequality.
1299 new_leaf = hammer_alloc_btree(leaf->cluster, &error);
1300 if (new_leaf == NULL) {
1302 hammer_unlock(&parent->lock);
1303 parent->flags |= HAMMER_NODE_DELETED;
1304 hammer_rel_node(parent);
1308 hammer_lock_ex(&new_leaf->lock);
1311 * Create the new node. P become the left-hand boundary in the
1312 * new node. Copy the right-hand boundary as well.
1314 hammer_modify_node(leaf);
1315 hammer_modify_node(new_leaf);
1316 ondisk = leaf->ondisk;
1317 elm = &ondisk->elms[split];
1318 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1319 new_leaf->ondisk->count = ondisk->count - split;
1320 new_leaf->ondisk->parent = parent->node_offset;
1321 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1322 KKASSERT(ondisk->type == new_leaf->ondisk->type);
1325 * Cleanup the original node. Because this is a leaf node and
1326 * leaf nodes do not have a right-hand boundary, there
1327 * aren't any special edge cases to clean up. We just fixup the
1330 ondisk->count = split;
1333 * Insert the separator into the parent, fixup the parent's
1334 * reference to the original node, and reference the new node.
1335 * The separator is P.
1337 * Remember that base.count does not include the right-hand boundary.
1338 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1340 hammer_modify_node(parent);
1341 ondisk = parent->ondisk;
1342 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1343 ondisk->elms[parent_index].internal.subtree_count = split;
1344 parent_elm = &ondisk->elms[parent_index+1];
1345 bcopy(parent_elm, parent_elm + 1,
1346 (ondisk->count - parent_index) * esize);
1347 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1348 parent_elm->internal.subtree_offset = new_leaf->node_offset;
1349 parent_elm->internal.subtree_count = new_leaf->ondisk->count;
1350 parent_elm->internal.subtree_type = new_leaf->ondisk->type;
1351 parent_elm->internal.subtree_vol_no = 0;
1352 parent_elm->internal.rec_offset = 0;
1353 mid_boundary = &parent_elm->base;
1355 hammer_modify_node_done(parent);
1358 * The cluster's root pointer may have to be updated.
1361 hammer_modify_cluster(leaf->cluster);
1362 leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1363 hammer_modify_cluster_done(leaf->cluster);
1364 leaf->ondisk->parent = parent->node_offset;
1365 if (cursor->parent) {
1366 hammer_unlock(&cursor->parent->lock);
1367 hammer_rel_node(cursor->parent);
1369 cursor->parent = parent; /* lock'd and ref'd */
1371 hammer_modify_node_done(leaf);
1372 hammer_modify_node_done(new_leaf);
1375 * Ok, now adjust the cursor depending on which element the original
1376 * index was pointing at. If we are >= the split point the push node
1377 * is now in the new node.
1379 * NOTE: If we are at the split point itself we need to select the
1380 * old or new node based on where key_beg's insertion point will be.
1381 * If we pick the wrong side the inserted element will wind up in
1382 * the wrong leaf node and outside that node's bounds.
1384 if (cursor->index > split ||
1385 (cursor->index == split &&
1386 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) {
1387 cursor->parent_index = parent_index + 1;
1388 cursor->index -= split;
1389 hammer_unlock(&cursor->node->lock);
1390 hammer_rel_node(cursor->node);
1391 cursor->node = new_leaf;
1393 cursor->parent_index = parent_index;
1394 hammer_unlock(&new_leaf->lock);
1395 hammer_rel_node(new_leaf);
1399 * Fixup left and right bounds
1401 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1402 cursor->left_bound = &parent_elm[0].internal.base;
1403 cursor->right_bound = &parent_elm[1].internal.base;
1404 KKASSERT(hammer_btree_cmp(cursor->left_bound,
1405 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1406 KKASSERT(hammer_btree_cmp(cursor->right_bound,
1407 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1413 * Attempt to remove the empty B-Tree node at (cursor->node). Returns 0
1414 * on success, EAGAIN if we could not acquire the necessary locks, or some
1417 * On return the cursor may end up pointing at an internal node, suitable
1418 * for further iteration but not for an immediate insertion or deletion.
1420 * cursor->node may be an internal node or a leaf node.
1422 * NOTE: If cursor->node has one element it is the parent trying to delete
1423 * that element, make sure cursor->index is properly adjusted on success.
1426 btree_remove(hammer_cursor_t cursor)
1428 hammer_node_ondisk_t ondisk;
1429 hammer_btree_elm_t elm;
1432 hammer_node_t parent;
1437 * If we are at the root of the root cluster there is nothing to
1438 * remove, but an internal node at the root of a cluster is not
1439 * allowed to be empty so convert it to a leaf node.
1441 if (cursor->parent == NULL) {
1442 hammer_modify_node(cursor->node);
1443 ondisk = cursor->node->ondisk;
1444 KKASSERT(ondisk->parent == 0);
1445 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1448 hammer_modify_node_done(cursor->node);
1449 kprintf("EMPTY ROOT OF ROOT CLUSTER -> LEAF\n");
1454 * Retain a reference to cursor->node, ex-lock again (2 locks now)
1455 * so we do not lose the lock when we cursor around.
1457 save = cursor->node;
1458 hammer_ref_node(save);
1459 hammer_lock_ex(&save->lock);
1462 * We need to be able to lock the parent of the parent. Do this
1463 * non-blocking and return EAGAIN if the lock cannot be acquired.
1464 * non-blocking is required in order to avoid a deadlock.
1466 * After we cursor up, parent is moved to node and the new parent
1467 * is the parent of the parent.
1469 error = hammer_cursor_up(cursor, 1);
1471 kprintf("BTREE_REMOVE: Cannot lock parent, skipping\n");
1476 * At this point we want to remove the element at (node, index),
1477 * which is now the (original) parent pointing to the saved node.
1478 * Removing the element allows us to then free the node it was
1481 * However, an internal node is not allowed to have 0 elements, so
1482 * if the count would drop to 0 we have to recurse. It is possible
1483 * for the recursion to fail.
1485 * NOTE: The cursor is in an indeterminant position after recursing,
1486 * but will still be suitable for an iteration.
1488 node = cursor->node;
1489 KKASSERT(node->ondisk->count > 0);
1490 if (node->ondisk->count == 1) {
1491 error = btree_remove(cursor);
1493 /*kprintf("BTREE_REMOVE: Successful!\n");*/
1496 kprintf("BTREE_REMOVE: Recursion failed %d\n", error);
1502 * Remove the element at (node, index) and adjust the parent's
1505 * NOTE! If removing element 0 an internal node's left-hand boundary
1506 * will no longer match its parent. If removing a mid-element the
1507 * boundary will no longer match a child's left hand or right hand
1510 * BxBxBxB remove a (x[0]): internal node's left-hand
1511 * | | | boundary no longer matches
1514 * remove b (x[1]): a's right hand boundary no
1515 * longer matches parent.
1517 * remove c (x[2]): b's right hand boundary no
1518 * longer matches parent.
1520 * These cases are corrected in btree_search().
1523 kprintf("BTREE_REMOVE: Removing element %d\n", cursor->index);
1525 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
1526 KKASSERT(cursor->index < node->ondisk->count);
1527 hammer_modify_node(node);
1528 ondisk = node->ondisk;
1530 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
1531 (ondisk->count - i) * sizeof(ondisk->elms[0]));
1533 hammer_modify_node_done(node);
1536 * Adjust the parent-parent's (now parent) reference to the parent
1539 if ((parent = cursor->parent) != NULL) {
1540 elm = &parent->ondisk->elms[cursor->parent_index];
1541 if (elm->internal.subtree_count != ondisk->count) {
1542 hammer_modify_node(parent);
1543 elm->internal.subtree_count = ondisk->count;
1544 hammer_modify_node_done(parent);
1546 if (elm->subtree_type != HAMMER_BTREE_TYPE_CLUSTER &&
1547 elm->subtree_type != ondisk->type) {
1548 hammer_modify_node(parent);
1549 elm->subtree_type = ondisk->type;
1550 hammer_modify_node_done(parent);
1556 * Free the saved node. If the saved node was the root of a
1557 * cluster, free the entire cluster.
1559 hammer_flush_node(save);
1560 save->flags |= HAMMER_NODE_DELETED;
1564 hammer_unlock(&save->lock);
1565 hammer_rel_node(save);
1570 * The child represented by the element in internal node node needs
1571 * to have its parent pointer adjusted.
1575 btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm)
1577 hammer_volume_t volume;
1578 hammer_cluster_t cluster;
1579 hammer_node_t child;
1584 switch(elm->internal.subtree_type) {
1585 case HAMMER_BTREE_TYPE_LEAF:
1586 case HAMMER_BTREE_TYPE_INTERNAL:
1587 child = hammer_get_node(node->cluster,
1588 elm->internal.subtree_offset, &error);
1590 hammer_modify_node(child);
1591 hammer_lock_ex(&child->lock);
1592 child->ondisk->parent = node->node_offset;
1593 hammer_unlock(&child->lock);
1594 hammer_modify_node_done(child);
1595 hammer_rel_node(child);
1598 case HAMMER_BTREE_TYPE_CLUSTER:
1599 volume = hammer_get_volume(node->cluster->volume->hmp,
1600 elm->internal.subtree_vol_no, &error);
1603 cluster = hammer_get_cluster(volume,
1604 elm->internal.subtree_clu_no,
1606 hammer_rel_volume(volume, 0);
1609 hammer_modify_cluster(cluster);
1610 hammer_lock_ex(&cluster->io.lock);
1611 cluster->ondisk->clu_btree_parent_offset = node->node_offset;
1612 hammer_unlock(&cluster->io.lock);
1613 hammer_modify_cluster_done(cluster);
1614 KKASSERT(cluster->ondisk->clu_btree_parent_clu_no ==
1615 node->cluster->clu_no);
1616 KKASSERT(cluster->ondisk->clu_btree_parent_vol_no ==
1617 node->cluster->volume->vol_no);
1618 hammer_rel_cluster(cluster, 0);
1621 hammer_print_btree_elm(elm, HAMMER_BTREE_TYPE_INTERNAL, -1);
1622 panic("btree_set_parent: bad subtree_type");
1623 break; /* NOT REACHED */
1631 * This routine is only called if the cursor is at the root node and the
1632 * root node is an internal node. We attempt to collapse the root node
1633 * by replacing it with all of its children, reducing tree depth by one.
1635 * This is the only way to reduce tree depth in a HAMMER filesystem.
1636 * Note that all leaf nodes are at the same depth.
1638 * This is a fairly expensive operation because we not only have to load
1639 * the root's children, we also have to scan each child and adjust the
1640 * parent offset for each element in each child. Nasty all around.
1644 btree_collapse(hammer_cursor_t cursor)
1646 hammer_btree_node_ondisk_t root, child;
1647 hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
1648 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1653 int32_t root_offset;
1654 u_int8_t subsubtype;
1656 root = cursor->node;
1657 count = root->base.count;
1658 root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1661 * Sum up the number of children each element has. This value is
1662 * only approximate due to the way the insertion node works. It
1663 * may be too small but it will never be too large.
1665 * Quickly terminate the collapse if the elements have too many
1668 KKASSERT(root->base.parent == 0); /* must be root node */
1669 KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
1670 KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1672 for (i = n = 0; i < count; ++i) {
1673 n += root->internal.elms[i].subtree_count;
1675 if (n > btree_max_elements(root->base.subtype))
1679 * Iterate through the elements again and correct the subtree_count.
1680 * Terminate the collapse if we wind up with too many.
1685 for (i = n = 0; i < count; ++i) {
1686 struct hammer_btree_internal_elm *elm;
1688 elm = &root->internal.elms[i];
1689 child_buffer[i] = NULL;
1691 if (elm->subtree_offset == 0)
1693 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1694 HAMMER_FSBUF_BTREE, &error,
1695 &child_buffer[i], XXX);
1696 children[i] = child;
1699 KKASSERT(root->base.subtype == child->base.type);
1702 * Accumulate n for a good child, update the root's count
1705 if (root->internal.elms[i].subtree_count != child->base.count) {
1706 root->internal.elms[i].subtree_count = child->base.count;
1709 n += root->internal.elms[i].subtree_count;
1711 if (error || n > btree_max_elements(root->base.subtype))
1715 * Ok, we can collapse the root. If the root's children are leafs
1716 * the collapse is really simple. If they are internal nodes the
1717 * collapse is not so simple because we have to fixup the parent
1718 * pointers for the root's children's children.
1720 * When collapsing an internal node the far left and far right
1721 * element's boundaries should match the root's left and right
1724 if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
1725 for (i = n = 0; i < count; ++i) {
1726 child = children[i];
1727 for (j = 0; j < child->base.count; ++j) {
1728 root->leaf.elms[n] = child->leaf.elms[j];
1732 root->base.type = root->base.subtype;
1733 root->base.subtype = 0;
1734 root->base.count = n;
1735 root->leaf.link_left = 0;
1736 root->leaf.link_right = 0;
1738 struct hammer_btree_internal_elm *elm;
1739 struct hammer_btree_internal_node *subchild;
1740 struct hammer_buffer *subchild_buffer = NULL;
1743 child = children[0];
1744 subsubtype = child->base.subtype;
1745 KKASSERT(child->base.count > 0);
1746 KKASSERT(root->internal.elms[0].base.key ==
1747 child->internal.elms[0].base.key);
1748 child = children[count-1];
1749 KKASSERT(child->base.count > 0);
1750 KKASSERT(root->internal.elms[count].base.key ==
1751 child->internal.elms[child->base.count].base.key);
1755 for (i = n = 0; i < count; ++i) {
1756 child = children[i];
1757 KKASSERT(child->base.subtype == subsubtype);
1758 for (j = 0; j < child->base.count; ++j) {
1759 elm = &child->internal.elms[j];
1761 root->internal.elms[n] = *elm;
1762 subchild = hammer_bread(cursor->cluster,
1763 elm->subtree_offset,
1769 subchild->base.parent = root_offset;
1770 hammer_modify_buffer(subchild_buffer);
1774 /* make sure the right boundary is correct */
1775 /* (this gets overwritten when the loop continues) */
1776 /* XXX generate a new separator? */
1777 root->internal.elms[n] = child->internal.elms[j];
1779 root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
1780 root->base.subtype = subsubtype;
1781 if (subchild_buffer)
1782 hammer_put_buffer(subchild_buffer, 0);
1791 hammer_modify_buffer(cursor->node_buffer);
1792 for (i = 0; i < count; ++i) {
1793 if (child_buffer[i])
1794 hammer_put_buffer(child_buffer[i], 0);
1801 /************************************************************************
1802 * MISCELLANIOUS SUPPORT *
1803 ************************************************************************/
1806 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
1808 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1811 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1813 if (key1->obj_id < key2->obj_id)
1815 if (key1->obj_id > key2->obj_id)
1818 if (key1->rec_type < key2->rec_type)
1820 if (key1->rec_type > key2->rec_type)
1823 if (key1->key < key2->key)
1825 if (key1->key > key2->key)
1831 * Test a non-zero timestamp against an element to determine whether the
1832 * element is visible.
1835 hammer_btree_chkts(hammer_tid_t create_tid, hammer_base_elm_t base)
1837 if (create_tid < base->create_tid)
1839 if (base->delete_tid && create_tid >= base->delete_tid)
1845 * Create a separator half way inbetween key1 and key2. For fields just
1846 * one unit apart, the separator will match key2.
1848 * The handling of delete_tid is a little confusing. It is only possible
1849 * to have one record in the B-Tree where all fields match except delete_tid.
1850 * This means, worse case, two adjacent elements may have a create_tid that
1851 * is one-apart and cause the separator to choose the right-hand element's
1852 * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x).
1854 * So all we have to do is set delete_tid to the right-hand element to
1855 * guarentee that the separator is properly between the two elements.
1857 #define MAKE_SEPARATOR(key1, key2, dest, field) \
1858 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1861 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1862 hammer_base_elm_t dest)
1864 bzero(dest, sizeof(*dest));
1865 MAKE_SEPARATOR(key1, key2, dest, obj_id);
1866 MAKE_SEPARATOR(key1, key2, dest, rec_type);
1867 MAKE_SEPARATOR(key1, key2, dest, key);
1868 MAKE_SEPARATOR(key1, key2, dest, create_tid);
1869 dest->delete_tid = key2->delete_tid;
1872 #undef MAKE_SEPARATOR
1875 * Return whether a generic internal or leaf node is full
1878 btree_node_is_full(hammer_node_ondisk_t node)
1880 switch(node->type) {
1881 case HAMMER_BTREE_TYPE_INTERNAL:
1882 if (node->count == HAMMER_BTREE_INT_ELMS)
1885 case HAMMER_BTREE_TYPE_LEAF:
1886 if (node->count == HAMMER_BTREE_LEAF_ELMS)
1890 panic("illegal btree subtype");
1897 btree_max_elements(u_int8_t type)
1899 if (type == HAMMER_BTREE_TYPE_LEAF)
1900 return(HAMMER_BTREE_LEAF_ELMS);
1901 if (type == HAMMER_BTREE_TYPE_INTERNAL)
1902 return(HAMMER_BTREE_INT_ELMS);
1903 panic("btree_max_elements: bad type %d\n", type);
1908 hammer_print_btree_node(hammer_node_ondisk_t ondisk)
1910 hammer_btree_elm_t elm;
1913 kprintf("node %p count=%d parent=%d type=%c\n",
1914 ondisk, ondisk->count, ondisk->parent, ondisk->type);
1917 * Dump both boundary elements if an internal node
1919 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1920 for (i = 0; i <= ondisk->count; ++i) {
1921 elm = &ondisk->elms[i];
1922 hammer_print_btree_elm(elm, ondisk->type, i);
1925 for (i = 0; i < ondisk->count; ++i) {
1926 elm = &ondisk->elms[i];
1927 hammer_print_btree_elm(elm, ondisk->type, i);
1933 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
1936 kprintf("\tobjid = %016llx\n", elm->base.obj_id);
1937 kprintf("\tkey = %016llx\n", elm->base.key);
1938 kprintf("\tcreate_tid = %016llx\n", elm->base.create_tid);
1939 kprintf("\tdelete_tid = %016llx\n", elm->base.delete_tid);
1940 kprintf("\trec_type = %04x\n", elm->base.rec_type);
1941 kprintf("\tobj_type = %02x\n", elm->base.obj_type);
1942 kprintf("\tsubtree_type = %02x\n", elm->subtree_type);
1944 if (type == HAMMER_BTREE_TYPE_INTERNAL) {
1945 if (elm->internal.rec_offset) {
1946 kprintf("\tcluster_rec = %08x\n",
1947 elm->internal.rec_offset);
1948 kprintf("\tcluster_id = %08x\n",
1949 elm->internal.subtree_clu_no);
1950 kprintf("\tvolno = %08x\n",
1951 elm->internal.subtree_vol_no);
1953 kprintf("\tsubtree_off = %08x\n",
1954 elm->internal.subtree_offset);
1956 kprintf("\tsubtree_count= %d\n", elm->internal.subtree_count);
1958 kprintf("\trec_offset = %08x\n", elm->leaf.rec_offset);
1959 kprintf("\tdata_offset = %08x\n", elm->leaf.data_offset);
1960 kprintf("\tdata_len = %08x\n", elm->leaf.data_len);
1961 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc);