2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.2 2007/11/02 00:57:15 dillon Exp $
38 * HAMMER BH-Tree index
40 * HAMMER implements a modified B+Tree. In documentation this will
41 * simply be referred to as the HAMMER BH-Tree. Basically a BH-Tree
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
45 * other words, we have boundary elements at the two ends of a BH-Tree node
46 * instead of sub-tree pointers.
48 * A BH-Tree internal node looks like this:
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
53 * A BH-Tree leaf node basically looks like this:
55 * L L L L L L L L <-- leaf elements
57 * The recursion radix is reduced by 2 relative to a normal B-Tree but
58 * we get a number of significant benefits for our troubles.
60 * The big benefit to using a BH-Tree is that it is possible to cache
61 * pointers into the middle of the tree and not have to start searches,
62 * insertions, OR deletions at the root node. In particular, searches are
63 * able to progress in a definitive direction from any point in the tree
64 * without revisiting nodes. This greatly improves the efficiency of many
65 * operations, most especially record appends.
67 * BH-Trees also make the stacking of trees fairly straightforward.
/*
 * Forward declarations for the file-local BH-Tree helpers defined below.
 * NOTE(review): this copy of the file has dropped lines (the embedded
 * original line numbers skip values); e.g. the trailing parameters of
 * the btree_search() prototype are missing here.
 */
73 static int btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key,
75 static int btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
76 struct hammer_cluster *cluster);
77 static int btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
78 int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id);
79 static int btree_cursor_up(hammer_btree_cursor_t cursor);
80 static int btree_cursor_down(hammer_btree_cursor_t cursor);
81 static int btree_split_internal(hammer_btree_cursor_t cursor);
82 static int btree_split_leaf(hammer_btree_cursor_t cursor);
83 static int btree_rebalance_node(hammer_btree_cursor_t cursor);
84 static int btree_collapse(hammer_btree_cursor_t cursor);
87 * Compare two BH-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
/*
 * NOTE(review): the embedded original line numbers show gaps in this copy —
 * the return statements between the comparisons below, the function braces,
 * and the conditional compilation around the debug kprintf block are missing.
 * Do not treat this body as compilable; restore from upstream before editing.
 */
90 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
/* Debug trace of the three major comparison fields. */
93 kprintf("compare obj_id %016llx %016llx\n",
94 key1->obj_id, key2->obj_id);
95 kprintf("compare rec_type %04x %04x\n",
96 key1->rec_type, key2->rec_type);
97 kprintf("compare key %016llx %016llx\n",
98 key1->key, key2->key);
/* Compare major-to-minor: obj_id, then rec_type, then key. */
101 if (key1->obj_id < key2->obj_id)
103 if (key1->obj_id > key2->obj_id)
106 if (key1->rec_type < key2->rec_type)
108 if (key1->rec_type > key2->rec_type)
111 if (key1->key < key2->key)
113 if (key1->key > key2->key)
117 * This test has a number of special cases. create_tid in key1 is
118 * the as-of transaction id, and delete_tid in key1 is NOT USED.
120 * A key1->create_tid of 0 indicates 'the most recent record if not
121 * deleted'. key2->create_tid is a HAMMER record and will never be
122 * 0. key2->delete_tid is the deletion transaction id or 0 if
123 * the record has not yet been deleted.
125 if (key1->create_tid == 0) {
126 if (key2->delete_tid)
129 if (key1->create_tid < key2->create_tid)
131 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
139 * Create a separator half way in between key1 and key2. For fields just
140 * one unit apart, the separator will match key2.
/*
 * The macro computes the midpoint of one base-element field, rounding
 * toward key2 so that an 'elm >= separator' comparison lands correctly.
 * Evaluates its arguments more than once — callers pass plain pointers.
 */
142 #define MAKE_SEPARATOR(key1, key2, dest, field) \
143 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
146 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
147 hammer_base_elm_t dest)
149 MAKE_SEPARATOR(key1, key2, dest, obj_id);
150 MAKE_SEPARATOR(key1, key2, dest, rec_type);
151 MAKE_SEPARATOR(key1, key2, dest, key);
152 MAKE_SEPARATOR(key1, key2, dest, create_tid);
/* Separators never carry a delete_tid; clear reserved space as well. */
153 dest->delete_tid = 0;
155 dest->reserved07 = 0;
158 #undef MAKE_SEPARATOR
161 * Return whether a generic internal or leaf node is full
/*
 * NOTE(review): the switch statement header, return statements, and braces
 * are missing from this copy (original line-number gaps); only the case
 * labels and capacity comparisons survive.
 */
164 btree_node_is_full(struct hammer_base_node *node)
167 case HAMMER_BTREE_INTERNAL_NODE:
168 if (node->count == HAMMER_BTREE_INT_ELMS)
171 case HAMMER_BTREE_LEAF_NODE:
172 if (node->count == HAMMER_BTREE_LEAF_ELMS)
/* Any other node type indicates on-disk corruption or a coding error. */
176 panic("illegal btree subtype");
/*
 * Return the element capacity for a node of the given type.
 * Panics on an unknown type rather than returning a bogus capacity.
 */
182 btree_max_elements(u_int8_t type)
184 if (type == HAMMER_BTREE_LEAF_NODE)
185 return(HAMMER_BTREE_LEAF_ELMS);
186 if (type == HAMMER_BTREE_INTERNAL_NODE)
187 return(HAMMER_BTREE_INT_ELMS);
188 panic("btree_max_elements: bad type %d\n", type);
192 * Initialize a cursor, setting the starting point for a BH-Tree search.
194 * The passed cluster must be locked. This function will add a reference
/*
 * Fields are NULLed first so btree_cursor_set_cluster() sees a clean
 * cursor; it acquires the cluster reference and loads the root node.
 */
198 hammer_btree_cursor_init(hammer_btree_cursor_t cursor,
199 struct hammer_cluster *cluster)
203 cursor->cluster = NULL;
204 cursor->node_buffer = NULL;
205 cursor->parent_buffer = NULL;
207 cursor->parent = NULL;
209 cursor->parent_index = 0;
210 error = btree_cursor_set_cluster(cursor, cluster);
215 * Clean up a HAMMER BH-Tree cursor after we are finished using it.
/*
 * Releases the node buffer, parent buffer, and cluster references in
 * that order, then clears the remaining cursor fields so a stale
 * cursor cannot be mistaken for a live one.
 */
218 hammer_btree_cursor_done(hammer_btree_cursor_t cursor)
220 if (cursor->node_buffer) {
221 hammer_put_buffer(cursor->node_buffer);
222 cursor->node_buffer = NULL;
224 if (cursor->parent_buffer) {
225 hammer_put_buffer(cursor->parent_buffer);
226 cursor->parent_buffer = NULL;
228 if (cursor->cluster) {
229 hammer_put_cluster(cursor->cluster);
230 cursor->cluster = NULL;
233 cursor->parent = NULL;
234 cursor->left_bound = NULL;
235 cursor->right_bound = NULL;
237 cursor->parent_index = 0;
241 * Initialize a btree info structure and its associated cursor prior to
242 * running a BH-Tree operation.
/* Data/record buffers are lazily filled by hammer_btree_lookup(). */
245 hammer_btree_info_init(hammer_btree_info_t info, struct hammer_cluster *cluster)
249 error = hammer_btree_cursor_init(&info->cursor, cluster);
250 info->data_buffer = NULL;
251 info->record_buffer = NULL;
258 * Clean up a BH-Tree info structure after we are finished using it.
/* Tears down the embedded cursor, then drops any lookup buffers. */
261 hammer_btree_info_done(hammer_btree_info_t info)
263 hammer_btree_cursor_done(&info->cursor);
264 if (info->data_buffer) {
265 hammer_put_buffer(info->data_buffer);
266 info->data_buffer = NULL;
268 if (info->record_buffer) {
269 hammer_put_buffer(info->record_buffer);
270 info->record_buffer = NULL;
277 * Search a cluster's BH-Tree. Return the matching node. The search
278 * normally traverses clusters but will terminate at a cluster entry
279 * in a leaf if HAMMER_BTREE_CLUSTER_TAG is specified. If this flag
280 * is specified EIO is returned if the search would otherwise have to
281 * cursor up into a parent cluster.
283 * The cursor must be completely initialized on entry. If the cursor is
284 * at the root of a cluster, the parent pointer & buffer may be NULL (in
285 * that case the bounds point to the bounds in the cluster header). The
286 * node_buffer and node must always be valid.
288 * The search code may be forced to iterate up the tree if the conditions
289 * required for an insertion or deletion are not met. This does not occur
292 * INSERTIONS: The search will split full nodes and leaves on its way down
293 * and guarantee that the leaf it ends up on is not full.
295 * DELETIONS: The search will rebalance the tree on its way down.
/*
 * NOTE(review): this copy of the file has dropped many lines (original
 * line-number gaps): braces, return statements, error checks and some
 * assignments are missing throughout this function.
 */
299 btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, int flags)
301 struct hammer_btree_leaf_node *leaf;
307 * Move our cursor up the tree until we find a node whose range covers
308 * the key we are trying to locate. This may move us between
311 * Since the root cluster always has a root BH-Tree node, info->node
312 * is always non-NULL if no error occurred. The parent field will be
313 * non-NULL unless we are at the root of a cluster.
315 while (hammer_btree_cmp(key, cursor->left_bound) < 0 ||
316 hammer_btree_cmp(key, cursor->right_bound) >= 0) {
318 * Must stay in current cluster if flagged, code should never
319 * use the flag if it wants us to traverse to the parent
322 if (cursor->parent == NULL &&
323 (flags & HAMMER_BTREE_CLUSTER_TAG)) {
326 error = btree_cursor_up(cursor);
330 KKASSERT(cursor->node != NULL);
333 * If we are inserting we can't start at a full node if the parent
334 * is also full (because there is no way to split the node),
335 * continue running up the tree until we hit the root of the
336 * current cluster or until the requirement is satisfied.
338 while (flags & HAMMER_BTREE_INSERT) {
339 if (btree_node_is_full(&cursor->node->base) == 0)
341 if (cursor->parent == NULL)
343 if (cursor->parent->base.count != HAMMER_BTREE_INT_ELMS)
345 error = btree_cursor_up(cursor);
351 * If we are deleting we can't start at a node with only one element
352 * unless it is root, because all of our code assumes that nodes
353 * will never be empty.
355 * This handles the case where the cursor is sitting at a leaf and
356 * either the leaf or parent contain an insufficient number of
359 while (flags & HAMMER_BTREE_DELETE) {
360 if (cursor->node->base.count > 1)
362 if (cursor->parent == NULL)
364 KKASSERT(cursor->node->base.count != 0);
365 error = btree_cursor_up(cursor);
372 * Push down through internal nodes to locate the requested key.
374 while (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) {
375 struct hammer_btree_internal_node *node;
378 * If we are at the root node and deleting, try to collapse
379 * all of the root's children into the root. This is the
380 * only point where tree depth is reduced.
382 if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) {
383 error = btree_collapse(cursor);
389 * Scan the node (XXX binary search) to find the subtree
390 * index to push down into. We go one-past, then back-up.
391 * The key should never be less than the left-hand boundary
392 * so I should never wind up 0.
394 node = &cursor->node->internal;
395 for (i = 0; i < node->base.count; ++i) {
396 r = hammer_btree_cmp(key, &node->elms[i].base);
403 * The push-down index is now i - 1.
409 * Handle insertion and deletion requirements.
411 * If inserting split full nodes. The split code will
412 * adjust cursor->node and cursor->index if the current
413 * index winds up in the new node.
415 if (flags & HAMMER_BTREE_INSERT) {
416 if (node->base.count == HAMMER_BTREE_INT_ELMS) {
417 error = btree_split_internal(cursor);
421 * reload stale pointers
424 node = &cursor->node->internal;
429 * If deleting rebalance - do not allow the child to have
430 * just one element or we will not be able to delete it.
432 * Neither internal or leaf nodes (except a root-leaf) are
433 * allowed to drop to 0 elements.
435 * Our separators may have been reorganized after rebalancing,
436 * so we have to pop back up and rescan.
438 if (flags & HAMMER_BTREE_DELETE) {
439 if (node->elms[i].subtree_count <= 1) {
440 error = btree_rebalance_node(cursor);
443 /* cursor->index is invalid after call */
449 * Push down (push into new node, existing node becomes
452 error = btree_cursor_down(cursor);
459 * We are at a leaf, do a linear search of the key array.
460 * (XXX do a binary search). On success the index is set to the
461 * matching element, on failure the index is set to the insertion
464 * Boundaries are not stored in leaf nodes, so the index can wind
465 * up to the left of element 0 (index == 0) or past the end of
466 * the array (index == leaf->base.count).
468 leaf = &cursor->node->leaf;
469 KKASSERT(leaf->base.count <= HAMMER_BTREE_LEAF_ELMS);
471 for (i = 0; i < leaf->base.count; ++i) {
472 r = hammer_btree_cmp(key, &leaf->elms[i].base);
477 * Return an exact match unless this is a cluster
478 * element. If it is, and the cluster tag flag has
479 * not been set, push into the new cluster and
480 * continue the search.
483 if ((leaf->elms[i].base.obj_type &
484 HAMMER_OBJTYPE_CLUSTER_FLAG) &&
485 (flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
486 error = btree_cursor_down(cursor);
496 * We could not find an exact match. Check for a cluster
497 * recursion. The cluster's range is bracketed by two
498 * leaf elements. One of the two must be in this node.
500 if ((flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
501 if (i == leaf->base.count) {
502 if (leaf->elms[i-1].base.obj_type &
503 HAMMER_OBJTYPE_CLUSTER_FLAG) {
504 cursor->index = i - 1;
505 error = btree_cursor_down(cursor);
511 if (leaf->elms[i].base.obj_type &
512 HAMMER_OBJTYPE_CLUSTER_FLAG) {
514 error = btree_cursor_down(cursor);
523 * If inserting split a full leaf before returning. This
524 * may have the side effect of adjusting cursor->node and
527 * We delayed the split in order to avoid any unnecessary splits.
529 * XXX parent's parent's subtree_count will be wrong after
530 * this (keep parent of parent around too? ugh).
533 if ((flags & HAMMER_BTREE_INSERT) &&
534 leaf->base.count == HAMMER_BTREE_LEAF_ELMS) {
535 error = btree_split_leaf(cursor);
/*
 * NOTE(review): reinterpreting the leaf cursor node as internal here —
 * presumably part of a disabled/dead subtree_count fixup whose
 * surrounding lines are missing from this copy; verify against upstream.
 */
538 node = &cursor->node->internal;
547 * Look up the key in the HAMMER BH-Tree and fill in the rest of the
548 * info structure with the results according to flags. 0 is returned on
549 * success, non-zero on error.
551 * The caller initializes (key, cluster) and makes the call. The cluster
552 * must be referenced and locked on call. The function can chain through
553 * multiple clusters and will replace the passed cluster as it does so,
554 * dereferencing and unlocking it, and referencing and locking the chain.
555 * On return info->cluster will still be referenced and locked but may
556 * represent a different cluster.
559 hammer_btree_lookup(hammer_btree_info_t info, hammer_base_elm_t key, int flags)
561 hammer_btree_leaf_elm_t elm;
562 struct hammer_cluster *cluster;
567 error = btree_search(&info->cursor, key, flags);
572 * Extract the record and data reference if requested.
574 * A cluster record type has no data reference, the information
575 * is stored directly in the record and BH-Tree element.
577 * The case where the data reference resolves to the same buffer
578 * as the record reference must be handled.
580 elm = &info->cursor.node->leaf.elms[info->cursor.index];
581 iscl = (elm->base.obj_type & HAMMER_OBJTYPE_CLUSTER_FLAG) != 0;
582 cluster = info->cursor.cluster;
583 if ((flags & HAMMER_BTREE_GET_RECORD) && error == 0) {
584 cloff = iscl ? elm->cluster.rec_offset : elm->record.rec_offset;
587 info->rec = hammer_bread(cluster, cloff, HAMMER_FSBUF_RECORDS,
588 &error, &info->record_buffer);
592 if ((flags & HAMMER_BTREE_GET_DATA) && iscl == 0 && error == 0) {
593 if ((cloff ^ elm->record.data_offset) & ~HAMMER_BUFMASK) {
594 info->data = hammer_bread(cluster,
595 elm->record.data_offset,
/*
 * NOTE(review): this branch reads into &info->record_buffer while the
 * else-branch below dereferences info->data_buffer->ondisk — the two
 * look inconsistent (likely should be &info->data_buffer here).
 * Lines are missing from this copy; verify against upstream.
 */
597 &error, &info->record_buffer);
599 info->data = (void *)
600 ((char *)info->data_buffer->ondisk +
601 (elm->record.data_offset & HAMMER_BUFMASK));
609 * Insert a record into a BH-Tree's cluster. The caller has already
610 * reserved space for the record and data and must handle a ENOSPC
614 hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm)
616 struct hammer_btree_cursor *cursor;
617 struct hammer_btree_internal_node *parent;
618 struct hammer_btree_leaf_node *leaf;
623 * Issue a search to get our cursor at the right place. The search
624 * will get us to a leaf node.
626 * The search also does some setup for our insert, so there is always
/* Insertion requires the key to be absent: anything but ENOENT bails. */
629 error = btree_search(&info->cursor, &elm->base, HAMMER_BTREE_INSERT);
630 if (error != ENOENT) {
637 * Insert the element at the leaf node and update the count in the
638 * parent. It is possible for parent to be NULL, indicating that
639 * the root of the BH-Tree in the cluster is a leaf. It is also
640 * possible for the leaf to be empty.
642 * Remember that the right-hand boundary is not included in the
645 cursor = &info->cursor;
646 leaf = &cursor->node->leaf;
648 KKASSERT(leaf->base.count < HAMMER_BTREE_LEAF_ELMS);
/*
 * Shift elements [i..count-1] up one slot to open the insertion point.
 * NOTE(review): the '- i + 1' count moves one element beyond the used
 * range (writes elms[count+1]); count - i looks sufficient. The line
 * assigning i is missing from this copy — verify against upstream.
 */
649 bcopy(&leaf->elms[i], &leaf->elms[i+1],
650 (leaf->base.count - i + 1) * sizeof(leaf->elms[0]));
651 leaf->elms[i] = *elm;
654 if ((parent = cursor->parent) != NULL) {
655 i = cursor->parent_index;
656 ++parent->elms[i].subtree_count;
657 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
658 hammer_modify_buffer(cursor->parent_buffer);
660 hammer_modify_buffer(cursor->node_buffer);
665 * Delete a record from the BH-Tree.
668 hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key)
670 struct hammer_btree_cursor *cursor;
671 struct hammer_btree_internal_node *parent;
672 struct hammer_btree_leaf_node *leaf;
677 * Locate the leaf element to delete. The search is also responsible
678 * for doing some of the rebalancing work on its way down.
680 error = btree_search(&info->cursor, key, HAMMER_BTREE_DELETE);
685 * Delete the element from the leaf node. We leave empty leafs alone
686 * and instead depend on a future search to locate and destroy them.
687 * Otherwise we would have to recurse back up the tree to adjust
688 * the parent's subtree_count and we do not want to do that.
690 * Remember that the right-hand boundary is not included in the
693 cursor = &info->cursor;
694 leaf = &cursor->node->leaf;
697 KKASSERT(cursor->node->base.type == HAMMER_BTREE_LEAF_NODE);
/* Close the gap left by the deleted element (i set by the search). */
698 bcopy(&leaf->elms[i+1], &leaf->elms[i],
699 (leaf->base.count - i) * sizeof(leaf->elms[0]));
701 if ((parent = cursor->parent) != NULL) {
703 * Adjust parent's notion of the leaf's count. subtree_count
704 * is only approximate, it is allowed to be too small but
705 * never allowed to be too large. Make sure we don't drop
708 i = cursor->parent_index;
709 if (parent->elms[i].subtree_count)
710 --parent->elms[i].subtree_count;
711 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
712 hammer_modify_buffer(cursor->parent_buffer);
714 hammer_modify_buffer(cursor->node_buffer);
718 /************************************************************************
720 ************************************************************************
722 * These routines do basic cursor operations. This support will probably
723 * be expanded in the future to add link fields for linear scans.
727 * Unconditionally set the cursor to the root of the specified cluster.
728 * The current cursor node is set to the root node of the cluster (which
729 * can be an internal node or a degenerate leaf), and the parent info
730 * is cleaned up and cleared.
732 * The passed cluster must be locked. This function will add a reference
733 * to it. The cursor must already have a cluster assigned to it, which we
738 btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
739 int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id)
741 struct hammer_cluster *cluster;
742 struct hammer_volume *volume;
747 cluster = cursor->cluster;
748 KKASSERT(cluster != NULL);
/* Fast path: target cluster is on the volume we already hold. */
749 if (vol_no == cluster->volume->vol_no) {
750 cluster = hammer_get_cluster(cluster->volume, clu_no,
/* Otherwise look up the target volume first, then its cluster. */
753 volume = hammer_get_volume(cluster->volume->hmp,
756 cluster = hammer_get_cluster(volume, clu_no,
758 hammer_put_volume(volume);
765 * Make sure the cluster id matches. XXX At the moment the
766 * clu_id in the btree cluster element is only 32 bits, so only
767 * compare the low 32 bits.
770 if ((int32_t)cluster->ondisk->clu_id == (int32_t)clu_id) {
771 btree_cursor_set_cluster(cursor, cluster);
/* Verifier mismatch: drop the freshly acquired cluster reference. */
775 hammer_put_cluster(cluster);
/*
 * Point the cursor at the root node of the given cluster, replacing any
 * previously-assigned cluster and clearing the parent state. The bounds
 * are taken from the cluster header since the root has no parent node.
 */
782 btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
783 struct hammer_cluster *cluster)
787 hammer_dup_cluster(&cursor->cluster, cluster);
788 cursor->node = hammer_bread(cluster,
789 cluster->ondisk->clu_btree_root,
792 &cursor->node_buffer);
794 if (cursor->parent) {
795 hammer_put_buffer(cursor->parent_buffer);
796 cursor->parent_buffer = NULL;
797 cursor->parent = NULL;
798 cursor->parent_index = 0;
800 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
801 cursor->right_bound = &cluster->ondisk->clu_btree_end;
806 * Cursor the node up to the parent node. If at the root of a cluster,
807 * cursor to the root of the cluster's parent cluster. Note carefully
808 * that we do NOT scan the parent cluster to find the leaf that dropped
809 * into our current cluster.
811 * This function is primarily used by the search code to avoid having
812 * to search from the root of the filesystem BH-Tree.
816 btree_cursor_up(hammer_btree_cursor_t cursor)
818 struct hammer_cluster_ondisk *ondisk;
819 struct hammer_btree_internal_node *parent;
825 if (cursor->parent == NULL) {
827 * We are at the root of the cluster, pop up to the root
828 * of the parent cluster. Return EIO if we are at the
829 * root cluster of the filesystem.
831 ondisk = cursor->cluster->ondisk;
832 error = btree_cursor_set_cluster_by_value(
834 ondisk->clu_btree_parent_vol_no,
835 ondisk->clu_btree_parent_clu_no,
836 ondisk->clu_btree_parent_clu_id);
839 * Copy the current node's parent info into the node and load
/* The old parent becomes the current node. */
842 cursor->index = cursor->parent_index;
843 cursor->node = (hammer_btree_node_t)cursor->parent;
844 hammer_dup_buffer(&cursor->node_buffer, cursor->parent_buffer);
847 * Load the parent's parent into parent and figure out the
848 * parent's element index for its child. Just NULL it out
849 * if we hit the root of the cluster.
851 if (cursor->parent->base.parent) {
852 parent = hammer_bread(cursor->cluster,
853 cursor->node->base.parent,
856 &cursor->parent_buffer);
/* Scan (one-past then back up) for the element covering our node. */
857 for (i = 0; i < parent->base.count; ++i) {
858 r = hammer_btree_cmp(
859 &cursor->node->internal.elms[0].base,
860 &parent->elms[i].base);
864 cursor->parent = parent;
865 cursor->parent_index = i - 1;
/*
 * NOTE(review): the assertion indexes elms[i] while parent_index is
 * set to i - 1 — verify the index (lines are missing from this copy).
 */
866 KKASSERT(parent->elms[i].subtree_offset ==
867 hammer_bclu_offset(cursor->node_buffer,
870 hammer_put_buffer(cursor->parent_buffer);
871 cursor->parent = NULL;
872 cursor->parent_buffer = NULL;
873 cursor->parent_index = 0;
880 * Cursor down into (node, index)
882 * Push down into the current cursor. The current cursor becomes the parent.
883 * If the current cursor represents a leaf cluster element this function will
884 * push into the root of a new cluster and clear the parent fields.
886 * Pushing down at a leaf which is not a cluster element returns EIO.
890 btree_cursor_down(hammer_btree_cursor_t cursor)
892 hammer_btree_node_t node;
896 if (node->base.type == HAMMER_BTREE_LEAF_NODE) {
898 * Push into another cluster
900 hammer_btree_leaf_elm_t elm;
902 elm = &node->leaf.elms[cursor->index];
903 if (elm->base.rec_type == HAMMER_RECTYPE_CLUSTER) {
904 error = btree_cursor_set_cluster_by_value(
/* Args between lines 904 and 908 are missing from this copy. */
908 elm->cluster.verifier);
914 * Push into another BH-Tree node (internal or leaf)
916 struct hammer_btree_internal_elm *elm;
918 KKASSERT(node->base.type == HAMMER_BTREE_INTERNAL_NODE);
919 elm = &node->internal.elms[cursor->index];
920 KKASSERT(elm->subtree_offset != 0);
/* Current node becomes the parent before the child is loaded. */
922 cursor->parent_index = cursor->index;
923 cursor->parent = &cursor->node->internal;
924 hammer_dup_buffer(&cursor->parent_buffer, cursor->node_buffer);
926 cursor->node = hammer_bread(cursor->cluster,
930 &cursor->node_buffer);
932 KKASSERT(cursor->node == NULL ||
933 cursor->node->base.parent == elm->subtree_offset);
938 /************************************************************************
939 * SPLITTING AND MERGING *
940 ************************************************************************
942 * These routines do all the dirty work required to split and merge nodes.
946 * Split an internal into two nodes and move the separator at the split
947 * point to the parent. Note that the parent's parent's element pointing
948 * to our parent will have an incorrect subtree_count (we don't update it).
949 * It will be low, which is ok.
951 * Cursor->index indicates the element the caller intends to push into.
952 * We will adjust cursor->node and cursor->index if that element winds
953 * up in the split node.
957 btree_split_internal(hammer_btree_cursor_t cursor)
959 struct hammer_btree_internal_node *parent;
960 struct hammer_btree_internal_node *node;
961 struct hammer_btree_internal_node *new_node;
962 struct hammer_btree_internal_elm *elm;
963 struct hammer_btree_internal_elm *parent_elm;
964 struct hammer_buffer *new_buffer;
965 int32_t parent_offset;
970 const size_t esize = sizeof(struct hammer_btree_internal_elm);
973 * We are splitting but elms[split] will be promoted to the parent,
974 * leaving the right hand node with one less element. If the
975 * insertion point will be on the left-hand side adjust the split
976 * point to give the right hand side one additional node.
978 node = &cursor->node->internal;
979 split = (node->base.count + 1) / 2;
980 if (cursor->index <= split)
985 * If we are at the root of the tree, create a new root node with
986 * 1 element and split normally. Avoid making major modifications
987 * until we know the whole operation will work.
989 parent = cursor->parent;
990 if (parent == NULL) {
991 parent = hammer_alloc_btree(cursor->cluster, &error,
992 &cursor->parent_buffer);
/* New root: single element spanning the entire cluster key range. */
995 parent->base.count = 1;
996 parent->base.parent = 0;
997 parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
998 parent->base.subtype = node->base.type;
999 parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
1000 parent->elms[0].subtree_offset =
1001 hammer_bclu_offset(cursor->node_buffer, node);
1002 parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
1004 cursor->parent_index = 0; /* insertion point in parent */
1008 parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);
1011 * Split node into new_node at the split point.
1013 * B O O O P N N B <-- P = node->elms[split]
1014 * 0 1 2 3 4 5 6 <-- subtree indices
1019 * B O O O B B N N B <--- inner boundary points are 'P'
1024 new_node = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
1025 if (new_node == NULL) {
/* Allocation failed: back out the new root if we just created one. */
1027 hammer_free_btree_ptr(cursor->parent_buffer,
1028 (hammer_btree_node_t)parent);
1033 * Create the new node. P becomes the left-hand boundary in the
1034 * new node. Copy the right-hand boundary as well.
1036 * elm is the new separator.
1038 elm = &node->elms[split];
1039 bcopy(elm, &new_node->elms[0], (node->base.count - split + 1) * esize);
1040 new_node->base.count = node->base.count - split;
1041 new_node->base.parent = parent_offset;
1042 new_node->base.type = HAMMER_BTREE_INTERNAL_NODE;
1043 new_node->base.subtype = node->base.subtype;
1044 KKASSERT(node->base.type == new_node->base.type);
1047 * Cleanup the original node. P becomes the new boundary, its
1048 * subtree_offset was moved to the new node. If we had created
1049 * a new root its parent pointer may have changed.
1051 node->base.parent = parent_offset;
1052 elm->subtree_offset = 0;
1055 * Insert the separator into the parent, fixup the parent's
1056 * reference to the original node, and reference the new node.
1057 * The separator is P.
1059 * Remember that base.count does not include the right-hand boundary.
1061 parent_index = cursor->parent_index;
1062 parent->elms[parent_index].subtree_count = split;
1063 parent_elm = &parent->elms[parent_index+1];
1064 bcopy(parent_elm, parent_elm + 1,
1065 (parent->base.count - parent_index) * esize);
1066 parent_elm->base = elm->base; /* separator P */
1067 parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_node);
1068 parent_elm->subtree_count = new_node->base.count;
1070 hammer_modify_buffer(new_buffer);
1071 hammer_modify_buffer(cursor->parent_buffer);
1072 hammer_modify_buffer(cursor->node_buffer);
1075 * The cluster's root pointer may have to be updated.
1078 cursor->cluster->ondisk->clu_btree_root = node->base.parent;
1079 hammer_modify_cluster(cursor->cluster);
1083 * Ok, now adjust the cursor depending on which element the original
1084 * index was pointing at. If we are >= the split point the push node
1085 * is now in the new node.
1087 * NOTE: If we are at the split point itself we cannot stay with the
1088 * original node because the push index will point at the right-hand
1089 * boundary, which is illegal.
1091 if (cursor->index >= split) {
1092 cursor->index -= split;
1093 cursor->node = (hammer_btree_node_t)new_node;
1094 hammer_put_buffer(cursor->node_buffer);
1095 cursor->node_buffer = new_buffer;
1102 * Same as the above, but splits a full leaf node.
1106 btree_split_leaf(hammer_btree_cursor_t cursor)
1108 struct hammer_btree_internal_node *parent;
1109 struct hammer_btree_leaf_node *leaf;
1110 struct hammer_btree_leaf_node *new_leaf;
1111 union hammer_btree_leaf_elm *elm;
1112 struct hammer_btree_internal_elm *parent_elm;
1113 struct hammer_buffer *new_buffer;
1114 int32_t parent_offset;
/*
 * NOTE(review): esize is sizeof the internal elm but is also used below
 * to copy leaf elms — only correct if the two element types have the
 * same size; verify against the structure definitions.
 */
1119 const size_t esize = sizeof(struct hammer_btree_internal_elm);
1122 * We are splitting but elms[split] will be promoted to the parent,
1123 * leaving the right hand node with one less element. If the
1124 * insertion point will be on the left-hand side adjust the split
1125 * point to give the right hand side one additional node.
1127 leaf = &cursor->node->leaf;
1128 split = (leaf->base.count + 1) / 2;
1129 if (cursor->index <= split)
1134 * If we are at the root of the tree, create a new root node with
1135 * 1 element and split normally. Avoid making major modifications
1136 * until we know the whole operation will work.
1138 parent = cursor->parent;
1139 if (parent == NULL) {
1140 parent = hammer_alloc_btree(cursor->cluster, &error,
1141 &cursor->parent_buffer);
/* New root: single element spanning the entire cluster key range. */
1144 parent->base.count = 1;
1145 parent->base.parent = 0;
1146 parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
1147 parent->base.subtype = leaf->base.type;
1148 parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
1149 parent->elms[0].subtree_offset =
1150 hammer_bclu_offset(cursor->node_buffer, leaf);
1151 parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
1153 cursor->parent_index = 0; /* insertion point in parent */
1157 parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);
1160 * Split leaf into new_leaf at the split point. Select a separator
1161 * value in-between the two leafs but with a bent towards the right
1162 * leaf since comparisons use an 'elm >= separator' inequality.
1172 new_leaf = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
1173 if (new_leaf == NULL) {
/* Allocation failed: back out the new root if we just created one. */
1175 hammer_free_btree_ptr(cursor->parent_buffer,
1176 (hammer_btree_node_t)parent);
1181 * Create the new node. P becomes the left-hand boundary in the
1182 * new node. Copy the right-hand boundary as well.
1184 elm = &leaf->elms[split];
1185 bcopy(elm, &new_leaf->elms[0], (leaf->base.count - split) * esize);
1186 new_leaf->base.count = leaf->base.count - split;
1187 new_leaf->base.parent = parent_offset;
1188 new_leaf->base.type = HAMMER_BTREE_LEAF_NODE;
1189 new_leaf->base.subtype = 0;
1190 KKASSERT(leaf->base.type == new_leaf->base.type);
1193 * Cleanup the original node. P becomes the new boundary, its
1194 * subtree_offset was moved to the new node. If we had created
1195 * a new root its parent pointer may have changed.
1197 leaf->base.parent = parent_offset;
1200 * Insert the separator into the parent, fixup the parent's
1201 * reference to the original node, and reference the new node.
1202 * The separator is P.
1204 * Remember that base.count does not include the right-hand boundary.
1205 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1207 parent_index = cursor->parent_index;
1208 parent->elms[parent_index].subtree_count = split;
1209 parent_elm = &parent->elms[parent_index+1];
1210 if (parent_index + 1 != parent->base.count) {
1211 bcopy(parent_elm, parent_elm + 1,
1212 (parent->base.count - parent_index - 1) * esize);
/* Leaf splits synthesize a midpoint separator (leaves store no bounds). */
1214 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1215 parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_leaf);
1216 parent_elm->subtree_count = new_leaf->base.count;
1218 hammer_modify_buffer(new_buffer);
1219 hammer_modify_buffer(cursor->parent_buffer);
1220 hammer_modify_buffer(cursor->node_buffer);
1223 * The cluster's root pointer may have to be updated.
1226 cursor->cluster->ondisk->clu_btree_root = leaf->base.parent;
1227 hammer_modify_cluster(cursor->cluster);
1231 * Ok, now adjust the cursor depending on which element the original
1232 * index was pointing at. If we are >= the split point the push node
1233 * is now in the new node.
1235 * NOTE: If we are at the split point itself we cannot stay with the
1236 * original node because the push index will point at the right-hand
1237 * boundary, which is illegal.
1239 if (cursor->index >= split) {
1240 cursor->index -= split;
1241 cursor->node = (hammer_btree_node_t)new_leaf;
1242 hammer_put_buffer(cursor->node_buffer);
1243 cursor->node_buffer = new_buffer;
1250 * This routine is called on an internal node prior to recursing down
1251 * through (node, index) when (node, index) MIGHT have too few elements for
1252 * the caller to perform a deletion.
1254 * cursor->index is invalid on return because the separators may have gotten
1255 * adjusted, the caller must rescan the node's elements. The caller may set
1256 * cursor->index to -1 if it wants us to do a general rebalancing.
1258 * NOTE: Because we do not update the parent's parent in the split code,
1259 * the subtree_count used by the caller may be incorrect. We correct it
1260 * here. Also note that we cannot change the depth of the tree's leaf
1261 * nodes here (see btree_collapse()).
1263 * This routine rebalances the children of the node, collapsing children
1264 * together if possible. On return each child will have at least L/2-1
1265 * elements unless the node only has one child.
1269 btree_rebalance_node(hammer_btree_cursor_t cursor)
1271 struct hammer_btree_internal_node *node;
1272 hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
1273 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1274 hammer_btree_node_t child;
1275 hammer_btree_elm_inmemory_t elms;
1276 int i, j, n, nelms, goal;
1277 int maxelms, halfelms;
1283 node = &cursor->node->internal;
1284 KKASSERT(node->elms[cursor->index].subtree_offset != 0);
1288 * Load the children of node and do any necessary corrections
1289 * to subtree_count. subtree_count may be incorrect due to the
1290 * way insertions split nodes. Get a count of the total number
1291 * of elements held by our children.
1295 for (i = n = 0; i < node->base.count; ++i) {
1296 struct hammer_btree_internal_elm *elm;
1298 elm = &node->elms[i];
1300 child_buffer[i] = NULL; /* must be preinitialized for bread */
1301 if (elm->subtree_offset == 0)
1303 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1304 HAMMER_FSBUF_BTREE, &error,
1306 children[i] = child;
1309 KKASSERT(node->base.subtype == child->base.type);
1312 * Accumulate n for a good child, update the node's count
1315 if (node->elms[i].subtree_count != child->base.count) {
1316 node->elms[i].subtree_count = child->base.count;
1318 n += node->elms[i].subtree_count;
1324 * Collect all the children's elements together
1327 elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
1328 for (i = n = 0; i < node->base.count; ++i) {
1329 child = children[i];
1330 for (j = 0; j < child->base.count; ++j) {
1331 elms[n].owner = child;
1332 if (node->base.subtype == HAMMER_BTREE_LEAF_NODE)
1333 elms[n].u.leaf = child->leaf.elms[j];
1335 elms[n].u.internal = child->internal.elms[j];
1339 KKASSERT(n == nelms);
1342 * Store a boundary in the elms array to ease the code below. This
1343 * is only used if the children are internal nodes.
1345 elms[n].u.internal = node->elms[i];
1348 * Calculate the number of elements each child should have (goal) by
1349 * reducing the number of elements until we achieve at least
1350 * halfelms - 1 per child, unless we are a degenerate case.
1352 maxelms = btree_max_elements(node->base.subtype);
1353 halfelms = maxelms / 2;
1355 goal = halfelms - 1;
1356 while (i && n / i < goal)
1360 * Now rebalance using the specified goal
1362 for (i = n = 0; i < node->base.count; ++i) {
1363 struct hammer_buffer *subchild_buffer = NULL;
1364 struct hammer_btree_internal_node *subchild;
1366 child = children[i];
1367 for (j = 0; j < goal && n < nelms; ++j) {
1368 if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) {
1369 child->leaf.elms[j] = elms[n].u.leaf;
1371 child->internal.elms[j] = elms[n].u.internal;
1375 * If the element's parent has changed we have to
1376 * update the parent pointer. This is somewhat
1379 if (elms[n].owner != child &&
1380 node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) {
1381 subchild = hammer_bread(cursor->cluster,
1382 elms[n].u.internal.subtree_offset,
1387 subchild->base.parent =
1388 hammer_bclu_offset(child_buffer[i],
1390 hammer_modify_buffer(subchild_buffer);
1397 * Set right boundary if the children are internal nodes.
1399 if (node->base.subtype == HAMMER_BTREE_INTERNAL_NODE)
1400 child->internal.elms[j] = elms[n].u.internal;
1401 child->base.count = j;
1402 hammer_modify_buffer(child_buffer[i]);
1403 if (subchild_buffer)
1404 hammer_put_buffer(subchild_buffer);
1407 * If we have run out of elements, break out
1414 * Physically destroy any left-over children. These children's
1415 * elements have been packed into prior children. The node's
1416 * right hand boundary and count gets shifted to index i.
1418 * The subtree count in the node's parent MUST be updated because
1419 * we are removing elements. The subtree_count field is allowed to
1420 * be too small, but not too large!
1422 if (i != node->base.count) {
1424 node->elms[n] = node->elms[node->base.count];
1425 while (i < node->base.count) {
1426 hammer_free_btree_ptr(child_buffer[i], children[i]);
1427 hammer_put_buffer(child_buffer[i]);
1430 node->base.count = n;
1431 if (cursor->parent) {
1432 cursor->parent->elms[cursor->parent_index].subtree_count = n;
1433 hammer_modify_buffer(cursor->parent_buffer);
1437 kfree(elms, M_HAMMER);
1439 hammer_modify_buffer(cursor->node_buffer);
1440 for (i = 0; i < node->base.count; ++i) {
1441 if (child_buffer[i])
1442 hammer_put_buffer(child_buffer[i]);
1448 * This routine is only called if the cursor is at the root node and the
1449 * root node is an internal node. We attempt to collapse the root node
1450 * by replacing it with all of its children, reducing tree depth by one.
1452 * This is the only way to reduce tree depth in a HAMMER filesystem.
1453 * Note that all leaf nodes are at the same depth.
1455 * This is a fairly expensive operation because we not only have to load
1456 * the root's children, we also have to scan each child and adjust the
1457 * parent offset for each element in each child. Nasty all around.
1461 btree_collapse(hammer_btree_cursor_t cursor)
1463 hammer_btree_node_t root, child;
1464 hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
1465 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1470 int32_t root_offset;
1471 u_int8_t subsubtype;
1473 root = cursor->node;
1474 count = root->base.count;
1475 root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1478 * Sum up the number of children each element has. This value is
1479 * only approximate due to the way the insertion node works. It
1480 * may be too small but it will never be too large.
1482 * Quickly terminate the collapse if the elements have too many
1485 KKASSERT(root->base.parent == 0); /* must be root node */
1486 KKASSERT(root->base.type == HAMMER_BTREE_INTERNAL_NODE);
1487 KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1489 for (i = n = 0; i < count; ++i) {
1490 n += root->internal.elms[i].subtree_count;
1492 if (n > btree_max_elements(root->base.subtype))
1496 * Iterate through the elements again and correct the subtree_count.
1497 * Terminate the collapse if we wind up with too many.
1502 for (i = n = 0; i < count; ++i) {
1503 struct hammer_btree_internal_elm *elm;
1505 elm = &root->internal.elms[i];
1506 child_buffer[i] = NULL;
1508 if (elm->subtree_offset == 0)
1510 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1511 HAMMER_FSBUF_BTREE, &error,
1513 children[i] = child;
1516 KKASSERT(root->base.subtype == child->base.type);
1519 * Accumulate n for a good child, update the root's count
1522 if (root->internal.elms[i].subtree_count != child->base.count) {
1523 root->internal.elms[i].subtree_count = child->base.count;
1526 n += root->internal.elms[i].subtree_count;
1528 if (error || n > btree_max_elements(root->base.subtype))
1532 * Ok, we can collapse the root. If the root's children are leafs
1533 * the collapse is really simple. If they are internal nodes the
1534 * collapse is not so simple because we have to fixup the parent
1535 * pointers for the root's children's children.
1537 * When collapsing an internal node the far left and far right
1538 * element's boundaries should match the root's left and right
1541 if (root->base.subtype == HAMMER_BTREE_LEAF_NODE) {
1542 for (i = n = 0; i < count; ++i) {
1543 child = children[i];
1544 for (j = 0; j < child->base.count; ++j) {
1545 root->leaf.elms[n] = child->leaf.elms[j];
1549 root->base.type = root->base.subtype;
1550 root->base.subtype = 0;
1551 root->base.count = n;
1552 root->leaf.link_left = 0;
1553 root->leaf.link_right = 0;
1555 struct hammer_btree_internal_elm *elm;
1556 struct hammer_btree_internal_node *subchild;
1557 struct hammer_buffer *subchild_buffer = NULL;
1560 child = children[0];
1561 subsubtype = child->base.subtype;
1562 KKASSERT(child->base.count > 0);
1563 KKASSERT(root->internal.elms[0].base.key ==
1564 child->internal.elms[0].base.key);
1565 child = children[count-1];
1566 KKASSERT(child->base.count > 0);
1567 KKASSERT(root->internal.elms[count].base.key ==
1568 child->internal.elms[child->base.count].base.key);
1572 for (i = n = 0; i < count; ++i) {
1573 child = children[i];
1574 KKASSERT(child->base.subtype == subsubtype);
1575 for (j = 0; j < child->base.count; ++j) {
1576 elm = &child->internal.elms[j];
1578 root->internal.elms[n] = *elm;
1579 subchild = hammer_bread(cursor->cluster,
1580 elm->subtree_offset,
1585 subchild->base.parent = root_offset;
1586 hammer_modify_buffer(subchild_buffer);
1590 /* make sure the right boundary is correct */
1591 /* (this gets overwritten when the loop continues) */
1592 /* XXX generate a new separator? */
1593 root->internal.elms[n] = child->internal.elms[j];
1595 root->base.type = HAMMER_BTREE_INTERNAL_NODE;
1596 root->base.subtype = subsubtype;
1597 if (subchild_buffer)
1598 hammer_put_buffer(subchild_buffer);
1607 hammer_modify_buffer(cursor->node_buffer);
1608 for (i = 0; i < count; ++i) {
1609 if (child_buffer[i])
1610 hammer_put_buffer(child_buffer[i]);