HAMMER part 1/many. This is a clear-my-plate commit.
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
CommitLineData
427e5fc6
MD
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.1 2007/11/01 20:53:05 dillon Exp $
35 */
36
/*
 * HAMMER BH-Tree index
 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER BH-Tree.  Basically a BH-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leafs
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a BH-Tree node
 * instead of sub-tree pointers.
 *
 * A BH-Tree internal node looks like this:
 *
 *	B N N N N N N B   <-- boundary and internal elements
 *	 S S S S S S S    <-- subtree pointers
 *
 * A BH-Tree leaf node basically looks like this:
 *
 *	L L L L L L L L   <-- leaf elements
 *
 * The recursion radix is reduced by 2 relative to a normal B-Tree but
 * we get a number of significant benefits for our troubles.
 *
 * The big benefit to using a BH-Tree is that it is possible to cache
 * pointers into the middle of the tree and not have to start searches,
 * insertions, OR deletions at the root node.  In particular, searches are
 * able to progress in a definitive direction from any point in the tree
 * without revisiting nodes.  This greatly improves the efficiency of many
 * operations, most especially record appends.
 *
 * BH-Trees also make the stacking of trees fairly straightforward.
 */
69#include "hammer.h"
70#include <sys/buf.h>
71#include <sys/buf2.h>
72
/*
 * Forward declarations for the static BH-Tree helpers defined later in
 * this file (search, cursor movement, node splitting and rebalancing).
 */
static int btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key,
			int flags);
static int btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
			struct hammer_cluster *cluster);
static int btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
			int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id);
static int btree_cursor_up(hammer_btree_cursor_t cursor);
static int btree_cursor_down(hammer_btree_cursor_t cursor);
static int btree_split_internal(hammer_btree_cursor_t cursor);
static int btree_split_leaf(hammer_btree_cursor_t cursor);
static int btree_rebalance_node(hammer_btree_cursor_t cursor);
static int btree_collapse(hammer_btree_cursor_t cursor);
85
86/*
87 * Compare two BH-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
88 */
89static int
90hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
91{
92 if (key1->obj_id < key2->obj_id)
93 return(-1);
94 if (key1->obj_id > key2->obj_id)
95 return(1);
96
97 if (key1->rec_type < key2->rec_type)
98 return(-1);
99 if (key1->rec_type > key2->rec_type)
100 return(1);
101
102 if (key1->key < key2->key)
103 return(-1);
104 if (key1->key > key2->key)
105 return(1);
106
107 if (key1->create_tid < key2->create_tid)
108 return(-1);
109 if (key1->create_tid > key2->create_tid)
110 return(1);
111
112 return(0);
113}
114
/*
 * Create a separator half way in between key1 and key2.  For fields just
 * one unit apart, the separator will match key2.
 *
 * NOTE: the macro evaluates its arguments multiple times; it is only
 * used below with simple pointer expressions, which is safe.
 */
#define MAKE_SEPARATOR(key1, key2, dest, field)	\
	dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);

static void
hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
		      hammer_base_elm_t dest)
{
	/* midpoint each indexing field; the rounding biases toward key2 */
	MAKE_SEPARATOR(key1, key2, dest, obj_id);
	MAKE_SEPARATOR(key1, key2, dest, rec_type);
	MAKE_SEPARATOR(key1, key2, dest, key);
	MAKE_SEPARATOR(key1, key2, dest, create_tid);
	/* non-indexing fields are zeroed in the separator */
	dest->delete_tid = 0;
	dest->obj_type = 0;
	dest->reserved07 = 0;
}

#undef MAKE_SEPARATOR
136
137/*
138 * Return whether a generic internal or leaf node is full
139 */
140static int
141btree_node_is_full(struct hammer_base_node *node)
142{
143 switch(node->type) {
144 case HAMMER_BTREE_INTERNAL_NODE:
145 if (node->count == HAMMER_BTREE_INT_ELMS)
146 return(1);
147 break;
148 case HAMMER_BTREE_LEAF_NODE:
149 if (node->count == HAMMER_BTREE_LEAF_ELMS)
150 return(1);
151 break;
152 default:
153 panic("illegal btree subtype");
154 }
155 return(0);
156}
157
158static int
159btree_max_elements(u_int8_t type)
160{
161 if (type == HAMMER_BTREE_LEAF_NODE)
162 return(HAMMER_BTREE_LEAF_ELMS);
163 if (type == HAMMER_BTREE_INTERNAL_NODE)
164 return(HAMMER_BTREE_INT_ELMS);
165 panic("btree_max_elements: bad type %d\n", type);
166}
167
168/*
169 * Initialize a cursor, setting the starting point for a BH-Tree search.
170 *
171 * The passed cluster must be locked. This function will add a reference
172 * to it.
173 */
174int
175hammer_btree_cursor_init(hammer_btree_cursor_t cursor,
176 struct hammer_cluster *cluster)
177{
178 int error;
179
180 cursor->cluster = NULL;
181 cursor->node_buffer = NULL;
182 cursor->parent_buffer = NULL;
183 cursor->node = NULL;
184 cursor->parent = NULL;
185 cursor->index = 0;
186 cursor->parent_index = 0;
187 error = btree_cursor_set_cluster(cursor, cluster);
188 return(error);
189}
190
/*
 * Clean up a HAMMER BH-Tree cursor after we are finished using it.
 *
 * Releases the node/parent buffer references and the cluster reference
 * held by the cursor, then clears all remaining fields so a stale
 * cursor cannot be accidentally reused.
 */
void
hammer_btree_cursor_done(hammer_btree_cursor_t cursor)
{
	if (cursor->node_buffer) {
		hammer_put_buffer(cursor->node_buffer);
		cursor->node_buffer = NULL;
	}
	if (cursor->parent_buffer) {
		hammer_put_buffer(cursor->parent_buffer);
		cursor->parent_buffer = NULL;
	}
	if (cursor->cluster) {
		hammer_put_cluster(cursor->cluster);
		cursor->cluster = NULL;
	}
	/* poison the remaining pointers and indices */
	cursor->node = NULL;
	cursor->parent = NULL;
	cursor->left_bound = NULL;
	cursor->right_bound = NULL;
	cursor->index = 0;
	cursor->parent_index = 0;
}
216
217/*
218 * Initialize a btree info structure and its associated cursor prior to
219 * running a BH-Tree operation.
220 */
221int
222hammer_btree_info_init(hammer_btree_info_t info, struct hammer_cluster *cluster)
223{
224 int error;
225
226 error = hammer_btree_cursor_init(&info->cursor, cluster);
227 info->data_buffer = NULL;
228 info->record_buffer = NULL;
229 info->data = NULL;
230 info->rec = NULL;
231 return (error);
232}
233
/*
 * Clean up a BH-Tree info structure after we are finished using it.
 *
 * Releases the embedded cursor's references plus any record/data buffer
 * references acquired on the info structure (e.g. by a lookup).
 */
void
hammer_btree_info_done(hammer_btree_info_t info)
{
	hammer_btree_cursor_done(&info->cursor);
	if (info->data_buffer) {
		hammer_put_buffer(info->data_buffer);
		info->data_buffer = NULL;
	}
	if (info->record_buffer) {
		hammer_put_buffer(info->record_buffer);
		info->record_buffer = NULL;
	}
	info->data = NULL;
	info->rec = NULL;
}
252
/*
 * Search a cluster's BH-Tree.  Return the matching node.  The search
 * normally traverses clusters but will terminate at a cluster entry
 * in a leaf if HAMMER_BTREE_CLUSTER_TAG is specified.  If this flag
 * is specified EIO is returned if the search would otherwise have to
 * cursor up into a parent cluster.
 *
 * The cursor must be completely initialized on entry.  If the cursor is
 * at the root of a cluster, the parent pointer & buffer may be NULL (in
 * that case the bounds point to the bounds in the cluster header).  The
 * node_buffer and node must always be valid.
 *
 * The search code may be forced to iterate up the tree if the conditions
 * required for an insertion or deletion are not met.  This does not occur
 * very often.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 *	       and guarantee that the leaf it ends up on is not full.
 *
 * DELETIONS: The search will rebalance the tree on its way down.
 *
 * Returns 0 on an exact match (cursor->index points at the element),
 * ENOENT when no match was found (cursor->index is then the insertion
 * point), or another errno on failure.
 */
static
int
btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, int flags)
{
	struct hammer_btree_leaf_node *leaf;
	int error;
	int i;
	int r;

	/*
	 * Move our cursor up the tree until we find a node whose range
	 * covers the key we are trying to locate.  This may move us between
	 * clusters.
	 *
	 * Since the root cluster always has a root BH-Tree node, info->node
	 * is always non-NULL if no error occurred.  The parent field will be
	 * non-NULL unless we are at the root of a cluster.
	 */
	while (hammer_btree_cmp(key, cursor->left_bound) < 0 ||
	       hammer_btree_cmp(key, cursor->right_bound) >= 0) {
		/*
		 * Must stay in current cluster if flagged, code should never
		 * use the flag if it wants us to traverse to the parent
		 * cluster.
		 */
		if (cursor->parent == NULL &&
		    (flags & HAMMER_BTREE_CLUSTER_TAG)) {
			return(EIO);
		}
		error = btree_cursor_up(cursor);
		if (error)
			return(error);
	}
	KKASSERT(cursor->node != NULL);

	/*
	 * If we are inserting we can't start at a full node if the parent
	 * is also full (because there is no way to split the node),
	 * continue running up the tree until we hit the root of the
	 * current cluster or until the requirement is satisfied.
	 */
	while (flags & HAMMER_BTREE_INSERT) {
		if (btree_node_is_full(&cursor->node->base) == 0)
			break;
		if (cursor->parent == NULL)
			break;
		if (cursor->parent->base.count != HAMMER_BTREE_INT_ELMS)
			break;
		error = btree_cursor_up(cursor);
		if (error)
			return (error);
	}

	/*
	 * If we are deleting we can't start at a node with only one element
	 * unless it is root, because all of our code assumes that nodes
	 * will never be empty.
	 *
	 * This handles the case where the cursor is sitting at a leaf and
	 * either the leaf or parent contain an insufficient number of
	 * elements.
	 */
	while (flags & HAMMER_BTREE_DELETE) {
		if (cursor->node->base.count > 1)
			break;
		if (cursor->parent == NULL)
			break;
		KKASSERT(cursor->node->base.count != 0);
		error = btree_cursor_up(cursor);
		if (error)
			return (error);
	}

new_cluster:
	/*
	 * Push down through internal nodes to locate the requested key.
	 */
	while (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) {
		struct hammer_btree_internal_node *node;

		/*
		 * If we are at the root node and deleting, try to collapse
		 * all of the root's children into the root.  This is the
		 * only point where tree depth is reduced.
		 */
		if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) {
			error = btree_collapse(cursor);
			if (error)
				return (error);
		}

		/*
		 * Scan the node (XXX binary search) to find the subtree
		 * index to push down into.  We go one-past, then back-up.
		 * The key should never be less than the left-hand boundary
		 * so i should never wind up 0.
		 */
		node = &cursor->node->internal;
		for (i = 0; i < node->base.count; ++i) {
			r = hammer_btree_cmp(key, &node->elms[i].base);
			if (r < 0)
				break;
		}
		KKASSERT(i != 0);

		/*
		 * The push-down index is now i - 1.
		 */
		--i;
		cursor->index = i;

		/*
		 * Handle insertion and deletion requirements.
		 *
		 * If inserting split full nodes.  The split code will
		 * adjust cursor->node and cursor->index if the current
		 * index winds up in the new node.
		 */
		if (flags & HAMMER_BTREE_INSERT) {
			if (node->base.count == HAMMER_BTREE_INT_ELMS) {
				error = btree_split_internal(cursor);
				if (error)
					return(error);
				/*
				 * reload stale pointers
				 */
				i = cursor->index;
				node = &cursor->node->internal;
			}
		}

		/*
		 * If deleting rebalance - do not allow the child to have
		 * just one element or we will not be able to delete it.
		 *
		 * Neither internal or leaf nodes (except a root-leaf) are
		 * allowed to drop to 0 elements.
		 *
		 * Our separators may have been reorganized after rebalancing,
		 * so we have to pop back up and rescan.
		 */
		if (flags & HAMMER_BTREE_DELETE) {
			if (node->elms[i].subtree_count <= 1) {
				error = btree_rebalance_node(cursor);
				if (error)
					return(error);
				/* cursor->index is invalid after call */
				goto new_cluster;
			}
		}

		/*
		 * Push down (push into new node, existing node becomes
		 * the parent).
		 */
		error = btree_cursor_down(cursor);
		if (error)
			return (error);
	}


	/*
	 * We are at a leaf, do a linear search of the key array.
	 * (XXX do a binary search).  On success the index is set to the
	 * matching element, on failure the index is set to the insertion
	 * point.
	 *
	 * Boundaries are not stored in leaf nodes, so the index can wind
	 * up to the left of element 0 (index == 0) or past the end of
	 * the array (index == leaf->base.count).
	 */
	leaf = &cursor->node->leaf;
	KKASSERT(leaf->base.count <= HAMMER_BTREE_LEAF_ELMS);

	for (i = 0; i < leaf->base.count; ++i) {
		r = hammer_btree_cmp(key, &leaf->elms[i].base);
		if (r < 0)
			break;
		if (r == 0) {
			/*
			 * Return an exact match unless this is a cluster
			 * element.  If it is, and the cluster tag flag has
			 * not been set, push into the new cluster and
			 * continue the search.
			 */
			cursor->index = i;
			if ((leaf->elms[i].base.obj_type &
			     HAMMER_OBJTYPE_CLUSTER_FLAG) &&
			    (flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
				error = btree_cursor_down(cursor);
				if (error)
					return (error);
				goto new_cluster;
			}
			return(0);
		}
	}

	/*
	 * We could not find an exact match.  Check for a cluster
	 * recursion.  The cluster's range is bracketed by two
	 * leaf elements.  One of the two must be in this node.
	 *
	 * NOTE(review): if the leaf can be empty here (count == 0, which
	 * the delete path's comments suggest is possible), the i == count
	 * branch reads elms[-1] — confirm empty leaves cannot reach this
	 * point.
	 */
	if ((flags & HAMMER_BTREE_CLUSTER_TAG) == 0) {
		if (i == leaf->base.count) {
			if (leaf->elms[i-1].base.obj_type &
			    HAMMER_OBJTYPE_CLUSTER_FLAG) {
				cursor->index = i - 1;
				error = btree_cursor_down(cursor);
				if (error)
					return (error);
				goto new_cluster;
			}
		} else {
			if (leaf->elms[i].base.obj_type &
			    HAMMER_OBJTYPE_CLUSTER_FLAG) {
				cursor->index = i;
				error = btree_cursor_down(cursor);
				if (error)
					return (error);
				goto new_cluster;
			}
		}
	}

	/*
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
	 *
	 * We delayed the split in order to avoid any unnecessary splits.
	 *
	 * XXX parent's parent's subtree_count will be wrong after
	 * this (keep parent of parent around too? ugh).
	 */
	cursor->index = i;
	if ((flags & HAMMER_BTREE_INSERT) &&
	    leaf->base.count == HAMMER_BTREE_LEAF_ELMS) {
		error = btree_split_leaf(cursor);
		/* NOT USED
		i = cursor->index;
		node = &cursor->node->internal;
		*/
		if (error)
			return(error);
	}
	return(ENOENT);
}
522
523/*
524 * Look up the key in the HAMMER BH-Tree and fill in the rest of the
525 * info structure with the results according to flags. 0 is returned on
526 * success, non-zero on error.
527 *
528 * The caller initializes (key, cluster) and makes the call. The cluster
529 * must be referenced and locked on call. The function can chain through
530 * multiple clusters and will replace the passed cluster as it does so,
531 * dereferencing and unlocking it, and referencing and locking the chain.
532 * On return info->cluster will still be referenced and locked but may
533 * represent a different cluster.
534 */
535int
536hammer_btree_lookup(hammer_btree_info_t info, hammer_base_elm_t key, int flags)
537{
538 hammer_btree_leaf_elm_t elm;
539 struct hammer_cluster *cluster;
540 int32_t cloff;
541 int error;
542 int iscl;
543
544 error = btree_search(&info->cursor, key, flags);
545 if (error)
546 return (error);
547
548 /*
549 * Extract the record and data reference if requested.
550 *
551 * A cluster record type has no data reference, the information
552 * is stored directly in the record and BH-Tree element.
553 *
554 * The case where the data reference resolves to the same buffer
555 * as the record reference must be handled.
556 */
557 elm = &info->cursor.node->leaf.elms[info->cursor.index];
558 iscl = (elm->base.obj_type & HAMMER_OBJTYPE_CLUSTER_FLAG) != 0;
559 cluster = info->cursor.cluster;
560 if ((flags & HAMMER_BTREE_GET_RECORD) && error == 0) {
561 cloff = iscl ? elm->cluster.rec_offset : elm->record.rec_offset;
562
563
564 info->rec = hammer_bread(cluster, cloff, HAMMER_FSBUF_RECORDS,
565 &error, &info->record_buffer);
566 } else {
567 cloff = 0;
568 }
569 if ((flags & HAMMER_BTREE_GET_DATA) && iscl == 0 && error == 0) {
570 if ((cloff ^ elm->record.data_offset) & ~HAMMER_BUFMASK) {
571 info->data = hammer_bread(cluster,
572 elm->record.data_offset,
573 HAMMER_FSBUF_DATA,
574 &error, &info->record_buffer);
575 } else {
576 info->data = (void *)
577 ((char *)info->data_buffer->ondisk +
578 (elm->record.data_offset & HAMMER_BUFMASK));
579 }
580 }
581 return(error);
582}
583
584
585/*
586 * Insert a record into a BH-Tree's cluster. The caller has already
587 * reserved space for the record and data and must handle a ENOSPC
588 * return.
589 */
590int
591hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm)
592{
593 struct hammer_btree_cursor *cursor;
594 struct hammer_btree_internal_node *parent;
595 struct hammer_btree_leaf_node *leaf;
596 int error;
597 int i;
598
599 /*
600 * Issue a search to get our cursor at the right place. The search
601 * will get us to a leaf node.
602 *
603 * The search also does some setup for our insert, so there is always
604 * room in the leaf.
605 */
606 error = btree_search(&info->cursor, &elm->base, HAMMER_BTREE_INSERT);
607 if (error != ENOENT) {
608 if (error == 0)
609 error = EEXIST;
610 return (error);
611 }
612
613 /*
614 * Insert the element at the leaf node and update the count in the
615 * parent. It is possible for parent to be NULL, indicating that
616 * the root of the BH-Tree in the cluster is a leaf. It is also
617 * possible for the leaf to be empty.
618 *
619 * Remember that the right-hand boundary is not included in the
620 * count.
621 */
622 cursor = &info->cursor;
623 leaf = &cursor->node->leaf;
624 i = cursor->index;
625 KKASSERT(leaf->base.count < HAMMER_BTREE_LEAF_ELMS);
626 bcopy(&leaf->elms[i], &leaf->elms[i+1],
627 (leaf->base.count - i + 1) * sizeof(leaf->elms[0]));
628 leaf->elms[i] = *elm;
629 ++leaf->base.count;
630
631 if ((parent = cursor->parent) != NULL) {
632 i = cursor->parent_index;
633 ++parent->elms[i].subtree_count;
634 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
635 hammer_modify_buffer(cursor->parent_buffer);
636 }
637 hammer_modify_buffer(cursor->node_buffer);
638 return(0);
639}
640
641/*
642 * Delete a record from the BH-Tree.
643 */
644int
645hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key)
646{
647 struct hammer_btree_cursor *cursor;
648 struct hammer_btree_internal_node *parent;
649 struct hammer_btree_leaf_node *leaf;
650 int error;
651 int i;
652
653 /*
654 * Locate the leaf element to delete. The search is also responsible
655 * for doing some of the rebalancing work on its way down.
656 */
657 error = btree_search(&info->cursor, key, HAMMER_BTREE_DELETE);
658 if (error)
659 return (error);
660
661 /*
662 * Delete the element from the leaf node. We leave empty leafs alone
663 * and instead depend on a future search to locate and destroy them.
664 * Otherwise we would have to recurse back up the tree to adjust
665 * the parent's subtree_count and we do not want to do that.
666 *
667 * Remember that the right-hand boundary is not included in the
668 * count.
669 */
670 cursor = &info->cursor;
671 leaf = &cursor->node->leaf;
672 i = cursor->index;
673
674 KKASSERT(cursor->node->base.type == HAMMER_BTREE_LEAF_NODE);
675 bcopy(&leaf->elms[i+1], &leaf->elms[i],
676 (leaf->base.count - i) * sizeof(leaf->elms[0]));
677 --leaf->base.count;
678 if ((parent = cursor->parent) != NULL) {
679 /*
680 * Adjust parent's notion of the leaf's count. subtree_count
681 * is only approximately, it is allowed to be too small but
682 * never allowed to be too large. Make sure we don't drop
683 * the count below 0.
684 */
685 i = cursor->parent_index;
686 if (parent->elms[i].subtree_count)
687 --parent->elms[i].subtree_count;
688 KKASSERT(parent->elms[i].subtree_count <= leaf->base.count);
689 hammer_modify_buffer(cursor->parent_buffer);
690 }
691 hammer_modify_buffer(cursor->node_buffer);
692 return(0);
693}
694
695/************************************************************************
696 * CURSOR SUPPORT *
697 ************************************************************************
698 *
699 * These routines do basic cursor operations. This support will probably
700 * be expanded in the future to add link fields for linear scans.
701 */
702
703/*
704 * Unconditionally set the cursor to the root of the specified cluster.
705 * The current cursor node is set to the root node of the cluster (which
706 * can be an internal node or a degenerate leaf), and the parent info
707 * is cleaned up and cleared.
708 *
709 * The passed cluster must be locked. This function will add a reference
710 * to it. The cursor must already have a cluster assigned to it, which we
711 * will replace.
712 */
713static
714int
715btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor,
716 int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id)
717{
718 struct hammer_cluster *cluster;
719 struct hammer_volume *volume;
720 int error = 0;
721
722 if (vol_no < 0)
723 return(EIO);
724 cluster = cursor->cluster;
725 KKASSERT(cluster != NULL);
726 if (vol_no == cluster->volume->vol_no) {
727 cluster = hammer_get_cluster(cluster->volume, clu_no,
728 &error, 0);
729 } else {
730 volume = hammer_get_volume(cluster->volume->hmp,
731 vol_no, &error);
732 if (volume) {
733 cluster = hammer_get_cluster(volume, clu_no,
734 &error, 0);
735 hammer_put_volume(volume);
736 } else {
737 cluster = NULL;
738 }
739 }
740
741 /*
742 * Make sure the cluster id matches. XXX At the moment the
743 * clu_id in the btree cluster element is only 32 bits, so only
744 * compare the low 32 bits.
745 */
746 if (cluster) {
747 if ((int32_t)cluster->ondisk->clu_id == (int32_t)clu_id) {
748 btree_cursor_set_cluster(cursor, cluster);
749 } else {
750 error = EIO;
751 }
752 hammer_put_cluster(cluster);
753 }
754 return (error);
755}
756
/*
 * Point the cursor at the root node of the given cluster, replacing
 * whatever cluster/node it currently references.
 *
 * Swaps the cursor's cluster reference for one on 'cluster' (via
 * hammer_dup_cluster) and reads the cluster's root BH-Tree node.  Any
 * parent state is released since a cluster root has no parent node; the
 * search bounds are taken from the cluster header.
 *
 * Returns 0 on success or an error from hammer_bread (in which case
 * cursor->node may be NULL).
 */
static
int
btree_cursor_set_cluster(hammer_btree_cursor_t cursor,
			 struct hammer_cluster *cluster)
{
	int error = 0;

	/* exchange the cluster reference held by the cursor */
	hammer_dup_cluster(&cursor->cluster, cluster);
	cursor->node = hammer_bread(cluster,
				    cluster->ondisk->clu_btree_root,
				    HAMMER_FSBUF_BTREE,
				    &error,
				    &cursor->node_buffer);
	cursor->index = 0;
	if (cursor->parent) {
		/* a cluster root has no parent node */
		hammer_put_buffer(cursor->parent_buffer);
		cursor->parent_buffer = NULL;
		cursor->parent = NULL;
		cursor->parent_index = 0;
	}
	cursor->left_bound = &cluster->ondisk->clu_btree_beg;
	cursor->right_bound = &cluster->ondisk->clu_btree_end;
	return (error);
}
781
/*
 * Cursor the node up to the parent node.  If at the root of a cluster,
 * cursor to the root of the cluster's parent cluster.  Note carefully
 * that we do NOT scan the parent cluster to find the leaf that dropped
 * into our current cluster.
 *
 * This function is primarily used by the search code to avoid having
 * to search from the root of the filesystem BH-Tree.
 *
 * Returns 0 on success, EIO when already at the root cluster of the
 * filesystem, or another errno on I/O failure.
 */
static
int
btree_cursor_up(hammer_btree_cursor_t cursor)
{
	struct hammer_cluster_ondisk *ondisk;
	struct hammer_btree_internal_node *parent;
	int error;
	int i;
	int r;

	error = 0;
	if (cursor->parent == NULL) {
		/*
		 * We are at the root of the cluster, pop up to the root
		 * of the parent cluster.  Return EIO if we are at the
		 * root cluster of the filesystem.
		 */
		ondisk = cursor->cluster->ondisk;
		error = btree_cursor_set_cluster_by_value(
			    cursor,
			    ondisk->clu_btree_parent_vol_no,
			    ondisk->clu_btree_parent_clu_no,
			    ondisk->clu_btree_parent_clu_id);
	} else {
		/*
		 * Copy the current node's parent info into the node and load
		 * a new parent.
		 */
		cursor->index = cursor->parent_index;
		cursor->node = (hammer_btree_node_t)cursor->parent;
		hammer_dup_buffer(&cursor->node_buffer, cursor->parent_buffer);

		/*
		 * Load the parent's parent into parent and figure out the
		 * parent's element index for its child.  Just NULL it out
		 * if we hit the root of the cluster.
		 */
		if (cursor->parent->base.parent) {
			parent = hammer_bread(cursor->cluster,
					      cursor->node->base.parent,
					      HAMMER_FSBUF_BTREE,
					      &error,
					      &cursor->parent_buffer);
			/*
			 * Scan until the parent element exceeds our node's
			 * left-most key, then back up one (one-past then
			 * back-up, as in btree_search).
			 */
			for (i = 0; i < parent->base.count; ++i) {
				r = hammer_btree_cmp(
					&cursor->node->internal.elms[0].base,
					&parent->elms[i].base);
				if (r < 0)
					break;
			}
			cursor->parent = parent;
			cursor->parent_index = i - 1;
			/*
			 * NOTE(review): parent_index was just set to i - 1,
			 * yet this assertion checks elms[i]; it looks like
			 * it should check elms[i - 1] — confirm.
			 */
			KKASSERT(parent->elms[i].subtree_offset ==
				 hammer_bclu_offset(cursor->node_buffer,
						    cursor->node));
		} else {
			/* hit the root of the cluster; no parent node */
			hammer_put_buffer(cursor->parent_buffer);
			cursor->parent = NULL;
			cursor->parent_buffer = NULL;
			cursor->parent_index = 0;
		}
	}
	return(error);
}
855
/*
 * Cursor down into (node, index)
 *
 * Push down into the current cursor.  The current cursor becomes the parent.
 * If the current cursor represents a leaf cluster element this function will
 * push into the root of a new cluster and clear the parent fields.
 *
 * Pushing down at a leaf which is not a cluster element returns EIO.
 */
static
int
btree_cursor_down(hammer_btree_cursor_t cursor)
{
	hammer_btree_node_t node;
	int error;

	node = cursor->node;
	if (node->base.type == HAMMER_BTREE_LEAF_NODE) {
		/*
		 * Push into another cluster
		 */
		hammer_btree_leaf_elm_t elm;

		elm = &node->leaf.elms[cursor->index];
		if (elm->base.rec_type == HAMMER_RECTYPE_CLUSTER) {
			error = btree_cursor_set_cluster_by_value(
				    cursor,
				    elm->cluster.vol_no,
				    elm->cluster.clu_no,
				    elm->cluster.verifier);
		} else {
			/* a plain leaf element cannot be pushed into */
			error = EIO;
		}
	} else {
		/*
		 * Push into another BH-Tree node (internal or leaf)
		 */
		struct hammer_btree_internal_elm *elm;

		KKASSERT(node->base.type == HAMMER_BTREE_INTERNAL_NODE);
		elm = &node->internal.elms[cursor->index];
		KKASSERT(elm->subtree_offset != 0);

		/* current node becomes the parent */
		cursor->parent_index = cursor->index;
		cursor->parent = &cursor->node->internal;
		hammer_dup_buffer(&cursor->parent_buffer, cursor->node_buffer);

		cursor->node = hammer_bread(cursor->cluster,
					    elm->subtree_offset,
					    HAMMER_FSBUF_BTREE,
					    &error,
					    &cursor->node_buffer);
		cursor->index = 0;
		KKASSERT(cursor->node == NULL ||
			 cursor->node->base.parent == elm->subtree_offset);
	}
	return(error);
}
914
915/************************************************************************
916 * SPLITTING AND MERGING *
917 ************************************************************************
918 *
919 * These routines do all the dirty work required to split and merge nodes.
920 */
921
922/*
923 * Split an internal into two nodes and move the separator at the split
924 * point to the parent. Note that the parent's parent's element pointing
925 * to our parent will have an incorrect subtree_count (we don't update it).
926 * It will be low, which is ok.
927 *
928 * Cursor->index indicates the element the caller intends to push into.
929 * We will adjust cursor->node and cursor->index if that element winds
930 * up in the split node.
931 */
932static
933int
934btree_split_internal(hammer_btree_cursor_t cursor)
935{
936 struct hammer_btree_internal_node *parent;
937 struct hammer_btree_internal_node *node;
938 struct hammer_btree_internal_node *new_node;
939 struct hammer_btree_internal_elm *elm;
940 struct hammer_btree_internal_elm *parent_elm;
941 struct hammer_buffer *new_buffer;
942 int32_t parent_offset;
943 int parent_index;
944 int made_root;
945 int split;
946 int error;
947 const size_t esize = sizeof(struct hammer_btree_internal_elm);
948
949 /*
950 * We are splitting but elms[split] will be promoted to the parent,
951 * leaving the right hand node with one less element. If the
952 * insertion point will be on the left-hand side adjust the split
953 * point to give the right hand side one additional node.
954 */
955 node = &cursor->node->internal;
956 split = (node->base.count + 1) / 2;
957 if (cursor->index <= split)
958 --split;
959 error = 0;
960
961 /*
962 * If we are at the root of the tree, create a new root node with
963 * 1 element and split normally. Avoid making major modifications
964 * until we know the whole operation will work.
965 */
966 parent = cursor->parent;
967 if (parent == NULL) {
968 parent = hammer_alloc_btree(cursor->cluster, &error,
969 &cursor->parent_buffer);
970 if (parent == NULL)
971 return(error);
972 parent->base.count = 1;
973 parent->base.parent = 0;
974 parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
975 parent->base.subtype = node->base.type;
976 parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
977 parent->elms[0].subtree_offset =
978 hammer_bclu_offset(cursor->node_buffer, node);
979 parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
980 made_root = 1;
981 cursor->parent_index = 0; /* insertion point in parent */
982 } else {
983 made_root = 0;
984 }
985 parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);
986
987 /*
988 * Split node into new_node at the split point.
989 *
990 * B O O O P N N B <-- P = node->elms[split]
991 * 0 1 2 3 4 5 6 <-- subtree indices
992 *
993 * x x P x x
994 * s S S s
995 * / \
996 * B O O O B B N N B <--- inner boundary points are 'P'
997 * 0 1 2 3 4 5 6
998 *
999 */
1000 new_buffer = NULL;
1001 new_node = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
1002 if (new_node == NULL) {
1003 if (made_root)
1004 hammer_free_btree_ptr(cursor->parent_buffer,
1005 (hammer_btree_node_t)parent);
1006 return(error);
1007 }
1008
1009 /*
1010 * Create the new node. P become the left-hand boundary in the
1011 * new node. Copy the right-hand boundary as well.
1012 *
1013 * elm is the new separator.
1014 */
1015 elm = &node->elms[split];
1016 bcopy(elm, &new_node->elms[0], (node->base.count - split + 1) * esize);
1017 new_node->base.count = node->base.count - split;
1018 new_node->base.parent = parent_offset;
1019 new_node->base.type = HAMMER_BTREE_INTERNAL_NODE;
1020 new_node->base.subtype = node->base.subtype;
1021 KKASSERT(node->base.type == new_node->base.type);
1022
1023 /*
1024 * Cleanup the original node. P becomes the new boundary, its
1025 * subtree_offset was moved to the new node. If we had created
1026 * a new root its parent pointer may have changed.
1027 */
1028 node->base.parent = parent_offset;
1029 elm->subtree_offset = 0;
1030
1031 /*
1032 * Insert the separator into the parent, fixup the parent's
1033 * reference to the original node, and reference the new node.
1034 * The separator is P.
1035 *
1036 * Remember that base.count does not include the right-hand boundary.
1037 */
1038 parent_index = cursor->parent_index;
1039 parent->elms[parent_index].subtree_count = split;
1040 parent_elm = &parent->elms[parent_index+1];
1041 bcopy(parent_elm, parent_elm + 1,
1042 (parent->base.count - parent_index) * esize);
1043 parent_elm->base = elm->base; /* separator P */
1044 parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_node);
1045 parent_elm->subtree_count = new_node->base.count;
1046
1047 hammer_modify_buffer(new_buffer);
1048 hammer_modify_buffer(cursor->parent_buffer);
1049 hammer_modify_buffer(cursor->node_buffer);
1050
1051 /*
1052 * The cluster's root pointer may have to be updated.
1053 */
1054 if (made_root) {
1055 cursor->cluster->ondisk->clu_btree_root = node->base.parent;
1056 hammer_modify_cluster(cursor->cluster);
1057 }
1058
1059 /*
1060 * Ok, now adjust the cursor depending on which element the original
1061 * index was pointing at. If we are >= the split point the push node
1062 * is now in the new node.
1063 *
1064 * NOTE: If we are at the split point itself we cannot stay with the
1065 * original node because the push index will point at the right-hand
1066 * boundary, which is illegal.
1067 */
1068 if (cursor->index >= split) {
1069 cursor->index -= split;
1070 cursor->node = (hammer_btree_node_t)new_node;
1071 hammer_put_buffer(cursor->node_buffer);
1072 cursor->node_buffer = new_buffer;
1073 }
1074
1075 return (0);
1076}
1077
/*
 * Same as the above, but splits a full leaf node.
 *
 * A new leaf is allocated and the right-hand half of the elements are
 * moved into it.  Unlike the internal-node split, leaf elements have no
 * subtree pointers, so no element is "promoted" out of the leaves;
 * instead a synthetic separator key is generated between the two halves
 * and inserted into the parent.
 *
 * On success the cursor is adjusted to continue pointing at the same
 * element it pointed at on entry (which may now live in the new leaf).
 *
 * Returns 0 on success or a HAMMER error code.  On failure no permanent
 * modifications have been made.
 */
static
int
btree_split_leaf(hammer_btree_cursor_t cursor)
{
	struct hammer_btree_internal_node *parent;
	struct hammer_btree_leaf_node *leaf;
	struct hammer_btree_leaf_node *new_leaf;
	union hammer_btree_leaf_elm *elm;
	struct hammer_btree_internal_elm *parent_elm;
	struct hammer_buffer *new_buffer;
	int32_t parent_offset;
	int parent_index;
	int made_root;
	int split;
	int error;
	/*
	 * NOTE(review): esize is sizeof(struct hammer_btree_internal_elm)
	 * but it is also used below to bcopy *leaf* elements.  This assumes
	 * leaf and internal elements are the same size on-disk -- verify
	 * against the element definitions in hammer_btree.h.
	 */
	const size_t esize = sizeof(struct hammer_btree_internal_elm);

	/*
	 * We are splitting but elms[split] will be promoted to the parent,
	 * leaving the right hand node with one less element.  If the
	 * insertion point will be on the left-hand side adjust the split
	 * point to give the right hand side one additional node.
	 */
	leaf = &cursor->node->leaf;
	split = (leaf->base.count + 1) / 2;
	if (cursor->index <= split)
		--split;
	error = 0;

	/*
	 * If we are at the root of the tree, create a new root node with
	 * 1 element and split normally.  Avoid making major modifications
	 * until we know the whole operation will work.
	 */
	parent = cursor->parent;
	if (parent == NULL) {
		parent = hammer_alloc_btree(cursor->cluster, &error,
					    &cursor->parent_buffer);
		if (parent == NULL)
			return(error);
		/* single element == two boundaries bracketing the leaf */
		parent->base.count = 1;
		parent->base.parent = 0;
		parent->base.type = HAMMER_BTREE_INTERNAL_NODE;
		parent->base.subtype = leaf->base.type;
		parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg;
		parent->elms[0].subtree_offset =
			    hammer_bclu_offset(cursor->node_buffer, leaf);
		parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end;
		made_root = 1;
		cursor->parent_index = 0;	/* insertion point in parent */
	} else {
		made_root = 0;
	}
	parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent);

	/*
	 * Split leaf into new_leaf at the split point.  Select a separator
	 * value in-between the two leafs but with a bent towards the right
	 * leaf since comparisons use an 'elm >= separator' inequality.
	 *
	 *  L L L L L L L L
	 *
	 *       x x P x x
	 *        s S S s
	 *         /   \
	 *  L L L L     L L L L
	 */
	new_buffer = NULL;
	new_leaf = hammer_alloc_btree(cursor->cluster, &error, &new_buffer);
	if (new_leaf == NULL) {
		/* back out the speculative root allocation, if any */
		if (made_root)
			hammer_free_btree_ptr(cursor->parent_buffer,
					      (hammer_btree_node_t)parent);
		return(error);
	}

	/*
	 * Create the new node.  P become the left-hand boundary in the
	 * new node.  Copy the right-hand boundary as well.
	 */
	elm = &leaf->elms[split];
	bcopy(elm, &new_leaf->elms[0], (leaf->base.count - split) * esize);
	new_leaf->base.count = leaf->base.count - split;
	new_leaf->base.parent = parent_offset;
	new_leaf->base.type = HAMMER_BTREE_LEAF_NODE;
	new_leaf->base.subtype = 0;
	KKASSERT(leaf->base.type == new_leaf->base.type);

	/*
	 * Cleanup the original node.  P becomes the new boundary, its
	 * subtree_offset was moved to the new node.  If we had created
	 * a new root its parent pointer may have changed.
	 */
	leaf->base.parent = parent_offset;

	/*
	 * Insert the separator into the parent, fixup the parent's
	 * reference to the original node, and reference the new node.
	 * The separator is P.
	 *
	 * Remember that base.count does not include the right-hand boundary.
	 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
	 */
	parent_index = cursor->parent_index;
	parent->elms[parent_index].subtree_count = split;
	parent_elm = &parent->elms[parent_index+1];
	if (parent_index + 1 != parent->base.count) {
		/* shift trailing elements (incl. right boundary) right by one */
		bcopy(parent_elm, parent_elm + 1,
		      (parent->base.count - parent_index - 1) * esize);
	}
	/* synthesize a separator key between the two adjacent leaf elements */
	hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
	parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_leaf);
	parent_elm->subtree_count = new_leaf->base.count;

	/* mark all three touched buffers dirty */
	hammer_modify_buffer(new_buffer);
	hammer_modify_buffer(cursor->parent_buffer);
	hammer_modify_buffer(cursor->node_buffer);

	/*
	 * The cluster's root pointer may have to be updated.
	 */
	if (made_root) {
		cursor->cluster->ondisk->clu_btree_root = leaf->base.parent;
		hammer_modify_cluster(cursor->cluster);
	}

	/*
	 * Ok, now adjust the cursor depending on which element the original
	 * index was pointing at.  If we are >= the split point the push node
	 * is now in the new node.
	 *
	 * NOTE: If we are at the split point itself we cannot stay with the
	 * original node because the push index will point at the right-hand
	 * boundary, which is illegal.
	 */
	if (cursor->index >= split) {
		/* the cursor migrates: swap the node buffer reference too */
		cursor->index -= split;
		cursor->node = (hammer_btree_node_t)new_leaf;
		hammer_put_buffer(cursor->node_buffer);
		cursor->node_buffer = new_buffer;
	}

	return (0);
}
1225
/*
 * This routine is called on an internal node prior to recursing down
 * through (node, index) when (node, index) MIGHT have too few elements for
 * the caller to perform a deletion.
 *
 * cursor->index is invalid on return because the separators may have gotten
 * adjusted, the caller must rescan the node's elements.  The caller may set
 * cursor->index to -1 if it wants us to do a general rebalancing.
 *
 * NOTE: Because we do not update the parent's parent in the split code,
 * the subtree_count used by the caller may be incorrect.  We correct it
 * here.  Also note that we cannot change the depth of the tree's leaf
 * nodes here (see btree_collapse()).
 *
 * This routine rebalances the children of the node, collapsing children
 * together if possible.  On return each child will have at least L/2-1
 * elements unless the node only has one child.
 *
 * Implementation outline:
 *   1. Load every child and correct each element's subtree_count.
 *   2. Gather all child elements into a temporary in-memory array.
 *   3. Redistribute the elements back across the first few children
 *      ('goal' per child), fixing sub-child parent pointers that move.
 *   4. Free any children left empty and shrink this node's count.
 */
static
int
btree_rebalance_node(hammer_btree_cursor_t cursor)
{
	struct hammer_btree_internal_node *node;
	hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
	struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
	hammer_btree_node_t child;
	hammer_btree_elm_inmemory_t elms;
	int i, j, n, nelms, goal;
	int maxelms, halfelms;
	int error;

	/*
	 * Basic setup
	 */
	node = &cursor->node->internal;
	KKASSERT(node->elms[cursor->index].subtree_offset != 0);
	error = 0;

	/*
	 * Load the children of node and do any necessary corrections
	 * to subtree_count.  subtree_count may be incorrect due to the
	 * way insertions split nodes.  Get a count of the total number
	 * of elements held by our children.
	 */
	error = 0;

	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &node->elms[i];
		children[i] = NULL;
		child_buffer[i] = NULL;	/* must be preinitialized for bread */
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i]);
		children[i] = child;
		if (child == NULL)
			continue;
		KKASSERT(node->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the node's count
		 * if it was wrong.
		 */
		if (node->elms[i].subtree_count != child->base.count) {
			node->elms[i].subtree_count = child->base.count;
		}
		n += node->elms[i].subtree_count;
	}
	if (error)
		goto failed;

	/*
	 * Collect all the children's elements together
	 *
	 * NOTE(review): children[i] may still be NULL here when
	 * elm->subtree_offset was 0 (skipped above without setting error);
	 * the dereference of child below would then fault -- confirm that
	 * internal nodes never carry zero subtree_offset elements at this
	 * point.
	 */
	nelms = n;
	elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
	for (i = n = 0; i < node->base.count; ++i) {
		child = children[i];
		for (j = 0; j < child->base.count; ++j) {
			elms[n].owner = child;
			if (node->base.subtype == HAMMER_BTREE_LEAF_NODE)
				elms[n].u.leaf = child->leaf.elms[j];
			else
				elms[n].u.internal = child->internal.elms[j];
			++n;
		}
	}
	KKASSERT(n == nelms);

	/*
	 * Store a boundary in the elms array to ease the code below.  This
	 * is only used if the children are internal nodes.
	 * (i == node->base.count here, so this is the right-hand boundary.)
	 */
	elms[n].u.internal = node->elms[i];

	/*
	 * Calculate the number of elements each child should have (goal) by
	 * reducing the number of elements until we achieve at least
	 * halfelms - 1 per child, unless we are a degenerate case.
	 */
	maxelms = btree_max_elements(node->base.subtype);
	halfelms = maxelms / 2;

	/*
	 * NOTE(review): this loop shrinks i (the prospective child count)
	 * until n/i >= goal, but neither i nor n/i is folded back into
	 * 'goal' before the redistribution below -- looks like 'goal'
	 * should end up as the per-child target; verify intent.
	 */
	goal = halfelms - 1;
	while (i && n / i < goal)
		--i;

	/*
	 * Now rebalance using the specified goal
	 */
	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_buffer *subchild_buffer = NULL;
		struct hammer_btree_internal_node *subchild;

		child = children[i];
		for (j = 0; j < goal && n < nelms; ++j) {
			if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) {
				child->leaf.elms[j] = elms[n].u.leaf;
			} else {
				child->internal.elms[j] = elms[n].u.internal;
			}

			/*
			 * If the element's parent has changed we have to
			 * update the parent pointer.  This is somewhat
			 * expensive.
			 */
			if (elms[n].owner != child &&
			    node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) {
				subchild = hammer_bread(cursor->cluster,
							elms[n].u.internal.subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer);
				if (subchild) {
					subchild->base.parent =
					    hammer_bclu_offset(child_buffer[i],
								child);
					hammer_modify_buffer(subchild_buffer);
				}
				/* XXX error */
			}
			++n;
		}
		/*
		 * Set right boundary if the children are internal nodes.
		 */
		if (node->base.subtype == HAMMER_BTREE_INTERNAL_NODE)
			child->internal.elms[j] = elms[n].u.internal;
		child->base.count = j;
		hammer_modify_buffer(child_buffer[i]);
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer);

		/*
		 * If we have run out of elements, break out
		 */
		if (n == nelms)
			break;
	}

	/*
	 * Physically destroy any left-over children.  These children's
	 * elements have been packed into prior children.  The node's
	 * right hand boundary and count gets shifted to index i.
	 *
	 * The subtree count in the node's parent MUST be updated because
	 * we are removing elements.  The subtree_count field is allowed to
	 * be too small, but not too large!
	 */
	if (i != node->base.count) {
		n = i;
		node->elms[n] = node->elms[node->base.count];
		while (i < node->base.count) {
			hammer_free_btree_ptr(child_buffer[i], children[i]);
			hammer_put_buffer(child_buffer[i]);
			++i;
		}
		node->base.count = n;
		if (cursor->parent) {
			cursor->parent->elms[cursor->parent_index].subtree_count = n;
			hammer_modify_buffer(cursor->parent_buffer);
		}
	}

	kfree(elms, M_HAMMER);
failed:
	/* release all remaining child buffer references */
	hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < node->base.count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i]);
	}
	return (error);
}
1423
/*
 * This routine is only called if the cursor is at the root node and the
 * root node is an internal node.  We attempt to collapse the root node
 * by replacing it with all of its children, reducing tree depth by one.
 *
 * This is the only way to reduce tree depth in a HAMMER filesystem.
 * Note that all leaf nodes are at the same depth.
 *
 * This is a fairly expensive operation because we not only have to load
 * the root's children, we also have to scan each child and adjust the
 * parent offset for each element in each child.  Nasty all around.
 *
 * Returns 0 on success (including the "too many elements, nothing to do"
 * case) or a HAMMER error code.
 */
static
int
btree_collapse(hammer_btree_cursor_t cursor)
{
	hammer_btree_node_t root, child;
	hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS];
	struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
	int count;
	int i, j, n;
	int root_modified;
	int error;
	int32_t root_offset;
	u_int8_t subsubtype;

	root = cursor->node;
	count = root->base.count;
	root_offset = hammer_bclu_offset(cursor->node_buffer, root);

	/*
	 * Sum up the number of children each element has.  This value is
	 * only approximate due to the way the insertion node works.  It
	 * may be too small but it will never be too large.
	 *
	 * Quickly terminate the collapse if the elements have too many
	 * children.
	 */
	KKASSERT(root->base.parent == 0);	/* must be root node */
	KKASSERT(root->base.type == HAMMER_BTREE_INTERNAL_NODE);
	KKASSERT(count <= HAMMER_BTREE_INT_ELMS);

	for (i = n = 0; i < count; ++i) {
		n += root->internal.elms[i].subtree_count;
	}
	if (n > btree_max_elements(root->base.subtype))
		return(0);

	/*
	 * Iterate through the elements again and correct the subtree_count.
	 * Terminate the collapse if we wind up with too many.
	 */
	error = 0;
	root_modified = 0;

	for (i = n = 0; i < count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &root->internal.elms[i];
		child_buffer[i] = NULL;
		children[i] = NULL;
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i]);
		children[i] = child;
		if (child == NULL)
			continue;
		KKASSERT(root->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the root's count
		 * if it was wrong.
		 */
		if (root->internal.elms[i].subtree_count != child->base.count) {
			root->internal.elms[i].subtree_count = child->base.count;
			root_modified = 1;
		}
		n += root->internal.elms[i].subtree_count;
	}
	if (error || n > btree_max_elements(root->base.subtype))
		goto done;

	/*
	 * Ok, we can collapse the root.  If the root's children are leafs
	 * the collapse is really simple.  If they are internal nodes the
	 * collapse is not so simple because we have to fixup the parent
	 * pointers for the root's children's children.
	 *
	 * When collapsing an internal node the far left and far right
	 * element's boundaries should match the root's left and right
	 * boundaries.
	 */
	if (root->base.subtype == HAMMER_BTREE_LEAF_NODE) {
		/* concatenate all leaf elements directly into the root */
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			for (j = 0; j < child->base.count; ++j) {
				root->leaf.elms[n] = child->leaf.elms[j];
				++n;
			}
		}
		root->base.type = root->base.subtype;
		root->base.subtype = 0;
		root->base.count = n;
		root->leaf.link_left = 0;
		root->leaf.link_right = 0;
	} else {
		struct hammer_btree_internal_elm *elm;
		struct hammer_btree_internal_node *subchild;
		struct hammer_buffer *subchild_buffer = NULL;

		if (count) {
			/* sanity-check boundary keys against the root's */
			child = children[0];
			subsubtype = child->base.subtype;
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[0].base.key ==
				 child->internal.elms[0].base.key);
			child = children[count-1];
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[count].base.key ==
				 child->internal.elms[child->base.count].base.key);
		} else {
			subsubtype = 0;
		}
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			KKASSERT(child->base.subtype == subsubtype);
			for (j = 0; j < child->base.count; ++j) {
				elm = &child->internal.elms[j];

				root->internal.elms[n] = *elm;
				/*
				 * Each grandchild's parent pointer must be
				 * redirected to the root.
				 */
				subchild = hammer_bread(cursor->cluster,
							elm->subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer);
				if (subchild) {
					subchild->base.parent = root_offset;
					hammer_modify_buffer(subchild_buffer);
				}
				++n;
			}
			/* make sure the right boundary is correct */
			/* (this gets overwritten when the loop continues) */
			/* XXX generate a new separator? */
			root->internal.elms[n] = child->internal.elms[j];
		}
		root->base.type = HAMMER_BTREE_INTERNAL_NODE;
		root->base.subtype = subsubtype;
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer);
	}
	root_modified = 1;

	/*
	 * Cleanup
	 */
done:
	if (root_modified)
		hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i]);
	}
	return(error);
}
1591