Make fixes to the A-list initialization and properly allocate records
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.7 2007/11/26 21:38:37 dillon Exp $
35 */
36
37/*
38 * HAMMER B-Tree index
39 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER B-Tree.  Basically a B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leaves
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * instead of sub-tree pointers.
47 *
48 * A B-Tree internal node looks like this:
49 *
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
52 *
53 * A B-Tree leaf node basically looks like this:
54 *
 * L L L L L L L L  <-- leaf elements
56 *
 * The radix for an internal node is 1 less than a leaf but we get a
 * number of significant benefits for our troubles.
59 *
60 * The big benefit to using a B-Tree containing boundary information
61 * is that it is possible to cache pointers into the middle of the tree
62 * and not have to start searches, insertions, OR deletions at the root
63 * node. In particular, searches are able to progress in a definitive
64 * direction from any point in the tree without revisting nodes. This
65 * greatly improves the efficiency of many operations, most especially
66 * record appends.
67 *
68 * B-Trees also make the stacking of trees fairly straightforward.
69 *
 * INTER-CLUSTER ELEMENTS:  An element of an internal node may reference
 * the root of another cluster rather than a node in the current cluster.
 * This is known as an inter-cluster reference.  Only B-Tree searches
 * will cross cluster boundaries.  The rebalancing and collapse code does
 * not attempt to move children between clusters.  A major effect of this
 * is that we have to relax minimum element count requirements and allow
 * trees to become somewhat unbalanced.
77 *
78 * INSERTIONS AND DELETIONS: When inserting we split full nodes on our
79 * way down as an optimization. I originally experimented with rebalancing
80 * nodes on the way down for deletions but it created a huge mess due to
81 * the way inter-cluster linkages work. Instead, now I simply allow
82 * the tree to become unbalanced and allow leaf nodes to become empty.
83 * The delete code will try to clean things up from the bottom-up but
84 * will stop if related elements are not in-core or if it cannot get a node
85 * lock.
86 */
87#include "hammer.h"
88#include <sys/buf.h>
89#include <sys/buf2.h>
90
91static int btree_search(hammer_cursor_t cursor, int flags);
92static int btree_split_internal(hammer_cursor_t cursor);
93static int btree_split_leaf(hammer_cursor_t cursor);
94static int btree_remove(hammer_node_t node, int index);
95#if 0
96static int btree_rebalance(hammer_cursor_t cursor);
97static int btree_collapse(hammer_cursor_t cursor);
98#endif
99static int btree_node_is_full(hammer_node_ondisk_t node);
100static void hammer_make_separator(hammer_base_elm_t key1,
101 hammer_base_elm_t key2, hammer_base_elm_t dest);
102
/*
 * Iterate records after a search.  The cursor is iterated forwards past
 * the current record until a record matching the key-range requirements
 * is found.  ENOENT is returned if the iteration goes past the ending
 * key.
 *
 * key_beg/key_end is an INCLUSIVE range.  i.e. if you are scanning to load
 * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
 * offset of 4095.
 *
 * cursor->key_beg may or may not be modified by this function during
 * the iteration.
 */
int
hammer_btree_iterate(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	int error;
	int r;
#if 0
	int s;
	int64_t save_key;
#endif

	/*
	 * Skip past the current record if the caller has already consumed
	 * it (HAMMER_CURSOR_ATEDISK).
	 */
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index < node->count &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		++cursor->index;
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 *
		 * NOTE: This can pop us up to another cluster.
		 *
		 * If we are at the root of the root cluster, cursor_up
		 * returns ENOENT.
		 *
		 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
		 * when told to re-search for the cluster tag.
		 *
		 * XXX this could be optimized by storing the information in
		 * the parent reference.
		 */
		if (cursor->index == node->count) {
			error = hammer_cursor_up(cursor);
			if (error)
				break;
			node = cursor->node->ondisk;
			/* parent must not itself be exhausted */
			KKASSERT(cursor->index != node->count);
			++cursor->index;
			continue;
		}

		/*
		 * Iterate down the tree while we are at an internal node.
		 * Nodes cannot be empty, assert the case because if one is
		 * we will wind up in an infinite loop.
		 *
		 * We can avoid iterating through large swaths of transaction
		 * id space if the left and right separators are the same
		 * except for their transaction spaces.  We can then skip
		 * the node if the left and right transaction spaces are the
		 * same sign.  This directly optimizes accesses to files with
		 * HUGE transactional histories, such as database files,
		 * allowing us to avoid having to iterate through the entire
		 * history.
		 */
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			KKASSERT(node->count != 0);
			elm = &node->elms[cursor->index];
#if 0
			/*
			 * temporarily disable this optimization, it needs
			 * more of a theoretical review.
			 */
			if (elm[0].base.obj_id == elm[1].base.obj_id &&
			    elm[0].base.rec_type == elm[1].base.rec_type &&
			    elm[0].base.key == elm[1].base.key) {
				/*
				 * Left side transaction space
				 */
				save_key = cursor->key_beg.key;
				cursor->key_beg.key = elm[0].base.key;
				r = hammer_btree_cmp(&cursor->key_beg,
						     &elm[0].base);
				cursor->key_beg.key = save_key;

				/*
				 * Right side transaction space
				 */
				save_key = cursor->key_end.key;
				cursor->key_end.key = elm[1].base.key;
				s = hammer_btree_cmp(&cursor->key_end,
						     &elm[1].base);
				cursor->key_end.key = save_key;

				/*
				 * If our range is entirely on one side or
				 * the other side we can skip the sub-tree.
				 */
				if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
					++cursor->index;
					continue;
				}
			}
#endif
			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			node = cursor->node->ondisk;
			continue;
		}

		/*
		 * We are at a leaf.
		 *
		 * Determine if the record at the cursor has gone beyond the
		 * end of our range.  Remember that our key range is inclusive.
		 *
		 * When iterating we may have to 'pick out' records matching
		 * our transaction requirements.  A comparison return of
		 * +1 or -1 indicates a transactional record that is too
		 * old or too new but does not terminate the search.
		 */
		elm = &node->elms[cursor->index];
		r = hammer_btree_range_cmp(cursor, &elm->base);
		if (r == -1 || r == 1) {
			++cursor->index;
			continue;
		}

		/*
		 * We either found a match (r == 0) or are now out of
		 * bounds (any other r).
		 */
		error = r ? ENOENT : 0;
		break;
	}
	return(error);
}
255
256/*
257 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
258 * could not be found, and a fatal error otherwise.
259 *
260 * The cursor is suitably positioned for a deletion on success, and suitably
261 * positioned for an insertion on ENOENT.
262 *
263 * The cursor may begin anywhere, the search will traverse clusters in
264 * either direction to locate the requested element.
265 */
266int
267hammer_btree_lookup(hammer_cursor_t cursor)
268{
269 int error;
270
271 error = btree_search(cursor, 0);
272 if (error == 0 && cursor->flags)
273 error = hammer_btree_extract(cursor, cursor->flags);
274 return(error);
275}
276
/*
 * Extract the record and/or data associated with the cursor's current
 * position.  Any prior record or data stored in the cursor is replaced.
 * The cursor must be positioned at a leaf node.
 *
 * Returns 0 on success or an errno propagated from hammer_bread().
 *
 * NOTE: Only records can be extracted from internal B-Tree nodes, and
 *       only for inter-cluster references.  At the moment we only support
 *       extractions from leaf nodes.
 */
int
hammer_btree_extract(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_cluster_t cluster;
	u_int64_t buf_type;
	int32_t cloff;
	int error;

	/*
	 * A cluster record type has no data reference, the information
	 * is stored directly in the record and B-Tree element.
	 *
	 * The case where the data reference resolves to the same buffer
	 * as the record reference must be handled.
	 */
	node = cursor->node->ondisk;
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	elm = &node->elms[cursor->index];
	cluster = cursor->node->cluster;
	error = 0;

	/*
	 * Extract the record if requested.  cloff remembers the cluster
	 * offset of the record so the data case below can detect whether
	 * the data lives in the same filesystem buffer.
	 */
	if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
		cloff = elm->leaf.rec_offset;
		cursor->record = hammer_bread(cluster, cloff,
					      HAMMER_FSBUF_RECORDS, &error,
					      &cursor->record_buffer);
	} else {
		cloff = 0;
	}
	if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
		/*
		 * XORing the offsets and masking off the intra-buffer bits
		 * yields non-zero iff the two offsets fall in different
		 * buffers.
		 */
		if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
			/*
			 * The data is not in the same buffer as the last
			 * record we cached, but it could still be embedded
			 * in a record.  Note that we may not have loaded the
			 * record's buffer above, depending on flags.
			 */
			if ((elm->leaf.rec_offset ^ elm->leaf.data_offset) &
			    ~HAMMER_BUFMASK) {
				if (elm->leaf.data_len & HAMMER_BUFMASK)
					buf_type = HAMMER_FSBUF_DATA;
				else
					buf_type = 0;	/* pure data buffer */
			} else {
				buf_type = HAMMER_FSBUF_RECORDS;
			}
			cursor->data = hammer_bread(cluster,
						  elm->leaf.data_offset,
						  buf_type, &error,
						  &cursor->data_buffer);
		} else {
			/*
			 * Data in same buffer as record.  Note that we
			 * leave any existing data_buffer intact, even
			 * though we don't use it in this case, in case
			 * other records extracted during an iteration
			 * go back to it.
			 *
			 * Just assume the buffer type is correct.
			 */
			cursor->data = (void *)
				((char *)cursor->record_buffer->ondisk +
				 (elm->leaf.data_offset & HAMMER_BUFMASK));
		}
	}
	return(error);
}
355
356
357/*
358 * Insert a leaf element into the B-Tree at the current cursor position.
359 * The cursor is positioned such that the element at and beyond the cursor
360 * are shifted to make room for the new record.
361 *
362 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
363 * flag set and that call must return ENOENT before this function can be
364 * called.
365 *
366 * ENOSPC is returned if there is no room to insert a new record.
367 */
368int
369hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
370{
371 hammer_node_ondisk_t parent;
372 hammer_node_ondisk_t node;
373 int i;
374
375#if 0
376 /* HANDLED BY CALLER */
377 /*
378 * Issue a search to get our cursor at the right place. The search
379 * will get us to a leaf node.
380 *
381 * The search also does some setup for our insert, so there is always
382 * room in the leaf.
383 */
384 error = btree_search(cursor, HAMMER_CURSOR_INSERT);
385 if (error != ENOENT) {
386 if (error == 0)
387 error = EEXIST;
388 return (error);
389 }
390#endif
391
392 /*
393 * Insert the element at the leaf node and update the count in the
394 * parent. It is possible for parent to be NULL, indicating that
395 * the root of the B-Tree in the cluster is a leaf. It is also
396 * possible for the leaf to be empty.
397 *
398 * Remember that the right-hand boundary is not included in the
399 * count.
400 */
401 node = cursor->node->ondisk;
402 i = cursor->index;
403 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
404 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
405 if (i != node->count) {
406 bcopy(&node->elms[i], &node->elms[i+1],
407 (node->count - i) * sizeof(*elm));
408 }
409 node->elms[i] = *elm;
410 ++node->count;
411 hammer_modify_node(cursor->node);
412
413 /*
414 * Adjust the sub-tree count in the parent. note that the parent
415 * may be in a different cluster.
416 */
417 if (cursor->parent) {
418 parent = cursor->parent->ondisk;
419 i = cursor->parent_index;
420 ++parent->elms[i].internal.subtree_count;
421 KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
422 hammer_modify_node(cursor->parent);
423 }
424 return(0);
425}
426
/*
 * Delete a record from the B-Tree at the current cursor position.
 * The cursor is positioned such that the current element is the one
 * to be deleted.
 *
 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_DELETE
 * flag set and that call must return 0 before this function can be
 * called.
 *
 * It is possible that we will be asked to delete the last element in a
 * leaf.  This case only occurs if the downward search was unable to
 * rebalance us, which in turn can occur if our parent has inter-cluster
 * elements.  So the 0-element case for a leaf is allowed.
 */
int
hammer_btree_delete(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent;
	hammer_btree_elm_t elm;
	int error;
	int i;

#if 0
	/* HANDLED BY CALLER */
	/*
	 * Locate the leaf element to delete.  The search is also responsible
	 * for doing some of the rebalancing work on its way down.
	 */
	error = btree_search(cursor, HAMMER_CURSOR_DELETE);
	if (error)
		return (error);
#endif

	/*
	 * Delete the element from the leaf node by shifting the elements
	 * after it down one slot.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	i = cursor->index;

	KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
	if (i + 1 != ondisk->count) {
		bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
		      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
	}
	--ondisk->count;
	if (cursor->parent != NULL) {
		/*
		 * Adjust parent's notion of the leaf's count.  subtree_count
		 * is only approximate, it is allowed to be too small but
		 * never allowed to be too large.  Make sure we don't drop
		 * the count below 0.
		 */
		parent = cursor->parent;
		elm = &parent->ondisk->elms[cursor->parent_index];
		if (elm->internal.subtree_count)
			--elm->internal.subtree_count;
		KKASSERT(elm->internal.subtree_count <= ondisk->count);
		hammer_modify_node(parent);
	}

	/*
	 * If the leaf is now empty try to remove the subtree reference
	 * at (parent, parent_index).  This will unbalance the tree.
	 *
	 * Note that internal nodes must have at least one element
	 * so their boundary information is properly laid out.  If
	 * we would cause our parent to become empty we try to
	 * recurse up the tree, but if that doesn't work (EAGAIN) we
	 * just leave the tree with an empty leaf.
	 */
	if (ondisk->count == 0) {
		error = btree_remove(cursor->parent, cursor->parent_index);
		if (error == 0) {
			hammer_free_btree(node->cluster, node->node_offset);
		} else if (error == EAGAIN) {
			hammer_modify_node(node);
			error = 0;
		} /* else a real error occurred XXX */
	} else {
		hammer_modify_node(node);
		error = 0;
	}
	return(error);
}
517
/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
 *
 * The search begins at the current node and will instantiate a NULL
 * parent if necessary and if not at the root of the cluster.  On return
 * parent will be non-NULL unless the cursor is sitting at a root-leaf.
 *
 * The search code may be forced to iterate up the tree if the conditions
 * required for an insertion or deletion are not met.  This does not occur
 * very often.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.
 *
 * DELETIONS: The search will rebalance the tree on its way down.
 *
 * Returns 0 on an exact match, ENOENT with the cursor at the insertion
 * point when no match was found, or a fatal error.
 */
static
int
btree_search(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_cluster_t cluster;
	int error;
	int i;
	int r;

	flags |= cursor->flags;

	/*
	 * Move our cursor up the tree until we find a node whose range
	 * covers the key we are trying to locate.  This may move us
	 * between clusters.
	 *
	 * The left bound is inclusive, the right bound is non-inclusive.
	 * It is ok to cursor up too far when cursoring across a cluster
	 * boundary.
	 *
	 * First see if we can skip the whole cluster.  hammer_cursor_up()
	 * handles both cases but this way we don't check the cluster
	 * bounds when going up the tree within a cluster.
	 */
	cluster = cursor->node->cluster;
	while (
	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
	    hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
		error = hammer_cursor_toroot(cursor);
		if (error)
			goto done;
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
		cluster = cursor->node->cluster;
	}

	/*
	 * Deal with normal cursoring within a cluster.  The right bound
	 * is non-inclusive.  That is, the bounds form a separator.
	 */
	while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
	       hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
	}

	/*
	 * We better have ended up with a node somewhere, and our second
	 * while loop had better not have traversed up a cluster.
	 */
	KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);

	/*
	 * If we are inserting we can't start at a full node if the parent
	 * is also full (because there is no way to split the node),
	 * continue running up the tree until we hit the root of the
	 * root cluster or until the requirement is satisfied.
	 *
	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
	 *
	 * XXX as an optimization it should be possible to unbalance the tree
	 * and stop at the root of the current cluster.
	 */
	while (flags & HAMMER_CURSOR_INSERT) {
		if (btree_node_is_full(cursor->node->ondisk) == 0)
			break;
		if (cursor->parent == NULL)
			break;
		if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
			break;
		error = hammer_cursor_up(cursor);
		/* cluster and node may now be stale */
		if (error)
			goto done;
	}
	/* cluster = cursor->node->cluster; not needed until next cluster = */

#if 0
	/*
	 * If we are deleting we can't start at an internal node with only
	 * one element unless it is root, because all of our code assumes
	 * that internal nodes will never be empty.  Just do this generally
	 * for both leaf and internal nodes to get better balance.
	 *
	 * This handles the case where the cursor is sitting at a leaf and
	 * either the leaf or parent contain an insufficient number of
	 * elements.
	 *
	 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
	 *
	 * XXX NOTE: Iterations may not set this flag anyway.
	 */
	while (flags & HAMMER_CURSOR_DELETE) {
		if (cursor->node->ondisk->count > 1)
			break;
		if (cursor->parent == NULL)
			break;
		KKASSERT(cursor->node->ondisk->count != 0);
		error = hammer_cursor_up(cursor);
		/* cluster and node may now be stale */
		if (error)
			goto done;
	}
#endif

/*new_cluster:*/
	/*
	 * Push down through internal nodes to locate the requested key.
	 */
	cluster = cursor->node->cluster;
	node = cursor->node->ondisk;
	while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
#if 0
		/*
		 * If we are at the root node and deleting, try to collapse
		 * all of the root's children into the root.  This is the
		 * only point where tree depth is reduced.
		 *
		 * XXX NOTE: Iterations may not set this flag anyway.
		 */
		if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
			error = btree_collapse(cursor);
			/* node becomes stale after call */
			if (error)
				goto done;
		}
		node = cursor->node->ondisk;
#endif

		/*
		 * Scan the node to find the subtree index to push down into.
		 * We go one-past, then back-up.  The key should never be
		 * less than the left-hand boundary so i should never wind
		 * up 0.
		 */
		for (i = 0; i < node->count; ++i) {
			r = hammer_btree_cmp(&cursor->key_beg,
					     &node->elms[i].base);
			if (r < 0)
				break;
		}
		KKASSERT(i != 0);

		/*
		 * The push-down index is now i - 1.
		 */
		--i;
		cursor->index = i;

		/*
		 * Handle insertion and deletion requirements.
		 *
		 * If inserting split full nodes.  The split code will
		 * adjust cursor->node and cursor->index if the current
		 * index winds up in the new node.
		 */
		if (flags & HAMMER_CURSOR_INSERT) {
			if (node->count == HAMMER_BTREE_INT_ELMS) {
				error = btree_split_internal(cursor);
				if (error)
					goto done;
				/*
				 * reload stale pointers
				 */
				i = cursor->index;
				node = cursor->node->ondisk;
			}
		}

#if 0
		/*
		 * If deleting rebalance - do not allow the child to have
		 * just one element or we will not be able to delete it.
		 *
		 * Neither internal or leaf nodes (except a root-leaf) are
		 * allowed to drop to 0 elements.  (XXX - well, leaf nodes
		 * can at the moment).
		 *
		 * Our separators may have been reorganized after rebalancing,
		 * so we have to pop back up and rescan.
		 *
		 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
		 * for hysteresis?
		 *
		 * XXX NOTE: Iterations may not set this flag anyway.
		 */
		if (flags & HAMMER_CURSOR_DELETE) {
			if (node->elms[i].internal.subtree_count <= 1) {
				error = btree_rebalance(cursor);
				if (error)
					goto done;
				/* cursor->index is invalid after call */
				goto new_cluster;
			}
		}
#endif

		/*
		 * Push down (push into new node, existing node becomes
		 * the parent).
		 */
		error = hammer_cursor_down(cursor);
		/* node and cluster become stale */
		if (error)
			goto done;
		node = cursor->node->ondisk;
		cluster = cursor->node->cluster;
	}

	/*
	 * We are at a leaf, do a linear search of the key array.
	 * (XXX do a binary search).  On success the index is set to the
	 * matching element, on failure the index is set to the insertion
	 * point.
	 *
	 * Boundaries are not stored in leaf nodes, so the index can wind
	 * up to the left of element 0 (index == 0) or past the end of
	 * the array (index == node->count).
	 */
	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);

	for (i = 0; i < node->count; ++i) {
		r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);

		/*
		 * Stop if we've flipped past key_beg
		 */
		if (r < 0)
			break;

		/*
		 * Return an exact match
		 */
		if (r == 0) {
			cursor->index = i;
			error = 0;
			goto done;
		}
	}

	/*
	 * No exact match was found, i is now at the insertion point.
	 *
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
	 */
	cursor->index = i;
	if ((flags & HAMMER_CURSOR_INSERT) &&
	    node->count == HAMMER_BTREE_LEAF_ELMS) {
		error = btree_split_leaf(cursor);
		/* NOT USED
		i = cursor->index;
		node = &cursor->node->internal;
		*/
		if (error)
			goto done;
	}
	error = ENOENT;
done:
	return(error);
}
801
802
803/************************************************************************
804 * SPLITTING AND MERGING *
805 ************************************************************************
806 *
807 * These routines do all the dirty work required to split and merge nodes.
808 */
809
810/*
811 * Split an internal node into two nodes and move the separator at the split
812 * point to the parent. Note that the parent's parent's element pointing
813 * to our parent will have an incorrect subtree_count (we don't update it).
814 * It will be low, which is ok.
815 *
816 * (cursor->node, cursor->index) indicates the element the caller intends
817 * to push into. We will adjust node and index if that element winds
818 * up in the split node.
819 *
820 * If we are at the root of a cluster a new root must be created with two
821 * elements, one pointing to the original root and one pointing to the
822 * newly allocated split node.
823 *
824 * NOTE! Being at the root of a cluster is different from being at the
825 * root of the root cluster. cursor->parent will not be NULL and
826 * cursor->node->ondisk.parent must be tested against 0. Theoretically
827 * we could propogate the algorithm into the parent and deal with multiple
828 * 'roots' in the cluster header, but it's easier not to.
829 */
830static
831int
832btree_split_internal(hammer_cursor_t cursor)
833{
834 hammer_node_ondisk_t ondisk;
835 hammer_node_t node;
836 hammer_node_t parent;
837 hammer_node_t new_node;
838 hammer_btree_elm_t elm;
839 hammer_btree_elm_t parent_elm;
840 int parent_index;
841 int made_root;
842 int split;
843 int error;
844 const int esize = sizeof(*elm);
845
846 /*
847 * We are splitting but elms[split] will be promoted to the parent,
848 * leaving the right hand node with one less element. If the
849 * insertion point will be on the left-hand side adjust the split
850 * point to give the right hand side one additional node.
851 */
852 node = cursor->node;
853 ondisk = node->ondisk;
854 split = (ondisk->count + 1) / 2;
855 if (cursor->index <= split)
856 --split;
857 error = 0;
858
859 /*
860 * If we are at the root of the cluster, create a new root node with
861 * 1 element and split normally. Avoid making major modifications
862 * until we know the whole operation will work.
863 *
864 * The root of the cluster is different from the root of the root
865 * cluster. Use the node's on-disk structure's parent offset to
866 * detect the case.
867 */
868 if (ondisk->parent == 0) {
869 parent = hammer_alloc_btree(node->cluster, &error);
870 if (parent == NULL)
871 return(error);
872 hammer_lock_ex(&parent->lock);
873 ondisk = parent->ondisk;
874 ondisk->count = 1;
875 ondisk->parent = 0;
876 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
877 ondisk->elms[0].base = node->cluster->clu_btree_beg;
878 ondisk->elms[0].internal.subtree_type = node->ondisk->type;
879 ondisk->elms[0].internal.subtree_offset = node->node_offset;
880 ondisk->elms[1].base = node->cluster->clu_btree_end;
881 made_root = 1;
882 parent_index = 0; /* index of current node in parent */
883 } else {
884 made_root = 0;
885 parent = cursor->parent;
886 parent_index = cursor->parent_index;
887 }
888
889 /*
890 * Split node into new_node at the split point.
891 *
892 * B O O O P N N B <-- P = node->elms[split]
893 * 0 1 2 3 4 5 6 <-- subtree indices
894 *
895 * x x P x x
896 * s S S s
897 * / \
898 * B O O O B B N N B <--- inner boundary points are 'P'
899 * 0 1 2 3 4 5 6
900 *
901 */
902 new_node = hammer_alloc_btree(node->cluster, &error);
903 if (new_node == NULL) {
904 if (made_root) {
905 hammer_unlock(&parent->lock);
906 hammer_free_btree(node->cluster, parent->node_offset);
907 hammer_rel_node(parent);
908 }
909 return(error);
910 }
911 hammer_lock_ex(&new_node->lock);
912
913 /*
914 * Create the new node. P becomes the left-hand boundary in the
915 * new node. Copy the right-hand boundary as well.
916 *
917 * elm is the new separator.
918 */
919 ondisk = node->ondisk;
920 elm = &ondisk->elms[split];
921 bcopy(elm, &new_node->ondisk->elms[0],
922 (ondisk->count - split + 1) * esize);
923 new_node->ondisk->count = ondisk->count - split;
924 new_node->ondisk->parent = parent->node_offset;
925 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
926 KKASSERT(ondisk->type == new_node->ondisk->type);
927
928 /*
929 * Cleanup the original node. P becomes the new boundary, its
930 * subtree_offset was moved to the new node. If we had created
931 * a new root its parent pointer may have changed.
932 */
933 elm->internal.subtree_offset = 0;
934 ondisk->count = split;
935
936 /*
937 * Insert the separator into the parent, fixup the parent's
938 * reference to the original node, and reference the new node.
939 * The separator is P.
940 *
941 * Remember that base.count does not include the right-hand boundary.
942 */
943 ondisk = parent->ondisk;
944 ondisk->elms[parent_index].internal.subtree_count = split;
945 parent_elm = &ondisk->elms[parent_index+1];
946 bcopy(parent_elm, parent_elm + 1,
947 (ondisk->count - parent_index) * esize);
948 parent_elm->internal.base = elm->base; /* separator P */
949 parent_elm->internal.subtree_offset = new_node->node_offset;
950 parent_elm->internal.subtree_count = new_node->ondisk->count;
951 ++ondisk->count;
952
953 /*
954 * The cluster's root pointer may have to be updated.
955 */
956 if (made_root) {
957 node->cluster->ondisk->clu_btree_root = parent->node_offset;
958 hammer_modify_cluster(node->cluster);
959 node->ondisk->parent = parent->node_offset;
960 if (cursor->parent) {
961 hammer_unlock(&cursor->parent->lock);
962 hammer_rel_node(cursor->parent);
963 }
964 cursor->parent = parent; /* lock'd and ref'd */
965 }
966
967 hammer_modify_node(node);
968 hammer_modify_node(new_node);
969 hammer_modify_node(parent);
970
971 /*
972 * Ok, now adjust the cursor depending on which element the original
973 * index was pointing at. If we are >= the split point the push node
974 * is now in the new node.
975 *
976 * NOTE: If we are at the split point itself we cannot stay with the
977 * original node because the push index will point at the right-hand
978 * boundary, which is illegal.
979 *
980 * NOTE: The cursor's parent or parent_index must be adjusted for
981 * the case where a new parent (new root) was created, and the case
982 * where the cursor is now pointing at the split node.
983 */
984 if (cursor->index >= split) {
985 cursor->parent_index = parent_index + 1;
986 cursor->index -= split;
987 hammer_unlock(&cursor->node->lock);
988 hammer_rel_node(cursor->node);
989 cursor->node = new_node; /* locked and ref'd */
990 } else {
991 cursor->parent_index = parent_index;
992 hammer_unlock(&new_node->lock);
993 hammer_rel_node(new_node);
994 }
995
996 /*
997 * Fixup left and right bounds
998 */
999 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1000 cursor->left_bound = &elm[0].internal.base;
1001 cursor->right_bound = &elm[1].internal.base;
1002
1003 return (0);
1004}
1005
1006/*
1007 * Same as the above, but splits a full leaf node.
1008 */
1009static
1010int
1011btree_split_leaf(hammer_cursor_t cursor)
1012{
1013 hammer_node_ondisk_t ondisk;
1014 hammer_node_t parent;
1015 hammer_node_t leaf;
1016 hammer_node_t new_leaf;
1017 hammer_btree_elm_t elm;
1018 hammer_btree_elm_t parent_elm;
1019 int parent_index;
1020 int made_root;
1021 int split;
1022 int error;
1023 const size_t esize = sizeof(*elm);
1024
1025 /*
1026 * Calculate the split point. If the insertion point will be on
1027 * the left-hand side adjust the split point to give the right
1028 * hand side one additional node.
1029 */
1030 leaf = cursor->node;
1031 ondisk = leaf->ondisk;
1032 split = (ondisk->count + 1) / 2;
1033 if (cursor->index <= split)
1034 --split;
1035 error = 0;
1036
1037 /*
1038 * If we are at the root of the tree, create a new root node with
1039 * 1 element and split normally. Avoid making major modifications
1040 * until we know the whole operation will work.
1041 */
1042 if (ondisk->parent == 0) {
1043 parent = hammer_alloc_btree(leaf->cluster, &error);
1044 if (parent == NULL)
1045 return(error);
1046 hammer_lock_ex(&parent->lock);
1047 ondisk = parent->ondisk;
1048 ondisk->count = 1;
1049 ondisk->parent = 0;
1050 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1051 ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1052 ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1053 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1054 ondisk->elms[1].base = leaf->cluster->clu_btree_end;
1055 made_root = 1;
1056 parent_index = 0; /* insertion point in parent */
1057 } else {
1058 made_root = 0;
1059 parent = cursor->parent;
1060 parent_index = cursor->parent_index;
1061 }
1062
1063 /*
1064 * Split leaf into new_leaf at the split point. Select a separator
1065 * value in-between the two leafs but with a bent towards the right
1066 * leaf since comparisons use an 'elm >= separator' inequality.
1067 *
1068 * L L L L L L L L
1069 *
1070 * x x P x x
1071 * s S S s
1072 * / \
1073 * L L L L L L L L
1074 */
1075 new_leaf = hammer_alloc_btree(leaf->cluster, &error);
1076 if (new_leaf == NULL) {
1077 if (made_root) {
1078 hammer_unlock(&parent->lock);
1079 hammer_free_btree(leaf->cluster, parent->node_offset);
1080 hammer_rel_node(parent);
1081 }
1082 return(error);
1083 }
1084 hammer_lock_ex(&new_leaf->lock);
1085
1086 /*
1087 * Create the new node. P become the left-hand boundary in the
1088 * new node. Copy the right-hand boundary as well.
1089 */
1090 ondisk = leaf->ondisk;
1091 elm = &ondisk->elms[split];
1092 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1093 new_leaf->ondisk->count = ondisk->count - split;
1094 new_leaf->ondisk->parent = parent->node_offset;
1095 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1096 KKASSERT(ondisk->type == new_leaf->ondisk->type);
1097
1098 /*
1099 * Cleanup the original node. Because this is a leaf node and
1100 * leaf nodes do not have a right-hand boundary, there
1101 * aren't any special edge cases to clean up. We just fixup the
1102 * count.
1103 */
1104 ondisk->count = split;
1105
1106 /*
1107 * Insert the separator into the parent, fixup the parent's
1108 * reference to the original node, and reference the new node.
1109 * The separator is P.
1110 *
1111 * Remember that base.count does not include the right-hand boundary.
1112 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1113 */
1114 ondisk = parent->ondisk;
1115 ondisk->elms[parent_index].internal.subtree_count = split;
1116 parent_elm = &ondisk->elms[parent_index+1];
1117 if (parent_index + 1 != ondisk->count) {
1118 bcopy(parent_elm, parent_elm + 1,
1119 (ondisk->count - parent_index - 1) * esize);
1120 }
1121 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1122 parent_elm->internal.subtree_offset = new_leaf->node_offset;
1123 parent_elm->internal.subtree_count = new_leaf->ondisk->count;
1124 ++ondisk->count;
1125
1126 /*
1127 * The cluster's root pointer may have to be updated.
1128 */
1129 if (made_root) {
1130 leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1131 hammer_modify_cluster(leaf->cluster);
1132 leaf->ondisk->parent = parent->node_offset;
1133 if (cursor->parent) {
1134 hammer_unlock(&cursor->parent->lock);
1135 hammer_rel_node(cursor->parent);
1136 }
1137 cursor->parent = parent; /* lock'd and ref'd */
1138 }
1139
1140 hammer_modify_node(leaf);
1141 hammer_modify_node(new_leaf);
1142 hammer_modify_node(parent);
1143
1144 /*
1145 * Ok, now adjust the cursor depending on which element the original
1146 * index was pointing at. If we are >= the split point the push node
1147 * is now in the new node.
1148 *
1149 * NOTE: If we are at the split point itself we cannot stay with the
1150 * original node because the push index will point at the right-hand
1151 * boundary, which is illegal.
1152 */
1153 if (cursor->index >= split) {
1154 cursor->parent_index = parent_index + 1;
1155 cursor->index -= split;
1156 hammer_unlock(&cursor->node->lock);
1157 hammer_rel_node(cursor->node);
1158 cursor->node = new_leaf;
1159 } else {
1160 cursor->parent_index = parent_index;
1161 hammer_unlock(&new_leaf->lock);
1162 hammer_rel_node(new_leaf);
1163 }
1164
1165 /*
1166 * Fixup left and right bounds
1167 */
1168 parent_elm = &parent->ondisk->elms[cursor->parent_index];
1169 cursor->left_bound = &elm[0].internal.base;
1170 cursor->right_bound = &elm[1].internal.base;
1171
1172 return (0);
1173}
1174
1175/*
1176 * Remove the element at (node, index). If the internal node would become
1177 * empty passively recurse up the tree.
1178 *
1179 * A locked internal node is passed to this function, the node remains
1180 * locked on return. Leaf nodes cannot be passed to this function.
1181 *
1182 * Returns EAGAIN if we were unable to acquire the needed locks. The caller
1183 * does not deal with the empty leaf until determines whether this recursion
1184 * has succeeded or not.
1185 */
1186int
1187btree_remove(hammer_node_t node, int index)
1188{
1189 hammer_node_ondisk_t ondisk;
1190 hammer_node_t parent;
1191 int error;
1192
1193 ondisk = node->ondisk;
1194 KKASSERT(ondisk->count > 0);
1195
1196 /*
1197 * Remove the element, shifting remaining elements left one.
1198 * Note that our move must include the right-boundary element.
1199 */
1200 if (ondisk->count != 1) {
1201 bcopy(&ondisk->elms[index+1], &ondisk->elms[index],
1202 (ondisk->count - index) * sizeof(ondisk->elms[0]));
1203 --ondisk->count;
1204 hammer_modify_node(node);
1205 return(0);
1206 }
1207
1208 /*
1209 * Internal nodes cannot drop to 0 elements, so remove the node
1210 * from ITS parent. If the node is the root node, convert it to
1211 * an empty leaf node (which can drop to 0 elements).
1212 */
1213 if (ondisk->parent == 0) {
1214 ondisk->count = 0;
1215 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1216 hammer_modify_node(node);
1217 return(0);
1218 }
1219
1220 /*
1221 * Try to remove the node from its parent. Return EAGAIN if we
1222 * cannot.
1223 */
1224 parent = hammer_get_node(node->cluster, ondisk->parent, &error);
1225 if (hammer_lock_ex_try(&parent->lock)) {
1226 hammer_rel_node(parent);
1227 return(EAGAIN);
1228 }
1229 ondisk = parent->ondisk;
1230 for (index = 0; index < ondisk->count; ++index) {
1231 if (ondisk->elms[index].internal.subtree_offset ==
1232 node->node_offset) {
1233 break;
1234 }
1235 }
1236 if (index == ondisk->count) {
1237 kprintf("btree_remove: lost parent linkage to node\n");
1238 error = EIO;
1239 } else {
1240 error = btree_remove(parent, index);
1241 if (error == 0) {
1242 hammer_free_btree(node->cluster, node->node_offset);
1243 /* NOTE: node can be reallocated at any time now */
1244 }
1245 }
1246 hammer_unlock(&parent->lock);
1247 hammer_rel_node(parent);
1248 return (error);
1249}
1250
1251#if 0
1252
1253/*
1254 * This routine is called on the internal node (node) prior to recursing down
1255 * through (node, index) when the node referenced by (node, index) MIGHT
1256 * have too few elements for the caller to perform a deletion.
1257 *
1258 * cursor->index is invalid on return because the separators may have gotten
1259 * adjusted, the caller must rescan the node's elements. The caller may set
1260 * cursor->index to -1 if it wants us to do a general rebalancing.
1261 *
1262 * This routine rebalances the children of the (node), collapsing children
1263 * together if possible. On return each child will have at least L/2-1
1264 * elements unless the node only has one child.
1265 *
1266 * NOTE: Because we do not update the parent's parent in the split code,
1267 * the subtree_count used by the caller may be incorrect. We correct it
1268 * here. Also note that we cannot change the depth of the tree's leaf
1269 * nodes here (see btree_collapse()).
1270 *
1271 * NOTE: We make no attempt to rebalance inter-cluster elements.
1272 */
static
int
btree_rebalance(hammer_cursor_t cursor)
{
	/*
	 * NOTE(review): this function is inside an #if 0 block and does not
	 * compile as written: it contains literal 'XXX' placeholders,
	 * references an undeclared child_buffer[] array, and uses 'node'
	 * without ever assigning it.  It is retained as design scaffolding
	 * only -- do not enable without finishing the port to the
	 * hammer_node API.
	 */
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t children[HAMMER_BTREE_INT_ELMS];
	hammer_node_t child;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t elms;
	int i, j, n, nelms, goal;
	int maxelms, halfelms;
	int error;

	/*
	 * If the elm being recursed through is an inter-cluster reference,
	 * don't worry about it.
	 */
	ondisk = cursor->node->ondisk;
	elm = &ondisk->elms[cursor->index];
	if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
		return(0);

	KKASSERT(elm->internal.subtree_offset != 0);
	error = 0;

	/*
	 * Load the children of node and do any necessary corrections
	 * to subtree_count.  subtree_count may be too low due to the
	 * way insertions split nodes.  Get a count of the total number
	 * of actual elements held by our children.
	 */
	error = 0;

	/* NOTE(review): 'node' is never initialized before this loop */
	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &node->elms[i];
		children[i] = NULL;
		child_buffer[i] = NULL;	/* must be preinitialized for bread */
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i], XXX);
		children[i] = child;
		if (child == NULL)
			continue;
		XXX
		KKASSERT(node->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the node's count
		 * if it was wrong.
		 */
		if (node->elms[i].subtree_count != child->base.count) {
			node->elms[i].subtree_count = child->base.count;
		}
		n += node->elms[i].subtree_count;
	}
	if (error)
		goto failed;

	/*
	 * Collect all the children's elements together
	 */
	nelms = n;
	elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
	for (i = n = 0; i < node->base.count; ++i) {
		child = children[i];
		for (j = 0; j < child->base.count; ++j) {
			elms[n].owner = child;
			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF)
				elms[n].u.leaf = child->leaf.elms[j];
			else
				elms[n].u.internal = child->internal.elms[j];
			++n;
		}
	}
	KKASSERT(n == nelms);

	/*
	 * Store a boundary in the elms array to ease the code below.  This
	 * is only used if the children are internal nodes.
	 */
	elms[n].u.internal = node->elms[i];

	/*
	 * Calculate the number of elements each child should have (goal) by
	 * reducing the number of elements until we achieve at least
	 * halfelms - 1 per child, unless we are a degenerate case.
	 */
	maxelms = btree_max_elements(node->base.subtype);
	halfelms = maxelms / 2;

	goal = halfelms - 1;
	while (i && n / i < goal)
		--i;

	/*
	 * Now rebalance using the specified goal
	 */
	for (i = n = 0; i < node->base.count; ++i) {
		struct hammer_buffer *subchild_buffer = NULL;
		struct hammer_btree_internal_node *subchild;

		child = children[i];
		for (j = 0; j < goal && n < nelms; ++j) {
			if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
				child->leaf.elms[j] = elms[n].u.leaf;
			} else {
				child->internal.elms[j] = elms[n].u.internal;
			}

			/*
			 * If the element's parent has changed we have to
			 * update the parent pointer.  This is somewhat
			 * expensive.
			 */
			if (elms[n].owner != child &&
			    node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) {
				subchild = hammer_bread(cursor->cluster,
							elms[n].u.internal.subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer, XXX);
				if (subchild) {
					subchild->base.parent =
					    hammer_bclu_offset(child_buffer[i],
								child);
					hammer_modify_buffer(subchild_buffer);
				}
				/* XXX error */
			}
			++n;
		}
		/*
		 * Set right boundary if the children are internal nodes.
		 */
		if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL)
			child->internal.elms[j] = elms[n].u.internal;
		child->base.count = j;
		hammer_modify_buffer(child_buffer[i]);
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer, 0);

		/*
		 * If we have run out of elements, break out
		 */
		if (n == nelms)
			break;
	}

	/*
	 * Physically destroy any left-over children.  These children's
	 * elements have been packed into prior children.  The node's
	 * right hand boundary and count gets shifted to index i.
	 *
	 * The subtree count in the node's parent MUST be updated because
	 * we are removing elements.  The subtree_count field is allowed to
	 * be too small, but not too large!
	 */
	if (i != node->base.count) {
		n = i;
		node->elms[n] = node->elms[node->base.count];
		while (i < node->base.count) {
			hammer_free_btree_ptr(child_buffer[i], children[i]);
			hammer_put_buffer(child_buffer[i], 0);
			++i;
		}
		node->base.count = n;
		if (cursor->parent) {
			cursor->parent->elms[cursor->parent_index].subtree_count = n;
			hammer_modify_buffer(cursor->parent_buffer);
		}
	}

	kfree(elms, M_HAMMER);
failed:
	hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < node->base.count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i], 0);
	}
	return (error);
}
1459
1460/*
1461 * This routine is only called if the cursor is at the root node and the
1462 * root node is an internal node. We attempt to collapse the root node
1463 * by replacing it with all of its children, reducing tree depth by one.
1464 *
1465 * This is the only way to reduce tree depth in a HAMMER filesystem.
1466 * Note that all leaf nodes are at the same depth.
1467 *
1468 * This is a fairly expensive operation because we not only have to load
1469 * the root's children, we also have to scan each child and adjust the
1470 * parent offset for each element in each child. Nasty all around.
1471 */
static
int
btree_collapse(hammer_cursor_t cursor)
{
	/*
	 * NOTE(review): this function is inside an #if 0 block and does
	 * not compile as written (literal 'XXX' placeholders in the
	 * hammer_bread calls; types such as hammer_btree_node_ondisk_t
	 * predate the current hammer_node API).  Retained as design
	 * scaffolding only.
	 */
	hammer_btree_node_ondisk_t root, child;
	hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
	struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
	int count;
	int i, j, n;
	int root_modified;
	int error;
	int32_t root_offset;
	u_int8_t subsubtype;

	root = cursor->node;
	count = root->base.count;
	root_offset = hammer_bclu_offset(cursor->node_buffer, root);

	/*
	 * Sum up the number of children each element has.  This value is
	 * only approximate due to the way the insertion node works.  It
	 * may be too small but it will never be too large.
	 *
	 * Quickly terminate the collapse if the elements have too many
	 * children.
	 */
	KKASSERT(root->base.parent == 0);	/* must be root node */
	KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
	KKASSERT(count <= HAMMER_BTREE_INT_ELMS);

	for (i = n = 0; i < count; ++i) {
		n += root->internal.elms[i].subtree_count;
	}
	if (n > btree_max_elements(root->base.subtype))
		return(0);

	/*
	 * Iterate through the elements again and correct the subtree_count.
	 * Terminate the collapse if we wind up with too many.
	 */
	error = 0;
	root_modified = 0;

	for (i = n = 0; i < count; ++i) {
		struct hammer_btree_internal_elm *elm;

		elm = &root->internal.elms[i];
		child_buffer[i] = NULL;
		children[i] = NULL;
		if (elm->subtree_offset == 0)
			continue;
		child = hammer_bread(cursor->cluster, elm->subtree_offset,
				     HAMMER_FSBUF_BTREE, &error,
				     &child_buffer[i], XXX);
		children[i] = child;
		if (child == NULL)
			continue;
		KKASSERT(root->base.subtype == child->base.type);

		/*
		 * Accumulate n for a good child, update the root's count
		 * if it was wrong.
		 */
		if (root->internal.elms[i].subtree_count != child->base.count) {
			root->internal.elms[i].subtree_count = child->base.count;
			root_modified = 1;
		}
		n += root->internal.elms[i].subtree_count;
	}
	if (error || n > btree_max_elements(root->base.subtype))
		goto done;

	/*
	 * Ok, we can collapse the root.  If the root's children are leafs
	 * the collapse is really simple.  If they are internal nodes the
	 * collapse is not so simple because we have to fixup the parent
	 * pointers for the root's children's children.
	 *
	 * When collapsing an internal node the far left and far right
	 * element's boundaries should match the root's left and right
	 * boundaries.
	 */
	if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			for (j = 0; j < child->base.count; ++j) {
				root->leaf.elms[n] = child->leaf.elms[j];
				++n;
			}
		}
		root->base.type = root->base.subtype;
		root->base.subtype = 0;
		root->base.count = n;
		root->leaf.link_left = 0;
		root->leaf.link_right = 0;
	} else {
		struct hammer_btree_internal_elm *elm;
		struct hammer_btree_internal_node *subchild;
		struct hammer_buffer *subchild_buffer = NULL;

		if (count) {
			child = children[0];
			subsubtype = child->base.subtype;
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[0].base.key ==
				 child->internal.elms[0].base.key);
			child = children[count-1];
			KKASSERT(child->base.count > 0);
			KKASSERT(root->internal.elms[count].base.key ==
				 child->internal.elms[child->base.count].base.key);
		} else {
			subsubtype = 0;
		}
		for (i = n = 0; i < count; ++i) {
			child = children[i];
			KKASSERT(child->base.subtype == subsubtype);
			for (j = 0; j < child->base.count; ++j) {
				elm = &child->internal.elms[j];

				root->internal.elms[n] = *elm;
				subchild = hammer_bread(cursor->cluster,
							elm->subtree_offset,
							HAMMER_FSBUF_BTREE,
							&error,
							&subchild_buffer,
							XXX);
				if (subchild) {
					subchild->base.parent = root_offset;
					hammer_modify_buffer(subchild_buffer);
				}
				++n;
			}
			/* make sure the right boundary is correct */
			/* (this gets overwritten when the loop continues) */
			/* XXX generate a new separator? */
			root->internal.elms[n] = child->internal.elms[j];
		}
		root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
		root->base.subtype = subsubtype;
		if (subchild_buffer)
			hammer_put_buffer(subchild_buffer, 0);
	}
	root_modified = 1;

	/*
	 * Cleanup
	 */
done:
	if (root_modified)
		hammer_modify_buffer(cursor->node_buffer);
	for (i = 0; i < count; ++i) {
		if (child_buffer[i])
			hammer_put_buffer(child_buffer[i], 0);
	}
	return(error);
}
1628
1629#endif
1630
1631/************************************************************************
1632 * MISCELLANIOUS SUPPORT *
1633 ************************************************************************/
1634
1635/*
1636 * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
1637 *
1638 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1639 *
1640 * Note that key1 and key2 are treated differently. key1 is allowed to
1641 * wildcard some of its fields by setting them to 0, while key2 is expected
1642 * to be in an on-disk form (no wildcards).
1643 */
1644int
1645hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1646{
1647#if 0
1648 kprintf("compare obj_id %016llx %016llx\n",
1649 key1->obj_id, key2->obj_id);
1650 kprintf("compare rec_type %04x %04x\n",
1651 key1->rec_type, key2->rec_type);
1652 kprintf("compare key %016llx %016llx\n",
1653 key1->key, key2->key);
1654#endif
1655
1656 /*
1657 * A key1->obj_id of 0 matches any object id
1658 */
1659 if (key1->obj_id) {
1660 if (key1->obj_id < key2->obj_id)
1661 return(-4);
1662 if (key1->obj_id > key2->obj_id)
1663 return(4);
1664 }
1665
1666 /*
1667 * A key1->rec_type of 0 matches any record type.
1668 */
1669 if (key1->rec_type) {
1670 if (key1->rec_type < key2->rec_type)
1671 return(-3);
1672 if (key1->rec_type > key2->rec_type)
1673 return(3);
1674 }
1675
1676 /*
1677 * There is no special case for key. 0 means 0.
1678 */
1679 if (key1->key < key2->key)
1680 return(-2);
1681 if (key1->key > key2->key)
1682 return(2);
1683
1684 /*
1685 * This test has a number of special cases. create_tid in key1 is
1686 * the as-of transction id, and delete_tid in key1 is NOT USED.
1687 *
1688 * A key1->create_tid of 0 matches any record regardles of when
1689 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1690 * used to search for the most current state of the object.
1691 *
1692 * key2->create_tid is a HAMMER record and will never be
1693 * 0. key2->delete_tid is the deletion transaction id or 0 if
1694 * the record has not yet been deleted.
1695 */
1696 if (key1->create_tid) {
1697 if (key1->create_tid < key2->create_tid)
1698 return(-1);
1699 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
1700 return(1);
1701 }
1702
1703 return(0);
1704}
1705
1706/*
1707 * Compare the element against the cursor's beginning and ending keys
1708 */
1709int
1710hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2)
1711{
1712 /*
1713 * A cursor->key_beg.obj_id of 0 matches any object id
1714 */
1715 if (cursor->key_beg.obj_id) {
1716 if (cursor->key_end.obj_id < key2->obj_id)
1717 return(-4);
1718 if (cursor->key_beg.obj_id > key2->obj_id)
1719 return(4);
1720 }
1721
1722 /*
1723 * A cursor->key_beg.rec_type of 0 matches any record type.
1724 */
1725 if (cursor->key_beg.rec_type) {
1726 if (cursor->key_end.rec_type < key2->rec_type)
1727 return(-3);
1728 if (cursor->key_beg.rec_type > key2->rec_type)
1729 return(3);
1730 }
1731
1732 /*
1733 * There is no special case for key. 0 means 0.
1734 */
1735 if (cursor->key_end.key < key2->key)
1736 return(-2);
1737 if (cursor->key_beg.key > key2->key)
1738 return(2);
1739
1740 /*
1741 * This test has a number of special cases. create_tid in key1 is
1742 * the as-of transction id, and delete_tid in key1 is NOT USED.
1743 *
1744 * A key1->create_tid of 0 matches any record regardles of when
1745 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1746 * used to search for the most current state of the object.
1747 *
1748 * key2->create_tid is a HAMMER record and will never be
1749 * 0. key2->delete_tid is the deletion transaction id or 0 if
1750 * the record has not yet been deleted.
1751 *
1752 * NOTE: only key_beg.create_tid is used for create_tid, we can only
1753 * do as-of scans at the moment.
1754 */
1755 if (cursor->key_beg.create_tid) {
1756 if (cursor->key_beg.create_tid < key2->create_tid)
1757 return(-1);
1758 if (key2->delete_tid && cursor->key_beg.create_tid >= key2->delete_tid)
1759 return(1);
1760 }
1761
1762 return(0);
1763}
1764
1765/*
1766 * Create a separator half way inbetween key1 and key2. For fields just
1767 * one unit apart, the separator will match key2.
1768 *
1769 * The handling of delete_tid is a little confusing. It is only possible
1770 * to have one record in the B-Tree where all fields match except delete_tid.
1771 * This means, worse case, two adjacent elements may have a create_tid that
1772 * is one-apart and cause the separator to choose the right-hand element's
1773 * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x).
1774 *
1775 * So all we have to do is set delete_tid to the right-hand element to
1776 * guarentee that the separator is properly between the two elements.
1777 */
1778#define MAKE_SEPARATOR(key1, key2, dest, field) \
1779 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1780
1781static void
1782hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1783 hammer_base_elm_t dest)
1784{
1785 bzero(dest, sizeof(*dest));
1786 MAKE_SEPARATOR(key1, key2, dest, obj_id);
1787 MAKE_SEPARATOR(key1, key2, dest, rec_type);
1788 MAKE_SEPARATOR(key1, key2, dest, key);
1789 MAKE_SEPARATOR(key1, key2, dest, create_tid);
1790 dest->delete_tid = key2->delete_tid;
1791}
1792
1793#undef MAKE_SEPARATOR
1794
1795/*
1796 * Return whether a generic internal or leaf node is full
1797 */
1798static int
1799btree_node_is_full(hammer_node_ondisk_t node)
1800{
1801 switch(node->type) {
1802 case HAMMER_BTREE_TYPE_INTERNAL:
1803 if (node->count == HAMMER_BTREE_INT_ELMS)
1804 return(1);
1805 break;
1806 case HAMMER_BTREE_TYPE_LEAF:
1807 if (node->count == HAMMER_BTREE_LEAF_ELMS)
1808 return(1);
1809 break;
1810 default:
1811 panic("illegal btree subtype");
1812 }
1813 return(0);
1814}
1815
1816#if 0
/*
 * Return the maximum number of elements a node of the given type can
 * hold.  Panics on an unknown type.  (Currently compiled out along with
 * its only callers, btree_rebalance()/btree_collapse().)
 */
static int
btree_max_elements(u_int8_t type)
{
	if (type == HAMMER_BTREE_TYPE_LEAF)
		return(HAMMER_BTREE_LEAF_ELMS);
	if (type == HAMMER_BTREE_TYPE_INTERNAL)
		return(HAMMER_BTREE_INT_ELMS);
	panic("btree_max_elements: bad type %d\n", type);
	/* NOT REACHED */
}
1826#endif
1827
1828void
1829hammer_print_btree_node(hammer_node_ondisk_t ondisk)
1830{
1831 hammer_btree_elm_t elm;
1832 int i;
1833
1834 kprintf("node %p count=%d parent=%d type=%c\n",
1835 ondisk, ondisk->count, ondisk->parent, ondisk->type);
1836
1837 /*
1838 * Dump both boundary elements if an internal node
1839 */
1840 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1841 for (i = 0; i <= ondisk->count; ++i) {
1842 elm = &ondisk->elms[i];
1843 hammer_print_btree_elm(elm, ondisk->type, i);
1844 }
1845 } else {
1846 for (i = 0; i < ondisk->count; ++i) {
1847 elm = &ondisk->elms[i];
1848 hammer_print_btree_elm(elm, ondisk->type, i);
1849 }
1850 }
1851}
1852
/*
 * Dump a single B-Tree element via kprintf for debugging.  The base
 * fields are always printed; the union portion is interpreted according
 * to the node type ('type'), and internal elements additionally
 * distinguish inter-cluster references (rec_offset != 0) from ordinary
 * subtree pointers.  'i' is the element's index within its node.
 */
void
hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
{
	kprintf(" %2d", i);
	kprintf("\tobjid = %016llx\n", elm->base.obj_id);
	kprintf("\tkey = %016llx\n", elm->base.key);
	kprintf("\tcreate_tid = %016llx\n", elm->base.create_tid);
	kprintf("\tdelete_tid = %016llx\n", elm->base.delete_tid);
	kprintf("\trec_type = %04x\n", elm->base.rec_type);
	kprintf("\tobj_type = %02x\n", elm->base.obj_type);
	kprintf("\tsubtree_type = %02x\n", elm->subtree_type);

	if (type == HAMMER_BTREE_TYPE_INTERNAL) {
		/* non-zero rec_offset marks an inter-cluster reference */
		if (elm->internal.rec_offset) {
			kprintf("\tcluster_rec = %08x\n",
				elm->internal.rec_offset);
			kprintf("\tcluster_id = %08x\n",
				elm->internal.subtree_cluid);
			kprintf("\tvolno = %08x\n",
				elm->internal.subtree_volno);
		} else {
			kprintf("\tsubtree_off = %08x\n",
				elm->internal.subtree_offset);
		}
		kprintf("\tsubtree_count= %d\n", elm->internal.subtree_count);
	} else {
		kprintf("\trec_offset = %08x\n", elm->leaf.rec_offset);
		kprintf("\tdata_offset = %08x\n", elm->leaf.data_offset);
		kprintf("\tdata_len = %08x\n", elm->leaf.data_len);
		kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc);
	}
}