Catch vkernel divide-by-0 traps a bit earlier so they are reported properly.
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
CommitLineData
427e5fc6
MD
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
a89aec1b 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.5 2007/11/20 07:16:28 dillon Exp $
427e5fc6
MD
35 */
36
37/*
8cd0a023 38 * HAMMER B-Tree index
427e5fc6
MD
39 *
40 * HAMMER implements a modified B+Tree. In documentation this will
8cd0a023 41 * simply be refered to as the HAMMER B-Tree. Basically a B-Tree
427e5fc6
MD
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
8cd0a023 45 * otherwords, we have boundary elements at the two ends of a B-Tree node
427e5fc6
MD
46 * instead of sub-tree pointers.
47 *
8cd0a023 48 * A B-Tree internal node looks like this:
427e5fc6
MD
49 *
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
52 *
8cd0a023 53 * A B-Tree leaf node basically looks like this:
427e5fc6
MD
54 *
55 * L L L L L L L L <-- leaf elemenets
56 *
8cd0a023
MD
57 * The radix for an internal node is 1 less then a leaf but we get a
58 * number of significant benefits for our troubles.
427e5fc6 59 *
8cd0a023
MD
60 * The big benefit to using a B-Tree containing boundary information
61 * is that it is possible to cache pointers into the middle of the tree
62 * and not have to start searches, insertions, OR deletions at the root
63 * node. In particular, searches are able to progress in a definitive
64 * direction from any point in the tree without revisting nodes. This
65 * greatly improves the efficiency of many operations, most especially
66 * record appends.
427e5fc6 67 *
8cd0a023
MD
68 * B-Trees also make the stacking of trees fairly straightforward.
69 *
70 * INTER-CLUSTER ELEMENTS: An element of an internal node may reference
71 * the root of another cluster rather then a node in the current cluster.
72 * This is known as an inter-cluster references. Only B-Tree searches
73 * will cross cluster boundaries. The rebalancing and collapse code does
74 * not attempt to move children between clusters. A major effect of this
75 * is that we have to relax minimum element count requirements and allow
76 * trees to become somewhat unabalanced.
77 *
78 * INSERTIONS AND DELETIONS: When inserting we split full nodes on our
79 * way down as an optimization. I originally experimented with rebalancing
80 * nodes on the way down for deletions but it created a huge mess due to
81 * the way inter-cluster linkages work. Instead, now I simply allow
82 * the tree to become unbalanced and allow leaf nodes to become empty.
83 * The delete code will try to clean things up from the bottom-up but
84 * will stop if related elements are not in-core or if it cannot get a node
85 * lock.
86 */
87#include "hammer.h"
88#include <sys/buf.h>
89#include <sys/buf2.h>
66325755 90
8cd0a023
MD
91static int btree_search(hammer_cursor_t cursor, int flags);
92static int btree_split_internal(hammer_cursor_t cursor);
93static int btree_split_leaf(hammer_cursor_t cursor);
94static int btree_remove(hammer_node_t node, int index);
95#if 0
96static int btree_rebalance(hammer_cursor_t cursor);
97static int btree_collapse(hammer_cursor_t cursor);
98#endif
99static int btree_node_is_full(hammer_node_ondisk_t node);
100static void hammer_make_separator(hammer_base_elm_t key1,
101 hammer_base_elm_t key2, hammer_base_elm_t dest);
66325755
MD
102
103/*
8cd0a023
MD
104 * Iterate records after a search. The cursor is iterated forwards past
105 * the current record until a record matching the key-range requirements
106 * is found. ENOENT is returned if the iteration goes past the ending
107 * key.
66325755 108 *
8cd0a023
MD
109 * key_beg/key_end is an INCLUSVE range. i.e. if you are scanning to load
110 * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
111 * offset of 4095.
66325755 112 *
8cd0a023
MD
113 * cursor->key_beg may or may not be modified by this function during
114 * the iteration.
66325755
MD
115 */
116int
8cd0a023 117hammer_btree_iterate(hammer_cursor_t cursor)
66325755 118{
8cd0a023
MD
119 hammer_node_ondisk_t node;
120 hammer_btree_elm_t elm;
66325755
MD
121 int64_t save_key;
122 int error;
123 int r;
124 int s;
125
126 /*
8cd0a023 127 * Skip past the current record
66325755 128 */
8cd0a023 129 node = cursor->node->ondisk;
a89aec1b
MD
130 if (node == NULL)
131 return(ENOENT);
8cd0a023 132 if (cursor->index < node->count)
66325755
MD
133 ++cursor->index;
134
8cd0a023
MD
135 /*
136 * Loop until an element is found or we are done.
137 */
66325755
MD
138 for (;;) {
139 /*
8cd0a023
MD
140 * We iterate up the tree and then index over one element
141 * while we are at the last element in the current node.
142 *
143 * NOTE: This can pop us up to another cluster.
66325755 144 *
8cd0a023
MD
145 * If we are at the root of the root cluster, cursor_up
146 * returns ENOENT.
147 *
148 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
149 * when told to re-search for the cluster tag.
66325755
MD
150 *
151 * XXX this could be optimized by storing the information in
152 * the parent reference.
153 */
8cd0a023
MD
154 if (cursor->index == node->count) {
155 error = hammer_cursor_up(cursor);
156 if (error)
157 break;
158 node = cursor->node->ondisk;
159 KKASSERT(cursor->index != node->count);
160 ++cursor->index;
161 continue;
66325755
MD
162 }
163
164 /*
165 * Iterate down the tree while we are at an internal node.
166 * Nodes cannot be empty, assert the case because if one is
167 * we will wind up in an infinite loop.
168 *
169 * We can avoid iterating through large swaths of transaction
170 * id space if the left and right separators are the same
171 * except for their transaction spaces. We can then skip
172 * the node if the left and right transaction spaces are the
173 * same sign. This directly optimized accesses to files with
8cd0a023
MD
174 * HUGE transactional histories, such as database files,
175 * allowing us to avoid having to iterate through the entire
176 * history.
66325755 177 */
8cd0a023 178 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
a89aec1b 179 KKASSERT(node->count != 0);
8cd0a023 180 elm = &node->elms[cursor->index];
66325755
MD
181 if (elm[0].base.obj_id == elm[1].base.obj_id &&
182 elm[0].base.rec_type == elm[1].base.rec_type &&
183 elm[0].base.key == elm[1].base.key) {
8cd0a023
MD
184 /*
185 * Left side transaction space
186 */
187 save_key = cursor->key_beg.key;
188 cursor->key_beg.key = elm[0].base.key;
189 r = hammer_btree_cmp(&cursor->key_beg,
190 &elm[0].base);
191 cursor->key_beg.key = save_key;
192
193 /*
194 * Right side transaction space
195 */
196 save_key = cursor->key_end.key;
197 cursor->key_end.key = elm[1].base.key;
198 s = hammer_btree_cmp(&cursor->key_end,
199 &elm[1].base);
200 cursor->key_end.key = save_key;
201
202 /*
203 * If our range is entirely on one side or
204 * the other side we can skip the sub-tree.
205 */
66325755
MD
206 if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
207 ++cursor->index;
8cd0a023 208 continue;
66325755
MD
209 }
210 }
8cd0a023 211 error = hammer_cursor_down(cursor);
66325755 212 if (error)
8cd0a023
MD
213 break;
214 KKASSERT(cursor->index == 0);
8cd0a023
MD
215 node = cursor->node->ondisk;
216 continue;
66325755
MD
217 }
218
219 /*
8cd0a023
MD
220 * We are at a leaf.
221 *
222 * Determine if the record at the cursor has gone beyond the
223 * end of our range. Remember that our key range is inclusive.
224 *
225 * When iterating we may have to 'pick out' records matching
226 * our transaction requirements. A comparison return of
227 * +1 or -1 indicates a transactional record that is too
228 * old or too new but does not terminate the search.
66325755 229 */
8cd0a023
MD
230 elm = &node->elms[cursor->index];
231 r = hammer_btree_cmp(&cursor->key_end, &elm->base);
232 if (r == -1 || r == 1) {
233 ++cursor->index;
234 continue;
66325755
MD
235 }
236
237 /*
8cd0a023
MD
238 * The search ends if the element goes beyond our key_end
239 * (after checking transactional return values above).
240 * Otherwise we have a successful match.
66325755 241 */
8cd0a023
MD
242 error = (r < 0) ? ENOENT : 0;
243 break;
427e5fc6 244 }
66325755 245 return(error);
427e5fc6
MD
246}
247
248/*
8cd0a023
MD
249 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
250 * could not be found, and a fatal error otherwise.
251 *
252 * The cursor is suitably positioned for a deletion on success, and suitably
253 * positioned for an insertion on ENOENT.
427e5fc6 254 *
8cd0a023
MD
255 * The cursor may begin anywhere, the search will traverse clusters in
256 * either direction to locate the requested element.
427e5fc6
MD
257 */
258int
8cd0a023 259hammer_btree_lookup(hammer_cursor_t cursor)
427e5fc6 260{
66325755
MD
261 int error;
262
8cd0a023
MD
263 error = btree_search(cursor, 0);
264 if (error == 0 && cursor->flags)
265 error = hammer_btree_extract(cursor, cursor->flags);
66325755
MD
266 return(error);
267}
268
8cd0a023
MD
269/*
270 * Extract the record and/or data associated with the cursor's current
271 * position. Any prior record or data stored in the cursor is replaced.
272 * The cursor must be positioned at a leaf node.
273 *
274 * NOTE: Only records can be extracted from internal B-Tree nodes, and
275 * only for inter-cluster references. At the moment we only support
276 * extractions from leaf nodes.
277 */
66325755 278int
8cd0a023 279hammer_btree_extract(hammer_cursor_t cursor, int flags)
66325755 280{
8cd0a023
MD
281 hammer_node_ondisk_t node;
282 hammer_btree_elm_t elm;
283 hammer_cluster_t cluster;
427e5fc6
MD
284 int32_t cloff;
285 int error;
427e5fc6 286
8cd0a023 287 /*
427e5fc6 288 * A cluster record type has no data reference, the information
8cd0a023 289 * is stored directly in the record and B-Tree element.
427e5fc6
MD
290 *
291 * The case where the data reference resolves to the same buffer
292 * as the record reference must be handled.
293 */
8cd0a023
MD
294 node = cursor->node->ondisk;
295 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
296 elm = &node->elms[cursor->index];
297 cluster = cursor->node->cluster;
66325755
MD
298 error = 0;
299
a89aec1b 300 if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
8cd0a023
MD
301 cloff = elm->leaf.rec_offset;
302 cursor->record = hammer_bread(cluster, cloff,
303 HAMMER_FSBUF_RECORDS, &error,
304 &cursor->record_buffer);
427e5fc6
MD
305 } else {
306 cloff = 0;
307 }
a89aec1b 308 if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
8cd0a023
MD
309 if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
310 /*
311 * Data in different buffer than record
312 */
313 cursor->data = hammer_bread(cluster,
314 elm->leaf.data_offset,
315 HAMMER_FSBUF_DATA, &error,
316 &cursor->data_buffer);
427e5fc6 317 } else {
8cd0a023
MD
318 /*
319 * Data in same buffer as record. Note that we
320 * leave any existing data_buffer intact, even
321 * though we don't use it in this case, in case
322 * other records extracted during an iteration
323 * go back to it.
324 */
325 cursor->data = (void *)
326 ((char *)cursor->record_buffer->ondisk +
327 (elm->leaf.data_offset & HAMMER_BUFMASK));
427e5fc6
MD
328 }
329 }
330 return(error);
331}
332
333
334/*
8cd0a023
MD
335 * Insert a leaf element into the B-Tree at the current cursor position.
336 * The cursor is positioned such that the element at and beyond the cursor
337 * are shifted to make room for the new record.
338 *
a89aec1b 339 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
8cd0a023
MD
340 * flag set and that call must return ENOENT before this function can be
341 * called.
342 *
343 * ENOSPC is returned if there is no room to insert a new record.
427e5fc6
MD
344 */
345int
8cd0a023 346hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
427e5fc6 347{
8cd0a023
MD
348 hammer_node_ondisk_t parent;
349 hammer_node_ondisk_t node;
427e5fc6
MD
350 int i;
351
66325755
MD
352#if 0
353 /* HANDLED BY CALLER */
427e5fc6
MD
354 /*
355 * Issue a search to get our cursor at the right place. The search
356 * will get us to a leaf node.
357 *
358 * The search also does some setup for our insert, so there is always
359 * room in the leaf.
360 */
a89aec1b 361 error = btree_search(cursor, HAMMER_CURSOR_INSERT);
427e5fc6
MD
362 if (error != ENOENT) {
363 if (error == 0)
364 error = EEXIST;
365 return (error);
366 }
66325755 367#endif
427e5fc6
MD
368
369 /*
370 * Insert the element at the leaf node and update the count in the
371 * parent. It is possible for parent to be NULL, indicating that
8cd0a023 372 * the root of the B-Tree in the cluster is a leaf. It is also
427e5fc6
MD
373 * possible for the leaf to be empty.
374 *
375 * Remember that the right-hand boundary is not included in the
376 * count.
377 */
8cd0a023 378 node = cursor->node->ondisk;
427e5fc6 379 i = cursor->index;
8cd0a023
MD
380 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
381 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
382 if (i != node->count) {
383 bcopy(&node->elms[i], &node->elms[i+1],
384 (node->count - i) * sizeof(*elm));
385 }
386 node->elms[i] = *elm;
387 ++node->count;
388 hammer_modify_node(cursor->node);
427e5fc6 389
8cd0a023 390 if ((parent = cursor->parent->ondisk) != NULL) {
427e5fc6 391 i = cursor->parent_index;
8cd0a023
MD
392 ++parent->elms[i].internal.subtree_count;
393 KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
394 hammer_modify_node(cursor->parent);
427e5fc6 395 }
427e5fc6
MD
396 return(0);
397}
398
399/*
8cd0a023
MD
400 * Delete a record from the B-Tree's at the current cursor position.
401 * The cursor is positioned such that the current element is the one
402 * to be deleted.
403 *
a89aec1b 404 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_DELETE
8cd0a023
MD
405 * flag set and that call must return 0 before this function can be
406 * called.
407 *
408 * It is possible that we will be asked to delete the last element in a
409 * leaf. This case only occurs if the downward search was unable to
410 * rebalance us, which in turn can occur if our parent has inter-cluster
411 * elements. So the 0-element case for a leaf is allowed.
427e5fc6
MD
412 */
413int
8cd0a023 414hammer_btree_delete(hammer_cursor_t cursor)
427e5fc6 415{
8cd0a023
MD
416 hammer_node_ondisk_t ondisk;
417 hammer_node_t node;
418 hammer_node_t parent;
419 hammer_btree_elm_t elm;
420 int error;
427e5fc6
MD
421 int i;
422
66325755
MD
423#if 0
424 /* HANDLED BY CALLER */
427e5fc6
MD
425 /*
426 * Locate the leaf element to delete. The search is also responsible
427 * for doing some of the rebalancing work on its way down.
428 */
a89aec1b 429 error = btree_search(cursor, HAMMER_CURSOR_DELETE);
427e5fc6
MD
430 if (error)
431 return (error);
66325755 432#endif
427e5fc6
MD
433
434 /*
8cd0a023 435 * Delete the element from the leaf node.
427e5fc6 436 *
8cd0a023 437 * Remember that leaf nodes do not have boundaries.
427e5fc6 438 */
8cd0a023
MD
439 node = cursor->node;
440 ondisk = node->ondisk;
427e5fc6
MD
441 i = cursor->index;
442
8cd0a023
MD
443 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
444 if (i + 1 != ondisk->count) {
445 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
446 (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
447 }
448 --ondisk->count;
449 if (cursor->parent != NULL) {
427e5fc6
MD
450 /*
451 * Adjust parent's notion of the leaf's count. subtree_count
8cd0a023 452 * is only approximate, it is allowed to be too small but
427e5fc6
MD
453 * never allowed to be too large. Make sure we don't drop
454 * the count below 0.
455 */
8cd0a023
MD
456 parent = cursor->parent;
457 elm = &parent->ondisk->elms[cursor->parent_index];
458 if (elm->internal.subtree_count)
459 --elm->internal.subtree_count;
460 KKASSERT(elm->internal.subtree_count <= ondisk->count);
461 hammer_modify_node(parent);
427e5fc6 462 }
427e5fc6 463
8cd0a023
MD
464 /*
465 * If the leaf is empty try to remove the subtree reference
466 * in at (parent, parent_index). This will unbalance the
467 * tree.
468 *
469 * Note that internal nodes must have at least one element
470 * so their boundary information is properly laid out. If
471 * we would cause our parent to become empty we try to
472 * recurse up the tree, but if that doesn't work we just
473 * leave the tree with an empty leaf.
474 */
475 if (ondisk->count == 0) {
476 error = btree_remove(cursor->parent, cursor->parent_index);
477 if (error == 0) {
478 hammer_free_btree(node->cluster, node->node_offset);
479 } else if (error == EAGAIN) {
480 hammer_modify_node(node);
481 error = 0;
482 } /* else a real error occured XXX */
483 } else {
484 hammer_modify_node(node);
485 error = 0;
486 }
487 return(error);
488}
427e5fc6
MD
489
490/*
8cd0a023
MD
491 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE
492 *
493 * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
494 *
495 * The search begins at the current node and will instantiate a NULL
496 * parent if necessary and if not at the root of the cluster. On return
497 * parent will be non-NULL unless the cursor is sitting at a root-leaf.
498 *
499 * The search code may be forced to iterate up the tree if the conditions
500 * required for an insertion or deletion are not met. This does not occur
501 * very often.
502 *
503 * INSERTIONS: The search will split full nodes and leaves on its way down
504 * and guarentee that the leaf it ends up on is not full.
427e5fc6 505 *
8cd0a023 506 * DELETIONS: The search will rebalance the tree on its way down.
427e5fc6 507 */
8cd0a023 508static
427e5fc6 509int
8cd0a023 510btree_search(hammer_cursor_t cursor, int flags)
427e5fc6 511{
8cd0a023
MD
512 hammer_node_ondisk_t node;
513 hammer_cluster_t cluster;
514 int error;
515 int i;
516 int r;
517
518 flags |= cursor->flags;
519
520 /*
521 * Move our cursor up the tree until we find a node whos range covers
522 * the key we are trying to locate. This may move us between
523 * clusters.
524 *
525 * The left bound is inclusive, the right bound is non-inclusive.
526 * It is ok to cursor up too far so when cursoring across a cluster
527 * boundary.
528 *
529 * First see if we can skip the whole cluster. hammer_cursor_up()
530 * handles both cases but this way we don't check the cluster
531 * bounds when going up the tree within a cluster.
532 */
533 cluster = cursor->node->cluster;
534 while (
535 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
536 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
537 error = hammer_cursor_toroot(cursor);
538 if (error)
539 goto done;
540 error = hammer_cursor_up(cursor);
541 if (error)
542 goto done;
543 cluster = cursor->node->cluster;
427e5fc6
MD
544 }
545
546 /*
8cd0a023
MD
547 * Deal with normal cursoring within a cluster. The right bound
548 * is non-inclusive. That is, the bounds form a separator.
427e5fc6 549 */
8cd0a023
MD
550 while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
551 hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
552 error = hammer_cursor_up(cursor);
553 if (error)
554 goto done;
427e5fc6 555 }
427e5fc6 556
8cd0a023
MD
557 /*
558 * We better have ended up with a node somewhere, and our second
559 * while loop had better not have traversed up a cluster.
560 */
561 KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);
562
563 /*
564 * If we are inserting we can't start at a full node if the parent
565 * is also full (because there is no way to split the node),
566 * continue running up the tree until we hit the root of the
567 * root cluster or until the requirement is satisfied.
568 *
569 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
570 *
571 * XXX as an optimization it should be possible to unbalance the tree
572 * and stop at the root of the current cluster.
573 */
a89aec1b 574 while (flags & HAMMER_CURSOR_INSERT) {
8cd0a023
MD
575 if (btree_node_is_full(cursor->node->ondisk) == 0)
576 break;
577 if (cursor->parent == NULL)
578 break;
579 if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
580 break;
581 error = hammer_cursor_up(cursor);
582 /* cluster and node are now may become stale */
583 if (error)
584 goto done;
427e5fc6 585 }
8cd0a023 586 /* cluster = cursor->node->cluster; not needed until next cluster = */
427e5fc6 587
8cd0a023
MD
588#if 0
589 /*
590 * If we are deleting we can't start at an internal node with only
591 * one element unless it is root, because all of our code assumes
592 * that internal nodes will never be empty. Just do this generally
593 * for both leaf and internal nodes to get better balance.
594 *
595 * This handles the case where the cursor is sitting at a leaf and
596 * either the leaf or parent contain an insufficient number of
597 * elements.
598 *
599 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
a89aec1b
MD
600 *
601 * XXX NOTE: Iterations may not set this flag anyway.
8cd0a023 602 */
a89aec1b 603 while (flags & HAMMER_CURSOR_DELETE) {
8cd0a023
MD
604 if (cursor->node->ondisk->count > 1)
605 break;
606 if (cursor->parent == NULL)
607 break;
608 KKASSERT(cursor->node->ondisk->count != 0);
609 error = hammer_cursor_up(cursor);
610 /* cluster and node are now may become stale */
611 if (error)
612 goto done;
613 }
614#endif
427e5fc6 615
8cd0a023
MD
616/*new_cluster:*/
617 /*
618 * Push down through internal nodes to locate the requested key.
619 */
620 cluster = cursor->node->cluster;
621 node = cursor->node->ondisk;
622 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
623#if 0
427e5fc6 624 /*
8cd0a023
MD
625 * If we are a the root node and deleting, try to collapse
626 * all of the root's children into the root. This is the
627 * only point where tree depth is reduced.
a89aec1b
MD
628 *
629 * XXX NOTE: Iterations may not set this flag anyway.
427e5fc6 630 */
a89aec1b 631 if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
8cd0a023
MD
632 error = btree_collapse(cursor);
633 /* node becomes stale after call */
634 if (error)
635 goto done;
66325755 636 }
8cd0a023
MD
637 node = cursor->node->ondisk;
638#endif
639
640 /*
641 * Scan the node to find the subtree index to push down into.
642 * We go one-past, then back-up. The key should never be
643 * less then the left-hand boundary so I should never wind
644 * up 0.
645 */
646 for (i = 0; i < node->count; ++i) {
647 r = hammer_btree_cmp(&cursor->key_beg,
648 &node->elms[i].base);
649 if (r < 0)
650 break;
651 }
652 KKASSERT(i != 0);
653
654 /*
655 * The push-down index is now i - 1.
656 */
657 --i;
658 cursor->index = i;
659
660 /*
661 * Handle insertion and deletion requirements.
662 *
663 * If inserting split full nodes. The split code will
664 * adjust cursor->node and cursor->index if the current
665 * index winds up in the new node.
666 */
a89aec1b 667 if (flags & HAMMER_CURSOR_INSERT) {
8cd0a023
MD
668 if (node->count == HAMMER_BTREE_INT_ELMS) {
669 error = btree_split_internal(cursor);
670 if (error)
671 goto done;
672 /*
673 * reload stale pointers
674 */
675 i = cursor->index;
676 node = cursor->node->ondisk;
677 }
678 }
679
680#if 0
427e5fc6 681 /*
8cd0a023
MD
682 * If deleting rebalance - do not allow the child to have
683 * just one element or we will not be able to delete it.
684 *
685 * Neither internal or leaf nodes (except a root-leaf) are
686 * allowed to drop to 0 elements. (XXX - well, leaf nodes
687 * can at the moment).
688 *
689 * Our separators may have been reorganized after rebalancing,
690 * so we have to pop back up and rescan.
691 *
692 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
693 * for hysteresis?
a89aec1b
MD
694 *
695 * XXX NOTE: Iterations may not set this flag anyway.
427e5fc6 696 */
a89aec1b 697 if (flags & HAMMER_CURSOR_DELETE) {
8cd0a023
MD
698 if (node->elms[i].internal.subtree_count <= 1) {
699 error = btree_rebalance(cursor);
700 if (error)
701 goto done;
702 /* cursor->index is invalid after call */
703 goto new_cluster;
704 }
705 }
706#endif
427e5fc6
MD
707
708 /*
8cd0a023
MD
709 * Push down (push into new node, existing node becomes
710 * the parent).
427e5fc6 711 */
8cd0a023
MD
712 error = hammer_cursor_down(cursor);
713 /* node and cluster become stale */
714 if (error)
715 goto done;
716 node = cursor->node->ondisk;
717 cluster = cursor->node->cluster;
427e5fc6 718 }
427e5fc6 719
8cd0a023
MD
720 /*
721 * We are at a leaf, do a linear search of the key array.
722 * (XXX do a binary search). On success the index is set to the
723 * matching element, on failure the index is set to the insertion
724 * point.
725 *
726 * Boundaries are not stored in leaf nodes, so the index can wind
727 * up to the left of element 0 (index == 0) or past the end of
728 * the array (index == node->count).
729 */
730 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
731
732 for (i = 0; i < node->count; ++i) {
733 r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);
427e5fc6 734
427e5fc6 735 /*
8cd0a023 736 * Stop if we've flipped past key_beg
427e5fc6 737 */
8cd0a023
MD
738 if (r < 0)
739 break;
427e5fc6 740
66325755 741 /*
8cd0a023 742 * Return an exact match
66325755 743 */
8cd0a023
MD
744 if (r == 0) {
745 cursor->index = i;
746 error = 0;
747 goto done;
66325755 748 }
427e5fc6 749 }
8cd0a023
MD
750
751 /*
752 * No exact match was found, i is now at the insertion point.
753 *
754 * If inserting split a full leaf before returning. This
755 * may have the side effect of adjusting cursor->node and
756 * cursor->index.
757 */
758 cursor->index = i;
a89aec1b 759 if ((flags & HAMMER_CURSOR_INSERT) &&
8cd0a023
MD
760 node->count == HAMMER_BTREE_LEAF_ELMS) {
761 error = btree_split_leaf(cursor);
762 /* NOT USED
763 i = cursor->index;
764 node = &cursor->node->internal;
765 */
766 if (error)
767 goto done;
768 }
769 error = ENOENT;
8cd0a023 770done:
427e5fc6
MD
771 return(error);
772}
773
8cd0a023 774
427e5fc6 775/************************************************************************
8cd0a023 776 * SPLITTING AND MERGING *
427e5fc6
MD
777 ************************************************************************
778 *
779 * These routines do all the dirty work required to split and merge nodes.
780 */
781
782/*
8cd0a023 783 * Split an internal node into two nodes and move the separator at the split
427e5fc6
MD
784 * point to the parent. Note that the parent's parent's element pointing
785 * to our parent will have an incorrect subtree_count (we don't update it).
786 * It will be low, which is ok.
787 *
8cd0a023
MD
788 * (cursor->node, cursor->index) indicates the element the caller intends
789 * to push into. We will adjust node and index if that element winds
427e5fc6 790 * up in the split node.
8cd0a023
MD
791 *
792 * If we are at the root of a cluster a new root must be created with two
793 * elements, one pointing to the original root and one pointing to the
794 * newly allocated split node.
795 *
796 * NOTE! Being at the root of a cluster is different from being at the
797 * root of the root cluster. cursor->parent will not be NULL and
798 * cursor->node->ondisk.parent must be tested against 0. Theoretically
799 * we could propogate the algorithm into the parent and deal with multiple
800 * 'roots' in the cluster header, but it's easier not to.
427e5fc6
MD
801 */
802static
803int
8cd0a023 804btree_split_internal(hammer_cursor_t cursor)
427e5fc6 805{
8cd0a023
MD
806 hammer_node_ondisk_t ondisk;
807 hammer_node_t node;
808 hammer_node_t parent;
809 hammer_node_t new_node;
810 hammer_btree_elm_t elm;
811 hammer_btree_elm_t parent_elm;
427e5fc6
MD
812 int parent_index;
813 int made_root;
814 int split;
815 int error;
8cd0a023 816 const int esize = sizeof(*elm);
427e5fc6
MD
817
818 /*
819 * We are splitting but elms[split] will be promoted to the parent,
820 * leaving the right hand node with one less element. If the
821 * insertion point will be on the left-hand side adjust the split
822 * point to give the right hand side one additional node.
823 */
8cd0a023
MD
824 node = cursor->node;
825 ondisk = node->ondisk;
826 split = (ondisk->count + 1) / 2;
427e5fc6
MD
827 if (cursor->index <= split)
828 --split;
829 error = 0;
830
831 /*
8cd0a023 832 * If we are at the root of the cluster, create a new root node with
427e5fc6
MD
833 * 1 element and split normally. Avoid making major modifications
834 * until we know the whole operation will work.
8cd0a023
MD
835 *
836 * The root of the cluster is different from the root of the root
837 * cluster. Use the node's on-disk structure's parent offset to
838 * detect the case.
427e5fc6 839 */
8cd0a023
MD
840 if (ondisk->parent == 0) {
841 parent = hammer_alloc_btree(node->cluster, &error);
427e5fc6
MD
842 if (parent == NULL)
843 return(error);
8cd0a023
MD
844 hammer_lock_ex(&parent->lock);
845 ondisk = parent->ondisk;
846 ondisk->count = 1;
847 ondisk->parent = 0;
848 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
849 ondisk->elms[0].base = node->cluster->clu_btree_beg;
850 ondisk->elms[0].internal.subtree_type = node->ondisk->type;
851 ondisk->elms[0].internal.subtree_offset = node->node_offset;
852 ondisk->elms[1].base = node->cluster->clu_btree_end;
427e5fc6 853 made_root = 1;
8cd0a023 854 parent_index = 0; /* index of current node in parent */
427e5fc6
MD
855 } else {
856 made_root = 0;
8cd0a023
MD
857 parent = cursor->parent;
858 parent_index = cursor->parent_index;
427e5fc6 859 }
427e5fc6
MD
860
861 /*
862 * Split node into new_node at the split point.
863 *
864 * B O O O P N N B <-- P = node->elms[split]
865 * 0 1 2 3 4 5 6 <-- subtree indices
866 *
867 * x x P x x
868 * s S S s
869 * / \
870 * B O O O B B N N B <--- inner boundary points are 'P'
871 * 0 1 2 3 4 5 6
872 *
873 */
8cd0a023 874 new_node = hammer_alloc_btree(node->cluster, &error);
427e5fc6 875 if (new_node == NULL) {
8cd0a023
MD
876 if (made_root) {
877 hammer_unlock(&parent->lock);
878 hammer_free_btree(node->cluster, parent->node_offset);
879 hammer_rel_node(parent);
880 }
427e5fc6
MD
881 return(error);
882 }
8cd0a023 883 hammer_lock_ex(&new_node->lock);
427e5fc6
MD
884
885 /*
8cd0a023 886 * Create the new node. P becomes the left-hand boundary in the
427e5fc6
MD
887 * new node. Copy the right-hand boundary as well.
888 *
889 * elm is the new separator.
890 */
8cd0a023
MD
891 ondisk = node->ondisk;
892 elm = &ondisk->elms[split];
893 bcopy(elm, &new_node->ondisk->elms[0],
894 (ondisk->count - split + 1) * esize);
895 new_node->ondisk->count = ondisk->count - split;
896 new_node->ondisk->parent = parent->node_offset;
897 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
898 KKASSERT(ondisk->type == new_node->ondisk->type);
427e5fc6
MD
899
900 /*
901 * Cleanup the original node. P becomes the new boundary, its
902 * subtree_offset was moved to the new node. If we had created
903 * a new root its parent pointer may have changed.
904 */
8cd0a023 905 elm->internal.subtree_offset = 0;
427e5fc6
MD
906
907 /*
908 * Insert the separator into the parent, fixup the parent's
909 * reference to the original node, and reference the new node.
910 * The separator is P.
911 *
912 * Remember that base.count does not include the right-hand boundary.
913 */
8cd0a023
MD
914 ondisk = parent->ondisk;
915 ondisk->elms[parent_index].internal.subtree_count = split;
916 parent_elm = &ondisk->elms[parent_index+1];
427e5fc6 917 bcopy(parent_elm, parent_elm + 1,
8cd0a023
MD
918 (ondisk->count - parent_index) * esize);
919 parent_elm->internal.base = elm->base; /* separator P */
920 parent_elm->internal.subtree_offset = new_node->node_offset;
921 parent_elm->internal.subtree_count = new_node->ondisk->count;
427e5fc6
MD
922
923 /*
924 * The cluster's root pointer may have to be updated.
925 */
926 if (made_root) {
8cd0a023
MD
927 node->cluster->ondisk->clu_btree_root = parent->node_offset;
928 hammer_modify_cluster(node->cluster);
929 node->ondisk->parent = parent->node_offset;
930 if (cursor->parent) {
931 hammer_unlock(&cursor->parent->lock);
932 hammer_rel_node(cursor->parent);
933 }
934 cursor->parent = parent; /* lock'd and ref'd */
427e5fc6
MD
935 }
936
8cd0a023
MD
937 hammer_modify_node(node);
938 hammer_modify_node(new_node);
939 hammer_modify_node(parent);
940
427e5fc6
MD
941 /*
942 * Ok, now adjust the cursor depending on which element the original
943 * index was pointing at. If we are >= the split point the push node
944 * is now in the new node.
945 *
946 * NOTE: If we are at the split point itself we cannot stay with the
947 * original node because the push index will point at the right-hand
948 * boundary, which is illegal.
8cd0a023
MD
949 *
950 * NOTE: The cursor's parent or parent_index must be adjusted for
951 * the case where a new parent (new root) was created, and the case
952 * where the cursor is now pointing at the split node.
427e5fc6
MD
953 */
954 if (cursor->index >= split) {
8cd0a023 955 cursor->parent_index = parent_index + 1;
427e5fc6 956 cursor->index -= split;
8cd0a023
MD
957 hammer_unlock(&cursor->node->lock);
958 hammer_rel_node(cursor->node);
959 cursor->node = new_node; /* locked and ref'd */
960 } else {
961 cursor->parent_index = parent_index;
962 hammer_unlock(&new_node->lock);
963 hammer_rel_node(new_node);
427e5fc6 964 }
427e5fc6
MD
965 return (0);
966}
967
968/*
969 * Same as the above, but splits a full leaf node.
970 */
971static
972int
8cd0a023 973btree_split_leaf(hammer_cursor_t cursor)
427e5fc6 974{
8cd0a023
MD
975 hammer_node_ondisk_t ondisk;
976 hammer_node_t parent;
977 hammer_node_t leaf;
978 hammer_node_t new_leaf;
979 hammer_btree_elm_t elm;
980 hammer_btree_elm_t parent_elm;
427e5fc6
MD
981 int parent_index;
982 int made_root;
983 int split;
984 int error;
8cd0a023 985 const size_t esize = sizeof(*elm);
427e5fc6
MD
986
987 /*
8cd0a023
MD
988 * Calculate the split point. If the insertion point will be on
989 * the left-hand side adjust the split point to give the right
990 * hand side one additional node.
427e5fc6 991 */
8cd0a023
MD
992 leaf = cursor->node;
993 ondisk = leaf->ondisk;
994 split = (ondisk->count + 1) / 2;
427e5fc6
MD
995 if (cursor->index <= split)
996 --split;
997 error = 0;
998
999 /*
1000 * If we are at the root of the tree, create a new root node with
1001 * 1 element and split normally. Avoid making major modifications
1002 * until we know the whole operation will work.
1003 */
8cd0a023
MD
1004 if (ondisk->parent == 0) {
1005 parent = hammer_alloc_btree(leaf->cluster, &error);
427e5fc6
MD
1006 if (parent == NULL)
1007 return(error);
8cd0a023
MD
1008 hammer_lock_ex(&parent->lock);
1009 ondisk = parent->ondisk;
1010 ondisk->count = 1;
1011 ondisk->parent = 0;
1012 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1013 ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1014 ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1015 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1016 ondisk->elms[1].base = leaf->cluster->clu_btree_end;
427e5fc6 1017 made_root = 1;
8cd0a023 1018 parent_index = 0; /* insertion point in parent */
427e5fc6
MD
1019 } else {
1020 made_root = 0;
8cd0a023
MD
1021 parent = cursor->parent;
1022 parent_index = cursor->parent_index;
427e5fc6 1023 }
427e5fc6
MD
1024
1025 /*
1026 * Split leaf into new_leaf at the split point. Select a separator
1027 * value in-between the two leafs but with a bent towards the right
1028 * leaf since comparisons use an 'elm >= separator' inequality.
1029 *
1030 * L L L L L L L L
1031 *
1032 * x x P x x
1033 * s S S s
1034 * / \
1035 * L L L L L L L L
1036 */
8cd0a023 1037 new_leaf = hammer_alloc_btree(leaf->cluster, &error);
427e5fc6 1038 if (new_leaf == NULL) {
8cd0a023
MD
1039 if (made_root) {
1040 hammer_unlock(&parent->lock);
1041 hammer_free_btree(leaf->cluster, parent->node_offset);
1042 hammer_rel_node(parent);
1043 }
427e5fc6
MD
1044 return(error);
1045 }
8cd0a023 1046 hammer_lock_ex(&new_leaf->lock);
427e5fc6
MD
1047
1048 /*
1049 * Create the new node. P become the left-hand boundary in the
1050 * new node. Copy the right-hand boundary as well.
1051 */
8cd0a023
MD
1052 ondisk = leaf->ondisk;
1053 elm = &ondisk->elms[split];
1054 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1055 new_leaf->ondisk->count = ondisk->count - split;
1056 new_leaf->ondisk->parent = parent->node_offset;
1057 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1058 KKASSERT(ondisk->type == new_leaf->ondisk->type);
427e5fc6
MD
1059
1060 /*
8cd0a023
MD
1061 * Cleanup the original node. Because this is a leaf node and
1062 * leaf nodes do not have a right-hand boundary, there
1063 * aren't any special edge cases to clean up.
427e5fc6 1064 */
8cd0a023 1065 /* nothing to do */
427e5fc6
MD
1066
1067 /*
1068 * Insert the separator into the parent, fixup the parent's
1069 * reference to the original node, and reference the new node.
1070 * The separator is P.
1071 *
1072 * Remember that base.count does not include the right-hand boundary.
1073 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1074 */
8cd0a023
MD
1075 ondisk = parent->ondisk;
1076 ondisk->elms[parent_index].internal.subtree_count = split;
1077 parent_elm = &ondisk->elms[parent_index+1];
1078 if (parent_index + 1 != ondisk->count) {
427e5fc6 1079 bcopy(parent_elm, parent_elm + 1,
8cd0a023 1080 (ondisk->count - parent_index - 1) * esize);
427e5fc6
MD
1081 }
1082 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
8cd0a023
MD
1083 parent_elm->internal.subtree_offset = new_leaf->node_offset;
1084 parent_elm->internal.subtree_count = new_leaf->ondisk->count;
427e5fc6
MD
1085
1086 /*
1087 * The cluster's root pointer may have to be updated.
1088 */
1089 if (made_root) {
8cd0a023
MD
1090 leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1091 hammer_modify_cluster(leaf->cluster);
1092 leaf->ondisk->parent = parent->node_offset;
1093 if (cursor->parent) {
1094 hammer_unlock(&cursor->parent->lock);
1095 hammer_rel_node(cursor->parent);
1096 }
1097 cursor->parent = parent; /* lock'd and ref'd */
427e5fc6
MD
1098 }
1099
8cd0a023
MD
1100 hammer_modify_node(leaf);
1101 hammer_modify_node(new_leaf);
1102 hammer_modify_node(parent);
1103
427e5fc6
MD
1104 /*
1105 * Ok, now adjust the cursor depending on which element the original
1106 * index was pointing at. If we are >= the split point the push node
1107 * is now in the new node.
1108 *
1109 * NOTE: If we are at the split point itself we cannot stay with the
1110 * original node because the push index will point at the right-hand
1111 * boundary, which is illegal.
1112 */
1113 if (cursor->index >= split) {
8cd0a023 1114 cursor->parent_index = parent_index + 1;
427e5fc6 1115 cursor->index -= split;
8cd0a023
MD
1116 cursor->node = new_leaf;
1117 hammer_unlock(&cursor->node->lock);
1118 hammer_rel_node(cursor->node);
1119 cursor->node = new_leaf;
1120 } else {
1121 cursor->parent_index = parent_index;
1122 hammer_unlock(&new_leaf->lock);
1123 hammer_rel_node(new_leaf);
427e5fc6 1124 }
427e5fc6
MD
1125 return (0);
1126}
1127
1128/*
8cd0a023
MD
1129 * Remove the element at (node, index). If the internal node would become
1130 * empty passively recurse up the tree.
1131 *
1132 * A locked internal node is passed to this function, the node remains
1133 * locked on return. Leaf nodes cannot be passed to this function.
1134 *
1135 * Returns EAGAIN if we were unable to acquire the needed locks. The caller
1136 * does not deal with the empty leaf until determines whether this recursion
1137 * has succeeded or not.
1138 */
1139int
1140btree_remove(hammer_node_t node, int index)
1141{
1142 hammer_node_ondisk_t ondisk;
1143 hammer_node_t parent;
1144 int error;
1145
1146 ondisk = node->ondisk;
1147 KKASSERT(ondisk->count > 0);
1148
1149 /*
1150 * Remove the element, shifting remaining elements left one.
1151 * Note that our move must include the right-boundary element.
1152 */
1153 if (ondisk->count != 1) {
1154 bcopy(&ondisk->elms[index+1], &ondisk->elms[index],
1155 (ondisk->count - index) * sizeof(ondisk->elms[0]));
1156 --ondisk->count;
1157 hammer_modify_node(node);
1158 return(0);
1159 }
1160
1161 /*
1162 * Internal nodes cannot drop to 0 elements, so remove the node
1163 * from ITS parent. If the node is the root node, convert it to
1164 * an empty leaf node (which can drop to 0 elements).
1165 */
1166 if (ondisk->parent == 0) {
1167 ondisk->count = 0;
1168 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1169 hammer_modify_node(node);
1170 return(0);
1171 }
1172
1173 /*
1174 * Try to remove the node from its parent. Return EAGAIN if we
1175 * cannot.
1176 */
1177 parent = hammer_get_node(node->cluster, ondisk->parent, &error);
1178 if (hammer_lock_ex_try(&parent->lock)) {
1179 hammer_rel_node(parent);
1180 return(EAGAIN);
1181 }
1182 ondisk = parent->ondisk;
1183 for (index = 0; index < ondisk->count; ++index) {
1184 if (ondisk->elms[index].internal.subtree_offset ==
1185 node->node_offset) {
1186 break;
1187 }
1188 }
1189 if (index == ondisk->count) {
1190 kprintf("btree_remove: lost parent linkage to node\n");
1191 error = EIO;
1192 } else {
1193 error = btree_remove(parent, index);
1194 if (error == 0) {
1195 hammer_free_btree(node->cluster, node->node_offset);
1196 /* NOTE: node can be reallocated at any time now */
1197 }
1198 }
1199 hammer_unlock(&parent->lock);
1200 hammer_rel_node(parent);
1201 return (error);
1202}
1203
1204#if 0
1205
1206/*
1207 * This routine is called on the internal node (node) prior to recursing down
1208 * through (node, index) when the node referenced by (node, index) MIGHT
1209 * have too few elements for the caller to perform a deletion.
427e5fc6
MD
1210 *
1211 * cursor->index is invalid on return because the separators may have gotten
1212 * adjusted, the caller must rescan the node's elements. The caller may set
1213 * cursor->index to -1 if it wants us to do a general rebalancing.
8cd0a023
MD
1214 *
1215 * This routine rebalances the children of the (node), collapsing children
1216 * together if possible. On return each child will have at least L/2-1
1217 * elements unless the node only has one child.
427e5fc6
MD
1218 *
1219 * NOTE: Because we do not update the parent's parent in the split code,
1220 * the subtree_count used by the caller may be incorrect. We correct it
1221 * here. Also note that we cannot change the depth of the tree's leaf
1222 * nodes here (see btree_collapse()).
1223 *
8cd0a023 1224 * NOTE: We make no attempt to rebalance inter-cluster elements.
427e5fc6
MD
1225 */
1226static
1227int
8cd0a023 1228btree_rebalance(hammer_cursor_t cursor)
427e5fc6 1229{
8cd0a023
MD
1230 hammer_node_ondisk_t ondisk;
1231 hammer_node_t node;
1232 hammer_node_t children[HAMMER_BTREE_INT_ELMS];
1233 hammer_node_t child;
1234 hammer_btree_elm_t elm;
1235 hammer_btree_elm_t elms;
427e5fc6
MD
1236 int i, j, n, nelms, goal;
1237 int maxelms, halfelms;
1238 int error;
1239
1240 /*
8cd0a023
MD
1241 * If the elm being recursed through is an inter-cluster reference,
1242 * don't worry about it.
427e5fc6 1243 */
8cd0a023
MD
1244 ondisk = cursor->node->ondisk;
1245 elm = &ondisk->elms[cursor->index];
1246 if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
1247 return(0);
1248
1249 KKASSERT(elm->internal.subtree_offset != 0);
427e5fc6
MD
1250 error = 0;
1251
1252 /*
1253 * Load the children of node and do any necessary corrections
8cd0a023 1254 * to subtree_count. subtree_count may be too low due to the
427e5fc6 1255 * way insertions split nodes. Get a count of the total number
8cd0a023 1256 * of actual elements held by our children.
427e5fc6
MD
1257 */
1258 error = 0;
1259
1260 for (i = n = 0; i < node->base.count; ++i) {
1261 struct hammer_btree_internal_elm *elm;
1262
1263 elm = &node->elms[i];
1264 children[i] = NULL;
1265 child_buffer[i] = NULL; /* must be preinitialized for bread */
1266 if (elm->subtree_offset == 0)
1267 continue;
1268 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1269 HAMMER_FSBUF_BTREE, &error,
8cd0a023 1270 &child_buffer[i], XXX);
427e5fc6
MD
1271 children[i] = child;
1272 if (child == NULL)
1273 continue;
8cd0a023 1274 XXX
427e5fc6
MD
1275 KKASSERT(node->base.subtype == child->base.type);
1276
1277 /*
1278 * Accumulate n for a good child, update the node's count
1279 * if it was wrong.
1280 */
1281 if (node->elms[i].subtree_count != child->base.count) {
1282 node->elms[i].subtree_count = child->base.count;
1283 }
1284 n += node->elms[i].subtree_count;
1285 }
1286 if (error)
1287 goto failed;
1288
1289 /*
1290 * Collect all the children's elements together
1291 */
1292 nelms = n;
1293 elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
1294 for (i = n = 0; i < node->base.count; ++i) {
1295 child = children[i];
1296 for (j = 0; j < child->base.count; ++j) {
1297 elms[n].owner = child;
8cd0a023 1298 if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF)
427e5fc6
MD
1299 elms[n].u.leaf = child->leaf.elms[j];
1300 else
1301 elms[n].u.internal = child->internal.elms[j];
1302 ++n;
1303 }
1304 }
1305 KKASSERT(n == nelms);
1306
1307 /*
1308 * Store a boundary in the elms array to ease the code below. This
1309 * is only used if the children are internal nodes.
1310 */
1311 elms[n].u.internal = node->elms[i];
1312
1313 /*
1314 * Calculate the number of elements each child should have (goal) by
1315 * reducing the number of elements until we achieve at least
1316 * halfelms - 1 per child, unless we are a degenerate case.
1317 */
1318 maxelms = btree_max_elements(node->base.subtype);
1319 halfelms = maxelms / 2;
1320
1321 goal = halfelms - 1;
1322 while (i && n / i < goal)
1323 --i;
1324
1325 /*
1326 * Now rebalance using the specified goal
1327 */
1328 for (i = n = 0; i < node->base.count; ++i) {
1329 struct hammer_buffer *subchild_buffer = NULL;
1330 struct hammer_btree_internal_node *subchild;
1331
1332 child = children[i];
1333 for (j = 0; j < goal && n < nelms; ++j) {
8cd0a023 1334 if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
427e5fc6
MD
1335 child->leaf.elms[j] = elms[n].u.leaf;
1336 } else {
1337 child->internal.elms[j] = elms[n].u.internal;
1338 }
1339
1340 /*
1341 * If the element's parent has changed we have to
1342 * update the parent pointer. This is somewhat
1343 * expensive.
1344 */
1345 if (elms[n].owner != child &&
8cd0a023 1346 node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) {
427e5fc6
MD
1347 subchild = hammer_bread(cursor->cluster,
1348 elms[n].u.internal.subtree_offset,
1349 HAMMER_FSBUF_BTREE,
1350 &error,
8cd0a023 1351 &subchild_buffer, XXX);
427e5fc6
MD
1352 if (subchild) {
1353 subchild->base.parent =
1354 hammer_bclu_offset(child_buffer[i],
1355 child);
1356 hammer_modify_buffer(subchild_buffer);
1357 }
1358 /* XXX error */
1359 }
1360 ++n;
1361 }
1362 /*
1363 * Set right boundary if the children are internal nodes.
1364 */
8cd0a023 1365 if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL)
427e5fc6
MD
1366 child->internal.elms[j] = elms[n].u.internal;
1367 child->base.count = j;
1368 hammer_modify_buffer(child_buffer[i]);
1369 if (subchild_buffer)
66325755 1370 hammer_put_buffer(subchild_buffer, 0);
427e5fc6
MD
1371
1372 /*
1373 * If we have run out of elements, break out
1374 */
1375 if (n == nelms)
1376 break;
1377 }
1378
1379 /*
1380 * Physically destroy any left-over children. These children's
1381 * elements have been packed into prior children. The node's
1382 * right hand boundary and count gets shifted to index i.
1383 *
1384 * The subtree count in the node's parent MUST be updated because
1385 * we are removing elements. The subtree_count field is allowed to
1386 * be too small, but not too large!
1387 */
1388 if (i != node->base.count) {
1389 n = i;
1390 node->elms[n] = node->elms[node->base.count];
1391 while (i < node->base.count) {
1392 hammer_free_btree_ptr(child_buffer[i], children[i]);
66325755 1393 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1394 ++i;
1395 }
1396 node->base.count = n;
1397 if (cursor->parent) {
1398 cursor->parent->elms[cursor->parent_index].subtree_count = n;
1399 hammer_modify_buffer(cursor->parent_buffer);
1400 }
1401 }
1402
1403 kfree(elms, M_HAMMER);
1404failed:
1405 hammer_modify_buffer(cursor->node_buffer);
1406 for (i = 0; i < node->base.count; ++i) {
1407 if (child_buffer[i])
66325755 1408 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1409 }
1410 return (error);
1411}
1412
1413/*
1414 * This routine is only called if the cursor is at the root node and the
1415 * root node is an internal node. We attempt to collapse the root node
1416 * by replacing it with all of its children, reducing tree depth by one.
1417 *
1418 * This is the only way to reduce tree depth in a HAMMER filesystem.
1419 * Note that all leaf nodes are at the same depth.
1420 *
1421 * This is a fairly expensive operation because we not only have to load
1422 * the root's children, we also have to scan each child and adjust the
1423 * parent offset for each element in each child. Nasty all around.
1424 */
1425static
1426int
8cd0a023 1427btree_collapse(hammer_cursor_t cursor)
427e5fc6 1428{
8cd0a023
MD
1429 hammer_btree_node_ondisk_t root, child;
1430 hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
427e5fc6
MD
1431 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1432 int count;
1433 int i, j, n;
1434 int root_modified;
1435 int error;
1436 int32_t root_offset;
1437 u_int8_t subsubtype;
1438
1439 root = cursor->node;
1440 count = root->base.count;
1441 root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1442
1443 /*
1444 * Sum up the number of children each element has. This value is
1445 * only approximate due to the way the insertion node works. It
1446 * may be too small but it will never be too large.
1447 *
1448 * Quickly terminate the collapse if the elements have too many
1449 * children.
1450 */
1451 KKASSERT(root->base.parent == 0); /* must be root node */
8cd0a023 1452 KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
427e5fc6
MD
1453 KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1454
1455 for (i = n = 0; i < count; ++i) {
1456 n += root->internal.elms[i].subtree_count;
1457 }
1458 if (n > btree_max_elements(root->base.subtype))
1459 return(0);
1460
1461 /*
1462 * Iterate through the elements again and correct the subtree_count.
1463 * Terminate the collapse if we wind up with too many.
1464 */
1465 error = 0;
1466 root_modified = 0;
1467
1468 for (i = n = 0; i < count; ++i) {
1469 struct hammer_btree_internal_elm *elm;
1470
1471 elm = &root->internal.elms[i];
1472 child_buffer[i] = NULL;
1473 children[i] = NULL;
1474 if (elm->subtree_offset == 0)
1475 continue;
1476 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1477 HAMMER_FSBUF_BTREE, &error,
8cd0a023 1478 &child_buffer[i], XXX);
427e5fc6
MD
1479 children[i] = child;
1480 if (child == NULL)
1481 continue;
1482 KKASSERT(root->base.subtype == child->base.type);
1483
1484 /*
1485 * Accumulate n for a good child, update the root's count
1486 * if it was wrong.
1487 */
1488 if (root->internal.elms[i].subtree_count != child->base.count) {
1489 root->internal.elms[i].subtree_count = child->base.count;
1490 root_modified = 1;
1491 }
1492 n += root->internal.elms[i].subtree_count;
1493 }
1494 if (error || n > btree_max_elements(root->base.subtype))
1495 goto done;
1496
1497 /*
1498 * Ok, we can collapse the root. If the root's children are leafs
1499 * the collapse is really simple. If they are internal nodes the
1500 * collapse is not so simple because we have to fixup the parent
1501 * pointers for the root's children's children.
1502 *
1503 * When collapsing an internal node the far left and far right
1504 * element's boundaries should match the root's left and right
1505 * boundaries.
1506 */
8cd0a023 1507 if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
427e5fc6
MD
1508 for (i = n = 0; i < count; ++i) {
1509 child = children[i];
1510 for (j = 0; j < child->base.count; ++j) {
1511 root->leaf.elms[n] = child->leaf.elms[j];
1512 ++n;
1513 }
1514 }
1515 root->base.type = root->base.subtype;
1516 root->base.subtype = 0;
1517 root->base.count = n;
1518 root->leaf.link_left = 0;
1519 root->leaf.link_right = 0;
1520 } else {
1521 struct hammer_btree_internal_elm *elm;
1522 struct hammer_btree_internal_node *subchild;
1523 struct hammer_buffer *subchild_buffer = NULL;
1524
1525 if (count) {
1526 child = children[0];
1527 subsubtype = child->base.subtype;
1528 KKASSERT(child->base.count > 0);
1529 KKASSERT(root->internal.elms[0].base.key ==
1530 child->internal.elms[0].base.key);
1531 child = children[count-1];
1532 KKASSERT(child->base.count > 0);
1533 KKASSERT(root->internal.elms[count].base.key ==
1534 child->internal.elms[child->base.count].base.key);
1535 } else {
1536 subsubtype = 0;
1537 }
1538 for (i = n = 0; i < count; ++i) {
1539 child = children[i];
1540 KKASSERT(child->base.subtype == subsubtype);
1541 for (j = 0; j < child->base.count; ++j) {
1542 elm = &child->internal.elms[j];
1543
1544 root->internal.elms[n] = *elm;
1545 subchild = hammer_bread(cursor->cluster,
1546 elm->subtree_offset,
1547 HAMMER_FSBUF_BTREE,
1548 &error,
8cd0a023
MD
1549 &subchild_buffer,
1550 XXX);
427e5fc6
MD
1551 if (subchild) {
1552 subchild->base.parent = root_offset;
1553 hammer_modify_buffer(subchild_buffer);
1554 }
1555 ++n;
1556 }
1557 /* make sure the right boundary is correct */
1558 /* (this gets overwritten when the loop continues) */
1559 /* XXX generate a new separator? */
1560 root->internal.elms[n] = child->internal.elms[j];
1561 }
8cd0a023 1562 root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
427e5fc6
MD
1563 root->base.subtype = subsubtype;
1564 if (subchild_buffer)
66325755 1565 hammer_put_buffer(subchild_buffer, 0);
427e5fc6
MD
1566 }
1567 root_modified = 1;
1568
1569 /*
1570 * Cleanup
1571 */
1572done:
1573 if (root_modified)
1574 hammer_modify_buffer(cursor->node_buffer);
1575 for (i = 0; i < count; ++i) {
1576 if (child_buffer[i])
66325755 1577 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1578 }
1579 return(error);
1580}
1581
8cd0a023
MD
1582#endif
1583
1584/************************************************************************
1585 * MISCELLANIOUS SUPPORT *
1586 ************************************************************************/
1587
1588/*
1589 * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
1590 *
1591 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1592 *
1593 * Note that key1 and key2 are treated differently. key1 is allowed to
1594 * wildcard some of its fields by setting them to 0, while key2 is expected
1595 * to be in an on-disk form (no wildcards).
1596 */
1597int
1598hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1599{
1600#if 0
1601 kprintf("compare obj_id %016llx %016llx\n",
1602 key1->obj_id, key2->obj_id);
1603 kprintf("compare rec_type %04x %04x\n",
1604 key1->rec_type, key2->rec_type);
1605 kprintf("compare key %016llx %016llx\n",
1606 key1->key, key2->key);
1607#endif
1608
1609 /*
1610 * A key1->obj_id of 0 matches any object id
1611 */
1612 if (key1->obj_id) {
1613 if (key1->obj_id < key2->obj_id)
1614 return(-4);
1615 if (key1->obj_id > key2->obj_id)
1616 return(4);
1617 }
1618
1619 /*
1620 * A key1->rec_type of 0 matches any record type.
1621 */
1622 if (key1->rec_type) {
1623 if (key1->rec_type < key2->rec_type)
1624 return(-3);
1625 if (key1->rec_type > key2->rec_type)
1626 return(3);
1627 }
1628
1629 /*
1630 * There is no special case for key. 0 means 0.
1631 */
1632 if (key1->key < key2->key)
1633 return(-2);
1634 if (key1->key > key2->key)
1635 return(2);
1636
1637 /*
1638 * This test has a number of special cases. create_tid in key1 is
1639 * the as-of transction id, and delete_tid in key1 is NOT USED.
1640 *
1641 * A key1->create_tid of 0 matches any record regardles of when
1642 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1643 * used to search for the most current state of the object.
1644 *
1645 * key2->create_tid is a HAMMER record and will never be
1646 * 0. key2->delete_tid is the deletion transaction id or 0 if
1647 * the record has not yet been deleted.
1648 */
1649 if (key1->create_tid) {
1650 if (key1->create_tid < key2->create_tid)
1651 return(-1);
1652 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
1653 return(1);
1654 }
1655
1656 return(0);
1657}
1658
1659/*
1660 * Create a separator half way inbetween key1 and key2. For fields just
1661 * one unit apart, the separator will match key2.
1662 *
1663 * The handling of delete_tid is a little confusing. It is only possible
1664 * to have one record in the B-Tree where all fields match except delete_tid.
1665 * This means, worse case, two adjacent elements may have a create_tid that
1666 * is one-apart and cause the separator to choose the right-hand element's
1667 * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x).
1668 *
1669 * So all we have to do is set delete_tid to the right-hand element to
1670 * guarentee that the separator is properly between the two elements.
1671 */
1672#define MAKE_SEPARATOR(key1, key2, dest, field) \
1673 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1674
1675static void
1676hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1677 hammer_base_elm_t dest)
1678{
1679 bzero(dest, sizeof(*dest));
1680 MAKE_SEPARATOR(key1, key2, dest, obj_id);
1681 MAKE_SEPARATOR(key1, key2, dest, rec_type);
1682 MAKE_SEPARATOR(key1, key2, dest, key);
1683 MAKE_SEPARATOR(key1, key2, dest, create_tid);
1684 dest->delete_tid = key2->delete_tid;
1685}
1686
1687#undef MAKE_SEPARATOR
1688
1689/*
1690 * Return whether a generic internal or leaf node is full
1691 */
1692static int
1693btree_node_is_full(hammer_node_ondisk_t node)
1694{
1695 switch(node->type) {
1696 case HAMMER_BTREE_TYPE_INTERNAL:
1697 if (node->count == HAMMER_BTREE_INT_ELMS)
1698 return(1);
1699 break;
1700 case HAMMER_BTREE_TYPE_LEAF:
1701 if (node->count == HAMMER_BTREE_LEAF_ELMS)
1702 return(1);
1703 break;
1704 default:
1705 panic("illegal btree subtype");
1706 }
1707 return(0);
1708}
1709
1710#if 0
1711static int
1712btree_max_elements(u_int8_t type)
1713{
1714 if (type == HAMMER_BTREE_TYPE_LEAF)
1715 return(HAMMER_BTREE_LEAF_ELMS);
1716 if (type == HAMMER_BTREE_TYPE_INTERNAL)
1717 return(HAMMER_BTREE_INT_ELMS);
1718 panic("btree_max_elements: bad type %d\n", type);
1719}
1720#endif
1721