Sync the adduser(8) command with FreeBSD. Summary:
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
CommitLineData
427e5fc6
MD
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
fbc6e32a 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
427e5fc6
MD
35 */
36
37/*
8cd0a023 38 * HAMMER B-Tree index
427e5fc6
MD
39 *
40 * HAMMER implements a modified B+Tree. In documentation this will
8cd0a023 41 * simply be refered to as the HAMMER B-Tree. Basically a B-Tree
427e5fc6
MD
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
8cd0a023 45 * otherwords, we have boundary elements at the two ends of a B-Tree node
427e5fc6
MD
46 * instead of sub-tree pointers.
47 *
8cd0a023 48 * A B-Tree internal node looks like this:
427e5fc6
MD
49 *
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
52 *
8cd0a023 53 * A B-Tree leaf node basically looks like this:
427e5fc6
MD
54 *
55 * L L L L L L L L <-- leaf elemenets
56 *
8cd0a023
MD
57 * The radix for an internal node is 1 less then a leaf but we get a
58 * number of significant benefits for our troubles.
427e5fc6 59 *
8cd0a023
MD
60 * The big benefit to using a B-Tree containing boundary information
61 * is that it is possible to cache pointers into the middle of the tree
62 * and not have to start searches, insertions, OR deletions at the root
63 * node. In particular, searches are able to progress in a definitive
64 * direction from any point in the tree without revisting nodes. This
65 * greatly improves the efficiency of many operations, most especially
66 * record appends.
427e5fc6 67 *
8cd0a023
MD
68 * B-Trees also make the stacking of trees fairly straightforward.
69 *
70 * INTER-CLUSTER ELEMENTS: An element of an internal node may reference
71 * the root of another cluster rather then a node in the current cluster.
72 * This is known as an inter-cluster references. Only B-Tree searches
73 * will cross cluster boundaries. The rebalancing and collapse code does
74 * not attempt to move children between clusters. A major effect of this
75 * is that we have to relax minimum element count requirements and allow
76 * trees to become somewhat unabalanced.
77 *
78 * INSERTIONS AND DELETIONS: When inserting we split full nodes on our
79 * way down as an optimization. I originally experimented with rebalancing
80 * nodes on the way down for deletions but it created a huge mess due to
81 * the way inter-cluster linkages work. Instead, now I simply allow
82 * the tree to become unbalanced and allow leaf nodes to become empty.
83 * The delete code will try to clean things up from the bottom-up but
84 * will stop if related elements are not in-core or if it cannot get a node
85 * lock.
86 */
87#include "hammer.h"
88#include <sys/buf.h>
89#include <sys/buf2.h>
66325755 90
8cd0a023
MD
91static int btree_search(hammer_cursor_t cursor, int flags);
92static int btree_split_internal(hammer_cursor_t cursor);
93static int btree_split_leaf(hammer_cursor_t cursor);
195c19a1 94static int btree_remove(hammer_cursor_t cursor);
7f7c1f84 95static int btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm);
8cd0a023
MD
96#if 0
97static int btree_rebalance(hammer_cursor_t cursor);
98static int btree_collapse(hammer_cursor_t cursor);
99#endif
100static int btree_node_is_full(hammer_node_ondisk_t node);
101static void hammer_make_separator(hammer_base_elm_t key1,
102 hammer_base_elm_t key2, hammer_base_elm_t dest);
66325755
MD
103
104/*
8cd0a023
MD
105 * Iterate records after a search. The cursor is iterated forwards past
106 * the current record until a record matching the key-range requirements
107 * is found. ENOENT is returned if the iteration goes past the ending
108 * key.
66325755 109 *
8cd0a023
MD
110 * key_beg/key_end is an INCLUSVE range. i.e. if you are scanning to load
111 * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
112 * offset of 4095.
66325755 113 *
8cd0a023
MD
114 * cursor->key_beg may or may not be modified by this function during
115 * the iteration.
66325755
MD
116 */
117int
8cd0a023 118hammer_btree_iterate(hammer_cursor_t cursor)
66325755 119{
8cd0a023
MD
120 hammer_node_ondisk_t node;
121 hammer_btree_elm_t elm;
66325755
MD
122 int error;
123 int r;
76376933 124#if 0
66325755 125 int s;
76376933
MD
126 int64_t save_key;
127#endif
66325755
MD
128
129 /*
8cd0a023 130 * Skip past the current record
66325755 131 */
8cd0a023 132 node = cursor->node->ondisk;
a89aec1b
MD
133 if (node == NULL)
134 return(ENOENT);
c0ade690
MD
135 if (cursor->index < node->count &&
136 (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
66325755 137 ++cursor->index;
c0ade690 138 }
66325755 139
8cd0a023
MD
140 /*
141 * Loop until an element is found or we are done.
142 */
66325755
MD
143 for (;;) {
144 /*
8cd0a023
MD
145 * We iterate up the tree and then index over one element
146 * while we are at the last element in the current node.
147 *
148 * NOTE: This can pop us up to another cluster.
66325755 149 *
8cd0a023
MD
150 * If we are at the root of the root cluster, cursor_up
151 * returns ENOENT.
152 *
153 * NOTE: hammer_cursor_up() will adjust cursor->key_beg
154 * when told to re-search for the cluster tag.
66325755
MD
155 *
156 * XXX this could be optimized by storing the information in
157 * the parent reference.
195c19a1
MD
158 *
159 * XXX we can lose the node lock temporarily, this could mess
160 * up our scan.
66325755 161 */
8cd0a023 162 if (cursor->index == node->count) {
195c19a1 163 error = hammer_cursor_up(cursor, 0);
8cd0a023
MD
164 if (error)
165 break;
166 node = cursor->node->ondisk;
167 KKASSERT(cursor->index != node->count);
168 ++cursor->index;
169 continue;
66325755
MD
170 }
171
172 /*
173 * Iterate down the tree while we are at an internal node.
174 * Nodes cannot be empty, assert the case because if one is
175 * we will wind up in an infinite loop.
176 *
177 * We can avoid iterating through large swaths of transaction
178 * id space if the left and right separators are the same
179 * except for their transaction spaces. We can then skip
180 * the node if the left and right transaction spaces are the
181 * same sign. This directly optimized accesses to files with
8cd0a023
MD
182 * HUGE transactional histories, such as database files,
183 * allowing us to avoid having to iterate through the entire
184 * history.
66325755 185 */
8cd0a023 186 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
a89aec1b 187 KKASSERT(node->count != 0);
8cd0a023 188 elm = &node->elms[cursor->index];
76376933
MD
189#if 0
190 /*
191 * temporarily disable this optimization, it needs
192 * more of a theoretical review.
193 */
66325755
MD
194 if (elm[0].base.obj_id == elm[1].base.obj_id &&
195 elm[0].base.rec_type == elm[1].base.rec_type &&
196 elm[0].base.key == elm[1].base.key) {
8cd0a023
MD
197 /*
198 * Left side transaction space
199 */
200 save_key = cursor->key_beg.key;
201 cursor->key_beg.key = elm[0].base.key;
202 r = hammer_btree_cmp(&cursor->key_beg,
203 &elm[0].base);
204 cursor->key_beg.key = save_key;
205
206 /*
207 * Right side transaction space
208 */
209 save_key = cursor->key_end.key;
210 cursor->key_end.key = elm[1].base.key;
211 s = hammer_btree_cmp(&cursor->key_end,
212 &elm[1].base);
213 cursor->key_end.key = save_key;
214
215 /*
216 * If our range is entirely on one side or
217 * the other side we can skip the sub-tree.
218 */
66325755
MD
219 if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
220 ++cursor->index;
8cd0a023 221 continue;
66325755
MD
222 }
223 }
76376933 224#endif
8cd0a023 225 error = hammer_cursor_down(cursor);
66325755 226 if (error)
8cd0a023
MD
227 break;
228 KKASSERT(cursor->index == 0);
8cd0a023
MD
229 node = cursor->node->ondisk;
230 continue;
66325755
MD
231 }
232
233 /*
8cd0a023
MD
234 * We are at a leaf.
235 *
236 * Determine if the record at the cursor has gone beyond the
237 * end of our range. Remember that our key range is inclusive.
238 *
239 * When iterating we may have to 'pick out' records matching
240 * our transaction requirements. A comparison return of
241 * +1 or -1 indicates a transactional record that is too
242 * old or too new but does not terminate the search.
66325755 243 */
8cd0a023 244 elm = &node->elms[cursor->index];
c0ade690 245 r = hammer_btree_range_cmp(cursor, &elm->base);
8cd0a023
MD
246 if (r == -1 || r == 1) {
247 ++cursor->index;
248 continue;
66325755
MD
249 }
250
251 /*
76376933 252 * We either found a match or are now out of bounds.
66325755 253 */
76376933 254 error = r ? ENOENT : 0;
8cd0a023 255 break;
427e5fc6 256 }
66325755 257 return(error);
427e5fc6
MD
258}
259
260/*
8cd0a023
MD
261 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
262 * could not be found, and a fatal error otherwise.
263 *
264 * The cursor is suitably positioned for a deletion on success, and suitably
265 * positioned for an insertion on ENOENT.
427e5fc6 266 *
8cd0a023
MD
267 * The cursor may begin anywhere, the search will traverse clusters in
268 * either direction to locate the requested element.
427e5fc6
MD
269 */
270int
8cd0a023 271hammer_btree_lookup(hammer_cursor_t cursor)
427e5fc6 272{
66325755
MD
273 int error;
274
8cd0a023
MD
275 error = btree_search(cursor, 0);
276 if (error == 0 && cursor->flags)
277 error = hammer_btree_extract(cursor, cursor->flags);
66325755
MD
278 return(error);
279}
280
8cd0a023
MD
281/*
282 * Extract the record and/or data associated with the cursor's current
283 * position. Any prior record or data stored in the cursor is replaced.
284 * The cursor must be positioned at a leaf node.
285 *
286 * NOTE: Only records can be extracted from internal B-Tree nodes, and
287 * only for inter-cluster references. At the moment we only support
288 * extractions from leaf nodes.
289 */
66325755 290int
8cd0a023 291hammer_btree_extract(hammer_cursor_t cursor, int flags)
66325755 292{
8cd0a023
MD
293 hammer_node_ondisk_t node;
294 hammer_btree_elm_t elm;
295 hammer_cluster_t cluster;
c0ade690 296 u_int64_t buf_type;
427e5fc6
MD
297 int32_t cloff;
298 int error;
427e5fc6 299
8cd0a023 300 /*
427e5fc6 301 * A cluster record type has no data reference, the information
8cd0a023 302 * is stored directly in the record and B-Tree element.
427e5fc6
MD
303 *
304 * The case where the data reference resolves to the same buffer
305 * as the record reference must be handled.
306 */
8cd0a023
MD
307 node = cursor->node->ondisk;
308 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
309 elm = &node->elms[cursor->index];
310 cluster = cursor->node->cluster;
66325755
MD
311 error = 0;
312
a89aec1b 313 if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
8cd0a023
MD
314 cloff = elm->leaf.rec_offset;
315 cursor->record = hammer_bread(cluster, cloff,
316 HAMMER_FSBUF_RECORDS, &error,
317 &cursor->record_buffer);
427e5fc6
MD
318 } else {
319 cloff = 0;
320 }
a89aec1b 321 if ((flags & HAMMER_CURSOR_GET_DATA) && error == 0) {
8cd0a023
MD
322 if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) {
323 /*
c0ade690
MD
324 * The data is not in the same buffer as the last
325 * record we cached, but it could still be embedded
326 * in a record. Note that we may not have loaded the
327 * record's buffer above, depending on flags.
8cd0a023 328 */
c0ade690
MD
329 if ((elm->leaf.rec_offset ^ elm->leaf.data_offset) &
330 ~HAMMER_BUFMASK) {
331 if (elm->leaf.data_len & HAMMER_BUFMASK)
332 buf_type = HAMMER_FSBUF_DATA;
333 else
334 buf_type = 0; /* pure data buffer */
335 } else {
336 buf_type = HAMMER_FSBUF_RECORDS;
337 }
8cd0a023
MD
338 cursor->data = hammer_bread(cluster,
339 elm->leaf.data_offset,
c0ade690 340 buf_type, &error,
8cd0a023 341 &cursor->data_buffer);
427e5fc6 342 } else {
8cd0a023
MD
343 /*
344 * Data in same buffer as record. Note that we
345 * leave any existing data_buffer intact, even
346 * though we don't use it in this case, in case
347 * other records extracted during an iteration
348 * go back to it.
c0ade690
MD
349 *
350 * Just assume the buffer type is correct.
8cd0a023
MD
351 */
352 cursor->data = (void *)
353 ((char *)cursor->record_buffer->ondisk +
354 (elm->leaf.data_offset & HAMMER_BUFMASK));
427e5fc6
MD
355 }
356 }
357 return(error);
358}
359
360
361/*
8cd0a023
MD
362 * Insert a leaf element into the B-Tree at the current cursor position.
363 * The cursor is positioned such that the element at and beyond the cursor
364 * are shifted to make room for the new record.
365 *
a89aec1b 366 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
8cd0a023
MD
367 * flag set and that call must return ENOENT before this function can be
368 * called.
369 *
370 * ENOSPC is returned if there is no room to insert a new record.
427e5fc6
MD
371 */
372int
8cd0a023 373hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm)
427e5fc6 374{
8cd0a023
MD
375 hammer_node_ondisk_t parent;
376 hammer_node_ondisk_t node;
427e5fc6
MD
377 int i;
378
66325755
MD
379#if 0
380 /* HANDLED BY CALLER */
427e5fc6
MD
381 /*
382 * Issue a search to get our cursor at the right place. The search
383 * will get us to a leaf node.
384 *
385 * The search also does some setup for our insert, so there is always
386 * room in the leaf.
387 */
a89aec1b 388 error = btree_search(cursor, HAMMER_CURSOR_INSERT);
427e5fc6
MD
389 if (error != ENOENT) {
390 if (error == 0)
391 error = EEXIST;
392 return (error);
393 }
66325755 394#endif
427e5fc6
MD
395
396 /*
397 * Insert the element at the leaf node and update the count in the
398 * parent. It is possible for parent to be NULL, indicating that
8cd0a023 399 * the root of the B-Tree in the cluster is a leaf. It is also
427e5fc6
MD
400 * possible for the leaf to be empty.
401 *
402 * Remember that the right-hand boundary is not included in the
403 * count.
404 */
8cd0a023 405 node = cursor->node->ondisk;
427e5fc6 406 i = cursor->index;
8cd0a023
MD
407 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
408 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
409 if (i != node->count) {
410 bcopy(&node->elms[i], &node->elms[i+1],
411 (node->count - i) * sizeof(*elm));
412 }
413 node->elms[i] = *elm;
414 ++node->count;
415 hammer_modify_node(cursor->node);
427e5fc6 416
c0ade690
MD
417 /*
418 * Adjust the sub-tree count in the parent. note that the parent
419 * may be in a different cluster.
420 */
421 if (cursor->parent) {
422 parent = cursor->parent->ondisk;
427e5fc6 423 i = cursor->parent_index;
8cd0a023
MD
424 ++parent->elms[i].internal.subtree_count;
425 KKASSERT(parent->elms[i].internal.subtree_count <= node->count);
426 hammer_modify_node(cursor->parent);
427e5fc6 427 }
427e5fc6
MD
428 return(0);
429}
430
431/*
8cd0a023
MD
432 * Delete a record from the B-Tree's at the current cursor position.
433 * The cursor is positioned such that the current element is the one
434 * to be deleted.
435 *
195c19a1
MD
436 * On return the cursor will be positioned after the deleted element and
437 * MAY point to an internal node. It will be suitable for the continuation
438 * of an iteration but not for an insertion or deletion.
8cd0a023 439 *
195c19a1
MD
440 * Deletions will attempt to partially rebalance the B-Tree in an upward
441 * direction. It is possible to end up with empty leafs. An empty internal
442 * node is impossible (worst case: it has one element pointing to an empty
443 * leaf).
427e5fc6
MD
444 */
445int
8cd0a023 446hammer_btree_delete(hammer_cursor_t cursor)
427e5fc6 447{
8cd0a023
MD
448 hammer_node_ondisk_t ondisk;
449 hammer_node_t node;
450 hammer_node_t parent;
451 hammer_btree_elm_t elm;
452 int error;
427e5fc6
MD
453 int i;
454
66325755
MD
455#if 0
456 /* HANDLED BY CALLER */
427e5fc6
MD
457 /*
458 * Locate the leaf element to delete. The search is also responsible
459 * for doing some of the rebalancing work on its way down.
460 */
a89aec1b 461 error = btree_search(cursor, HAMMER_CURSOR_DELETE);
427e5fc6
MD
462 if (error)
463 return (error);
66325755 464#endif
427e5fc6
MD
465
466 /*
8cd0a023 467 * Delete the element from the leaf node.
427e5fc6 468 *
8cd0a023 469 * Remember that leaf nodes do not have boundaries.
427e5fc6 470 */
8cd0a023
MD
471 node = cursor->node;
472 ondisk = node->ondisk;
427e5fc6
MD
473 i = cursor->index;
474
8cd0a023
MD
475 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
476 if (i + 1 != ondisk->count) {
477 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
478 (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
479 }
480 --ondisk->count;
195c19a1 481 hammer_modify_node(node);
8cd0a023 482 if (cursor->parent != NULL) {
427e5fc6
MD
483 /*
484 * Adjust parent's notion of the leaf's count. subtree_count
8cd0a023 485 * is only approximate, it is allowed to be too small but
427e5fc6
MD
486 * never allowed to be too large. Make sure we don't drop
487 * the count below 0.
488 */
8cd0a023
MD
489 parent = cursor->parent;
490 elm = &parent->ondisk->elms[cursor->parent_index];
491 if (elm->internal.subtree_count)
492 --elm->internal.subtree_count;
493 KKASSERT(elm->internal.subtree_count <= ondisk->count);
494 hammer_modify_node(parent);
427e5fc6 495 }
427e5fc6 496
8cd0a023 497 /*
195c19a1
MD
498 * It is possible, but not desireable, to stop here. If the element
499 * count drops to 0 (which is allowed for a leaf), try recursively
500 * remove the B-Tree node.
8cd0a023 501 *
195c19a1
MD
502 * XXX rebalancing calls would go here too.
503 *
504 * This may reposition the cursor at one of the parent's of the
505 * current node.
8cd0a023
MD
506 */
507 if (ondisk->count == 0) {
195c19a1
MD
508 error = btree_remove(cursor);
509 if (error == EAGAIN)
8cd0a023 510 error = 0;
8cd0a023 511 } else {
8cd0a023
MD
512 error = 0;
513 }
514 return(error);
515}
427e5fc6
MD
516
517/*
8cd0a023
MD
518 * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE
519 *
520 * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
521 *
522 * The search begins at the current node and will instantiate a NULL
523 * parent if necessary and if not at the root of the cluster. On return
524 * parent will be non-NULL unless the cursor is sitting at a root-leaf.
525 *
526 * The search code may be forced to iterate up the tree if the conditions
527 * required for an insertion or deletion are not met. This does not occur
528 * very often.
529 *
530 * INSERTIONS: The search will split full nodes and leaves on its way down
531 * and guarentee that the leaf it ends up on is not full.
427e5fc6 532 *
8cd0a023 533 * DELETIONS: The search will rebalance the tree on its way down.
fbc6e32a
MD
534 *
535 * The search is only guarenteed to end up on a leaf if an error code of 0
536 * is returned, or if inserting and an error code of ENOENT is returned.
427e5fc6 537 */
8cd0a023 538static
427e5fc6 539int
8cd0a023 540btree_search(hammer_cursor_t cursor, int flags)
427e5fc6 541{
8cd0a023
MD
542 hammer_node_ondisk_t node;
543 hammer_cluster_t cluster;
544 int error;
545 int i;
546 int r;
547
548 flags |= cursor->flags;
549
550 /*
551 * Move our cursor up the tree until we find a node whos range covers
552 * the key we are trying to locate. This may move us between
553 * clusters.
554 *
555 * The left bound is inclusive, the right bound is non-inclusive.
556 * It is ok to cursor up too far so when cursoring across a cluster
557 * boundary.
558 *
559 * First see if we can skip the whole cluster. hammer_cursor_up()
560 * handles both cases but this way we don't check the cluster
561 * bounds when going up the tree within a cluster.
562 */
563 cluster = cursor->node->cluster;
564 while (
565 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 ||
566 hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) {
567 error = hammer_cursor_toroot(cursor);
568 if (error)
569 goto done;
195c19a1 570 error = hammer_cursor_up(cursor, 0);
8cd0a023
MD
571 if (error)
572 goto done;
573 cluster = cursor->node->cluster;
427e5fc6
MD
574 }
575
576 /*
8cd0a023
MD
577 * Deal with normal cursoring within a cluster. The right bound
578 * is non-inclusive. That is, the bounds form a separator.
427e5fc6 579 */
8cd0a023
MD
580 while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 ||
581 hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) {
195c19a1 582 error = hammer_cursor_up(cursor, 0);
8cd0a023
MD
583 if (error)
584 goto done;
427e5fc6 585 }
427e5fc6 586
8cd0a023
MD
587 /*
588 * We better have ended up with a node somewhere, and our second
589 * while loop had better not have traversed up a cluster.
590 */
591 KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster);
592
593 /*
594 * If we are inserting we can't start at a full node if the parent
595 * is also full (because there is no way to split the node),
596 * continue running up the tree until we hit the root of the
597 * root cluster or until the requirement is satisfied.
598 *
599 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
600 *
601 * XXX as an optimization it should be possible to unbalance the tree
602 * and stop at the root of the current cluster.
603 */
a89aec1b 604 while (flags & HAMMER_CURSOR_INSERT) {
8cd0a023
MD
605 if (btree_node_is_full(cursor->node->ondisk) == 0)
606 break;
607 if (cursor->parent == NULL)
608 break;
609 if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS)
610 break;
195c19a1 611 error = hammer_cursor_up(cursor, 0);
8cd0a023
MD
612 /* cluster and node are now may become stale */
613 if (error)
614 goto done;
427e5fc6 615 }
8cd0a023 616 /* cluster = cursor->node->cluster; not needed until next cluster = */
427e5fc6 617
8cd0a023
MD
618#if 0
619 /*
620 * If we are deleting we can't start at an internal node with only
621 * one element unless it is root, because all of our code assumes
622 * that internal nodes will never be empty. Just do this generally
623 * for both leaf and internal nodes to get better balance.
624 *
625 * This handles the case where the cursor is sitting at a leaf and
626 * either the leaf or parent contain an insufficient number of
627 * elements.
628 *
629 * NOTE: These cursor-up's CAN continue to cross cluster boundaries.
a89aec1b
MD
630 *
631 * XXX NOTE: Iterations may not set this flag anyway.
8cd0a023 632 */
a89aec1b 633 while (flags & HAMMER_CURSOR_DELETE) {
8cd0a023
MD
634 if (cursor->node->ondisk->count > 1)
635 break;
636 if (cursor->parent == NULL)
637 break;
638 KKASSERT(cursor->node->ondisk->count != 0);
195c19a1 639 error = hammer_cursor_up(cursor, 0);
8cd0a023
MD
640 /* cluster and node are now may become stale */
641 if (error)
642 goto done;
643 }
644#endif
427e5fc6 645
8cd0a023
MD
646/*new_cluster:*/
647 /*
648 * Push down through internal nodes to locate the requested key.
649 */
650 cluster = cursor->node->cluster;
651 node = cursor->node->ondisk;
652 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
653#if 0
427e5fc6 654 /*
8cd0a023
MD
655 * If we are a the root node and deleting, try to collapse
656 * all of the root's children into the root. This is the
657 * only point where tree depth is reduced.
a89aec1b
MD
658 *
659 * XXX NOTE: Iterations may not set this flag anyway.
427e5fc6 660 */
a89aec1b 661 if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
8cd0a023
MD
662 error = btree_collapse(cursor);
663 /* node becomes stale after call */
664 if (error)
665 goto done;
66325755 666 }
8cd0a023
MD
667 node = cursor->node->ondisk;
668#endif
8cd0a023
MD
669 /*
670 * Scan the node to find the subtree index to push down into.
fbc6e32a 671 * We go one-past, then back-up.
8cd0a023
MD
672 */
673 for (i = 0; i < node->count; ++i) {
674 r = hammer_btree_cmp(&cursor->key_beg,
675 &node->elms[i].base);
676 if (r < 0)
677 break;
678 }
8cd0a023
MD
679
680 /*
fbc6e32a
MD
681 * It is possible for the search to terminate at i == 0,
682 * which is to the LEFT of the LEFT boundary but the RIGHT
683 * of the parent's boundary on the left of the sub-tree
684 * element. This case can occur due to deletions (see
685 * btree_remove()).
686 *
687 * When this case occurs an ENOENT return is guarenteed but
688 * if inserting we must still terminate at a leaf. The
689 * solution is to make the node's left boundary inherit the
690 * boundary stored in the parent.
691 *
692 * When doing this inheritance some fields in 'base' are
693 * actually related to the internal element's child
694 * specification and not to the key. These have to be
695 * retained.
8cd0a023 696 */
fbc6e32a
MD
697 if (i == 0) {
698 u_int8_t save;
699 if ((flags & HAMMER_CURSOR_INSERT) == 0) {
700 cursor->index = 0;
701 return(ENOENT);
702 }
703 save = node->elms[0].subtree_type;
704 node->elms[0].base = *cursor->left_bound;
705 node->elms[0].subtree_type = save;
706 hammer_modify_node(cursor->node);
707 } else {
708 /*
709 * The push-down index is now i - 1.
710 */
711 --i;
712 }
8cd0a023
MD
713 cursor->index = i;
714
715 /*
716 * Handle insertion and deletion requirements.
717 *
718 * If inserting split full nodes. The split code will
719 * adjust cursor->node and cursor->index if the current
720 * index winds up in the new node.
721 */
a89aec1b 722 if (flags & HAMMER_CURSOR_INSERT) {
8cd0a023
MD
723 if (node->count == HAMMER_BTREE_INT_ELMS) {
724 error = btree_split_internal(cursor);
725 if (error)
726 goto done;
727 /*
728 * reload stale pointers
729 */
730 i = cursor->index;
731 node = cursor->node->ondisk;
732 }
733 }
734
735#if 0
427e5fc6 736 /*
8cd0a023
MD
737 * If deleting rebalance - do not allow the child to have
738 * just one element or we will not be able to delete it.
739 *
740 * Neither internal or leaf nodes (except a root-leaf) are
741 * allowed to drop to 0 elements. (XXX - well, leaf nodes
742 * can at the moment).
743 *
744 * Our separators may have been reorganized after rebalancing,
745 * so we have to pop back up and rescan.
746 *
747 * XXX test for subtree_count < maxelms / 2, minus 1 or 2
748 * for hysteresis?
a89aec1b
MD
749 *
750 * XXX NOTE: Iterations may not set this flag anyway.
427e5fc6 751 */
a89aec1b 752 if (flags & HAMMER_CURSOR_DELETE) {
8cd0a023
MD
753 if (node->elms[i].internal.subtree_count <= 1) {
754 error = btree_rebalance(cursor);
755 if (error)
756 goto done;
757 /* cursor->index is invalid after call */
758 goto new_cluster;
759 }
760 }
761#endif
427e5fc6
MD
762
763 /*
8cd0a023
MD
764 * Push down (push into new node, existing node becomes
765 * the parent).
427e5fc6 766 */
8cd0a023
MD
767 error = hammer_cursor_down(cursor);
768 /* node and cluster become stale */
769 if (error)
770 goto done;
771 node = cursor->node->ondisk;
772 cluster = cursor->node->cluster;
427e5fc6 773 }
427e5fc6 774
8cd0a023
MD
775 /*
776 * We are at a leaf, do a linear search of the key array.
777 * (XXX do a binary search). On success the index is set to the
778 * matching element, on failure the index is set to the insertion
779 * point.
780 *
781 * Boundaries are not stored in leaf nodes, so the index can wind
782 * up to the left of element 0 (index == 0) or past the end of
783 * the array (index == node->count).
784 */
785 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
786
787 for (i = 0; i < node->count; ++i) {
788 r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base);
427e5fc6 789
427e5fc6 790 /*
8cd0a023 791 * Stop if we've flipped past key_beg
427e5fc6 792 */
8cd0a023
MD
793 if (r < 0)
794 break;
427e5fc6 795
66325755 796 /*
8cd0a023 797 * Return an exact match
66325755 798 */
8cd0a023
MD
799 if (r == 0) {
800 cursor->index = i;
801 error = 0;
802 goto done;
66325755 803 }
427e5fc6 804 }
8cd0a023
MD
805
806 /*
807 * No exact match was found, i is now at the insertion point.
808 *
809 * If inserting split a full leaf before returning. This
810 * may have the side effect of adjusting cursor->node and
811 * cursor->index.
812 */
813 cursor->index = i;
a89aec1b 814 if ((flags & HAMMER_CURSOR_INSERT) &&
8cd0a023
MD
815 node->count == HAMMER_BTREE_LEAF_ELMS) {
816 error = btree_split_leaf(cursor);
817 /* NOT USED
818 i = cursor->index;
819 node = &cursor->node->internal;
820 */
821 if (error)
822 goto done;
823 }
824 error = ENOENT;
8cd0a023 825done:
427e5fc6
MD
826 return(error);
827}
828
8cd0a023 829
427e5fc6 830/************************************************************************
8cd0a023 831 * SPLITTING AND MERGING *
427e5fc6
MD
832 ************************************************************************
833 *
834 * These routines do all the dirty work required to split and merge nodes.
835 */
836
837/*
8cd0a023 838 * Split an internal node into two nodes and move the separator at the split
427e5fc6
MD
839 * point to the parent. Note that the parent's parent's element pointing
840 * to our parent will have an incorrect subtree_count (we don't update it).
841 * It will be low, which is ok.
842 *
8cd0a023
MD
843 * (cursor->node, cursor->index) indicates the element the caller intends
844 * to push into. We will adjust node and index if that element winds
427e5fc6 845 * up in the split node.
8cd0a023
MD
846 *
847 * If we are at the root of a cluster a new root must be created with two
848 * elements, one pointing to the original root and one pointing to the
849 * newly allocated split node.
850 *
851 * NOTE! Being at the root of a cluster is different from being at the
852 * root of the root cluster. cursor->parent will not be NULL and
853 * cursor->node->ondisk.parent must be tested against 0. Theoretically
854 * we could propogate the algorithm into the parent and deal with multiple
855 * 'roots' in the cluster header, but it's easier not to.
427e5fc6
MD
856 */
857static
858int
8cd0a023 859btree_split_internal(hammer_cursor_t cursor)
427e5fc6 860{
8cd0a023
MD
861 hammer_node_ondisk_t ondisk;
862 hammer_node_t node;
863 hammer_node_t parent;
864 hammer_node_t new_node;
865 hammer_btree_elm_t elm;
866 hammer_btree_elm_t parent_elm;
427e5fc6
MD
867 int parent_index;
868 int made_root;
869 int split;
870 int error;
7f7c1f84 871 int i;
8cd0a023 872 const int esize = sizeof(*elm);
427e5fc6
MD
873
874 /*
875 * We are splitting but elms[split] will be promoted to the parent,
876 * leaving the right hand node with one less element. If the
877 * insertion point will be on the left-hand side adjust the split
878 * point to give the right hand side one additional node.
879 */
8cd0a023
MD
880 node = cursor->node;
881 ondisk = node->ondisk;
882 split = (ondisk->count + 1) / 2;
427e5fc6
MD
883 if (cursor->index <= split)
884 --split;
885 error = 0;
886
887 /*
8cd0a023 888 * If we are at the root of the cluster, create a new root node with
427e5fc6
MD
889 * 1 element and split normally. Avoid making major modifications
890 * until we know the whole operation will work.
8cd0a023
MD
891 *
892 * The root of the cluster is different from the root of the root
893 * cluster. Use the node's on-disk structure's parent offset to
894 * detect the case.
427e5fc6 895 */
8cd0a023
MD
896 if (ondisk->parent == 0) {
897 parent = hammer_alloc_btree(node->cluster, &error);
427e5fc6
MD
898 if (parent == NULL)
899 return(error);
8cd0a023
MD
900 hammer_lock_ex(&parent->lock);
901 ondisk = parent->ondisk;
902 ondisk->count = 1;
903 ondisk->parent = 0;
904 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
905 ondisk->elms[0].base = node->cluster->clu_btree_beg;
906 ondisk->elms[0].internal.subtree_type = node->ondisk->type;
907 ondisk->elms[0].internal.subtree_offset = node->node_offset;
908 ondisk->elms[1].base = node->cluster->clu_btree_end;
427e5fc6 909 made_root = 1;
8cd0a023 910 parent_index = 0; /* index of current node in parent */
427e5fc6
MD
911 } else {
912 made_root = 0;
8cd0a023
MD
913 parent = cursor->parent;
914 parent_index = cursor->parent_index;
195c19a1 915 KKASSERT(parent->cluster == node->cluster);
427e5fc6 916 }
427e5fc6
MD
917
918 /*
919 * Split node into new_node at the split point.
920 *
921 * B O O O P N N B <-- P = node->elms[split]
922 * 0 1 2 3 4 5 6 <-- subtree indices
923 *
924 * x x P x x
925 * s S S s
926 * / \
927 * B O O O B B N N B <--- inner boundary points are 'P'
928 * 0 1 2 3 4 5 6
929 *
930 */
8cd0a023 931 new_node = hammer_alloc_btree(node->cluster, &error);
427e5fc6 932 if (new_node == NULL) {
8cd0a023
MD
933 if (made_root) {
934 hammer_unlock(&parent->lock);
195c19a1 935 hammer_free_btree(parent->cluster, parent->node_offset);
8cd0a023
MD
936 hammer_rel_node(parent);
937 }
427e5fc6
MD
938 return(error);
939 }
8cd0a023 940 hammer_lock_ex(&new_node->lock);
427e5fc6
MD
941
942 /*
8cd0a023 943 * Create the new node. P becomes the left-hand boundary in the
427e5fc6
MD
944 * new node. Copy the right-hand boundary as well.
945 *
946 * elm is the new separator.
947 */
8cd0a023
MD
948 ondisk = node->ondisk;
949 elm = &ondisk->elms[split];
950 bcopy(elm, &new_node->ondisk->elms[0],
951 (ondisk->count - split + 1) * esize);
952 new_node->ondisk->count = ondisk->count - split;
953 new_node->ondisk->parent = parent->node_offset;
954 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
955 KKASSERT(ondisk->type == new_node->ondisk->type);
427e5fc6
MD
956
957 /*
958 * Cleanup the original node. P becomes the new boundary, its
959 * subtree_offset was moved to the new node. If we had created
960 * a new root its parent pointer may have changed.
961 */
8cd0a023 962 elm->internal.subtree_offset = 0;
c0ade690 963 ondisk->count = split;
427e5fc6
MD
964
965 /*
966 * Insert the separator into the parent, fixup the parent's
967 * reference to the original node, and reference the new node.
968 * The separator is P.
969 *
970 * Remember that base.count does not include the right-hand boundary.
971 */
8cd0a023
MD
972 ondisk = parent->ondisk;
973 ondisk->elms[parent_index].internal.subtree_count = split;
974 parent_elm = &ondisk->elms[parent_index+1];
427e5fc6 975 bcopy(parent_elm, parent_elm + 1,
8cd0a023
MD
976 (ondisk->count - parent_index) * esize);
977 parent_elm->internal.base = elm->base; /* separator P */
978 parent_elm->internal.subtree_offset = new_node->node_offset;
979 parent_elm->internal.subtree_count = new_node->ondisk->count;
7f7c1f84 980 parent_elm->internal.subtree_type = new_node->ondisk->type;
76376933 981 ++ondisk->count;
427e5fc6 982
7f7c1f84
MD
983 /*
984 * The children of new_node need their parent pointer set to new_node.
985 */
986 for (i = 0; i < new_node->ondisk->count; ++i) {
987 elm = &new_node->ondisk->elms[i];
988 error = btree_set_parent(new_node, elm);
989 if (error) {
990 panic("btree_split_internal: btree-fixup problem");
991 }
992 }
993
427e5fc6
MD
994 /*
995 * The cluster's root pointer may have to be updated.
996 */
997 if (made_root) {
8cd0a023
MD
998 node->cluster->ondisk->clu_btree_root = parent->node_offset;
999 hammer_modify_cluster(node->cluster);
1000 node->ondisk->parent = parent->node_offset;
1001 if (cursor->parent) {
1002 hammer_unlock(&cursor->parent->lock);
1003 hammer_rel_node(cursor->parent);
1004 }
1005 cursor->parent = parent; /* lock'd and ref'd */
427e5fc6
MD
1006 }
1007
8cd0a023
MD
1008 hammer_modify_node(node);
1009 hammer_modify_node(new_node);
1010 hammer_modify_node(parent);
1011
427e5fc6
MD
1012 /*
1013 * Ok, now adjust the cursor depending on which element the original
1014 * index was pointing at. If we are >= the split point the push node
1015 * is now in the new node.
1016 *
1017 * NOTE: If we are at the split point itself we cannot stay with the
1018 * original node because the push index will point at the right-hand
1019 * boundary, which is illegal.
8cd0a023
MD
1020 *
1021 * NOTE: The cursor's parent or parent_index must be adjusted for
1022 * the case where a new parent (new root) was created, and the case
1023 * where the cursor is now pointing at the split node.
427e5fc6
MD
1024 */
1025 if (cursor->index >= split) {
8cd0a023 1026 cursor->parent_index = parent_index + 1;
427e5fc6 1027 cursor->index -= split;
8cd0a023
MD
1028 hammer_unlock(&cursor->node->lock);
1029 hammer_rel_node(cursor->node);
1030 cursor->node = new_node; /* locked and ref'd */
1031 } else {
1032 cursor->parent_index = parent_index;
1033 hammer_unlock(&new_node->lock);
1034 hammer_rel_node(new_node);
427e5fc6 1035 }
76376933
MD
1036
1037 /*
1038 * Fixup left and right bounds
1039 */
1040 parent_elm = &parent->ondisk->elms[cursor->parent_index];
fbc6e32a
MD
1041 cursor->left_bound = &parent_elm[0].internal.base;
1042 cursor->right_bound = &parent_elm[1].internal.base;
76376933 1043
427e5fc6
MD
1044 return (0);
1045}
1046
1047/*
1048 * Same as the above, but splits a full leaf node.
1049 */
1050static
1051int
8cd0a023 1052btree_split_leaf(hammer_cursor_t cursor)
427e5fc6 1053{
8cd0a023
MD
1054 hammer_node_ondisk_t ondisk;
1055 hammer_node_t parent;
1056 hammer_node_t leaf;
1057 hammer_node_t new_leaf;
1058 hammer_btree_elm_t elm;
1059 hammer_btree_elm_t parent_elm;
427e5fc6
MD
1060 int parent_index;
1061 int made_root;
1062 int split;
1063 int error;
8cd0a023 1064 const size_t esize = sizeof(*elm);
427e5fc6
MD
1065
1066 /*
8cd0a023
MD
1067 * Calculate the split point. If the insertion point will be on
1068 * the left-hand side adjust the split point to give the right
1069 * hand side one additional node.
427e5fc6 1070 */
8cd0a023
MD
1071 leaf = cursor->node;
1072 ondisk = leaf->ondisk;
1073 split = (ondisk->count + 1) / 2;
427e5fc6
MD
1074 if (cursor->index <= split)
1075 --split;
1076 error = 0;
1077
1078 /*
1079 * If we are at the root of the tree, create a new root node with
1080 * 1 element and split normally. Avoid making major modifications
1081 * until we know the whole operation will work.
1082 */
8cd0a023
MD
1083 if (ondisk->parent == 0) {
1084 parent = hammer_alloc_btree(leaf->cluster, &error);
427e5fc6
MD
1085 if (parent == NULL)
1086 return(error);
8cd0a023
MD
1087 hammer_lock_ex(&parent->lock);
1088 ondisk = parent->ondisk;
1089 ondisk->count = 1;
1090 ondisk->parent = 0;
1091 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1092 ondisk->elms[0].base = leaf->cluster->clu_btree_beg;
1093 ondisk->elms[0].internal.subtree_type = leaf->ondisk->type;
1094 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1095 ondisk->elms[1].base = leaf->cluster->clu_btree_end;
427e5fc6 1096 made_root = 1;
8cd0a023 1097 parent_index = 0; /* insertion point in parent */
427e5fc6
MD
1098 } else {
1099 made_root = 0;
8cd0a023
MD
1100 parent = cursor->parent;
1101 parent_index = cursor->parent_index;
195c19a1 1102 KKASSERT(parent->cluster == leaf->cluster);
427e5fc6 1103 }
427e5fc6
MD
1104
1105 /*
1106 * Split leaf into new_leaf at the split point. Select a separator
1107 * value in-between the two leafs but with a bent towards the right
1108 * leaf since comparisons use an 'elm >= separator' inequality.
1109 *
1110 * L L L L L L L L
1111 *
1112 * x x P x x
1113 * s S S s
1114 * / \
1115 * L L L L L L L L
1116 */
8cd0a023 1117 new_leaf = hammer_alloc_btree(leaf->cluster, &error);
427e5fc6 1118 if (new_leaf == NULL) {
8cd0a023
MD
1119 if (made_root) {
1120 hammer_unlock(&parent->lock);
195c19a1 1121 hammer_free_btree(parent->cluster, parent->node_offset);
8cd0a023
MD
1122 hammer_rel_node(parent);
1123 }
427e5fc6
MD
1124 return(error);
1125 }
8cd0a023 1126 hammer_lock_ex(&new_leaf->lock);
427e5fc6
MD
1127
1128 /*
1129 * Create the new node. P become the left-hand boundary in the
1130 * new node. Copy the right-hand boundary as well.
1131 */
8cd0a023
MD
1132 ondisk = leaf->ondisk;
1133 elm = &ondisk->elms[split];
1134 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1135 new_leaf->ondisk->count = ondisk->count - split;
1136 new_leaf->ondisk->parent = parent->node_offset;
1137 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1138 KKASSERT(ondisk->type == new_leaf->ondisk->type);
427e5fc6
MD
1139
1140 /*
8cd0a023
MD
1141 * Cleanup the original node. Because this is a leaf node and
1142 * leaf nodes do not have a right-hand boundary, there
c0ade690
MD
1143 * aren't any special edge cases to clean up. We just fixup the
1144 * count.
427e5fc6 1145 */
c0ade690 1146 ondisk->count = split;
427e5fc6
MD
1147
1148 /*
1149 * Insert the separator into the parent, fixup the parent's
1150 * reference to the original node, and reference the new node.
1151 * The separator is P.
1152 *
1153 * Remember that base.count does not include the right-hand boundary.
1154 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1155 */
8cd0a023
MD
1156 ondisk = parent->ondisk;
1157 ondisk->elms[parent_index].internal.subtree_count = split;
1158 parent_elm = &ondisk->elms[parent_index+1];
1159 if (parent_index + 1 != ondisk->count) {
427e5fc6 1160 bcopy(parent_elm, parent_elm + 1,
8cd0a023 1161 (ondisk->count - parent_index - 1) * esize);
427e5fc6
MD
1162 }
1163 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
8cd0a023
MD
1164 parent_elm->internal.subtree_offset = new_leaf->node_offset;
1165 parent_elm->internal.subtree_count = new_leaf->ondisk->count;
7f7c1f84 1166 parent_elm->internal.subtree_type = new_leaf->ondisk->type;
76376933 1167 ++ondisk->count;
427e5fc6
MD
1168
1169 /*
1170 * The cluster's root pointer may have to be updated.
1171 */
1172 if (made_root) {
8cd0a023
MD
1173 leaf->cluster->ondisk->clu_btree_root = parent->node_offset;
1174 hammer_modify_cluster(leaf->cluster);
1175 leaf->ondisk->parent = parent->node_offset;
1176 if (cursor->parent) {
1177 hammer_unlock(&cursor->parent->lock);
1178 hammer_rel_node(cursor->parent);
1179 }
1180 cursor->parent = parent; /* lock'd and ref'd */
427e5fc6
MD
1181 }
1182
8cd0a023
MD
1183 hammer_modify_node(leaf);
1184 hammer_modify_node(new_leaf);
1185 hammer_modify_node(parent);
1186
427e5fc6
MD
1187 /*
1188 * Ok, now adjust the cursor depending on which element the original
1189 * index was pointing at. If we are >= the split point the push node
1190 * is now in the new node.
1191 *
1192 * NOTE: If we are at the split point itself we cannot stay with the
1193 * original node because the push index will point at the right-hand
1194 * boundary, which is illegal.
1195 */
1196 if (cursor->index >= split) {
8cd0a023 1197 cursor->parent_index = parent_index + 1;
427e5fc6 1198 cursor->index -= split;
8cd0a023
MD
1199 hammer_unlock(&cursor->node->lock);
1200 hammer_rel_node(cursor->node);
1201 cursor->node = new_leaf;
1202 } else {
1203 cursor->parent_index = parent_index;
1204 hammer_unlock(&new_leaf->lock);
1205 hammer_rel_node(new_leaf);
427e5fc6 1206 }
76376933
MD
1207
1208 /*
1209 * Fixup left and right bounds
1210 */
1211 parent_elm = &parent->ondisk->elms[cursor->parent_index];
fbc6e32a
MD
1212 cursor->left_bound = &parent_elm[0].internal.base;
1213 cursor->right_bound = &parent_elm[1].internal.base;
76376933 1214
427e5fc6
MD
1215 return (0);
1216}
1217
1218/*
195c19a1
MD
1219 * Attempt to remove the empty B-Tree node at (cursor->node). Returns 0
1220 * on success, EAGAIN if we could not acquire the necessary locks, or some
1221 * other error.
8cd0a023 1222 *
195c19a1
MD
1223 * On return the cursor may end up pointing at an internal node, suitable
1224 * for further iteration but not for insertion or deletion.
8cd0a023 1225 *
195c19a1 1226 * cursor->node may be an internal node or a leaf node.
8cd0a023
MD
1227 */
1228int
195c19a1 1229btree_remove(hammer_cursor_t cursor)
8cd0a023
MD
1230{
1231 hammer_node_ondisk_t ondisk;
195c19a1
MD
1232 hammer_btree_elm_t elm;
1233 hammer_node_t save;
1234 hammer_node_t node;
8cd0a023
MD
1235 hammer_node_t parent;
1236 int error;
195c19a1 1237 int i;
8cd0a023
MD
1238
1239 /*
195c19a1
MD
1240 * If we are at the root of the root cluster there is nothing to
1241 * remove, but an internal node at the root of a cluster is not
1242 * allowed to be empty so convert it to a leaf node.
8cd0a023 1243 */
195c19a1
MD
1244 if (cursor->parent == NULL) {
1245 ondisk = cursor->node->ondisk;
1246 KKASSERT(ondisk->parent == 0);
1247 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1248 ondisk->count = 0;
1249 hammer_modify_node(cursor->node);
1250 kprintf("EMPTY ROOT OF ROOT CLUSTER -> LEAF\n");
8cd0a023
MD
1251 return(0);
1252 }
1253
1254 /*
195c19a1
MD
1255 * Retain a reference to cursor->node, ex-lock again (2 locks now)
1256 * so we do not lose the lock when we cursor around.
8cd0a023 1257 */
195c19a1
MD
1258 save = cursor->node;
1259 hammer_ref_node(save);
1260 hammer_lock_ex(&save->lock);
8cd0a023
MD
1261
1262 /*
195c19a1
MD
1263 * We need to be able to lock the parent of the parent. Do this
1264 * non-blocking and return EAGAIN if the lock cannot be acquired.
1265 * non-blocking is required in order to avoid a deadlock.
1266 *
1267 * After we cursor up, parent is moved to node and the new parent
1268 * is the parent of the parent.
8cd0a023 1269 */
195c19a1
MD
1270 error = hammer_cursor_up(cursor, 1);
1271 if (error) {
1272 kprintf("BTREE_REMOVE: Cannot lock parent\n");
1273 hammer_unlock(&save->lock);
1274 hammer_rel_node(save);
1275 return(error);
8cd0a023 1276 }
195c19a1
MD
1277
1278 /*
1279 * At this point we want to remove the element at (node, index),
1280 * which is now the (original) parent pointing to the saved node.
1281 * Removing the element allows us to then free the node it was
1282 * pointing to.
1283 *
1284 * However, an internal node is not allowed to have 0 elements, so
1285 * if the count would drop to 0 we have to recurse. It is possible
1286 * for the recursion to fail.
1287 *
1288 * NOTE: The cursor is in an indeterminant position after recursing,
1289 * but will still be suitable for an iteration.
1290 */
1291 node = cursor->node;
1292 KKASSERT(node->ondisk->count > 0);
1293 if (node->ondisk->count == 1) {
1294 error = btree_remove(cursor);
1295 if (error == 0) {
1296 kprintf("BTREE_REMOVE: Successful!\n");
1297 hammer_flush_node(save);
1298 hammer_free_btree(save->cluster, save->node_offset);
1299 } else {
1300 kprintf("BTREE_REMOVE: Recursion failed %d\n", error);
8cd0a023 1301 }
195c19a1
MD
1302 hammer_unlock(&save->lock);
1303 hammer_rel_node(save);
1304 return(error);
8cd0a023 1305 }
195c19a1
MD
1306
1307 /*
1308 * Remove the element at (node, index) and adjust the parent's
1309 * subtree_count.
fbc6e32a
MD
1310 *
1311 * NOTE! Removing the element will cause the left-hand boundary
1312 * to no longer match the child node:
1313 *
1314 * LxMxR (left, middle, right). If the first 'x' element is
1315 * removed then you get LxR and L no longer matches the left-hand
1316 * boundary of the sub-node pointed to by what used to be the
1317 * second x. This case is handled in btree_search().
195c19a1 1318 */
fbc6e32a 1319#if 0
195c19a1 1320 kprintf("BTREE_REMOVE: Removing element %d\n", cursor->index);
fbc6e32a 1321#endif
195c19a1
MD
1322 ondisk = node->ondisk;
1323 i = cursor->index;
1324 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
1325 (ondisk->count - i) * sizeof(ondisk->elms[0]));
1326 --ondisk->count;
1327 hammer_modify_node(cursor->node);
1328
1329 /*
1330 * Adjust the parent-parent's (now parent) reference to the parent
1331 * (now node).
1332 */
1333 if ((parent = cursor->parent) != NULL) {
1334 elm = &parent->ondisk->elms[cursor->parent_index];
1335 if (elm->internal.subtree_count != ondisk->count) {
1336 elm->internal.subtree_count = ondisk->count;
1337 hammer_modify_node(parent);
1338 }
1339 if (elm->subtree_type != HAMMER_BTREE_TYPE_CLUSTER &&
1340 elm->subtree_type != ondisk->type) {
1341 elm->subtree_type = ondisk->type;
1342 hammer_modify_node(parent);
8cd0a023
MD
1343 }
1344 }
195c19a1
MD
1345
1346 /*
1347 * Free the saved node.
1348 */
1349 hammer_flush_node(save);
1350 hammer_free_btree(save->cluster, save->node_offset);
1351 hammer_unlock(&save->lock);
1352 hammer_rel_node(save);
1353 return(0);
8cd0a023
MD
1354}
1355
7f7c1f84
MD
1356/*
1357 * The child represented by the element in internal node node needs
1358 * to have its parent pointer adjusted.
1359 */
1360static
1361int
1362btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm)
1363{
1364 hammer_volume_t volume;
1365 hammer_cluster_t cluster;
1366 hammer_node_t child;
1367 int error;
1368
1369 error = 0;
1370
1371 switch(elm->internal.subtree_type) {
1372 case HAMMER_BTREE_TYPE_LEAF:
1373 case HAMMER_BTREE_TYPE_INTERNAL:
1374 child = hammer_get_node(node->cluster,
1375 elm->internal.subtree_offset, &error);
1376 if (error == 0) {
1377 hammer_lock_ex(&child->lock);
1378 child->ondisk->parent = node->node_offset;
1379 hammer_modify_node(child);
1380 hammer_unlock(&child->lock);
1381 hammer_rel_node(child);
1382 }
1383 break;
1384 case HAMMER_BTREE_TYPE_CLUSTER:
1385 volume = hammer_get_volume(node->cluster->volume->hmp,
1386 elm->internal.subtree_volno, &error);
1387 if (error)
1388 break;
1389 cluster = hammer_get_cluster(volume,
1390 elm->internal.subtree_cluid,
1391 &error, 0);
1392 hammer_rel_volume(volume, 0);
1393 if (error)
1394 break;
1395 hammer_lock_ex(&cluster->io.lock);
1396 cluster->ondisk->clu_btree_parent_offset = node->node_offset;
1397 hammer_unlock(&cluster->io.lock);
1398 KKASSERT(cluster->ondisk->clu_btree_parent_clu_no ==
1399 node->cluster->clu_no);
1400 KKASSERT(cluster->ondisk->clu_btree_parent_vol_no ==
1401 node->cluster->volume->vol_no);
1402 hammer_modify_cluster(cluster);
1403 hammer_rel_cluster(cluster, 0);
1404 break;
1405 default:
1406 hammer_print_btree_elm(elm, HAMMER_BTREE_TYPE_INTERNAL, -1);
1407 panic("btree_set_parent: bad subtree_type");
1408 break; /* NOT REACHED */
1409 }
1410 return(error);
1411}
1412
8cd0a023
MD
1413#if 0
1414
1415/*
1416 * This routine is called on the internal node (node) prior to recursing down
1417 * through (node, index) when the node referenced by (node, index) MIGHT
1418 * have too few elements for the caller to perform a deletion.
427e5fc6
MD
1419 *
1420 * cursor->index is invalid on return because the separators may have gotten
1421 * adjusted, the caller must rescan the node's elements. The caller may set
1422 * cursor->index to -1 if it wants us to do a general rebalancing.
8cd0a023
MD
1423 *
1424 * This routine rebalances the children of the (node), collapsing children
1425 * together if possible. On return each child will have at least L/2-1
1426 * elements unless the node only has one child.
427e5fc6
MD
1427 *
1428 * NOTE: Because we do not update the parent's parent in the split code,
1429 * the subtree_count used by the caller may be incorrect. We correct it
1430 * here. Also note that we cannot change the depth of the tree's leaf
1431 * nodes here (see btree_collapse()).
1432 *
8cd0a023 1433 * NOTE: We make no attempt to rebalance inter-cluster elements.
427e5fc6
MD
1434 */
1435static
1436int
8cd0a023 1437btree_rebalance(hammer_cursor_t cursor)
427e5fc6 1438{
8cd0a023
MD
1439 hammer_node_ondisk_t ondisk;
1440 hammer_node_t node;
1441 hammer_node_t children[HAMMER_BTREE_INT_ELMS];
1442 hammer_node_t child;
1443 hammer_btree_elm_t elm;
1444 hammer_btree_elm_t elms;
427e5fc6
MD
1445 int i, j, n, nelms, goal;
1446 int maxelms, halfelms;
1447 int error;
1448
1449 /*
8cd0a023
MD
1450 * If the elm being recursed through is an inter-cluster reference,
1451 * don't worry about it.
427e5fc6 1452 */
8cd0a023
MD
1453 ondisk = cursor->node->ondisk;
1454 elm = &ondisk->elms[cursor->index];
1455 if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
1456 return(0);
1457
1458 KKASSERT(elm->internal.subtree_offset != 0);
427e5fc6
MD
1459 error = 0;
1460
1461 /*
1462 * Load the children of node and do any necessary corrections
8cd0a023 1463 * to subtree_count. subtree_count may be too low due to the
427e5fc6 1464 * way insertions split nodes. Get a count of the total number
8cd0a023 1465 * of actual elements held by our children.
427e5fc6
MD
1466 */
1467 error = 0;
1468
1469 for (i = n = 0; i < node->base.count; ++i) {
1470 struct hammer_btree_internal_elm *elm;
1471
1472 elm = &node->elms[i];
1473 children[i] = NULL;
1474 child_buffer[i] = NULL; /* must be preinitialized for bread */
1475 if (elm->subtree_offset == 0)
1476 continue;
1477 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1478 HAMMER_FSBUF_BTREE, &error,
8cd0a023 1479 &child_buffer[i], XXX);
427e5fc6
MD
1480 children[i] = child;
1481 if (child == NULL)
1482 continue;
8cd0a023 1483 XXX
427e5fc6
MD
1484 KKASSERT(node->base.subtype == child->base.type);
1485
1486 /*
1487 * Accumulate n for a good child, update the node's count
1488 * if it was wrong.
1489 */
1490 if (node->elms[i].subtree_count != child->base.count) {
1491 node->elms[i].subtree_count = child->base.count;
1492 }
1493 n += node->elms[i].subtree_count;
1494 }
1495 if (error)
1496 goto failed;
1497
1498 /*
1499 * Collect all the children's elements together
1500 */
1501 nelms = n;
1502 elms = kmalloc(sizeof(*elms) * (nelms + 1), M_HAMMER, M_WAITOK|M_ZERO);
1503 for (i = n = 0; i < node->base.count; ++i) {
1504 child = children[i];
1505 for (j = 0; j < child->base.count; ++j) {
1506 elms[n].owner = child;
8cd0a023 1507 if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF)
427e5fc6
MD
1508 elms[n].u.leaf = child->leaf.elms[j];
1509 else
1510 elms[n].u.internal = child->internal.elms[j];
1511 ++n;
1512 }
1513 }
1514 KKASSERT(n == nelms);
1515
1516 /*
1517 * Store a boundary in the elms array to ease the code below. This
1518 * is only used if the children are internal nodes.
1519 */
1520 elms[n].u.internal = node->elms[i];
1521
1522 /*
1523 * Calculate the number of elements each child should have (goal) by
1524 * reducing the number of elements until we achieve at least
1525 * halfelms - 1 per child, unless we are a degenerate case.
1526 */
1527 maxelms = btree_max_elements(node->base.subtype);
1528 halfelms = maxelms / 2;
1529
1530 goal = halfelms - 1;
1531 while (i && n / i < goal)
1532 --i;
1533
1534 /*
1535 * Now rebalance using the specified goal
1536 */
1537 for (i = n = 0; i < node->base.count; ++i) {
1538 struct hammer_buffer *subchild_buffer = NULL;
1539 struct hammer_btree_internal_node *subchild;
1540
1541 child = children[i];
1542 for (j = 0; j < goal && n < nelms; ++j) {
8cd0a023 1543 if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
427e5fc6
MD
1544 child->leaf.elms[j] = elms[n].u.leaf;
1545 } else {
1546 child->internal.elms[j] = elms[n].u.internal;
1547 }
1548
1549 /*
1550 * If the element's parent has changed we have to
1551 * update the parent pointer. This is somewhat
1552 * expensive.
1553 */
1554 if (elms[n].owner != child &&
8cd0a023 1555 node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) {
427e5fc6
MD
1556 subchild = hammer_bread(cursor->cluster,
1557 elms[n].u.internal.subtree_offset,
1558 HAMMER_FSBUF_BTREE,
1559 &error,
8cd0a023 1560 &subchild_buffer, XXX);
427e5fc6
MD
1561 if (subchild) {
1562 subchild->base.parent =
1563 hammer_bclu_offset(child_buffer[i],
1564 child);
1565 hammer_modify_buffer(subchild_buffer);
1566 }
1567 /* XXX error */
1568 }
1569 ++n;
1570 }
1571 /*
1572 * Set right boundary if the children are internal nodes.
1573 */
8cd0a023 1574 if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL)
427e5fc6
MD
1575 child->internal.elms[j] = elms[n].u.internal;
1576 child->base.count = j;
1577 hammer_modify_buffer(child_buffer[i]);
1578 if (subchild_buffer)
66325755 1579 hammer_put_buffer(subchild_buffer, 0);
427e5fc6
MD
1580
1581 /*
1582 * If we have run out of elements, break out
1583 */
1584 if (n == nelms)
1585 break;
1586 }
1587
1588 /*
1589 * Physically destroy any left-over children. These children's
1590 * elements have been packed into prior children. The node's
1591 * right hand boundary and count gets shifted to index i.
1592 *
1593 * The subtree count in the node's parent MUST be updated because
1594 * we are removing elements. The subtree_count field is allowed to
1595 * be too small, but not too large!
1596 */
1597 if (i != node->base.count) {
1598 n = i;
1599 node->elms[n] = node->elms[node->base.count];
1600 while (i < node->base.count) {
1601 hammer_free_btree_ptr(child_buffer[i], children[i]);
66325755 1602 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1603 ++i;
1604 }
1605 node->base.count = n;
1606 if (cursor->parent) {
1607 cursor->parent->elms[cursor->parent_index].subtree_count = n;
1608 hammer_modify_buffer(cursor->parent_buffer);
1609 }
1610 }
1611
1612 kfree(elms, M_HAMMER);
1613failed:
1614 hammer_modify_buffer(cursor->node_buffer);
1615 for (i = 0; i < node->base.count; ++i) {
1616 if (child_buffer[i])
66325755 1617 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1618 }
1619 return (error);
1620}
1621
1622/*
1623 * This routine is only called if the cursor is at the root node and the
1624 * root node is an internal node. We attempt to collapse the root node
1625 * by replacing it with all of its children, reducing tree depth by one.
1626 *
1627 * This is the only way to reduce tree depth in a HAMMER filesystem.
1628 * Note that all leaf nodes are at the same depth.
1629 *
1630 * This is a fairly expensive operation because we not only have to load
1631 * the root's children, we also have to scan each child and adjust the
1632 * parent offset for each element in each child. Nasty all around.
1633 */
1634static
1635int
8cd0a023 1636btree_collapse(hammer_cursor_t cursor)
427e5fc6 1637{
8cd0a023
MD
1638 hammer_btree_node_ondisk_t root, child;
1639 hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS];
427e5fc6
MD
1640 struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS];
1641 int count;
1642 int i, j, n;
1643 int root_modified;
1644 int error;
1645 int32_t root_offset;
1646 u_int8_t subsubtype;
1647
1648 root = cursor->node;
1649 count = root->base.count;
1650 root_offset = hammer_bclu_offset(cursor->node_buffer, root);
1651
1652 /*
1653 * Sum up the number of children each element has. This value is
1654 * only approximate due to the way the insertion node works. It
1655 * may be too small but it will never be too large.
1656 *
1657 * Quickly terminate the collapse if the elements have too many
1658 * children.
1659 */
1660 KKASSERT(root->base.parent == 0); /* must be root node */
8cd0a023 1661 KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL);
427e5fc6
MD
1662 KKASSERT(count <= HAMMER_BTREE_INT_ELMS);
1663
1664 for (i = n = 0; i < count; ++i) {
1665 n += root->internal.elms[i].subtree_count;
1666 }
1667 if (n > btree_max_elements(root->base.subtype))
1668 return(0);
1669
1670 /*
1671 * Iterate through the elements again and correct the subtree_count.
1672 * Terminate the collapse if we wind up with too many.
1673 */
1674 error = 0;
1675 root_modified = 0;
1676
1677 for (i = n = 0; i < count; ++i) {
1678 struct hammer_btree_internal_elm *elm;
1679
1680 elm = &root->internal.elms[i];
1681 child_buffer[i] = NULL;
1682 children[i] = NULL;
1683 if (elm->subtree_offset == 0)
1684 continue;
1685 child = hammer_bread(cursor->cluster, elm->subtree_offset,
1686 HAMMER_FSBUF_BTREE, &error,
8cd0a023 1687 &child_buffer[i], XXX);
427e5fc6
MD
1688 children[i] = child;
1689 if (child == NULL)
1690 continue;
1691 KKASSERT(root->base.subtype == child->base.type);
1692
1693 /*
1694 * Accumulate n for a good child, update the root's count
1695 * if it was wrong.
1696 */
1697 if (root->internal.elms[i].subtree_count != child->base.count) {
1698 root->internal.elms[i].subtree_count = child->base.count;
1699 root_modified = 1;
1700 }
1701 n += root->internal.elms[i].subtree_count;
1702 }
1703 if (error || n > btree_max_elements(root->base.subtype))
1704 goto done;
1705
1706 /*
1707 * Ok, we can collapse the root. If the root's children are leafs
1708 * the collapse is really simple. If they are internal nodes the
1709 * collapse is not so simple because we have to fixup the parent
1710 * pointers for the root's children's children.
1711 *
1712 * When collapsing an internal node the far left and far right
1713 * element's boundaries should match the root's left and right
1714 * boundaries.
1715 */
8cd0a023 1716 if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) {
427e5fc6
MD
1717 for (i = n = 0; i < count; ++i) {
1718 child = children[i];
1719 for (j = 0; j < child->base.count; ++j) {
1720 root->leaf.elms[n] = child->leaf.elms[j];
1721 ++n;
1722 }
1723 }
1724 root->base.type = root->base.subtype;
1725 root->base.subtype = 0;
1726 root->base.count = n;
1727 root->leaf.link_left = 0;
1728 root->leaf.link_right = 0;
1729 } else {
1730 struct hammer_btree_internal_elm *elm;
1731 struct hammer_btree_internal_node *subchild;
1732 struct hammer_buffer *subchild_buffer = NULL;
1733
1734 if (count) {
1735 child = children[0];
1736 subsubtype = child->base.subtype;
1737 KKASSERT(child->base.count > 0);
1738 KKASSERT(root->internal.elms[0].base.key ==
1739 child->internal.elms[0].base.key);
1740 child = children[count-1];
1741 KKASSERT(child->base.count > 0);
1742 KKASSERT(root->internal.elms[count].base.key ==
1743 child->internal.elms[child->base.count].base.key);
1744 } else {
1745 subsubtype = 0;
1746 }
1747 for (i = n = 0; i < count; ++i) {
1748 child = children[i];
1749 KKASSERT(child->base.subtype == subsubtype);
1750 for (j = 0; j < child->base.count; ++j) {
1751 elm = &child->internal.elms[j];
1752
1753 root->internal.elms[n] = *elm;
1754 subchild = hammer_bread(cursor->cluster,
1755 elm->subtree_offset,
1756 HAMMER_FSBUF_BTREE,
1757 &error,
8cd0a023
MD
1758 &subchild_buffer,
1759 XXX);
427e5fc6
MD
1760 if (subchild) {
1761 subchild->base.parent = root_offset;
1762 hammer_modify_buffer(subchild_buffer);
1763 }
1764 ++n;
1765 }
1766 /* make sure the right boundary is correct */
1767 /* (this gets overwritten when the loop continues) */
1768 /* XXX generate a new separator? */
1769 root->internal.elms[n] = child->internal.elms[j];
1770 }
8cd0a023 1771 root->base.type = HAMMER_BTREE_TYPE_INTERNAL;
427e5fc6
MD
1772 root->base.subtype = subsubtype;
1773 if (subchild_buffer)
66325755 1774 hammer_put_buffer(subchild_buffer, 0);
427e5fc6
MD
1775 }
1776 root_modified = 1;
1777
1778 /*
1779 * Cleanup
1780 */
1781done:
1782 if (root_modified)
1783 hammer_modify_buffer(cursor->node_buffer);
1784 for (i = 0; i < count; ++i) {
1785 if (child_buffer[i])
66325755 1786 hammer_put_buffer(child_buffer[i], 0);
427e5fc6
MD
1787 }
1788 return(error);
1789}
1790
8cd0a023
MD
1791#endif
1792
1793/************************************************************************
1794 * MISCELLANIOUS SUPPORT *
1795 ************************************************************************/
1796
1797/*
1798 * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
1799 *
1800 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
1801 *
1802 * Note that key1 and key2 are treated differently. key1 is allowed to
1803 * wildcard some of its fields by setting them to 0, while key2 is expected
1804 * to be in an on-disk form (no wildcards).
1805 */
1806int
1807hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
1808{
1809#if 0
1810 kprintf("compare obj_id %016llx %016llx\n",
1811 key1->obj_id, key2->obj_id);
1812 kprintf("compare rec_type %04x %04x\n",
1813 key1->rec_type, key2->rec_type);
1814 kprintf("compare key %016llx %016llx\n",
1815 key1->key, key2->key);
1816#endif
1817
1818 /*
1819 * A key1->obj_id of 0 matches any object id
1820 */
1821 if (key1->obj_id) {
1822 if (key1->obj_id < key2->obj_id)
1823 return(-4);
1824 if (key1->obj_id > key2->obj_id)
1825 return(4);
1826 }
1827
1828 /*
1829 * A key1->rec_type of 0 matches any record type.
1830 */
1831 if (key1->rec_type) {
1832 if (key1->rec_type < key2->rec_type)
1833 return(-3);
1834 if (key1->rec_type > key2->rec_type)
1835 return(3);
1836 }
1837
1838 /*
1839 * There is no special case for key. 0 means 0.
1840 */
1841 if (key1->key < key2->key)
1842 return(-2);
1843 if (key1->key > key2->key)
1844 return(2);
1845
1846 /*
1847 * This test has a number of special cases. create_tid in key1 is
1848 * the as-of transction id, and delete_tid in key1 is NOT USED.
1849 *
1850 * A key1->create_tid of 0 matches any record regardles of when
1851 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1852 * used to search for the most current state of the object.
1853 *
1854 * key2->create_tid is a HAMMER record and will never be
1855 * 0. key2->delete_tid is the deletion transaction id or 0 if
1856 * the record has not yet been deleted.
1857 */
1858 if (key1->create_tid) {
1859 if (key1->create_tid < key2->create_tid)
1860 return(-1);
1861 if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
1862 return(1);
1863 }
1864
1865 return(0);
1866}
1867
c0ade690
MD
1868/*
1869 * Compare the element against the cursor's beginning and ending keys
1870 */
1871int
1872hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2)
1873{
1874 /*
1875 * A cursor->key_beg.obj_id of 0 matches any object id
1876 */
1877 if (cursor->key_beg.obj_id) {
1878 if (cursor->key_end.obj_id < key2->obj_id)
1879 return(-4);
1880 if (cursor->key_beg.obj_id > key2->obj_id)
1881 return(4);
1882 }
1883
1884 /*
1885 * A cursor->key_beg.rec_type of 0 matches any record type.
1886 */
1887 if (cursor->key_beg.rec_type) {
1888 if (cursor->key_end.rec_type < key2->rec_type)
1889 return(-3);
1890 if (cursor->key_beg.rec_type > key2->rec_type)
1891 return(3);
1892 }
1893
1894 /*
1895 * There is no special case for key. 0 means 0.
1896 */
1897 if (cursor->key_end.key < key2->key)
1898 return(-2);
1899 if (cursor->key_beg.key > key2->key)
1900 return(2);
1901
1902 /*
1903 * This test has a number of special cases. create_tid in key1 is
1904 * the as-of transction id, and delete_tid in key1 is NOT USED.
1905 *
1906 * A key1->create_tid of 0 matches any record regardles of when
1907 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
1908 * used to search for the most current state of the object.
1909 *
1910 * key2->create_tid is a HAMMER record and will never be
1911 * 0. key2->delete_tid is the deletion transaction id or 0 if
1912 * the record has not yet been deleted.
1913 *
1914 * NOTE: only key_beg.create_tid is used for create_tid, we can only
1915 * do as-of scans at the moment.
1916 */
1917 if (cursor->key_beg.create_tid) {
1918 if (cursor->key_beg.create_tid < key2->create_tid)
1919 return(-1);
1920 if (key2->delete_tid && cursor->key_beg.create_tid >= key2->delete_tid)
1921 return(1);
1922 }
1923
1924 return(0);
1925}
1926
8cd0a023
MD
1927/*
1928 * Create a separator half way inbetween key1 and key2. For fields just
1929 * one unit apart, the separator will match key2.
1930 *
1931 * The handling of delete_tid is a little confusing. It is only possible
1932 * to have one record in the B-Tree where all fields match except delete_tid.
1933 * This means, worse case, two adjacent elements may have a create_tid that
1934 * is one-apart and cause the separator to choose the right-hand element's
1935 * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x).
1936 *
1937 * So all we have to do is set delete_tid to the right-hand element to
1938 * guarentee that the separator is properly between the two elements.
1939 */
1940#define MAKE_SEPARATOR(key1, key2, dest, field) \
1941 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
1942
1943static void
1944hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
1945 hammer_base_elm_t dest)
1946{
1947 bzero(dest, sizeof(*dest));
1948 MAKE_SEPARATOR(key1, key2, dest, obj_id);
1949 MAKE_SEPARATOR(key1, key2, dest, rec_type);
1950 MAKE_SEPARATOR(key1, key2, dest, key);
1951 MAKE_SEPARATOR(key1, key2, dest, create_tid);
1952 dest->delete_tid = key2->delete_tid;
1953}
1954
1955#undef MAKE_SEPARATOR
1956
1957/*
1958 * Return whether a generic internal or leaf node is full
1959 */
1960static int
1961btree_node_is_full(hammer_node_ondisk_t node)
1962{
1963 switch(node->type) {
1964 case HAMMER_BTREE_TYPE_INTERNAL:
1965 if (node->count == HAMMER_BTREE_INT_ELMS)
1966 return(1);
1967 break;
1968 case HAMMER_BTREE_TYPE_LEAF:
1969 if (node->count == HAMMER_BTREE_LEAF_ELMS)
1970 return(1);
1971 break;
1972 default:
1973 panic("illegal btree subtype");
1974 }
1975 return(0);
1976}
1977
1978#if 0
1979static int
1980btree_max_elements(u_int8_t type)
1981{
1982 if (type == HAMMER_BTREE_TYPE_LEAF)
1983 return(HAMMER_BTREE_LEAF_ELMS);
1984 if (type == HAMMER_BTREE_TYPE_INTERNAL)
1985 return(HAMMER_BTREE_INT_ELMS);
1986 panic("btree_max_elements: bad type %d\n", type);
1987}
1988#endif
1989
c0ade690
MD
1990void
1991hammer_print_btree_node(hammer_node_ondisk_t ondisk)
1992{
1993 hammer_btree_elm_t elm;
1994 int i;
1995
1996 kprintf("node %p count=%d parent=%d type=%c\n",
1997 ondisk, ondisk->count, ondisk->parent, ondisk->type);
1998
1999 /*
2000 * Dump both boundary elements if an internal node
2001 */
2002 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2003 for (i = 0; i <= ondisk->count; ++i) {
2004 elm = &ondisk->elms[i];
2005 hammer_print_btree_elm(elm, ondisk->type, i);
2006 }
2007 } else {
2008 for (i = 0; i < ondisk->count; ++i) {
2009 elm = &ondisk->elms[i];
2010 hammer_print_btree_elm(elm, ondisk->type, i);
2011 }
2012 }
2013}
2014
2015void
2016hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
2017{
2018 kprintf(" %2d", i);
2019 kprintf("\tobjid = %016llx\n", elm->base.obj_id);
2020 kprintf("\tkey = %016llx\n", elm->base.key);
2021 kprintf("\tcreate_tid = %016llx\n", elm->base.create_tid);
2022 kprintf("\tdelete_tid = %016llx\n", elm->base.delete_tid);
2023 kprintf("\trec_type = %04x\n", elm->base.rec_type);
2024 kprintf("\tobj_type = %02x\n", elm->base.obj_type);
2025 kprintf("\tsubtree_type = %02x\n", elm->subtree_type);
2026
2027 if (type == HAMMER_BTREE_TYPE_INTERNAL) {
2028 if (elm->internal.rec_offset) {
2029 kprintf("\tcluster_rec = %08x\n",
2030 elm->internal.rec_offset);
2031 kprintf("\tcluster_id = %08x\n",
2032 elm->internal.subtree_cluid);
2033 kprintf("\tvolno = %08x\n",
2034 elm->internal.subtree_volno);
2035 } else {
2036 kprintf("\tsubtree_off = %08x\n",
2037 elm->internal.subtree_offset);
2038 }
2039 kprintf("\tsubtree_count= %d\n", elm->internal.subtree_count);
2040 } else {
2041 kprintf("\trec_offset = %08x\n", elm->leaf.rec_offset);
2042 kprintf("\tdata_offset = %08x\n", elm->leaf.data_offset);
2043 kprintf("\tdata_len = %08x\n", elm->leaf.data_len);
2044 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc);
2045 }
2046}