[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER B-Tree index
 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leafs
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * with no valid sub-tree pointer for the right-most element.
 *
 * A B-Tree internal node looks like this:
 *
 *	B N N N N N N B   <-- boundary and internal elements
 *	 S S S S S S S    <-- subtree pointers
 *
 * A B-Tree leaf node basically looks like this:
 *
 *	L L L L L L L L   <-- leaf elements
 *
 * The radix for an internal node is 1 less than a leaf but we get a
 * number of significant benefits for our troubles.
 * The left-hand boundary (B on the left) is integrated into the first
 * element so it doesn't require 2 elements to accommodate boundaries.
 *
 * The big benefit to using a B-Tree containing boundary information
 * is that it is possible to cache pointers into the middle of the tree
 * and not have to start searches, insertions, OR deletions at the root
 * node.  In particular, searches are able to progress in a definitive
 * direction from any point in the tree without revisiting nodes.  This
 * greatly improves the efficiency of many operations, most especially
 * record appends.
 *
 * B-Trees also make the stacking of trees fairly straightforward.
 *
 * INSERTIONS:  A search performed with the intention of doing
 * an insert will guarantee that the terminal leaf node is not full by
 * splitting full nodes.  Splits occur top-down during the dive down the
 * B-Tree.
 *
 * DELETIONS: A deletion makes no attempt to proactively balance the
 * tree and will recursively remove nodes that become empty.  If a
 * deadlock occurs a deletion may not be able to remove an empty leaf.
 * Deletions never allow internal nodes to become empty (that would blow
 * up the boundaries).
 */
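/*
 * Illustrative sketch only (added for exposition, not part of HAMMER's
 * on-disk format): a toy internal node with the extra boundary elements
 * described above.  With 'count' subtrees the node stores keys key[0]
 * through key[count], where key[0] is the left boundary, key[count] is
 * the right boundary, and subtree[i] covers keys in [key[i], key[i+1]).
 * All names and types here are hypothetical.
 */
static __inline void *
toy_btree_descend(int *key, void **subtree, int count, int search_key)
{
	int i;

	/*
	 * A search key outside [key[0], key[count]) is not represented
	 * by this node at all, which is what lets cached mid-tree
	 * pointers decide definitively whether to go up or down.
	 */
	if (search_key < key[0] || search_key >= key[count])
		return(NULL);
	for (i = 1; i <= count; ++i) {
		if (search_key < key[i])
			return(subtree[i-1]);
	}
	return(NULL);	/* not reached given the range check above */
}
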
#include "hammer.h"

static int btree_search(hammer_cursor_t cursor, int flags);
static int btree_split_internal(hammer_cursor_t cursor);
static int btree_split_leaf(hammer_cursor_t cursor);
static int btree_remove(hammer_cursor_t cursor, int *ndelete);
static __inline int btree_node_is_full(hammer_node_ondisk_t node);
static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,
			hammer_tid_t mirror_tid);
static void hammer_make_separator(hammer_base_elm_t key1,
			hammer_base_elm_t key2, hammer_base_elm_t dest);
static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);
static __inline void hammer_debug_btree_elm(hammer_cursor_t cursor,
			hammer_btree_elm_t elm, const char *s, int res);
static __inline void hammer_debug_btree_parent(hammer_cursor_t cursor,
			const char *s);

98/*
8cd0a023
MD
99 * Iterate records after a search. The cursor is iterated forwards past
100 * the current record until a record matching the key-range requirements
101 * is found. ENOENT is returned if the iteration goes past the ending
745703c7 102 * key.
66325755 103 *
d26d0ae9
MD
104 * The iteration is inclusive of key_beg and can be inclusive or exclusive
105 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
66325755 106 *
eaeff70d 107 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
9582c7da 108 * may be modified by B-Tree functions.
d5530d22 109 *
8cd0a023 110 * cursor->key_beg may or may not be modified by this function during
d26d0ae9
MD
111 * the iteration. XXX future - in case of an inverted lock we may have
112 * to reinitiate the lookup and set key_beg to properly pick up where we
113 * left off.
6a37e7e4 114 *
6dc17446
MD
115 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor
116 * was reverse indexed due to being moved to a parent while unlocked,
117 * and something else might have inserted an element outside the iteration
118 * range. When this case occurs the iterator just keeps iterating until
119 * it gets back into the iteration range (instead of asserting).
120 *
6a37e7e4 121 * NOTE! EDEADLK *CANNOT* be returned by this procedure.
66325755
MD
122 */
123int
8cd0a023 124hammer_btree_iterate(hammer_cursor_t cursor)
66325755 125{
8cd0a023
MD
126 hammer_node_ondisk_t node;
127 hammer_btree_elm_t elm;
3e583440 128 hammer_mount_t hmp;
1d4077f7 129 int error = 0;
66325755
MD
130 int r;
131 int s;
132
133 /*
8cd0a023 134 * Skip past the current record
66325755 135 */
3e583440 136 hmp = cursor->trans->hmp;
8cd0a023 137 node = cursor->node->ondisk;
a89aec1b
MD
138 if (node == NULL)
139 return(ENOENT);
745703c7 140 if (cursor->index < node->count &&
c0ade690 141 (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
66325755 142 ++cursor->index;
c0ade690 143 }
66325755 144
3e583440
MD
145 /*
146 * HAMMER can wind up being cpu-bound.
147 */
148 if (++hmp->check_yield > hammer_yield_check) {
149 hmp->check_yield = 0;
150 lwkt_user_yield();
151 }
152
153
8cd0a023
MD
154 /*
155 * Loop until an element is found or we are done.
156 */
66325755
MD
157 for (;;) {
158 /*
8cd0a023
MD
159 * We iterate up the tree and then index over one element
160 * while we are at the last element in the current node.
161 *
47197d71 162 * If we are at the root of the filesystem, cursor_up
8cd0a023
MD
163 * returns ENOENT.
164 *
66325755
MD
165 * XXX this could be optimized by storing the information in
166 * the parent reference.
195c19a1
MD
167 *
168 * XXX we can lose the node lock temporarily, this could mess
169 * up our scan.
66325755 170 */
47637bff 171 ++hammer_stats_btree_iterations;
3e583440 172 hammer_flusher_clean_loose_ios(hmp);
77fec802 173
8cd0a023 174 if (cursor->index == node->count) {
a84a197d 175 if (hammer_debug_btree) {
35a5249b
TK
176 hkprintf("BRACKETU %016jx[%d] -> %016jx[%d] td=%p\n",
177 (intmax_t)cursor->node->node_offset,
a84a197d 178 cursor->index,
35a5249b 179 (intmax_t)(cursor->parent ? cursor->parent->node_offset : -1),
a84a197d
MD
180 cursor->parent_index,
181 curthread);
182 }
653fa4cd
TK
183 KKASSERT(cursor->parent == NULL ||
184 cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
6a37e7e4 185 error = hammer_cursor_up(cursor);
8cd0a023
MD
186 if (error)
187 break;
46fe7ae1 188 /* reload stale pointer */
8cd0a023
MD
189 node = cursor->node->ondisk;
190 KKASSERT(cursor->index != node->count);
2f85fa4d
MD
191
192 /*
193 * If we are reblocking we want to return internal
1775b6a0
MD
194 * nodes. Note that the internal node will be
195 * returned multiple times, on each upward recursion
196 * from its children. The caller selects which
197 * revisit it cares about (usually first or last only).
2f85fa4d
MD
198 */
199 if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
200 cursor->flags |= HAMMER_CURSOR_ATEDISK;
201 return(0);
202 }
8cd0a023
MD
203 ++cursor->index;
204 continue;
66325755
MD
205 }
206
207 /*
d26d0ae9
MD
208 * Check internal or leaf element. Determine if the record
209 * at the cursor has gone beyond the end of our range.
66325755 210 *
47197d71 211 * We recurse down through internal nodes.
66325755 212 */
8cd0a023
MD
213 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
214 elm = &node->elms[cursor->index];
c82af904 215
d26d0ae9
MD
216 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
217 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
b3deaf57 218 if (hammer_debug_btree) {
fe4fc0c0
TK
219 hammer_debug_btree_elm(cursor, elm, "BRACKETL", r);
220 hammer_debug_btree_elm(cursor, elm + 1, "BRACKETR", s);
b3deaf57
MD
221 }
222
d26d0ae9
MD
223 if (r < 0) {
224 error = ENOENT;
225 break;
66325755 226 }
fe7678ee
MD
227 if (r == 0 && (cursor->flags &
228 HAMMER_CURSOR_END_INCLUSIVE) == 0) {
d26d0ae9 229 error = ENOENT;
8cd0a023 230 break;
d26d0ae9 231 }
6a37e7e4
MD
232
233 /*
f36a9737 234 * Better not be zero
6a37e7e4 235 */
f36a9737
MD
236 KKASSERT(elm->internal.subtree_offset != 0);
237
6dc17446
MD
238 if (s <= 0) {
239 /*
240 * If running the mirror filter see if we
241 * can skip one or more entire sub-trees.
242 * If we can we return the internal node
243 * and the caller processes the skipped
244 * range (see mirror_read).
245 */
246 if (cursor->flags &
247 HAMMER_CURSOR_MIRROR_FILTERED) {
248 if (elm->internal.mirror_tid <
249 cursor->cmirror->mirror_tid) {
250 hammer_cursor_mirror_filter(cursor);
251 return(0);
252 }
c82af904 253 }
6dc17446
MD
254 } else {
255 /*
256 * Normally it would be impossible for the
257 * cursor to have gotten back-indexed,
258 * but it can happen if a node is deleted
259 * and the cursor is moved to its parent
260 * internal node. ITERATE_CHECK will be set.
261 */
262 KKASSERT(cursor->flags &
263 HAMMER_CURSOR_ITERATE_CHECK);
11605a5c 264 hdkprintf("DEBUG: Caught parent seek "
6dc17446 265 "in internal iteration\n");
c82af904
MD
266 }
267
f36a9737
MD
268 error = hammer_cursor_down(cursor);
269 if (error)
270 break;
271 KKASSERT(cursor->index == 0);
46fe7ae1
MD
272 /* reload stale pointer */
273 node = cursor->node->ondisk;
fe7678ee 274 continue;
d26d0ae9
MD
275 } else {
276 elm = &node->elms[cursor->index];
277 r = hammer_btree_cmp(&cursor->key_end, &elm->base);
b3deaf57 278 if (hammer_debug_btree) {
fe4fc0c0 279 hammer_debug_btree_elm(cursor, elm, "ELEMENT", r);
b3deaf57 280 }
d26d0ae9
MD
281 if (r < 0) {
282 error = ENOENT;
283 break;
284 }
b33e2cc0
MD
285
286 /*
287 * We support both end-inclusive and
288 * end-exclusive searches.
289 */
290 if (r == 0 &&
291 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
292 error = ENOENT;
293 break;
294 }
295
6dc17446
MD
296 /*
297 * If ITERATE_CHECK is set an unlocked cursor may
298 * have been moved to a parent and the iterate can
299 * happen upon elements that are not in the requested
300 * range.
301 */
302 if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) {
303 s = hammer_btree_cmp(&cursor->key_beg,
304 &elm->base);
305 if (s > 0) {
11605a5c 306 hdkprintf("DEBUG: Caught parent seek "
6dc17446
MD
307 "in leaf iteration\n");
308 ++cursor->index;
309 continue;
310 }
311 }
312 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;
313
314 /*
315 * Return the element
316 */
fe7678ee
MD
317 switch(elm->leaf.base.btype) {
318 case HAMMER_BTREE_TYPE_RECORD:
319 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
320 hammer_btree_chkts(cursor->asof, &elm->base)) {
321 ++cursor->index;
322 continue;
323 }
1d4077f7 324 error = 0;
fe7678ee 325 break;
fe7678ee
MD
326 default:
327 error = EINVAL;
328 break;
d26d0ae9 329 }
fe7678ee
MD
330 if (error)
331 break;
66325755
MD
332 }
333
334 /*
d26d0ae9 335 * Return entry
66325755 336 */
b3deaf57 337 if (hammer_debug_btree) {
fe4fc0c0
TK
338 elm = &cursor->node->ondisk->elms[cursor->index];
339 hammer_debug_btree_elm(cursor, elm, "ITERATE", 0xffff);
b3deaf57 340 }
d26d0ae9 341 return(0);
427e5fc6 342 }
66325755 343 return(error);
427e5fc6
MD
344}
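/*
 * Illustrative sketch (added for exposition): minimal setup of the key
 * range consumed by the iteration code above.  Only the fields relevant
 * to the range check are shown; real callers also fill in localization,
 * rec_type, obj_type and the create_tid/as-of fields.  The helper name
 * and parameters are hypothetical.
 */
static __inline void
example_setup_iterate_range(hammer_cursor_t cursor, int64_t obj_id,
			    int64_t key_low, int64_t key_high)
{
	cursor->key_beg.obj_id = obj_id;
	cursor->key_beg.key = key_low;
	cursor->key_end = cursor->key_beg;
	cursor->key_end.key = key_high;

	/*
	 * Include key_end itself in the scan; leave the flag clear for
	 * an end-exclusive iteration.
	 */
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;
}
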
345
4c038e17
MD
346/*
347 * We hit an internal element that we could skip as part of a mirroring
348 * scan. Calculate the entire range being skipped.
349 *
350 * It is important to include any gaps between the parent's left_bound
351 * and the node's left_bound, and same goes for the right side.
352 */
353static void
354hammer_cursor_mirror_filter(hammer_cursor_t cursor)
355{
356 struct hammer_cmirror *cmirror;
357 hammer_node_ondisk_t ondisk;
358 hammer_btree_elm_t elm;
359
360 ondisk = cursor->node->ondisk;
361 cmirror = cursor->cmirror;
362
363 /*
364 * Calculate the skipped range
365 */
366 elm = &ondisk->elms[cursor->index];
367 if (cursor->index == 0)
368 cmirror->skip_beg = *cursor->left_bound;
369 else
370 cmirror->skip_beg = elm->internal.base;
371 while (cursor->index < ondisk->count) {
372 if (elm->internal.mirror_tid >= cmirror->mirror_tid)
373 break;
374 ++cursor->index;
375 ++elm;
376 }
377 if (cursor->index == ondisk->count)
378 cmirror->skip_end = *cursor->right_bound;
379 else
380 cmirror->skip_end = elm->internal.base;
381
382 /*
383 * clip the returned result.
384 */
385 if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
386 cmirror->skip_beg = cursor->key_beg;
387 if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
388 cmirror->skip_end = cursor->key_end;
389}
390
32c90105
MD
391/*
392 * Iterate in the reverse direction. This is used by the pruning code to
393 * avoid overlapping records.
394 */
395int
396hammer_btree_iterate_reverse(hammer_cursor_t cursor)
397{
398 hammer_node_ondisk_t node;
399 hammer_btree_elm_t elm;
bb5add8c 400 hammer_mount_t hmp;
f75df937 401 int error = 0;
32c90105
MD
402 int r;
403 int s;
404
4c038e17
MD
405 /* mirror filtering not supported for reverse iteration */
406 KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);
407
32c90105
MD
408 /*
409 * Skip past the current record. For various reasons the cursor
410 * may end up set to -1 or set to point at the end of the current
411 * node. These cases must be addressed.
412 */
413 node = cursor->node->ondisk;
414 if (node == NULL)
415 return(ENOENT);
745703c7 416 if (cursor->index != -1 &&
32c90105
MD
417 (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
418 --cursor->index;
419 }
420 if (cursor->index == cursor->node->ondisk->count)
421 --cursor->index;
422
bb5add8c
MD
423 /*
424 * HAMMER can wind up being cpu-bound.
425 */
426 hmp = cursor->trans->hmp;
427 if (++hmp->check_yield > hammer_yield_check) {
428 hmp->check_yield = 0;
429 lwkt_user_yield();
430 }
431
32c90105
MD
432 /*
433 * Loop until an element is found or we are done.
434 */
435 for (;;) {
77fec802 436 ++hammer_stats_btree_iterations;
bb5add8c 437 hammer_flusher_clean_loose_ios(hmp);
77fec802 438
32c90105
MD
439 /*
440 * We iterate up the tree and then index over one element
441 * while we are at the last element in the current node.
32c90105
MD
442 */
443 if (cursor->index == -1) {
444 error = hammer_cursor_up(cursor);
445 if (error) {
446 cursor->index = 0; /* sanity */
447 break;
448 }
449 /* reload stale pointer */
450 node = cursor->node->ondisk;
451 KKASSERT(cursor->index != node->count);
452 --cursor->index;
453 continue;
454 }
455
456 /*
457 * Check internal or leaf element. Determine if the record
458 * at the cursor has gone beyond the end of our range.
459 *
745703c7 460 * We recurse down through internal nodes.
32c90105
MD
461 */
462 KKASSERT(cursor->index != node->count);
463 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
464 elm = &node->elms[cursor->index];
7c682412 465
32c90105
MD
466 r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
467 s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
468 if (hammer_debug_btree) {
fe4fc0c0
TK
469 hammer_debug_btree_elm(cursor, elm, "BRACKETL", r);
470 hammer_debug_btree_elm(cursor, elm + 1, "BRACKETR", s);
32c90105
MD
471 }
472
473 if (s >= 0) {
474 error = ENOENT;
475 break;
476 }
6dc17446
MD
477
478 /*
		 * It shouldn't be possible for the cursor to end up past key_end,
480 * even if the cursor got moved to a parent.
481 */
32c90105
MD
482 KKASSERT(r >= 0);
483
484 /*
f36a9737 485 * Better not be zero
32c90105 486 */
f36a9737
MD
487 KKASSERT(elm->internal.subtree_offset != 0);
488
489 error = hammer_cursor_down(cursor);
490 if (error)
491 break;
492 KKASSERT(cursor->index == 0);
32c90105
MD
493 /* reload stale pointer */
494 node = cursor->node->ondisk;
f36a9737
MD
495
496 /* this can assign -1 if the leaf was empty */
497 cursor->index = node->count - 1;
32c90105
MD
498 continue;
499 } else {
500 elm = &node->elms[cursor->index];
501 s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
502 if (hammer_debug_btree) {
fe4fc0c0 503 hammer_debug_btree_elm(cursor, elm, "ELEMENTR", s);
32c90105
MD
504 }
505 if (s > 0) {
506 error = ENOENT;
507 break;
508 }
509
6dc17446
MD
510 /*
		 * It shouldn't be possible for the cursor to end up past key_end,
512 * even if the cursor got moved to a parent.
513 */
514 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;
515
516 /*
517 * Return the element
518 */
32c90105
MD
519 switch(elm->leaf.base.btype) {
520 case HAMMER_BTREE_TYPE_RECORD:
521 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
522 hammer_btree_chkts(cursor->asof, &elm->base)) {
523 --cursor->index;
524 continue;
525 }
f75df937 526 error = 0;
32c90105 527 break;
32c90105
MD
528 default:
529 error = EINVAL;
530 break;
531 }
532 if (error)
533 break;
534 }
32c90105
MD
535
536 /*
537 * Return entry
538 */
539 if (hammer_debug_btree) {
fe4fc0c0
TK
540 elm = &cursor->node->ondisk->elms[cursor->index];
541 hammer_debug_btree_elm(cursor, elm, "ITERATER", 0xffff);
32c90105
MD
542 }
543 return(0);
544 }
545 return(error);
546}
547
427e5fc6 548/*
8cd0a023 549 * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry
6a37e7e4
MD
550 * could not be found, EDEADLK if inserting and a retry is needed, and a
551 * fatal error otherwise. When retrying, the caller must terminate the
eaeff70d 552 * cursor and reinitialize it. EDEADLK cannot be returned if not inserting.
745703c7 553 *
8cd0a023 554 * The cursor is suitably positioned for a deletion on success, and suitably
eaeff70d
MD
555 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
556 * specified.
427e5fc6 557 *
47197d71 558 * The cursor may begin anywhere, the search will traverse the tree in
8cd0a023 559 * either direction to locate the requested element.
eaeff70d
MD
560 *
561 * Most of the logic implementing historical searches is handled here. We
9582c7da
MD
562 * do an initial lookup with create_tid set to the asof TID. Due to the
563 * way records are laid out, a backwards iteration may be required if
eaeff70d
MD
564 * ENOENT is returned to locate the historical record. Here's the
565 * problem:
566 *
9582c7da 567 * create_tid: 10 15 20
eaeff70d
MD
568 * LEAF1 LEAF2
569 * records: (11) (18)
570 *
9582c7da
MD
 * Let's say we want to do a lookup AS-OF timestamp 17.  We will traverse
572 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is
573 * not visible and thus causes ENOENT to be returned. We really need
574 * to check record 11 in LEAF1. If it also fails then the search fails
575 * (e.g. it might represent the range 11-16 and thus still not match our
f36a9737
MD
576 * AS-OF timestamp of 17). Note that LEAF1 could be empty, requiring
577 * further iterations.
b33e2cc0 578 *
9582c7da
MD
579 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK
580 * and the cursor->create_check TID if an iteration might be needed.
581 * In the above example create_check would be set to 14.
427e5fc6
MD
582 */
583int
8cd0a023 584hammer_btree_lookup(hammer_cursor_t cursor)
427e5fc6 585{
66325755
MD
586 int error;
587
6dc17446 588 cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;
98da6d8c
MD
589 KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 ||
590 cursor->trans->sync_lock_refs > 0);
cb51be26 591 ++hammer_stats_btree_lookups;
d5530d22 592 if (cursor->flags & HAMMER_CURSOR_ASOF) {
eaeff70d 593 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
9582c7da 594 cursor->key_beg.create_tid = cursor->asof;
eaeff70d 595 for (;;) {
9582c7da 596 cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK;
d5530d22 597 error = btree_search(cursor, 0);
b33e2cc0 598 if (error != ENOENT ||
9582c7da 599 (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) {
b33e2cc0
MD
600 /*
601 * Stop if no error.
602 * Stop if error other then ENOENT.
603 * Stop if ENOENT and not special case.
604 */
eaeff70d
MD
605 break;
606 }
32c90105 607 if (hammer_debug_btree) {
35a5249b
TK
608 hkprintf("CREATE_CHECK %016jx\n",
609 (intmax_t)cursor->create_check);
32c90105 610 }
9582c7da 611 cursor->key_beg.create_tid = cursor->create_check;
eaeff70d
MD
612 /* loop */
613 }
d5530d22
MD
614 } else {
615 error = btree_search(cursor, 0);
616 }
bf3b416b 617 if (error == 0)
8cd0a023 618 error = hammer_btree_extract(cursor, cursor->flags);
66325755
MD
619 return(error);
620}
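/*
 * Illustrative sketch (added for exposition): an as-of (historical)
 * lookup only needs to supply cursor->asof and set HAMMER_CURSOR_ASOF;
 * hammer_btree_lookup() then drives the create_tid/create_check
 * iteration described in the comment above.  The helper name is
 * hypothetical and error handling is left to the caller.
 */
static __inline int
example_asof_lookup(hammer_cursor_t cursor, hammer_tid_t asof_tid)
{
	cursor->asof = asof_tid;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	return(hammer_btree_lookup(cursor));
}
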
621
d26d0ae9
MD
622/*
623 * Execute the logic required to start an iteration. The first record
624 * located within the specified range is returned and iteration control
625 * flags are adjusted for successive hammer_btree_iterate() calls.
3214ade6
MD
626 *
627 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate
628 * in a loop without worrying about it. Higher-level merged searches will
629 * adjust the flag appropriately.
d26d0ae9
MD
630 */
631int
632hammer_btree_first(hammer_cursor_t cursor)
633{
634 int error;
635
636 error = hammer_btree_lookup(cursor);
637 if (error == ENOENT) {
638 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
639 error = hammer_btree_iterate(cursor);
640 }
641 cursor->flags |= HAMMER_CURSOR_ATEDISK;
642 return(error);
643}
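/*
 * Illustrative sketch (added for exposition): the btree_first/
 * btree_iterate loop pattern mentioned in the comment above.  Because
 * hammer_btree_first() leaves ATEDISK set, each subsequent iterate call
 * advances past the element just processed without the caller having to
 * manage the flag.  The helper is hypothetical; a real caller would
 * extract and act on each element inside the loop.
 */
static __inline int
example_scan_range(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_first(cursor);
	while (error == 0) {
		/*
		 * The current element is
		 * cursor->node->ondisk->elms[cursor->index].
		 */
		error = hammer_btree_iterate(cursor);
	}
	if (error == ENOENT)		/* iterated past key_end */
		error = 0;
	return(error);
}
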
644
32c90105
MD
645/*
646 * Similarly but for an iteration in the reverse direction.
814387f6
MD
647 *
648 * Set ATEDISK when iterating backwards to skip the current entry,
649 * which after an ENOENT lookup will be pointing beyond our end point.
3214ade6
MD
650 *
651 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
652 * in a loop without worrying about it. Higher-level merged searches will
653 * adjust the flag appropriately.
32c90105
MD
654 */
655int
656hammer_btree_last(hammer_cursor_t cursor)
657{
658 struct hammer_base_elm save;
659 int error;
660
661 save = cursor->key_beg;
662 cursor->key_beg = cursor->key_end;
663 error = hammer_btree_lookup(cursor);
664 cursor->key_beg = save;
665 if (error == ENOENT ||
666 (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
814387f6 667 cursor->flags |= HAMMER_CURSOR_ATEDISK;
32c90105
MD
668 error = hammer_btree_iterate_reverse(cursor);
669 }
670 cursor->flags |= HAMMER_CURSOR_ATEDISK;
671 return(error);
672}
673
8cd0a023
MD
674/*
675 * Extract the record and/or data associated with the cursor's current
676 * position. Any prior record or data stored in the cursor is replaced.
8cd0a023 677 *
47197d71 678 * NOTE: All extractions occur at the leaf of the B-Tree.
8cd0a023 679 */
66325755 680int
8cd0a023 681hammer_btree_extract(hammer_cursor_t cursor, int flags)
66325755 682{
8cd0a023
MD
683 hammer_node_ondisk_t node;
684 hammer_btree_elm_t elm;
47197d71 685 hammer_off_t data_off;
bac808fe 686 hammer_mount_t hmp;
19619882 687 int32_t data_len;
427e5fc6 688 int error;
427e5fc6 689
b961cdbb
MD
690 /*
691 * Certain types of corruption can result in a NULL node pointer.
692 */
693 if (cursor->node == NULL) {
d053aa8a 694 hkprintf("NULL cursor->node, filesystem might "
b961cdbb
MD
695 "have gotten corrupted\n");
696 return (EINVAL);
697 }
698
8cd0a023 699 /*
427e5fc6
MD
700 * The case where the data reference resolves to the same buffer
701 * as the record reference must be handled.
702 */
8cd0a023 703 node = cursor->node->ondisk;
8cd0a023 704 elm = &node->elms[cursor->index];
40043e7f
MD
705 cursor->data = NULL;
706 hmp = cursor->node->hmp;
66325755 707
d26d0ae9 708 /*
fe7678ee 709 * There is nothing to extract for an internal element.
d26d0ae9 710 */
fe7678ee
MD
711 if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
712 return(EINVAL);
713
47197d71
MD
714 /*
715 * Only record types have data.
716 */
fe7678ee 717 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
11ad5ade 718 cursor->leaf = &elm->leaf;
4a2796f3 719
c11c5877
TK
720 /*
721 * Returns here unless HAMMER_CURSOR_GET_DATA is set.
722 */
4a2796f3
MD
723 if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
724 return(0);
c11c5877 725
47197d71 726 if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
c9d1310c 727 return(EINVAL);
47197d71 728 data_off = elm->leaf.data_offset;
19619882 729 data_len = elm->leaf.data_len;
47197d71 730 if (data_off == 0)
4a2796f3 731 return(0);
d26d0ae9 732
4a2796f3
MD
733 /*
734 * Load the data
735 */
736 KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
737 cursor->data = hammer_bread_ext(hmp, data_off, data_len,
738 &error, &cursor->data_buffer);
b8a41159
MD
739
740 /*
741 * Mark the data buffer as not being meta-data if it isn't
742 * meta-data (sometimes bulk data is accessed via a volume
743 * block device).
744 */
745 if (error == 0) {
746 switch(elm->leaf.base.rec_type) {
747 case HAMMER_RECTYPE_DATA:
748 case HAMMER_RECTYPE_DB:
18bee4a2
MD
749 if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0)
750 break;
751 if (hammer_double_buffer == 0 ||
752 (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) {
753 hammer_io_notmeta(cursor->data_buffer);
754 }
b8a41159
MD
755 break;
756 default:
757 break;
758 }
759 }
760
761 /*
762 * Deal with CRC errors on the extracted data.
763 */
2faf0737 764 if (error == 0 &&
4c09d9c4 765 hammer_crc_test_leaf(hmp->version, cursor->data, &elm->leaf) == 0) {
35a5249b
TK
766 hdkprintf("CRC DATA @ %016jx/%d FAILED\n",
767 (intmax_t)elm->leaf.data_offset, elm->leaf.data_len);
fc73edd8 768 if (hammer_debug_critical)
4c286c36
MD
769 Debugger("CRC FAILED: DATA");
770 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
771 error = EDOM; /* less critical (mirroring) */
772 else
773 error = EIO; /* critical */
e469566b 774 }
427e5fc6
MD
775 return(error);
776}
777
778
779/*
8cd0a023
MD
780 * Insert a leaf element into the B-Tree at the current cursor position.
 * The cursor is positioned such that the elements at and beyond the cursor
 * are shifted to make room for the new record.
783 *
a89aec1b 784 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
8cd0a023 785 * flag set and that call must return ENOENT before this function can be
2f49d4c5 786 * called. ENOSPC is returned if there is no room to insert a new record.
8cd0a023 787 *
d36ec43b 788 * The caller may depend on the cursor's exclusive lock after return to
1f07f686 789 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
427e5fc6
MD
790 */
791int
602c6cb8
MD
792hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm,
793 int *doprop)
427e5fc6 794{
8cd0a023 795 hammer_node_ondisk_t node;
427e5fc6 796 int i;
6a37e7e4
MD
797 int error;
798
602c6cb8 799 *doprop = 0;
7bc5b8c2 800 if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
6a37e7e4 801 return(error);
cb51be26 802 ++hammer_stats_btree_inserts;
427e5fc6 803
427e5fc6
MD
804 /*
805 * Insert the element at the leaf node and update the count in the
806 * parent. It is possible for parent to be NULL, indicating that
47197d71
MD
807 * the filesystem's ROOT B-Tree node is a leaf itself, which is
808 * possible. The root inode can never be deleted so the leaf should
809 * never be empty.
427e5fc6 810 *
2f49d4c5 811 * Remember that leaf nodes do not have boundaries.
427e5fc6 812 */
36f82b23 813 hammer_modify_node_all(cursor->trans, cursor->node);
8cd0a023 814 node = cursor->node->ondisk;
427e5fc6 815 i = cursor->index;
fe7678ee 816 KKASSERT(elm->base.btype != 0);
8cd0a023
MD
817 KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
818 KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
819 if (i != node->count) {
820 bcopy(&node->elms[i], &node->elms[i+1],
821 (node->count - i) * sizeof(*elm));
822 }
11ad5ade 823 node->elms[i].leaf = *elm;
8cd0a023 824 ++node->count;
e4a5ff06 825 hammer_cursor_inserted_element(cursor->node, i);
c82af904
MD
826
827 /*
828 * Update the leaf node's aggregate mirror_tid for mirroring
829 * support.
830 */
602c6cb8 831 if (node->mirror_tid < elm->base.delete_tid) {
c82af904 832 node->mirror_tid = elm->base.delete_tid;
602c6cb8
MD
833 *doprop = 1;
834 }
835 if (node->mirror_tid < elm->base.create_tid) {
c82af904 836 node->mirror_tid = elm->base.create_tid;
602c6cb8 837 *doprop = 1;
c82af904 838 }
602c6cb8 839 hammer_modify_node_done(cursor->node);
c82af904 840
eaeff70d 841 /*
47197d71 842 * Debugging sanity checks.
eaeff70d 843 */
11ad5ade
MD
844 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
845 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
eaeff70d 846 if (i) {
11ad5ade 847 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
eaeff70d 848 }
b3deaf57 849 if (i != node->count - 1)
11ad5ade 850 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);
b3deaf57 851
427e5fc6
MD
852 return(0);
853}
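/*
 * Illustrative sketch (added for exposition): the lookup/insert protocol
 * required by the comment above -- hammer_btree_lookup() must be issued
 * with HAMMER_CURSOR_INSERT set and must return ENOENT before
 * hammer_btree_insert() may be called.  The doprop handling is only
 * hinted at, the helper name is hypothetical, and 'leaf' is assumed to
 * have been fully prepared by the caller (sync lock held, etc.).
 */
static __inline int
example_insert_leaf(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf)
{
	int doprop = 0;
	int error;

	cursor->flags |= HAMMER_CURSOR_INSERT;
	cursor->key_beg = leaf->base;
	error = hammer_btree_lookup(cursor);
	if (error == ENOENT)
		error = hammer_btree_insert(cursor, leaf, &doprop);
	else if (error == 0)
		error = EEXIST;		/* record already present */
	if (error == 0 && doprop) {
		/*
		 * The node's aggregate mirror_tid grew; the caller would
		 * now propagate it towards the root (see the mirror_tid
		 * propagation code elsewhere in HAMMER).
		 */
	}
	return(error);
}
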
854
855/*
fe7678ee 856 * Delete a record from the B-Tree at the current cursor position.
8cd0a023
MD
857 * The cursor is positioned such that the current element is the one
858 * to be deleted.
859 *
195c19a1
MD
860 * On return the cursor will be positioned after the deleted element and
861 * MAY point to an internal node. It will be suitable for the continuation
862 * of an iteration but not for an insertion or deletion.
8cd0a023 863 *
195c19a1 864 * Deletions will attempt to partially rebalance the B-Tree in an upward
f36a9737
MD
865 * direction, but will terminate rather then deadlock. Empty internal nodes
866 * are never allowed by a deletion which deadlocks may end up giving us an
867 * empty leaf. The pruner will clean up and rebalance the tree.
46fe7ae1
MD
868 *
869 * This function can return EDEADLK, requiring the caller to retry the
870 * operation after clearing the deadlock.
03b6feea
TK
871 *
872 * This function will store the number of deleted btree nodes in *ndelete
873 * if ndelete is not NULL.
427e5fc6
MD
874 */
875int
03b6feea 876hammer_btree_delete(hammer_cursor_t cursor, int *ndelete)
427e5fc6 877{
8cd0a023
MD
878 hammer_node_ondisk_t ondisk;
879 hammer_node_t node;
f31f6d84 880 hammer_node_t parent __debugvar;
8cd0a023 881 int error;
427e5fc6
MD
882 int i;
883
98da6d8c 884 KKASSERT (cursor->trans->sync_lock_refs > 0);
03b6feea
TK
885 if (ndelete)
886 *ndelete = 0;
6a37e7e4
MD
887 if ((error = hammer_cursor_upgrade(cursor)) != 0)
888 return(error);
cb51be26 889 ++hammer_stats_btree_deletes;
6a37e7e4 890
427e5fc6 891 /*
745703c7 892 * Delete the element from the leaf node.
427e5fc6 893 *
8cd0a023 894 * Remember that leaf nodes do not have boundaries.
427e5fc6 895 */
8cd0a023
MD
896 node = cursor->node;
897 ondisk = node->ondisk;
427e5fc6
MD
898 i = cursor->index;
899
8cd0a023 900 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
fe7678ee 901 KKASSERT(i >= 0 && i < ondisk->count);
36f82b23 902 hammer_modify_node_all(cursor->trans, node);
8cd0a023
MD
903 if (i + 1 != ondisk->count) {
904 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
905 (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
906 }
907 --ondisk->count;
10a5d1ba 908 hammer_modify_node_done(node);
b3bad96f 909 hammer_cursor_deleted_element(node, i);
fe7678ee
MD
910
911 /*
912 * Validate local parent
913 */
914 if (ondisk->parent) {
8cd0a023 915 parent = cursor->parent;
fe7678ee
MD
916
917 KKASSERT(parent != NULL);
918 KKASSERT(parent->node_offset == ondisk->parent);
427e5fc6 919 }
427e5fc6 920
8cd0a023 921 /*
fe7678ee 922 * If the leaf becomes empty it must be detached from the parent,
47197d71 923 * potentially recursing through to the filesystem root.
195c19a1
MD
924 *
925 * This may reposition the cursor at one of the parent's of the
926 * current node.
6a37e7e4
MD
927 *
928 * Ignore deadlock errors, that simply means that btree_remove
745703c7 929 * was unable to recurse and had to leave us with an empty leaf.
8cd0a023 930 */
b3deaf57 931 KKASSERT(cursor->index <= ondisk->count);
8cd0a023 932 if (ondisk->count == 0) {
03b6feea 933 error = btree_remove(cursor, ndelete);
6a37e7e4
MD
934 if (error == EDEADLK)
935 error = 0;
8cd0a023 936 } else {
8cd0a023
MD
937 error = 0;
938 }
eaeff70d
MD
939 KKASSERT(cursor->parent == NULL ||
940 cursor->parent_index < cursor->parent->ondisk->count);
8cd0a023
MD
941 return(error);
942}
427e5fc6
MD
943
/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
 *
 * The search can begin ANYWHERE in the B-Tree.  As a first step the search
 * iterates up the tree as necessary to properly position itself prior to
 * actually doing the search.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.  If we run out
 * of space the search continues to the leaf, but ENOSPC is returned.
 *
 * The search is only guaranteed to end up on a leaf if an error code of 0
 * is returned, or if inserting and an error code of ENOENT is returned.
 * Otherwise it can stop at an internal node.  On success a search returns
 * a leaf node.
 *
 * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
 * filesystem, and it is not simple code.  Please note the following facts:
 *
 * - Internal node recursions have a boundary on the left AND right.  The
 *   right boundary is non-inclusive.  The create_tid is a generic part
 *   of the key for internal nodes.
 *
 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
 *   historical search.  ASOF and INSERT are mutually exclusive.  When
 *   doing an as-of lookup btree_search() checks for a right-edge boundary
 *   case.  If while recursing down the left-edge differs from the key
 *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
 *   with cursor->create_check.  This is used by btree_lookup() to iterate.
 *   The iteration is backwards because as-of searches can wind up going
 *   down the wrong branch of the B-Tree.
 */
745703c7 978static
427e5fc6 979int
8cd0a023 980btree_search(hammer_cursor_t cursor, int flags)
427e5fc6 981{
8cd0a023 982 hammer_node_ondisk_t node;
61aeeb33 983 hammer_btree_elm_t elm;
8cd0a023 984 int error;
d26d0ae9 985 int enospc = 0;
8cd0a023
MD
986 int i;
987 int r;
b33e2cc0 988 int s;
8cd0a023
MD
989
990 flags |= cursor->flags;
cb51be26 991 ++hammer_stats_btree_searches;
8cd0a023 992
b3deaf57 993 if (hammer_debug_btree) {
fe4fc0c0
TK
994 hammer_debug_btree_elm(cursor,
995 (hammer_btree_elm_t)&cursor->key_beg,
996 "SEARCH", 0xffff);
a84a197d 997 if (cursor->parent)
fe4fc0c0 998 hammer_debug_btree_parent(cursor, "SEARCHP");
b3deaf57
MD
999 }
1000
8cd0a023
MD
1001 /*
	 * Move our cursor up the tree until we find a node whose range covers
47197d71 1003 * the key we are trying to locate.
8cd0a023
MD
1004 *
1005 * The left bound is inclusive, the right bound is non-inclusive.
47197d71 1006 * It is ok to cursor up too far.
8cd0a023 1007 */
b33e2cc0
MD
1008 for (;;) {
1009 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
1010 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
1011 if (r >= 0 && s < 0)
1012 break;
9944ae54 1013 KKASSERT(cursor->parent);
cb51be26 1014 ++hammer_stats_btree_iterations;
6a37e7e4 1015 error = hammer_cursor_up(cursor);
8cd0a023
MD
1016 if (error)
1017 goto done;
427e5fc6 1018 }
427e5fc6 1019
b33e2cc0
MD
1020 /*
1021 * The delete-checks below are based on node, not parent. Set the
1022 * initial delete-check based on the parent.
1023 */
9582c7da
MD
1024 if (r == 1) {
1025 KKASSERT(cursor->left_bound->create_tid != 1);
1026 cursor->create_check = cursor->left_bound->create_tid - 1;
1027 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
b33e2cc0
MD
1028 }
1029
8cd0a023 1030 /*
47197d71 1031 * We better have ended up with a node somewhere.
8cd0a023 1032 */
47197d71 1033 KKASSERT(cursor->node != NULL);
8cd0a023
MD
1034
1035 /*
1036 * If we are inserting we can't start at a full node if the parent
1037 * is also full (because there is no way to split the node),
b33e2cc0 1038 * continue running up the tree until the requirement is satisfied
47197d71 1039 * or we hit the root of the filesystem.
9582c7da
MD
1040 *
1041 * (If inserting we aren't doing an as-of search so we don't have
1042 * to worry about create_check).
8cd0a023 1043 */
a9fd59b8
TK
1044 while (flags & HAMMER_CURSOR_INSERT) {
1045 if (btree_node_is_full(cursor->node->ondisk) == 0)
1046 break;
b33e2cc0
MD
1047 if (cursor->node->ondisk->parent == 0 ||
1048 cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
8cd0a023 1049 break;
b33e2cc0 1050 }
cb51be26 1051 ++hammer_stats_btree_iterations;
6a37e7e4 1052 error = hammer_cursor_up(cursor);
47197d71 1053 /* node may have become stale */
8cd0a023
MD
1054 if (error)
1055 goto done;
427e5fc6 1056 }
427e5fc6 1057
8cd0a023
MD
1058 /*
1059 * Push down through internal nodes to locate the requested key.
1060 */
8cd0a023
MD
1061 node = cursor->node->ondisk;
1062 while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
8cd0a023
MD
1063 /*
1064 * Scan the node to find the subtree index to push down into.
fbc6e32a 1065 * We go one-past, then back-up.
d113fda1 1066 *
fe7678ee
MD
1067 * We must proactively remove deleted elements which may
1068 * have been left over from a deadlocked btree_remove().
1069 *
eaeff70d 1070 * The left and right boundaries are included in the loop
d5530d22 1071 * in order to detect edge cases.
9944ae54 1072 *
9582c7da 1073 * If the separator only differs by create_tid (r == 1)
eaeff70d
MD
1074 * and we are doing an as-of search, we may end up going
1075 * down a branch to the left of the one containing the
1076 * desired key. This requires numerous special cases.
8cd0a023 1077 */
47637bff 1078 ++hammer_stats_btree_iterations;
46fe7ae1 1079 if (hammer_debug_btree) {
35a5249b
TK
1080 hkprintf("SEARCH-I %016jx count=%d\n",
1081 (intmax_t)cursor->node->node_offset,
46fe7ae1
MD
1082 node->count);
1083 }
af209b0f
MD
1084
1085 /*
1086 * Try to shortcut the search before dropping into the
1087 * linear loop. Locate the first node where r <= 1.
1088 */
1089 i = hammer_btree_search_node(&cursor->key_beg, node);
1090 while (i <= node->count) {
cb51be26 1091 ++hammer_stats_btree_elements;
61aeeb33
MD
1092 elm = &node->elms[i];
1093 r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
b33e2cc0 1094 if (hammer_debug_btree > 2) {
33234d14 1095 hkprintf(" IELM %p [%d] r=%d\n",
b33e2cc0
MD
1096 &node->elms[i], i, r);
1097 }
9582c7da 1098 if (r < 0)
8cd0a023 1099 break;
9582c7da
MD
1100 if (r == 1) {
1101 KKASSERT(elm->base.create_tid != 1);
1102 cursor->create_check = elm->base.create_tid - 1;
1103 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
b33e2cc0 1104 }
af209b0f 1105 ++i;
8cd0a023 1106 }
eaeff70d 1107 if (hammer_debug_btree) {
33234d14 1108 hkprintf("SEARCH-I preI=%d/%d r=%d\n",
46fe7ae1 1109 i, node->count, r);
eaeff70d 1110 }
8cd0a023
MD
1111
1112 /*
ae744f81
TK
1113 * The first two cases (i == 0 or i == node->count + 1)
1114 * occur when the parent's idea of the boundary
9944ae54
MD
		 * is wider than the child's idea of the boundary, and
1116 * require special handling. If not inserting we can
1117 * terminate the search early for these cases but the
1118 * child's boundaries cannot be unconditionally modified.
ae744f81
TK
1119 *
1120 * The last case (neither of the above) fits in child's
1121 * idea of the boundary, so we can simply push down the
1122 * cursor.
8cd0a023 1123 */
fbc6e32a 1124 if (i == 0) {
9944ae54
MD
1125 /*
1126 * If i == 0 the search terminated to the LEFT of the
1127 * left_boundary but to the RIGHT of the parent's left
1128 * boundary.
1129 */
46137e17 1130 uint8_t save;
d26d0ae9 1131
eaeff70d
MD
1132 elm = &node->elms[0];
1133
1134 /*
1135 * If we aren't inserting we can stop here.
1136 */
11ad5ade
MD
1137 if ((flags & (HAMMER_CURSOR_INSERT |
1138 HAMMER_CURSOR_PRUNING)) == 0) {
fbc6e32a
MD
1139 cursor->index = 0;
1140 return(ENOENT);
1141 }
9944ae54 1142
d5530d22
MD
1143 /*
1144 * Correct a left-hand boundary mismatch.
6a37e7e4 1145 *
f36a9737
MD
1146 * We can only do this if we can upgrade the lock,
1147 * and synchronized as a background cursor (i.e.
1148 * inserting or pruning).
10a5d1ba
MD
1149 *
1150 * WARNING: We can only do this if inserting, i.e.
1151 * we are running on the backend.
d5530d22 1152 */
eaeff70d
MD
1153 if ((error = hammer_cursor_upgrade(cursor)) != 0)
1154 return(error);
10a5d1ba 1155 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
c9b9e29d
MD
1156 hammer_modify_node_field(cursor->trans, cursor->node,
1157 elms[0]);
fe7678ee 1158 save = node->elms[0].base.btype;
d5530d22 1159 node->elms[0].base = *cursor->left_bound;
fe7678ee 1160 node->elms[0].base.btype = save;
10a5d1ba 1161 hammer_modify_node_done(cursor->node);
9944ae54 1162 } else if (i == node->count + 1) {
d26d0ae9 1163 /*
9944ae54
MD
1164 * If i == node->count + 1 the search terminated to
1165 * the RIGHT of the right boundary but to the LEFT
eaeff70d
MD
1166 * of the parent's right boundary. If we aren't
1167 * inserting we can stop here.
d113fda1 1168 *
9944ae54
MD
1169 * Note that the last element in this case is
1170 * elms[i-2] prior to adjustments to 'i'.
d26d0ae9 1171 */
9944ae54 1172 --i;
11ad5ade
MD
1173 if ((flags & (HAMMER_CURSOR_INSERT |
1174 HAMMER_CURSOR_PRUNING)) == 0) {
9944ae54 1175 cursor->index = i;
eaeff70d 1176 return (ENOENT);
d26d0ae9
MD
1177 }
1178
d5530d22
MD
1179 /*
1180 * Correct a right-hand boundary mismatch.
1181 * (actual push-down record is i-2 prior to
1182 * adjustments to i).
6a37e7e4 1183 *
f36a9737
MD
1184 * We can only do this if we can upgrade the lock,
1185 * and synchronized as a background cursor (i.e.
1186 * inserting or pruning).
10a5d1ba
MD
1187 *
1188 * WARNING: We can only do this if inserting, i.e.
1189 * we are running on the backend.
d5530d22 1190 */
eaeff70d
MD
1191 if ((error = hammer_cursor_upgrade(cursor)) != 0)
1192 return(error);
9944ae54 1193 elm = &node->elms[i];
10a5d1ba 1194 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
36f82b23
MD
1195 hammer_modify_node(cursor->trans, cursor->node,
1196 &elm->base, sizeof(elm->base));
d5530d22 1197 elm->base = *cursor->right_bound;
10a5d1ba 1198 hammer_modify_node_done(cursor->node);
d5530d22 1199 --i;
fbc6e32a
MD
1200 } else {
1201 /*
9944ae54
MD
1202 * The push-down index is now i - 1. If we had
1203 * terminated on the right boundary this will point
1204 * us at the last element.
fbc6e32a
MD
1205 */
1206 --i;
1207 }
8cd0a023 1208 cursor->index = i;
6a37e7e4 1209 elm = &node->elms[i];
8cd0a023 1210
b3deaf57 1211 if (hammer_debug_btree) {
fe4fc0c0 1212 hammer_debug_btree_elm(cursor, elm, "RESULT-I", 0xffff);
b3deaf57
MD
1213 }
1214
6a37e7e4 1215 /*
f36a9737 1216 * We better have a valid subtree offset.
6a37e7e4 1217 */
f36a9737 1218 KKASSERT(elm->internal.subtree_offset != 0);
6a37e7e4 1219
8cd0a023
MD
1220 /*
1221 * Handle insertion and deletion requirements.
1222 *
1223 * If inserting split full nodes. The split code will
1224 * adjust cursor->node and cursor->index if the current
1225 * index winds up in the new node.
61aeeb33 1226 *
9944ae54
MD
1227 * If inserting and a left or right edge case was detected,
1228 * we cannot correct the left or right boundary and must
1229 * prepend and append an empty leaf node in order to make
1230 * the boundary correction.
1231 *
fd1fcc1a
TK
1232 * If we run out of space we set enospc but continue on
1233 * to a leaf.
8cd0a023 1234 */
61aeeb33 1235 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
fe7678ee 1236 if (btree_node_is_full(node)) {
8cd0a023 1237 error = btree_split_internal(cursor);
d26d0ae9
MD
1238 if (error) {
1239 if (error != ENOSPC)
1240 goto done;
1241 enospc = 1;
d26d0ae9 1242 }
8cd0a023
MD
1243 /*
1244 * reload stale pointers
1245 */
1246 i = cursor->index;
1247 node = cursor->node->ondisk;
1248 }
d26d0ae9 1249 }
427e5fc6
MD
1250
1251 /*
8cd0a023 1252 * Push down (push into new node, existing node becomes
d26d0ae9 1253 * the parent) and continue the search.
427e5fc6 1254 */
8cd0a023 1255 error = hammer_cursor_down(cursor);
47197d71 1256 /* node may have become stale */
8cd0a023
MD
1257 if (error)
1258 goto done;
1259 node = cursor->node->ondisk;
427e5fc6 1260 }
427e5fc6 1261
8cd0a023
MD
1262 /*
1263 * We are at a leaf, do a linear search of the key array.
d26d0ae9
MD
1264 *
1265 * On success the index is set to the matching element and 0
1266 * is returned.
1267 *
1268 * On failure the index is set to the insertion point and ENOENT
1269 * is returned.
8cd0a023
MD
1270 *
1271 * Boundaries are not stored in leaf nodes, so the index can wind
1272 * up to the left of element 0 (index == 0) or past the end of
f36a9737
MD
1273 * the array (index == node->count). It is also possible that the
1274 * leaf might be empty.
8cd0a023 1275 */
47637bff 1276 ++hammer_stats_btree_iterations;
fe7678ee 1277 KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF);
8cd0a023 1278 KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
46fe7ae1 1279 if (hammer_debug_btree) {
35a5249b
TK
1280 hkprintf("SEARCH-L %016jx count=%d\n",
1281 (intmax_t)cursor->node->node_offset,
46fe7ae1
MD
1282 node->count);
1283 }
8cd0a023 1284
af209b0f
MD
1285 /*
1286 * Try to shortcut the search before dropping into the
1287 * linear loop. Locate the first node where r <= 1.
1288 */
1289 i = hammer_btree_search_node(&cursor->key_beg, node);
1290 while (i < node->count) {
cb51be26 1291 ++hammer_stats_btree_elements;
fe7678ee
MD
1292 elm = &node->elms[i];
1293
1294 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);
427e5fc6 1295
d5530d22 1296 if (hammer_debug_btree > 1)
33234d14 1297 hkprintf(" LELM %p [%d] r=%d\n", &node->elms[i], i, r);
d5530d22 1298
427e5fc6 1299 /*
fe7678ee 1300 * We are at a record element. Stop if we've flipped past
9582c7da
MD
1301 * key_beg, not counting the create_tid test. Allow the
1302 * r == 1 case (key_beg > element but differs only by its
1303 * create_tid) to fall through to the AS-OF check.
427e5fc6 1304 */
fe7678ee
MD
1305 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
1306
9582c7da 1307 if (r < 0)
d5530d22 1308 goto failed;
af209b0f
MD
1309 if (r > 1) {
1310 ++i;
d5530d22 1311 continue;
af209b0f 1312 }
427e5fc6 1313
66325755 1314 /*
9582c7da 1315 * Check our as-of timestamp against the element.
66325755 1316 */
eaeff70d 1317 if (flags & HAMMER_CURSOR_ASOF) {
fe7678ee 1318 if (hammer_btree_chkts(cursor->asof,
d113fda1 1319 &node->elms[i].base) != 0) {
af209b0f 1320 ++i;
d113fda1
MD
1321 continue;
1322 }
eaeff70d
MD
1323 /* success */
1324 } else {
af209b0f
MD
1325 if (r > 0) { /* can only be +1 */
1326 ++i;
9582c7da 1327 continue;
af209b0f 1328 }
eaeff70d 1329 /* success */
66325755 1330 }
d5530d22
MD
1331 cursor->index = i;
1332 error = 0;
eaeff70d 1333 if (hammer_debug_btree) {
35a5249b
TK
1334 hkprintf("RESULT-L %016jx[%d] (SUCCESS)\n",
1335 (intmax_t)cursor->node->node_offset, i);
eaeff70d 1336 }
d5530d22
MD
1337 goto done;
1338 }
1339
1340 /*
eaeff70d 1341 * The search of the leaf node failed. i is the insertion point.
d5530d22 1342 */
d5530d22 1343failed:
b3deaf57 1344 if (hammer_debug_btree) {
35a5249b
TK
1345 hkprintf("RESULT-L %016jx[%d] (FAILED)\n",
1346 (intmax_t)cursor->node->node_offset, i);
b3deaf57
MD
1347 }
1348
8cd0a023
MD
1349 /*
1350 * No exact match was found, i is now at the insertion point.
1351 *
1352 * If inserting split a full leaf before returning. This
1353 * may have the side effect of adjusting cursor->node and
1354 * cursor->index.
1355 */
1356 cursor->index = i;
eaeff70d 1357 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
47197d71 1358 btree_node_is_full(node)) {
8cd0a023 1359 error = btree_split_leaf(cursor);
d26d0ae9
MD
1360 if (error) {
1361 if (error != ENOSPC)
1362 goto done;
1363 enospc = 1;
d26d0ae9
MD
1364 }
1365 /*
1366 * reload stale pointers
1367 */
8cd0a023
MD
1368 /* NOT USED
1369 i = cursor->index;
1370 node = &cursor->node->internal;
1371 */
8cd0a023 1372 }
d26d0ae9
MD
1373
1374 /*
1375 * We reached a leaf but did not find the key we were looking for.
1376 * If this is an insert we will be properly positioned for an insert
fd1fcc1a 1377 * (ENOENT) or unable to insert (ENOSPC).
d26d0ae9
MD
1378 */
1379 error = enospc ? ENOSPC : ENOENT;
8cd0a023 1380done:
427e5fc6
MD
1381 return(error);
1382}
1383
/*
 * Heuristic search for the first element whose comparison is <= 1.  May
 * return an index whose compare result is > 1 but may only return an index
 * whose compare result is <= 1 if it is the first element with that result.
 */
bcac4bbb 1389int
af209b0f
MD
1390hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
1391{
1392 int b;
1393 int s;
1394 int i;
1395 int r;
1396
1397 /*
1398 * Don't bother if the node does not have very many elements
1399 */
1400 b = 0;
1401 s = node->count;
1402 while (s - b > 4) {
1403 i = b + (s - b) / 2;
cb51be26 1404 ++hammer_stats_btree_elements;
af209b0f
MD
1405 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
1406 if (r <= 1) {
1407 s = i;
1408 } else {
1409 b = i;
1410 }
1411 }
1412 return(b);
1413}
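/*
 * Illustrative sketch (added for exposition): the same window-shrinking
 * heuristic as above expressed over a plain array of keys, looking for
 * the first index whose key is >= the search key (the analogue of a
 * compare result <= 1).  Like the real code it stops once at most 4
 * candidates remain and returns a lower bound from which the caller
 * finishes with a linear scan.  Names are hypothetical.
 */
static __inline int
example_search_window(const int *keys, int count, int skey)
{
	int b = 0;
	int s = count;
	int i;

	while (s - b > 4) {
		i = b + (s - b) / 2;
		if (skey <= keys[i])
			s = i;	/* first candidate is at or left of i */
		else
			b = i;	/* first candidate is right of i */
	}
	return(b);
}
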
1414
8cd0a023 1415
427e5fc6 1416/************************************************************************
8cd0a023 1417 * SPLITTING AND MERGING *
427e5fc6
MD
1418 ************************************************************************
1419 *
1420 * These routines do all the dirty work required to split and merge nodes.
1421 */
1422
1423/*
8cd0a023 1424 * Split an internal node into two nodes and move the separator at the split
fe7678ee 1425 * point to the parent.
427e5fc6 1426 *
8cd0a023
MD
1427 * (cursor->node, cursor->index) indicates the element the caller intends
1428 * to push into. We will adjust node and index if that element winds
427e5fc6 1429 * up in the split node.
8cd0a023 1430 *
47197d71
MD
1431 * If we are at the root of the filesystem a new root must be created with
1432 * two elements, one pointing to the original root and one pointing to the
8cd0a023 1433 * newly allocated split node.
427e5fc6
MD
1434 */
1435static
1436int
8cd0a023 1437btree_split_internal(hammer_cursor_t cursor)
427e5fc6 1438{
8cd0a023
MD
1439 hammer_node_ondisk_t ondisk;
1440 hammer_node_t node;
1441 hammer_node_t parent;
1442 hammer_node_t new_node;
1443 hammer_btree_elm_t elm;
1444 hammer_btree_elm_t parent_elm;
1775b6a0 1445 struct hammer_node_lock lockroot;
36f82b23 1446 hammer_mount_t hmp = cursor->trans->hmp;
427e5fc6
MD
1447 int parent_index;
1448 int made_root;
1449 int split;
1450 int error;
7f7c1f84 1451 int i;
8cd0a023 1452 const int esize = sizeof(*elm);
427e5fc6 1453
1775b6a0 1454 hammer_node_lock_init(&lockroot, cursor->node);
24cf83d2 1455 error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
47197d71
MD
1456 if (error)
1457 goto done;
7bc5b8c2
MD
1458 if ((error = hammer_cursor_upgrade(cursor)) != 0)
1459 goto done;
cb51be26 1460 ++hammer_stats_btree_splits;
6a37e7e4 1461
745703c7 1462 /*
fa2b9a03
MD
1463 * Calculate the split point. If the insertion point is at the
1464 * end of the leaf we adjust the split point significantly to the
1465 * right to try to optimize node fill and flag it. If we hit
1466 * that same leaf again our heuristic failed and we don't try
1467 * to optimize node fill (it could lead to a degenerate case).
427e5fc6 1468 */
8cd0a023
MD
1469 node = cursor->node;
1470 ondisk = node->ondisk;
fa2b9a03
MD
1471 KKASSERT(ondisk->count > 4);
1472 if (cursor->index == ondisk->count &&
1473 (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
1474 split = (ondisk->count + 1) * 3 / 4;
1475 node->flags |= HAMMER_NODE_NONLINEAR;
1476 } else {
1477 /*
1478 * We are splitting but elms[split] will be promoted to
1479 * the parent, leaving the right hand node with one less
1480 * element. If the insertion point will be on the
1481 * left-hand side adjust the split point to give the
1482 * right hand side one additional node.
1483 */
1484 split = (ondisk->count + 1) / 2;
1485 if (cursor->index <= split)
1486 --split;
1487 }
427e5fc6
MD
1488
1489 /*
47197d71
MD
1490 * If we are at the root of the filesystem, create a new root node
1491 * with 1 element and split normally. Avoid making major
1492 * modifications until we know the whole operation will work.
427e5fc6 1493 */
8cd0a023 1494 if (ondisk->parent == 0) {
b4f86ea3 1495 parent = hammer_alloc_btree(cursor->trans, 0, &error);
427e5fc6 1496 if (parent == NULL)
6a37e7e4 1497 goto done;
8cd0a023 1498 hammer_lock_ex(&parent->lock);
36f82b23 1499 hammer_modify_node_noundo(cursor->trans, parent);
8cd0a023
MD
1500 ondisk = parent->ondisk;
1501 ondisk->count = 1;
1502 ondisk->parent = 0;
a56cb012 1503 ondisk->mirror_tid = node->ondisk->mirror_tid;
8cd0a023 1504 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
47197d71 1505 ondisk->elms[0].base = hmp->root_btree_beg;
fe7678ee 1506 ondisk->elms[0].base.btype = node->ondisk->type;
8cd0a023 1507 ondisk->elms[0].internal.subtree_offset = node->node_offset;
f0b930f3 1508 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
47197d71 1509 ondisk->elms[1].base = hmp->root_btree_end;
10a5d1ba 1510 hammer_modify_node_done(parent);
427e5fc6 1511 made_root = 1;
8cd0a023 1512 parent_index = 0; /* index of current node in parent */
427e5fc6
MD
1513 } else {
1514 made_root = 0;
8cd0a023
MD
1515 parent = cursor->parent;
1516 parent_index = cursor->parent_index;
427e5fc6 1517 }
427e5fc6
MD
1518
1519 /*
1520 * Split node into new_node at the split point.
1521 *
df2ccbac 1522 * B O O O P N N B <-- P = node->elms[split] (index 4)
427e5fc6
MD
1523 * 0 1 2 3 4 5 6 <-- subtree indices
1524 *
1525 * x x P x x
745703c7 1526 * s S S s
427e5fc6
MD
1527 * / \
1528 * B O O O B B N N B <--- inner boundary points are 'P'
745703c7 1529 * 0 1 2 3 4 5 6
427e5fc6 1530 */
b4f86ea3 1531 new_node = hammer_alloc_btree(cursor->trans, 0, &error);
427e5fc6 1532 if (new_node == NULL) {
8cd0a023
MD
1533 if (made_root) {
1534 hammer_unlock(&parent->lock);
36f82b23 1535 hammer_delete_node(cursor->trans, parent);
8cd0a023
MD
1536 hammer_rel_node(parent);
1537 }
6a37e7e4 1538 goto done;
427e5fc6 1539 }
8cd0a023 1540 hammer_lock_ex(&new_node->lock);
427e5fc6
MD
1541
1542 /*
8cd0a023 1543 * Create the new node. P becomes the left-hand boundary in the
427e5fc6
MD
1544 * new node. Copy the right-hand boundary as well.
1545 *
1546 * elm is the new separator.
1547 */
36f82b23
MD
1548 hammer_modify_node_noundo(cursor->trans, new_node);
1549 hammer_modify_node_all(cursor->trans, node);
8cd0a023
MD
1550 ondisk = node->ondisk;
1551 elm = &ondisk->elms[split];
1552 bcopy(elm, &new_node->ondisk->elms[0],
ae744f81 1553 (ondisk->count - split + 1) * esize); /* +1 for boundary */
8cd0a023
MD
1554 new_node->ondisk->count = ondisk->count - split;
1555 new_node->ondisk->parent = parent->node_offset;
1556 new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
a56cb012 1557 new_node->ondisk->mirror_tid = ondisk->mirror_tid;
8cd0a023 1558 KKASSERT(ondisk->type == new_node->ondisk->type);
b3bad96f 1559 hammer_cursor_split_node(node, new_node, split);
427e5fc6
MD
1560
1561 /*
fe7678ee
MD
1562 * Cleanup the original node. Elm (P) becomes the new boundary,
1563 * its subtree_offset was moved to the new node. If we had created
427e5fc6
MD
1564 * a new root its parent pointer may have changed.
1565 */
b6361ed8 1566 elm->base.btype = HAMMER_BTREE_TYPE_NONE;
8cd0a023 1567 elm->internal.subtree_offset = 0;
c0ade690 1568 ondisk->count = split;
427e5fc6
MD
1569
1570 /*
1571 * Insert the separator into the parent, fixup the parent's
1572 * reference to the original node, and reference the new node.
1573 * The separator is P.
1574 *
3b7d788e 1575 * Remember that ondisk->count does not include the right-hand boundary.
427e5fc6 1576 */
36f82b23 1577 hammer_modify_node_all(cursor->trans, parent);
8cd0a023 1578 ondisk = parent->ondisk;
d26d0ae9 1579 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
8cd0a023 1580 parent_elm = &ondisk->elms[parent_index+1];
427e5fc6 1581 bcopy(parent_elm, parent_elm + 1,
8cd0a023 1582 (ondisk->count - parent_index) * esize);
ae744f81
TK
1583
1584 /*
1585 * Why not use hammer_make_separator() here?
1586 */
8cd0a023 1587 parent_elm->internal.base = elm->base; /* separator P */
fe7678ee 1588 parent_elm->internal.base.btype = new_node->ondisk->type;
8cd0a023 1589 parent_elm->internal.subtree_offset = new_node->node_offset;
a56cb012 1590 parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid;
76376933 1591 ++ondisk->count;
10a5d1ba 1592 hammer_modify_node_done(parent);
b3bad96f 1593 hammer_cursor_inserted_element(parent, parent_index + 1);
427e5fc6 1594
7f7c1f84
MD
1595 /*
1596 * The children of new_node need their parent pointer set to new_node.
b33e2cc0
MD
1597 * The children have already been locked by
1598 * hammer_btree_lock_children().
7f7c1f84
MD
1599 */
1600 for (i = 0; i < new_node->ondisk->count; ++i) {
1601 elm = &new_node->ondisk->elms[i];
90f96c37 1602 error = btree_set_parent_of_child(cursor->trans, new_node, elm);
7f7c1f84 1603 if (error) {
903fdd05 1604 hpanic("btree-fixup problem");
7f7c1f84
MD
1605 }
1606 }
10a5d1ba 1607 hammer_modify_node_done(new_node);
7f7c1f84 1608
427e5fc6 1609 /*
47197d71 1610 * The filesystem's root B-Tree pointer may have to be updated.
427e5fc6
MD
1611 */
1612 if (made_root) {
47197d71
MD
1613 hammer_volume_t volume;
1614
1615 volume = hammer_get_root_volume(hmp, &error);
1616 KKASSERT(error == 0);
1617
e8599db1
MD
1618 hammer_modify_volume_field(cursor->trans, volume,
1619 vol0_btree_root);
47197d71 1620 volume->ondisk->vol0_btree_root = parent->node_offset;
10a5d1ba 1621 hammer_modify_volume_done(volume);
8cd0a023
MD
1622 node->ondisk->parent = parent->node_offset;
1623 if (cursor->parent) {
1624 hammer_unlock(&cursor->parent->lock);
1625 hammer_rel_node(cursor->parent);
1626 }
1627 cursor->parent = parent; /* lock'd and ref'd */
47197d71 1628 hammer_rel_volume(volume, 0);
427e5fc6 1629 }
10a5d1ba 1630 hammer_modify_node_done(node);
427e5fc6
MD
1631
1632 /*
1633 * Ok, now adjust the cursor depending on which element the original
1634 * index was pointing at. If we are >= the split point the push node
1635 * is now in the new node.
1636 *
1637 * NOTE: If we are at the split point itself we cannot stay with the
1638 * original node because the push index will point at the right-hand
1639 * boundary, which is illegal.
8cd0a023
MD
1640 *
1641 * NOTE: The cursor's parent or parent_index must be adjusted for
1642 * the case where a new parent (new root) was created, and the case
1643 * where the cursor is now pointing at the split node.
427e5fc6
MD
1644 */
1645 if (cursor->index >= split) {
8cd0a023 1646 cursor->parent_index = parent_index + 1;
427e5fc6 1647 cursor->index -= split;
8cd0a023
MD
1648 hammer_unlock(&cursor->node->lock);
1649 hammer_rel_node(cursor->node);
1650 cursor->node = new_node; /* locked and ref'd */
1651 } else {
1652 cursor->parent_index = parent_index;
1653 hammer_unlock(&new_node->lock);
1654 hammer_rel_node(new_node);
427e5fc6 1655 }
76376933
MD
1656
1657 /*
1658 * Fixup left and right bounds
1659 */
1660 parent_elm = &parent->ondisk->elms[cursor->parent_index];
fbc6e32a
MD
1661 cursor->left_bound = &parent_elm[0].internal.base;
1662 cursor->right_bound = &parent_elm[1].internal.base;
b3deaf57
MD
1663 KKASSERT(hammer_btree_cmp(cursor->left_bound,
1664 &cursor->node->ondisk->elms[0].internal.base) <= 0);
1665 KKASSERT(hammer_btree_cmp(cursor->right_bound,
9944ae54 1666 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);
76376933 1667
6a37e7e4 1668done:
24cf83d2 1669 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
6a37e7e4
MD
1670 hammer_cursor_downgrade(cursor);
1671 return (error);
427e5fc6
MD
1672}
1673
1674/*
1675 * Same as the above, but splits a full leaf node.
1676 */
1677static
1678int
8cd0a023 1679btree_split_leaf(hammer_cursor_t cursor)
427e5fc6 1680{
8cd0a023
MD
1681 hammer_node_ondisk_t ondisk;
1682 hammer_node_t parent;
1683 hammer_node_t leaf;
47197d71 1684 hammer_mount_t hmp;
8cd0a023
MD
1685 hammer_node_t new_leaf;
1686 hammer_btree_elm_t elm;
1687 hammer_btree_elm_t parent_elm;
b3deaf57 1688 hammer_base_elm_t mid_boundary;
427e5fc6
MD
1689 int parent_index;
1690 int made_root;
1691 int split;
1692 int error;
8cd0a023 1693 const size_t esize = sizeof(*elm);
427e5fc6 1694
6a37e7e4
MD
1695 if ((error = hammer_cursor_upgrade(cursor)) != 0)
1696 return(error);
cb51be26 1697 ++hammer_stats_btree_splits;
6a37e7e4 1698
36f82b23
MD
1699 KKASSERT(hammer_btree_cmp(cursor->left_bound,
1700 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1701 KKASSERT(hammer_btree_cmp(cursor->right_bound,
1702 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1703
745703c7 1704 /*
fa2b9a03
MD
1705 * Calculate the split point. If the insertion point is at the
1706 * end of the leaf, we adjust the split point significantly to the
1707 * right to try to optimize node fill, and flag the leaf. If we hit
1708 * that same leaf again our heuristic has failed and we don't try
1709 * to optimize node fill (it could lead to a degenerate case).
427e5fc6 1710 */
8cd0a023
MD
1711 leaf = cursor->node;
1712 ondisk = leaf->ondisk;
fa2b9a03
MD
1713 KKASSERT(ondisk->count > 4);
1714 if (cursor->index == ondisk->count &&
1715 (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) {
1716 split = (ondisk->count + 1) * 3 / 4;
1717 leaf->flags |= HAMMER_NODE_NONLINEAR;
1718 } else {
1719 split = (ondisk->count + 1) / 2;
1720 }
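	/*
	 * Illustrative numbers (assuming, say, a full 63-element leaf):
	 * an append-style insertion with HAMMER_NODE_NONLINEAR clear
	 * gives split == (63 + 1) * 3 / 4 == 48, so only 15 elements
	 * move to the new right-hand leaf and sequential insertions keep
	 * filling mostly-empty nodes. The normal case is the even split
	 * (63 + 1) / 2 == 32.
	 */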
1721
1722#if 0
1723 /*
1724 * If the insertion point is at the split point, shift the
1725 * split point left so we don't have to worry about the insertion point landing on it.
1726 */
1727 if (cursor->index == split)
427e5fc6 1728 --split;
fa2b9a03
MD
1729#endif
1730 KKASSERT(split > 0 && split < ondisk->count);
1731
427e5fc6 1732 error = 0;
40043e7f 1733 hmp = leaf->hmp;
427e5fc6 1734
fe7678ee 1735 elm = &ondisk->elms[split];
fe7678ee 1736
36f82b23
MD
1737 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
1738 KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
1739 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
1740 KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);
1741
427e5fc6
MD
1742 /*
1743 * If we are at the root of the tree, create a new root node with
1744 * 1 element and split normally. Avoid making major modifications
1745 * until we know the whole operation will work.
1746 */
8cd0a023 1747 if (ondisk->parent == 0) {
b4f86ea3 1748 parent = hammer_alloc_btree(cursor->trans, 0, &error);
427e5fc6 1749 if (parent == NULL)
6a37e7e4 1750 goto done;
8cd0a023 1751 hammer_lock_ex(&parent->lock);
36f82b23 1752 hammer_modify_node_noundo(cursor->trans, parent);
8cd0a023
MD
1753 ondisk = parent->ondisk;
1754 ondisk->count = 1;
1755 ondisk->parent = 0;
a56cb012 1756 ondisk->mirror_tid = leaf->ondisk->mirror_tid;
8cd0a023 1757 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
47197d71 1758 ondisk->elms[0].base = hmp->root_btree_beg;
fe7678ee 1759 ondisk->elms[0].base.btype = leaf->ondisk->type;
8cd0a023 1760 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
f0b930f3 1761 ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
47197d71 1762 ondisk->elms[1].base = hmp->root_btree_end;
10a5d1ba 1763 hammer_modify_node_done(parent);
427e5fc6 1764 made_root = 1;
8cd0a023 1765 parent_index = 0; /* insertion point in parent */
427e5fc6
MD
1766 } else {
1767 made_root = 0;
8cd0a023
MD
1768 parent = cursor->parent;
1769 parent_index = cursor->parent_index;
427e5fc6 1770 }
427e5fc6
MD
1771
1772 /*
1773 * Split leaf into new_leaf at the split point. Select a separator
1774 * value in between the two leaves but with a bias towards the right
1775 * leaf since comparisons use an 'elm >= separator' inequality.
1776 *
1777 * L L L L L L L L
1778 *
1779 * x x P x x
745703c7 1780 * s S S s
427e5fc6
MD
1781 * / \
1782 * L L L L L L L L
1783 */
b4f86ea3 1784 new_leaf = hammer_alloc_btree(cursor->trans, 0, &error);
427e5fc6 1785 if (new_leaf == NULL) {
8cd0a023
MD
1786 if (made_root) {
1787 hammer_unlock(&parent->lock);
36f82b23 1788 hammer_delete_node(cursor->trans, parent);
8cd0a023
MD
1789 hammer_rel_node(parent);
1790 }
6a37e7e4 1791 goto done;
427e5fc6 1792 }
8cd0a023 1793 hammer_lock_ex(&new_leaf->lock);
427e5fc6
MD
1794
1795 /*
745703c7 1796 * Create the new node and copy the leaf elements from the split
36f82b23 1797 * point on to the new node.
427e5fc6 1798 */
36f82b23
MD
1799 hammer_modify_node_all(cursor->trans, leaf);
1800 hammer_modify_node_noundo(cursor->trans, new_leaf);
8cd0a023
MD
1801 ondisk = leaf->ondisk;
1802 elm = &ondisk->elms[split];
1803 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1804 new_leaf->ondisk->count = ondisk->count - split;
1805 new_leaf->ondisk->parent = parent->node_offset;
1806 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
a56cb012 1807 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid;
8cd0a023 1808 KKASSERT(ondisk->type == new_leaf->ondisk->type);
10a5d1ba 1809 hammer_modify_node_done(new_leaf);
b3bad96f 1810 hammer_cursor_split_node(leaf, new_leaf, split);
427e5fc6
MD
1811
1812 /*
8cd0a023
MD
1813 * Clean up the original node. Because this is a leaf node and
1814 * leaf nodes do not have a right-hand boundary, there
c0ade690
MD
1815 * aren't any special edge cases to clean up. We just fix up the
1816 * count.
427e5fc6 1817 */
c0ade690 1818 ondisk->count = split;
427e5fc6
MD
1819
1820 /*
1821 * Insert the separator into the parent, fixup the parent's
1822 * reference to the original node, and reference the new node.
1823 * The separator is P.
1824 *
3b7d788e 1825 * Remember that ondisk->count does not include the right-hand boundary.
427e5fc6
MD
1826 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1827 */
36f82b23 1828 hammer_modify_node_all(cursor->trans, parent);
8cd0a023 1829 ondisk = parent->ondisk;
36f82b23 1830 KKASSERT(split != 0);
d26d0ae9 1831 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
8cd0a023 1832 parent_elm = &ondisk->elms[parent_index+1];
d26d0ae9
MD
1833 bcopy(parent_elm, parent_elm + 1,
1834 (ondisk->count - parent_index) * esize);
eaeff70d 1835
ae744f81
TK
1836 /*
1837 * elm[-1] is the right-most elm in the original node.
1838 * elm[0] equals the left-most elm at index=0 in the new node.
1839 * parent_elm[-1] and parent_elm point to original and new node.
1840 * Update the parent_elm base to meet >elm[-1] and <=elm[0].
1841 */
47197d71 1842 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
fe7678ee 1843 parent_elm->internal.base.btype = new_leaf->ondisk->type;
8cd0a023 1844 parent_elm->internal.subtree_offset = new_leaf->node_offset;
a56cb012 1845 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid;
b3deaf57 1846 mid_boundary = &parent_elm->base;
76376933 1847 ++ondisk->count;
10a5d1ba 1848 hammer_modify_node_done(parent);
b3bad96f 1849 hammer_cursor_inserted_element(parent, parent_index + 1);
427e5fc6 1850
fe7678ee 1851 /*
47197d71 1852 * The filesystem's root B-Tree pointer may have to be updated.
427e5fc6
MD
1853 */
1854 if (made_root) {
47197d71
MD
1855 hammer_volume_t volume;
1856
1857 volume = hammer_get_root_volume(hmp, &error);
1858 KKASSERT(error == 0);
1859
e8599db1
MD
1860 hammer_modify_volume_field(cursor->trans, volume,
1861 vol0_btree_root);
47197d71 1862 volume->ondisk->vol0_btree_root = parent->node_offset;
10a5d1ba 1863 hammer_modify_volume_done(volume);
8cd0a023
MD
1864 leaf->ondisk->parent = parent->node_offset;
1865 if (cursor->parent) {
1866 hammer_unlock(&cursor->parent->lock);
1867 hammer_rel_node(cursor->parent);
1868 }
1869 cursor->parent = parent; /* lock'd and ref'd */
47197d71 1870 hammer_rel_volume(volume, 0);
427e5fc6 1871 }
10a5d1ba 1872 hammer_modify_node_done(leaf);
8cd0a023 1873
427e5fc6
MD
1874 /*
1875 * Ok, now adjust the cursor depending on which element the original
1876 * index was pointing at. If we are >= the split point the push node
1877 * is now in the new node.
1878 *
b3deaf57
MD
1879 * NOTE: If we are at the split point itself we need to select the
1880 * old or new node based on where key_beg's insertion point will be.
1881 * If we pick the wrong side the inserted element will wind up in
1882 * the wrong leaf node and outside that node's bounds.
427e5fc6 1883 */
b3deaf57
MD
1884 if (cursor->index > split ||
1885 (cursor->index == split &&
1886 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) {
8cd0a023 1887 cursor->parent_index = parent_index + 1;
427e5fc6 1888 cursor->index -= split;
8cd0a023
MD
1889 hammer_unlock(&cursor->node->lock);
1890 hammer_rel_node(cursor->node);
1891 cursor->node = new_leaf;
1892 } else {
1893 cursor->parent_index = parent_index;
1894 hammer_unlock(&new_leaf->lock);
1895 hammer_rel_node(new_leaf);
427e5fc6 1896 }
76376933
MD
1897
1898 /*
1899 * Fixup left and right bounds
1900 */
1901 parent_elm = &parent->ondisk->elms[cursor->parent_index];
fbc6e32a
MD
1902 cursor->left_bound = &parent_elm[0].internal.base;
1903 cursor->right_bound = &parent_elm[1].internal.base;
eaeff70d
MD
1904
1905 /*
47197d71 1906 * Assert that the bounds are correct.
eaeff70d 1907 */
b3deaf57
MD
1908 KKASSERT(hammer_btree_cmp(cursor->left_bound,
1909 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1910 KKASSERT(hammer_btree_cmp(cursor->right_bound,
47197d71 1911 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
36f82b23
MD
1912 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0);
1913 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0);
76376933 1914
6a37e7e4
MD
1915done:
1916 hammer_cursor_downgrade(cursor);
1917 return (error);
427e5fc6
MD
1918}
1919
adf01747
MD
1920#if 0
1921
32c90105
MD
1922/*
1923 * Recursively correct the right-hand boundary's create_tid to (tid) as
1924 * long as the rest of the key matches. We have to recurse upward in
1925 * the tree as well as down the left side of each parent's right node.
1926 *
1927 * Return EDEADLK if we were only partially successful, forcing the caller
1928 * to try again. The original cursor is not modified. This routine can
1929 * also fail with EDEADLK if it is forced to throw away a portion of its
1930 * record history.
1931 *
1932 * The caller must pass a downgraded cursor to us (otherwise we can't dup it).
1933 */
1934struct hammer_rhb {
1935 TAILQ_ENTRY(hammer_rhb) entry;
1936 hammer_node_t node;
1937 int index;
1938};
1939
1940TAILQ_HEAD(hammer_rhb_list, hammer_rhb);
1941
1942int
1943hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid)
1944{
ba2be8e9 1945 hammer_mount_t hmp;
32c90105
MD
1946 struct hammer_rhb_list rhb_list;
1947 hammer_base_elm_t elm;
1948 hammer_node_t orig_node;
1949 struct hammer_rhb *rhb;
1950 int orig_index;
1951 int error;
1952
1953 TAILQ_INIT(&rhb_list);
bac808fe 1954 hmp = cursor->trans->hmp;
32c90105
MD
1955
1956 /*
1957 * Save our position so we can restore it on return. This also
1958 * gives us a stable 'elm'.
1959 */
1960 orig_node = cursor->node;
1961 hammer_ref_node(orig_node);
1962 hammer_lock_sh(&orig_node->lock);
1963 orig_index = cursor->index;
1964 elm = &orig_node->ondisk->elms[orig_index].base;
1965
1966 /*
1967 * Now build a list of parents going up, allocating a rhb
1968 * structure for each one.
1969 */
1970 while (cursor->parent) {
1971 /*
1972 * Stop if we no longer have any right-bounds to fix up
1973 */
1974 if (elm->obj_id != cursor->right_bound->obj_id ||
1975 elm->rec_type != cursor->right_bound->rec_type ||
1976 elm->key != cursor->right_bound->key) {
1977 break;
1978 }
1979
1980 /*
1981 * Stop if the right-hand bound's create_tid does not
47197d71 1982 * need to be corrected.
32c90105
MD
1983 */
1984 if (cursor->right_bound->create_tid >= tid)
1985 break;
1986
bac808fe 1987 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
32c90105
MD
1988 rhb->node = cursor->parent;
1989 rhb->index = cursor->parent_index;
1990 hammer_ref_node(rhb->node);
1991 hammer_lock_sh(&rhb->node->lock);
1992 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
1993
1994 hammer_cursor_up(cursor);
1995 }
1996
1997 /*
1998 * Now safely adjust the right-hand bound for each rhb. This may
1999 * also require taking the right side of the tree and iterating down
2000 * ITS left side.
2001 */
2002 error = 0;
2003 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2004 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
32c90105
MD
2005 if (error)
2006 break;
2007 TAILQ_REMOVE(&rhb_list, rhb, entry);
2008 hammer_unlock(&rhb->node->lock);
2009 hammer_rel_node(rhb->node);
bac808fe 2010 kfree(rhb, hmp->m_misc);
32c90105
MD
2011
2012 switch (cursor->node->ondisk->type) {
2013 case HAMMER_BTREE_TYPE_INTERNAL:
2014 /*
2015 * The right boundary for the parent at an internal node
2016 * is one element to the right of the element whose
2017 * right boundary needs adjusting. We must then
2018 * traverse down the left side correcting any left
2019 * bounds (which may now be too far to the left).
2020 */
2021 ++cursor->index;
2022 error = hammer_btree_correct_lhb(cursor, tid);
2023 break;
32c90105 2024 default:
903fdd05 2025 hpanic("Bad node type");
32c90105
MD
2026 error = EINVAL;
2027 break;
2028 }
2029 }
2030
2031 /*
2032 * Cleanup
2033 */
2034 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2035 TAILQ_REMOVE(&rhb_list, rhb, entry);
2036 hammer_unlock(&rhb->node->lock);
2037 hammer_rel_node(rhb->node);
bac808fe 2038 kfree(rhb, hmp->m_misc);
32c90105
MD
2039 }
2040 error = hammer_cursor_seek(cursor, orig_node, orig_index);
2041 hammer_unlock(&orig_node->lock);
2042 hammer_rel_node(orig_node);
2043 return (error);
2044}
2045
2046/*
2047 * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand
2048 * bound going downward starting at the current cursor position.
2049 *
2050 * This function does not restore the cursor after use.
2051 */
2052int
2053hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
2054{
2055 struct hammer_rhb_list rhb_list;
2056 hammer_base_elm_t elm;
2057 hammer_base_elm_t cmp;
2058 struct hammer_rhb *rhb;
ba2be8e9 2059 hammer_mount_t hmp;
32c90105
MD
2060 int error;
2061
2062 TAILQ_INIT(&rhb_list);
bac808fe 2063 hmp = cursor->trans->hmp;
32c90105
MD
2064
2065 cmp = &cursor->node->ondisk->elms[cursor->index].base;
2066
2067 /*
2068 * Record the node and traverse down the left-hand side for all
2069 * matching records needing a boundary correction.
2070 */
2071 error = 0;
2072 for (;;) {
bac808fe 2073 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
32c90105
MD
2074 rhb->node = cursor->node;
2075 rhb->index = cursor->index;
2076 hammer_ref_node(rhb->node);
2077 hammer_lock_sh(&rhb->node->lock);
2078 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
2079
2080 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2081 /*
2082 * Nothing to traverse down if we are at the right
2083 * boundary of an internal node.
2084 */
2085 if (cursor->index == cursor->node->ondisk->count)
2086 break;
2087 } else {
2088 elm = &cursor->node->ondisk->elms[cursor->index].base;
2089 if (elm->btype == HAMMER_BTREE_TYPE_RECORD)
2090 break;
5134aacd 2091 hpanic("Illegal leaf record type %02x", elm->btype);
32c90105
MD
2092 }
2093 error = hammer_cursor_down(cursor);
2094 if (error)
2095 break;
2096
2097 elm = &cursor->node->ondisk->elms[cursor->index].base;
2098 if (elm->obj_id != cmp->obj_id ||
2099 elm->rec_type != cmp->rec_type ||
2100 elm->key != cmp->key) {
2101 break;
2102 }
2103 if (elm->create_tid >= tid)
2104 break;
2105
2106 }
2107
2108 /*
2109 * Now we can safely adjust the left-hand boundary from the bottom-up.
2110 * The last element we remove from the list is the caller's right hand
2111 * boundary, which must also be adjusted.
2112 */
2113 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2114 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
2115 if (error)
2116 break;
2117 TAILQ_REMOVE(&rhb_list, rhb, entry);
2118 hammer_unlock(&rhb->node->lock);
2119 hammer_rel_node(rhb->node);
bac808fe 2120 kfree(rhb, hmp->m_misc);
32c90105
MD
2121
2122 elm = &cursor->node->ondisk->elms[cursor->index].base;
2123 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
36f82b23 2124 hammer_modify_node(cursor->trans, cursor->node,
19619882
MD
2125 &elm->create_tid,
2126 sizeof(elm->create_tid));
32c90105 2127 elm->create_tid = tid;
10a5d1ba 2128 hammer_modify_node_done(cursor->node);
32c90105 2129 } else {
903fdd05 2130 hpanic("Bad element type");
32c90105
MD
2131 }
2132 }
2133
2134 /*
2135 * Cleanup
2136 */
2137 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2138 TAILQ_REMOVE(&rhb_list, rhb, entry);
2139 hammer_unlock(&rhb->node->lock);
2140 hammer_rel_node(rhb->node);
bac808fe 2141 kfree(rhb, hmp->m_misc);
32c90105
MD
2142 }
2143 return (error);
2144}
2145
adf01747
MD
2146#endif
2147
427e5fc6 2148/*
f36a9737
MD
2149 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at
2150 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete
2151 * the operation due to a deadlock, or some other error.
8cd0a023 2152 *
5c8d05e2
MD
2153 * This routine is initially called with an empty leaf and may be
2154 * recursively called with single-element internal nodes.
b3deaf57 2155 *
c82af904
MD
2156 * It should also be noted that when removing empty leaves we must be sure
2157 * to test and update mirror_tid because another thread may have deadlocked
5fa5c92f 2158 * against us (or someone else) while trying to propagate it up and cannot retry once
c82af904
MD
2159 * the node has been deleted.
2160 *
f36a9737
MD
2161 * On return the cursor may end up pointing to an internal node, suitable
2162 * for further iteration but not for an immediate insertion or deletion.
8cd0a023 2163 */
f36a9737 2164static int
03b6feea 2165btree_remove(hammer_cursor_t cursor, int *ndelete)
8cd0a023
MD
2166{
2167 hammer_node_ondisk_t ondisk;
195c19a1 2168 hammer_btree_elm_t elm;
195c19a1 2169 hammer_node_t node;
8cd0a023 2170 hammer_node_t parent;
fe7678ee 2171 const int esize = sizeof(*elm);
8cd0a023 2172 int error;
8cd0a023 2173
fe7678ee
MD
2174 node = cursor->node;
2175
47197d71
MD
2176 /*
2177 * When deleting the root of the filesystem convert it to
2178 * an empty leaf node. Internal nodes cannot be empty.
2179 */
c82af904
MD
2180 ondisk = node->ondisk;
2181 if (ondisk->parent == 0) {
f36a9737 2182 KKASSERT(cursor->parent == NULL);
36f82b23 2183 hammer_modify_node_all(cursor->trans, node);
c82af904 2184 KKASSERT(ondisk == node->ondisk);
195c19a1
MD
2185 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
2186 ondisk->count = 0;
10a5d1ba 2187 hammer_modify_node_done(node);
b3deaf57 2188 cursor->index = 0;
47197d71 2189 return(0);
8cd0a023
MD
2190 }
2191
c82af904
MD
2192 parent = cursor->parent;
2193
8cd0a023 2194 /*
f36a9737 2195 * Attempt to remove the parent's reference to the child. If the
745703c7 2196 * parent would become empty we have to recurse. If we fail we
f36a9737 2197 * leave the parent pointing to an empty leaf node.
5c8d05e2
MD
2198 *
2199 * We have to recurse successfully before we can delete the internal
2200 * node as it is illegal to have empty internal nodes. Even though
2201 * the operation may be aborted we must still fixup any unlocked
2202 * cursors as if we had deleted the element prior to recursing
2203 * (by calling hammer_cursor_deleted_element()) so those cursors
2204 * are properly forced up the chain by the recursion.
8cd0a023 2205 */
f36a9737
MD
2206 if (parent->ondisk->count == 1) {
2207 /*
2208 * This special cursor_up_locked() call leaves the original
2209 * node exclusively locked and referenced, leaves the
2210 * original parent locked (as the new node), and locks the
2211 * new parent. It can return EDEADLK.
f3a4893b
MD
2212 *
2213 * We cannot call hammer_cursor_removed_node() until we are
2214 * actually able to remove the node. If we did then tracked
2215 * cursors in the middle of iterations could be repointed
2216 * to a parent node. If this occurs they could end up
2217 * scanning newly inserted records into the node (that could
2218 * not be deleted) when they push down again.
2219 *
2220 * Due to the way the recursion works the final parent is left
2221 * in cursor->parent after the recursion returns. Each
2222 * layer on the way back up is thus able to call
2223 * hammer_cursor_removed_node() and 'jump' the node up to
2224 * the (same) final parent.
2225 *
2226 * NOTE! The local variable 'parent' is invalid after we
2227 * call hammer_cursor_up_locked().
f36a9737
MD
2228 */
2229 error = hammer_cursor_up_locked(cursor);
f3a4893b
MD
2230 parent = NULL;
2231
f36a9737 2232 if (error == 0) {
5c8d05e2 2233 hammer_cursor_deleted_element(cursor->node, 0);
03b6feea 2234 error = btree_remove(cursor, ndelete);
f36a9737 2235 if (error == 0) {
901ba05c 2236 KKASSERT(node != cursor->node);
f3a4893b 2237 hammer_cursor_removed_node(
653fa4cd 2238 node, cursor->node, cursor->index);
f36a9737
MD
2239 hammer_modify_node_all(cursor->trans, node);
2240 ondisk = node->ondisk;
2241 ondisk->type = HAMMER_BTREE_TYPE_DELETED;
2242 ondisk->count = 0;
2243 hammer_modify_node_done(node);
250aec18 2244 hammer_flush_node(node, 0);
f36a9737 2245 hammer_delete_node(cursor->trans, node);
03b6feea
TK
2246 if (ndelete)
2247 (*ndelete)++;
f36a9737 2248 } else {
3214ade6
MD
2249 /*
2250 * Defer parent removal because we could not
2251 * get the lock; just let the leaf remain
2252 * empty.
2253 */
8260ce22
TK
2254 /*
2255 * hammer show doesn't consider this an error.
2256 */
f36a9737
MD
2257 }
2258 hammer_unlock(&node->lock);
2259 hammer_rel_node(node);
2260 } else {
3214ade6
MD
2261 /*
2262 * Defer parent removal because we could not
2263 * get the lock; just let the leaf remain
2264 * empty.
2265 */
8260ce22
TK
2266 /*
2267 * hammer show doesn't consider this an error.
2268 */
f36a9737
MD
2269 }
2270 } else {
2271 KKASSERT(parent->ondisk->count > 1);
6a37e7e4 2272
f36a9737
MD
2273 hammer_modify_node_all(cursor->trans, parent);
2274 ondisk = parent->ondisk;
2275 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
c82af904 2276
f36a9737
MD
2277 elm = &ondisk->elms[cursor->parent_index];
2278 KKASSERT(elm->internal.subtree_offset == node->node_offset);
2279 KKASSERT(ondisk->count > 0);
4c038e17
MD
2280
2281 /*
2282 * We must retain the highest mirror_tid. The deleted
2283 * range is now encompassed by the element to the left.
2284 * If we are already at the left edge the new left edge
2285 * inherits mirror_tid.
2286 *
2287 * Note that the bounds of our parent's parent may create
2288 * a gap to the left of our left-most node or to the right
2289 * of our right-most node. The gap is silently included
2290 * in the mirror_tid's area of effect from the point of view
2291 * of the scan.
2292 */
2293 if (cursor->parent_index) {
2294 if (elm[-1].internal.mirror_tid <
2295 elm[0].internal.mirror_tid) {
2296 elm[-1].internal.mirror_tid =
2297 elm[0].internal.mirror_tid;
2298 }
2299 } else {
2300 if (elm[1].internal.mirror_tid <
2301 elm[0].internal.mirror_tid) {
2302 elm[1].internal.mirror_tid =
2303 elm[0].internal.mirror_tid;
2304 }
2305 }
2306
2307 /*
f3a4893b
MD
2308 * Delete the subtree reference in the parent. Include
2309 * boundary element at end.
4c038e17 2310 */
f36a9737
MD
2311 bcopy(&elm[1], &elm[0],
2312 (ondisk->count - cursor->parent_index) * esize);
2313 --ondisk->count;
10a5d1ba 2314 hammer_modify_node_done(parent);
f3a4893b 2315 hammer_cursor_removed_node(node, parent, cursor->parent_index);
6c1f89f4 2316 hammer_cursor_deleted_element(parent, cursor->parent_index);
250aec18 2317 hammer_flush_node(node, 0);
f36a9737 2318 hammer_delete_node(cursor->trans, node);
6a37e7e4 2319
f36a9737
MD
2320 /*
2321 * cursor->node is invalid; cursor up to make the cursor
ec9b6294
MD
2322 * valid again. We have to flag the condition in case
2323 * another thread wiggles an insertion in during an
2324 * iteration.
f36a9737 2325 */
ec9b6294 2326 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
f36a9737 2327 error = hammer_cursor_up(cursor);
03b6feea
TK
2328 if (ndelete)
2329 (*ndelete)++;
6a37e7e4 2330 }
f36a9737 2331 return (error);
6a37e7e4
MD
2332}
2333
602c6cb8 2334/*
b077a45c 2335 * Propagate mirror_tid up the B-Tree starting at the current cursor.
c9ce54d6
MD
2336 *
2337 * WARNING! Because we push and pop the passed cursor, it may be
2338 * modified by other B-Tree operations while it is unlocked;
2339 * things like the node and leaf pointers and the indexes might
2340 * change.
602c6cb8
MD
2341 */
2342void
4c038e17 2343hammer_btree_do_propagation(hammer_cursor_t cursor,
602c6cb8
MD
2344 hammer_btree_leaf_elm_t leaf)
2345{
adf01747
MD
2346 hammer_cursor_t ncursor;
2347 hammer_tid_t mirror_tid;
f31f6d84 2348 int error __debugvar;
602c6cb8
MD
2349
2350 /*
732a1697
MD
2351 * We do not propagate a mirror_tid if the filesystem was mounted
2352 * in no-mirror mode.
602c6cb8 2353 */
732a1697 2354 if (cursor->trans->hmp->master_id < 0)
602c6cb8 2355 return;
602c6cb8 2356
adf01747
MD
2357 /*
2358 * This is a bit of a hack because we cannot deadlock or return
2359 * EDEADLK here. The related operation has already completed and
2360 * we must propagate the mirror_tid now regardless.
2361 *
2362 * Generate a new cursor which inherits the original's locks and
2363 * unlock the original. Use the new cursor to propagate the
2364 * mirror_tid. Then clean up the new cursor and reacquire locks
2365 * on the original.
2366 *
2367 * hammer_dup_cursor() cannot dup locks. The dup inherits the
2368 * original's locks and the original is tracked and must be
2369 * re-locked.
2370 */
2371 mirror_tid = cursor->node->ondisk->mirror_tid;
a56cb012 2372 KKASSERT(mirror_tid != 0);
3f43fb33 2373 ncursor = hammer_push_cursor(cursor);
adf01747
MD
2374 error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
2375 KKASSERT(error == 0);
3f43fb33 2376 hammer_pop_cursor(cursor, ncursor);
c9ce54d6 2377 /* WARNING: cursor's leaf pointer may change after pop */
602c6cb8
MD
2378}
2379
2380
c82af904
MD
2381/*
2382 * Propagate a mirror TID update upwards through the B-Tree to the root.
2383 *
2384 * A locked internal node must be passed in. The node will remain locked
2385 * on return.
2386 *
2387 * This function syncs mirror_tid at the specified internal node's element,
2388 * adjusts the node's aggregation mirror_tid, and then recurses upwards.
2389 */
602c6cb8 2390static int
adf01747 2391hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid)
c82af904
MD
2392{
2393 hammer_btree_internal_elm_t elm;
adf01747 2394 hammer_node_t node;
c82af904
MD
2395 int error;
2396
adf01747
MD
2397 for (;;) {
2398 error = hammer_cursor_up(cursor);
2399 if (error == 0)
2400 error = hammer_cursor_upgrade(cursor);
6dc17446
MD
2401
2402 /*
2403 * We can ignore HAMMER_CURSOR_ITERATE_CHECK; the
2404 * cursor will still be properly positioned for
2405 * mirror propagation, just not for iterations.
2406 */
adf01747
MD
2407 while (error == EDEADLK) {
2408 hammer_recover_cursor(cursor);
2409 error = hammer_cursor_upgrade(cursor);
2410 }
2411 if (error)
2412 break;
95885f20
MD
2413
2414 /*
2415 * If the cursor deadlocked it could end up at a leaf
2416 * after we lost the lock.
2417 */
adf01747 2418 node = cursor->node;
95885f20
MD
2419 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL)
2420 continue;
c82af904 2421
adf01747
MD
2422 /*
2423 * Adjust the node's element
2424 */
2425 elm = &node->ondisk->elms[cursor->index].internal;
2426 if (elm->mirror_tid >= mirror_tid)
2427 break;
2428 hammer_modify_node(cursor->trans, node, &elm->mirror_tid,
2429 sizeof(elm->mirror_tid));
2430 elm->mirror_tid = mirror_tid;
2431 hammer_modify_node_done(node);
02325004 2432 if (hammer_debug_general & 0x0002) {
35a5249b
TK
2433 hdkprintf("propagate %016jx @%016jx:%d\n",
2434 (intmax_t)mirror_tid,
2435 (intmax_t)node->node_offset,
973c11b9 2436 cursor->index);
02325004
MD
2437 }
2438
c82af904 2439
adf01747
MD
2440 /*
2441 * Adjust the node's mirror_tid aggregator
2442 */
2443 if (node->ondisk->mirror_tid >= mirror_tid)
2444 return(0);
2445 hammer_modify_node_field(cursor->trans, node, mirror_tid);
2446 node->ondisk->mirror_tid = mirror_tid;
2447 hammer_modify_node_done(node);
02325004 2448 if (hammer_debug_general & 0x0002) {
35a5249b
TK
2449 hdkprintf("propagate %016jx @%016jx\n",
2450 (intmax_t)mirror_tid,
2451 (intmax_t)node->node_offset);
02325004 2452 }
c82af904 2453 }
adf01747
MD
2454 if (error == ENOENT)
2455 error = 0;
c82af904
MD
2456 return(error);
2457}
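/*
 * A sketch of the invariant the loop above is intended to maintain
 * (phrased here for clarity, not quoted from the original author):
 *
 *	elm->internal.mirror_tid >= the pointed-to child's ondisk->mirror_tid
 *	node->ondisk->mirror_tid >= every elm->internal.mirror_tid in node
 *
 * which lets a mirroring scan prune entire subtrees by comparing its
 * target TID against the aggregated mirror_tid values on the way down.
 */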
2458
ae744f81
TK
2459/*
2460 * Return a pointer to node's parent. If there is no error,
2461 * *parent_index is set to the index of the parent's elm that points
2462 * to this node.
2463 */
c82af904 2464hammer_node_t
82010f9f
MD
2465hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node,
2466 int *parent_indexp, int *errorp, int try_exclusive)
c82af904
MD
2467{
2468 hammer_node_t parent;
2469 hammer_btree_elm_t elm;
2470 int i;
2471
2472 /*
2473 * Get the node
2474 */
82010f9f 2475 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp);
c82af904
MD
2476 if (*errorp) {
2477 KKASSERT(parent == NULL);
2478 return(NULL);
2479 }
2480 KKASSERT((parent->flags & HAMMER_NODE_DELETED) == 0);
2481
2482 /*
2483 * Lock the node
2484 */
2485 if (try_exclusive) {
2486 if (hammer_lock_ex_try(&parent->lock)) {
2487 hammer_rel_node(parent);
2488 *errorp = EDEADLK;
2489 return(NULL);
2490 }
2491 } else {
2492 hammer_lock_sh(&parent->lock);
2493 }
2494
2495 /*
2496 * Figure out which element in the parent is pointing to the
2497 * child.
2498 */
2499 if (node->ondisk->count) {
2500 i = hammer_btree_search_node(&node->ondisk->elms[0].base,
2501 parent->ondisk);
2502 } else {
2503 i = 0;
2504 }
2505 while (i < parent->ondisk->count) {
2506 elm = &parent->ondisk->elms[i];
2507 if (elm->internal.subtree_offset == node->node_offset)
2508 break;
2509 ++i;
2510 }
2511 if (i == parent->ondisk->count) {
2512 hammer_unlock(&parent->lock);
5134aacd 2513 hpanic("Bad B-Tree link: parent %p node %p", parent, node);
c82af904
MD
2514 }
2515 *parent_indexp = i;
2516 KKASSERT(*errorp == 0);
2517 return(parent);
2518}
2519
7f7c1f84 2520/*
fe7678ee
MD
2521 * The element (elm) has been moved to a new internal node (node).
2522 *
2523 * If the element represents a pointer to an internal node, that node's
2524 * parent must be adjusted to the element's new location.
2525 *
6a37e7e4 2526 * XXX deadlock potential here with our exclusive locks
7f7c1f84 2527 */
7f7c1f84 2528int
90f96c37 2529btree_set_parent_of_child(hammer_transaction_t trans, hammer_node_t node,
36f82b23 2530 hammer_btree_elm_t elm)
7f7c1f84 2531{
7f7c1f84
MD
2532 hammer_node_t child;
2533 int error;
2534
2535 error = 0;
2536
1424c922 2537 if (hammer_is_internal_node_elm(elm)) {
82010f9f 2538 child = hammer_get_node(trans, elm->internal.subtree_offset,
19619882 2539 0, &error);
7f7c1f84 2540 if (error == 0) {
c9b9e29d 2541 hammer_modify_node_field(trans, child, parent);
7f7c1f84 2542 child->ondisk->parent = node->node_offset;
10a5d1ba 2543 hammer_modify_node_done(child);
7f7c1f84
MD
2544 hammer_rel_node(child);
2545 }
7f7c1f84
MD
2546 }
2547 return(error);
2548}
2549
1775b6a0
MD
2550/*
2551 * Initialize the root of a recursive B-Tree node lock list structure.
2552 */
2553void
2554hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
2555{
2556 TAILQ_INIT(&parent->list);
2557 parent->parent = NULL;
2558 parent->node = node;
2559 parent->index = -1;
2560 parent->count = node->ondisk->count;
2561 parent->copy = NULL;
2562 parent->flags = 0;
2563}
2564
24cf83d2
MD
2565/*
2566 * Initialize a cache of hammer_node_lock's including space allocated
2567 * for node copies.
2568 *
2569 * This is used by the rebalancing code to preallocate the copy space
2570 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER
2571 * locks; otherwise we can blow out the pageout daemon's emergency
2572 * reserve and deadlock it.
2573 *
2574 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache.
2575 * The flag is set when the item is pulled off the cache for use.
2576 */
2577void
2578hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
2579 int depth)
2580{
2581 hammer_node_lock_t item;
2582 int count;
2583
2584 for (count = 1; depth; --depth)
2585 count *= HAMMER_BTREE_LEAF_ELMS;
2586 bzero(lcache, sizeof(*lcache));
2587 TAILQ_INIT(&lcache->list);
2588 while (count) {
2589 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO);
2590 item->copy = kmalloc(sizeof(*item->copy),
2591 hmp->m_misc, M_WAITOK);
2592 TAILQ_INIT(&item->list);
2593 TAILQ_INSERT_TAIL(&lcache->list, item, entry);
2594 --count;
2595 }
2596}
2597
2598void
2599hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache)
2600{
2601 hammer_node_lock_t item;
2602
2603 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) {
2604 TAILQ_REMOVE(&lcache->list, item, entry);
2605 KKASSERT(item->copy);
2606 KKASSERT(TAILQ_EMPTY(&item->list));
2607 kfree(item->copy, hmp->m_misc);
2608 kfree(item, hmp->m_misc);
2609 }
2610 KKASSERT(lcache->copy == NULL);
2611}
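/*
 * Sketch of how the lcache routines are typically paired with the node
 * locking and copy routines below (variable names are illustrative only):
 *
 *	struct hammer_node_lock lockroot, lcache;
 *
 *	hammer_btree_lcache_init(hmp, &lcache, 2);
 *	hammer_node_lock_init(&lockroot, cursor->node);
 *	error = hammer_btree_lock_children(cursor, 2, &lockroot, &lcache);
 *	if (error == 0) {
 *		hammer_btree_lock_copy(cursor, &lockroot);
 *		... modify the copies, setting HAMMER_NODE_LOCK_UPDATED ...
 *		hammer_btree_sync_copy(cursor, &lockroot);
 *	}
 *	hammer_btree_unlock_children(hmp, &lockroot, &lcache);
 *	hammer_btree_lcache_free(hmp, &lcache);
 */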
2612
b33e2cc0
MD
2613/*
2614 * Exclusively lock all the children of node. This is used by the split
2615 * code to prevent anyone from accessing the children of a cursor node
2616 * while we fix up its parent offset.
2617 *
2618 * If we don't lock the children we can really mess up cursors which block
2619 * trying to cursor-up into our node.
2620 *
b33e2cc0
MD
2621 * On failure EDEADLK (or some other error) is returned. If a deadlock
2622 * error is returned the cursor is adjusted to block on termination.
1775b6a0
MD
2623 *
2624 * The caller is responsible for managing parent->node; the root's node
2625 * is usually aliased from a cursor.
b33e2cc0
MD
2626 */
2627int
1775b6a0 2628hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
24cf83d2
MD
2629 hammer_node_lock_t parent,
2630 hammer_node_lock_t lcache)
b33e2cc0
MD
2631{
2632 hammer_node_t node;
1775b6a0 2633 hammer_node_lock_t item;
b33e2cc0
MD
2634 hammer_node_ondisk_t ondisk;
2635 hammer_btree_elm_t elm;
b33e2cc0 2636 hammer_node_t child;
ba2be8e9 2637 hammer_mount_t hmp;
b33e2cc0
MD
2638 int error;
2639 int i;
2640
1775b6a0 2641 node = parent->node;
b33e2cc0
MD
2642 ondisk = node->ondisk;
2643 error = 0;
bac808fe 2644 hmp = cursor->trans->hmp;
7bc5b8c2 2645
5cbc7d52
TK
2646 if (ondisk->type != HAMMER_BTREE_TYPE_INTERNAL)
2647 return(0); /* This could return non-zero */
2648
7bc5b8c2
MD
2649 /*
2650 * We really do not want to block on I/O with exclusive locks held;
1775b6a0
MD
2651 * pre-get the children before trying to lock the mess. This is
2652 * only done one-level deep for now.
7bc5b8c2
MD
2653 */
2654 for (i = 0; i < ondisk->count; ++i) {
cb51be26 2655 ++hammer_stats_btree_elements;
7bc5b8c2 2656 elm = &ondisk->elms[i];
5cbc7d52
TK
2657 child = hammer_get_node(cursor->trans,
2658 elm->internal.subtree_offset,
2659 0, &error);
2660 if (child)
2661 hammer_rel_node(child);
7bc5b8c2
MD
2662 }
2663
2664 /*
2665 * Do it for real
2666 */
b33e2cc0 2667 for (i = 0; error == 0 && i < ondisk->count; ++i) {
cb51be26 2668 ++hammer_stats_btree_elements;
b33e2cc0
MD
2669 elm = &ondisk->elms[i];
2670
5cbc7d52
TK
2671 KKASSERT(elm->internal.subtree_offset != 0);
2672 child = hammer_get_node(cursor->trans,
2673 elm->internal.subtree_offset,
2674 0, &error);
b33e2cc0
MD
2675 if (child) {
2676 if (hammer_lock_ex_try(&child->lock) != 0) {
2677 if (cursor->deadlk_node == NULL) {
1ff9f58e 2678 cursor->deadlk_node = child;
b33e2cc0
MD
2679 hammer_ref_node(cursor->deadlk_node);
2680 }
2681 error = EDEADLK;
a84a197d 2682 hammer_rel_node(child);
b33e2cc0 2683 } else {
24cf83d2
MD
2684 if (lcache) {
2685 item = TAILQ_FIRST(&lcache->list);
2686 KKASSERT(item != NULL);
2687 item->flags |= HAMMER_NODE_LOCK_LCACHE;
653fa4cd 2688 TAILQ_REMOVE(&lcache->list, item, entry);
24cf83d2
MD
2689 } else {
2690 item = kmalloc(sizeof(*item),
2691 hmp->m_misc,
2692 M_WAITOK|M_ZERO);
2693 TAILQ_INIT(&item->list);
2694 }
2695
1775b6a0 2696 TAILQ_INSERT_TAIL(&parent->list, item, entry);
1775b6a0 2697 item->parent = parent;
b33e2cc0 2698 item->node = child;
1775b6a0
MD
2699 item->index = i;
2700 item->count = child->ondisk->count;
2701
2702 /*
2703 * Recurse (used by the rebalancing code)
2704 */
2705 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) {
2706 error = hammer_btree_lock_children(
2707 cursor,
2708 depth - 1,
24cf83d2
MD
2709 item,
2710 lcache);
1775b6a0 2711 }
b33e2cc0
MD
2712 }
2713 }
2714 }
2715 if (error)
24cf83d2 2716 hammer_btree_unlock_children(hmp, parent, lcache);
b33e2cc0
MD
2717 return(error);
2718}
2719
1775b6a0
MD
2720/*
2721 * Create an in-memory copy of all B-Tree nodes listed, recursively,
2722 * including the parent.
2723 */
2724void
2725hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
2726{
2727 hammer_mount_t hmp = cursor->trans->hmp;
2728 hammer_node_lock_t item;
2729
2730 if (parent->copy == NULL) {
24cf83d2
MD
2731 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0);
2732 parent->copy = kmalloc(sizeof(*parent->copy),
2733 hmp->m_misc, M_WAITOK);
1775b6a0 2734 }
24cf83d2
MD
2735 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0);
2736 *parent->copy = *parent->node->ondisk;
1775b6a0
MD
2737 TAILQ_FOREACH(item, &parent->list, entry) {
2738 hammer_btree_lock_copy(cursor, item);
2739 }
2740}
b33e2cc0
MD
2741
2742/*
1775b6a0 2743 * Recursively sync modified copies to the media.
b33e2cc0 2744 */
7ddc70d1 2745int
1775b6a0 2746hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
b33e2cc0 2747{
1775b6a0 2748 hammer_node_lock_t item;
7ddc70d1 2749 int count = 0;
1775b6a0
MD
2750
2751 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) {
7ddc70d1 2752 ++count;
1775b6a0
MD
2753 hammer_modify_node_all(cursor->trans, parent->node);
2754 *parent->node->ondisk = *parent->copy;
2755 hammer_modify_node_done(parent->node);
2756 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) {
250aec18 2757 hammer_flush_node(parent->node, 0);
1775b6a0
MD
2758 hammer_delete_node(cursor->trans, parent->node);
2759 }
2760 }
2761 TAILQ_FOREACH(item, &parent->list, entry) {
7ddc70d1 2762 count += hammer_btree_sync_copy(cursor, item);
1775b6a0 2763 }
7ddc70d1 2764 return(count);
1775b6a0 2765}
b33e2cc0 2766
1775b6a0
MD
2767/*
2768 * Release previously obtained node locks. The caller is responsible for
2769 * cleaning up parent->node itself (its usually just aliased from a cursor),
2770 * but this function will take care of the copies.
24cf83d2
MD
2771 *
2772 * NOTE: The root node is not placed in the lcache and node->copy is not
2773 * deallocated when lcache != NULL.
1775b6a0
MD
2774 */
2775void
24cf83d2
MD
2776hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
2777 hammer_node_lock_t lcache)
1775b6a0
MD
2778{
2779 hammer_node_lock_t item;
24cf83d2 2780 hammer_node_ondisk_t copy;
1775b6a0 2781
1775b6a0
MD
2782 while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
2783 TAILQ_REMOVE(&parent->list, item, entry);
24cf83d2 2784 hammer_btree_unlock_children(hmp, item, lcache);
b33e2cc0
MD
2785 hammer_unlock(&item->node->lock);
2786 hammer_rel_node(item->node);
24cf83d2
MD
2787 if (lcache) {
2788 /*
2789 * NOTE: When placing the item back in the lcache
2790 * the flag is cleared by the bzero().
2791 * Remaining fields are cleared as a safety
2792 * measure.
2793 */
2794 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
2795 KKASSERT(TAILQ_EMPTY(&item->list));
2796 copy = item->copy;
2797 bzero(item, sizeof(*item));
2798 TAILQ_INIT(&item->list);
2799 item->copy = copy;
2800 if (copy)
2801 bzero(copy, sizeof(*copy));
2802 TAILQ_INSERT_TAIL(&lcache->list, item, entry);
2803 } else {
2804 kfree(item, hmp->m_misc);
2805 }
2806 }
2807 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
2808 kfree(parent->copy, hmp->m_misc);
2809 parent->copy = NULL; /* safety */
b33e2cc0
MD
2810 }
2811}
2812
8cd0a023
MD
2813/************************************************************************
2814 * MISCELLANEOUS SUPPORT *
2815 ************************************************************************/
2816
2817/*
d26d0ae9 2818 * Compare two B-Tree elements and return -N, 0, or +N (similar to strcmp).
8cd0a023 2819 *
d113fda1 2820 * Note that for this particular function a return value of -1, 0, or +1
9582c7da 2821 * can denote a match if create_tid is otherwise discounted. A create_tid
d5530d22 2822 * of zero is considered to be 'infinity' in comparisons.
d113fda1 2823 *
8cd0a023 2824 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
8cd0a023
MD
2825 */
2826int
2827hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
2828{
2f85fa4d
MD
2829 if (key1->localization < key2->localization)
2830 return(-5);
2831 if (key1->localization > key2->localization)
2832 return(5);
2833
d26d0ae9
MD
2834 if (key1->obj_id < key2->obj_id)
2835 return(-4);
2836 if (key1->obj_id > key2->obj_id)
2837 return(4);
8cd0a023 2838
d26d0ae9
MD
2839 if (key1->rec_type < key2->rec_type)
2840 return(-3);
2841 if (key1->rec_type > key2->rec_type)
2842 return(3);
8cd0a023 2843
8cd0a023
MD
2844 if (key1->key < key2->key)
2845 return(-2);
2846 if (key1->key > key2->key)
2847 return(2);
d113fda1 2848
d5530d22 2849 /*
9582c7da
MD
2850 * A create_tid of zero indicates a record which is undeletable
2851 * and must be considered to have a value of positive infinity.
d5530d22 2852 */
9582c7da
MD
2853 if (key1->create_tid == 0) {
2854 if (key2->create_tid == 0)
d5530d22
MD
2855 return(0);
2856 return(1);
2857 }
9582c7da 2858 if (key2->create_tid == 0)
d5530d22 2859 return(-1);
9582c7da 2860 if (key1->create_tid < key2->create_tid)
d113fda1 2861 return(-1);
9582c7da 2862 if (key1->create_tid > key2->create_tid)
d113fda1 2863 return(1);
8cd0a023
MD
2864 return(0);
2865}
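/*
 * Callers generally only test the sign of the result, e.g. (sketch):
 *
 *	if (hammer_btree_cmp(&cursor->key_beg, &elm->base) > 0)
 *		... key_beg sorts strictly after elm ...
 *
 * The magnitude merely records which field differed first; a result of
 * -1, 0, or +1 can still denote a key match when create_tid is being
 * discounted, per the comment above.
 */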
2866
c0ade690 2867/*
d5530d22
MD
2868 * Test a timestamp against an element to determine whether the
2869 * element is visible. A timestamp of 0 means 'infinity'.
c0ade690
MD
2870 */
2871int
d5530d22 2872hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
c0ade690 2873{
d5530d22
MD
2874 if (asof == 0) {
2875 if (base->delete_tid)
2876 return(1);
2877 return(0);
2878 }
2879 if (asof < base->create_tid)
d26d0ae9 2880 return(-1);
d5530d22 2881 if (base->delete_tid && asof >= base->delete_tid)
d26d0ae9 2882 return(1);
c0ade690
MD
2883 return(0);
2884}
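/*
 * Illustrative values (not from the original source): for an element
 * with create_tid == 100 and delete_tid == 200,
 *
 *	asof  50 -> -1	(not yet created as of that TID)
 *	asof 150 ->  0	(visible)
 *	asof 250 -> +1	(already deleted)
 *	asof   0 -> +1	(asof is 'infinity' and the record was deleted)
 */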
2885
8cd0a023
MD
2886/*
2887 * Create a separator halfway between key1 and key2. For fields just
d5530d22
MD
2888 * one unit apart, the separator will match key2. key1 is on the left-hand
2889 * side and key2 is on the right-hand side.
8cd0a023 2890 *
9391cded 2891 * key2 must be >= the separator. It is ok for the separator to match key2.
36f82b23 2892 *
9391cded
MD
2893 * NOTE: Even if key1 does not match key2, the separator may wind up matching
2894 * key2.
2895 *
2896 * NOTE: It might be beneficial to just scrap this whole mess and just
2897 * set the separator to key2.
8cd0a023
MD
2898 */
2899#define MAKE_SEPARATOR(key1, key2, dest, field) \
2900 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
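/*
 * Worked example (illustrative numbers): for a field with key1 == 100
 * and key2 == 200 the separator becomes 100 + ((200 - 100 + 1) >> 1)
 * == 150; for adjacent values such as key1 == 5, key2 == 6 it becomes
 * 5 + ((1 + 1) >> 1) == 6, matching key2 as noted above.
 */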
2901
2902static void
2903hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
2904 hammer_base_elm_t dest)
2905{
2906 bzero(dest, sizeof(*dest));
d5530d22 2907
9391cded
MD
2908 dest->rec_type = key2->rec_type;
2909 dest->key = key2->key;
2f85fa4d 2910 dest->obj_id = key2->obj_id;
9391cded
MD
2911 dest->create_tid = key2->create_tid;
2912
2f85fa4d
MD
2913 MAKE_SEPARATOR(key1, key2, dest, localization);
2914 if (key1->localization == key2->localization) {
2915 MAKE_SEPARATOR(key1, key2, dest, obj_id);
2916 if (key1->obj_id == key2->obj_id) {
2917 MAKE_SEPARATOR(key1, key2, dest, rec_type);
2918 if (key1->rec_type == key2->rec_type) {
2919 MAKE_SEPARATOR(key1, key2, dest, key);
2920 /*
2921 * Don't bother creating a separator for
2922 * create_tid, which also conveniently avoids
2923 * having to handle the create_tid == 0
2924 * (infinity) case. Just leave create_tid
2925 * set to key2.
2926 *
2927 * Worst case, dest matches key2 exactly,
2928 * which is acceptable.
2929 */
2930 }
d5530d22 2931 }
d113fda1 2932 }
8cd0a023
MD
2933}
2934
2935#undef MAKE_SEPARATOR
2936
2937/*
2938 * Return whether a generic internal or leaf node is full
2939 */
b70819e9
TK
2940static __inline
2941int
8cd0a023 2942btree_node_is_full(hammer_node_ondisk_t node)
8cd0a023 2943{
36211fc6
TK
2944 int n;
2945
23b37863 2946 n = hammer_node_max_elements(node->type);
36211fc6 2947 if (n == -1)
23b37863
TK
2948 hpanic("bad type %d", node->type);
2949
2950 return(n == node->count);
8cd0a023 2951}
8cd0a023 2952
c0ade690
MD
2953void
2954hammer_print_btree_node(hammer_node_ondisk_t ondisk)
2955{
a17eeab0 2956 int i, n;
c0ade690 2957
35a5249b 2958 kprintf("node %p count=%d parent=%016jx type=%c\n",
973c11b9 2959 ondisk, ondisk->count,
35a5249b 2960 (intmax_t)ondisk->parent, ondisk->type);
c0ade690 2961
a17eeab0
TK
2962 switch (ondisk->type) {
2963 case HAMMER_BTREE_TYPE_INTERNAL:
2964 n = ondisk->count + 1; /* count is NOT boundary inclusive */
2965 break;
2966 case HAMMER_BTREE_TYPE_LEAF:
2967 n = ondisk->count; /* there is no boundary */
2968 break;
2969 default:
2970 return; /* nothing to do */
2971 }
2972
c0ade690 2973 /*
a17eeab0 2974 * Dump elements including boundary.
c0ade690 2975 */
a17eeab0
TK
2976 for (i = 0; i < n; ++i) {
2977 kprintf(" %2d", i);
2978 hammer_print_btree_elm(&ondisk->elms[i]);
c0ade690
MD
2979 }
2980}
2981
2982void
a17eeab0 2983hammer_print_btree_elm(hammer_btree_elm_t elm)
c0ade690 2984{
35a5249b
TK
2985 kprintf("\tobj_id = %016jx\n", (intmax_t)elm->base.obj_id);
2986 kprintf("\tkey = %016jx\n", (intmax_t)elm->base.key);
2987 kprintf("\tcreate_tid = %016jx\n", (intmax_t)elm->base.create_tid);
2988 kprintf("\tdelete_tid = %016jx\n", (intmax_t)elm->base.delete_tid);
c0ade690
MD
2989 kprintf("\trec_type = %04x\n", elm->base.rec_type);
2990 kprintf("\tobj_type = %02x\n", elm->base.obj_type);
c3378ec7
TK
2991 kprintf("\tbtype = %02x (%c)\n", elm->base.btype,
2992 hammer_elm_btype(elm));
c1789a39 2993 kprintf("\tlocalization = %08x\n", elm->base.localization);
fe7678ee 2994
a17eeab0 2995 if (hammer_is_internal_node_elm(elm)) {
35a5249b
TK
2996 kprintf("\tsubtree_off = %016jx\n",
2997 (intmax_t)elm->internal.subtree_offset);
a17eeab0 2998 } else if (hammer_is_leaf_node_elm(elm)) {
35a5249b
TK
2999 kprintf("\tdata_offset = %016jx\n",
3000 (intmax_t)elm->leaf.data_offset);
c0ade690
MD
3001 kprintf("\tdata_len = %08x\n", elm->leaf.data_len);
3002 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc);
3003 }
3004}
fe4fc0c0
TK
3005
3006static __inline
3007void
3008hammer_debug_btree_elm(hammer_cursor_t cursor, hammer_btree_elm_t elm,
3009 const char *s, int res)
3010{
35a5249b
TK
3011 hkprintf("%-8s %016jx[%02d] %c "
3012 "lo=%08x obj=%016jx rec=%02x key=%016jx tid=%016jx td=%p "
fe4fc0c0
TK
3013 "r=%d\n",
3014 s,
35a5249b 3015 (intmax_t)cursor->node->node_offset,
fe4fc0c0
TK
3016 cursor->index,
3017 hammer_elm_btype(elm),
3018 elm->base.localization,
35a5249b 3019 (intmax_t)elm->base.obj_id,
fe4fc0c0 3020 elm->base.rec_type,
35a5249b
TK
3021 (intmax_t)elm->base.key,
3022 (intmax_t)elm->base.create_tid,
fe4fc0c0
TK
3023 curthread,
3024 res);
3025}
3026
3027static __inline
3028void
3029hammer_debug_btree_parent(hammer_cursor_t cursor, const char *s)
3030{
3031 hammer_btree_elm_t elm =
3032 &cursor->parent->ondisk->elms[cursor->parent_index];
3033
35a5249b
TK
3034 hkprintf("%-8s %016jx[%d] %c "
3035 "(%016jx/%016jx %016jx/%016jx) (%p/%p %p/%p)\n",
fe4fc0c0 3036 s,
35a5249b 3037 (intmax_t)cursor->parent->node_offset,
fe4fc0c0
TK
3038 cursor->parent_index,
3039 hammer_elm_btype(elm),
35a5249b
TK
3040 (intmax_t)cursor->left_bound->obj_id,
3041 (intmax_t)elm->internal.base.obj_id,
3042 (intmax_t)cursor->right_bound->obj_id,
3043 (intmax_t)(elm + 1)->internal.base.obj_id,
fe4fc0c0
TK
3044 cursor->left_bound,
3045 elm,
3046 cursor->right_bound,
3047 elm + 1);
3048}