kernel - SWAP CACHE part 19/many - distinguish bulk data in HAMMER block dev
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.76 2008/08/06 15:38:58 dillon Exp $
35  */
36
37 /*
38  * HAMMER B-Tree index
39  *
40  * HAMMER implements a modified B+Tree.  In documentation this will
41  * simply be refered to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
42  * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43  * of the tree), but adds two additional boundary elements which describe
44  * the left-most and right-most element a node is able to represent.  In
45  * otherwords, we have boundary elements at the two ends of a B-Tree node
46  * instead of sub-tree pointers.
47  *
48  * A B-Tree internal node looks like this:
49  *
50  *      B N N N N N N B   <-- boundary and internal elements
51  *       S S S S S S S    <-- subtree pointers
52  *
53  * A B-Tree leaf node basically looks like this:
54  *
55  *      L L L L L L L L   <-- leaf elemenets
56  *
57  * The radix for an internal node is 1 less then a leaf but we get a
58  * number of significant benefits for our troubles.
59  *
60  * The big benefit to using a B-Tree containing boundary information
61  * is that it is possible to cache pointers into the middle of the tree
62  * and not have to start searches, insertions, OR deletions at the root
63  * node.   In particular, searches are able to progress in a definitive
64  * direction from any point in the tree without revisting nodes.  This
65  * greatly improves the efficiency of many operations, most especially
66  * record appends.
67  *
68  * B-Trees also make the stacking of trees fairly straightforward.
69  *
70  * INSERTIONS:  A search performed with the intention of doing
71  * an insert will guarantee that the terminal leaf node is not full by
72  * splitting full nodes.  Splits occur top-down during the dive down the
73  * B-Tree.
74  *
75  * DELETIONS: A deletion makes no attempt to proactively balance the
76  * tree and will recursively remove nodes that become empty.  If a
77  * deadlock occurs a deletion may not be able to remove an empty leaf.
78  * Deletions never allow internal nodes to become empty (that would blow
79  * up the boundaries).
80  */
81 #include "hammer.h"
82 #include <sys/buf.h>
83 #include <sys/buf2.h>
84
85 static int btree_search(hammer_cursor_t cursor, int flags);
86 static int btree_split_internal(hammer_cursor_t cursor);
87 static int btree_split_leaf(hammer_cursor_t cursor);
88 static int btree_remove(hammer_cursor_t cursor);
89 static int btree_node_is_full(hammer_node_ondisk_t node);
90 static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,        
91                         hammer_tid_t mirror_tid);
92 static void hammer_make_separator(hammer_base_elm_t key1,
93                         hammer_base_elm_t key2, hammer_base_elm_t dest);
94 static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);
95
96 /*
97  * Iterate records after a search.  The cursor is iterated forwards past
98  * the current record until a record matching the key-range requirements
99  * is found.  ENOENT is returned if the iteration goes past the ending
100  * key. 
101  *
102  * The iteration is inclusive of key_beg and can be inclusive or exclusive
103  * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
104  *
105  * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
106  * may be modified by B-Tree functions.
107  *
108  * cursor->key_beg may or may not be modified by this function during
109  * the iteration.  XXX future - in case of an inverted lock we may have
110  * to reinitiate the lookup and set key_beg to properly pick up where we
111  * left off.
112  *
113  * NOTE!  EDEADLK *CANNOT* be returned by this procedure.
114  */
115 int
116 hammer_btree_iterate(hammer_cursor_t cursor)
117 {
118         hammer_node_ondisk_t node;
119         hammer_btree_elm_t elm;
120         hammer_mount_t hmp;
121         int error = 0;
122         int r;
123         int s;
124
125         /*
126          * Skip past the current record
127          */
128         hmp = cursor->trans->hmp;
129         node = cursor->node->ondisk;
130         if (node == NULL)
131                 return(ENOENT);
132         if (cursor->index < node->count && 
133             (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
134                 ++cursor->index;
135         }
136
137         /*
138          * HAMMER can wind up being cpu-bound.
139          */
140         if (++hmp->check_yield > hammer_yield_check) {
141                 hmp->check_yield = 0;
142                 lwkt_user_yield();
143         }
144
145
146         /*
147          * Loop until an element is found or we are done.
148          */
149         for (;;) {
150                 /*
151                  * We iterate up the tree and then index over one element
152                  * while we are at the last element in the current node.
153                  *
154                  * If we are at the root of the filesystem, cursor_up
155                  * returns ENOENT.
156                  *
157                  * XXX this could be optimized by storing the information in
158                  * the parent reference.
159                  *
160                  * XXX we can lose the node lock temporarily, this could mess
161                  * up our scan.
162                  */
163                 ++hammer_stats_btree_iterations;
164                 hammer_flusher_clean_loose_ios(hmp);
165
166                 if (cursor->index == node->count) {
167                         if (hammer_debug_btree) {
168                                 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n",
169                                         (long long)cursor->node->node_offset,
170                                         cursor->index,
171                                         (long long)(cursor->parent ? cursor->parent->node_offset : -1),
172                                         cursor->parent_index,
173                                         curthread);
174                         }
175                         KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
176                         error = hammer_cursor_up(cursor);
177                         if (error)
178                                 break;
179                         /* reload stale pointer */
180                         node = cursor->node->ondisk;
181                         KKASSERT(cursor->index != node->count);
182
183                         /*
184                          * If we are reblocking we want to return internal
185                          * nodes.  Note that the internal node will be
186                          * returned multiple times, on each upward recursion
187                          * from its children.  The caller selects which
188                          * revisit it cares about (usually first or last only).
189                          */
190                         if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
191                                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
192                                 return(0);
193                         }
194                         ++cursor->index;
195                         continue;
196                 }
197
198                 /*
199                  * Check internal or leaf element.  Determine if the record
200                  * at the cursor has gone beyond the end of our range.
201                  *
202                  * We recurse down through internal nodes.
203                  */
204                 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
205                         elm = &node->elms[cursor->index];
206
207                         r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
208                         s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
209                         if (hammer_debug_btree) {
210                                 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n",
211                                         (long long)cursor->node->node_offset,
212                                         cursor->index,
213                                         (long long)elm[0].internal.base.obj_id,
214                                         elm[0].internal.base.rec_type,
215                                         (long long)elm[0].internal.base.key,
216                                         elm[0].internal.base.localization,
217                                         r,
218                                         curthread
219                                 );
220                                 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
221                                         (long long)cursor->node->node_offset,
222                                         cursor->index + 1,
223                                         (long long)elm[1].internal.base.obj_id,
224                                         elm[1].internal.base.rec_type,
225                                         (long long)elm[1].internal.base.key,
226                                         elm[1].internal.base.localization,
227                                         s
228                                 );
229                         }
230
231                         if (r < 0) {
232                                 error = ENOENT;
233                                 break;
234                         }
235                         if (r == 0 && (cursor->flags &
236                                        HAMMER_CURSOR_END_INCLUSIVE) == 0) {
237                                 error = ENOENT;
238                                 break;
239                         }
240                         KKASSERT(s <= 0);
241
242                         /*
243                          * Better not be zero
244                          */
245                         KKASSERT(elm->internal.subtree_offset != 0);
246
247                         /*
248                          * If running the mirror filter see if we can skip
249                          * one or more entire sub-trees.  If we can we
250                          * return the internal mode and the caller processes
251                          * the skipped range (see mirror_read)
252                          */
253                         if (cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) {
254                                 if (elm->internal.mirror_tid <
255                                     cursor->cmirror->mirror_tid) {
256                                         hammer_cursor_mirror_filter(cursor);
257                                         return(0);
258                                 }
259                         }
260
261                         error = hammer_cursor_down(cursor);
262                         if (error)
263                                 break;
264                         KKASSERT(cursor->index == 0);
265                         /* reload stale pointer */
266                         node = cursor->node->ondisk;
267                         continue;
268                 } else {
269                         elm = &node->elms[cursor->index];
270                         r = hammer_btree_cmp(&cursor->key_end, &elm->base);
271                         if (hammer_debug_btree) {
272                                 kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
273                                         (long long)cursor->node->node_offset,
274                                         cursor->index,
275                                         (elm[0].leaf.base.btype ?
276                                          elm[0].leaf.base.btype : '?'),
277                                         (long long)elm[0].leaf.base.obj_id,
278                                         elm[0].leaf.base.rec_type,
279                                         (long long)elm[0].leaf.base.key,
280                                         elm[0].leaf.base.localization,
281                                         r
282                                 );
283                         }
284                         if (r < 0) {
285                                 error = ENOENT;
286                                 break;
287                         }
288
289                         /*
290                          * We support both end-inclusive and
291                          * end-exclusive searches.
292                          */
293                         if (r == 0 &&
294                            (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
295                                 error = ENOENT;
296                                 break;
297                         }
298
299                         switch(elm->leaf.base.btype) {
300                         case HAMMER_BTREE_TYPE_RECORD:
301                                 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
302                                     hammer_btree_chkts(cursor->asof, &elm->base)) {
303                                         ++cursor->index;
304                                         continue;
305                                 }
306                                 error = 0;
307                                 break;
308                         default:
309                                 error = EINVAL;
310                                 break;
311                         }
312                         if (error)
313                                 break;
314                 }
315                 /*
316                  * node pointer invalid after loop
317                  */
318
319                 /*
320                  * Return entry
321                  */
322                 if (hammer_debug_btree) {
323                         int i = cursor->index;
324                         hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
325                         kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
326                                 cursor->node, i,
327                                 (long long)elm->internal.base.obj_id,
328                                 elm->internal.base.rec_type,
329                                 (long long)elm->internal.base.key,
330                                 elm->internal.base.localization
331                         );
332                 }
333                 return(0);
334         }
335         return(error);
336 }
337
338 /*
339  * We hit an internal element that we could skip as part of a mirroring
340  * scan.  Calculate the entire range being skipped.
341  *
342  * It is important to include any gaps between the parent's left_bound
343  * and the node's left_bound, and same goes for the right side.
344  */
345 static void
346 hammer_cursor_mirror_filter(hammer_cursor_t cursor)
347 {
348         struct hammer_cmirror *cmirror;
349         hammer_node_ondisk_t ondisk;
350         hammer_btree_elm_t elm;
351
352         ondisk = cursor->node->ondisk;
353         cmirror = cursor->cmirror;
354
355         /*
356          * Calculate the skipped range
357          */
358         elm = &ondisk->elms[cursor->index];
359         if (cursor->index == 0)
360                 cmirror->skip_beg = *cursor->left_bound;
361         else
362                 cmirror->skip_beg = elm->internal.base;
363         while (cursor->index < ondisk->count) {
364                 if (elm->internal.mirror_tid >= cmirror->mirror_tid)
365                         break;
366                 ++cursor->index;
367                 ++elm;
368         }
369         if (cursor->index == ondisk->count)
370                 cmirror->skip_end = *cursor->right_bound;
371         else
372                 cmirror->skip_end = elm->internal.base;
373
374         /*
375          * clip the returned result.
376          */
377         if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
378                 cmirror->skip_beg = cursor->key_beg;
379         if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
380                 cmirror->skip_end = cursor->key_end;
381 }
382
383 /*
384  * Iterate in the reverse direction.  This is used by the pruning code to
385  * avoid overlapping records.
386  */
387 int
388 hammer_btree_iterate_reverse(hammer_cursor_t cursor)
389 {
390         hammer_node_ondisk_t node;
391         hammer_btree_elm_t elm;
392         int error = 0;
393         int r;
394         int s;
395
396         /* mirror filtering not supported for reverse iteration */
397         KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);
398
399         /*
400          * Skip past the current record.  For various reasons the cursor
401          * may end up set to -1 or set to point at the end of the current
402          * node.  These cases must be addressed.
403          */
404         node = cursor->node->ondisk;
405         if (node == NULL)
406                 return(ENOENT);
407         if (cursor->index != -1 && 
408             (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
409                 --cursor->index;
410         }
411         if (cursor->index == cursor->node->ondisk->count)
412                 --cursor->index;
413
414         /*
415          * Loop until an element is found or we are done.
416          */
417         for (;;) {
418                 ++hammer_stats_btree_iterations;
419                 hammer_flusher_clean_loose_ios(cursor->trans->hmp);
420
421                 /*
422                  * We iterate up the tree and then index over one element
423                  * while we are at the last element in the current node.
424                  */
425                 if (cursor->index == -1) {
426                         error = hammer_cursor_up(cursor);
427                         if (error) {
428                                 cursor->index = 0; /* sanity */
429                                 break;
430                         }
431                         /* reload stale pointer */
432                         node = cursor->node->ondisk;
433                         KKASSERT(cursor->index != node->count);
434                         --cursor->index;
435                         continue;
436                 }
437
438                 /*
439                  * Check internal or leaf element.  Determine if the record
440                  * at the cursor has gone beyond the end of our range.
441                  *
442                  * We recurse down through internal nodes. 
443                  */
444                 KKASSERT(cursor->index != node->count);
445                 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
446                         elm = &node->elms[cursor->index];
447                         r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
448                         s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
449                         if (hammer_debug_btree) {
450                                 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
451                                         (long long)cursor->node->node_offset,
452                                         cursor->index,
453                                         (long long)elm[0].internal.base.obj_id,
454                                         elm[0].internal.base.rec_type,
455                                         (long long)elm[0].internal.base.key,
456                                         elm[0].internal.base.localization,
457                                         r
458                                 );
459                                 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
460                                         (long long)cursor->node->node_offset,
461                                         cursor->index + 1,
462                                         (long long)elm[1].internal.base.obj_id,
463                                         elm[1].internal.base.rec_type,
464                                         (long long)elm[1].internal.base.key,
465                                         elm[1].internal.base.localization,
466                                         s
467                                 );
468                         }
469
470                         if (s >= 0) {
471                                 error = ENOENT;
472                                 break;
473                         }
474                         KKASSERT(r >= 0);
475
476                         /*
477                          * Better not be zero
478                          */
479                         KKASSERT(elm->internal.subtree_offset != 0);
480
481                         error = hammer_cursor_down(cursor);
482                         if (error)
483                                 break;
484                         KKASSERT(cursor->index == 0);
485                         /* reload stale pointer */
486                         node = cursor->node->ondisk;
487
488                         /* this can assign -1 if the leaf was empty */
489                         cursor->index = node->count - 1;
490                         continue;
491                 } else {
492                         elm = &node->elms[cursor->index];
493                         s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
494                         if (hammer_debug_btree) {
495                                 kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
496                                         (long long)cursor->node->node_offset,
497                                         cursor->index,
498                                         (elm[0].leaf.base.btype ?
499                                          elm[0].leaf.base.btype : '?'),
500                                         (long long)elm[0].leaf.base.obj_id,
501                                         elm[0].leaf.base.rec_type,
502                                         (long long)elm[0].leaf.base.key,
503                                         elm[0].leaf.base.localization,
504                                         s
505                                 );
506                         }
507                         if (s > 0) {
508                                 error = ENOENT;
509                                 break;
510                         }
511
512                         switch(elm->leaf.base.btype) {
513                         case HAMMER_BTREE_TYPE_RECORD:
514                                 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
515                                     hammer_btree_chkts(cursor->asof, &elm->base)) {
516                                         --cursor->index;
517                                         continue;
518                                 }
519                                 error = 0;
520                                 break;
521                         default:
522                                 error = EINVAL;
523                                 break;
524                         }
525                         if (error)
526                                 break;
527                 }
528                 /*
529                  * node pointer invalid after loop
530                  */
531
532                 /*
533                  * Return entry
534                  */
535                 if (hammer_debug_btree) {
536                         int i = cursor->index;
537                         hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
538                         kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
539                                 cursor->node, i,
540                                 (long long)elm->internal.base.obj_id,
541                                 elm->internal.base.rec_type,
542                                 (long long)elm->internal.base.key,
543                                 elm->internal.base.localization
544                         );
545                 }
546                 return(0);
547         }
548         return(error);
549 }
550
551 /*
552  * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
553  * could not be found, EDEADLK if inserting and a retry is needed, and a
554  * fatal error otherwise.  When retrying, the caller must terminate the
555  * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
556  * 
557  * The cursor is suitably positioned for a deletion on success, and suitably
558  * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
559  * specified.
560  *
561  * The cursor may begin anywhere, the search will traverse the tree in
562  * either direction to locate the requested element.
563  *
564  * Most of the logic implementing historical searches is handled here.  We
565  * do an initial lookup with create_tid set to the asof TID.  Due to the
566  * way records are laid out, a backwards iteration may be required if
567  * ENOENT is returned to locate the historical record.  Here's the
568  * problem:
569  *
570  * create_tid:    10      15       20
571  *                   LEAF1   LEAF2
572  * records:         (11)        (18)
573  *
574  * Lets say we want to do a lookup AS-OF timestamp 17.  We will traverse
575  * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is
576  * not visible and thus causes ENOENT to be returned.  We really need
577  * to check record 11 in LEAF1.  If it also fails then the search fails
578  * (e.g. it might represent the range 11-16 and thus still not match our
579  * AS-OF timestamp of 17).  Note that LEAF1 could be empty, requiring
580  * further iterations.
581  *
582  * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK
583  * and the cursor->create_check TID if an iteration might be needed.
584  * In the above example create_check would be set to 14.
585  */
586 int
587 hammer_btree_lookup(hammer_cursor_t cursor)
588 {
589         int error;
590
591         KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 ||
592                   cursor->trans->sync_lock_refs > 0);
593         ++hammer_stats_btree_lookups;
594         if (cursor->flags & HAMMER_CURSOR_ASOF) {
595                 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
596                 cursor->key_beg.create_tid = cursor->asof;
597                 for (;;) {
598                         cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK;
599                         error = btree_search(cursor, 0);
600                         if (error != ENOENT ||
601                             (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) {
602                                 /*
603                                  * Stop if no error.
604                                  * Stop if error other then ENOENT.
605                                  * Stop if ENOENT and not special case.
606                                  */
607                                 break;
608                         }
609                         if (hammer_debug_btree) {
610                                 kprintf("CREATE_CHECK %016llx\n",
611                                         (long long)cursor->create_check);
612                         }
613                         cursor->key_beg.create_tid = cursor->create_check;
614                         /* loop */
615                 }
616         } else {
617                 error = btree_search(cursor, 0);
618         }
619         if (error == 0)
620                 error = hammer_btree_extract(cursor, cursor->flags);
621         return(error);
622 }
623
624 /*
625  * Execute the logic required to start an iteration.  The first record
626  * located within the specified range is returned and iteration control
627  * flags are adjusted for successive hammer_btree_iterate() calls.
628  *
629  * Set ATEDISK so a low-level caller can call btree_first/btree_iterate
630  * in a loop without worrying about it.  Higher-level merged searches will
631  * adjust the flag appropriately.
632  */
633 int
634 hammer_btree_first(hammer_cursor_t cursor)
635 {
636         int error;
637
638         error = hammer_btree_lookup(cursor);
639         if (error == ENOENT) {
640                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
641                 error = hammer_btree_iterate(cursor);
642         }
643         cursor->flags |= HAMMER_CURSOR_ATEDISK;
644         return(error);
645 }
646
647 /*
648  * Similarly but for an iteration in the reverse direction.
649  *
650  * Set ATEDISK when iterating backwards to skip the current entry,
651  * which after an ENOENT lookup will be pointing beyond our end point.
652  *
653  * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
654  * in a loop without worrying about it.  Higher-level merged searches will
655  * adjust the flag appropriately.
656  */
657 int
658 hammer_btree_last(hammer_cursor_t cursor)
659 {
660         struct hammer_base_elm save;
661         int error;
662
663         save = cursor->key_beg;
664         cursor->key_beg = cursor->key_end;
665         error = hammer_btree_lookup(cursor);
666         cursor->key_beg = save;
667         if (error == ENOENT ||
668             (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
669                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
670                 error = hammer_btree_iterate_reverse(cursor);
671         }
672         cursor->flags |= HAMMER_CURSOR_ATEDISK;
673         return(error);
674 }
675
676 /*
677  * Extract the record and/or data associated with the cursor's current
678  * position.  Any prior record or data stored in the cursor is replaced.
679  * The cursor must be positioned at a leaf node.
680  *
681  * NOTE: All extractions occur at the leaf of the B-Tree.
682  */
683 int
684 hammer_btree_extract(hammer_cursor_t cursor, int flags)
685 {
686         hammer_node_ondisk_t node;
687         hammer_btree_elm_t elm;
688         hammer_off_t data_off;
689         hammer_mount_t hmp;
690         int32_t data_len;
691         int error;
692
693         /*
694          * The case where the data reference resolves to the same buffer
695          * as the record reference must be handled.
696          */
697         node = cursor->node->ondisk;
698         elm = &node->elms[cursor->index];
699         cursor->data = NULL;
700         hmp = cursor->node->hmp;
701
702         /*
703          * There is nothing to extract for an internal element.
704          */
705         if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
706                 return(EINVAL);
707
708         /*
709          * Only record types have data.
710          */
711         KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
712         cursor->leaf = &elm->leaf;
713
714         if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
715                 return(0);
716         if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
717                 return(0);
718         data_off = elm->leaf.data_offset;
719         data_len = elm->leaf.data_len;
720         if (data_off == 0)
721                 return(0);
722
723         /*
724          * Load the data
725          */
726         KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
727         cursor->data = hammer_bread_ext(hmp, data_off, data_len,
728                                         &error, &cursor->data_buffer);
729
730         /*
731          * Mark the data buffer as not being meta-data if it isn't
732          * meta-data (sometimes bulk data is accessed via a volume
733          * block device).
734          */
735         if (error == 0) {
736                 switch(elm->leaf.base.rec_type) {
737                 case HAMMER_RECTYPE_DATA:
738                 case HAMMER_RECTYPE_DB:
739                         hammer_io_notmeta(cursor->data_buffer);
740                         break;
741                 default:
742                         break;
743                 }
744         }
745
746         /*
747          * Deal with CRC errors on the extracted data.
748          */
749         if (error == 0 &&
750             hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) {
751                 kprintf("CRC DATA @ %016llx/%d FAILED\n",
752                         (long long)elm->leaf.data_offset, elm->leaf.data_len);
753                 if (hammer_debug_critical)
754                         Debugger("CRC FAILED: DATA");
755                 if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
756                         error = EDOM;   /* less critical (mirroring) */
757                 else
758                         error = EIO;    /* critical */
759         }
760         return(error);
761 }
762
763
764 /*
765  * Insert a leaf element into the B-Tree at the current cursor position.
766  * The cursor is positioned such that the element at and beyond the cursor
767  * are shifted to make room for the new record.
768  *
769  * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
770  * flag set and that call must return ENOENT before this function can be
771  * called.
772  *
773  * The caller may depend on the cursor's exclusive lock after return to
774  * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
775  *
776  * ENOSPC is returned if there is no room to insert a new record.
777  */
778 int
779 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm,
780                     int *doprop)
781 {
782         hammer_node_ondisk_t node;
783         int i;
784         int error;
785
786         *doprop = 0;
787         if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
788                 return(error);
789         ++hammer_stats_btree_inserts;
790
791         /*
792          * Insert the element at the leaf node and update the count in the
793          * parent.  It is possible for parent to be NULL, indicating that
794          * the filesystem's ROOT B-Tree node is a leaf itself, which is
795          * possible.  The root inode can never be deleted so the leaf should
796          * never be empty.
797          *
798          * Remember that the right-hand boundary is not included in the
799          * count.
800          */
801         hammer_modify_node_all(cursor->trans, cursor->node);
802         node = cursor->node->ondisk;
803         i = cursor->index;
804         KKASSERT(elm->base.btype != 0);
805         KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
806         KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
807         if (i != node->count) {
808                 bcopy(&node->elms[i], &node->elms[i+1],
809                       (node->count - i) * sizeof(*elm));
810         }
811         node->elms[i].leaf = *elm;
812         ++node->count;
813         hammer_cursor_inserted_element(cursor->node, i);
814
815         /*
816          * Update the leaf node's aggregate mirror_tid for mirroring
817          * support.
818          */
819         if (node->mirror_tid < elm->base.delete_tid) {
820                 node->mirror_tid = elm->base.delete_tid;
821                 *doprop = 1;
822         }
823         if (node->mirror_tid < elm->base.create_tid) {
824                 node->mirror_tid = elm->base.create_tid;
825                 *doprop = 1;
826         }
827         hammer_modify_node_done(cursor->node);
828
829         /*
830          * Debugging sanity checks.
831          */
832         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
833         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
834         if (i) {
835                 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
836         }
837         if (i != node->count - 1)
838                 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);
839
840         return(0);
841 }
842
843 /*
844  * Delete a record from the B-Tree at the current cursor position.
845  * The cursor is positioned such that the current element is the one
846  * to be deleted.
847  *
848  * On return the cursor will be positioned after the deleted element and
849  * MAY point to an internal node.  It will be suitable for the continuation
850  * of an iteration but not for an insertion or deletion.
851  *
852  * Deletions will attempt to partially rebalance the B-Tree in an upward
853  * direction, but will terminate rather then deadlock.  Empty internal nodes
854  * are never allowed by a deletion which deadlocks may end up giving us an
855  * empty leaf.  The pruner will clean up and rebalance the tree.
856  *
857  * This function can return EDEADLK, requiring the caller to retry the
858  * operation after clearing the deadlock.
859  */
860 int
861 hammer_btree_delete(hammer_cursor_t cursor)
862 {
863         hammer_node_ondisk_t ondisk;
864         hammer_node_t node;
865         hammer_node_t parent;
866         int error;
867         int i;
868
869         KKASSERT (cursor->trans->sync_lock_refs > 0);
870         if ((error = hammer_cursor_upgrade(cursor)) != 0)
871                 return(error);
872         ++hammer_stats_btree_deletes;
873
874         /*
875          * Delete the element from the leaf node. 
876          *
877          * Remember that leaf nodes do not have boundaries.
878          */
879         node = cursor->node;
880         ondisk = node->ondisk;
881         i = cursor->index;
882
883         KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
884         KKASSERT(i >= 0 && i < ondisk->count);
885         hammer_modify_node_all(cursor->trans, node);
886         if (i + 1 != ondisk->count) {
887                 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
888                       (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
889         }
890         --ondisk->count;
891         hammer_modify_node_done(node);
892         hammer_cursor_deleted_element(node, i);
893
894         /*
895          * Validate local parent
896          */
897         if (ondisk->parent) {
898                 parent = cursor->parent;
899
900                 KKASSERT(parent != NULL);
901                 KKASSERT(parent->node_offset == ondisk->parent);
902         }
903
904         /*
905          * If the leaf becomes empty it must be detached from the parent,
906          * potentially recursing through to the filesystem root.
907          *
908          * This may reposition the cursor at one of the parent's of the
909          * current node.
910          *
911          * Ignore deadlock errors, that simply means that btree_remove
912          * was unable to recurse and had to leave us with an empty leaf. 
913          */
914         KKASSERT(cursor->index <= ondisk->count);
915         if (ondisk->count == 0) {
916                 error = btree_remove(cursor);
917                 if (error == EDEADLK)
918                         error = 0;
919         } else {
920                 error = 0;
921         }
922         KKASSERT(cursor->parent == NULL ||
923                  cursor->parent_index < cursor->parent->ondisk->count);
924         return(error);
925 }
926
927 /*
928  * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE
929  *
930  * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
931  *
932  * The search can begin ANYWHERE in the B-Tree.  As a first step the search
933  * iterates up the tree as necessary to properly position itself prior to
934  * actually doing the sarch.
935  * 
936  * INSERTIONS: The search will split full nodes and leaves on its way down
937  * and guarentee that the leaf it ends up on is not full.  If we run out
938  * of space the search continues to the leaf (to position the cursor for
939  * the spike), but ENOSPC is returned.
940  *
941  * The search is only guarenteed to end up on a leaf if an error code of 0
942  * is returned, or if inserting and an error code of ENOENT is returned.
943  * Otherwise it can stop at an internal node.  On success a search returns
944  * a leaf node.
945  *
946  * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
947  * filesystem, and it is not simple code.  Please note the following facts:
948  *
949  * - Internal node recursions have a boundary on the left AND right.  The
950  *   right boundary is non-inclusive.  The create_tid is a generic part
951  *   of the key for internal nodes.
952  *
953  * - Leaf nodes contain terminal elements only now.
954  *
955  * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
956  *   historical search.  ASOF and INSERT are mutually exclusive.  When
957  *   doing an as-of lookup btree_search() checks for a right-edge boundary
958  *   case.  If while recursing down the left-edge differs from the key
959  *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
960  *   with cursor->create_check.  This is used by btree_lookup() to iterate.
961  *   The iteration backwards because as-of searches can wind up going
962  *   down the wrong branch of the B-Tree.
963  */
964 static 
965 int
966 btree_search(hammer_cursor_t cursor, int flags)
967 {
968         hammer_node_ondisk_t node;
969         hammer_btree_elm_t elm;
970         int error;
971         int enospc = 0;
972         int i;
973         int r;
974         int s;
975
976         flags |= cursor->flags;
977         ++hammer_stats_btree_searches;
978
979         if (hammer_debug_btree) {
980                 kprintf("SEARCH   %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n",
981                         (long long)cursor->node->node_offset,
982                         cursor->index,
983                         (long long)cursor->key_beg.obj_id,
984                         cursor->key_beg.rec_type,
985                         (long long)cursor->key_beg.key,
986                         (long long)cursor->key_beg.create_tid,
987                         cursor->key_beg.localization, 
988                         curthread
989                 );
990                 if (cursor->parent)
991                     kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n",
992                         (long long)cursor->parent->node_offset,
993                         cursor->parent_index,
994                         (long long)cursor->left_bound->obj_id,
995                         (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id,
996                         (long long)cursor->right_bound->obj_id,
997                         (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id,
998                         cursor->left_bound,
999                         &cursor->parent->ondisk->elms[cursor->parent_index],
1000                         cursor->right_bound,
1001                         &cursor->parent->ondisk->elms[cursor->parent_index+1]
1002                     );
1003         }
1004
1005         /*
1006          * Move our cursor up the tree until we find a node whos range covers
1007          * the key we are trying to locate.
1008          *
1009          * The left bound is inclusive, the right bound is non-inclusive.
1010          * It is ok to cursor up too far.
1011          */
1012         for (;;) {
1013                 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
1014                 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
1015                 if (r >= 0 && s < 0)
1016                         break;
1017                 KKASSERT(cursor->parent);
1018                 ++hammer_stats_btree_iterations;
1019                 error = hammer_cursor_up(cursor);
1020                 if (error)
1021                         goto done;
1022         }
1023
1024         /*
1025          * The delete-checks below are based on node, not parent.  Set the
1026          * initial delete-check based on the parent.
1027          */
1028         if (r == 1) {
1029                 KKASSERT(cursor->left_bound->create_tid != 1);
1030                 cursor->create_check = cursor->left_bound->create_tid - 1;
1031                 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
1032         }
1033
1034         /*
1035          * We better have ended up with a node somewhere.
1036          */
1037         KKASSERT(cursor->node != NULL);
1038
1039         /*
1040          * If we are inserting we can't start at a full node if the parent
1041          * is also full (because there is no way to split the node),
1042          * continue running up the tree until the requirement is satisfied
1043          * or we hit the root of the filesystem.
1044          *
1045          * (If inserting we aren't doing an as-of search so we don't have
1046          *  to worry about create_check).
1047          */
1048         while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
1049                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1050                         if (btree_node_is_full(cursor->node->ondisk) == 0)
1051                                 break;
1052                 } else {
1053                         if (btree_node_is_full(cursor->node->ondisk) ==0)
1054                                 break;
1055                 }
1056                 if (cursor->node->ondisk->parent == 0 ||
1057                     cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
1058                         break;
1059                 }
1060                 ++hammer_stats_btree_iterations;
1061                 error = hammer_cursor_up(cursor);
1062                 /* node may have become stale */
1063                 if (error)
1064                         goto done;
1065         }
1066
1067         /*
1068          * Push down through internal nodes to locate the requested key.
1069          */
1070         node = cursor->node->ondisk;
1071         while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
1072                 /*
1073                  * Scan the node to find the subtree index to push down into.
1074                  * We go one-past, then back-up.
1075                  *
1076                  * We must proactively remove deleted elements which may
1077                  * have been left over from a deadlocked btree_remove().
1078                  *
1079                  * The left and right boundaries are included in the loop
1080                  * in order to detect edge cases.
1081                  *
1082                  * If the separator only differs by create_tid (r == 1)
1083                  * and we are doing an as-of search, we may end up going
1084                  * down a branch to the left of the one containing the
1085                  * desired key.  This requires numerous special cases.
1086                  */
1087                 ++hammer_stats_btree_iterations;
1088                 if (hammer_debug_btree) {
1089                         kprintf("SEARCH-I %016llx count=%d\n",
1090                                 (long long)cursor->node->node_offset,
1091                                 node->count);
1092                 }
1093
1094                 /*
1095                  * Try to shortcut the search before dropping into the
1096                  * linear loop.  Locate the first node where r <= 1.
1097                  */
1098                 i = hammer_btree_search_node(&cursor->key_beg, node);
1099                 while (i <= node->count) {
1100                         ++hammer_stats_btree_elements;
1101                         elm = &node->elms[i];
1102                         r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
1103                         if (hammer_debug_btree > 2) {
1104                                 kprintf(" IELM %p %d r=%d\n",
1105                                         &node->elms[i], i, r);
1106                         }
1107                         if (r < 0)
1108                                 break;
1109                         if (r == 1) {
1110                                 KKASSERT(elm->base.create_tid != 1);
1111                                 cursor->create_check = elm->base.create_tid - 1;
1112                                 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
1113                         }
1114                         ++i;
1115                 }
1116                 if (hammer_debug_btree) {
1117                         kprintf("SEARCH-I preI=%d/%d r=%d\n",
1118                                 i, node->count, r);
1119                 }
1120
1121                 /*
1122                  * These cases occur when the parent's idea of the boundary
1123                  * is wider then the child's idea of the boundary, and
1124                  * require special handling.  If not inserting we can
1125                  * terminate the search early for these cases but the
1126                  * child's boundaries cannot be unconditionally modified.
1127                  */
1128                 if (i == 0) {
1129                         /*
1130                          * If i == 0 the search terminated to the LEFT of the
1131                          * left_boundary but to the RIGHT of the parent's left
1132                          * boundary.
1133                          */
1134                         u_int8_t save;
1135
1136                         elm = &node->elms[0];
1137
1138                         /*
1139                          * If we aren't inserting we can stop here.
1140                          */
1141                         if ((flags & (HAMMER_CURSOR_INSERT |
1142                                       HAMMER_CURSOR_PRUNING)) == 0) {
1143                                 cursor->index = 0;
1144                                 return(ENOENT);
1145                         }
1146
1147                         /*
1148                          * Correct a left-hand boundary mismatch.
1149                          *
1150                          * We can only do this if we can upgrade the lock,
1151                          * and synchronized as a background cursor (i.e.
1152                          * inserting or pruning).
1153                          *
1154                          * WARNING: We can only do this if inserting, i.e.
1155                          * we are running on the backend.
1156                          */
1157                         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1158                                 return(error);
1159                         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1160                         hammer_modify_node_field(cursor->trans, cursor->node,
1161                                                  elms[0]);
1162                         save = node->elms[0].base.btype;
1163                         node->elms[0].base = *cursor->left_bound;
1164                         node->elms[0].base.btype = save;
1165                         hammer_modify_node_done(cursor->node);
1166                 } else if (i == node->count + 1) {
1167                         /*
1168                          * If i == node->count + 1 the search terminated to
1169                          * the RIGHT of the right boundary but to the LEFT
1170                          * of the parent's right boundary.  If we aren't
1171                          * inserting we can stop here.
1172                          *
1173                          * Note that the last element in this case is
1174                          * elms[i-2] prior to adjustments to 'i'.
1175                          */
1176                         --i;
1177                         if ((flags & (HAMMER_CURSOR_INSERT |
1178                                       HAMMER_CURSOR_PRUNING)) == 0) {
1179                                 cursor->index = i;
1180                                 return (ENOENT);
1181                         }
1182
1183                         /*
1184                          * Correct a right-hand boundary mismatch.
1185                          * (actual push-down record is i-2 prior to
1186                          * adjustments to i).
1187                          *
1188                          * We can only do this if we can upgrade the lock,
1189                          * and synchronized as a background cursor (i.e.
1190                          * inserting or pruning).
1191                          *
1192                          * WARNING: We can only do this if inserting, i.e.
1193                          * we are running on the backend.
1194                          */
1195                         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1196                                 return(error);
1197                         elm = &node->elms[i];
1198                         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1199                         hammer_modify_node(cursor->trans, cursor->node,
1200                                            &elm->base, sizeof(elm->base));
1201                         elm->base = *cursor->right_bound;
1202                         hammer_modify_node_done(cursor->node);
1203                         --i;
1204                 } else {
1205                         /*
1206                          * The push-down index is now i - 1.  If we had
1207                          * terminated on the right boundary this will point
1208                          * us at the last element.
1209                          */
1210                         --i;
1211                 }
1212                 cursor->index = i;
1213                 elm = &node->elms[i];
1214
1215                 if (hammer_debug_btree) {
1216                         kprintf("RESULT-I %016llx[%d] %016llx %02x "
1217                                 "key=%016llx cre=%016llx lo=%02x\n",
1218                                 (long long)cursor->node->node_offset,
1219                                 i,
1220                                 (long long)elm->internal.base.obj_id,
1221                                 elm->internal.base.rec_type,
1222                                 (long long)elm->internal.base.key,
1223                                 (long long)elm->internal.base.create_tid,
1224                                 elm->internal.base.localization
1225                         );
1226                 }
1227
1228                 /*
1229                  * We better have a valid subtree offset.
1230                  */
1231                 KKASSERT(elm->internal.subtree_offset != 0);
1232
1233                 /*
1234                  * Handle insertion and deletion requirements.
1235                  *
1236                  * If inserting split full nodes.  The split code will
1237                  * adjust cursor->node and cursor->index if the current
1238                  * index winds up in the new node.
1239                  *
1240                  * If inserting and a left or right edge case was detected,
1241                  * we cannot correct the left or right boundary and must
1242                  * prepend and append an empty leaf node in order to make
1243                  * the boundary correction.
1244                  *
1245                  * If we run out of space we set enospc and continue on
1246                  * to a leaf to provide the spike code with a good point
1247                  * of entry.
1248                  */
1249                 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
1250                         if (btree_node_is_full(node)) {
1251                                 error = btree_split_internal(cursor);
1252                                 if (error) {
1253                                         if (error != ENOSPC)
1254                                                 goto done;
1255                                         enospc = 1;
1256                                 }
1257                                 /*
1258                                  * reload stale pointers
1259                                  */
1260                                 i = cursor->index;
1261                                 node = cursor->node->ondisk;
1262                         }
1263                 }
1264
1265                 /*
1266                  * Push down (push into new node, existing node becomes
1267                  * the parent) and continue the search.
1268                  */
1269                 error = hammer_cursor_down(cursor);
1270                 /* node may have become stale */
1271                 if (error)
1272                         goto done;
1273                 node = cursor->node->ondisk;
1274         }
1275
1276         /*
1277          * We are at a leaf, do a linear search of the key array.
1278          *
1279          * On success the index is set to the matching element and 0
1280          * is returned.
1281          *
1282          * On failure the index is set to the insertion point and ENOENT
1283          * is returned.
1284          *
1285          * Boundaries are not stored in leaf nodes, so the index can wind
1286          * up to the left of element 0 (index == 0) or past the end of
1287          * the array (index == node->count).  It is also possible that the
1288          * leaf might be empty.
1289          */
1290         ++hammer_stats_btree_iterations;
1291         KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF);
1292         KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
1293         if (hammer_debug_btree) {
1294                 kprintf("SEARCH-L %016llx count=%d\n",
1295                         (long long)cursor->node->node_offset,
1296                         node->count);
1297         }
1298
1299         /*
1300          * Try to shortcut the search before dropping into the
1301          * linear loop.  Locate the first node where r <= 1.
1302          */
1303         i = hammer_btree_search_node(&cursor->key_beg, node);
1304         while (i < node->count) {
1305                 ++hammer_stats_btree_elements;
1306                 elm = &node->elms[i];
1307
1308                 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);
1309
1310                 if (hammer_debug_btree > 1)
1311                         kprintf("  ELM %p %d r=%d\n", &node->elms[i], i, r);
1312
1313                 /*
1314                  * We are at a record element.  Stop if we've flipped past
1315                  * key_beg, not counting the create_tid test.  Allow the
1316                  * r == 1 case (key_beg > element but differs only by its
1317                  * create_tid) to fall through to the AS-OF check.
1318                  */
1319                 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
1320
1321                 if (r < 0)
1322                         goto failed;
1323                 if (r > 1) {
1324                         ++i;
1325                         continue;
1326                 }
1327
1328                 /*
1329                  * Check our as-of timestamp against the element.
1330                  */
1331                 if (flags & HAMMER_CURSOR_ASOF) {
1332                         if (hammer_btree_chkts(cursor->asof,
1333                                                &node->elms[i].base) != 0) {
1334                                 ++i;
1335                                 continue;
1336                         }
1337                         /* success */
1338                 } else {
1339                         if (r > 0) {    /* can only be +1 */
1340                                 ++i;
1341                                 continue;
1342                         }
1343                         /* success */
1344                 }
1345                 cursor->index = i;
1346                 error = 0;
1347                 if (hammer_debug_btree) {
1348                         kprintf("RESULT-L %016llx[%d] (SUCCESS)\n",
1349                                 (long long)cursor->node->node_offset, i);
1350                 }
1351                 goto done;
1352         }
1353
1354         /*
1355          * The search of the leaf node failed.  i is the insertion point.
1356          */
1357 failed:
1358         if (hammer_debug_btree) {
1359                 kprintf("RESULT-L %016llx[%d] (FAILED)\n",
1360                         (long long)cursor->node->node_offset, i);
1361         }
1362
1363         /*
1364          * No exact match was found, i is now at the insertion point.
1365          *
1366          * If inserting split a full leaf before returning.  This
1367          * may have the side effect of adjusting cursor->node and
1368          * cursor->index.
1369          */
1370         cursor->index = i;
1371         if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
1372              btree_node_is_full(node)) {
1373                 error = btree_split_leaf(cursor);
1374                 if (error) {
1375                         if (error != ENOSPC)
1376                                 goto done;
1377                         enospc = 1;
1378                 }
1379                 /*
1380                  * reload stale pointers
1381                  */
1382                 /* NOT USED
1383                 i = cursor->index;
1384                 node = &cursor->node->internal;
1385                 */
1386         }
1387
1388         /*
1389          * We reached a leaf but did not find the key we were looking for.
1390          * If this is an insert we will be properly positioned for an insert
1391          * (ENOENT) or spike (ENOSPC) operation.
1392          */
1393         error = enospc ? ENOSPC : ENOENT;
1394 done:
1395         return(error);
1396 }
1397
1398 /*
1399  * Heuristical search for the first element whos comparison is <= 1.  May
1400  * return an index whos compare result is > 1 but may only return an index
1401  * whos compare result is <= 1 if it is the first element with that result.
1402  */
1403 int
1404 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
1405 {
1406         int b;
1407         int s;
1408         int i;
1409         int r;
1410
1411         /*
1412          * Don't bother if the node does not have very many elements
1413          */
1414         b = 0;
1415         s = node->count;
1416         while (s - b > 4) {
1417                 i = b + (s - b) / 2;
1418                 ++hammer_stats_btree_elements;
1419                 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
1420                 if (r <= 1) {
1421                         s = i;
1422                 } else {
1423                         b = i;
1424                 }
1425         }
1426         return(b);
1427 }
1428
1429
1430 /************************************************************************
1431  *                         SPLITTING AND MERGING                        *
1432  ************************************************************************
1433  *
1434  * These routines do all the dirty work required to split and merge nodes.
1435  */
1436
1437 /*
1438  * Split an internal node into two nodes and move the separator at the split
1439  * point to the parent.
1440  *
1441  * (cursor->node, cursor->index) indicates the element the caller intends
1442  * to push into.  We will adjust node and index if that element winds
1443  * up in the split node.
1444  *
1445  * If we are at the root of the filesystem a new root must be created with
1446  * two elements, one pointing to the original root and one pointing to the
1447  * newly allocated split node.
1448  */
1449 static
1450 int
1451 btree_split_internal(hammer_cursor_t cursor)
1452 {
1453         hammer_node_ondisk_t ondisk;
1454         hammer_node_t node;
1455         hammer_node_t parent;
1456         hammer_node_t new_node;
1457         hammer_btree_elm_t elm;
1458         hammer_btree_elm_t parent_elm;
1459         struct hammer_node_lock lockroot;
1460         hammer_mount_t hmp = cursor->trans->hmp;
1461         hammer_off_t hint;
1462         int parent_index;
1463         int made_root;
1464         int split;
1465         int error;
1466         int i;
1467         const int esize = sizeof(*elm);
1468
1469         hammer_node_lock_init(&lockroot, cursor->node);
1470         error = hammer_btree_lock_children(cursor, 1, &lockroot);
1471         if (error)
1472                 goto done;
1473         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1474                 goto done;
1475         ++hammer_stats_btree_splits;
1476
1477         /* 
1478          * Calculate the split point.  If the insertion point is at the
1479          * end of the leaf we adjust the split point significantly to the
1480          * right to try to optimize node fill and flag it.  If we hit
1481          * that same leaf again our heuristic failed and we don't try
1482          * to optimize node fill (it could lead to a degenerate case).
1483          */
1484         node = cursor->node;
1485         ondisk = node->ondisk;
1486         KKASSERT(ondisk->count > 4);
1487         if (cursor->index == ondisk->count &&
1488             (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
1489                 split = (ondisk->count + 1) * 3 / 4;
1490                 node->flags |= HAMMER_NODE_NONLINEAR;
1491         } else {
1492                 /*
1493                  * We are splitting but elms[split] will be promoted to
1494                  * the parent, leaving the right hand node with one less
1495                  * element.  If the insertion point will be on the
1496                  * left-hand side adjust the split point to give the
1497                  * right hand side one additional node.
1498                  */
1499                 split = (ondisk->count + 1) / 2;
1500                 if (cursor->index <= split)
1501                         --split;
1502         }
1503
1504         /*
1505          * If we are at the root of the filesystem, create a new root node
1506          * with 1 element and split normally.  Avoid making major
1507          * modifications until we know the whole operation will work.
1508          */
1509         if (ondisk->parent == 0) {
1510                 parent = hammer_alloc_btree(cursor->trans, node->node_offset,
1511                                             &error);
1512                 if (parent == NULL)
1513                         goto done;
1514                 hammer_lock_ex(&parent->lock);
1515                 hammer_modify_node_noundo(cursor->trans, parent);
1516                 ondisk = parent->ondisk;
1517                 ondisk->count = 1;
1518                 ondisk->parent = 0;
1519                 ondisk->mirror_tid = node->ondisk->mirror_tid;
1520                 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1521                 ondisk->elms[0].base = hmp->root_btree_beg;
1522                 ondisk->elms[0].base.btype = node->ondisk->type;
1523                 ondisk->elms[0].internal.subtree_offset = node->node_offset;
1524                 ondisk->elms[1].base = hmp->root_btree_end;
1525                 hammer_modify_node_done(parent);
1526                 /* ondisk->elms[1].base.btype - not used */
1527                 made_root = 1;
1528                 parent_index = 0;       /* index of current node in parent */
1529         } else {
1530                 made_root = 0;
1531                 parent = cursor->parent;
1532                 parent_index = cursor->parent_index;
1533         }
1534
1535         /*
1536          * Calculate a hint for the allocation of the new B-Tree node.
1537          * The most likely expansion is coming from the insertion point
1538          * at cursor->index, so try to localize the allocation of our
1539          * new node to accomodate that sub-tree.
1540          *
1541          * Use the right-most sub-tree when expandinging on the right edge.
1542          * This is a very common case when copying a directory tree.
1543          */
1544         if (cursor->index == ondisk->count)
1545                 hint = ondisk->elms[cursor->index - 1].internal.subtree_offset;
1546         else
1547                 hint = ondisk->elms[cursor->index].internal.subtree_offset;
1548
1549         /*
1550          * Split node into new_node at the split point.
1551          *
1552          *  B O O O P N N B     <-- P = node->elms[split] (index 4)
1553          *   0 1 2 3 4 5 6      <-- subtree indices
1554          *
1555          *       x x P x x
1556          *        s S S s  
1557          *         /   \
1558          *  B O O O B    B N N B        <--- inner boundary points are 'P'
1559          *   0 1 2 3      4 5 6  
1560          */
1561         new_node = hammer_alloc_btree(cursor->trans, hint, &error);
1562         if (new_node == NULL) {
1563                 if (made_root) {
1564                         hammer_unlock(&parent->lock);
1565                         hammer_delete_node(cursor->trans, parent);
1566                         hammer_rel_node(parent);
1567                 }
1568                 goto done;
1569         }
1570         hammer_lock_ex(&new_node->lock);
1571
1572         /*
1573          * Create the new node.  P becomes the left-hand boundary in the
1574          * new node.  Copy the right-hand boundary as well.
1575          *
1576          * elm is the new separator.
1577          */
1578         hammer_modify_node_noundo(cursor->trans, new_node);
1579         hammer_modify_node_all(cursor->trans, node);
1580         ondisk = node->ondisk;
1581         elm = &ondisk->elms[split];
1582         bcopy(elm, &new_node->ondisk->elms[0],
1583               (ondisk->count - split + 1) * esize);
1584         new_node->ondisk->count = ondisk->count - split;
1585         new_node->ondisk->parent = parent->node_offset;
1586         new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1587         new_node->ondisk->mirror_tid = ondisk->mirror_tid;
1588         KKASSERT(ondisk->type == new_node->ondisk->type);
1589         hammer_cursor_split_node(node, new_node, split);
1590
1591         /*
1592          * Cleanup the original node.  Elm (P) becomes the new boundary,
1593          * its subtree_offset was moved to the new node.  If we had created
1594          * a new root its parent pointer may have changed.
1595          */
1596         elm->internal.subtree_offset = 0;
1597         ondisk->count = split;
1598
1599         /*
1600          * Insert the separator into the parent, fixup the parent's
1601          * reference to the original node, and reference the new node.
1602          * The separator is P.
1603          *
1604          * Remember that base.count does not include the right-hand boundary.
1605          */
1606         hammer_modify_node_all(cursor->trans, parent);
1607         ondisk = parent->ondisk;
1608         KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1609         parent_elm = &ondisk->elms[parent_index+1];
1610         bcopy(parent_elm, parent_elm + 1,
1611               (ondisk->count - parent_index) * esize);
1612         parent_elm->internal.base = elm->base;  /* separator P */
1613         parent_elm->internal.base.btype = new_node->ondisk->type;
1614         parent_elm->internal.subtree_offset = new_node->node_offset;
1615         parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid;
1616         ++ondisk->count;
1617         hammer_modify_node_done(parent);
1618         hammer_cursor_inserted_element(parent, parent_index + 1);
1619
1620         /*
1621          * The children of new_node need their parent pointer set to new_node.
1622          * The children have already been locked by
1623          * hammer_btree_lock_children().
1624          */
1625         for (i = 0; i < new_node->ondisk->count; ++i) {
1626                 elm = &new_node->ondisk->elms[i];
1627                 error = btree_set_parent(cursor->trans, new_node, elm);
1628                 if (error) {
1629                         panic("btree_split_internal: btree-fixup problem");
1630                 }
1631         }
1632         hammer_modify_node_done(new_node);
1633
1634         /*
1635          * The filesystem's root B-Tree pointer may have to be updated.
1636          */
1637         if (made_root) {
1638                 hammer_volume_t volume;
1639
1640                 volume = hammer_get_root_volume(hmp, &error);
1641                 KKASSERT(error == 0);
1642
1643                 hammer_modify_volume_field(cursor->trans, volume,
1644                                            vol0_btree_root);
1645                 volume->ondisk->vol0_btree_root = parent->node_offset;
1646                 hammer_modify_volume_done(volume);
1647                 node->ondisk->parent = parent->node_offset;
1648                 if (cursor->parent) {
1649                         hammer_unlock(&cursor->parent->lock);
1650                         hammer_rel_node(cursor->parent);
1651                 }
1652                 cursor->parent = parent;        /* lock'd and ref'd */
1653                 hammer_rel_volume(volume, 0);
1654         }
1655         hammer_modify_node_done(node);
1656
1657         /*
1658          * Ok, now adjust the cursor depending on which element the original
1659          * index was pointing at.  If we are >= the split point the push node
1660          * is now in the new node.
1661          *
1662          * NOTE: If we are at the split point itself we cannot stay with the
1663          * original node because the push index will point at the right-hand
1664          * boundary, which is illegal.
1665          *
1666          * NOTE: The cursor's parent or parent_index must be adjusted for
1667          * the case where a new parent (new root) was created, and the case
1668          * where the cursor is now pointing at the split node.
1669          */
1670         if (cursor->index >= split) {
1671                 cursor->parent_index = parent_index + 1;
1672                 cursor->index -= split;
1673                 hammer_unlock(&cursor->node->lock);
1674                 hammer_rel_node(cursor->node);
1675                 cursor->node = new_node;        /* locked and ref'd */
1676         } else {
1677                 cursor->parent_index = parent_index;
1678                 hammer_unlock(&new_node->lock);
1679                 hammer_rel_node(new_node);
1680         }
1681
1682         /*
1683          * Fixup left and right bounds
1684          */
1685         parent_elm = &parent->ondisk->elms[cursor->parent_index];
1686         cursor->left_bound = &parent_elm[0].internal.base;
1687         cursor->right_bound = &parent_elm[1].internal.base;
1688         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1689                  &cursor->node->ondisk->elms[0].internal.base) <= 0);
1690         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1691                  &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);
1692
1693 done:
1694         hammer_btree_unlock_children(cursor, &lockroot);
1695         hammer_cursor_downgrade(cursor);
1696         return (error);
1697 }
1698
1699 /*
1700  * Same as the above, but splits a full leaf node.
1701  *
1702  * This function
1703  */
1704 static
1705 int
1706 btree_split_leaf(hammer_cursor_t cursor)
1707 {
1708         hammer_node_ondisk_t ondisk;
1709         hammer_node_t parent;
1710         hammer_node_t leaf;
1711         hammer_mount_t hmp;
1712         hammer_node_t new_leaf;
1713         hammer_btree_elm_t elm;
1714         hammer_btree_elm_t parent_elm;
1715         hammer_base_elm_t mid_boundary;
1716         hammer_off_t hint;
1717         int parent_index;
1718         int made_root;
1719         int split;
1720         int error;
1721         const size_t esize = sizeof(*elm);
1722
1723         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1724                 return(error);
1725         ++hammer_stats_btree_splits;
1726
1727         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1728                  &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1729         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1730                  &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1731
1732         /* 
1733          * Calculate the split point.  If the insertion point is at the
1734          * end of the leaf we adjust the split point significantly to the
1735          * right to try to optimize node fill and flag it.  If we hit
1736          * that same leaf again our heuristic failed and we don't try
1737          * to optimize node fill (it could lead to a degenerate case).
1738          *
1739          * Spikes are made up of two leaf elements which cannot be
1740          * safely split.
1741          */
1742         leaf = cursor->node;
1743         ondisk = leaf->ondisk;
1744         KKASSERT(ondisk->count > 4);
1745         if (cursor->index == ondisk->count &&
1746             (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) {
1747                 split = (ondisk->count + 1) * 3 / 4;
1748                 leaf->flags |= HAMMER_NODE_NONLINEAR;
1749         } else {
1750                 split = (ondisk->count + 1) / 2;
1751         }
1752
1753 #if 0
1754         /*
1755          * If the insertion point is at the split point shift the
1756          * split point left so we don't have to worry about
1757          */
1758         if (cursor->index == split)
1759                 --split;
1760 #endif
1761         KKASSERT(split > 0 && split < ondisk->count);
1762
1763         error = 0;
1764         hmp = leaf->hmp;
1765
1766         elm = &ondisk->elms[split];
1767
1768         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
1769         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
1770         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
1771         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);
1772
1773         /*
1774          * If we are at the root of the tree, create a new root node with
1775          * 1 element and split normally.  Avoid making major modifications
1776          * until we know the whole operation will work.
1777          */
1778         if (ondisk->parent == 0) {
1779                 parent = hammer_alloc_btree(cursor->trans, leaf->node_offset,
1780                                             &error);
1781                 if (parent == NULL)
1782                         goto done;
1783                 hammer_lock_ex(&parent->lock);
1784                 hammer_modify_node_noundo(cursor->trans, parent);
1785                 ondisk = parent->ondisk;
1786                 ondisk->count = 1;
1787                 ondisk->parent = 0;
1788                 ondisk->mirror_tid = leaf->ondisk->mirror_tid;
1789                 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1790                 ondisk->elms[0].base = hmp->root_btree_beg;
1791                 ondisk->elms[0].base.btype = leaf->ondisk->type;
1792                 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1793                 ondisk->elms[1].base = hmp->root_btree_end;
1794                 /* ondisk->elms[1].base.btype = not used */
1795                 hammer_modify_node_done(parent);
1796                 made_root = 1;
1797                 parent_index = 0;       /* insertion point in parent */
1798         } else {
1799                 made_root = 0;
1800                 parent = cursor->parent;
1801                 parent_index = cursor->parent_index;
1802         }
1803
1804         /*
1805          * Calculate a hint for the allocation of the new B-Tree leaf node.
1806          * For now just try to localize it within the same bigblock as
1807          * the current leaf.
1808          *
1809          * If the insertion point is at the end of the leaf we recognize a
1810          * likely append sequence of some sort (data, meta-data, inodes,
1811          * whatever).  Set the hint to zero to allocate out of linear space
1812          * instead of trying to completely fill previously hinted space.
1813          *
1814          * This also sets the stage for recursive splits to localize using
1815          * the new space.
1816          */
1817         ondisk = leaf->ondisk;
1818         if (cursor->index == ondisk->count)
1819                 hint = 0;
1820         else
1821                 hint = leaf->node_offset;
1822
1823         /*
1824          * Split leaf into new_leaf at the split point.  Select a separator
1825          * value in-between the two leafs but with a bent towards the right
1826          * leaf since comparisons use an 'elm >= separator' inequality.
1827          *
1828          *  L L L L L L L L
1829          *
1830          *       x x P x x
1831          *        s S S s  
1832          *         /   \
1833          *  L L L L     L L L L
1834          */
1835         new_leaf = hammer_alloc_btree(cursor->trans, hint, &error);
1836         if (new_leaf == NULL) {
1837                 if (made_root) {
1838                         hammer_unlock(&parent->lock);
1839                         hammer_delete_node(cursor->trans, parent);
1840                         hammer_rel_node(parent);
1841                 }
1842                 goto done;
1843         }
1844         hammer_lock_ex(&new_leaf->lock);
1845
1846         /*
1847          * Create the new node and copy the leaf elements from the split 
1848          * point on to the new node.
1849          */
1850         hammer_modify_node_all(cursor->trans, leaf);
1851         hammer_modify_node_noundo(cursor->trans, new_leaf);
1852         ondisk = leaf->ondisk;
1853         elm = &ondisk->elms[split];
1854         bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1855         new_leaf->ondisk->count = ondisk->count - split;
1856         new_leaf->ondisk->parent = parent->node_offset;
1857         new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1858         new_leaf->ondisk->mirror_tid = ondisk->mirror_tid;
1859         KKASSERT(ondisk->type == new_leaf->ondisk->type);
1860         hammer_modify_node_done(new_leaf);
1861         hammer_cursor_split_node(leaf, new_leaf, split);
1862
1863         /*
1864          * Cleanup the original node.  Because this is a leaf node and
1865          * leaf nodes do not have a right-hand boundary, there
1866          * aren't any special edge cases to clean up.  We just fixup the
1867          * count.
1868          */
1869         ondisk->count = split;
1870
1871         /*
1872          * Insert the separator into the parent, fixup the parent's
1873          * reference to the original node, and reference the new node.
1874          * The separator is P.
1875          *
1876          * Remember that base.count does not include the right-hand boundary.
1877          * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1878          */
1879         hammer_modify_node_all(cursor->trans, parent);
1880         ondisk = parent->ondisk;
1881         KKASSERT(split != 0);
1882         KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1883         parent_elm = &ondisk->elms[parent_index+1];
1884         bcopy(parent_elm, parent_elm + 1,
1885               (ondisk->count - parent_index) * esize);
1886
1887         hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1888         parent_elm->internal.base.btype = new_leaf->ondisk->type;
1889         parent_elm->internal.subtree_offset = new_leaf->node_offset;
1890         parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid;
1891         mid_boundary = &parent_elm->base;
1892         ++ondisk->count;
1893         hammer_modify_node_done(parent);
1894         hammer_cursor_inserted_element(parent, parent_index + 1);
1895
1896         /*
1897          * The filesystem's root B-Tree pointer may have to be updated.
1898          */
1899         if (made_root) {
1900                 hammer_volume_t volume;
1901
1902                 volume = hammer_get_root_volume(hmp, &error);
1903                 KKASSERT(error == 0);
1904
1905                 hammer_modify_volume_field(cursor->trans, volume,
1906                                            vol0_btree_root);
1907                 volume->ondisk->vol0_btree_root = parent->node_offset;
1908                 hammer_modify_volume_done(volume);
1909                 leaf->ondisk->parent = parent->node_offset;
1910                 if (cursor->parent) {
1911                         hammer_unlock(&cursor->parent->lock);
1912                         hammer_rel_node(cursor->parent);
1913                 }
1914                 cursor->parent = parent;        /* lock'd and ref'd */
1915                 hammer_rel_volume(volume, 0);
1916         }
1917         hammer_modify_node_done(leaf);
1918
1919         /*
1920          * Ok, now adjust the cursor depending on which element the original
1921          * index was pointing at.  If we are >= the split point the push node
1922          * is now in the new node.
1923          *
1924          * NOTE: If we are at the split point itself we need to select the
1925          * old or new node based on where key_beg's insertion point will be.
1926          * If we pick the wrong side the inserted element will wind up in
1927          * the wrong leaf node and outside that node's bounds.
1928          */
1929         if (cursor->index > split ||
1930             (cursor->index == split &&
1931              hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) {
1932                 cursor->parent_index = parent_index + 1;
1933                 cursor->index -= split;
1934                 hammer_unlock(&cursor->node->lock);
1935                 hammer_rel_node(cursor->node);
1936                 cursor->node = new_leaf;
1937         } else {
1938                 cursor->parent_index = parent_index;
1939                 hammer_unlock(&new_leaf->lock);
1940                 hammer_rel_node(new_leaf);
1941         }
1942
1943         /*
1944          * Fixup left and right bounds
1945          */
1946         parent_elm = &parent->ondisk->elms[cursor->parent_index];
1947         cursor->left_bound = &parent_elm[0].internal.base;
1948         cursor->right_bound = &parent_elm[1].internal.base;
1949
1950         /*
1951          * Assert that the bounds are correct.
1952          */
1953         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1954                  &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1955         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1956                  &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1957         KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0);
1958         KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0);
1959
1960 done:
1961         hammer_cursor_downgrade(cursor);
1962         return (error);
1963 }
1964
1965 #if 0
1966
1967 /*
1968  * Recursively correct the right-hand boundary's create_tid to (tid) as
1969  * long as the rest of the key matches.  We have to recurse upward in
1970  * the tree as well as down the left side of each parent's right node.
1971  *
1972  * Return EDEADLK if we were only partially successful, forcing the caller
1973  * to try again.  The original cursor is not modified.  This routine can
1974  * also fail with EDEADLK if it is forced to throw away a portion of its
1975  * record history.
1976  *
1977  * The caller must pass a downgraded cursor to us (otherwise we can't dup it).
1978  */
1979 struct hammer_rhb {
1980         TAILQ_ENTRY(hammer_rhb) entry;
1981         hammer_node_t   node;
1982         int             index;
1983 };
1984
1985 TAILQ_HEAD(hammer_rhb_list, hammer_rhb);
1986
1987 int
1988 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid)
1989 {
1990         struct hammer_mount *hmp;
1991         struct hammer_rhb_list rhb_list;
1992         hammer_base_elm_t elm;
1993         hammer_node_t orig_node;
1994         struct hammer_rhb *rhb;
1995         int orig_index;
1996         int error;
1997
1998         TAILQ_INIT(&rhb_list);
1999         hmp = cursor->trans->hmp;
2000
2001         /*
2002          * Save our position so we can restore it on return.  This also
2003          * gives us a stable 'elm'.
2004          */
2005         orig_node = cursor->node;
2006         hammer_ref_node(orig_node);
2007         hammer_lock_sh(&orig_node->lock);
2008         orig_index = cursor->index;
2009         elm = &orig_node->ondisk->elms[orig_index].base;
2010
2011         /*
2012          * Now build a list of parents going up, allocating a rhb
2013          * structure for each one.
2014          */
2015         while (cursor->parent) {
2016                 /*
2017                  * Stop if we no longer have any right-bounds to fix up
2018                  */
2019                 if (elm->obj_id != cursor->right_bound->obj_id ||
2020                     elm->rec_type != cursor->right_bound->rec_type ||
2021                     elm->key != cursor->right_bound->key) {
2022                         break;
2023                 }
2024
2025                 /*
2026                  * Stop if the right-hand bound's create_tid does not
2027                  * need to be corrected.
2028                  */
2029                 if (cursor->right_bound->create_tid >= tid)
2030                         break;
2031
2032                 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
2033                 rhb->node = cursor->parent;
2034                 rhb->index = cursor->parent_index;
2035                 hammer_ref_node(rhb->node);
2036                 hammer_lock_sh(&rhb->node->lock);
2037                 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
2038
2039                 hammer_cursor_up(cursor);
2040         }
2041
2042         /*
2043          * now safely adjust the right hand bound for each rhb.  This may
2044          * also require taking the right side of the tree and iterating down
2045          * ITS left side.
2046          */
2047         error = 0;
2048         while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2049                 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
2050                 if (error)
2051                         break;
2052                 TAILQ_REMOVE(&rhb_list, rhb, entry);
2053                 hammer_unlock(&rhb->node->lock);
2054                 hammer_rel_node(rhb->node);
2055                 kfree(rhb, hmp->m_misc);
2056
2057                 switch (cursor->node->ondisk->type) {
2058                 case HAMMER_BTREE_TYPE_INTERNAL:
2059                         /*
2060                          * Right-boundary for parent at internal node
2061                          * is one element to the right of the element whos
2062                          * right boundary needs adjusting.  We must then
2063                          * traverse down the left side correcting any left
2064                          * bounds (which may now be too far to the left).
2065                          */
2066                         ++cursor->index;
2067                         error = hammer_btree_correct_lhb(cursor, tid);
2068                         break;
2069                 default:
2070                         panic("hammer_btree_correct_rhb(): Bad node type");
2071                         error = EINVAL;
2072                         break;
2073                 }
2074         }
2075
2076         /*
2077          * Cleanup
2078          */
2079         while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2080                 TAILQ_REMOVE(&rhb_list, rhb, entry);
2081                 hammer_unlock(&rhb->node->lock);
2082                 hammer_rel_node(rhb->node);
2083                 kfree(rhb, hmp->m_misc);
2084         }
2085         error = hammer_cursor_seek(cursor, orig_node, orig_index);
2086         hammer_unlock(&orig_node->lock);
2087         hammer_rel_node(orig_node);
2088         return (error);
2089 }
2090
2091 /*
2092  * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand
2093  * bound going downward starting at the current cursor position.
2094  *
2095  * This function does not restore the cursor after use.
2096  */
2097 int
2098 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
2099 {
2100         struct hammer_rhb_list rhb_list;
2101         hammer_base_elm_t elm;
2102         hammer_base_elm_t cmp;
2103         struct hammer_rhb *rhb;
2104         struct hammer_mount *hmp;
2105         int error;
2106
2107         TAILQ_INIT(&rhb_list);
2108         hmp = cursor->trans->hmp;
2109
2110         cmp = &cursor->node->ondisk->elms[cursor->index].base;
2111
2112         /*
2113          * Record the node and traverse down the left-hand side for all
2114          * matching records needing a boundary correction.
2115          */
2116         error = 0;
2117         for (;;) {
2118                 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
2119                 rhb->node = cursor->node;
2120                 rhb->index = cursor->index;
2121                 hammer_ref_node(rhb->node);
2122                 hammer_lock_sh(&rhb->node->lock);
2123                 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
2124
2125                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2126                         /*
2127                          * Nothing to traverse down if we are at the right
2128                          * boundary of an internal node.
2129                          */
2130                         if (cursor->index == cursor->node->ondisk->count)
2131                                 break;
2132                 } else {
2133                         elm = &cursor->node->ondisk->elms[cursor->index].base;
2134                         if (elm->btype == HAMMER_BTREE_TYPE_RECORD)
2135                                 break;
2136                         panic("Illegal leaf record type %02x", elm->btype);
2137                 }
2138                 error = hammer_cursor_down(cursor);
2139                 if (error)
2140                         break;
2141
2142                 elm = &cursor->node->ondisk->elms[cursor->index].base;
2143                 if (elm->obj_id != cmp->obj_id ||
2144                     elm->rec_type != cmp->rec_type ||
2145                     elm->key != cmp->key) {
2146                         break;
2147                 }
2148                 if (elm->create_tid >= tid)
2149                         break;
2150
2151         }
2152
2153         /*
2154          * Now we can safely adjust the left-hand boundary from the bottom-up.
2155          * The last element we remove from the list is the caller's right hand
2156          * boundary, which must also be adjusted.
2157          */
2158         while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2159                 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
2160                 if (error)
2161                         break;
2162                 TAILQ_REMOVE(&rhb_list, rhb, entry);
2163                 hammer_unlock(&rhb->node->lock);
2164                 hammer_rel_node(rhb->node);
2165                 kfree(rhb, hmp->m_misc);
2166
2167                 elm = &cursor->node->ondisk->elms[cursor->index].base;
2168                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2169                         hammer_modify_node(cursor->trans, cursor->node,
2170                                            &elm->create_tid,
2171                                            sizeof(elm->create_tid));
2172                         elm->create_tid = tid;
2173                         hammer_modify_node_done(cursor->node);
2174                 } else {
2175                         panic("hammer_btree_correct_lhb(): Bad element type");
2176                 }
2177         }
2178
2179         /*
2180          * Cleanup
2181          */
2182         while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2183                 TAILQ_REMOVE(&rhb_list, rhb, entry);
2184                 hammer_unlock(&rhb->node->lock);
2185                 hammer_rel_node(rhb->node);
2186                 kfree(rhb, hmp->m_misc);
2187         }
2188         return (error);
2189 }
2190
2191 #endif
2192
2193 /*
2194  * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at
2195  * (cursor->node).  Returns 0 on success, EDEADLK if we could not complete
2196  * the operation due to a deadlock, or some other error.
2197  *
2198  * This routine is initially called with an empty leaf and may be
2199  * recursively called with single-element internal nodes.
2200  *
2201  * It should also be noted that when removing empty leaves we must be sure
2202  * to test and update mirror_tid because another thread may have deadlocked
2203  * against us (or someone) trying to propagate it up and cannot retry once
2204  * the node has been deleted.
2205  *
2206  * On return the cursor may end up pointing to an internal node, suitable
2207  * for further iteration but not for an immediate insertion or deletion.
2208  */
2209 static int
2210 btree_remove(hammer_cursor_t cursor)
2211 {
2212         hammer_node_ondisk_t ondisk;
2213         hammer_btree_elm_t elm;
2214         hammer_node_t node;
2215         hammer_node_t parent;
2216         const int esize = sizeof(*elm);
2217         int error;
2218
2219         node = cursor->node;
2220
2221         /*
2222          * When deleting the root of the filesystem convert it to
2223          * an empty leaf node.  Internal nodes cannot be empty.
2224          */
2225         ondisk = node->ondisk;
2226         if (ondisk->parent == 0) {
2227                 KKASSERT(cursor->parent == NULL);
2228                 hammer_modify_node_all(cursor->trans, node);
2229                 KKASSERT(ondisk == node->ondisk);
2230                 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
2231                 ondisk->count = 0;
2232                 hammer_modify_node_done(node);
2233                 cursor->index = 0;
2234                 return(0);
2235         }
2236
2237         parent = cursor->parent;
2238
2239         /*
2240          * Attempt to remove the parent's reference to the child.  If the
2241          * parent would become empty we have to recurse.  If we fail we 
2242          * leave the parent pointing to an empty leaf node.
2243          *
2244          * We have to recurse successfully before we can delete the internal
2245          * node as it is illegal to have empty internal nodes.  Even though
2246          * the operation may be aborted we must still fixup any unlocked
2247          * cursors as if we had deleted the element prior to recursing
2248          * (by calling hammer_cursor_deleted_element()) so those cursors
2249          * are properly forced up the chain by the recursion.
2250          */
2251         if (parent->ondisk->count == 1) {
2252                 /*
2253                  * This special cursor_up_locked() call leaves the original
2254                  * node exclusively locked and referenced, leaves the
2255                  * original parent locked (as the new node), and locks the
2256                  * new parent.  It can return EDEADLK.
2257                  *
2258                  * We cannot call hammer_cursor_removed_node() until we are
2259                  * actually able to remove the node.  If we did then tracked
2260                  * cursors in the middle of iterations could be repointed
2261                  * to a parent node.  If this occurs they could end up
2262                  * scanning newly inserted records into the node (that could
2263                  * not be deleted) when they push down again.
2264                  *
2265                  * Due to the way the recursion works the final parent is left
2266                  * in cursor->parent after the recursion returns.  Each
2267                  * layer on the way back up is thus able to call
2268                  * hammer_cursor_removed_node() and 'jump' the node up to
2269                  * the (same) final parent.
2270                  *
2271                  * NOTE!  The local variable 'parent' is invalid after we
2272                  *        call hammer_cursor_up_locked().
2273                  */
2274                 error = hammer_cursor_up_locked(cursor);
2275                 parent = NULL;
2276
2277                 if (error == 0) {
2278                         hammer_cursor_deleted_element(cursor->node, 0);
2279                         error = btree_remove(cursor);
2280                         if (error == 0) {
2281                                 KKASSERT(node != cursor->node);
2282                                 hammer_cursor_removed_node(
2283                                         node, cursor->node,
2284                                         cursor->index);
2285                                 hammer_modify_node_all(cursor->trans, node);
2286                                 ondisk = node->ondisk;
2287                                 ondisk->type = HAMMER_BTREE_TYPE_DELETED;
2288                                 ondisk->count = 0;
2289                                 hammer_modify_node_done(node);
2290                                 hammer_flush_node(node);
2291                                 hammer_delete_node(cursor->trans, node);
2292                         } else {
2293                                 /*
2294                                  * Defer parent removal because we could not
2295                                  * get the lock, just let the leaf remain
2296                                  * empty.
2297                                  */
2298                                 /**/
2299                         }
2300                         hammer_unlock(&node->lock);
2301                         hammer_rel_node(node);
2302                 } else {
2303                         /*
2304                          * Defer parent removal because we could not
2305                          * get the lock, just let the leaf remain
2306                          * empty.
2307                          */
2308                         /**/
2309                 }
2310         } else {
2311                 KKASSERT(parent->ondisk->count > 1);
2312
2313                 hammer_modify_node_all(cursor->trans, parent);
2314                 ondisk = parent->ondisk;
2315                 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
2316
2317                 elm = &ondisk->elms[cursor->parent_index];
2318                 KKASSERT(elm->internal.subtree_offset == node->node_offset);
2319                 KKASSERT(ondisk->count > 0);
2320
2321                 /*
2322                  * We must retain the highest mirror_tid.  The deleted
2323                  * range is now encompassed by the element to the left.
2324                  * If we are already at the left edge the new left edge
2325                  * inherits mirror_tid.
2326                  *
2327                  * Note that bounds of the parent to our parent may create
2328                  * a gap to the left of our left-most node or to the right
2329                  * of our right-most node.  The gap is silently included
2330                  * in the mirror_tid's area of effect from the point of view
2331                  * of the scan.
2332                  */
2333                 if (cursor->parent_index) {
2334                         if (elm[-1].internal.mirror_tid <
2335                             elm[0].internal.mirror_tid) {
2336                                 elm[-1].internal.mirror_tid =
2337                                     elm[0].internal.mirror_tid;
2338                         }
2339                 } else {
2340                         if (elm[1].internal.mirror_tid <
2341                             elm[0].internal.mirror_tid) {
2342                                 elm[1].internal.mirror_tid =
2343                                     elm[0].internal.mirror_tid;
2344                         }
2345                 }
2346
2347                 /*
2348                  * Delete the subtree reference in the parent.  Include
2349                  * boundary element at end.
2350                  */
2351                 bcopy(&elm[1], &elm[0],
2352                       (ondisk->count - cursor->parent_index) * esize);
2353                 --ondisk->count;
2354                 hammer_modify_node_done(parent);
2355                 hammer_cursor_removed_node(node, parent, cursor->parent_index);
2356                 hammer_cursor_deleted_element(parent, cursor->parent_index);
2357                 hammer_flush_node(node);
2358                 hammer_delete_node(cursor->trans, node);
2359
2360                 /*
2361                  * cursor->node is invalid, cursor up to make the cursor
2362                  * valid again.
2363                  */
2364                 error = hammer_cursor_up(cursor);
2365         }
2366         return (error);
2367 }
2368
2369 /*
2370  * Propagate cursor->trans->tid up the B-Tree starting at the current
2371  * cursor position using pseudofs info gleaned from the passed inode.
2372  *
2373  * The passed inode has no relationship to the cursor position other
2374  * then being in the same pseudofs as the insertion or deletion we
2375  * are propagating the mirror_tid for.
2376  *
2377  * WARNING!  Because we push and pop the passed cursor, it may be
2378  *           modified by other B-Tree operations while it is unlocked
2379  *           and things like the node & leaf pointers, and indexes might
2380  *           change.
2381  */
2382 void
2383 hammer_btree_do_propagation(hammer_cursor_t cursor,
2384                             hammer_pseudofs_inmem_t pfsm,
2385                             hammer_btree_leaf_elm_t leaf)
2386 {
2387         hammer_cursor_t ncursor;
2388         hammer_tid_t mirror_tid;
2389         int error;
2390
2391         /*
2392          * We do not propagate a mirror_tid if the filesystem was mounted
2393          * in no-mirror mode.
2394          */
2395         if (cursor->trans->hmp->master_id < 0)
2396                 return;
2397
2398         /*
2399          * This is a bit of a hack because we cannot deadlock or return
2400          * EDEADLK here.  The related operation has already completed and
2401          * we must propagate the mirror_tid now regardless.
2402          *
2403          * Generate a new cursor which inherits the original's locks and
2404          * unlock the original.  Use the new cursor to propagate the
2405          * mirror_tid.  Then clean up the new cursor and reacquire locks
2406          * on the original.
2407          *
2408          * hammer_dup_cursor() cannot dup locks.  The dup inherits the
2409          * original's locks and the original is tracked and must be
2410          * re-locked.
2411          */
2412         mirror_tid = cursor->node->ondisk->mirror_tid;
2413         KKASSERT(mirror_tid != 0);
2414         ncursor = hammer_push_cursor(cursor);
2415         error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
2416         KKASSERT(error == 0);
2417         hammer_pop_cursor(cursor, ncursor);
2418         /* WARNING: cursor's leaf pointer may change after pop */
2419 }
2420
2421
2422 /*
2423  * Propagate a mirror TID update upwards through the B-Tree to the root.
2424  *
2425  * A locked internal node must be passed in.  The node will remain locked
2426  * on return.
2427  *
2428  * This function syncs mirror_tid at the specified internal node's element,
2429  * adjusts the node's aggregation mirror_tid, and then recurses upwards.
2430  */
2431 static int
2432 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid)
2433 {
2434         hammer_btree_internal_elm_t elm;
2435         hammer_node_t node;
2436         int error;
2437
2438         for (;;) {
2439                 error = hammer_cursor_up(cursor);
2440                 if (error == 0)
2441                         error = hammer_cursor_upgrade(cursor);
2442                 while (error == EDEADLK) {
2443                         hammer_recover_cursor(cursor);
2444                         error = hammer_cursor_upgrade(cursor);
2445                 }
2446                 if (error)
2447                         break;
2448                 node = cursor->node;
2449                 KKASSERT (node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
2450
2451                 /*
2452                  * Adjust the node's element
2453                  */
2454                 elm = &node->ondisk->elms[cursor->index].internal;
2455                 if (elm->mirror_tid >= mirror_tid)
2456                         break;
2457                 hammer_modify_node(cursor->trans, node, &elm->mirror_tid,
2458                                    sizeof(elm->mirror_tid));
2459                 elm->mirror_tid = mirror_tid;
2460                 hammer_modify_node_done(node);
2461                 if (hammer_debug_general & 0x0002) {
2462                         kprintf("mirror_propagate: propagate "
2463                                 "%016llx @%016llx:%d\n",
2464                                 (long long)mirror_tid,
2465                                 (long long)node->node_offset,
2466                                 cursor->index);
2467                 }
2468
2469
2470                 /*
2471                  * Adjust the node's mirror_tid aggregator
2472                  */
2473                 if (node->ondisk->mirror_tid >= mirror_tid)
2474                         return(0);
2475                 hammer_modify_node_field(cursor->trans, node, mirror_tid);
2476                 node->ondisk->mirror_tid = mirror_tid;
2477                 hammer_modify_node_done(node);
2478                 if (hammer_debug_general & 0x0002) {
2479                         kprintf("mirror_propagate: propagate "
2480                                 "%016llx @%016llx\n",
2481                                 (long long)mirror_tid,
2482                                 (long long)node->node_offset);
2483                 }
2484         }
2485         if (error == ENOENT)
2486                 error = 0;
2487         return(error);
2488 }
2489
2490 hammer_node_t
2491 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node,
2492                         int *parent_indexp, int *errorp, int try_exclusive)
2493 {
2494         hammer_node_t parent;
2495         hammer_btree_elm_t elm;
2496         int i;
2497
2498         /*
2499          * Get the node
2500          */
2501         parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp);
2502         if (*errorp) {
2503                 KKASSERT(parent == NULL);
2504                 return(NULL);
2505         }
2506         KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0);
2507
2508         /*
2509          * Lock the node
2510          */
2511         if (try_exclusive) {
2512                 if (hammer_lock_ex_try(&parent->lock)) {
2513                         hammer_rel_node(parent);
2514                         *errorp = EDEADLK;
2515                         return(NULL);
2516                 }
2517         } else {
2518                 hammer_lock_sh(&parent->lock);
2519         }
2520
2521         /*
2522          * Figure out which element in the parent is pointing to the
2523          * child.
2524          */
2525         if (node->ondisk->count) {
2526                 i = hammer_btree_search_node(&node->ondisk->elms[0].base,
2527                                              parent->ondisk);
2528         } else {
2529                 i = 0;
2530         }
2531         while (i < parent->ondisk->count) {
2532                 elm = &parent->ondisk->elms[i];
2533                 if (elm->internal.subtree_offset == node->node_offset)
2534                         break;
2535                 ++i;
2536         }
2537         if (i == parent->ondisk->count) {
2538                 hammer_unlock(&parent->lock);
2539                 panic("Bad B-Tree link: parent %p node %p\n", parent, node);
2540         }
2541         *parent_indexp = i;
2542         KKASSERT(*errorp == 0);
2543         return(parent);
2544 }
2545
2546 /*
2547  * The element (elm) has been moved to a new internal node (node).
2548  *
2549  * If the element represents a pointer to an internal node that node's
2550  * parent must be adjusted to the element's new location.
2551  *
2552  * XXX deadlock potential here with our exclusive locks
2553  */
2554 int
2555 btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
2556                  hammer_btree_elm_t elm)
2557 {
2558         hammer_node_t child;
2559         int error;
2560
2561         error = 0;
2562
2563         switch(elm->base.btype) {
2564         case HAMMER_BTREE_TYPE_INTERNAL:
2565         case HAMMER_BTREE_TYPE_LEAF:
2566                 child = hammer_get_node(trans, elm->internal.subtree_offset,
2567                                         0, &error);
2568                 if (error == 0) {
2569                         hammer_modify_node_field(trans, child, parent);
2570                         child->ondisk->parent = node->node_offset;
2571                         hammer_modify_node_done(child);
2572                         hammer_rel_node(child);
2573                 }
2574                 break;
2575         default:
2576                 break;
2577         }
2578         return(error);
2579 }
2580
2581 /*
2582  * Initialize the root of a recursive B-Tree node lock list structure.
2583  */
2584 void
2585 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
2586 {
2587         TAILQ_INIT(&parent->list);
2588         parent->parent = NULL;
2589         parent->node = node;
2590         parent->index = -1;
2591         parent->count = node->ondisk->count;
2592         parent->copy = NULL;
2593         parent->flags = 0;
2594 }
2595
2596 /*
2597  * Exclusively lock all the children of node.  This is used by the split
2598  * code to prevent anyone from accessing the children of a cursor node
2599  * while we fix-up its parent offset.
2600  *
2601  * If we don't lock the children we can really mess up cursors which block
2602  * trying to cursor-up into our node.
2603  *
2604  * On failure EDEADLK (or some other error) is returned.  If a deadlock
2605  * error is returned the cursor is adjusted to block on termination.
2606  *
2607  * The caller is responsible for managing parent->node, the root's node
2608  * is usually aliased from a cursor.
2609  */
2610 int
2611 hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
2612                            hammer_node_lock_t parent)
2613 {
2614         hammer_node_t node;
2615         hammer_node_lock_t item;
2616         hammer_node_ondisk_t ondisk;
2617         hammer_btree_elm_t elm;
2618         hammer_node_t child;
2619         struct hammer_mount *hmp;
2620         int error;
2621         int i;
2622
2623         node = parent->node;
2624         ondisk = node->ondisk;
2625         error = 0;
2626         hmp = cursor->trans->hmp;
2627
2628         /*
2629          * We really do not want to block on I/O with exclusive locks held,
2630          * pre-get the children before trying to lock the mess.  This is
2631          * only done one-level deep for now.
2632          */
2633         for (i = 0; i < ondisk->count; ++i) {
2634                 ++hammer_stats_btree_elements;
2635                 elm = &ondisk->elms[i];
2636                 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF &&
2637                     elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
2638                         continue;
2639                 }
2640                 child = hammer_get_node(cursor->trans,
2641                                         elm->internal.subtree_offset,
2642                                         0, &error);
2643                 if (child)
2644                         hammer_rel_node(child);
2645         }
2646
2647         /*
2648          * Do it for real
2649          */
2650         for (i = 0; error == 0 && i < ondisk->count; ++i) {
2651                 ++hammer_stats_btree_elements;
2652                 elm = &ondisk->elms[i];
2653
2654                 switch(elm->base.btype) {
2655                 case HAMMER_BTREE_TYPE_INTERNAL:
2656                 case HAMMER_BTREE_TYPE_LEAF:
2657                         KKASSERT(elm->internal.subtree_offset != 0);
2658                         child = hammer_get_node(cursor->trans,
2659                                                 elm->internal.subtree_offset,
2660                                                 0, &error);
2661                         break;
2662                 default:
2663                         child = NULL;
2664                         break;
2665                 }
2666                 if (child) {
2667                         if (hammer_lock_ex_try(&child->lock) != 0) {
2668                                 if (cursor->deadlk_node == NULL) {
2669                                         cursor->deadlk_node = child;
2670                                         hammer_ref_node(cursor->deadlk_node);
2671                                 }
2672                                 error = EDEADLK;
2673                                 hammer_rel_node(child);
2674                         } else {
2675                                 item = kmalloc(sizeof(*item), hmp->m_misc,
2676                                                M_WAITOK|M_ZERO);
2677                                 TAILQ_INSERT_TAIL(&parent->list, item, entry);
2678                                 TAILQ_INIT(&item->list);
2679                                 item->parent = parent;
2680                                 item->node = child;
2681                                 item->index = i;
2682                                 item->count = child->ondisk->count;
2683
2684                                 /*
2685                                  * Recurse (used by the rebalancing code)
2686                                  */
2687                                 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) {
2688                                         error = hammer_btree_lock_children(
2689                                                         cursor,
2690                                                         depth - 1,
2691                                                         item);
2692                                 }
2693                         }
2694                 }
2695         }
2696         if (error)
2697                 hammer_btree_unlock_children(cursor, parent);
2698         return(error);
2699 }
2700
2701 /*
2702  * Create an in-memory copy of all B-Tree nodes listed, recursively,
2703  * including the parent.
2704  */
2705 void
2706 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
2707 {
2708         hammer_mount_t hmp = cursor->trans->hmp;
2709         hammer_node_lock_t item;
2710
2711         if (parent->copy == NULL) {
2712                 parent->copy = kmalloc(sizeof(*parent->copy), hmp->m_misc,
2713                                        M_WAITOK);
2714                 *parent->copy = *parent->node->ondisk;
2715         }
2716         TAILQ_FOREACH(item, &parent->list, entry) {
2717                 hammer_btree_lock_copy(cursor, item);
2718         }
2719 }
2720
2721 /*
2722  * Recursively sync modified copies to the media.
2723  */
2724 int
2725 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
2726 {
2727         hammer_node_lock_t item;
2728         int count = 0;
2729
2730         if (parent->flags & HAMMER_NODE_LOCK_UPDATED) {
2731                 ++count;
2732                 hammer_modify_node_all(cursor->trans, parent->node);
2733                 *parent->node->ondisk = *parent->copy;
2734                 hammer_modify_node_done(parent->node);
2735                 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) {
2736                         hammer_flush_node(parent->node);
2737                         hammer_delete_node(cursor->trans, parent->node);
2738                 }
2739         }
2740         TAILQ_FOREACH(item, &parent->list, entry) {
2741                 count += hammer_btree_sync_copy(cursor, item);
2742         }
2743         return(count);
2744 }
2745
2746 /*
2747  * Release previously obtained node locks.  The caller is responsible for
2748  * cleaning up parent->node itself (its usually just aliased from a cursor),
2749  * but this function will take care of the copies.
2750  */
2751 void
2752 hammer_btree_unlock_children(hammer_cursor_t cursor, hammer_node_lock_t parent)
2753 {
2754         hammer_node_lock_t item;
2755
2756         if (parent->copy) {
2757                 kfree(parent->copy, cursor->trans->hmp->m_misc);
2758                 parent->copy = NULL;    /* safety */
2759         }
2760         while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
2761                 TAILQ_REMOVE(&parent->list, item, entry);
2762                 hammer_btree_unlock_children(cursor, item);
2763                 hammer_unlock(&item->node->lock);
2764                 hammer_rel_node(item->node);
2765                 kfree(item, cursor->trans->hmp->m_misc);
2766         }
2767 }
2768
2769 /************************************************************************
2770  *                         MISCELLANIOUS SUPPORT                        *
2771  ************************************************************************/
2772
2773 /*
2774  * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
2775  *
2776  * Note that for this particular function a return value of -1, 0, or +1
2777  * can denote a match if create_tid is otherwise discounted.  A create_tid
2778  * of zero is considered to be 'infinity' in comparisons.
2779  *
2780  * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
2781  */
2782 int
2783 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
2784 {
2785         if (key1->localization < key2->localization)
2786                 return(-5);
2787         if (key1->localization > key2->localization)
2788                 return(5);
2789
2790         if (key1->obj_id < key2->obj_id)
2791                 return(-4);
2792         if (key1->obj_id > key2->obj_id)
2793                 return(4);
2794
2795         if (key1->rec_type < key2->rec_type)
2796                 return(-3);
2797         if (key1->rec_type > key2->rec_type)
2798                 return(3);
2799
2800         if (key1->key < key2->key)
2801                 return(-2);
2802         if (key1->key > key2->key)
2803                 return(2);
2804
2805         /*
2806          * A create_tid of zero indicates a record which is undeletable
2807          * and must be considered to have a value of positive infinity.
2808          */
2809         if (key1->create_tid == 0) {
2810                 if (key2->create_tid == 0)
2811                         return(0);
2812                 return(1);
2813         }
2814         if (key2->create_tid == 0)
2815                 return(-1);
2816         if (key1->create_tid < key2->create_tid)
2817                 return(-1);
2818         if (key1->create_tid > key2->create_tid)
2819                 return(1);
2820         return(0);
2821 }
2822
2823 /*
2824  * Test a timestamp against an element to determine whether the
2825  * element is visible.  A timestamp of 0 means 'infinity'.
2826  */
2827 int
2828 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
2829 {
2830         if (asof == 0) {
2831                 if (base->delete_tid)
2832                         return(1);
2833                 return(0);
2834         }
2835         if (asof < base->create_tid)
2836                 return(-1);
2837         if (base->delete_tid && asof >= base->delete_tid)
2838                 return(1);
2839         return(0);
2840 }
2841
2842 /*
2843  * Create a separator half way inbetween key1 and key2.  For fields just
2844  * one unit apart, the separator will match key2.  key1 is on the left-hand
2845  * side and key2 is on the right-hand side.
2846  *
2847  * key2 must be >= the separator.  It is ok for the separator to match key2.
2848  *
2849  * NOTE: Even if key1 does not match key2, the separator may wind up matching
2850  * key2.
2851  *
2852  * NOTE: It might be beneficial to just scrap this whole mess and just
2853  * set the separator to key2.
2854  */
2855 #define MAKE_SEPARATOR(key1, key2, dest, field) \
2856         dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
2857
2858 static void
2859 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
2860                       hammer_base_elm_t dest)
2861 {
2862         bzero(dest, sizeof(*dest));
2863
2864         dest->rec_type = key2->rec_type;
2865         dest->key = key2->key;
2866         dest->obj_id = key2->obj_id;
2867         dest->create_tid = key2->create_tid;
2868
2869         MAKE_SEPARATOR(key1, key2, dest, localization);
2870         if (key1->localization == key2->localization) {
2871                 MAKE_SEPARATOR(key1, key2, dest, obj_id);
2872                 if (key1->obj_id == key2->obj_id) {
2873                         MAKE_SEPARATOR(key1, key2, dest, rec_type);
2874                         if (key1->rec_type == key2->rec_type) {
2875                                 MAKE_SEPARATOR(key1, key2, dest, key);
2876                                 /*
2877                                  * Don't bother creating a separator for
2878                                  * create_tid, which also conveniently avoids
2879                                  * having to handle the create_tid == 0
2880                                  * (infinity) case.  Just leave create_tid
2881                                  * set to key2.
2882                                  *
2883                                  * Worst case, dest matches key2 exactly,
2884                                  * which is acceptable.
2885                                  */
2886                         }
2887                 }
2888         }
2889 }
2890
2891 #undef MAKE_SEPARATOR
2892
2893 /*
2894  * Return whether a generic internal or leaf node is full
2895  */
2896 static int
2897 btree_node_is_full(hammer_node_ondisk_t node)
2898 {
2899         switch(node->type) {
2900         case HAMMER_BTREE_TYPE_INTERNAL:
2901                 if (node->count == HAMMER_BTREE_INT_ELMS)
2902                         return(1);
2903                 break;
2904         case HAMMER_BTREE_TYPE_LEAF:
2905                 if (node->count == HAMMER_BTREE_LEAF_ELMS)
2906                         return(1);
2907                 break;
2908         default:
2909                 panic("illegal btree subtype");
2910         }
2911         return(0);
2912 }
2913
2914 #if 0
2915 static int
2916 btree_max_elements(u_int8_t type)
2917 {
2918         if (type == HAMMER_BTREE_TYPE_LEAF)
2919                 return(HAMMER_BTREE_LEAF_ELMS);
2920         if (type == HAMMER_BTREE_TYPE_INTERNAL)
2921                 return(HAMMER_BTREE_INT_ELMS);
2922         panic("btree_max_elements: bad type %d\n", type);
2923 }
2924 #endif
2925
2926 void
2927 hammer_print_btree_node(hammer_node_ondisk_t ondisk)
2928 {
2929         hammer_btree_elm_t elm;
2930         int i;
2931
2932         kprintf("node %p count=%d parent=%016llx type=%c\n",
2933                 ondisk, ondisk->count,
2934                 (long long)ondisk->parent, ondisk->type);
2935
2936         /*
2937          * Dump both boundary elements if an internal node
2938          */
2939         if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2940                 for (i = 0; i <= ondisk->count; ++i) {
2941                         elm = &ondisk->elms[i];
2942                         hammer_print_btree_elm(elm, ondisk->type, i);
2943                 }
2944         } else {
2945                 for (i = 0; i < ondisk->count; ++i) {
2946                         elm = &ondisk->elms[i];
2947                         hammer_print_btree_elm(elm, ondisk->type, i);
2948                 }
2949         }
2950 }
2951
2952 void
2953 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
2954 {
2955         kprintf("  %2d", i);
2956         kprintf("\tobj_id       = %016llx\n", (long long)elm->base.obj_id);
2957         kprintf("\tkey          = %016llx\n", (long long)elm->base.key);
2958         kprintf("\tcreate_tid   = %016llx\n", (long long)elm->base.create_tid);
2959         kprintf("\tdelete_tid   = %016llx\n", (long long)elm->base.delete_tid);
2960         kprintf("\trec_type     = %04x\n", elm->base.rec_type);
2961         kprintf("\tobj_type     = %02x\n", elm->base.obj_type);
2962         kprintf("\tbtype        = %02x (%c)\n",
2963                 elm->base.btype,
2964                 (elm->base.btype ? elm->base.btype : '?'));
2965         kprintf("\tlocalization = %02x\n", elm->base.localization);
2966
2967         switch(type) {
2968         case HAMMER_BTREE_TYPE_INTERNAL:
2969                 kprintf("\tsubtree_off  = %016llx\n",
2970                         (long long)elm->internal.subtree_offset);
2971                 break;
2972         case HAMMER_BTREE_TYPE_RECORD:
2973                 kprintf("\tdata_offset  = %016llx\n",
2974                         (long long)elm->leaf.data_offset);
2975                 kprintf("\tdata_len     = %08x\n", elm->leaf.data_len);
2976                 kprintf("\tdata_crc     = %08x\n", elm->leaf.data_crc);
2977                 break;
2978         }
2979 }