HAMMER Utilities: Sync with 56D
[dragonfly.git] / sys / vfs / hammer / hammer_btree.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.56 2008/06/20 05:38:26 dillon Exp $
35  */
36
37 /*
38  * HAMMER B-Tree index
39  *
40  * HAMMER implements a modified B+Tree.  In documentation this will
41  * simply be refered to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
42  * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43  * of the tree), but adds two additional boundary elements which describe
44  * the left-most and right-most element a node is able to represent.  In
45  * otherwords, we have boundary elements at the two ends of a B-Tree node
46  * instead of sub-tree pointers.
47  *
48  * A B-Tree internal node looks like this:
49  *
50  *      B N N N N N N B   <-- boundary and internal elements
51  *       S S S S S S S    <-- subtree pointers
52  *
53  * A B-Tree leaf node basically looks like this:
54  *
55  *      L L L L L L L L   <-- leaf elemenets
56  *
57  * The radix for an internal node is 1 less then a leaf but we get a
58  * number of significant benefits for our troubles.
59  *
60  * The big benefit to using a B-Tree containing boundary information
61  * is that it is possible to cache pointers into the middle of the tree
62  * and not have to start searches, insertions, OR deletions at the root
63  * node.   In particular, searches are able to progress in a definitive
64  * direction from any point in the tree without revisting nodes.  This
65  * greatly improves the efficiency of many operations, most especially
66  * record appends.
67  *
68  * B-Trees also make the stacking of trees fairly straightforward.
69  *
70  * INSERTIONS:  A search performed with the intention of doing
71  * an insert will guarantee that the terminal leaf node is not full by
72  * splitting full nodes.  Splits occur top-down during the dive down the
73  * B-Tree.
74  *
75  * DELETIONS: A deletion makes no attempt to proactively balance the
76  * tree and will recursively remove nodes that become empty.  If a
77  * deadlock occurs a deletion may not be able to remove an empty leaf.
78  * Deletions never allow internal nodes to become empty (that would blow
79  * up the boundaries).
80  */
81 #include "hammer.h"
82 #include <sys/buf.h>
83 #include <sys/buf2.h>
84
85 static int btree_search(hammer_cursor_t cursor, int flags);
86 static int btree_split_internal(hammer_cursor_t cursor);
87 static int btree_split_leaf(hammer_cursor_t cursor);
88 static int btree_remove(hammer_cursor_t cursor);
89 static int btree_node_is_full(hammer_node_ondisk_t node);
90 static void hammer_make_separator(hammer_base_elm_t key1,
91                         hammer_base_elm_t key2, hammer_base_elm_t dest);
92
93 /*
94  * Iterate records after a search.  The cursor is iterated forwards past
95  * the current record until a record matching the key-range requirements
96  * is found.  ENOENT is returned if the iteration goes past the ending
97  * key. 
98  *
99  * The iteration is inclusive of key_beg and can be inclusive or exclusive
100  * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
101  *
102  * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
103  * may be modified by B-Tree functions.
104  *
105  * cursor->key_beg may or may not be modified by this function during
106  * the iteration.  XXX future - in case of an inverted lock we may have
107  * to reinitiate the lookup and set key_beg to properly pick up where we
108  * left off.
109  *
110  * NOTE!  EDEADLK *CANNOT* be returned by this procedure.
111  */
112 int
113 hammer_btree_iterate(hammer_cursor_t cursor)
114 {
115         hammer_node_ondisk_t node;
116         hammer_btree_elm_t elm;
117         int error;
118         int r;
119         int s;
120
121         /*
122          * Skip past the current record
123          */
124         node = cursor->node->ondisk;
125         if (node == NULL)
126                 return(ENOENT);
127         if (cursor->index < node->count && 
128             (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
129                 ++cursor->index;
130         }
131
132         /*
133          * Loop until an element is found or we are done.
134          */
135         for (;;) {
136                 /*
137                  * We iterate up the tree and then index over one element
138                  * while we are at the last element in the current node.
139                  *
140                  * If we are at the root of the filesystem, cursor_up
141                  * returns ENOENT.
142                  *
143                  * XXX this could be optimized by storing the information in
144                  * the parent reference.
145                  *
146                  * XXX we can lose the node lock temporarily, this could mess
147                  * up our scan.
148                  */
149                 ++hammer_stats_btree_iterations;
150                 if (cursor->index == node->count) {
151                         if (hammer_debug_btree) {
152                                 kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n",
153                                         cursor->node->node_offset,
154                                         cursor->index,
155                                         (cursor->parent ? cursor->parent->node_offset : -1),
156                                         cursor->parent_index,
157                                         curthread);
158                         }
159                         KKASSERT(cursor->parent == NULL || cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
160                         error = hammer_cursor_up(cursor);
161                         if (error)
162                                 break;
163                         /* reload stale pointer */
164                         node = cursor->node->ondisk;
165                         KKASSERT(cursor->index != node->count);
166
167                         /*
168                          * If we are reblocking we want to return internal
169                          * nodes.
170                          */
171                         if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
172                                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
173                                 return(0);
174                         }
175                         ++cursor->index;
176                         continue;
177                 }
178
179                 /*
180                  * Check internal or leaf element.  Determine if the record
181                  * at the cursor has gone beyond the end of our range.
182                  *
183                  * We recurse down through internal nodes.
184                  */
185                 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
186                         elm = &node->elms[cursor->index];
187                         r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
188                         s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
189                         if (hammer_debug_btree) {
190                                 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n",
191                                         cursor->node->node_offset,
192                                         cursor->index,
193                                         elm[0].internal.base.obj_id,
194                                         elm[0].internal.base.rec_type,
195                                         elm[0].internal.base.key,
196                                         elm[0].internal.base.localization,
197                                         r,
198                                         curthread
199                                 );
200                                 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
201                                         cursor->node->node_offset,
202                                         cursor->index + 1,
203                                         elm[1].internal.base.obj_id,
204                                         elm[1].internal.base.rec_type,
205                                         elm[1].internal.base.key,
206                                         elm[1].internal.base.localization,
207                                         s
208                                 );
209                         }
210
211                         if (r < 0) {
212                                 error = ENOENT;
213                                 break;
214                         }
215                         if (r == 0 && (cursor->flags &
216                                        HAMMER_CURSOR_END_INCLUSIVE) == 0) {
217                                 error = ENOENT;
218                                 break;
219                         }
220                         KKASSERT(s <= 0);
221
222                         /*
223                          * Better not be zero
224                          */
225                         KKASSERT(elm->internal.subtree_offset != 0);
226
227                         error = hammer_cursor_down(cursor);
228                         if (error)
229                                 break;
230                         KKASSERT(cursor->index == 0);
231                         /* reload stale pointer */
232                         node = cursor->node->ondisk;
233                         continue;
234                 } else {
235                         elm = &node->elms[cursor->index];
236                         r = hammer_btree_cmp(&cursor->key_end, &elm->base);
237                         if (hammer_debug_btree) {
238                                 kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
239                                         cursor->node->node_offset,
240                                         cursor->index,
241                                         (elm[0].leaf.base.btype ?
242                                          elm[0].leaf.base.btype : '?'),
243                                         elm[0].leaf.base.obj_id,
244                                         elm[0].leaf.base.rec_type,
245                                         elm[0].leaf.base.key,
246                                         elm[0].leaf.base.localization,
247                                         r
248                                 );
249                         }
250                         if (r < 0) {
251                                 error = ENOENT;
252                                 break;
253                         }
254
255                         /*
256                          * We support both end-inclusive and
257                          * end-exclusive searches.
258                          */
259                         if (r == 0 &&
260                            (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
261                                 error = ENOENT;
262                                 break;
263                         }
264
265                         switch(elm->leaf.base.btype) {
266                         case HAMMER_BTREE_TYPE_RECORD:
267                                 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
268                                     hammer_btree_chkts(cursor->asof, &elm->base)) {
269                                         ++cursor->index;
270                                         continue;
271                                 }
272                                 break;
273                         default:
274                                 error = EINVAL;
275                                 break;
276                         }
277                         if (error)
278                                 break;
279                 }
280                 /*
281                  * node pointer invalid after loop
282                  */
283
284                 /*
285                  * Return entry
286                  */
287                 if (hammer_debug_btree) {
288                         int i = cursor->index;
289                         hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
290                         kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
291                                 cursor->node, i,
292                                 elm->internal.base.obj_id,
293                                 elm->internal.base.rec_type,
294                                 elm->internal.base.key,
295                                 elm->internal.base.localization
296                         );
297                 }
298                 return(0);
299         }
300         return(error);
301 }
302
303 /*
304  * Iterate in the reverse direction.  This is used by the pruning code to
305  * avoid overlapping records.
306  */
307 int
308 hammer_btree_iterate_reverse(hammer_cursor_t cursor)
309 {
310         hammer_node_ondisk_t node;
311         hammer_btree_elm_t elm;
312         int error;
313         int r;
314         int s;
315
316         /*
317          * Skip past the current record.  For various reasons the cursor
318          * may end up set to -1 or set to point at the end of the current
319          * node.  These cases must be addressed.
320          */
321         node = cursor->node->ondisk;
322         if (node == NULL)
323                 return(ENOENT);
324         if (cursor->index != -1 && 
325             (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
326                 --cursor->index;
327         }
328         if (cursor->index == cursor->node->ondisk->count)
329                 --cursor->index;
330
331         /*
332          * Loop until an element is found or we are done.
333          */
334         for (;;) {
335                 /*
336                  * We iterate up the tree and then index over one element
337                  * while we are at the last element in the current node.
338                  */
339                 if (cursor->index == -1) {
340                         error = hammer_cursor_up(cursor);
341                         if (error) {
342                                 cursor->index = 0; /* sanity */
343                                 break;
344                         }
345                         /* reload stale pointer */
346                         node = cursor->node->ondisk;
347                         KKASSERT(cursor->index != node->count);
348                         --cursor->index;
349                         continue;
350                 }
351
352                 /*
353                  * Check internal or leaf element.  Determine if the record
354                  * at the cursor has gone beyond the end of our range.
355                  *
356                  * We recurse down through internal nodes. 
357                  */
358                 KKASSERT(cursor->index != node->count);
359                 if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
360                         elm = &node->elms[cursor->index];
361                         r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
362                         s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
363                         if (hammer_debug_btree) {
364                                 kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
365                                         cursor->node->node_offset,
366                                         cursor->index,
367                                         elm[0].internal.base.obj_id,
368                                         elm[0].internal.base.rec_type,
369                                         elm[0].internal.base.key,
370                                         elm[0].internal.base.localization,
371                                         r
372                                 );
373                                 kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
374                                         cursor->node->node_offset,
375                                         cursor->index + 1,
376                                         elm[1].internal.base.obj_id,
377                                         elm[1].internal.base.rec_type,
378                                         elm[1].internal.base.key,
379                                         elm[1].internal.base.localization,
380                                         s
381                                 );
382                         }
383
384                         if (s >= 0) {
385                                 error = ENOENT;
386                                 break;
387                         }
388                         KKASSERT(r >= 0);
389
390                         /*
391                          * Better not be zero
392                          */
393                         KKASSERT(elm->internal.subtree_offset != 0);
394
395                         error = hammer_cursor_down(cursor);
396                         if (error)
397                                 break;
398                         KKASSERT(cursor->index == 0);
399                         /* reload stale pointer */
400                         node = cursor->node->ondisk;
401
402                         /* this can assign -1 if the leaf was empty */
403                         cursor->index = node->count - 1;
404                         continue;
405                 } else {
406                         elm = &node->elms[cursor->index];
407                         s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
408                         if (hammer_debug_btree) {
409                                 kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
410                                         cursor->node->node_offset,
411                                         cursor->index,
412                                         (elm[0].leaf.base.btype ?
413                                          elm[0].leaf.base.btype : '?'),
414                                         elm[0].leaf.base.obj_id,
415                                         elm[0].leaf.base.rec_type,
416                                         elm[0].leaf.base.key,
417                                         elm[0].leaf.base.localization,
418                                         s
419                                 );
420                         }
421                         if (s > 0) {
422                                 error = ENOENT;
423                                 break;
424                         }
425
426                         switch(elm->leaf.base.btype) {
427                         case HAMMER_BTREE_TYPE_RECORD:
428                                 if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
429                                     hammer_btree_chkts(cursor->asof, &elm->base)) {
430                                         --cursor->index;
431                                         continue;
432                                 }
433                                 break;
434                         default:
435                                 error = EINVAL;
436                                 break;
437                         }
438                         if (error)
439                                 break;
440                 }
441                 /*
442                  * node pointer invalid after loop
443                  */
444
445                 /*
446                  * Return entry
447                  */
448                 if (hammer_debug_btree) {
449                         int i = cursor->index;
450                         hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
451                         kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
452                                 cursor->node, i,
453                                 elm->internal.base.obj_id,
454                                 elm->internal.base.rec_type,
455                                 elm->internal.base.key,
456                                 elm->internal.base.localization
457                         );
458                 }
459                 return(0);
460         }
461         return(error);
462 }
463
464 /*
465  * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
466  * could not be found, EDEADLK if inserting and a retry is needed, and a
467  * fatal error otherwise.  When retrying, the caller must terminate the
468  * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
469  * 
470  * The cursor is suitably positioned for a deletion on success, and suitably
471  * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
472  * specified.
473  *
474  * The cursor may begin anywhere, the search will traverse the tree in
475  * either direction to locate the requested element.
476  *
477  * Most of the logic implementing historical searches is handled here.  We
478  * do an initial lookup with create_tid set to the asof TID.  Due to the
479  * way records are laid out, a backwards iteration may be required if
480  * ENOENT is returned to locate the historical record.  Here's the
481  * problem:
482  *
483  * create_tid:    10      15       20
484  *                   LEAF1   LEAF2
485  * records:         (11)        (18)
486  *
487  * Lets say we want to do a lookup AS-OF timestamp 17.  We will traverse
488  * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is
489  * not visible and thus causes ENOENT to be returned.  We really need
490  * to check record 11 in LEAF1.  If it also fails then the search fails
491  * (e.g. it might represent the range 11-16 and thus still not match our
492  * AS-OF timestamp of 17).  Note that LEAF1 could be empty, requiring
493  * further iterations.
494  *
495  * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK
496  * and the cursor->create_check TID if an iteration might be needed.
497  * In the above example create_check would be set to 14.
498  */
499 int
500 hammer_btree_lookup(hammer_cursor_t cursor)
501 {
502         int error;
503
504         ++hammer_stats_btree_lookups;
505         if (cursor->flags & HAMMER_CURSOR_ASOF) {
506                 KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
507                 cursor->key_beg.create_tid = cursor->asof;
508                 for (;;) {
509                         cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK;
510                         error = btree_search(cursor, 0);
511                         if (error != ENOENT ||
512                             (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) {
513                                 /*
514                                  * Stop if no error.
515                                  * Stop if error other then ENOENT.
516                                  * Stop if ENOENT and not special case.
517                                  */
518                                 break;
519                         }
520                         if (hammer_debug_btree) {
521                                 kprintf("CREATE_CHECK %016llx\n",
522                                         cursor->create_check);
523                         }
524                         cursor->key_beg.create_tid = cursor->create_check;
525                         /* loop */
526                 }
527         } else {
528                 error = btree_search(cursor, 0);
529         }
530         if (error == 0)
531                 error = hammer_btree_extract(cursor, cursor->flags);
532         return(error);
533 }
534
535 /*
536  * Execute the logic required to start an iteration.  The first record
537  * located within the specified range is returned and iteration control
538  * flags are adjusted for successive hammer_btree_iterate() calls.
539  */
540 int
541 hammer_btree_first(hammer_cursor_t cursor)
542 {
543         int error;
544
545         error = hammer_btree_lookup(cursor);
546         if (error == ENOENT) {
547                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
548                 error = hammer_btree_iterate(cursor);
549         }
550         cursor->flags |= HAMMER_CURSOR_ATEDISK;
551         return(error);
552 }
553
554 /*
555  * Similarly but for an iteration in the reverse direction.
556  *
557  * Set ATEDISK when iterating backwards to skip the current entry,
558  * which after an ENOENT lookup will be pointing beyond our end point.
559  */
560 int
561 hammer_btree_last(hammer_cursor_t cursor)
562 {
563         struct hammer_base_elm save;
564         int error;
565
566         save = cursor->key_beg;
567         cursor->key_beg = cursor->key_end;
568         error = hammer_btree_lookup(cursor);
569         cursor->key_beg = save;
570         if (error == ENOENT ||
571             (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
572                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
573                 error = hammer_btree_iterate_reverse(cursor);
574         }
575         cursor->flags |= HAMMER_CURSOR_ATEDISK;
576         return(error);
577 }
578
579 /*
580  * Extract the record and/or data associated with the cursor's current
581  * position.  Any prior record or data stored in the cursor is replaced.
582  * The cursor must be positioned at a leaf node.
583  *
584  * NOTE: All extractions occur at the leaf of the B-Tree.
585  */
586 int
587 hammer_btree_extract(hammer_cursor_t cursor, int flags)
588 {
589         hammer_mount_t hmp;
590         hammer_node_ondisk_t node;
591         hammer_btree_elm_t elm;
592         hammer_off_t data_off;
593         int32_t data_len;
594         int error;
595
596         /*
597          * The case where the data reference resolves to the same buffer
598          * as the record reference must be handled.
599          */
600         node = cursor->node->ondisk;
601         elm = &node->elms[cursor->index];
602         cursor->data = NULL;
603         hmp = cursor->node->hmp;
604
605         /*
606          * There is nothing to extract for an internal element.
607          */
608         if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
609                 return(EINVAL);
610
611         /*
612          * Only record types have data.
613          */
614         KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
615         cursor->leaf = &elm->leaf;
616
617         if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
618                 return(0);
619         if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
620                 return(0);
621         data_off = elm->leaf.data_offset;
622         data_len = elm->leaf.data_len;
623         if (data_off == 0)
624                 return(0);
625
626         /*
627          * Load the data
628          */
629         KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
630         cursor->data = hammer_bread_ext(hmp, data_off, data_len,
631                                         &error, &cursor->data_buffer);
632         if (data_len && 
633             crc32(cursor->data, data_len) != elm->leaf.data_crc) {
634                 Debugger("CRC FAILED: DATA");
635         }
636         return(error);
637 }
638
639
640 /*
641  * Insert a leaf element into the B-Tree at the current cursor position.
642  * The cursor is positioned such that the element at and beyond the cursor
643  * are shifted to make room for the new record.
644  *
645  * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
646  * flag set and that call must return ENOENT before this function can be
647  * called.
648  *
649  * The caller may depend on the cursor's exclusive lock after return to
650  * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
651  *
652  * ENOSPC is returned if there is no room to insert a new record.
653  */
654 int
655 hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
656 {
657         hammer_node_ondisk_t node;
658         int i;
659         int error;
660
661         if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
662                 return(error);
663         ++hammer_stats_btree_inserts;
664
665         /*
666          * Insert the element at the leaf node and update the count in the
667          * parent.  It is possible for parent to be NULL, indicating that
668          * the filesystem's ROOT B-Tree node is a leaf itself, which is
669          * possible.  The root inode can never be deleted so the leaf should
670          * never be empty.
671          *
672          * Remember that the right-hand boundary is not included in the
673          * count.
674          */
675         hammer_modify_node_all(cursor->trans, cursor->node);
676         node = cursor->node->ondisk;
677         i = cursor->index;
678         KKASSERT(elm->base.btype != 0);
679         KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
680         KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
681         if (i != node->count) {
682                 bcopy(&node->elms[i], &node->elms[i+1],
683                       (node->count - i) * sizeof(*elm));
684         }
685         node->elms[i].leaf = *elm;
686         ++node->count;
687         hammer_modify_node_done(cursor->node);
688
689         /*
690          * Debugging sanity checks.
691          */
692         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
693         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
694         if (i) {
695                 KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
696         }
697         if (i != node->count - 1)
698                 KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);
699
700         return(0);
701 }
702
703 /*
704  * Delete a record from the B-Tree at the current cursor position.
705  * The cursor is positioned such that the current element is the one
706  * to be deleted.
707  *
708  * On return the cursor will be positioned after the deleted element and
709  * MAY point to an internal node.  It will be suitable for the continuation
710  * of an iteration but not for an insertion or deletion.
711  *
712  * Deletions will attempt to partially rebalance the B-Tree in an upward
713  * direction, but will terminate rather then deadlock.  Empty internal nodes
714  * are never allowed by a deletion which deadlocks may end up giving us an
715  * empty leaf.  The pruner will clean up and rebalance the tree.
716  *
717  * This function can return EDEADLK, requiring the caller to retry the
718  * operation after clearing the deadlock.
719  */
720 int
721 hammer_btree_delete(hammer_cursor_t cursor)
722 {
723         hammer_node_ondisk_t ondisk;
724         hammer_node_t node;
725         hammer_node_t parent;
726         int error;
727         int i;
728
729         if ((error = hammer_cursor_upgrade(cursor)) != 0)
730                 return(error);
731         ++hammer_stats_btree_deletes;
732
733         /*
734          * Delete the element from the leaf node. 
735          *
736          * Remember that leaf nodes do not have boundaries.
737          */
738         node = cursor->node;
739         ondisk = node->ondisk;
740         i = cursor->index;
741
742         KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
743         KKASSERT(i >= 0 && i < ondisk->count);
744         hammer_modify_node_all(cursor->trans, node);
745         if (i + 1 != ondisk->count) {
746                 bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
747                       (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
748         }
749         --ondisk->count;
750         hammer_modify_node_done(node);
751
752         /*
753          * Validate local parent
754          */
755         if (ondisk->parent) {
756                 parent = cursor->parent;
757
758                 KKASSERT(parent != NULL);
759                 KKASSERT(parent->node_offset == ondisk->parent);
760         }
761
762         /*
763          * If the leaf becomes empty it must be detached from the parent,
764          * potentially recursing through to the filesystem root.
765          *
766          * This may reposition the cursor at one of the parent's of the
767          * current node.
768          *
769          * Ignore deadlock errors, that simply means that btree_remove
770          * was unable to recurse and had to leave us with an empty leaf. 
771          */
772         KKASSERT(cursor->index <= ondisk->count);
773         if (ondisk->count == 0) {
774                 error = btree_remove(cursor);
775                 if (error == EDEADLK)
776                         error = 0;
777         } else {
778                 error = 0;
779         }
780         KKASSERT(cursor->parent == NULL ||
781                  cursor->parent_index < cursor->parent->ondisk->count);
782         return(error);
783 }
784
785 /*
786  * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE
787  *
788  * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
789  *
790  * The search can begin ANYWHERE in the B-Tree.  As a first step the search
791  * iterates up the tree as necessary to properly position itself prior to
792  * actually doing the sarch.
793  * 
794  * INSERTIONS: The search will split full nodes and leaves on its way down
795  * and guarentee that the leaf it ends up on is not full.  If we run out
796  * of space the search continues to the leaf (to position the cursor for
797  * the spike), but ENOSPC is returned.
798  *
799  * The search is only guarenteed to end up on a leaf if an error code of 0
800  * is returned, or if inserting and an error code of ENOENT is returned.
801  * Otherwise it can stop at an internal node.  On success a search returns
802  * a leaf node.
803  *
804  * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
805  * filesystem, and it is not simple code.  Please note the following facts:
806  *
807  * - Internal node recursions have a boundary on the left AND right.  The
808  *   right boundary is non-inclusive.  The create_tid is a generic part
809  *   of the key for internal nodes.
810  *
811  * - Leaf nodes contain terminal elements only now.
812  *
813  * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
814  *   historical search.  ASOF and INSERT are mutually exclusive.  When
815  *   doing an as-of lookup btree_search() checks for a right-edge boundary
816  *   case.  If while recursing down the left-edge differs from the key
817  *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
818  *   with cursor->create_check.  This is used by btree_lookup() to iterate.
819  *   The iteration backwards because as-of searches can wind up going
820  *   down the wrong branch of the B-Tree.
821  */
822 static 
823 int
824 btree_search(hammer_cursor_t cursor, int flags)
825 {
826         hammer_node_ondisk_t node;
827         hammer_btree_elm_t elm;
828         int error;
829         int enospc = 0;
830         int i;
831         int r;
832         int s;
833
834         flags |= cursor->flags;
835         ++hammer_stats_btree_searches;
836
837         if (hammer_debug_btree) {
838                 kprintf("SEARCH   %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n",
839                         cursor->node->node_offset, 
840                         cursor->index,
841                         cursor->key_beg.obj_id,
842                         cursor->key_beg.rec_type,
843                         cursor->key_beg.key,
844                         cursor->key_beg.create_tid, 
845                         cursor->key_beg.localization, 
846                         curthread
847                 );
848                 if (cursor->parent)
849                     kprintf("SEARCHP %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n",
850                         cursor->parent->node_offset, cursor->parent_index,
851                         cursor->left_bound->obj_id,
852                         cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id,
853                         cursor->right_bound->obj_id,
854                         cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id,
855                         cursor->left_bound,
856                         &cursor->parent->ondisk->elms[cursor->parent_index],
857                         cursor->right_bound,
858                         &cursor->parent->ondisk->elms[cursor->parent_index+1]
859                     );
860         }
861
862         /*
863          * Move our cursor up the tree until we find a node whos range covers
864          * the key we are trying to locate.
865          *
866          * The left bound is inclusive, the right bound is non-inclusive.
867          * It is ok to cursor up too far.
868          */
869         for (;;) {
870                 r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
871                 s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
872                 if (r >= 0 && s < 0)
873                         break;
874                 KKASSERT(cursor->parent);
875                 ++hammer_stats_btree_iterations;
876                 error = hammer_cursor_up(cursor);
877                 if (error)
878                         goto done;
879         }
880
881         /*
882          * The delete-checks below are based on node, not parent.  Set the
883          * initial delete-check based on the parent.
884          */
885         if (r == 1) {
886                 KKASSERT(cursor->left_bound->create_tid != 1);
887                 cursor->create_check = cursor->left_bound->create_tid - 1;
888                 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
889         }
890
891         /*
892          * We better have ended up with a node somewhere.
893          */
894         KKASSERT(cursor->node != NULL);
895
896         /*
897          * If we are inserting we can't start at a full node if the parent
898          * is also full (because there is no way to split the node),
899          * continue running up the tree until the requirement is satisfied
900          * or we hit the root of the filesystem.
901          *
902          * (If inserting we aren't doing an as-of search so we don't have
903          *  to worry about create_check).
904          */
905         while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
906                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
907                         if (btree_node_is_full(cursor->node->ondisk) == 0)
908                                 break;
909                 } else {
910                         if (btree_node_is_full(cursor->node->ondisk) ==0)
911                                 break;
912                 }
913                 if (cursor->node->ondisk->parent == 0 ||
914                     cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
915                         break;
916                 }
917                 ++hammer_stats_btree_iterations;
918                 error = hammer_cursor_up(cursor);
919                 /* node may have become stale */
920                 if (error)
921                         goto done;
922         }
923
924         /*
925          * Push down through internal nodes to locate the requested key.
926          */
927         node = cursor->node->ondisk;
928         while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
929                 /*
930                  * Scan the node to find the subtree index to push down into.
931                  * We go one-past, then back-up.
932                  *
933                  * We must proactively remove deleted elements which may
934                  * have been left over from a deadlocked btree_remove().
935                  *
936                  * The left and right boundaries are included in the loop
937                  * in order to detect edge cases.
938                  *
939                  * If the separator only differs by create_tid (r == 1)
940                  * and we are doing an as-of search, we may end up going
941                  * down a branch to the left of the one containing the
942                  * desired key.  This requires numerous special cases.
943                  */
944                 ++hammer_stats_btree_iterations;
945                 if (hammer_debug_btree) {
946                         kprintf("SEARCH-I %016llx count=%d\n",
947                                 cursor->node->node_offset,
948                                 node->count);
949                 }
950
951                 /*
952                  * Try to shortcut the search before dropping into the
953                  * linear loop.  Locate the first node where r <= 1.
954                  */
955                 i = hammer_btree_search_node(&cursor->key_beg, node);
956                 while (i <= node->count) {
957                         ++hammer_stats_btree_elements;
958                         elm = &node->elms[i];
959                         r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
960                         if (hammer_debug_btree > 2) {
961                                 kprintf(" IELM %p %d r=%d\n",
962                                         &node->elms[i], i, r);
963                         }
964                         if (r < 0)
965                                 break;
966                         if (r == 1) {
967                                 KKASSERT(elm->base.create_tid != 1);
968                                 cursor->create_check = elm->base.create_tid - 1;
969                                 cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
970                         }
971                         ++i;
972                 }
973                 if (hammer_debug_btree) {
974                         kprintf("SEARCH-I preI=%d/%d r=%d\n",
975                                 i, node->count, r);
976                 }
977
978                 /*
979                  * These cases occur when the parent's idea of the boundary
980                  * is wider then the child's idea of the boundary, and
981                  * require special handling.  If not inserting we can
982                  * terminate the search early for these cases but the
983                  * child's boundaries cannot be unconditionally modified.
984                  */
985                 if (i == 0) {
986                         /*
987                          * If i == 0 the search terminated to the LEFT of the
988                          * left_boundary but to the RIGHT of the parent's left
989                          * boundary.
990                          */
991                         u_int8_t save;
992
993                         elm = &node->elms[0];
994
995                         /*
996                          * If we aren't inserting we can stop here.
997                          */
998                         if ((flags & (HAMMER_CURSOR_INSERT |
999                                       HAMMER_CURSOR_PRUNING)) == 0) {
1000                                 cursor->index = 0;
1001                                 return(ENOENT);
1002                         }
1003
1004                         /*
1005                          * Correct a left-hand boundary mismatch.
1006                          *
1007                          * We can only do this if we can upgrade the lock,
1008                          * and synchronized as a background cursor (i.e.
1009                          * inserting or pruning).
1010                          *
1011                          * WARNING: We can only do this if inserting, i.e.
1012                          * we are running on the backend.
1013                          */
1014                         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1015                                 return(error);
1016                         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1017                         hammer_modify_node_field(cursor->trans, cursor->node,
1018                                                  elms[0]);
1019                         save = node->elms[0].base.btype;
1020                         node->elms[0].base = *cursor->left_bound;
1021                         node->elms[0].base.btype = save;
1022                         hammer_modify_node_done(cursor->node);
1023                 } else if (i == node->count + 1) {
1024                         /*
1025                          * If i == node->count + 1 the search terminated to
1026                          * the RIGHT of the right boundary but to the LEFT
1027                          * of the parent's right boundary.  If we aren't
1028                          * inserting we can stop here.
1029                          *
1030                          * Note that the last element in this case is
1031                          * elms[i-2] prior to adjustments to 'i'.
1032                          */
1033                         --i;
1034                         if ((flags & (HAMMER_CURSOR_INSERT |
1035                                       HAMMER_CURSOR_PRUNING)) == 0) {
1036                                 cursor->index = i;
1037                                 return (ENOENT);
1038                         }
1039
1040                         /*
1041                          * Correct a right-hand boundary mismatch.
1042                          * (actual push-down record is i-2 prior to
1043                          * adjustments to i).
1044                          *
1045                          * We can only do this if we can upgrade the lock,
1046                          * and synchronized as a background cursor (i.e.
1047                          * inserting or pruning).
1048                          *
1049                          * WARNING: We can only do this if inserting, i.e.
1050                          * we are running on the backend.
1051                          */
1052                         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1053                                 return(error);
1054                         elm = &node->elms[i];
1055                         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1056                         hammer_modify_node(cursor->trans, cursor->node,
1057                                            &elm->base, sizeof(elm->base));
1058                         elm->base = *cursor->right_bound;
1059                         hammer_modify_node_done(cursor->node);
1060                         --i;
1061                 } else {
1062                         /*
1063                          * The push-down index is now i - 1.  If we had
1064                          * terminated on the right boundary this will point
1065                          * us at the last element.
1066                          */
1067                         --i;
1068                 }
1069                 cursor->index = i;
1070                 elm = &node->elms[i];
1071
1072                 if (hammer_debug_btree) {
1073                         kprintf("RESULT-I %016llx[%d] %016llx %02x "
1074                                 "key=%016llx cre=%016llx lo=%02x\n",
1075                                 cursor->node->node_offset,
1076                                 i,
1077                                 elm->internal.base.obj_id,
1078                                 elm->internal.base.rec_type,
1079                                 elm->internal.base.key,
1080                                 elm->internal.base.create_tid,
1081                                 elm->internal.base.localization
1082                         );
1083                 }
1084
1085                 /*
1086                  * We better have a valid subtree offset.
1087                  */
1088                 KKASSERT(elm->internal.subtree_offset != 0);
1089
1090                 /*
1091                  * Handle insertion and deletion requirements.
1092                  *
1093                  * If inserting split full nodes.  The split code will
1094                  * adjust cursor->node and cursor->index if the current
1095                  * index winds up in the new node.
1096                  *
1097                  * If inserting and a left or right edge case was detected,
1098                  * we cannot correct the left or right boundary and must
1099                  * prepend and append an empty leaf node in order to make
1100                  * the boundary correction.
1101                  *
1102                  * If we run out of space we set enospc and continue on
1103                  * to a leaf to provide the spike code with a good point
1104                  * of entry.
1105                  */
1106                 if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
1107                         if (btree_node_is_full(node)) {
1108                                 error = btree_split_internal(cursor);
1109                                 if (error) {
1110                                         if (error != ENOSPC)
1111                                                 goto done;
1112                                         enospc = 1;
1113                                 }
1114                                 /*
1115                                  * reload stale pointers
1116                                  */
1117                                 i = cursor->index;
1118                                 node = cursor->node->ondisk;
1119                         }
1120                 }
1121
1122                 /*
1123                  * Push down (push into new node, existing node becomes
1124                  * the parent) and continue the search.
1125                  */
1126                 error = hammer_cursor_down(cursor);
1127                 /* node may have become stale */
1128                 if (error)
1129                         goto done;
1130                 node = cursor->node->ondisk;
1131         }
1132
1133         /*
1134          * We are at a leaf, do a linear search of the key array.
1135          *
1136          * On success the index is set to the matching element and 0
1137          * is returned.
1138          *
1139          * On failure the index is set to the insertion point and ENOENT
1140          * is returned.
1141          *
1142          * Boundaries are not stored in leaf nodes, so the index can wind
1143          * up to the left of element 0 (index == 0) or past the end of
1144          * the array (index == node->count).  It is also possible that the
1145          * leaf might be empty.
1146          */
1147         ++hammer_stats_btree_iterations;
1148         KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF);
1149         KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
1150         if (hammer_debug_btree) {
1151                 kprintf("SEARCH-L %016llx count=%d\n",
1152                         cursor->node->node_offset,
1153                         node->count);
1154         }
1155
1156         /*
1157          * Try to shortcut the search before dropping into the
1158          * linear loop.  Locate the first node where r <= 1.
1159          */
1160         i = hammer_btree_search_node(&cursor->key_beg, node);
1161         while (i < node->count) {
1162                 ++hammer_stats_btree_elements;
1163                 elm = &node->elms[i];
1164
1165                 r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);
1166
1167                 if (hammer_debug_btree > 1)
1168                         kprintf("  ELM %p %d r=%d\n", &node->elms[i], i, r);
1169
1170                 /*
1171                  * We are at a record element.  Stop if we've flipped past
1172                  * key_beg, not counting the create_tid test.  Allow the
1173                  * r == 1 case (key_beg > element but differs only by its
1174                  * create_tid) to fall through to the AS-OF check.
1175                  */
1176                 KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
1177
1178                 if (r < 0)
1179                         goto failed;
1180                 if (r > 1) {
1181                         ++i;
1182                         continue;
1183                 }
1184
1185                 /*
1186                  * Check our as-of timestamp against the element.
1187                  */
1188                 if (flags & HAMMER_CURSOR_ASOF) {
1189                         if (hammer_btree_chkts(cursor->asof,
1190                                                &node->elms[i].base) != 0) {
1191                                 ++i;
1192                                 continue;
1193                         }
1194                         /* success */
1195                 } else {
1196                         if (r > 0) {    /* can only be +1 */
1197                                 ++i;
1198                                 continue;
1199                         }
1200                         /* success */
1201                 }
1202                 cursor->index = i;
1203                 error = 0;
1204                 if (hammer_debug_btree) {
1205                         kprintf("RESULT-L %016llx[%d] (SUCCESS)\n",
1206                                 cursor->node->node_offset, i);
1207                 }
1208                 goto done;
1209         }
1210
1211         /*
1212          * The search of the leaf node failed.  i is the insertion point.
1213          */
1214 failed:
1215         if (hammer_debug_btree) {
1216                 kprintf("RESULT-L %016llx[%d] (FAILED)\n",
1217                         cursor->node->node_offset, i);
1218         }
1219
1220         /*
1221          * No exact match was found, i is now at the insertion point.
1222          *
1223          * If inserting split a full leaf before returning.  This
1224          * may have the side effect of adjusting cursor->node and
1225          * cursor->index.
1226          */
1227         cursor->index = i;
1228         if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
1229              btree_node_is_full(node)) {
1230                 error = btree_split_leaf(cursor);
1231                 if (error) {
1232                         if (error != ENOSPC)
1233                                 goto done;
1234                         enospc = 1;
1235                 }
1236                 /*
1237                  * reload stale pointers
1238                  */
1239                 /* NOT USED
1240                 i = cursor->index;
1241                 node = &cursor->node->internal;
1242                 */
1243         }
1244
1245         /*
1246          * We reached a leaf but did not find the key we were looking for.
1247          * If this is an insert we will be properly positioned for an insert
1248          * (ENOENT) or spike (ENOSPC) operation.
1249          */
1250         error = enospc ? ENOSPC : ENOENT;
1251 done:
1252         return(error);
1253 }
1254
1255 /*
1256  * Heuristical search for the first element whos comparison is <= 1.  May
1257  * return an index whos compare result is > 1 but may only return an index
1258  * whos compare result is <= 1 if it is the first element with that result.
1259  */
1260 int
1261 hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
1262 {
1263         int b;
1264         int s;
1265         int i;
1266         int r;
1267
1268         /*
1269          * Don't bother if the node does not have very many elements
1270          */
1271         b = 0;
1272         s = node->count;
1273         while (s - b > 4) {
1274                 i = b + (s - b) / 2;
1275                 ++hammer_stats_btree_elements;
1276                 r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
1277                 if (r <= 1) {
1278                         s = i;
1279                 } else {
1280                         b = i;
1281                 }
1282         }
1283         return(b);
1284 }
1285
1286
1287 /************************************************************************
1288  *                         SPLITTING AND MERGING                        *
1289  ************************************************************************
1290  *
1291  * These routines do all the dirty work required to split and merge nodes.
1292  */
1293
1294 /*
1295  * Split an internal node into two nodes and move the separator at the split
1296  * point to the parent.
1297  *
1298  * (cursor->node, cursor->index) indicates the element the caller intends
1299  * to push into.  We will adjust node and index if that element winds
1300  * up in the split node.
1301  *
1302  * If we are at the root of the filesystem a new root must be created with
1303  * two elements, one pointing to the original root and one pointing to the
1304  * newly allocated split node.
1305  */
1306 static
1307 int
1308 btree_split_internal(hammer_cursor_t cursor)
1309 {
1310         hammer_node_ondisk_t ondisk;
1311         hammer_node_t node;
1312         hammer_node_t parent;
1313         hammer_node_t new_node;
1314         hammer_btree_elm_t elm;
1315         hammer_btree_elm_t parent_elm;
1316         hammer_node_locklist_t locklist = NULL;
1317         hammer_mount_t hmp = cursor->trans->hmp;
1318         int parent_index;
1319         int made_root;
1320         int split;
1321         int error;
1322         int i;
1323         const int esize = sizeof(*elm);
1324
1325         error = hammer_btree_lock_children(cursor, &locklist);
1326         if (error)
1327                 goto done;
1328         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1329                 goto done;
1330         ++hammer_stats_btree_splits;
1331
1332         /* 
1333          * We are splitting but elms[split] will be promoted to the parent,
1334          * leaving the right hand node with one less element.  If the
1335          * insertion point will be on the left-hand side adjust the split
1336          * point to give the right hand side one additional node.
1337          */
1338         node = cursor->node;
1339         ondisk = node->ondisk;
1340         split = (ondisk->count + 1) / 2;
1341         if (cursor->index <= split)
1342                 --split;
1343
1344         /*
1345          * If we are at the root of the filesystem, create a new root node
1346          * with 1 element and split normally.  Avoid making major
1347          * modifications until we know the whole operation will work.
1348          */
1349         if (ondisk->parent == 0) {
1350                 parent = hammer_alloc_btree(cursor->trans, &error);
1351                 if (parent == NULL)
1352                         goto done;
1353                 hammer_lock_ex(&parent->lock);
1354                 hammer_modify_node_noundo(cursor->trans, parent);
1355                 ondisk = parent->ondisk;
1356                 ondisk->count = 1;
1357                 ondisk->parent = 0;
1358                 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1359                 ondisk->elms[0].base = hmp->root_btree_beg;
1360                 ondisk->elms[0].base.btype = node->ondisk->type;
1361                 ondisk->elms[0].internal.subtree_offset = node->node_offset;
1362                 ondisk->elms[1].base = hmp->root_btree_end;
1363                 hammer_modify_node_done(parent);
1364                 /* ondisk->elms[1].base.btype - not used */
1365                 made_root = 1;
1366                 parent_index = 0;       /* index of current node in parent */
1367         } else {
1368                 made_root = 0;
1369                 parent = cursor->parent;
1370                 parent_index = cursor->parent_index;
1371         }
1372
1373         /*
1374          * Split node into new_node at the split point.
1375          *
1376          *  B O O O P N N B     <-- P = node->elms[split]
1377          *   0 1 2 3 4 5 6      <-- subtree indices
1378          *
1379          *       x x P x x
1380          *        s S S s  
1381          *         /   \
1382          *  B O O O B    B N N B        <--- inner boundary points are 'P'
1383          *   0 1 2 3      4 5 6  
1384          *
1385          */
1386         new_node = hammer_alloc_btree(cursor->trans, &error);
1387         if (new_node == NULL) {
1388                 if (made_root) {
1389                         hammer_unlock(&parent->lock);
1390                         hammer_delete_node(cursor->trans, parent);
1391                         hammer_rel_node(parent);
1392                 }
1393                 goto done;
1394         }
1395         hammer_lock_ex(&new_node->lock);
1396
1397         /*
1398          * Create the new node.  P becomes the left-hand boundary in the
1399          * new node.  Copy the right-hand boundary as well.
1400          *
1401          * elm is the new separator.
1402          */
1403         hammer_modify_node_noundo(cursor->trans, new_node);
1404         hammer_modify_node_all(cursor->trans, node);
1405         ondisk = node->ondisk;
1406         elm = &ondisk->elms[split];
1407         bcopy(elm, &new_node->ondisk->elms[0],
1408               (ondisk->count - split + 1) * esize);
1409         new_node->ondisk->count = ondisk->count - split;
1410         new_node->ondisk->parent = parent->node_offset;
1411         new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1412         KKASSERT(ondisk->type == new_node->ondisk->type);
1413
1414         /*
1415          * Cleanup the original node.  Elm (P) becomes the new boundary,
1416          * its subtree_offset was moved to the new node.  If we had created
1417          * a new root its parent pointer may have changed.
1418          */
1419         elm->internal.subtree_offset = 0;
1420         ondisk->count = split;
1421
1422         /*
1423          * Insert the separator into the parent, fixup the parent's
1424          * reference to the original node, and reference the new node.
1425          * The separator is P.
1426          *
1427          * Remember that base.count does not include the right-hand boundary.
1428          */
1429         hammer_modify_node_all(cursor->trans, parent);
1430         ondisk = parent->ondisk;
1431         KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1432         parent_elm = &ondisk->elms[parent_index+1];
1433         bcopy(parent_elm, parent_elm + 1,
1434               (ondisk->count - parent_index) * esize);
1435         parent_elm->internal.base = elm->base;  /* separator P */
1436         parent_elm->internal.base.btype = new_node->ondisk->type;
1437         parent_elm->internal.subtree_offset = new_node->node_offset;
1438         ++ondisk->count;
1439         hammer_modify_node_done(parent);
1440
1441         /*
1442          * The children of new_node need their parent pointer set to new_node.
1443          * The children have already been locked by
1444          * hammer_btree_lock_children().
1445          */
1446         for (i = 0; i < new_node->ondisk->count; ++i) {
1447                 elm = &new_node->ondisk->elms[i];
1448                 error = btree_set_parent(cursor->trans, new_node, elm);
1449                 if (error) {
1450                         panic("btree_split_internal: btree-fixup problem");
1451                 }
1452         }
1453         hammer_modify_node_done(new_node);
1454
1455         /*
1456          * The filesystem's root B-Tree pointer may have to be updated.
1457          */
1458         if (made_root) {
1459                 hammer_volume_t volume;
1460
1461                 volume = hammer_get_root_volume(hmp, &error);
1462                 KKASSERT(error == 0);
1463
1464                 hammer_modify_volume_field(cursor->trans, volume,
1465                                            vol0_btree_root);
1466                 volume->ondisk->vol0_btree_root = parent->node_offset;
1467                 hammer_modify_volume_done(volume);
1468                 node->ondisk->parent = parent->node_offset;
1469                 if (cursor->parent) {
1470                         hammer_unlock(&cursor->parent->lock);
1471                         hammer_rel_node(cursor->parent);
1472                 }
1473                 cursor->parent = parent;        /* lock'd and ref'd */
1474                 hammer_rel_volume(volume, 0);
1475         }
1476         hammer_modify_node_done(node);
1477
1478
1479         /*
1480          * Ok, now adjust the cursor depending on which element the original
1481          * index was pointing at.  If we are >= the split point the push node
1482          * is now in the new node.
1483          *
1484          * NOTE: If we are at the split point itself we cannot stay with the
1485          * original node because the push index will point at the right-hand
1486          * boundary, which is illegal.
1487          *
1488          * NOTE: The cursor's parent or parent_index must be adjusted for
1489          * the case where a new parent (new root) was created, and the case
1490          * where the cursor is now pointing at the split node.
1491          */
1492         if (cursor->index >= split) {
1493                 cursor->parent_index = parent_index + 1;
1494                 cursor->index -= split;
1495                 hammer_unlock(&cursor->node->lock);
1496                 hammer_rel_node(cursor->node);
1497                 cursor->node = new_node;        /* locked and ref'd */
1498         } else {
1499                 cursor->parent_index = parent_index;
1500                 hammer_unlock(&new_node->lock);
1501                 hammer_rel_node(new_node);
1502         }
1503
1504         /*
1505          * Fixup left and right bounds
1506          */
1507         parent_elm = &parent->ondisk->elms[cursor->parent_index];
1508         cursor->left_bound = &parent_elm[0].internal.base;
1509         cursor->right_bound = &parent_elm[1].internal.base;
1510         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1511                  &cursor->node->ondisk->elms[0].internal.base) <= 0);
1512         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1513                  &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);
1514
1515 done:
1516         hammer_btree_unlock_children(&locklist);
1517         hammer_cursor_downgrade(cursor);
1518         return (error);
1519 }
1520
1521 /*
1522  * Same as the above, but splits a full leaf node.
1523  *
1524  * This function
1525  */
1526 static
1527 int
1528 btree_split_leaf(hammer_cursor_t cursor)
1529 {
1530         hammer_node_ondisk_t ondisk;
1531         hammer_node_t parent;
1532         hammer_node_t leaf;
1533         hammer_mount_t hmp;
1534         hammer_node_t new_leaf;
1535         hammer_btree_elm_t elm;
1536         hammer_btree_elm_t parent_elm;
1537         hammer_base_elm_t mid_boundary;
1538         int parent_index;
1539         int made_root;
1540         int split;
1541         int error;
1542         const size_t esize = sizeof(*elm);
1543
1544         if ((error = hammer_cursor_upgrade(cursor)) != 0)
1545                 return(error);
1546         ++hammer_stats_btree_splits;
1547
1548         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1549                  &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1550         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1551                  &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1552
1553         /* 
1554          * Calculate the split point.  If the insertion point will be on
1555          * the left-hand side adjust the split point to give the right
1556          * hand side one additional node.
1557          *
1558          * Spikes are made up of two leaf elements which cannot be
1559          * safely split.
1560          */
1561         leaf = cursor->node;
1562         ondisk = leaf->ondisk;
1563         split = (ondisk->count + 1) / 2;
1564         if (cursor->index <= split)
1565                 --split;
1566         error = 0;
1567         hmp = leaf->hmp;
1568
1569         elm = &ondisk->elms[split];
1570
1571         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
1572         KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
1573         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
1574         KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);
1575
1576         /*
1577          * If we are at the root of the tree, create a new root node with
1578          * 1 element and split normally.  Avoid making major modifications
1579          * until we know the whole operation will work.
1580          */
1581         if (ondisk->parent == 0) {
1582                 parent = hammer_alloc_btree(cursor->trans, &error);
1583                 if (parent == NULL)
1584                         goto done;
1585                 hammer_lock_ex(&parent->lock);
1586                 hammer_modify_node_noundo(cursor->trans, parent);
1587                 ondisk = parent->ondisk;
1588                 ondisk->count = 1;
1589                 ondisk->parent = 0;
1590                 ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
1591                 ondisk->elms[0].base = hmp->root_btree_beg;
1592                 ondisk->elms[0].base.btype = leaf->ondisk->type;
1593                 ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
1594                 ondisk->elms[1].base = hmp->root_btree_end;
1595                 /* ondisk->elms[1].base.btype = not used */
1596                 hammer_modify_node_done(parent);
1597                 made_root = 1;
1598                 parent_index = 0;       /* insertion point in parent */
1599         } else {
1600                 made_root = 0;
1601                 parent = cursor->parent;
1602                 parent_index = cursor->parent_index;
1603         }
1604
1605         /*
1606          * Split leaf into new_leaf at the split point.  Select a separator
1607          * value in-between the two leafs but with a bent towards the right
1608          * leaf since comparisons use an 'elm >= separator' inequality.
1609          *
1610          *  L L L L L L L L
1611          *
1612          *       x x P x x
1613          *        s S S s  
1614          *         /   \
1615          *  L L L L     L L L L
1616          */
1617         new_leaf = hammer_alloc_btree(cursor->trans, &error);
1618         if (new_leaf == NULL) {
1619                 if (made_root) {
1620                         hammer_unlock(&parent->lock);
1621                         hammer_delete_node(cursor->trans, parent);
1622                         hammer_rel_node(parent);
1623                 }
1624                 goto done;
1625         }
1626         hammer_lock_ex(&new_leaf->lock);
1627
1628         /*
1629          * Create the new node and copy the leaf elements from the split 
1630          * point on to the new node.
1631          */
1632         hammer_modify_node_all(cursor->trans, leaf);
1633         hammer_modify_node_noundo(cursor->trans, new_leaf);
1634         ondisk = leaf->ondisk;
1635         elm = &ondisk->elms[split];
1636         bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
1637         new_leaf->ondisk->count = ondisk->count - split;
1638         new_leaf->ondisk->parent = parent->node_offset;
1639         new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1640         KKASSERT(ondisk->type == new_leaf->ondisk->type);
1641         hammer_modify_node_done(new_leaf);
1642
1643         /*
1644          * Cleanup the original node.  Because this is a leaf node and
1645          * leaf nodes do not have a right-hand boundary, there
1646          * aren't any special edge cases to clean up.  We just fixup the
1647          * count.
1648          */
1649         ondisk->count = split;
1650
1651         /*
1652          * Insert the separator into the parent, fixup the parent's
1653          * reference to the original node, and reference the new node.
1654          * The separator is P.
1655          *
1656          * Remember that base.count does not include the right-hand boundary.
1657          * We are copying parent_index+1 to parent_index+2, not +0 to +1.
1658          */
1659         hammer_modify_node_all(cursor->trans, parent);
1660         ondisk = parent->ondisk;
1661         KKASSERT(split != 0);
1662         KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
1663         parent_elm = &ondisk->elms[parent_index+1];
1664         bcopy(parent_elm, parent_elm + 1,
1665               (ondisk->count - parent_index) * esize);
1666
1667         hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
1668         parent_elm->internal.base.btype = new_leaf->ondisk->type;
1669         parent_elm->internal.subtree_offset = new_leaf->node_offset;
1670         mid_boundary = &parent_elm->base;
1671         ++ondisk->count;
1672         hammer_modify_node_done(parent);
1673
1674         /*
1675          * The filesystem's root B-Tree pointer may have to be updated.
1676          */
1677         if (made_root) {
1678                 hammer_volume_t volume;
1679
1680                 volume = hammer_get_root_volume(hmp, &error);
1681                 KKASSERT(error == 0);
1682
1683                 hammer_modify_volume_field(cursor->trans, volume,
1684                                            vol0_btree_root);
1685                 volume->ondisk->vol0_btree_root = parent->node_offset;
1686                 hammer_modify_volume_done(volume);
1687                 leaf->ondisk->parent = parent->node_offset;
1688                 if (cursor->parent) {
1689                         hammer_unlock(&cursor->parent->lock);
1690                         hammer_rel_node(cursor->parent);
1691                 }
1692                 cursor->parent = parent;        /* lock'd and ref'd */
1693                 hammer_rel_volume(volume, 0);
1694         }
1695         hammer_modify_node_done(leaf);
1696
1697         /*
1698          * Ok, now adjust the cursor depending on which element the original
1699          * index was pointing at.  If we are >= the split point the push node
1700          * is now in the new node.
1701          *
1702          * NOTE: If we are at the split point itself we need to select the
1703          * old or new node based on where key_beg's insertion point will be.
1704          * If we pick the wrong side the inserted element will wind up in
1705          * the wrong leaf node and outside that node's bounds.
1706          */
1707         if (cursor->index > split ||
1708             (cursor->index == split &&
1709              hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) {
1710                 cursor->parent_index = parent_index + 1;
1711                 cursor->index -= split;
1712                 hammer_unlock(&cursor->node->lock);
1713                 hammer_rel_node(cursor->node);
1714                 cursor->node = new_leaf;
1715         } else {
1716                 cursor->parent_index = parent_index;
1717                 hammer_unlock(&new_leaf->lock);
1718                 hammer_rel_node(new_leaf);
1719         }
1720
1721         /*
1722          * Fixup left and right bounds
1723          */
1724         parent_elm = &parent->ondisk->elms[cursor->parent_index];
1725         cursor->left_bound = &parent_elm[0].internal.base;
1726         cursor->right_bound = &parent_elm[1].internal.base;
1727
1728         /*
1729          * Assert that the bounds are correct.
1730          */
1731         KKASSERT(hammer_btree_cmp(cursor->left_bound,
1732                  &cursor->node->ondisk->elms[0].leaf.base) <= 0);
1733         KKASSERT(hammer_btree_cmp(cursor->right_bound,
1734                  &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);
1735         KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0);
1736         KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0);
1737
1738 done:
1739         hammer_cursor_downgrade(cursor);
1740         return (error);
1741 }
1742
1743 /*
1744  * Recursively correct the right-hand boundary's create_tid to (tid) as
1745  * long as the rest of the key matches.  We have to recurse upward in
1746  * the tree as well as down the left side of each parent's right node.
1747  *
1748  * Return EDEADLK if we were only partially successful, forcing the caller
1749  * to try again.  The original cursor is not modified.  This routine can
1750  * also fail with EDEADLK if it is forced to throw away a portion of its
1751  * record history.
1752  *
1753  * The caller must pass a downgraded cursor to us (otherwise we can't dup it).
1754  */
1755 struct hammer_rhb {
1756         TAILQ_ENTRY(hammer_rhb) entry;
1757         hammer_node_t   node;
1758         int             index;
1759 };
1760
1761 TAILQ_HEAD(hammer_rhb_list, hammer_rhb);
1762
1763 int
1764 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid)
1765 {
1766         struct hammer_rhb_list rhb_list;
1767         hammer_base_elm_t elm;
1768         hammer_node_t orig_node;
1769         struct hammer_rhb *rhb;
1770         int orig_index;
1771         int error;
1772
1773         TAILQ_INIT(&rhb_list);
1774
1775         /*
1776          * Save our position so we can restore it on return.  This also
1777          * gives us a stable 'elm'.
1778          */
1779         orig_node = cursor->node;
1780         hammer_ref_node(orig_node);
1781         hammer_lock_sh(&orig_node->lock);
1782         orig_index = cursor->index;
1783         elm = &orig_node->ondisk->elms[orig_index].base;
1784
1785         /*
1786          * Now build a list of parents going up, allocating a rhb
1787          * structure for each one.
1788          */
1789         while (cursor->parent) {
1790                 /*
1791                  * Stop if we no longer have any right-bounds to fix up
1792                  */
1793                 if (elm->obj_id != cursor->right_bound->obj_id ||
1794                     elm->rec_type != cursor->right_bound->rec_type ||
1795                     elm->key != cursor->right_bound->key) {
1796                         break;
1797                 }
1798
1799                 /*
1800                  * Stop if the right-hand bound's create_tid does not
1801                  * need to be corrected.
1802                  */
1803                 if (cursor->right_bound->create_tid >= tid)
1804                         break;
1805
1806                 rhb = kmalloc(sizeof(*rhb), M_HAMMER, M_WAITOK|M_ZERO);
1807                 rhb->node = cursor->parent;
1808                 rhb->index = cursor->parent_index;
1809                 hammer_ref_node(rhb->node);
1810                 hammer_lock_sh(&rhb->node->lock);
1811                 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
1812
1813                 hammer_cursor_up(cursor);
1814         }
1815
1816         /*
1817          * now safely adjust the right hand bound for each rhb.  This may
1818          * also require taking the right side of the tree and iterating down
1819          * ITS left side.
1820          */
1821         error = 0;
1822         while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
1823                 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
1824                 if (error)
1825                         break;
1826                 TAILQ_REMOVE(&rhb_list, rhb, entry);
1827                 hammer_unlock(&rhb->node->lock);
1828                 hammer_rel_node(rhb->node);
1829                 kfree(rhb, M_HAMMER);
1830
1831                 switch (cursor->node->ondisk->type) {
1832                 case HAMMER_BTREE_TYPE_INTERNAL:
1833                         /*
1834                          * Right-boundary for parent at internal node
1835                          * is one element to the right of the element whos
1836                          * right boundary needs adjusting.  We must then
1837                          * traverse down the left side correcting any left
1838                          * bounds (which may now be too far to the left).
1839                          */
1840                         ++cursor->index;
1841                         error = hammer_btree_correct_lhb(cursor, tid);
1842                         break;
1843                 default:
1844                         panic("hammer_btree_correct_rhb(): Bad node type");
1845                         error = EINVAL;
1846                         break;
1847                 }
1848         }
1849
1850         /*
1851          * Cleanup
1852          */
1853         while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
1854                 TAILQ_REMOVE(&rhb_list, rhb, entry);
1855                 hammer_unlock(&rhb->node->lock);
1856                 hammer_rel_node(rhb->node);
1857                 kfree(rhb, M_HAMMER);
1858         }
1859         error = hammer_cursor_seek(cursor, orig_node, orig_index);
1860         hammer_unlock(&orig_node->lock);
1861         hammer_rel_node(orig_node);
1862         return (error);
1863 }
1864
1865 /*
1866  * Similar to rhb (in fact, rhb calls lhb), but corrects the left hand
1867  * bound going downward starting at the current cursor position.
1868  *
1869  * This function does not restore the cursor after use.
1870  */
1871 int
1872 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
1873 {
1874         struct hammer_rhb_list rhb_list;
1875         hammer_base_elm_t elm;
1876         hammer_base_elm_t cmp;
1877         struct hammer_rhb *rhb;
1878         int error;
1879
1880         TAILQ_INIT(&rhb_list);
1881
1882         cmp = &cursor->node->ondisk->elms[cursor->index].base;
1883
1884         /*
1885          * Record the node and traverse down the left-hand side for all
1886          * matching records needing a boundary correction.
1887          */
1888         error = 0;
1889         for (;;) {
1890                 rhb = kmalloc(sizeof(*rhb), M_HAMMER, M_WAITOK|M_ZERO);
1891                 rhb->node = cursor->node;
1892                 rhb->index = cursor->index;
1893                 hammer_ref_node(rhb->node);
1894                 hammer_lock_sh(&rhb->node->lock);
1895                 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
1896
1897                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1898                         /*
1899                          * Nothing to traverse down if we are at the right
1900                          * boundary of an internal node.
1901                          */
1902                         if (cursor->index == cursor->node->ondisk->count)
1903                                 break;
1904                 } else {
1905                         elm = &cursor->node->ondisk->elms[cursor->index].base;
1906                         if (elm->btype == HAMMER_BTREE_TYPE_RECORD)
1907                                 break;
1908                         panic("Illegal leaf record type %02x", elm->btype);
1909                 }
1910                 error = hammer_cursor_down(cursor);
1911                 if (error)
1912                         break;
1913
1914                 elm = &cursor->node->ondisk->elms[cursor->index].base;
1915                 if (elm->obj_id != cmp->obj_id ||
1916                     elm->rec_type != cmp->rec_type ||
1917                     elm->key != cmp->key) {
1918                         break;
1919                 }
1920                 if (elm->create_tid >= tid)
1921                         break;
1922
1923         }
1924
1925         /*
1926          * Now we can safely adjust the left-hand boundary from the bottom-up.
1927          * The last element we remove from the list is the caller's right hand
1928          * boundary, which must also be adjusted.
1929          */
1930         while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
1931                 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
1932                 if (error)
1933                         break;
1934                 TAILQ_REMOVE(&rhb_list, rhb, entry);
1935                 hammer_unlock(&rhb->node->lock);
1936                 hammer_rel_node(rhb->node);
1937                 kfree(rhb, M_HAMMER);
1938
1939                 elm = &cursor->node->ondisk->elms[cursor->index].base;
1940                 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
1941                         hammer_modify_node(cursor->trans, cursor->node,
1942                                            &elm->create_tid,
1943                                            sizeof(elm->create_tid));
1944                         elm->create_tid = tid;
1945                         hammer_modify_node_done(cursor->node);
1946                 } else {
1947                         panic("hammer_btree_correct_lhb(): Bad element type");
1948                 }
1949         }
1950
1951         /*
1952          * Cleanup
1953          */
1954         while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
1955                 TAILQ_REMOVE(&rhb_list, rhb, entry);
1956                 hammer_unlock(&rhb->node->lock);
1957                 hammer_rel_node(rhb->node);
1958                 kfree(rhb, M_HAMMER);
1959         }
1960         return (error);
1961 }
1962
1963 /*
1964  * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at
1965  * (cursor->node).  Returns 0 on success, EDEADLK if we could not complete
1966  * the operation due to a deadlock, or some other error.
1967  *
1968  * This routine is always called with an empty, locked leaf but may recurse
1969  * into want-to-be-empty parents as part of its operation.
1970  *
1971  * On return the cursor may end up pointing to an internal node, suitable
1972  * for further iteration but not for an immediate insertion or deletion.
1973  */
1974 static int
1975 btree_remove(hammer_cursor_t cursor)
1976 {
1977         hammer_node_ondisk_t ondisk;
1978         hammer_btree_elm_t elm;
1979         hammer_node_t node;
1980         hammer_node_t parent;
1981         const int esize = sizeof(*elm);
1982         int error;
1983
1984         node = cursor->node;
1985
1986         /*
1987          * When deleting the root of the filesystem convert it to
1988          * an empty leaf node.  Internal nodes cannot be empty.
1989          */
1990         if (node->ondisk->parent == 0) {
1991                 KKASSERT(cursor->parent == NULL);
1992                 hammer_modify_node_all(cursor->trans, node);
1993                 ondisk = node->ondisk;
1994                 ondisk->type = HAMMER_BTREE_TYPE_LEAF;
1995                 ondisk->count = 0;
1996                 hammer_modify_node_done(node);
1997                 cursor->index = 0;
1998                 return(0);
1999         }
2000
2001         /*
2002          * Attempt to remove the parent's reference to the child.  If the
2003          * parent would become empty we have to recurse.  If we fail we 
2004          * leave the parent pointing to an empty leaf node.
2005          */
2006         parent = cursor->parent;
2007
2008         if (parent->ondisk->count == 1) {
2009                 /*
2010                  * This special cursor_up_locked() call leaves the original
2011                  * node exclusively locked and referenced, leaves the
2012                  * original parent locked (as the new node), and locks the
2013                  * new parent.  It can return EDEADLK.
2014                  */
2015                 error = hammer_cursor_up_locked(cursor);
2016                 if (error == 0) {
2017                         error = btree_remove(cursor);
2018                         if (error == 0) {
2019                                 hammer_modify_node_all(cursor->trans, node);
2020                                 ondisk = node->ondisk;
2021                                 ondisk->type = HAMMER_BTREE_TYPE_DELETED;
2022                                 ondisk->count = 0;
2023                                 hammer_modify_node_done(node);
2024                                 hammer_flush_node(node);
2025                                 hammer_delete_node(cursor->trans, node);
2026                         } else {
2027                                 kprintf("Warning: BTREE_REMOVE: Defering "
2028                                         "parent removal1 @ %016llx, skipping\n",
2029                                         node->node_offset);
2030                         }
2031                         hammer_unlock(&node->lock);
2032                         hammer_rel_node(node);
2033                 } else {
2034                         kprintf("Warning: BTREE_REMOVE: Defering parent "
2035                                 "removal2 @ %016llx, skipping\n",
2036                                 node->node_offset);
2037                 }
2038         } else {
2039                 KKASSERT(parent->ondisk->count > 1);
2040
2041                 /*
2042                  * Delete the subtree reference in the parent
2043                  */
2044                 hammer_modify_node_all(cursor->trans, parent);
2045                 ondisk = parent->ondisk;
2046                 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
2047                 elm = &ondisk->elms[cursor->parent_index];
2048                 KKASSERT(elm->internal.subtree_offset == node->node_offset);
2049                 KKASSERT(ondisk->count > 0);
2050                 bcopy(&elm[1], &elm[0],
2051                       (ondisk->count - cursor->parent_index) * esize);
2052                 --ondisk->count;
2053                 hammer_modify_node_done(parent);
2054                 hammer_flush_node(node);
2055                 hammer_delete_node(cursor->trans, node);
2056
2057                 /*
2058                  * cursor->node is invalid, cursor up to make the cursor
2059                  * valid again.
2060                  */
2061                 error = hammer_cursor_up(cursor);
2062         }
2063         return (error);
2064 }
2065
2066 /*
2067  * The element (elm) has been moved to a new internal node (node).
2068  *
2069  * If the element represents a pointer to an internal node that node's
2070  * parent must be adjusted to the element's new location.
2071  *
2072  * XXX deadlock potential here with our exclusive locks
2073  */
2074 int
2075 btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
2076                  hammer_btree_elm_t elm)
2077 {
2078         hammer_node_t child;
2079         int error;
2080
2081         error = 0;
2082
2083         switch(elm->base.btype) {
2084         case HAMMER_BTREE_TYPE_INTERNAL:
2085         case HAMMER_BTREE_TYPE_LEAF:
2086                 child = hammer_get_node(node->hmp, elm->internal.subtree_offset,
2087                                         0, &error);
2088                 if (error == 0) {
2089                         hammer_modify_node_field(trans, child, parent);
2090                         child->ondisk->parent = node->node_offset;
2091                         hammer_modify_node_done(child);
2092                         hammer_rel_node(child);
2093                 }
2094                 break;
2095         default:
2096                 break;
2097         }
2098         return(error);
2099 }
2100
2101 /*
2102  * Exclusively lock all the children of node.  This is used by the split
2103  * code to prevent anyone from accessing the children of a cursor node
2104  * while we fix-up its parent offset.
2105  *
2106  * If we don't lock the children we can really mess up cursors which block
2107  * trying to cursor-up into our node.
2108  *
2109  * On failure EDEADLK (or some other error) is returned.  If a deadlock
2110  * error is returned the cursor is adjusted to block on termination.
2111  */
2112 int
2113 hammer_btree_lock_children(hammer_cursor_t cursor,
2114                            struct hammer_node_locklist **locklistp)
2115 {
2116         hammer_node_t node;
2117         hammer_node_locklist_t item;
2118         hammer_node_ondisk_t ondisk;
2119         hammer_btree_elm_t elm;
2120         hammer_node_t child;
2121         int error;
2122         int i;
2123
2124         node = cursor->node;
2125         ondisk = node->ondisk;
2126         error = 0;
2127
2128         /*
2129          * We really do not want to block on I/O with exclusive locks held,
2130          * pre-get the children before trying to lock the mess.
2131          */
2132         for (i = 0; i < ondisk->count; ++i) {
2133                 ++hammer_stats_btree_elements;
2134                 elm = &ondisk->elms[i];
2135                 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF &&
2136                     elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
2137                         continue;
2138                 }
2139                 child = hammer_get_node(node->hmp,
2140                                         elm->internal.subtree_offset,
2141                                         0, &error);
2142                 if (child)
2143                         hammer_rel_node(child);
2144         }
2145
2146         /*
2147          * Do it for real
2148          */
2149         for (i = 0; error == 0 && i < ondisk->count; ++i) {
2150                 ++hammer_stats_btree_elements;
2151                 elm = &ondisk->elms[i];
2152
2153                 switch(elm->base.btype) {
2154                 case HAMMER_BTREE_TYPE_INTERNAL:
2155                 case HAMMER_BTREE_TYPE_LEAF:
2156                         KKASSERT(elm->internal.subtree_offset != 0);
2157                         child = hammer_get_node(node->hmp,
2158                                                 elm->internal.subtree_offset,
2159                                                 0, &error);
2160                         break;
2161                 default:
2162                         child = NULL;
2163                         break;
2164                 }
2165                 if (child) {
2166                         if (hammer_lock_ex_try(&child->lock) != 0) {
2167                                 if (cursor->deadlk_node == NULL) {
2168                                         cursor->deadlk_node = child;
2169                                         hammer_ref_node(cursor->deadlk_node);
2170                                 }
2171                                 error = EDEADLK;
2172                                 hammer_rel_node(child);
2173                         } else {
2174                                 item = kmalloc(sizeof(*item),
2175                                                 M_HAMMER, M_WAITOK);
2176                                 item->next = *locklistp;
2177                                 item->node = child;
2178                                 *locklistp = item;
2179                         }
2180                 }
2181         }
2182         if (error)
2183                 hammer_btree_unlock_children(locklistp);
2184         return(error);
2185 }
2186
2187
2188 /*
2189  * Release previously obtained node locks.
2190  */
2191 void
2192 hammer_btree_unlock_children(struct hammer_node_locklist **locklistp)
2193 {
2194         hammer_node_locklist_t item;
2195
2196         while ((item = *locklistp) != NULL) {
2197                 *locklistp = item->next;
2198                 hammer_unlock(&item->node->lock);
2199                 hammer_rel_node(item->node);
2200                 kfree(item, M_HAMMER);
2201         }
2202 }
2203
2204 /************************************************************************
2205  *                         MISCELLANIOUS SUPPORT                        *
2206  ************************************************************************/
2207
2208 /*
2209  * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
2210  *
2211  * Note that for this particular function a return value of -1, 0, or +1
2212  * can denote a match if create_tid is otherwise discounted.  A create_tid
2213  * of zero is considered to be 'infinity' in comparisons.
2214  *
2215  * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
2216  */
2217 int
2218 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
2219 {
2220         if (key1->localization < key2->localization)
2221                 return(-5);
2222         if (key1->localization > key2->localization)
2223                 return(5);
2224
2225         if (key1->obj_id < key2->obj_id)
2226                 return(-4);
2227         if (key1->obj_id > key2->obj_id)
2228                 return(4);
2229
2230         if (key1->rec_type < key2->rec_type)
2231                 return(-3);
2232         if (key1->rec_type > key2->rec_type)
2233                 return(3);
2234
2235         if (key1->key < key2->key)
2236                 return(-2);
2237         if (key1->key > key2->key)
2238                 return(2);
2239
2240         /*
2241          * A create_tid of zero indicates a record which is undeletable
2242          * and must be considered to have a value of positive infinity.
2243          */
2244         if (key1->create_tid == 0) {
2245                 if (key2->create_tid == 0)
2246                         return(0);
2247                 return(1);
2248         }
2249         if (key2->create_tid == 0)
2250                 return(-1);
2251         if (key1->create_tid < key2->create_tid)
2252                 return(-1);
2253         if (key1->create_tid > key2->create_tid)
2254                 return(1);
2255         return(0);
2256 }
2257
2258 /*
2259  * Test a timestamp against an element to determine whether the
2260  * element is visible.  A timestamp of 0 means 'infinity'.
2261  */
2262 int
2263 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
2264 {
2265         if (asof == 0) {
2266                 if (base->delete_tid)
2267                         return(1);
2268                 return(0);
2269         }
2270         if (asof < base->create_tid)
2271                 return(-1);
2272         if (base->delete_tid && asof >= base->delete_tid)
2273                 return(1);
2274         return(0);
2275 }
2276
2277 /*
2278  * Create a separator half way inbetween key1 and key2.  For fields just
2279  * one unit apart, the separator will match key2.  key1 is on the left-hand
2280  * side and key2 is on the right-hand side.
2281  *
2282  * key2 must be >= the separator.  It is ok for the separator to match key2.
2283  *
2284  * NOTE: Even if key1 does not match key2, the separator may wind up matching
2285  * key2.
2286  *
2287  * NOTE: It might be beneficial to just scrap this whole mess and just
2288  * set the separator to key2.
2289  */
2290 #define MAKE_SEPARATOR(key1, key2, dest, field) \
2291         dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
2292
2293 static void
2294 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
2295                       hammer_base_elm_t dest)
2296 {
2297         bzero(dest, sizeof(*dest));
2298
2299         dest->rec_type = key2->rec_type;
2300         dest->key = key2->key;
2301         dest->obj_id = key2->obj_id;
2302         dest->create_tid = key2->create_tid;
2303
2304         MAKE_SEPARATOR(key1, key2, dest, localization);
2305         if (key1->localization == key2->localization) {
2306                 MAKE_SEPARATOR(key1, key2, dest, obj_id);
2307                 if (key1->obj_id == key2->obj_id) {
2308                         MAKE_SEPARATOR(key1, key2, dest, rec_type);
2309                         if (key1->rec_type == key2->rec_type) {
2310                                 MAKE_SEPARATOR(key1, key2, dest, key);
2311                                 /*
2312                                  * Don't bother creating a separator for
2313                                  * create_tid, which also conveniently avoids
2314                                  * having to handle the create_tid == 0
2315                                  * (infinity) case.  Just leave create_tid
2316                                  * set to key2.
2317                                  *
2318                                  * Worst case, dest matches key2 exactly,
2319                                  * which is acceptable.
2320                                  */
2321                         }
2322                 }
2323         }
2324 }
2325
2326 #undef MAKE_SEPARATOR
2327
2328 /*
2329  * Return whether a generic internal or leaf node is full
2330  */
2331 static int
2332 btree_node_is_full(hammer_node_ondisk_t node)
2333 {
2334         switch(node->type) {
2335         case HAMMER_BTREE_TYPE_INTERNAL:
2336                 if (node->count == HAMMER_BTREE_INT_ELMS)
2337                         return(1);
2338                 break;
2339         case HAMMER_BTREE_TYPE_LEAF:
2340                 if (node->count == HAMMER_BTREE_LEAF_ELMS)
2341                         return(1);
2342                 break;
2343         default:
2344                 panic("illegal btree subtype");
2345         }
2346         return(0);
2347 }
2348
2349 #if 0
2350 static int
2351 btree_max_elements(u_int8_t type)
2352 {
2353         if (type == HAMMER_BTREE_TYPE_LEAF)
2354                 return(HAMMER_BTREE_LEAF_ELMS);
2355         if (type == HAMMER_BTREE_TYPE_INTERNAL)
2356                 return(HAMMER_BTREE_INT_ELMS);
2357         panic("btree_max_elements: bad type %d\n", type);
2358 }
2359 #endif
2360
2361 void
2362 hammer_print_btree_node(hammer_node_ondisk_t ondisk)
2363 {
2364         hammer_btree_elm_t elm;
2365         int i;
2366
2367         kprintf("node %p count=%d parent=%016llx type=%c\n",
2368                 ondisk, ondisk->count, ondisk->parent, ondisk->type);
2369
2370         /*
2371          * Dump both boundary elements if an internal node
2372          */
2373         if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2374                 for (i = 0; i <= ondisk->count; ++i) {
2375                         elm = &ondisk->elms[i];
2376                         hammer_print_btree_elm(elm, ondisk->type, i);
2377                 }
2378         } else {
2379                 for (i = 0; i < ondisk->count; ++i) {
2380                         elm = &ondisk->elms[i];
2381                         hammer_print_btree_elm(elm, ondisk->type, i);
2382                 }
2383         }
2384 }
2385
2386 void
2387 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
2388 {
2389         kprintf("  %2d", i);
2390         kprintf("\tobj_id       = %016llx\n", elm->base.obj_id);
2391         kprintf("\tkey          = %016llx\n", elm->base.key);
2392         kprintf("\tcreate_tid   = %016llx\n", elm->base.create_tid);
2393         kprintf("\tdelete_tid   = %016llx\n", elm->base.delete_tid);
2394         kprintf("\trec_type     = %04x\n", elm->base.rec_type);
2395         kprintf("\tobj_type     = %02x\n", elm->base.obj_type);
2396         kprintf("\tbtype        = %02x (%c)\n",
2397                 elm->base.btype,
2398                 (elm->base.btype ? elm->base.btype : '?'));
2399         kprintf("\tlocalization = %02x\n", elm->base.localization);
2400
2401         switch(type) {
2402         case HAMMER_BTREE_TYPE_INTERNAL:
2403                 kprintf("\tsubtree_off  = %016llx\n",
2404                         elm->internal.subtree_offset);
2405                 break;
2406         case HAMMER_BTREE_TYPE_RECORD:
2407                 kprintf("\tdata_offset  = %016llx\n", elm->leaf.data_offset);
2408                 kprintf("\tdata_len     = %08x\n", elm->leaf.data_len);
2409                 kprintf("\tdata_crc     = %08x\n", elm->leaf.data_crc);
2410                 break;
2411         }
2412 }