2 * Copyright (c) 2009 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
38 hammer_cursor_t cursor);
39 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
40 hammer_btree_elm_t elm);
41 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
42 hammer_node_lock_t item, hammer_node_lock_t chld_item);
45 * Iterate through the specified range of object ids and rebalance B-Tree
46 * leaf and internal nodes we encounter. A forwards iteration is used.
48 * All leafs are at the same depth. We use the b-tree scan code loosely
49 * to position ourselves and create degenerate cases to skip indices
50 * that we have rebalanced in bulk.
/*
 * hammer_ioc_rebalance - kernel backend for the HAMMER 'rebalance' ioctl.
 *
 * Forward-scans the B-Tree across the caller-supplied key range
 * (re-localized to ip's pseudofs) and rebalances each internal node on
 * its trailing (upward) visit via rebalance_node().  rebal->key_cur and
 * rebal->stat_ncount are updated in place so an interrupted run can be
 * resumed by userland.
 *
 * NOTE(review): lines appear to have been elided from this chunk (the
 * embedded line numbers jump); local declarations, error returns, braces
 * and the deadlock-retry label are not visible here.
 */
54 hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
55 struct hammer_ioc_rebalance *rebal)
57 struct hammer_cursor cursor;
58 hammer_btree_leaf_elm_t elm;
	/*
	 * Argument sanity checks.  Ranges selecting pseudofs localization
	 * bits, or with end < beg, are rejected (the actual error returns
	 * are elided from this view -- presumably EINVAL; verify).
	 */
62 if ((rebal->key_beg.localization | rebal->key_end.localization) &
63 HAMMER_LOCALIZE_PSEUDOFS_MASK) {
66 if (rebal->key_beg.localization > rebal->key_end.localization)
68 if (rebal->key_beg.localization == rebal->key_end.localization) {
69 if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
71 /* key-space limitations - no check needed */
	/*
	 * Clamp the requested fill target to
	 * [HAMMER_BTREE_INT_ELMS / 2, HAMMER_BTREE_INT_ELMS].
	 */
73 if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
74 rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
75 if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
76 rebal->saturation = HAMMER_BTREE_INT_ELMS;
	/*
	 * Start the scan at key_beg, substituting this inode's pseudofs
	 * localization into the key.
	 */
78 rebal->key_cur = rebal->key_beg;
79 rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
80 rebal->key_cur.localization += ip->obj_localization;
	/* Flusher sequence snapshot used by the throttle loop below. */
82 seq = trans->hmp->flusher.act;
85 * Scan forwards. Retries typically occur if a deadlock is detected.
88 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	/* (error path: tear the cursor down -- surrounding test elided) */
90 hammer_done_cursor(&cursor);
	/*
	 * Range-bound the cursor.  The end key is localized the same way
	 * as key_cur and is inclusive; BACKEND marks this as a
	 * backend-side cursor.
	 */
93 cursor.key_beg = rebal->key_cur;
94 cursor.key_end = rebal->key_end;
95 cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
96 cursor.key_end.localization += ip->obj_localization;
97 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
98 cursor.flags |= HAMMER_CURSOR_BACKEND;
101 * Cause internal nodes to be returned on the way up. Internal nodes
102 * are not returned on the way down so we can create a degenerate
103 * case to handle internal nodes as a trailing function of their
106 * Note that by not setting INSERTING or PRUNING no boundary
107 * corrections will be made and a sync lock is not needed for the
108 * B-Tree scan itself.
110 cursor.flags |= HAMMER_CURSOR_REBLOCKING;
	/* Position on the first element; the loop advances via hammer_btree_iterate(). */
112 error = hammer_btree_first(&cursor);
116 * We only care about internal nodes visited for the last
117 * time on the way up... that is, a trailing scan of the
118 * internal node after all of its children have been recursed
121 if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
123 * Leave cursor.index alone, we want to recurse
124 * through all children of the internal node before
127 * Process the internal node on the way up after
128 * the last child's sub-tree has been balanced.
	/* Trailing visit == cursor sitting on the node's last element. */
130 if (cursor.index == cursor.node->ondisk->count - 1) {
	/* rebalance_node() is run under the shared sync lock. */
131 hammer_sync_lock_sh(trans);
132 error = rebalance_node(rebal, &cursor);
133 hammer_sync_unlock(trans);
137 * We don't need to iterate through all the leaf
138 * elements, we only care about the parent (internal)
	/* Leaf node: jump straight to its last element to skip the leaf scan. */
141 cursor.index = cursor.node->ondisk->count - 1;
147 * Update returned scan position and do a flush if
	/* Record the resume position and bump the node-visit statistic. */
150 elm = &cursor.node->ondisk->elms[cursor.index].leaf;
151 rebal->key_cur = elm->base;
152 ++rebal->stat_ncount;
	/*
	 * Throttle: while meta-data buffers or UNDO FIFO space are tight,
	 * drop the cursor lock, wait for the flusher to catch up, then
	 * kick off another async flush cycle.
	 */
154 while (hammer_flusher_meta_halflimit(trans->hmp) ||
155 hammer_flusher_undo_exhausted(trans, 2)) {
156 hammer_unlock_cursor(&cursor);
157 hammer_flusher_wait(trans->hmp, seq);
158 hammer_lock_cursor(&cursor);
159 seq = hammer_flusher_async_one(trans->hmp);
163 * Iterate, stop if a signal was received.
165 if ((error = hammer_signal_check(trans->hmp)) != 0)
167 error = hammer_btree_iterate(&cursor);
	/*
	 * Scan complete or aborted.  EDEADLK presumably restarts the scan
	 * from key_cur (retry target elided from this view); EINTR is
	 * reported back to userland via the ioctl head flags.
	 */
171 hammer_done_cursor(&cursor);
172 if (error == EDEADLK)
174 if (error == EINTR) {
175 rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
	/* Strip the localization back to its caller-visible form. */
179 rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
184 * Rebalance an internal node, called via a trailing upward recursion.
185 * All the children have already been individually rebalanced.
187 * To rebalance we scan the elements in the children and pack them,
188 * so we actually need to lock the children and the children's children.
192 * C C C C C C C children (first level) (internal or leaf nodes)
193 * children's elements (second level)
195 * <<<---------- pack children's elements, possibly remove excess
196 * children after packing.
198 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
199 * Any live tracked B-Tree nodes must be updated (we worm out of that
200 * by not allowing any). And boundary elements must be preserved.
202 * NOTE: If the children are leaf nodes we may have a degenerate
203 * case where there are no elements in the leafs.
/*
 * rebalance_node - rebalance one internal node (trailing upward visit).
 *
 * Packs the elements of cursor->node's children into as few children as
 * possible (avg_elms per child, derived from rebal->saturation), then
 * deletes the children that become empty.  All modifications are staged
 * in the hammer_node_lock 'copy' structures and written out at the end
 * by hammer_btree_sync_copy().
 *
 * NOTE(review): lines are elided from this chunk; the declarations of
 * count/i/n/avg_elms/base_count/root_count/node/type1/error, several
 * error branches, and the assignments that advance node, base_count and
 * root_count are not visible here.
 */
208 rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
210 struct hammer_node_lock lockroot;
211 hammer_node_lock_t base_item;
212 hammer_node_lock_t chld_item;
213 hammer_node_lock_t item;
214 hammer_btree_elm_t elm;
226 * Lock the parent node via the cursor, collect and lock our
227 * children and children's children.
229 * By the way, this is a LOT of locks.
231 hammer_node_lock_init(&lockroot, cursor->node);
232 error = hammer_cursor_upgrade(cursor);
	/* Depth 2: lock children plus children's children. */
235 error = hammer_btree_lock_children(cursor, 2, &lockroot);
240 * Make a copy of all the locked on-disk data to simplify the element
241 * shifting we are going to have to do. We will modify the copy
244 hammer_btree_lock_copy(cursor, &lockroot);
247 * Look at the first child node.
	/* No children at all -- nothing to rebalance (exit path elided). */
249 if (TAILQ_FIRST(&lockroot.list) == NULL)
	/* All children must share this type; asserted in the loop below. */
251 type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;
254 * Figure out the total number of children's children and
255 * calculate the average number of elements per child.
257 * The minimum avg_elms is 1 when count > 0. avg_elms * root_elms
258 * is always greater or equal to count.
260 * If count == 0 we hit a degenerate case which will cause
261 * avg_elms to also calculate as 0.
263 if (hammer_debug_general & 0x1000)
264 kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
266 TAILQ_FOREACH(item, &lockroot.list, entry) {
267 if (hammer_debug_general & 0x1000)
268 kprintf("add count %d\n", item->count);
269 count += item->count;
270 KKASSERT(item->node->ondisk->type == type1);
	/* Ceiling division so avg_elms * lockroot.count >= count. */
272 avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
273 KKASSERT(avg_elms >= 0);
276 * If the average number of elements per child is too low then
277 * calculate the desired number of children (n) such that the
278 * average number of elements is reasonable.
280 * If the desired number of children is 1 then avg_elms will
281 * wind up being count, which may still be smaller than saturation
284 if (count && avg_elms < rebal->saturation) {
285 n = (count + (rebal->saturation - 1)) / rebal->saturation;
286 avg_elms = (count + (n - 1)) / n;
290 * Pack the elements in the children. Elements for each item are
291 * packed into base_item until avg_elms is reached, then base_item
294 * hammer_cursor_moved_element() is called for each element moved
295 * to update tracked cursors, including the index beyond the last
296 * element (at count).
	/* base_item: the destination child currently being filled. */
298 base_item = TAILQ_FIRST(&lockroot.list);
	/* Outer loop: each source child; inner loop: its elements. */
302 TAILQ_FOREACH(item, &lockroot.list, entry) {
	/* ('node = item->node' assignment appears elided from this view.) */
304 KKASSERT(item->count == node->ondisk->count);
305 chld_item = TAILQ_FIRST(&item->list);
306 for (i = 0; i < item->count; ++i) {
308 * Closeout. If the next element is at index 0
309 * just use the existing separator in the parent.
	/*
	 * base_item is full: close it out against the appropriate
	 * separator and advance to the next destination child.
	 */
311 if (base_count == avg_elms) {
313 elm = &lockroot.node->ondisk->elms[
316 elm = &node->ondisk->elms[i];
318 rebalance_closeout(base_item, base_count, elm);
319 base_item = TAILQ_NEXT(base_item, entry);
326 * Check degenerate no-work case. Otherwise pack
329 * All changes are made to the copy.
	/* Element already in place; just step past it. */
331 if (item == base_item && i == base_count) {
334 chld_item = TAILQ_NEXT(chld_item, entry);
	/* Copy the element into base_item's staging copy and dirty it. */
341 elm = &base_item->copy->elms[base_count];
342 *elm = node->ondisk->elms[i];
343 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
346 * Adjust the mirror_tid of the target. The parent
347 * node (lockroot.node) should already have an
348 * aggregate mirror_tid so we do not have to update
351 if (base_item->copy->mirror_tid <
352 node->ondisk->mirror_tid) {
353 base_item->copy->mirror_tid =
354 node->ondisk->mirror_tid;
355 KKASSERT(lockroot.node->ondisk->mirror_tid >=
356 node->ondisk->mirror_tid);
357 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
361 * We moved elm. The parent pointers for any
362 * children of elm must be repointed.
364 if (item != base_item &&
365 node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
367 rebalance_parent_ptrs(base_item, base_count,
	/* Notify tracked cursors of the element's new home. */
370 hammer_cursor_moved_element(node, base_item->node,
374 chld_item = TAILQ_NEXT(chld_item, entry);
378 * Always call at the end (i == number of elements) in
379 * case a cursor is sitting indexed there.
381 hammer_cursor_moved_element(node, base_item->node,
386 * Packing complete, close-out base_item using the right-hand
387 * boundary of the original parent.
389 * If we will be deleting nodes from the root shift the old
390 * right-hand-boundary to the new ending index.
392 elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
393 rebalance_closeout(base_item, base_count, elm);
	/* Shrink the root if children were removed (root_count maintained in elided code). */
395 if (lockroot.copy->count != root_count) {
396 lockroot.copy->count = root_count;
397 lockroot.copy->elms[root_count] = *elm;
398 lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
402 * Any extra items beyond base_item are now completely empty and
403 * can be destroyed. Queue the destruction up in the copy. Note
404 * that none of the destroyed nodes are part of our cursor.
406 * The cursor is locked so it isn't on the tracking list. It
407 * should have been pointing at the boundary element (at root_count).
408 * When deleting elements from the root (which is cursor.node), we
409 * have to update the cursor.index manually to keep it in bounds.
411 while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
412 hammer_cursor_removed_node(base_item->node, lockroot.node,
414 hammer_cursor_deleted_element(lockroot.node, base_count);
	/* Stage the node deletion; the sync pass below performs it. */
415 base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
416 base_item->copy->count = 0;
417 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
	/* Keep cursor->index within the shrunken root (adjustment elided). */
418 if (cursor->index > lockroot.copy->count)
423 * All done, sync the locked child tree to disk. This will also
424 * flush and delete deleted nodes.
426 hammer_btree_sync_copy(cursor, &lockroot);
428 hammer_btree_unlock_children(cursor, &lockroot);
429 hammer_cursor_downgrade(cursor);
434 * Close-out the child base_item. This node contains base_count
437 * If the node is an internal node the right-hand boundary must be
/*
 * rebalance_closeout - finalize one packed child (base_item holding
 * base_count elements) and propagate its new right-hand boundary into
 * the parent's separator element.
 *
 * 'elm' supplies the boundary: either the first unmoved element of the
 * next source child or the parent's original right-hand boundary.
 *
 * NOTE(review): lines are elided here (the declaration of 'save', some
 * closing braces, and trailing debug kprintf lines are not visible).
 */
442 rebalance_closeout(hammer_node_lock_t base_item, int base_count,
443 hammer_btree_elm_t elm)
445 hammer_node_lock_t parent;
446 hammer_btree_elm_t base_elm;
447 hammer_btree_elm_t rbound_elm;
451 * Update the count. NOTE: base_count can be 0 for the
452 * degenerate leaf case.
454 if (hammer_debug_general & 0x1000) {
455 kprintf("rebalance_closeout %016llx:",
456 base_item->node->node_offset);
	/* Stage the new element count; dirty the copy only if it changed. */
458 if (base_item->copy->count != base_count) {
459 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
460 base_item->copy->count = base_count;
461 if (hammer_debug_general & 0x1000)
462 kprintf(" (count update)");
466 * If we are closing out an internal node we must assign
467 * a right-hand boundary. Use the element contents as the
468 * right-hand boundary.
470 * Internal nodes are required to have at least one child,
471 * otherwise the left and right boundary would end up being
472 * the same element. Only leaf nodes can be empty.
474 if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
475 KKASSERT(base_count != 0);
	/* Boundary slot lives one past the last live element. */
476 base_elm = &base_item->copy->elms[base_count];
	/* Only rewrite the boundary slot if it actually differs. */
478 if (bcmp(base_elm, elm, sizeof(*elm)) != 0) {
	/*
	 * A boundary element carries no subtree and no btype.
	 * (The '*base_elm = *elm' copy appears elided from this view
	 * -- confirm against the upstream source.)
	 */
480 base_elm->internal.subtree_offset = 0;
481 base_elm->base.btype = 0;
482 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
483 if (hammer_debug_general & 0x1000)
484 kprintf(" (rhs update)");
486 if (hammer_debug_general & 0x1000)
487 kprintf(" (rhs same)");
492 * The parent's boundary must be updated. Be careful to retain
493 * the btype and non-base internal fields as that information is
496 parent = base_item->parent;
	/* Separator immediately to the right of this child in the parent. */
497 rbound_elm = &parent->copy->elms[base_item->index + 1];
498 if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
	/* Preserve btype across the base overwrite. */
499 save = rbound_elm->base.btype;
500 rbound_elm->base = elm->base;
501 rbound_elm->base.btype = save;
502 parent->flags |= HAMMER_NODE_LOCK_UPDATED;
503 if (hammer_debug_general & 0x1000) {
504 kprintf(" (parent bound update %d)",
505 base_item->index + 1);
508 if (hammer_debug_general & 0x1000)
513 * An element in item has moved to base_item. We must update the parent
514 * pointer of the node the element points to (which is chld_item).
/*
 * rebalance_parent_ptrs - repoint a moved element's child node.
 *
 * An element was moved from 'item' to 'base_item' (at slot 'index').
 * The node that element points to (chld_item) must have its on-disk
 * parent pointer updated, and any live cursors tracking that child must
 * be told its parent changed.  The update is staged in chld_item->copy
 * and flagged for write-out by the caller's sync pass.
 */
518 rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
519 hammer_node_lock_t item, hammer_node_lock_t chld_item)
	/* The child must currently claim 'item' as its parent. */
521 KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
522 chld_item->copy->parent = base_item->node->node_offset;
523 chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
	/* Fix up tracked cursors that reference the old parent linkage. */
524 hammer_cursor_parent_changed(chld_item->node,
525 item->node, base_item->node, index);