HAMMER VFS - Correct serious bug in hammer rebalancing code
sys/vfs/hammer/hammer_rebalance.c
/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer.h"

static int rebalance_node(struct hammer_ioc_rebalance *rebal,
                        hammer_cursor_t cursor);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                        hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                        hammer_node_lock_t item, hammer_node_lock_t chld_item);
/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forwards iteration is used.
 *
 * All leaves are at the same depth.  We use the b-tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 */

int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_rebalance *rebal)
{
        struct hammer_cursor cursor;
        hammer_btree_leaf_elm_t elm;
        int error;
        int seq;

        if ((rebal->key_beg.localization | rebal->key_end.localization) &
            HAMMER_LOCALIZE_PSEUDOFS_MASK) {
                return(EINVAL);
        }
        if (rebal->key_beg.localization > rebal->key_end.localization)
                return(EINVAL);
        if (rebal->key_beg.localization == rebal->key_end.localization) {
                if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
                        return(EINVAL);
                /* key-space limitations - no check needed */
        }
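        /*
         * Clamp the requested saturation, the target number of elements
         * to pack into each node when rebalancing.  Anything between
         * half-full and completely full (relative to an internal node)
         * is accepted.
         */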
        if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
                rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
        if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
                rebal->saturation = HAMMER_BTREE_INT_ELMS;

        rebal->key_cur = rebal->key_beg;
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        rebal->key_cur.localization += ip->obj_localization;

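        /*
         * Remember the flusher sequence that is currently active.  The
         * scan loop below waits on this sequence (and then kicks off a
         * new async flush) whenever the flusher's meta-data buffers or
         * the UNDO FIFO get too full.
         */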
        seq = trans->hmp->flusher.act;

        /*
         * Scan forwards.  Retries typically occur if a deadlock is detected.
         */
retry:
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
        if (error) {
                hammer_done_cursor(&cursor);
                goto failed;
        }
        cursor.key_beg = rebal->key_cur;
        cursor.key_end = rebal->key_end;
        cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
        cursor.key_end.localization += ip->obj_localization;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * Cause internal nodes to be returned on the way up.  Internal nodes
         * are not returned on the way down so we can create a degenerate
         * case to handle internal nodes as a trailing function of their
         * sub-trees.
         *
         * Note that by not setting INSERTING or PRUNING no boundary
         * corrections will be made and a sync lock is not needed for the
         * B-Tree scan itself.
         */
        cursor.flags |= HAMMER_CURSOR_REBLOCKING;

        error = hammer_btree_first(&cursor);

        while (error == 0) {
                /*
                 * We only care about internal nodes visited for the last
                 * time on the way up... that is, a trailing scan of the
                 * internal node after all of its children have been recursed
                 * through.
                 */
                if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        /*
                         * Leave cursor.index alone, we want to recurse
                         * through all children of the internal node before
                         * visiting it.
                         *
                         * Process the internal node on the way up after
                         * the last child's sub-tree has been balanced.
                         */
                        if (cursor.index == cursor.node->ondisk->count - 1) {
                                hammer_sync_lock_sh(trans);
                                error = rebalance_node(rebal, &cursor);
                                hammer_sync_unlock(trans);
                        }
                } else {
                        /*
                         * We don't need to iterate through all the leaf
                         * elements, we only care about the parent (internal)
                         * node.
                         */
                        cursor.index = cursor.node->ondisk->count - 1;
                }
                if (error)
                        break;

                /*
                 * Update returned scan position and do a flush if
                 * necessary.
                 *
                 * WARNING: We extract the base using the leaf element
                 *          type but this could be an internal node.  The
                 *          base is the same either way.
                 *
                 *          However, due to the rebalancing operation the
                 *          cursor position may have exceeded the right-hand
                 *          boundary.
                 *
                 * WARNING: See warnings in hammer_unlock_cursor()
                 *          function.
                 */
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
                rebal->key_cur = elm->base;
                ++rebal->stat_ncount;

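                /*
                 * Flow control: if the flusher's meta-data buffers are
                 * over half full, or the UNDO FIFO is close to exhausted,
                 * unlock the cursor, wait for the flush sequence recorded
                 * in seq to complete, then start another async flush
                 * before continuing the scan.
                 */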
                while (hammer_flusher_meta_halflimit(trans->hmp) ||
                       hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor);
                        seq = hammer_flusher_async_one(trans->hmp);
                }

                /*
                 * Before iterating check if the rebalance operation caused
                 * the cursor to index past the right-hand boundary and make
                 * sure to stop if it does.  Otherwise the iteration may
                 * panic e.g. due to the key maxing out its fields and no
                 * longer being within the strict bounds of the root node.
                 */
                if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
                        rebal->key_cur = cursor.key_end;
                        break;
                }

                /*
                 * Iterate, stop if a signal was received.
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
                error = hammer_btree_iterate(&cursor);
        }
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
        if (error == EDEADLK) {
                ++rebal->stat_collisions;
                goto retry;
        }
        if (error == EINTR) {
                rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
                error = 0;
        }
failed:
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        return(error);
}

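/*
 * Illustrative sketch only -- not part of this file.  Userland reaches
 * this code through hammer(8)'s "rebalance" directive, roughly as below.
 * Treat HAMMERIOC_REBALANCE and HAMMER_MIN/MAX_OBJID as assumptions
 * taken from the HAMMER headers rather than text from this file:
 *
 *	struct hammer_ioc_rebalance rebal;
 *
 *	bzero(&rebal, sizeof(rebal));
 *	rebal.key_beg.obj_id = HAMMER_MIN_OBJID;
 *	rebal.key_end.obj_id = HAMMER_MAX_OBJID;
 *	rebal.saturation = HAMMER_BTREE_INT_ELMS * 75 / 100;
 *	if (ioctl(fd, HAMMERIOC_REBALANCE, &rebal) < 0)
 *		err(1, "rebalance");
 */
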
/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *	INTERNAL_NODE
 *	/ / | | | \ \
 *     C C  C C C  C C	children (first level) (internal or leaf nodes)
 *			children's elements (second level)
 *
 *	<<<----------	pack children's elements, possibly remove excess
 *			children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *	 Any live tracked B-Tree nodes must be updated (we worm out of that
 *	 by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *	 where there are no elements in the leaves.
 *
 * XXX live-tracked
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
{
        struct hammer_node_lock lockroot;
        hammer_node_lock_t base_item;
        hammer_node_lock_t chld_item;
        hammer_node_lock_t item;
        hammer_btree_elm_t elm;
        hammer_node_t node;
        hammer_tid_t tid;
        u_int8_t type1;
        int base_count;
        int root_count;
        int avg_elms;
        int count;
        int error;
        int i;
        int n;

        /*
         * Lock the parent node via the cursor, collect and lock our
         * children and children's children.
         *
         * By the way, this is a LOT of locks.
         */
        hammer_node_lock_init(&lockroot, cursor->node);
        error = hammer_cursor_upgrade(cursor);
        if (error)
                goto done;
        error = hammer_btree_lock_children(cursor, 2, &lockroot);
        if (error)
                goto done;

        /*
         * Make a copy of all the locked on-disk data to simplify the element
         * shifting we are going to have to do.  We will modify the copy
         * first.
         */
        hammer_btree_lock_copy(cursor, &lockroot);

        /*
         * Look at the first child node.
         */
        if (TAILQ_FIRST(&lockroot.list) == NULL)
                goto done;
        type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

        /*
         * Figure out the total number of children's children and
         * calculate the average number of elements per child.
         *
         * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
         * is always greater than or equal to count.
         *
         * If count == 0 we hit a degenerate case which will cause
         * avg_elms to also calculate as 0.
         */
        if (hammer_debug_general & 0x1000)
                kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
        count = 0;
        TAILQ_FOREACH(item, &lockroot.list, entry) {
                if (hammer_debug_general & 0x1000)
                        kprintf("add count %d\n", item->count);
                count += item->count;
                KKASSERT(item->node->ondisk->type == type1);
        }
        avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
        KKASSERT(avg_elms >= 0);

        /*
         * If the average number of elements per child is too low then
         * calculate the desired number of children (n) such that the
         * average number of elements is reasonable.
         *
         * If the desired number of children is 1 then avg_elms will
         * wind up being count, which may still be smaller than the
         * saturation target, but that is ok.
         */
        if (count && avg_elms < rebal->saturation) {
                n = (count + (rebal->saturation - 1)) / rebal->saturation;
                avg_elms = (count + (n - 1)) / n;
        }
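
        /*
         * Worked example (illustrative numbers only): with count = 100
         * children's elements spread across lockroot.count = 8 children,
         * the ceiling division gives avg_elms = (100+7)/8 = 13.  If
         * saturation is 32 that is too sparse, so we target
         * n = (100+31)/32 = 4 children and recompute
         * avg_elms = (100+3)/4 = 25, i.e. four ~25-element nodes
         * instead of eight sparse ones.
         */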

        /*
         * Pack the elements in the children.  Elements in each item are
         * packed into base_item until avg_elms is reached, then base_item
         * iterates.
         *
         * hammer_cursor_moved_element() is called for each element moved
         * to update tracked cursors, including the index beyond the last
         * element (at count).
         *
         * Any cursors tracking the internal node itself must also be
         * updated, potentially repointing them at a leaf and clearing
         * ATEDISK.
         */
        base_item = TAILQ_FIRST(&lockroot.list);
        base_count = 0;
        root_count = 0;

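        /*
         * Packing state: base_item/base_count identify the child node and
         * element index currently being filled, while root_count counts
         * the children that remain in the parent after packing.
         *
         * Toy illustration with avg_elms = 4:
         *
         *	before:  [A B] [C] [D E F] [G]
         *	after:   [A B C D] [E F G]
         *
         * The two emptied children are queued for destruction further
         * below.
         */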
        TAILQ_FOREACH(item, &lockroot.list, entry) {
                node = item->node;
                KKASSERT(item->count == node->ondisk->count);
                chld_item = TAILQ_FIRST(&item->list);
                for (i = 0; i < item->count; ++i) {
                        /*
                         * Closeout.  If the next element is at index 0
                         * just use the existing separator in the parent.
                         */
                        if (base_count == avg_elms) {
                                if (i == 0) {
                                        elm = &lockroot.node->ondisk->elms[
                                                item->index];
                                } else {
                                        elm = &node->ondisk->elms[i];
                                }
                                rebalance_closeout(base_item, base_count, elm);
                                base_item = TAILQ_NEXT(base_item, entry);
                                KKASSERT(base_item);
                                base_count = 0;
                                ++root_count;
                        }

                        /*
                         * Check degenerate no-work case.  Otherwise pack
                         * the element.
                         *
                         * All changes are made to the copy.
                         */
                        if (item == base_item && i == base_count) {
                                ++base_count;
                                if (chld_item)
                                        chld_item = TAILQ_NEXT(chld_item, entry);
                                continue;
                        }

                        /*
                         * Pack element.
                         */
                        elm = &base_item->copy->elms[base_count];
                        *elm = node->ondisk->elms[i];
                        base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

                        /*
                         * Adjust the mirror_tid of the target and the
                         * internal element linkage.
                         *
                         * The parent node (lockroot.node) should already
                         * have an aggregate mirror_tid so we do not have
                         * to update that.  However, it is possible for us
                         * to catch a hammer_btree_mirror_propagate() with
                         * its pants down.  Update the parent if necessary.
                         */
                        tid = node->ondisk->mirror_tid;

                        if (base_item->copy->mirror_tid < tid) {
                                base_item->copy->mirror_tid = tid;
                                if (lockroot.copy->mirror_tid < tid) {
                                        lockroot.copy->mirror_tid = tid;
                                        lockroot.flags |=
                                                HAMMER_NODE_LOCK_UPDATED;
                                }
                                if (lockroot.copy->elms[root_count].
                                    internal.mirror_tid < tid) {
                                        lockroot.copy->elms[root_count].
                                                internal.mirror_tid = tid;
                                        lockroot.flags |=
                                                HAMMER_NODE_LOCK_UPDATED;
                                }
                                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                        }

                        /*
                         * We moved elm.  The parent pointers for any
                         * children of elm must be repointed.
                         */
                        if (item != base_item &&
                            node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                                KKASSERT(chld_item);
                                rebalance_parent_ptrs(base_item, base_count,
                                                      item, chld_item);
                        }
                        hammer_cursor_moved_element(item->parent->node,
                                                    item->index,
                                                    node, i,
                                                    base_item->node,
                                                    base_count);
                        ++base_count;
                        if (chld_item)
                                chld_item = TAILQ_NEXT(chld_item, entry);
                }

                /*
                 * Always call at the end (i == number of elements) in
                 * case a cursor is sitting indexed there.
                 */
                hammer_cursor_moved_element(item->parent->node, item->index,
                                            node, i,
                                            base_item->node, base_count);
        }

        /*
         * Packing complete, close-out base_item using the right-hand
         * boundary of the original parent.
         *
         * If we will be deleting nodes from the root shift the old
         * right-hand-boundary to the new ending index.
         */
        elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
        rebalance_closeout(base_item, base_count, elm);
        ++root_count;
        if (lockroot.copy->count != root_count) {
                lockroot.copy->count = root_count;
                lockroot.copy->elms[root_count] = *elm;
                lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
        }

        /*
         * Any extra items beyond base_item are now completely empty and
         * can be destroyed.  Queue the destruction up in the copy.  Note
         * that none of the destroyed nodes are part of our cursor.
         *
         * The cursor is locked so it isn't on the tracking list.  It
         * should have been pointing at the boundary element (at root_count).
         * When deleting elements from the root (which is cursor.node), we
         * have to update the cursor.index manually to keep it in bounds.
         */
        while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
                hammer_cursor_removed_node(base_item->node, lockroot.node,
                                           base_count);
                hammer_cursor_deleted_element(lockroot.node, base_count);
                base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
                base_item->copy->count = 0;
                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                if (cursor->index > lockroot.copy->count)
                        --cursor->index;
                ++rebal->stat_deletions;
        }

        /*
         * All done, sync the locked child tree to disk.  This will also
         * flush and delete deleted nodes.
         */
        rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
        hammer_btree_unlock_children(cursor, &lockroot);
        hammer_cursor_downgrade(cursor);
        return (error);
}

/*
 * Close-out the child base_item.  This node contains base_count
 * elements.
 *
 * If the node is an internal node the right-hand boundary must be
 * set to elm.
 */
static
void
rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                   hammer_btree_elm_t elm)
{
        hammer_node_lock_t parent;
        hammer_btree_elm_t base_elm;
        hammer_btree_elm_t rbound_elm;
        u_int8_t save;

        /*
         * Update the count.  NOTE:  base_count can be 0 for the
         * degenerate leaf case.
         */
        if (hammer_debug_general & 0x1000) {
                kprintf("rebalance_closeout %016llx:",
                        (long long)base_item->node->node_offset);
        }
        if (base_item->copy->count != base_count) {
                base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                base_item->copy->count = base_count;
                if (hammer_debug_general & 0x1000)
                        kprintf(" (count update)");
        }

        /*
         * If we are closing out an internal node we must assign
         * a right-hand boundary.  Use the element contents as the
         * right-hand boundary.
         *
         * Internal nodes are required to have at least one child,
         * otherwise the left and right boundary would end up being
         * the same element.  Only leaf nodes can be empty.
         *
         * Rebalancing may cut-off an internal node such that the
         * new right hand boundary is the next element anyway, but
         * we still have to make sure that subtree_offset, btype,
         * and mirror_tid are all 0.
         */
        if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
                KKASSERT(base_count != 0);
                base_elm = &base_item->copy->elms[base_count];

                if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
                    elm->internal.subtree_offset ||
                    elm->internal.mirror_tid ||
                    elm->base.btype) {
                        *base_elm = *elm;
                        base_elm->internal.subtree_offset = 0;
                        base_elm->internal.mirror_tid = 0;
                        base_elm->base.btype = 0;
                        base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
                        if (hammer_debug_general & 0x1000)
                                kprintf(" (rhs update)");
                } else {
                        if (hammer_debug_general & 0x1000)
                                kprintf(" (rhs same)");
                }
        }

        /*
         * The parent's boundary must be updated.  Be careful to retain
         * the btype and non-base internal fields as that information is
         * unrelated.
         */
        parent = base_item->parent;
        rbound_elm = &parent->copy->elms[base_item->index + 1];
        if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
                save = rbound_elm->base.btype;
                rbound_elm->base = elm->base;
                rbound_elm->base.btype = save;
                parent->flags |= HAMMER_NODE_LOCK_UPDATED;
                if (hammer_debug_general & 0x1000) {
                        kprintf(" (parent bound update %d)",
                                base_item->index + 1);
                }
        }
        if (hammer_debug_general & 0x1000)
                kprintf("\n");
}

/*
 * An element in item has moved to base_item.  We must update the parent
 * pointer of the node the element points to (which is chld_item).
 */
static
void
rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
                      hammer_node_lock_t item, hammer_node_lock_t chld_item)
{
        KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
        chld_item->copy->parent = base_item->node->node_offset;
        chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
        hammer_cursor_parent_changed(chld_item->node,
                                     item->node, base_item->node, index);
}