/*
 * HAMMER VFS - Fix degenerate stall condition in flusher during unmount
 *
 * dragonfly.git: sys/vfs/hammer/hammer_rebalance.c
 */
1/*
2 * Copyright (c) 2009 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#include "hammer.h"
36
37static int rebalance_node(struct hammer_ioc_rebalance *rebal,
24cf83d2 38 hammer_cursor_t cursor, hammer_node_lock_t lcache);
1775b6a0
MD
39static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
40 hammer_btree_elm_t elm);
41static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
42 hammer_node_lock_t item, hammer_node_lock_t chld_item);
43
/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forwards iteration is used.
 *
 * All leafs are at the same depth.  We use the b-tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 *
 * Returns 0 on success or a standard errno.  EDEADLK is retried
 * internally (counted in rebal->stat_collisions) and EINTR is converted
 * into HAMMER_IOC_HEAD_INTR with a 0 return.
 */

int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
		   struct hammer_ioc_rebalance *rebal)
{
	struct hammer_cursor cursor;
	struct hammer_node_lock lcache;
	hammer_btree_leaf_elm_t elm;
	int error;
	int seq;

	/*
	 * Validate the ioctl-supplied key range: pseudofs localization
	 * bits may not be set and the range must be forward-ordered.
	 */
	if ((rebal->key_beg.localization | rebal->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (rebal->key_beg.localization > rebal->key_end.localization)
		return(EINVAL);
	if (rebal->key_beg.localization == rebal->key_end.localization) {
		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
			return(EINVAL);
		/* key-space limitations - no check needed */
	}

	/*
	 * Clamp the requested per-node saturation to
	 * [HAMMER_BTREE_INT_ELMS/2, HAMMER_BTREE_INT_ELMS] elements.
	 */
	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
		rebal->saturation = HAMMER_BTREE_INT_ELMS;

	/*
	 * Localize the starting scan position to the inode's PFS.
	 */
	rebal->key_cur = rebal->key_beg;
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	rebal->key_cur.localization += ip->obj_localization;

	hammer_btree_lcache_init(trans->hmp, &lcache, 2);

	/*
	 * Seed the flush-throttle sequence number from the flusher's
	 * completed sequence so the wait below does not block on work
	 * that has already finished (avoids a degenerate stall, e.g.
	 * during unmount — see the fix description at the top of file).
	 */
	seq = trans->hmp->flusher.done;

	/*
	 * Scan forwards.  Retries typically occur if a deadlock is detected.
	 */
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = rebal->key_cur;
	cursor.key_end = rebal->key_end;
	cursor.key_end.localization &= HAMMER_LOCALIZE_MASK;
	cursor.key_end.localization += ip->obj_localization;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * Cause internal nodes to be returned on the way up.  Internal nodes
	 * are not returned on the way down so we can create a degenerate
	 * case to handle internal nodes as a trailing function of their
	 * sub-trees.
	 *
	 * Note that by not setting INSERTING or PRUNING no boundary
	 * corrections will be made and a sync lock is not needed for the
	 * B-Tree scan itself.
	 */
	cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);

	while (error == 0) {
		/*
		 * Rebalancing can be hard on the memory allocator, make
		 * sure there is enough free memory before doing it.
		 * The cursor must be unlocked while blocking on the VM
		 * system or we could deadlock other B-Tree users.
		 */
		if (vm_test_nominal()) {
			hammer_unlock_cursor(&cursor);
			vm_wait_nominal();
			hammer_lock_cursor(&cursor);
		}

		/*
		 * We only care about internal nodes visited for the last
		 * time on the way up... that is, a trailing scan of the
		 * internal node after all of its children have been recursed
		 * through.
		 */
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Leave cursor.index alone, we want to recurse
			 * through all children of the internal node before
			 * visiting it.
			 *
			 * Process the internal node on the way up after
			 * the last child's sub-tree has been balanced.
			 */
			if (cursor.index == cursor.node->ondisk->count - 1) {
				hammer_sync_lock_sh(trans);
				error = rebalance_node(rebal, &cursor, &lcache);
				hammer_sync_unlock(trans);
			}
		} else {
			/*
			 * We don't need to iterate through all the leaf
			 * elements, we only care about the parent (internal)
			 * node.
			 */
			cursor.index = cursor.node->ondisk->count - 1;
		}
		if (error)
			break;

		/*
		 * Update returned scan position and do a flush if
		 * necessary.
		 *
		 * WARNING: We extract the base using the leaf element
		 *	    type but this could be an internal node.  The
		 *	    base is the same either way.
		 *
		 *	    However, due to the rebalancing operation the
		 *	    cursor position may have exceeded the right-hand
		 *	    boundary.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		rebal->key_cur = elm->base;
		++rebal->stat_ncount;

		/*
		 * Throttle: if meta-data buffers or the UNDO FIFO are
		 * getting full, wait for the previously queued flush to
		 * complete and queue another one before continuing.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * Before iterating check if the rebalance operation caused
		 * the cursor to index past the right-hand boundary and make
		 * sure to stop if it does.  Otherwise the iteration may
		 * panic e.g. due to the key maxing out its fields and no
		 * longer being within the strict bounds of the root node.
		 */
		if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
			rebal->key_cur = cursor.key_end;
			break;
		}

		/*
		 * Iterate, stop if a signal was received.
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;
		error = hammer_btree_iterate(&cursor);
	}
	if (error == ENOENT)
		error = 0;		/* normal end-of-scan, not an error */
	hammer_done_cursor(&cursor);
	if (error == EDEADLK) {
		++rebal->stat_collisions;
		goto retry;
	}
	if (error == EINTR) {
		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	/* strip the PFS localization back out of the returned position */
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	hammer_btree_lcache_free(trans->hmp, &lcache);
	return(error);
}
221
/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *	INTERNAL_NODE
 *	/ / | | | \ \
 *	C C C C C C C	children (first level) (internal or leaf nodes)
 *			children's elements (second level)
 *
 *	<<<----------	pack children's elements, possibly remove excess
 *			children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *	 Any live tracked B-Tree nodes must be updated (we worm out of that
 *	 by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *	 case where there are no elements in the leafs.
 *
 * NOTE: All modifications are staged in the node copies and committed
 *	 atomically by hammer_btree_sync_copy() at the end.
 *
 * Returns 0 on success or an errno (e.g. EDEADLK from the cursor
 * upgrade or the child locking pass).
 *
 * XXX live-tracked
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
	       struct hammer_node_lock *lcache)
{
	struct hammer_node_lock lockroot;
	hammer_node_lock_t base_item;
	hammer_node_lock_t chld_item;
	hammer_node_lock_t item;
	hammer_btree_elm_t elm;
	hammer_node_t node;
	hammer_tid_t tid;
	u_int8_t type1;
	int base_count;
	int root_count;
	int avg_elms;
	int count;
	int error;
	int i;
	int n;

	/*
	 * Lock the parent node via the cursor, collect and lock our
	 * children and children's children.
	 *
	 * By the way, this is a LOT of locks.
	 */
	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_cursor_upgrade(cursor);
	if (error)
		goto done;
	error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
	if (error)
		goto done;

	/*
	 * Make a copy of all the locked on-disk data to simplify the element
	 * shifting we are going to have to do.  We will modify the copy
	 * first.
	 */
	hammer_btree_lock_copy(cursor, &lockroot);

	/*
	 * Look at the first child node.  All children are required to be
	 * of the same type (asserted in the count loop below).
	 */
	if (TAILQ_FIRST(&lockroot.list) == NULL)
		goto done;
	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;

	/*
	 * Figure out the total number of children's children and
	 * calculate the average number of elements per child.
	 *
	 * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
	 * is always greater or equal to count.
	 *
	 * If count == 0 we hit a degenerate case which will cause
	 * avg_elms to also calculate as 0.
	 */
	if (hammer_debug_general & 0x1000)
		kprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
	count = 0;
	TAILQ_FOREACH(item, &lockroot.list, entry) {
		if (hammer_debug_general & 0x1000)
			kprintf("add count %d\n", item->count);
		count += item->count;
		KKASSERT(item->node->ondisk->type == type1);
	}
	avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
	KKASSERT(avg_elms >= 0);

	/*
	 * If the average number of elements per child is too low then
	 * calculate the desired number of children (n) such that the
	 * average number of elements is reasonable.
	 *
	 * If the desired number of children is 1 then avg_elms will
	 * wind up being count, which may still be smaller then saturation
	 * but that is ok.
	 */
	if (count && avg_elms < rebal->saturation) {
		n = (count + (rebal->saturation - 1)) / rebal->saturation;
		avg_elms = (count + (n - 1)) / n;
	}

	/*
	 * Pack the elements in the children.  Elements for each item is
	 * packed into base_item until avg_elms is reached, then base_item
	 * iterates.
	 *
	 * hammer_cursor_moved_element() is called for each element moved
	 * to update tracked cursors, including the index beyond the last
	 * element (at count).
	 *
	 * Any cursors tracking the internal node itself must also be
	 * updated, potentially repointing them at a leaf and clearing
	 * ATEDISK.
	 */
	base_item = TAILQ_FIRST(&lockroot.list);
	base_count = 0;
	root_count = 0;		/* number of children retained in the root */

	TAILQ_FOREACH(item, &lockroot.list, entry) {
		node = item->node;
		KKASSERT(item->count == node->ondisk->count);
		chld_item = TAILQ_FIRST(&item->list);
		for (i = 0; i < item->count; ++i) {
			/*
			 * Closeout.  If the next element is at index 0
			 * just use the existing separator in the parent.
			 */
			if (base_count == avg_elms) {
				if (i == 0) {
					elm = &lockroot.node->ondisk->elms[
						item->index];
				} else {
					elm = &node->ondisk->elms[i];
				}
				rebalance_closeout(base_item, base_count, elm);
				base_item = TAILQ_NEXT(base_item, entry);
				KKASSERT(base_item);
				base_count = 0;
				++root_count;
			}

			/*
			 * Check degenerate no-work case.  Otherwise pack
			 * the element.
			 *
			 * All changes are made to the copy.
			 */
			if (item == base_item && i == base_count) {
				++base_count;
				if (chld_item)
					chld_item = TAILQ_NEXT(chld_item, entry);
				continue;
			}

			/*
			 * Pack element.
			 */
			elm = &base_item->copy->elms[base_count];
			*elm = node->ondisk->elms[i];
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;

			/*
			 * Adjust the mirror_tid of the target and the
			 * internal element linkage.
			 *
			 * The parent node (lockroot.node) should already
			 * have an aggregate mirror_tid so we do not have
			 * to update that.  However, it is possible for us
			 * to catch a hammer_btree_mirror_propagate() with
			 * its pants down.  Update the parent if necessary.
			 */
			tid = node->ondisk->mirror_tid;

			if (base_item->copy->mirror_tid < tid) {
				base_item->copy->mirror_tid = tid;
				if (lockroot.copy->mirror_tid < tid) {
					lockroot.copy->mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				if (lockroot.copy->elms[root_count].
				    internal.mirror_tid < tid) {
					lockroot.copy->elms[root_count].
						internal.mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			}

			/*
			 * We moved elm.  The parent pointers for any
			 * children of elm must be repointed.
			 */
			if (item != base_item &&
			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
				KKASSERT(chld_item);
				rebalance_parent_ptrs(base_item, base_count,
						      item, chld_item);
			}
			hammer_cursor_moved_element(item->parent->node,
						    item->index,
						    node, i,
						    base_item->node,
						    base_count);
			++base_count;
			if (chld_item)
				chld_item = TAILQ_NEXT(chld_item, entry);
		}

		/*
		 * Always call at the end (i == number of elements) in
		 * case a cursor is sitting indexed there.
		 */
		hammer_cursor_moved_element(item->parent->node, item->index,
					    node, i,
					    base_item->node, base_count);
	}

	/*
	 * Packing complete, close-out base_item using the right-hand
	 * boundary of the original parent.
	 *
	 * If we will be deleting nodes from the root shift the old
	 * right-hand-boundary to the new ending index.
	 */
	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
	rebalance_closeout(base_item, base_count, elm);
	++root_count;
	if (lockroot.copy->count != root_count) {
		lockroot.copy->count = root_count;
		lockroot.copy->elms[root_count] = *elm;
		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
	}

	/*
	 * Any extra items beyond base_item are now completely empty and
	 * can be destroyed.  Queue the destruction up in the copy.  Note
	 * that none of the destroyed nodes are part of our cursor.
	 *
	 * The cursor is locked so it isn't on the tracking list.  It
	 * should have been pointing at the boundary element (at root_count).
	 * When deleting elements from the root (which is cursor.node), we
	 * have to update the cursor.index manually to keep it in bounds.
	 */
	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
		hammer_cursor_removed_node(base_item->node, lockroot.node,
					   base_count);
		hammer_cursor_deleted_element(lockroot.node, base_count);
		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
		base_item->copy->count = 0;
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (cursor->index > lockroot.copy->count)
			--cursor->index;
		++rebal->stat_deletions;
	}

	/*
	 * All done, sync the locked child tree to disk.  This will also
	 * flush and delete deleted nodes.
	 */
	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
	hammer_cursor_downgrade(cursor);
	return (error);
}
496
497/*
498 * Close-out the child base_item. This node contains base_count
499 * elements.
500 *
501 * If the node is an internal node the right-hand boundary must be
502 * set to elm.
503 */
504static
505void
506rebalance_closeout(hammer_node_lock_t base_item, int base_count,
507 hammer_btree_elm_t elm)
508{
509 hammer_node_lock_t parent;
510 hammer_btree_elm_t base_elm;
511 hammer_btree_elm_t rbound_elm;
512 u_int8_t save;
513
514 /*
515 * Update the count. NOTE: base_count can be 0 for the
516 * degenerate leaf case.
517 */
518 if (hammer_debug_general & 0x1000) {
519 kprintf("rebalance_closeout %016llx:",
973c11b9 520 (long long)base_item->node->node_offset);
1775b6a0
MD
521 }
522 if (base_item->copy->count != base_count) {
523 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
524 base_item->copy->count = base_count;
525 if (hammer_debug_general & 0x1000)
526 kprintf(" (count update)");
527 }
528
529 /*
530 * If we are closing out an internal node we must assign
531 * a right-hand boundary. Use the element contents as the
532 * right-hand boundary.
533 *
534 * Internal nodes are required to have at least one child,
535 * otherwise the left and right boundary would end up being
536 * the same element. Only leaf nodes can be empty.
7ddc70d1
MD
537 *
538 * Rebalancing may cut-off an internal node such that the
539 * new right hand boundary is the next element anyway, but
540 * we still have to make sure that subtree_offset, btype,
541 * and mirror_tid are all 0.
1775b6a0
MD
542 */
543 if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
544 KKASSERT(base_count != 0);
545 base_elm = &base_item->copy->elms[base_count];
546
7ddc70d1
MD
547 if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
548 elm->internal.subtree_offset ||
549 elm->internal.mirror_tid ||
550 elm->base.btype) {
1775b6a0
MD
551 *base_elm = *elm;
552 base_elm->internal.subtree_offset = 0;
7ddc70d1 553 base_elm->internal.mirror_tid = 0;
1775b6a0
MD
554 base_elm->base.btype = 0;
555 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
556 if (hammer_debug_general & 0x1000)
557 kprintf(" (rhs update)");
558 } else {
559 if (hammer_debug_general & 0x1000)
560 kprintf(" (rhs same)");
561 }
562 }
563
564 /*
565 * The parent's boundary must be updated. Be careful to retain
566 * the btype and non-base internal fields as that information is
567 * unrelated.
568 */
569 parent = base_item->parent;
570 rbound_elm = &parent->copy->elms[base_item->index + 1];
571 if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
572 save = rbound_elm->base.btype;
573 rbound_elm->base = elm->base;
574 rbound_elm->base.btype = save;
575 parent->flags |= HAMMER_NODE_LOCK_UPDATED;
576 if (hammer_debug_general & 0x1000) {
577 kprintf(" (parent bound update %d)",
578 base_item->index + 1);
579 }
580 }
581 if (hammer_debug_general & 0x1000)
582 kprintf("\n");
583}
584
585/*
586 * An element in item has moved to base_item. We must update the parent
587 * pointer of the node the element points to (which is chld_item).
588 */
589static
590void
591rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
592 hammer_node_lock_t item, hammer_node_lock_t chld_item)
593{
594 KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
595 chld_item->copy->parent = base_item->node->node_offset;
596 chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
597 hammer_cursor_parent_changed(chld_item->node,
598 item->node, base_item->node, index);
599}