Commit | Line | Data |
---|---|---|
8cd0a023 | 1 | /* |
b84de5af | 2 | * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. |
745703c7 | 3 | * |
8cd0a023 MD |
4 | * This code is derived from software contributed to The DragonFly Project |
5 | * by Matthew Dillon <dillon@backplane.com> | |
745703c7 | 6 | * |
8cd0a023 MD |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
745703c7 | 10 | * |
8cd0a023 MD |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
745703c7 | 20 | * |
8cd0a023 MD |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
8cd0a023 MD |
33 | */ |
34 | ||
35 | /* | |
36 | * HAMMER B-Tree index - cursor support routines | |
37 | */ | |
38 | #include "hammer.h" | |
39 | ||
f36a9737 | 40 | static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive); |
8cd0a023 MD |
41 | |
/*
 * Initialize a fresh cursor using the B-Tree node cache.  If the cache
 * is not available initialize a fresh cursor at the root of the filesystem.
 *
 * trans  - transaction the cursor operates under; its type (flusher vs
 *	    frontend) selects exclusive vs shared inode locking below.
 * cursor - caller-supplied cursor structure, zeroed and filled in here.
 * cache  - optional cached B-Tree node position to start from.
 * ip     - optional inode the operation is on behalf of; locked here and
 *	    unlocked by hammer_done_cursor().
 *
 * Returns 0 on success (asserted at the bottom) or a node/volume
 * acquisition error.
 */
int
hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
		   hammer_node_cache_t cache, hammer_inode_t ip)
{
	hammer_volume_t volume;
	hammer_node_t node;
	hammer_mount_t hmp;
	u_int tticks;
	int error;

	bzero(cursor, sizeof(*cursor));

	cursor->trans = trans;
	hmp = trans->hmp;

	/*
	 * As the number of inodes queued to the flusher increases we use
	 * time-domain multiplexing to control read vs flush performance.
	 * We have to do it here, before acquiring any ip or node locks,
	 * to avoid deadlocking or excessively delaying the flusher.
	 *
	 * The full time period is hammer_tdmux_ticks, typically 1/5 of
	 * a second.
	 *
	 * inode allocation begins to get restrained at 2/4 the limit
	 * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
	 * limiting read activity before that to try to avoid processes
	 * stalling out in "hmrrcm".
	 */
	tticks = hammer_tdmux_ticks;
	if (trans->type != HAMMER_TRANS_FLS && tticks &&
	    hmp->count_reclaims > hammer_limit_reclaims / tticks &&
	    hmp->count_reclaims > hammer_autoflush * 2 &&
	    hammer_flusher_running(hmp)) {
		u_int rticks;
		u_int xticks;
		u_int dummy;	/* unique wait channel address for tsleep */

		/*
		 * 0 ... xticks ... tticks
		 *
		 * rticks is the calculated position, xticks is the demarc
		 * where values below xticks are reserved for the flusher
		 * and values >= to xticks may be used by the frontend.
		 *
		 * At least one tick is always made available for the
		 * frontend.
		 */
		rticks = (u_int)ticks % tticks;
		xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;

		/*
		 * Ensure rticks and xticks are stable
		 */
		cpu_ccfence();
		if (rticks < xticks) {
			if (hammer_debug_general & 0x0004)
				hdkprintf("rt %3u, xt %3u, tt %3u\n",
					rticks, xticks, tticks);
			tsleep(&dummy, 0, "htdmux", xticks - rticks);
		}
	}

	/*
	 * If the cursor operation is on behalf of an inode, lock
	 * the inode.
	 *
	 * When acquiring a shared lock on an inode on which the backend
	 * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second)
	 * for the deadlock to clear.
	 */
	if ((cursor->ip = ip) != NULL) {
		++ip->cursor_ip_refs;
		if (trans->type == HAMMER_TRANS_FLS) {
			hammer_lock_ex(&ip->lock);
		} else {
#if 0
			if (ip->cursor_exclreq_count) {
				tsleep(&ip->cursor_exclreq_count, 0,
				       "hstag1", hammer_tdmux_ticks);
			}
#endif
			hammer_lock_sh(&ip->lock);
		}
	}

	/*
	 * Step 1 - acquire a locked node from the cache if possible
	 */
	if (cache && cache->node) {
		node = hammer_ref_node_safe(trans, cache, &error);
		if (error == 0) {
			hammer_lock_sh(&node->lock);
			/* cached node may have been deleted since cached */
			if (node->flags & HAMMER_NODE_DELETED) {
				hammer_unlock(&node->lock);
				hammer_rel_node(node);
				node = NULL;
			}
		}
		if (node == NULL)
			++hammer_stats_btree_root_iterations;
	} else {
		node = NULL;
		++hammer_stats_btree_root_iterations;
	}

	/*
	 * Step 2 - If we couldn't get a node from the cache, get
	 * the one from the root of the filesystem.
	 */
	while (node == NULL) {
		volume = hammer_get_root_volume(hmp, &error);
		if (error)
			break;
		node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
				       0, &error);
		/*
		 * NOTE(review): volume->ondisk is dereferenced again in the
		 * retest below after hammer_rel_volume().  Presumably the
		 * root volume remains referenced/mapped for the life of the
		 * mount so the ondisk pointer stays valid -- confirm.
		 */
		hammer_rel_volume(volume, 0);
		if (error)
			break;
		/*
		 * When the frontend acquires the root b-tree node while the
		 * backend is deadlocked on it, wait up to hammer_tdmux_ticks
		 * (1 second) for the deadlock to clear.
		 */
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag3", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);

		/*
		 * If someone got in before we could lock the node, retry.
		 */
		if (node->flags & HAMMER_NODE_DELETED) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
		if (volume->ondisk->vol0_btree_root != node->node_offset) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
	}

	/*
	 * Step 3 - finish initializing the cursor by acquiring the parent
	 */
	cursor->node = node;
	if (error == 0)
		error = hammer_load_cursor_parent(cursor, 0);
	KKASSERT(error == 0);
	/* if (error) hammer_done_cursor(cursor); */
	return(error);
}
206 | ||
4e17f465 MD |
207 | /* |
208 | * Normalize a cursor. Sometimes cursors can be left in a state | |
209 | * where node is NULL. If the cursor is in this state, cursor up. | |
210 | */ | |
211 | void | |
212 | hammer_normalize_cursor(hammer_cursor_t cursor) | |
213 | { | |
214 | if (cursor->node == NULL) { | |
215 | KKASSERT(cursor->parent != NULL); | |
216 | hammer_cursor_up(cursor); | |
217 | } | |
218 | } | |
219 | ||
220 | ||
8cd0a023 MD |
/*
 * We are finished with a cursor.  We NULL out various fields as sanity
 * check, in case the structure is inappropriately used afterwards.
 *
 * Releases, in order: the parent node, the current node, any data
 * buffer, the inode lock/ref taken by hammer_init_cursor(), and any
 * in-memory record reference.  Finally resolves any recorded deadlock
 * state (deadlk_node / deadlk_rec) by waiting for the conflicting
 * holder to release.
 */
void
hammer_done_cursor(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	/* a tracked cursor must be untracked (locked) before teardown */
	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	if (cursor->node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = NULL;
	}
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
	}
	if ((ip = cursor->ip) != NULL) {
		KKASSERT(ip->cursor_ip_refs > 0);
		--ip->cursor_ip_refs;
		hammer_unlock(&ip->lock);
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * If we deadlocked this node will be referenced.  Do a quick
	 * lock/unlock to wait for the deadlock condition to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}

	cursor->data = NULL;
	cursor->leaf = NULL;
	cursor->left_bound = NULL;
	cursor->right_bound = NULL;
	cursor->trans = NULL;
}
296 | ||
6a37e7e4 MD |
/*
 * Upgrade cursor->node and cursor->parent to exclusive locks.  This
 * function can return EDEADLK.
 *
 * The lock must already be either held shared or already held exclusively
 * by us.
 *
 * We upgrade the parent first as it is the most likely to collide first
 * with the downward traversal that the frontend typically does.
 *
 * If we fail to upgrade the lock and cursor->deadlk_node is NULL,
 * we add another reference to the node that failed and set
 * cursor->deadlk_node so hammer_done_cursor() can block on it.
 */
int
hammer_cursor_upgrade(hammer_cursor_t cursor)
{
	int error;

	if (cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	} else {
		error = 0;
	}
	/* only try the node if the parent upgrade succeeded */
	if (error == 0) {
		error = hammer_lock_upgrade(&cursor->node->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->node;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#if 0
	/* historical node-first ordering, retained for reference */
	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	} else if (error == 0 && cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#endif
	return(error);
}
347 | ||
7bc5b8c2 MD |
348 | int |
349 | hammer_cursor_upgrade_node(hammer_cursor_t cursor) | |
350 | { | |
351 | int error; | |
352 | ||
bb29b5d8 | 353 | error = hammer_lock_upgrade(&cursor->node->lock, 1); |
7bc5b8c2 MD |
354 | if (error && cursor->deadlk_node == NULL) { |
355 | cursor->deadlk_node = cursor->node; | |
356 | hammer_ref_node(cursor->deadlk_node); | |
357 | } | |
358 | return(error); | |
359 | } | |
360 | ||
6a37e7e4 | 361 | /* |
d34bdd31 | 362 | * Downgrade cursor->node and cursor->parent to shared locks. |
6a37e7e4 MD |
363 | */ |
364 | void | |
365 | hammer_cursor_downgrade(hammer_cursor_t cursor) | |
366 | { | |
7aa3b8a6 | 367 | if (hammer_lock_excl_owned(&cursor->node->lock, curthread)) |
bb29b5d8 | 368 | hammer_lock_downgrade(&cursor->node->lock, 1); |
7aa3b8a6 MD |
369 | if (cursor->parent && |
370 | hammer_lock_excl_owned(&cursor->parent->lock, curthread)) { | |
bb29b5d8 | 371 | hammer_lock_downgrade(&cursor->parent->lock, 1); |
7aa3b8a6 | 372 | } |
6a37e7e4 MD |
373 | } |
374 | ||
bb29b5d8 MD |
375 | /* |
376 | * Upgrade and downgrade pairs of cursors. This is used by the dedup | |
377 | * code which must deal with two cursors. A special function is needed | |
378 | * because some of the nodes may be shared between the two cursors, | |
379 | * resulting in share counts > 1 which will normally cause an upgrade | |
380 | * to fail. | |
381 | */ | |
382 | static __noinline | |
383 | int | |
384 | collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node) | |
385 | { | |
386 | int i; | |
387 | ||
388 | for (i = 0; i < n; ++i) { | |
389 | if (array[i] == node) | |
390 | break; | |
391 | } | |
392 | if (i == n) { | |
393 | array[i] = node; | |
394 | counts[i] = 1; | |
395 | ++i; | |
396 | } else { | |
397 | ++counts[i]; | |
398 | } | |
399 | return(i); | |
400 | } | |
401 | ||
402 | int | |
403 | hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) | |
404 | { | |
405 | hammer_node_t nodes[4]; | |
406 | int counts[4]; | |
407 | int error; | |
408 | int i; | |
409 | int n; | |
410 | ||
411 | n = collect_node(nodes, counts, 0, cursor1->node); | |
412 | if (cursor1->parent) | |
413 | n = collect_node(nodes, counts, n, cursor1->parent); | |
414 | n = collect_node(nodes, counts, n, cursor2->node); | |
415 | if (cursor2->parent) | |
416 | n = collect_node(nodes, counts, n, cursor2->parent); | |
417 | ||
418 | error = 0; | |
419 | for (i = 0; i < n; ++i) { | |
420 | error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]); | |
421 | if (error) | |
422 | break; | |
423 | } | |
424 | if (error) { | |
425 | while (--i >= 0) | |
426 | hammer_lock_downgrade(&nodes[i]->lock, counts[i]); | |
427 | } | |
428 | return (error); | |
429 | } | |
430 | ||
431 | void | |
432 | hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2) | |
433 | { | |
434 | hammer_node_t nodes[4]; | |
435 | int counts[4]; | |
436 | int i; | |
437 | int n; | |
438 | ||
439 | n = collect_node(nodes, counts, 0, cursor1->node); | |
440 | if (cursor1->parent) | |
441 | n = collect_node(nodes, counts, n, cursor1->parent); | |
442 | n = collect_node(nodes, counts, n, cursor2->node); | |
443 | if (cursor2->parent) | |
444 | n = collect_node(nodes, counts, n, cursor2->parent); | |
445 | ||
446 | for (i = 0; i < n; ++i) | |
447 | hammer_lock_downgrade(&nodes[i]->lock, counts[i]); | |
448 | } | |
449 | ||
32c90105 MD |
/*
 * Seek the cursor to the specified node and index.
 *
 * The caller must ref the node prior to calling this routine and release
 * it after it returns.  If the seek succeeds the cursor will gain its own
 * ref on the node.
 *
 * The cursor is downgraded to shared locks as a side effect.  If the
 * target differs from the current node the parent is dropped and
 * reloaded for the new position.
 */
int
hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
{
	int error;

	hammer_cursor_downgrade(cursor);
	error = 0;

	if (cursor->node != node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);

		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
			cursor->parent = NULL;
			cursor->parent_index = 0;
		}
		error = hammer_load_cursor_parent(cursor, 0);
	}
	cursor->index = index;
	return (error);
}
6a37e7e4 | 484 | |
/*
 * Load the parent of cursor->node into cursor->parent.
 *
 * If the node has no on-disk parent it is the B-Tree root: parent is
 * cleared and the cursor bounds are set to the mount-wide root bounds.
 * Otherwise the parent node is acquired (exclusively when try_exclusive
 * is set) and the bounds are taken from the parent's element bracketing
 * the child.  Can return an error (including EDEADLK when try_exclusive).
 */
static
int
hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
{
	hammer_mount_t hmp;
	hammer_node_t parent;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;
	int parent_index;

	hmp = cursor->trans->hmp;

	if (cursor->node->ondisk->parent) {
		node = cursor->node;
		parent = hammer_btree_get_parent(cursor->trans, node,
						 &parent_index,
						 &error, try_exclusive);
		if (error == 0) {
			elm = &parent->ondisk->elms[parent_index];
			cursor->parent = parent;
			cursor->parent_index = parent_index;
			/* elm[0]/elm[1] bracket the child's key space */
			cursor->left_bound = &elm[0].internal.base;
			cursor->right_bound = &elm[1].internal.base;
		}
	} else {
		cursor->parent = NULL;
		cursor->parent_index = 0;
		cursor->left_bound = &hmp->root_btree_beg;
		cursor->right_bound = &hmp->root_btree_end;
		error = 0;
	}
	return(error);
}
522 | ||
8cd0a023 MD |
/*
 * Cursor up to our parent node.  Return ENOENT if we are at the root of
 * the filesystem.
 *
 * The cursor is downgraded to shared locks as a side effect.  The old
 * node is unlocked and released; the parent becomes the current node
 * and a new parent is loaded for it.
 */
int
hammer_cursor_up(hammer_cursor_t cursor)
{
	int error;

	hammer_cursor_downgrade(cursor);

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/*
	 * Set the node to its parent.
	 */
	hammer_unlock(&cursor->node->lock);
	hammer_rel_node(cursor->node);
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}
554 | ||
f36a9737 MD |
/*
 * Special cursor up given a locked cursor.  The original node is not
 * unlocked or released and the cursor is not downgraded.
 *
 * This function can fail with EDEADLK.
 *
 * This function is only run when recursively deleting parent nodes
 * to get rid of an empty leaf.
 */
int
hammer_cursor_up_locked(hammer_cursor_t cursor)
{
	hammer_node_t save;
	int error;
	int save_index;

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/* remember the current position so a failure can be rolled back */
	save = cursor->node;
	save_index = cursor->index;

	/*
	 * Set the node to its parent.
	 */
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	/*
	 * load the new parent, attempt to exclusively lock it.  Note that
	 * we are still holding the old parent (now cursor->node) exclusively
	 * locked.
	 *
	 * This can return EDEADLK.  Undo the operation on any error.  These
	 * up sequences can occur during iterations so be sure to restore
	 * the index.
	 */
	error = hammer_load_cursor_parent(cursor, 1);
	if (error) {
		cursor->parent = cursor->node;
		cursor->parent_index = cursor->index;
		cursor->node = save;
		cursor->index = save_index;
	}
	return(error);
}
607 | ||
608 | ||
8cd0a023 MD |
/*
 * Cursor down through the current node, which must be an internal node.
 *
 * This routine adjusts the cursor and sets index to 0.  On error the
 * cursor's node remains NULL (callers normalize or tear down).
 */
int
hammer_cursor_down(hammer_cursor_t cursor)
{
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;

	/*
	 * The current node becomes the current parent
	 */
	hammer_cursor_downgrade(cursor);
	node = cursor->node;
	KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
	}
	cursor->parent = node;
	cursor->parent_index = cursor->index;
	cursor->node = NULL;
	cursor->index = 0;

	/*
	 * Extract element to push into at (node,index), set bounds.
	 */
	elm = &node->ondisk->elms[cursor->parent_index];

	/*
	 * Ok, push down into elm of an internal node.
	 */
	KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
	KKASSERT(elm->internal.subtree_offset != 0);
	cursor->left_bound = &elm[0].internal.base;
	cursor->right_bound = &elm[1].internal.base;
	node = hammer_get_node(cursor->trans,
			       elm->internal.subtree_offset, 0, &error);
	if (error == 0) {
		/* cross-check the child's type and parent linkage */
		KASSERT(elm->base.btype == node->ondisk->type,
			("BTYPE MISMATCH %c %c NODE %p",
			 elm->base.btype, node->ondisk->type, node));
		if (node->ondisk->parent != cursor->parent->node_offset)
			hpanic("node %p %016jx vs %016jx",
				node,
				(intmax_t)node->ondisk->parent,
				(intmax_t)cursor->parent->node_offset);
		KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
	}

	/*
	 * If no error occurred we can lock the new child node.  If the
	 * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second)
	 * for the deadlock to clear.  Otherwise a large number of concurrent
	 * readers can continuously stall the flusher.
	 *
	 * We specifically do this in the cursor_down() code in order to
	 * deal with frontend top-down searches smashing against bottom-up
	 * flusher-based mirror updates.  These collisions typically occur
	 * above the inode in the B-Tree and are not covered by the
	 * ip->cursor_exclreq_count logic.
	 */
	if (error == 0) {
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag2", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);
		KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
		cursor->node = node;
		cursor->index = 0;
	}
	return(error);
}
689 | ||
b3bad96f MD |
690 | /************************************************************************ |
691 | * DEADLOCK RECOVERY * | |
692 | ************************************************************************ | |
693 | * | |
694 | * These are the new deadlock recovery functions. Currently they are only | |
695 | * used for the mirror propagation and physical node removal cases but | |
696 | * ultimately the intention is to use them for all deadlock recovery | |
697 | * operations. | |
c9ce54d6 MD |
698 | * |
699 | * WARNING! The contents of the cursor may be modified while unlocked. | |
700 | * passive modifications including adjusting the node, parent, | |
701 | * indexes, and leaf pointer. | |
702 | * | |
703 | * An outright removal of the element the cursor was pointing at | |
704 | * will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set, | |
705 | * which chains to causing the HAMMER_CURSOR_RETEST to be set | |
706 | * when the cursor is locked again. | |
b3bad96f | 707 | */ |
/*
 * Unlock a cursor, placing it on its node's tracking list so B-Tree
 * operations performed by other threads while we are unlocked can
 * adjust it.  The cursor must be re-heated with hammer_lock_cursor().
 */
void
hammer_unlock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	KKASSERT(cursor->node);

	/*
	 * Release the cursor's locks and track B-Tree operations on node.
	 * While being tracked our cursor can be modified by other threads
	 * and the node may be replaced.
	 */
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	node = cursor->node;
	cursor->flags |= HAMMER_CURSOR_TRACKED;
	TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
	hammer_unlock(&node->lock);
}
731 | ||
/*
 * Get the cursor heated up again.  The cursor's node may have
 * changed and we might have to locate the new parent.
 *
 * If the exact element we were on got deleted RIPOUT will be
 * set and we must clear ATEDISK so an iteration does not skip
 * the element after it.
 */
int
hammer_lock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);

	/*
	 * Relock the node.  cursor->node may be replaced by another
	 * thread while we sleep on the lock, so loop until the node we
	 * locked is still the cursor's current node.
	 */
	for (;;) {
		node = cursor->node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		if (cursor->node == node) {
			/* drop our temp ref; the cursor holds its own */
			hammer_rel_node(node);
			break;
		}
		hammer_unlock(&node->lock);
		hammer_rel_node(node);
	}

	/*
	 * Untrack the cursor, clean up, and re-establish the parent node.
	 */
	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	cursor->flags &= ~HAMMER_CURSOR_TRACKED;

	/*
	 * If a ripout has occurred iterations must re-test the (new)
	 * current element.  Clearing ATEDISK prevents the element from
	 * being skipped and RETEST causes it to be re-tested.
	 */
	if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
		cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		cursor->flags |= HAMMER_CURSOR_RETEST;
	}
	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}
782 | ||
/*
 * Recover from a deadlocked cursor, tracking any node removals or
 * replacements.  If the cursor's current node is removed by another
 * thread (via btree_remove()) the cursor will be seeked upwards.
 *
 * The caller is working a modifying operation and must be holding the
 * sync lock (shared).  We do not release the sync lock because this
 * would break atomicity.
 */
int
hammer_recover_cursor(hammer_cursor_t cursor)
{
	hammer_transaction_t trans __debugvar;
#if 0
	hammer_inode_t ip;
#endif
	int error;

	/* unlock and track the cursor while we wait out the deadlock */
	hammer_unlock_cursor(cursor);

#if 0
	ip = cursor->ip;
#endif
	trans = cursor->trans;
	KKASSERT(trans->sync_lock_refs > 0);

	/*
	 * Wait for the deadlock to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}
	error = hammer_lock_cursor(cursor);
	return(error);
}
844 | ||
/*
 * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
 * is effectively unlocked and becomes tracked.  If ocursor was not locked
 * then ncursor also inherits the tracking.
 *
 * After the caller finishes working with ncursor it must be cleaned up
 * with hammer_done_cursor(), and the caller must re-lock ocursor.
 *
 * Returns the newly allocated cursor (M_WAITOK allocation, never NULL).
 */
hammer_cursor_t
hammer_push_cursor(hammer_cursor_t ocursor)
{
	hammer_cursor_t ncursor;
	hammer_inode_t ip;
	hammer_node_t node;
	hammer_mount_t hmp;

	hmp = ocursor->trans->hmp;
	ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
	bcopy(ocursor, ncursor, sizeof(*ocursor));

	/*
	 * ocursor keeps a reference on its current node and becomes
	 * tracked so concurrent node removals/replacements adjust it
	 * while it is effectively unlocked.
	 */
	node = ocursor->node;
	hammer_ref_node(node);
	if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
		ocursor->flags |= HAMMER_CURSOR_TRACKED;
		TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
	}
	/*
	 * ncursor inherited ocursor's parent lock and cached data; clear
	 * those fields in ocursor so they are not released twice.
	 */
	if (ncursor->parent)
		ocursor->parent = NULL;
	ocursor->data_buffer = NULL;
	ocursor->leaf = NULL;
	ocursor->data = NULL;
	/* if ocursor was already tracked, ncursor inherits the tracking */
	if (ncursor->flags & HAMMER_CURSOR_TRACKED)
		TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
	/* ncursor needs its own inode and in-memory record references */
	if ((ip = ncursor->ip) != NULL) {
		++ip->cursor_ip_refs;
	}
	if (ncursor->iprec)
		hammer_ref(&ncursor->iprec->lock);
	return(ncursor);
}
885 | ||
886 | /* | |
887 | * Destroy ncursor and restore ocursor | |
888 | * | |
889 | * This is a temporary hack for the release. We can't afford to lose | |
890 | * the IP lock until the IP object scan code is able to deal with it, | |
891 | * so have ocursor inherit it back. | |
892 | */ | |
893 | void | |
894 | hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor) | |
895 | { | |
bac808fe | 896 | hammer_mount_t hmp; |
3f43fb33 MD |
897 | hammer_inode_t ip; |
898 | ||
bac808fe | 899 | hmp = ncursor->trans->hmp; |
3f43fb33 MD |
900 | ip = ncursor->ip; |
901 | ncursor->ip = NULL; | |
902 | if (ip) | |
903 | --ip->cursor_ip_refs; | |
904 | hammer_done_cursor(ncursor); | |
bac808fe | 905 | kfree(ncursor, hmp->m_misc); |
3f43fb33 | 906 | KKASSERT(ocursor->ip == ip); |
982be4bf | 907 | hammer_lock_cursor(ocursor); |
b3bad96f MD |
908 | } |
909 | ||
910 | /* | |
911 | * onode is being replaced by nnode by the reblocking code. | |
912 | */ | |
913 | void | |
914 | hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode) | |
915 | { | |
916 | hammer_cursor_t cursor; | |
c9ce54d6 MD |
917 | hammer_node_ondisk_t ondisk; |
918 | hammer_node_ondisk_t nndisk; | |
919 | ||
920 | ondisk = onode->ondisk; | |
921 | nndisk = nnode->ondisk; | |
b3bad96f MD |
922 | |
923 | while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) { | |
924 | TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry); | |
925 | TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); | |
926 | KKASSERT(cursor->node == onode); | |
c9ce54d6 MD |
927 | if (cursor->leaf == &ondisk->elms[cursor->index].leaf) |
928 | cursor->leaf = &nndisk->elms[cursor->index].leaf; | |
b3bad96f MD |
929 | cursor->node = nnode; |
930 | hammer_ref_node(nnode); | |
931 | hammer_rel_node(onode); | |
932 | } | |
933 | } | |
934 | ||
/*
 * We have removed <node> from the parent and collapsed the parent.
 *
 * Cursors in deadlock recovery are seeked upward to the parent so the
 * btree_remove() recursion works properly even though we have marked
 * the cursor as requiring a reseek.
 *
 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
 * meaning the cursor is no longer definitively pointing at an element
 * within its iteration (if the cursor is being used to iterate).  The
 * iteration code will take this into account instead of asserting if the
 * cursor is outside the iteration range.
 */
void
hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	KKASSERT(parent != NULL);
	ondisk = node->ondisk;

	/*
	 * Pop each tracked cursor off the removed node and retarget it
	 * at (parent, index).
	 */
	while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
		KKASSERT(cursor->node == node);
		KKASSERT(cursor->index == 0);
		TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
		/* cached leaf pointer into the removed node is now stale */
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = NULL;
		cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
		cursor->node = parent;
		cursor->index = index;
		hammer_ref_node(parent);
		hammer_rel_node(node);
	}
}
972 | ||
/*
 * node was split at (onode, index) with elements >= index moved to nnode.
 *
 * Cursors at or beyond the split point migrate to nnode with their index
 * rebased; cursors below the split point stay on onode untouched.
 */
void
hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

again:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index < index)
			continue;
		/*
		 * Move the cursor to nnode.  Restart the scan after each
		 * move since we are modifying the list being iterated.
		 */
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
		cursor->node = nnode;
		cursor->index -= index;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again;
	}
}
1002 | ||
/*
 * An element was moved from one node to another or within a node.  The
 * index may also represent the end of the node (index == numelements).
 *
 * {oparent,pindex} is the parent node's pointer to onode/oindex.
 *
 * This is used by the rebalancing code.  This is not an insertion or
 * deletion and any additional elements, including the degenerate case at
 * the end of the node, will be dealt with by additional distinct calls.
 */
void
hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
			    hammer_node_t onode, int oindex,
			    hammer_node_t nnode, int nindex)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	/*
	 * Adjust any cursors pointing at the element
	 */
	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;
again1:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index != oindex)
			continue;
		/* restart the scan after each move; we mutate the list */
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again1;
	}

	/*
	 * When moving the first element of onode to a different node any
	 * cursor which is pointing at (oparent,pindex) must be repointed
	 * to nnode and ATEDISK must be cleared.
	 *
	 * This prevents cursors from losing track due to insertions.
	 * Insertions temporarily release the cursor in order to update
	 * the mirror_tids.  It primarily effects the mirror_write code.
	 * The other code paths generally only do a single insertion and
	 * then relookup or drop the cursor.
	 */
	if (onode == nnode || oindex)
		return;
	/* ondisk is repointed at the parent; oindex is 0 past this point */
	ondisk = oparent->ondisk;
again2:
	TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == oparent);
		if (cursor->index != pindex)
			continue;
		hkprintf("debug: shifted cursor pointing at parent\n"
			"parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
			(intmax_t)oparent->node_offset, pindex,
			(intmax_t)onode->node_offset, oindex,
			(intmax_t)nnode->node_offset, nindex);
		TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		/*
		 * NOTE(review): ondisk->elms[oindex] here indexes the
		 * parent's ondisk image with onode's element index (always
		 * 0 on this path) -- looks intentional but worth confirming
		 * against the rebalancer's caller.
		 */
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		hammer_ref_node(nnode);
		hammer_rel_node(oparent);
		goto again2;
	}
}
1079 | ||
1080 | /* | |
1081 | * The B-Tree element pointing to the specified node was moved from (oparent) | |
1082 | * to (nparent, nindex). We must locate any tracked cursors pointing at | |
1083 | * node and adjust their parent accordingly. | |
1084 | * | |
1085 | * This is used by the rebalancing code when packing elements causes an | |
1086 | * element to shift from one node to another. | |
1087 | */ | |
1088 | void | |
1089 | hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent, | |
1090 | hammer_node_t nparent, int nindex) | |
1091 | { | |
1092 | hammer_cursor_t cursor; | |
1093 | ||
1094 | again: | |
1095 | TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { | |
1096 | KKASSERT(cursor->node == node); | |
1097 | if (cursor->parent == oparent) { | |
1098 | cursor->parent = nparent; | |
1099 | cursor->parent_index = nindex; | |
1100 | hammer_ref_node(nparent); | |
1101 | hammer_rel_node(oparent); | |
1102 | goto again; | |
1103 | } | |
1104 | } | |
1105 | } | |
1106 | ||
b3bad96f | 1107 | /* |
e4a5ff06 | 1108 | * Deleted element at (node, index) |
b3bad96f MD |
1109 | * |
1110 | * Shift indexes >= index | |
1111 | */ | |
1112 | void | |
1113 | hammer_cursor_deleted_element(hammer_node_t node, int index) | |
1114 | { | |
1115 | hammer_cursor_t cursor; | |
c9ce54d6 MD |
1116 | hammer_node_ondisk_t ondisk; |
1117 | ||
1118 | ondisk = node->ondisk; | |
b3bad96f MD |
1119 | |
1120 | TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { | |
1121 | KKASSERT(cursor->node == node); | |
1122 | if (cursor->index == index) { | |
adf01747 | 1123 | cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT; |
c9ce54d6 MD |
1124 | if (cursor->leaf == &ondisk->elms[cursor->index].leaf) |
1125 | cursor->leaf = NULL; | |
b3bad96f | 1126 | } else if (cursor->index > index) { |
c9ce54d6 MD |
1127 | if (cursor->leaf == &ondisk->elms[cursor->index].leaf) |
1128 | cursor->leaf = &ondisk->elms[cursor->index - 1].leaf; | |
b3bad96f MD |
1129 | --cursor->index; |
1130 | } | |
1131 | } | |
1132 | } | |
1133 | ||
1134 | /* | |
e4a5ff06 | 1135 | * Inserted element at (node, index) |
b3bad96f MD |
1136 | * |
1137 | * Shift indexes >= index | |
1138 | */ | |
1139 | void | |
1140 | hammer_cursor_inserted_element(hammer_node_t node, int index) | |
1141 | { | |
1142 | hammer_cursor_t cursor; | |
c9ce54d6 MD |
1143 | hammer_node_ondisk_t ondisk; |
1144 | ||
1145 | ondisk = node->ondisk; | |
b3bad96f MD |
1146 | |
1147 | TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) { | |
1148 | KKASSERT(cursor->node == node); | |
c9ce54d6 MD |
1149 | if (cursor->index >= index) { |
1150 | if (cursor->leaf == &ondisk->elms[cursor->index].leaf) | |
1151 | cursor->leaf = &ondisk->elms[cursor->index + 1].leaf; | |
b3bad96f | 1152 | ++cursor->index; |
c9ce54d6 | 1153 | } |
b3bad96f MD |
1154 | } |
1155 | } | |
1156 | ||
b9107f58 MD |
1157 | /* |
1158 | * Invalidate the cached data buffer associated with a cursor. | |
1159 | * | |
1160 | * This needs to be done when the underlying block is being freed or | |
1161 | * the referenced buffer can prevent the related buffer cache buffer | |
1162 | * from being properly invalidated. | |
1163 | */ | |
1164 | void | |
1165 | hammer_cursor_invalidate_cache(hammer_cursor_t cursor) | |
1166 | { | |
1167 | if (cursor->data_buffer) { | |
1168 | hammer_rel_buffer(cursor->data_buffer, 0); | |
1169 | cursor->data_buffer = NULL; | |
1170 | cursor->data = NULL; | |
1171 | } | |
1172 | } | |
1173 |