hammer - Add tunable vfs.hammer.skip_redo
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
CommitLineData
4d75d829
MD
1/*
2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
c58123da
MD
33 */
34
35/*
36 * UNDO ALGORITHM:
37 *
38 * The UNDO algorithm is trivial. The nominal UNDO range in the
39 * FIFO is determined by taking the first/next offset stored in
40 * the volume header. The next offset may not be correct since
41 * UNDO flushes are not required to flush the volume header, so
42 * the code also scans forward until it finds a discontinuous
43 * sequence number.
44 *
45 * The UNDOs are then scanned and executed in reverse order. These
46 * UNDOs are effectively just data restorations based on HAMMER offsets.
47 *
48 * REDO ALGORITHM:
49 *
50 * REDO records are laid down in the UNDO/REDO FIFO for nominal
51 * writes, truncations, and file extension ops. On a per-inode
52 * basis two types of REDO records are generated, REDO_WRITE
53 * and REDO_TRUNC.
54 *
55 * Essentially the recovery block will contain UNDO records backing
56 * out partial operations and REDO records to regenerate those partial
57 * operations guaranteed by the filesystem during recovery.
58 *
59 * REDO generation is optional, and can also be started and then
60 * later stopped due to excessive write()s inbetween fsyncs, or not
61 * started at all. Because of this the recovery code must determine
62 * when REDOs are valid and when they are not. Additional records are
63 * generated to help figure it out.
64 *
65 * The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66 * during a flush cycle indicating which records the flush cycle
67 * has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68 * each flush cycle to indicate how far back in the UNDO/REDO FIFO
69 * the recovery code must go to find the earliest applicable REDO
70 * record. Applicable REDO records can be far outside the nominal
71 * UNDO recovery range, for example if a write() lays down a REDO but
72 * the related file is not flushed for several cycles.
73 *
74 * The SYNC reference is to a point prior to the nominal UNDO FIFO
75 * range, creating an extended REDO range which must be scanned.
76 *
77 * Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78 * which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79 * prior to the start of the nominal UNDO range are applicable.
80 * That is, any REDO_TERM_* records in the extended range but not in
81 * the nominal undo range will mask any redo operations for prior REDO
82 * records. This is necessary because once the TERM is laid down
83 * followup operations may make additional changes to the related
84 * records but not necessarily record them as REDOs (because REDOs are
85 * optional).
86 *
87 * REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88 * must be ignored since they represent meta-data flushes which are
89 * undone by the UNDOs in that nominal UNDO range by the recovery
90 * code. Only REDO_TERM_* records in the extended range but not
91 * in the nominal undo range are applicable.
92 *
93 * The REDO_SYNC record itself always exists in the nominal UNDO range
94 * (this is how the extended range is determined). For recovery
95 * purposes the most recent REDO_SYNC record is always used if several
96 * are found.
97 *
98 * CRASHES DURING UNDO/REDO
99 *
100 * A crash during the UNDO phase requires no additional effort. The
101 * UNDOs will simply be re-run again. The state of the UNDO/REDO fifo
102 * remains unchanged and has no re-crash issues.
103 *
104 * A crash during the REDO phase is more complex because the REDOs
105 * run normal filesystem ops and generate additional UNDO/REDO records.
106 * REDO is disabled during REDO recovery and any SYNC records generated
107 * by flushes during REDO recovery must continue to reference the
108 * original extended range.
109 *
110 * If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111 * may become impossible. This is detected when the start of the
112 * extended range fails to have monotonically increasing sequence
113 * numbers leading into the nominal undo range.
4d75d829
MD
114 */
115
116#include "hammer.h"
117
c58123da 118/*
dbd4f600
AHJ
119 * Specify the way we want to handle stage2 errors.
120 *
121 * Following values are accepted:
122 *
123 * 0 - Run redo recovery normally and fail to mount if
124 * the operation fails (default).
125 * 1 - Run redo recovery, but don't fail to mount if the
126 * operation fails.
127 * 2 - Completely skip redo recovery (only for severe error
128 * conditions and/or debugging.
129 */
130int hammer_skip_redo = 0;
131TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
132
133/*
c58123da
MD
134 * Each rterm entry has a list of fifo offsets indicating termination
135 * points. These are stripped as the scan progresses.
136 */
137typedef struct hammer_rterm_entry {
138 struct hammer_rterm_entry *next;
139 hammer_off_t fifo_offset;
140} *hammer_rterm_entry_t;
141
142/*
143 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
144 * TRUNC entries ignore the offset.
145 */
146typedef struct hammer_rterm {
147 RB_ENTRY(hammer_rterm) rb_node;
148 int64_t redo_objid;
149 u_int32_t redo_localization;
150 u_int32_t redo_flags;
151 hammer_off_t redo_offset;
152 hammer_rterm_entry_t term_list;
153} *hammer_rterm_t;
154
155static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
156struct hammer_rterm_rb_tree;
157RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
158RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
159
f90dde4c
MD
160static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
161 hammer_off_t end_off);
02428fb6
MD
162static int hammer_check_head_signature(hammer_fifo_head_t head,
163 hammer_off_t beg_off);
f90dde4c
MD
164static void hammer_recover_copy_undo(hammer_off_t undo_offset,
165 char *src, char *dst, int bytes);
02428fb6
MD
166static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
167 hammer_volume_t root_volume,
168 hammer_off_t *scan_offsetp,
169 int *errorp, struct hammer_buffer **bufferp);
170static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
171 hammer_volume_t root_volume,
172 hammer_off_t *scan_offsetp,
173 int *errorp, struct hammer_buffer **bufferp);
d36ec43b 174#if 0
f90dde4c 175static void hammer_recover_debug_dump(int w, char *buf, int bytes);
d36ec43b 176#endif
51c35492 177static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
02428fb6 178 hammer_fifo_undo_t undo);
c58123da
MD
179static int hammer_recover_redo_rec(hammer_mount_t hmp,
180 struct hammer_rterm_rb_tree *root,
181 hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
182static int hammer_recover_redo_run(hammer_mount_t hmp,
183 struct hammer_rterm_rb_tree *root,
184 hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
185static void hammer_recover_redo_exec(hammer_mount_t hmp,
186 hammer_fifo_redo_t redo);
187
188RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
4d75d829
MD
189
190/*
02428fb6
MD
191 * Recover filesystem meta-data on mount. This procedure figures out the
192 * UNDO FIFO range and runs the UNDOs backwards. The FIFO pointers are not
193 * resynchronized by this procedure.
194 *
195 * This procedure is run near the beginning of the mount sequence, before
196 * any B-Tree or high-level accesses are enabled, and is responsible for
197 * restoring the meta-data to a consistent state. High level HAMMER data
198 * structures (such as the B-Tree) cannot be accessed here.
0729c8c8
MD
199 *
200 * NOTE: No information from the root volume has been cached in the
02428fb6
MD
201 * hammer_mount structure yet, so we need to access the root volume's
202 * buffer directly.
203 *
204 * NOTE:
4d75d829
MD
205 */
206int
02428fb6 207hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
4d75d829 208{
f90dde4c
MD
209 hammer_blockmap_t rootmap;
210 hammer_buffer_t buffer;
211 hammer_off_t scan_offset;
02428fb6 212 hammer_off_t scan_offset_save;
f90dde4c 213 hammer_off_t bytes;
02428fb6 214 hammer_fifo_any_t head;
9f5097dc
MD
215 hammer_off_t first_offset;
216 hammer_off_t last_offset;
02428fb6 217 u_int32_t seqno;
f90dde4c 218 int error;
2dd2e007 219 int degenerate_case = 0;
b33e2cc0
MD
220
221 /*
02428fb6 222 * Examine the UNDO FIFO indices in the volume header.
4d75d829 223 */
f90dde4c 224 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
9f5097dc
MD
225 first_offset = rootmap->first_offset;
226 last_offset = rootmap->next_offset;
02428fb6
MD
227 buffer = NULL;
228 error = 0;
229
c58123da
MD
230 hmp->recover_stage2_offset = 0;
231
02428fb6
MD
232 if (first_offset > rootmap->alloc_offset ||
233 last_offset > rootmap->alloc_offset) {
234 kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
235 "%016jx, %016jx limit %016jx\n",
236 root_volume->ondisk->vol_name,
237 (intmax_t)first_offset,
238 (intmax_t)last_offset,
239 (intmax_t)rootmap->alloc_offset);
240 error = EIO;
241 goto done;
242 }
243
244 /*
245 * In HAMMER version 4+ filesystems the volume header does NOT
246 * contain definitive UNDO FIFO state. In particular, the
247 * rootmap->next_offset may not be indexed completely to the
248 * end of the active UNDO FIFO.
249 */
250 if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
251 /*
252 * To find the definitive range we must first scan backwards
253 * from first_offset to locate the first real record and
254 * extract the sequence number from it. This record is not
255 * part of the active undo space.
256 */
257 scan_offset = first_offset;
258 seqno = 0;
259
260 for (;;) {
261 head = hammer_recover_scan_rev(hmp, root_volume,
262 &scan_offset,
263 &error, &buffer);
264 if (error)
265 break;
266 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
267 seqno = head->head.hdr_seq;
268 break;
269 }
270 }
271 if (error) {
2dd2e007 272 kprintf("HAMMER(%s) recovery failure "
02428fb6
MD
273 "during seqno backscan\n",
274 root_volume->ondisk->vol_name);
275 goto done;
276 }
277
278 /*
279 * Scan forwards from first_offset and (seqno+1) looking
280 * for a sequence space discontinuity. This denotes the
281 * end of the active FIFO area.
282 *
283 * NOTE: For the case where the FIFO is empty the very first
284 * record we find will be discontinuous.
285 *
286 * NOTE: Do not include trailing PADs in the scan range,
287 * and remember the returned scan_offset after a
288 * fwd iteration points to the end of the returned
289 * record.
290 */
2dd2e007 291 kprintf("HAMMER(%s) recovery check seqno=%08x\n",
02428fb6
MD
292 root_volume->ondisk->vol_name,
293 seqno);
294
295 scan_offset = first_offset;
296 scan_offset_save = scan_offset;
297 ++seqno;
c58123da
MD
298 hmp->recover_stage2_seqno = seqno;
299
02428fb6
MD
300 for (;;) {
301 head = hammer_recover_scan_fwd(hmp, root_volume,
302 &scan_offset,
303 &error, &buffer);
304 if (error)
305 break;
306 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
307 if (seqno != head->head.hdr_seq) {
308 scan_offset = scan_offset_save;
309 break;
310 }
311 scan_offset_save = scan_offset;
312 ++seqno;
313 }
314
315#if 0
316 /*
317 * If the forward scan is grossly ahead of last_offset
318 * then something is wrong. last_offset is supposed
319 * to be flushed out
320 */
321 if (last_offset >= scan_offset) {
322 bytes = last_offset - scan_offset;
323 } else {
324 bytes = rootmap->alloc_offset - scan_offset +
325 (last_offset & HAMMER_OFF_LONG_MASK);
326 }
327 if (bytes >
328 (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
329 4 / 5) {
2dd2e007 330 kprintf("HAMMER(%s) recovery forward scan is "
02428fb6
MD
331 "grossly beyond the last_offset in "
332 "the volume header, this can't be "
333 "right.\n",
334 root_volume->ondisk->vol_name);
335 error = EIO;
336 break;
337 }
338#endif
339 }
9f5097dc 340
02428fb6
MD
341 /*
342 * Store the seqno. This will be the next seqno we lay down
343 * when generating new UNDOs.
344 */
345 hmp->undo_seqno = seqno;
346 if (error) {
2dd2e007 347 kprintf("HAMMER(%s) recovery failure "
02428fb6
MD
348 "during seqno fwdscan\n",
349 root_volume->ondisk->vol_name);
350 goto done;
351 }
352 last_offset = scan_offset;
2dd2e007
MD
353 kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
354 "HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
02428fb6
MD
355 root_volume->ondisk->vol_name,
356 (intmax_t)first_offset,
357 (intmax_t)last_offset,
2dd2e007 358 root_volume->ondisk->vol_name,
02428fb6
MD
359 (intmax_t)rootmap->next_offset,
360 seqno);
361 }
362
363 /*
364 * Calculate the size of the active portion of the FIFO. If the
365 * FIFO is empty the filesystem is clean and no further action is
366 * needed.
367 */
9f5097dc
MD
368 if (last_offset >= first_offset) {
369 bytes = last_offset - first_offset;
c9b9e29d 370 } else {
9f5097dc
MD
371 bytes = rootmap->alloc_offset - first_offset +
372 (last_offset & HAMMER_OFF_LONG_MASK);
c9b9e29d 373 }
02428fb6 374 if (bytes == 0) {
2dd2e007 375 degenerate_case = 1;
02428fb6
MD
376 error = 0;
377 goto done;
378 }
379
c58123da 380 kprintf("HAMMER(%s) recovery undo %016jx-%016jx (%jd bytes)%s\n",
09ac686b 381 root_volume->ondisk->vol_name,
02428fb6
MD
382 (intmax_t)first_offset,
383 (intmax_t)last_offset,
384 (intmax_t)bytes,
51c35492 385 (hmp->ronly ? " (RO)" : "(RW)"));
c9b9e29d
MD
386 if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
387 kprintf("Undo size is absurd, unable to mount\n");
02428fb6
MD
388 error = EIO;
389 goto done;
c9b9e29d 390 }
4d75d829
MD
391
392 /*
f90dde4c 393 * Scan the UNDOs backwards.
4d75d829 394 */
9f5097dc 395 scan_offset = last_offset;
4d75d829 396
f90dde4c 397 while ((int64_t)bytes > 0) {
02428fb6
MD
398 KKASSERT(scan_offset != first_offset);
399 head = hammer_recover_scan_rev(hmp, root_volume,
400 &scan_offset, &error, &buffer);
401 if (error)
f90dde4c 402 break;
c58123da
MD
403
404 /*
405 * Normal UNDO
406 */
02428fb6 407 error = hammer_recover_undo(hmp, root_volume, &head->undo);
f90dde4c 408 if (error) {
02428fb6 409 kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
f90dde4c 410 root_volume->ondisk->vol_name,
02428fb6 411 (intmax_t)scan_offset - head->head.hdr_size);
b33e2cc0 412 break;
f90dde4c 413 }
c58123da
MD
414
415 /*
416 * The first REDO_SYNC record encountered (scanning backwards)
417 * enables REDO processing.
418 */
419 if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
420 head->redo.redo_flags == HAMMER_REDO_SYNC) {
421 if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
422 kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
423 "records in UNDO/REDO FIFO.\n",
424 root_volume->ondisk->vol_name
425 );
426 } else {
427 hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
428 hmp->recover_stage2_offset =
429 head->redo.redo_offset;
430 kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
431 root_volume->ondisk->vol_name,
432 (intmax_t)head->redo.redo_offset);
433 }
434 }
435
02428fb6 436 bytes -= head->head.hdr_size;
06ad81ff
MD
437
438 /*
312de84d
MD
439 * If too many dirty buffers have built up we have to flush'm
440 * out. As long as we do not flush out the volume header
441 * a crash here should not cause any problems.
442 *
443 * buffer must be released so the flush can assert that
444 * all buffers are idle.
06ad81ff
MD
445 */
446 if (hammer_flusher_meta_limit(hmp)) {
312de84d
MD
447 if (buffer) {
448 hammer_rel_buffer(buffer, 0);
449 buffer = NULL;
450 }
06ad81ff
MD
451 if (hmp->ronly == 0) {
452 hammer_recover_flush_buffers(hmp, root_volume,
453 0);
454 kprintf("HAMMER(%s) Continuing recovery\n",
455 root_volume->ondisk->vol_name);
00f16fad 456 } else {
06ad81ff
MD
457 kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
458 root_volume->ondisk->vol_name);
00f16fad
MD
459 error = EIO;
460 break;
06ad81ff
MD
461 }
462 }
4d75d829 463 }
c58123da 464 KKASSERT(error || bytes == 0);
c9b9e29d 465done:
02428fb6 466 if (buffer) {
f90dde4c 467 hammer_rel_buffer(buffer, 0);
02428fb6
MD
468 buffer = NULL;
469 }
51c35492
MD
470
471 /*
9f5097dc 472 * After completely flushing all the recovered buffers the volume
02428fb6 473 * header will also be flushed.
51c35492 474 */
9f5097dc
MD
475 if (root_volume->io.recovered == 0) {
476 hammer_ref_volume(root_volume);
477 root_volume->io.recovered = 1;
51c35492 478 }
9f5097dc
MD
479
480 /*
02428fb6
MD
481 * Finish up flushing (or discarding) recovered buffers. FIFO
482 * indices in the volume header are updated to the actual undo
483 * range but will not be collapsed until stage 2.
9f5097dc 484 */
00f16fad
MD
485 if (error == 0) {
486 hammer_modify_volume(NULL, root_volume, NULL, 0);
487 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
02428fb6 488 rootmap->first_offset = first_offset;
00f16fad
MD
489 rootmap->next_offset = last_offset;
490 hammer_modify_volume_done(root_volume);
491 if (hmp->ronly == 0)
492 hammer_recover_flush_buffers(hmp, root_volume, 1);
493 } else {
494 hammer_recover_flush_buffers(hmp, root_volume, -1);
495 }
2dd2e007
MD
496 if (degenerate_case == 0) {
497 kprintf("HAMMER(%s) recovery complete\n",
498 root_volume->ondisk->vol_name);
499 } else {
500 kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
501 root_volume->ondisk->vol_name);
502 }
f90dde4c 503 return (error);
4d75d829
MD
504}
505
02428fb6
MD
506/*
507 * Execute redo operations
508 *
509 * This procedure is run at the end of the mount sequence, after the hammer
510 * mount structure has been completely initialized but before the filesystem
511 * goes live. It can access standard cursors, the B-Tree, flush the
512 * filesystem, and so forth.
513 *
514 * This code may only be called for read-write mounts or when a mount
2dd2e007 515 * switches from read-only to read-write. vnodes may or may not be present.
02428fb6
MD
516 *
517 * The stage1 code will have already calculated the correct FIFO range
c58123da
MD
518 * for the nominal UNDO FIFO and stored it in the rootmap. The extended
519 * range for REDO is stored in hmp->recover_stage2_offset.
02428fb6
MD
520 */
521int
522hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
4d75d829 523{
02428fb6
MD
524 hammer_blockmap_t rootmap;
525 hammer_buffer_t buffer;
526 hammer_off_t scan_offset;
c58123da 527 hammer_off_t oscan_offset;
02428fb6 528 hammer_off_t bytes;
c58123da 529 hammer_off_t ext_bytes;
02428fb6
MD
530 hammer_fifo_any_t head;
531 hammer_off_t first_offset;
532 hammer_off_t last_offset;
c58123da
MD
533 hammer_off_t ext_offset;
534 struct hammer_rterm_rb_tree rterm_root;
535 u_int32_t seqno;
02428fb6 536 int error;
c58123da
MD
537 int verbose = 0;
538 int dorscan;
02428fb6
MD
539
540 /*
541 * Stage 2 can only be run on a RW mount, or when the mount is
c58123da 542 * switched from RO to RW.
02428fb6
MD
543 */
544 KKASSERT(hmp->ronly == 0);
c58123da 545 RB_INIT(&rterm_root);
b33e2cc0 546
dbd4f600
AHJ
547 if (hammer_skip_redo == 1)
548 kprintf("HAMMER(%s) recovery redo marked as optional\n",
549 root_volume->ondisk->vol_name);
550
551 if (hammer_skip_redo == 2) {
552 kprintf("HAMMER(%s) recovery redo skipped.\n",
553 root_volume->ondisk->vol_name);
554 return (0);
555 }
556
b33e2cc0 557 /*
02428fb6
MD
558 * Examine the UNDO FIFO. If it is empty the filesystem is clean
559 * and no action need be taken.
b33e2cc0 560 */
02428fb6
MD
561 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
562 first_offset = rootmap->first_offset;
563 last_offset = rootmap->next_offset;
c58123da
MD
564 if (first_offset == last_offset) {
565 KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
02428fb6 566 return(0);
c58123da 567 }
02428fb6 568
c58123da
MD
569 /*
570 * Stage2 must only be run once, and will not be run at all
571 * if Stage1 did not find a REDO_SYNC record.
572 */
573 error = 0;
574 buffer = NULL;
575
576 if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
577 goto done;
578 hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
579 hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
580 ext_offset = hmp->recover_stage2_offset;
581 if (ext_offset == 0) {
582 kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
583 "offset, ignoring\n",
584 root_volume->ondisk->vol_name);
585 goto done;
586 }
587
588 /*
589 * Calculate nominal UNDO range (this is not yet the extended
590 * range).
591 */
02428fb6
MD
592 if (last_offset >= first_offset) {
593 bytes = last_offset - first_offset;
594 } else {
595 bytes = rootmap->alloc_offset - first_offset +
596 (last_offset & HAMMER_OFF_LONG_MASK);
597 }
c58123da 598 kprintf("HAMMER(%s) recovery redo %016jx-%016jx (%jd bytes)%s\n",
02428fb6
MD
599 root_volume->ondisk->vol_name,
600 (intmax_t)first_offset,
601 (intmax_t)last_offset,
602 (intmax_t)bytes,
603 (hmp->ronly ? " (RO)" : "(RW)"));
c58123da 604 verbose = 1;
02428fb6
MD
605 if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
606 kprintf("Undo size is absurd, unable to mount\n");
c58123da
MD
607 error = EIO;
608 goto fatal;
b33e2cc0
MD
609 }
610
b33e2cc0 611 /*
c58123da
MD
612 * Scan the REDOs backwards collecting REDO_TERM_* information.
613 * This information is only collected for the extended range,
614 * non-inclusive of any TERMs in the nominal UNDO range.
615 *
616 * If the stage2 extended range is inside the nominal undo range
617 * we have nothing to scan.
618 *
619 * This must fit in memory!
b33e2cc0 620 */
c58123da
MD
621 if (first_offset < last_offset) {
622 /*
623 * [ first_offset........last_offset ]
624 */
625 if (ext_offset < first_offset) {
626 dorscan = 1;
627 ext_bytes = first_offset - ext_offset;
628 } else if (ext_offset > last_offset) {
629 dorscan = 1;
630 ext_bytes = (rootmap->alloc_offset - ext_offset) +
631 (first_offset & HAMMER_OFF_LONG_MASK);
632 } else {
633 ext_bytes = -(ext_offset - first_offset);
634 dorscan = 0;
635 }
636 } else {
637 /*
638 * [......last_offset first_offset.....]
639 */
640 if (ext_offset < last_offset) {
641 ext_bytes = -((rootmap->alloc_offset - first_offset) +
642 (ext_offset & HAMMER_OFF_LONG_MASK));
643 dorscan = 0;
644 } else if (ext_offset > first_offset) {
645 ext_bytes = -(ext_offset - first_offset);
646 dorscan = 0;
647 } else {
648 ext_bytes = first_offset - ext_offset;
649 dorscan = 1;
650 }
651 }
02428fb6 652
c58123da
MD
653 if (dorscan) {
654 scan_offset = first_offset;
655 kprintf("HAMMER(%s) Find extended redo %016jx, %jd extbytes\n",
656 root_volume->ondisk->vol_name,
657 (intmax_t)ext_offset,
658 (intmax_t)ext_bytes);
659 seqno = hmp->recover_stage2_seqno - 1;
660 for (;;) {
661 head = hammer_recover_scan_rev(hmp, root_volume,
662 &scan_offset,
663 &error, &buffer);
664 if (error)
665 break;
666 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
667 if (head->head.hdr_seq != seqno) {
668 error = ERANGE;
669 break;
670 }
671 error = hammer_recover_redo_rec(
672 hmp, &rterm_root,
673 scan_offset, &head->redo);
674 --seqno;
675 }
676 if (scan_offset == ext_offset)
677 break;
678 }
679 if (error) {
680 kprintf("HAMMER(%s) Find extended redo failed %d, "
681 "unable to run REDO\n",
682 root_volume->ondisk->vol_name,
683 error);
684 goto done;
685 }
686 } else {
7750bdfa 687 kprintf("HAMMER(%s) Embedded extended redo %016jx, "
c58123da
MD
688 "%jd extbytes\n",
689 root_volume->ondisk->vol_name,
690 (intmax_t)ext_offset,
691 (intmax_t)ext_bytes);
692 }
693
694 /*
695 * Scan the REDO forwards through the entire extended range.
696 * Anything with a previously recorded matching TERM is discarded.
697 */
698 scan_offset = ext_offset;
699 bytes += ext_bytes;
700
701 /*
702 * NOTE: when doing a forward scan the returned scan_offset is
703 * for the record following the returned record, so we
704 * have to play a bit.
705 */
706 while ((int64_t)bytes > 0) {
02428fb6
MD
707 KKASSERT(scan_offset != last_offset);
708
c58123da 709 oscan_offset = scan_offset;
02428fb6
MD
710 head = hammer_recover_scan_fwd(hmp, root_volume,
711 &scan_offset, &error, &buffer);
712 if (error)
713 break;
714
c58123da
MD
715 error = hammer_recover_redo_run(hmp, &rterm_root,
716 oscan_offset, &head->redo);
02428fb6
MD
717 if (error) {
718 kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
719 root_volume->ondisk->vol_name,
720 (intmax_t)scan_offset - head->head.hdr_size);
721 break;
f90dde4c 722 }
02428fb6
MD
723 bytes -= head->head.hdr_size;
724 }
c58123da 725 KKASSERT(error || bytes == 0);
86327cc9
MD
726
727done:
02428fb6
MD
728 if (buffer) {
729 hammer_rel_buffer(buffer, 0);
730 buffer = NULL;
f90dde4c 731 }
4d75d829 732
c58123da
MD
733 /*
734 * Cleanup rterm tree
735 */
736 {
737 hammer_rterm_t rterm;
738 hammer_rterm_entry_t rte;
739
740 while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
741 RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
742 while ((rte = rterm->term_list) != NULL) {
743 rterm->term_list = rte->next;
744 kfree(rte, hmp->m_misc);
745 }
746 kfree(rterm, hmp->m_misc);
747 }
748 }
749
4d75d829 750 /*
02428fb6
MD
751 * Finish up flushing (or discarding) recovered buffers by executing
752 * a normal flush cycle. Setting HMNT_UNDO_DIRTY bypasses degenerate
753 * case tests and forces the flush in order to update the FIFO indices.
754 *
755 * If a crash occurs during the flush the entire undo/redo will be
756 * re-run during recovery on the next mount.
4d75d829 757 */
02428fb6
MD
758 if (error == 0) {
759 if (rootmap->first_offset != rootmap->next_offset)
760 hmp->hflags |= HMNT_UNDO_DIRTY;
761 hammer_flusher_sync(hmp);
4d75d829 762 }
c58123da
MD
763fatal:
764 hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
765 if (verbose) {
766 kprintf("HAMMER(%s) End redo recovery\n",
767 root_volume->ondisk->vol_name);
768 }
dbd4f600
AHJ
769
770 if (error && hammer_skip_redo == 1)
771 kprintf("HAMMER(%s) recovery redo error %d, "
772 " skipping.\n", root_volume->ondisk->vol_name,
773 error);
774
775 return (hammer_skip_redo ? 0 : error);
4d75d829
MD
776}
777
02428fb6
MD
778/*
779 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
780 * record at *scan_offsetp or NULL if an error occured.
781 *
782 * On return *scan_offsetp will be the offset of the returned record.
783 */
784hammer_fifo_any_t
785hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
786 hammer_off_t *scan_offsetp,
787 int *errorp, struct hammer_buffer **bufferp)
4d75d829 788{
02428fb6
MD
789 hammer_off_t scan_offset;
790 hammer_blockmap_t rootmap;
791 hammer_fifo_any_t head;
f90dde4c 792 hammer_fifo_tail_t tail;
4d75d829 793
02428fb6
MD
794 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
795 scan_offset = *scan_offsetp;
796
797 if (hammer_debug_general & 0x0080)
798 kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
799 if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
800 scan_offset = rootmap->alloc_offset;
801 if (scan_offset - sizeof(*tail) <
802 HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
803 kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
804 root_volume->ondisk->vol_name,
805 (intmax_t)scan_offset);
806 *errorp = EIO;
807 return (NULL);
9944ae54 808 }
02428fb6
MD
809 tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
810 errorp, bufferp);
811 if (*errorp) {
812 kprintf("HAMMER(%s) Unable to read UNDO TAIL "
813 "at %016jx\n",
814 root_volume->ondisk->vol_name,
815 (intmax_t)scan_offset - sizeof(*tail));
816 return (NULL);
f90dde4c 817 }
02428fb6
MD
818
819 if (hammer_check_tail_signature(tail, scan_offset) != 0) {
820 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
821 "at %016jx\n",
822 root_volume->ondisk->vol_name,
823 (intmax_t)scan_offset - sizeof(*tail));
824 *errorp = EIO;
825 return (NULL);
4d75d829 826 }
02428fb6
MD
827 head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
828 *scan_offsetp = scan_offset - head->head.hdr_size;
829
830 return (head);
831}
832
833/*
834 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
835 * an error occured.
836 *
837 * On return *scan_offsetp will be the offset of the record following
838 * the returned record.
839 */
840hammer_fifo_any_t
841hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
842 hammer_off_t *scan_offsetp,
843 int *errorp, struct hammer_buffer **bufferp)
844{
845 hammer_off_t scan_offset;
846 hammer_blockmap_t rootmap;
847 hammer_fifo_any_t head;
848
849 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
850 scan_offset = *scan_offsetp;
851
852 if (hammer_debug_general & 0x0080)
853 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
854 if (scan_offset == rootmap->alloc_offset)
855 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
856
857 head = hammer_bread(hmp, scan_offset, errorp, bufferp);
858 if (*errorp) {
859 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
860 root_volume->ondisk->vol_name,
861 (intmax_t)scan_offset);
862 return (NULL);
863 }
864
865 if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
866 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
867 "at %016jx\n",
868 root_volume->ondisk->vol_name,
869 (intmax_t)scan_offset);
870 *errorp = EIO;
871 return (NULL);
872 }
873 scan_offset += head->head.hdr_size;
874 if (scan_offset == rootmap->alloc_offset)
875 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
876 *scan_offsetp = scan_offset;
877
878 return (head);
879}
880
881/*
882 * Helper function for hammer_check_{head,tail}_signature(). Check stuff
883 * once the head and tail has been established.
884 *
885 * This function validates the entire FIFO record wrapper.
886 */
887static __inline
888int
889_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
890 hammer_off_t beg_off)
891{
892 hammer_off_t end_off;
893 u_int32_t crc;
894 int bytes;
4d75d829
MD
895
896 /*
02428fb6
MD
897 * Check signatures. The tail signature is allowed to be the
898 * head signature only for 8-byte PADs.
4d75d829 899 */
02428fb6
MD
900 if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
901 kprintf("HAMMER: FIFO record bad head signature "
902 "%04x at %016jx\n",
903 head->hdr_signature,
904 (intmax_t)beg_off);
905 return(2);
906 }
907 if (head->hdr_size < HAMMER_HEAD_ALIGN ||
908 (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
909 kprintf("HAMMER: FIFO record unaligned or bad size"
910 "%04x at %016jx\n",
911 head->hdr_size,
912 (intmax_t)beg_off);
913 return(2);
914 }
915 end_off = beg_off + head->hdr_size;
916
917 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
918 (size_t)(end_off - beg_off) != sizeof(*tail)) {
919 if (head->hdr_type != tail->tail_type) {
920 kprintf("HAMMER: FIFO record head/tail type mismatch "
921 "%04x %04x at %016jx\n",
922 head->hdr_type, tail->tail_type,
923 (intmax_t)beg_off);
924 return(2);
925 }
926 if (head->hdr_size != tail->tail_size) {
927 kprintf("HAMMER: FIFO record head/tail size mismatch "
928 "%04x %04x at %016jx\n",
929 head->hdr_size, tail->tail_size,
930 (intmax_t)beg_off);
931 return(2);
932 }
933 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
934 kprintf("HAMMER: FIFO record bad tail signature "
935 "%04x at %016jx\n",
936 tail->tail_signature,
937 (intmax_t)beg_off);
938 return(3);
939 }
940 }
9944ae54
MD
941
942 /*
02428fb6
MD
943 * Non-PAD records must have a CRC and must be sized at
944 * least large enough to fit the head and tail.
09ac686b 945 */
02428fb6
MD
946 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
947 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
948 crc32(head + 1, head->hdr_size - sizeof(*head));
949 if (head->hdr_crc != crc) {
950 kprintf("HAMMER: FIFO record CRC failed %08x %08x "
951 "at %016jx\n",
952 head->hdr_crc, crc,
953 (intmax_t)beg_off);
954 return(EIO);
955 }
956 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
957 kprintf("HAMMER: FIFO record too small "
958 "%04x at %016jx\n",
959 head->hdr_size,
960 (intmax_t)beg_off);
961 return(EIO);
962 }
09ac686b
MD
963 }
964
09ac686b 965 /*
f90dde4c 966 * Check the tail
9944ae54 967 */
02428fb6
MD
968 bytes = head->hdr_size;
969 tail = (void *)((char *)head + bytes - sizeof(*tail));
970 if (tail->tail_size != head->hdr_size) {
971 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
972 tail->tail_size, head->hdr_size,
973 (intmax_t)beg_off);
f90dde4c
MD
974 return(EIO);
975 }
02428fb6
MD
976 if (tail->tail_type != head->hdr_type) {
977 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
978 tail->tail_type, head->hdr_type,
979 (intmax_t)beg_off);
f90dde4c 980 return(EIO);
4d75d829
MD
981 }
982
02428fb6
MD
983 return(0);
984}
985
986/*
987 * Check that the FIFO record is in-bounds given the head and the
988 * hammer offset.
989 *
990 * Also checks that the head and tail structures agree with each other,
991 * but does not check beyond the signature, type, and size.
992 */
993static int
994hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
995{
996 hammer_fifo_tail_t tail;
997 hammer_off_t end_off;
998
999 /*
1000 * head overlaps buffer boundary. This could be a PAD so only
1001 * check the minimum PAD size here.
1002 */
1003 if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
1004 return(1);
1005
1006 /*
1007 * Calculate the ending offset and make sure the record does
1008 * not cross a buffer boundary.
1009 */
1010 end_off = beg_off + head->hdr_size;
1011 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1012 return(1);
1013 tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
1014 return (_hammer_check_signature(head, tail, beg_off));
1015}
1016
1017/*
1018 * Check that the FIFO record is in-bounds given the tail and the
1019 * hammer offset. The offset is pointing at the ending boundary of the
1020 * record.
1021 *
1022 * Also checks that the head and tail structures agree with each other,
1023 * but does not check beyond the signature, type, and size.
1024 */
1025static int
1026hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
1027{
1028 hammer_fifo_head_t head;
1029 hammer_off_t beg_off;
1030
4d75d829 1031 /*
02428fb6
MD
1032 * tail overlaps buffer boundary
1033 */
1034 if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1035 return(1);
1036
1037 /*
1038 * Calculate the begining offset and make sure the record does
1039 * not cross a buffer boundary.
4d75d829 1040 */
02428fb6
MD
1041 beg_off = end_off - tail->tail_size;
1042 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1043 return(1);
1044 head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
1045 return (_hammer_check_signature(head, tail, beg_off));
1046}
1047
1048static int
1049hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
1050 hammer_fifo_undo_t undo)
1051{
1052 hammer_volume_t volume;
1053 hammer_buffer_t buffer;
1054 hammer_off_t buf_offset;
1055 int zone;
1056 int error;
1057 int vol_no;
1058 int bytes;
1059 u_int32_t offset;
1060
1061 /*
1062 * Only process UNDO records. Flag if we find other records to
1063 * optimize stage2 recovery.
1064 */
c58123da 1065 if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
f90dde4c 1066 return(0);
4d75d829
MD
1067
1068 /*
f90dde4c 1069 * Validate the UNDO record.
4d75d829 1070 */
02428fb6
MD
1071 bytes = undo->head.hdr_size - sizeof(*undo) -
1072 sizeof(struct hammer_fifo_tail);
1073 if (bytes < 0 || undo->undo_data_bytes < 0 ||
1074 undo->undo_data_bytes > bytes) {
f90dde4c 1075 kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
02428fb6 1076 undo->undo_data_bytes, bytes);
f90dde4c 1077 return(EIO);
4d75d829
MD
1078 }
1079
02428fb6
MD
1080 bytes = undo->undo_data_bytes;
1081
4d75d829 1082 /*
f90dde4c
MD
1083 * The undo offset may only be a zone-1 or zone-2 offset.
1084 *
1085 * Currently we only support a zone-1 offset representing the
1086 * volume header.
4d75d829 1087 */
f90dde4c
MD
1088 zone = HAMMER_ZONE_DECODE(undo->undo_offset);
1089 offset = undo->undo_offset & HAMMER_BUFMASK;
4d75d829 1090
02428fb6 1091 if (offset + bytes > HAMMER_BUFSIZE) {
f90dde4c
MD
1092 kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
1093 return (EIO);
1094 }
4d75d829 1095
f90dde4c
MD
1096 switch(zone) {
1097 case HAMMER_ZONE_RAW_VOLUME_INDEX:
1098 vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
1099 volume = hammer_get_volume(hmp, vol_no, &error);
1100 if (volume == NULL) {
1101 kprintf("HAMMER: UNDO record, "
1102 "cannot access volume %d\n", vol_no);
1103 break;
4d75d829 1104 }
f90dde4c
MD
1105 hammer_modify_volume(NULL, volume, NULL, 0);
1106 hammer_recover_copy_undo(undo->undo_offset,
1107 (char *)(undo + 1),
1108 (char *)volume->ondisk + offset,
02428fb6 1109 bytes);
f90dde4c 1110 hammer_modify_volume_done(volume);
51c35492
MD
1111
1112 /*
9f5097dc
MD
1113 * Multiple modifications may be made to the same buffer.
1114 * Also, the volume header cannot be written out until
1115 * everything else has been flushed. This also
51c35492
MD
1116 * covers the read-only case by preventing the kernel from
1117 * flushing the buffer.
1118 */
1119 if (volume->io.recovered == 0)
1120 volume->io.recovered = 1;
1121 else
1122 hammer_rel_volume(volume, 0);
f90dde4c
MD
1123 break;
1124 case HAMMER_ZONE_RAW_BUFFER_INDEX:
2f85fa4d 1125 buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
4a2796f3
MD
1126 buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
1127 0, &error);
f90dde4c
MD
1128 if (buffer == NULL) {
1129 kprintf("HAMMER: UNDO record, "
02428fb6
MD
1130 "cannot access buffer %016jx\n",
1131 (intmax_t)undo->undo_offset);
f90dde4c 1132 break;
4d75d829 1133 }
f90dde4c
MD
1134 hammer_modify_buffer(NULL, buffer, NULL, 0);
1135 hammer_recover_copy_undo(undo->undo_offset,
1136 (char *)(undo + 1),
1137 (char *)buffer->ondisk + offset,
02428fb6 1138 bytes);
f90dde4c 1139 hammer_modify_buffer_done(buffer);
51c35492
MD
1140
1141 /*
1142 * Multiple modifications may be made to the same buffer,
1143 * improve performance by delaying the flush. This also
1144 * covers the read-only case by preventing the kernel from
1145 * flushing the buffer.
1146 */
1147 if (buffer->io.recovered == 0)
1148 buffer->io.recovered = 1;
1149 else
1150 hammer_rel_buffer(buffer, 0);
f90dde4c
MD
1151 break;
1152 default:
1153 kprintf("HAMMER: Corrupt UNDO record\n");
1154 error = EIO;
4d75d829 1155 }
f90dde4c 1156 return (error);
4d75d829
MD
1157}
1158
f90dde4c
MD
1159static void
1160hammer_recover_copy_undo(hammer_off_t undo_offset,
1161 char *src, char *dst, int bytes)
4d75d829 1162{
973c11b9 1163 if (hammer_debug_general & 0x0080) {
02428fb6
MD
1164 kprintf("UNDO %016jx: %d\n",
1165 (intmax_t)undo_offset, bytes);
973c11b9 1166 }
ec4e8497 1167#if 0
02428fb6 1168 kprintf("UNDO %016jx:", (intmax_t)undo_offset);
f90dde4c
MD
1169 hammer_recover_debug_dump(22, dst, bytes);
1170 kprintf("%22s", "to:");
1171 hammer_recover_debug_dump(22, src, bytes);
ec4e8497 1172#endif
f90dde4c 1173 bcopy(src, dst, bytes);
4d75d829
MD
1174}
1175
c58123da
MD
1176/*
1177 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1178 * during the backwards scan of the extended UNDO/REDO FIFO. This scan
1179 * does not include the nominal UNDO range, just the extended range.
1180 */
1181int
1182hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1183 hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1184{
1185 hammer_rterm_t rterm;
1186 hammer_rterm_t nrterm;
1187 hammer_rterm_entry_t rte;
1188
1189 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1190 return(0);
1191 if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1192 redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1193 return(0);
1194 }
1195
1196 nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1197 nrterm->redo_objid = redo->redo_objid;
1198 nrterm->redo_localization = redo->redo_localization;
1199 nrterm->redo_flags = redo->redo_flags;
1200 nrterm->redo_offset = redo->redo_offset;
1201
1202 rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1203 if (rterm)
1204 kfree(nrterm, hmp->m_misc);
1205 else
1206 rterm = nrterm;
1207
fad4297b
MD
1208 if (bootverbose) {
1209 kprintf("record record %016jx objid %016jx "
1210 "offset %016jx flags %08x\n",
1211 (intmax_t)scan_offset,
1212 (intmax_t)redo->redo_objid,
1213 (intmax_t)redo->redo_offset,
1214 (int)redo->redo_flags);
1215 }
c58123da
MD
1216
1217 /*
1218 * Scan in reverse order, rte prepended, so the rte list will be
1219 * in forward order.
1220 */
1221 rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1222 rte->fifo_offset = scan_offset;
1223 rte->next = rterm->term_list;
1224 rterm->term_list = rte;
1225
1226 return(0);
1227}
1228
1229/*
1230 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1231 * the forwards scan of the entire extended UNDO/REDO FIFO range.
1232 *
1233 * Records matching previously recorded TERMs have already been committed
1234 * and are ignored.
1235 */
1236int
1237hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1238 hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1239{
1240 struct hammer_rterm rtval;
1241 hammer_rterm_t rterm;
1242 hammer_rterm_entry_t rte;
1243
1244 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1245 return(0);
1246
1247 switch(redo->redo_flags) {
1248 case HAMMER_REDO_WRITE:
1249 case HAMMER_REDO_TRUNC:
1250 /*
1251 * We hit a REDO request. The REDO request is only executed
1252 * if there is no matching TERM.
1253 */
1254 bzero(&rtval, sizeof(rtval));
1255 rtval.redo_objid = redo->redo_objid;
1256 rtval.redo_localization = redo->redo_localization;
1257 rtval.redo_offset = redo->redo_offset;
1258 rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
1259 HAMMER_REDO_TERM_WRITE :
1260 HAMMER_REDO_TERM_TRUNC;
1261
1262 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1263 if (rterm) {
418cb5e5
MD
1264 if (bootverbose) {
1265 kprintf("ignore record %016jx objid %016jx "
1266 "offset %016jx flags %08x\n",
1267 (intmax_t)scan_offset,
1268 (intmax_t)redo->redo_objid,
1269 (intmax_t)redo->redo_offset,
1270 (int)redo->redo_flags);
1271 }
1272 break;
1273 }
1274 if (bootverbose) {
1275 kprintf("run record %016jx objid %016jx "
c58123da
MD
1276 "offset %016jx flags %08x\n",
1277 (intmax_t)scan_offset,
1278 (intmax_t)redo->redo_objid,
1279 (intmax_t)redo->redo_offset,
1280 (int)redo->redo_flags);
c58123da 1281 }
c58123da
MD
1282
1283 /*
1284 * Redo stage2 can access a live filesystem, acquire the
1285 * vnode.
1286 */
1287 hammer_recover_redo_exec(hmp, redo);
1288 break;
1289 case HAMMER_REDO_TERM_WRITE:
1290 case HAMMER_REDO_TERM_TRUNC:
1291 /*
1292 * As we encounter TERMs in the forward scan we remove
1293 * them. Once the forward scan hits the nominal undo range
1294 * there will be no more recorded TERMs.
1295 */
1296 bzero(&rtval, sizeof(rtval));
1297 rtval.redo_objid = redo->redo_objid;
1298 rtval.redo_localization = redo->redo_localization;
1299 rtval.redo_flags = redo->redo_flags;
1300 rtval.redo_offset = redo->redo_offset;
1301
1302 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1303 if (rterm) {
1304 if ((rte = rterm->term_list) != NULL) {
1305 KKASSERT(rte->fifo_offset == scan_offset);
1306 rterm->term_list = rte->next;
1307 kfree(rte, hmp->m_misc);
1308 }
1309 }
1310 break;
1311 }
1312 return(0);
1313}
1314
1315static void
1316hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
1317{
1318 struct hammer_transaction trans;
1319 struct vattr va;
1320 struct hammer_inode *ip;
1321 struct vnode *vp = NULL;
1322 int error;
1323
1324 hammer_start_transaction(&trans, hmp);
1325
1326 ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
1327 HAMMER_MAX_TID, redo->redo_localization,
1328 0, &error);
1329 if (ip == NULL) {
418cb5e5 1330 kprintf("unable to find objid %016jx:%08x\n",
c58123da
MD
1331 (intmax_t)redo->redo_objid, redo->redo_localization);
1332 goto done2;
1333 }
1334 error = hammer_get_vnode(ip, &vp);
1335 if (error) {
418cb5e5 1336 kprintf("unable to acquire vnode for %016jx:%08x\n",
c58123da
MD
1337 (intmax_t)redo->redo_objid, redo->redo_localization);
1338 goto done1;
1339 }
1340
1341 switch(redo->redo_flags) {
1342 case HAMMER_REDO_WRITE:
1343 error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
1344 if (error) {
418cb5e5
MD
1345 kprintf("vn_rdwr open %016jx:%08x returned %d\n",
1346 (intmax_t)redo->redo_objid,
1347 redo->redo_localization, error);
c58123da
MD
1348 break;
1349 }
1350 vn_unlock(vp);
1351 error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
1352 redo->redo_data_bytes,
1353 redo->redo_offset, UIO_SYSSPACE,
1354 0, proc0.p_ucred, NULL);
1355 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
418cb5e5
MD
1356 if (error) {
1357 kprintf("write %016jx:%08x returned %d\n",
1358 (intmax_t)redo->redo_objid,
1359 redo->redo_localization, error);
1360 }
c58123da
MD
1361 VOP_CLOSE(vp, FREAD|FWRITE);
1362 break;
1363 case HAMMER_REDO_TRUNC:
c58123da
MD
1364 VATTR_NULL(&va);
1365 va.va_size = redo->redo_offset;
1366 error = VOP_SETATTR(vp, &va, proc0.p_ucred);
418cb5e5
MD
1367 if (error) {
1368 kprintf("setattr offset %016jx error %d\n",
1369 (intmax_t)redo->redo_offset, error);
1370 }
c58123da
MD
1371 break;
1372 }
1373 vput(vp);
1374done1:
1375 hammer_rel_inode(ip, 0);
1376done2:
1377 hammer_done_transaction(&trans);
1378}
1379
1380/*
1381 * RB tree compare function. Note that REDO_TERM_TRUNC ops ignore
1382 * the offset.
1383 *
1384 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
1385 */
1386static int
1387hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
1388{
1389 if (rt1->redo_objid < rt2->redo_objid)
1390 return(-1);
1391 if (rt1->redo_objid > rt2->redo_objid)
1392 return(1);
1393 if (rt1->redo_localization < rt2->redo_localization)
1394 return(-1);
1395 if (rt1->redo_localization > rt2->redo_localization)
1396 return(1);
1397 if (rt1->redo_flags < rt2->redo_flags)
1398 return(-1);
1399 if (rt1->redo_flags > rt2->redo_flags)
1400 return(1);
1401 if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1402 if (rt1->redo_offset < rt2->redo_offset)
1403 return(-1);
1404 if (rt1->redo_offset > rt2->redo_offset)
1405 return(1);
1406 }
1407 return(0);
1408}
1409
d36ec43b
MD
1410#if 0
1411
f90dde4c
MD
/*
 * Debug helper (currently compiled out): hex-dump `bytes` bytes of buf,
 * 16 per line, with continuation lines indented `w` columns.
 */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int n;

	for (n = 0; n < bytes; ++n) {
		if (n && (n & 15) == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[n]);
	}
	kprintf("\n");
}
1424
d36ec43b 1425#endif
51c35492
MD
1426
1427/*
9f5097dc
MD
1428 * Flush recovered buffers from recovery operations. The call to this
1429 * routine may be delayed if a read-only mount was made and then later
2faf0737
MD
1430 * upgraded to read-write. This routine is also called when unmounting
1431 * a read-only mount to clean out recovered (dirty) buffers which we
1432 * couldn't flush (because the mount is read-only).
9f5097dc
MD
1433 *
1434 * The volume header is always written last. The UNDO FIFO will be forced
1435 * to zero-length by setting next_offset to first_offset. This leaves the
1436 * (now stale) UNDO information used to recover the disk available for
1437 * forensic analysis.
00f16fad
MD
1438 *
1439 * final is typically 0 or 1. The volume header is only written if final
1440 * is 1. If final is -1 the recovered buffers are discarded instead of
1441 * written and root_volume can also be passed as NULL in that case.
51c35492
MD
1442 */
1443static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
1444static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
1445
1446void
06ad81ff
MD
1447hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
1448 int final)
51c35492 1449{
af209b0f
MD
1450 /*
1451 * Flush the buffers out asynchronously, wait for all the I/O to
1452 * complete, then do it again to destroy the buffer cache buffer
1453 * so it doesn't alias something later on.
1454 */
1455 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
00f16fad 1456 hammer_recover_flush_buffer_callback, &final);
eddadaee 1457 hammer_io_wait_all(hmp, "hmrrcw", 1);
0832c9bb 1458 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
00f16fad 1459 hammer_recover_flush_buffer_callback, &final);
9f5097dc 1460
00f16fad
MD
1461 /*
1462 * Flush all volume headers except the root volume. If final < 0
1463 * we discard all volume headers including the root volume.
1464 */
1465 if (final >= 0) {
1466 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1467 hammer_recover_flush_volume_callback, root_volume);
1468 } else {
1469 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1470 hammer_recover_flush_volume_callback, NULL);
1471 }
9f5097dc 1472
af209b0f 1473 /*
00f16fad 1474 * Finalize the root volume header.
77912481
MD
1475 *
1476 * No interlock is needed, volume buffers are not
1477 * messed with by bioops.
af209b0f 1478 */
00f16fad 1479 if (root_volume && root_volume->io.recovered && final > 0) {
eddadaee 1480 hammer_io_wait_all(hmp, "hmrflx", 1);
51c35492 1481 root_volume->io.recovered = 0;
710733a6 1482 hammer_io_flush(&root_volume->io, 0);
51c35492 1483 hammer_rel_volume(root_volume, 0);
eddadaee 1484 hammer_io_wait_all(hmp, "hmrfly", 1);
51c35492
MD
1485 }
1486}
1487
00f16fad
MD
1488/*
1489 * Callback to flush volume headers. If discarding data will be NULL and
1490 * all volume headers (including the root volume) will be discarded.
1491 * Otherwise data is the root_volume and we flush all volume headers
1492 * EXCEPT the root_volume.
2faf0737
MD
1493 *
1494 * Clear any I/O error or modified condition when discarding buffers to
1495 * clean up the reference count, otherwise the buffer may have extra refs
1496 * on it.
00f16fad 1497 */
51c35492
MD
1498static
1499int
1500hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
1501{
1502 hammer_volume_t root_volume = data;
1503
51c35492
MD
1504 if (volume->io.recovered && volume != root_volume) {
1505 volume->io.recovered = 0;
2faf0737 1506 if (root_volume != NULL) {
77912481
MD
1507 /*
1508 * No interlock is needed, volume buffers are not
1509 * messed with by bioops.
1510 */
710733a6 1511 hammer_io_flush(&volume->io, 0);
2faf0737
MD
1512 } else {
1513 hammer_io_clear_error(&volume->io);
00f16fad 1514 hammer_io_clear_modify(&volume->io, 1);
2faf0737 1515 }
51c35492
MD
1516 hammer_rel_volume(volume, 0);
1517 }
1518 return(0);
1519}
1520
2faf0737
MD
1521/*
1522 * Flush or discard recovered I/O buffers.
1523 *
1524 * Clear any I/O error or modified condition when discarding buffers to
1525 * clean up the reference count, otherwise the buffer may have extra refs
1526 * on it.
1527 */
51c35492
MD
1528static
1529int
1530hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
1531{
00f16fad 1532 int final = *(int *)data;
250aec18 1533 int flush;
00f16fad 1534
51c35492
MD
1535 if (buffer->io.recovered) {
1536 buffer->io.recovered = 0;
af209b0f 1537 buffer->io.reclaim = 1;
2faf0737
MD
1538 if (final < 0) {
1539 hammer_io_clear_error(&buffer->io);
00f16fad 1540 hammer_io_clear_modify(&buffer->io, 1);
2faf0737 1541 } else {
77912481 1542 hammer_io_write_interlock(&buffer->io);
710733a6 1543 hammer_io_flush(&buffer->io, 0);
77912481 1544 hammer_io_done_interlock(&buffer->io);
2faf0737 1545 }
af209b0f
MD
1546 hammer_rel_buffer(buffer, 0);
1547 } else {
250aec18
MD
1548 flush = hammer_ref_interlock(&buffer->io.lock);
1549 if (flush)
c1745db9 1550 atomic_add_int(&hammer_count_refedbufs, 1);
250aec18 1551
2faf0737
MD
1552 if (final < 0) {
1553 hammer_io_clear_error(&buffer->io);
1554 hammer_io_clear_modify(&buffer->io, 1);
1555 }
250aec18 1556 KKASSERT(hammer_oneref(&buffer->io.lock));
af209b0f 1557 buffer->io.reclaim = 1;
250aec18 1558 hammer_rel_buffer(buffer, flush);
51c35492
MD
1559 }
1560 return(0);
1561}
1562