2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * The UNDO algorithm is trivial. The nominal UNDO range in the
39 * FIFO is determined by taking the first/next offset stored in
40 * the volume header. The next offset may not be correct since
41 * UNDO flushes are not required to flush the volume header, so
42 * the code also scans forward until it finds a discontinuous
45 * The UNDOs are then scanned and executed in reverse order. These
46 * UNDOs are effectively just data restorations based on HAMMER offsets.
50 * REDO records are laid down in the UNDO/REDO FIFO for nominal
51 * writes, truncations, and file extension ops. On a per-inode
52 * basis two types of REDO records are generated, REDO_WRITE
55 * Essentially the recovery block will contain UNDO records backing
56 * out partial operations and REDO records to regenerate those partial
57 * operations guaranteed by the filesystem during recovery.
59 * REDO generation is optional, and can also be started and then
60 * later stopped due to excessive write()s in between fsyncs, or not
61 * started at all. Because of this the recovery code must determine
62 * when REDOs are valid and when they are not. Additional records are
63 * generated to help figure it out.
65 * The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66 * during a flush cycle indicating which records the flush cycle
67 * has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68 * each flush cycle to indicate how far back in the UNDO/REDO FIFO
69 * the recovery code must go to find the earliest applicable REDO
70 * record. Applicable REDO records can be far outside the nominal
71 * UNDO recovery range, for example if a write() lays down a REDO but
72 * the related file is not flushed for several cycles.
74 * The SYNC reference is to a point prior to the nominal UNDO FIFO
75 * range, creating an extended REDO range which must be scanned.
77 * Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78 * which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79 * prior to the start of the nominal UNDO range are applicable.
80 * That is, any REDO_TERM_* records in the extended range but not in
81 * the nominal undo range will mask any redo operations for prior REDO
82 * records. This is necessary because once the TERM is laid down
83 * followup operations may make additional changes to the related
84 * records but not necessarily record them as REDOs (because REDOs are
87 * REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88 * must be ignored since they represent meta-data flushes which are
89 * undone by the UNDOs in that nominal UNDO range by the recovery
90 * code. Only REDO_TERM_* records in the extended range but not
91 * in the nominal undo range are applicable.
93 * The REDO_SYNC record itself always exists in the nominal UNDO range
94 * (this is how the extended range is determined). For recovery
95 * purposes the most recent REDO_SYNC record is always used if several
98 * CRASHES DURING UNDO/REDO
100 * A crash during the UNDO phase requires no additional effort. The
101 * UNDOs will simply be re-run again. The state of the UNDO/REDO fifo
102 * remains unchanged and has no re-crash issues.
104 * A crash during the REDO phase is more complex because the REDOs
105 * run normal filesystem ops and generate additional UNDO/REDO records.
106 * REDO is disabled during REDO recovery and any SYNC records generated
107 * by flushes during REDO recovery must continue to reference the
108 * original extended range.
110 * If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111 * may become impossible. This is detected when the start of the
112 * extended range fails to have monotonically increasing sequence
113 * numbers leading into the nominal undo range.
119 * Specify the way we want to handle stage2 errors.
121 * Following values are accepted:
123 * 0 - Run redo recovery normally and fail to mount if
124 * the operation fails (default).
125 * 1 - Run redo recovery, but don't fail to mount if the
127 * 2 - Completely skip redo recovery (only for severe error
128 * conditions and/or debugging).
/*
 * Tunable controlling stage2 (REDO) error handling:
 *   0 = run redo recovery, fail the mount on error (default)
 *   1 = run redo recovery, warn but still mount on error
 *   2 = skip redo recovery entirely
 */
130 static int hammer_skip_redo = 0;
131 TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
134 * Each rterm entry has a list of fifo offsets indicating termination
135 * points. These are stripped as the scan progresses.
/*
 * Singly-linked list node recording one REDO_TERM_* termination point
 * (a FIFO offset) for an rterm; nodes are stripped as the forward scan
 * progresses.
 */
137 typedef struct hammer_rterm_entry {
138 struct hammer_rterm_entry *next;	/* next termination point in list */
139 hammer_off_t fifo_offset;	/* UNDO/REDO FIFO offset of the TERM record */
140 } *hammer_rterm_entry_t;
143 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
144 * TRUNC entries ignore the offset.
/*
 * RB-tree node collecting REDO_TERM_* state for one (objid, flags, offset)
 * key; TRUNC entries ignore the offset.
 * NOTE(review): this listing appears truncated -- the redo_objid and
 * redo_flags fields used by hammer_recover_redo_rec()/redo_run() and the
 * closing "} *hammer_rterm_t;" are not visible here; confirm against the
 * full file.
 */
146 typedef struct hammer_rterm {
147 RB_ENTRY(hammer_rterm) rb_node;
149 uint32_t redo_localization;	/* pseudo-fs localization of the inode */
151 hammer_off_t redo_offset;	/* file offset (ignored for TRUNC entries) */
152 hammer_rterm_entry_t term_list;	/* list of TERM fifo offsets, see above */
/*
 * Forward declarations for the file-local recovery helpers and the
 * rterm red-black tree support (RB_PROTOTYPE/RB_GENERATE pair).
 */
155 static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
156 struct hammer_rterm_rb_tree;
157 RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
158 RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
160 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
161 hammer_off_t end_off);
162 static int hammer_check_head_signature(hammer_fifo_head_t head,
163 hammer_off_t beg_off);
164 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
165 char *src, char *dst, int bytes);
166 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
167 hammer_volume_t root_volume,
168 hammer_off_t *scan_offsetp,
169 int *errorp, struct hammer_buffer **bufferp);
170 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
171 hammer_volume_t root_volume,
172 hammer_off_t *scan_offsetp,
173 int *errorp, struct hammer_buffer **bufferp);
175 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
177 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
178 hammer_fifo_undo_t undo);
179 static int hammer_recover_redo_rec(hammer_mount_t hmp,
180 struct hammer_rterm_rb_tree *root,
181 hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
182 static int hammer_recover_redo_run(hammer_mount_t hmp,
183 struct hammer_rterm_rb_tree *root,
184 hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
185 static void hammer_recover_redo_exec(hammer_mount_t hmp,
186 hammer_fifo_redo_t redo);
188 RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
191 * Recover filesystem meta-data on mount. This procedure figures out the
192 * UNDO FIFO range and runs the UNDOs backwards. The FIFO pointers are not
193 * resynchronized by this procedure.
195 * This procedure is run near the beginning of the mount sequence, before
196 * any B-Tree or high-level accesses are enabled, and is responsible for
197 * restoring the meta-data to a consistent state. High level HAMMER data
198 * structures (such as the B-Tree) cannot be accessed here.
200 * NOTE: No information from the root volume has been cached in the
201 * hammer_mount structure yet, so we need to access the root volume's
/*
 * Stage 1 of mount-time recovery: determine the active UNDO FIFO range
 * from the volume header (for v4+ volumes, by scanning for a sequence
 * number discontinuity), then run the UNDOs backwards and update the
 * rootmap FIFO indices.
 *
 * NOTE(review): this listing has gaps in its original line numbering;
 * error paths, braces, returns and some statements appear to have been
 * elided.  Consult the full hammer_recover.c before modifying.
 */
207 hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
209 hammer_blockmap_t rootmap;
210 hammer_buffer_t buffer;
211 hammer_off_t scan_offset;
212 hammer_off_t scan_offset_save;
214 hammer_fifo_any_t head;
215 hammer_off_t first_offset;
216 hammer_off_t last_offset;
219 int degenerate_case = 0;
222 * Examine the UNDO FIFO indices in the volume header.
224 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
225 first_offset = rootmap->first_offset;
226 last_offset = rootmap->next_offset;
/* Reset the stage2 extended-range offset before scanning. */
230 hmp->recover_stage2_offset = 0;
/* Sanity check: both indices must lie within the allocated UNDO zone. */
232 if (first_offset > rootmap->alloc_offset ||
233 last_offset > rootmap->alloc_offset) {
234 hvkprintf(root_volume,
235 "Illegal UNDO FIFO index range "
236 "%016jx, %016jx limit %016jx\n",
237 (intmax_t)first_offset,
238 (intmax_t)last_offset,
239 (intmax_t)rootmap->alloc_offset);
245 * In HAMMER version 4+ filesystems the volume header does NOT
246 * contain definitive UNDO FIFO state. In particular, the
247 * rootmap->next_offset may not be indexed completely to the
248 * end of the active UNDO FIFO.
250 if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
252 * To find the definitive range we must first scan backwards
253 * from first_offset to locate the first real record and
254 * extract the sequence number from it. This record is not
255 * part of the active undo space.
257 scan_offset = first_offset;
261 head = hammer_recover_scan_rev(hmp, root_volume,
266 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
267 seqno = head->head.hdr_seq;
272 hvkprintf(root_volume,
273 "recovery failure during seqno backscan\n");
278 * Scan forwards from first_offset and (seqno+1) looking
279 * for a sequence space discontinuity. This denotes the
280 * end of the active FIFO area.
282 * NOTE: For the case where the FIFO is empty the very first
283 * record we find will be discontinuous.
285 * NOTE: Do not include trailing PADs in the scan range,
286 * and remember the returned scan_offset after a
287 * fwd iteration points to the end of the returned
290 hvkprintf(root_volume, "recovery check seqno=%08x\n", seqno);
292 scan_offset = first_offset;
293 scan_offset_save = scan_offset;
/* Remember the base seqno for the stage2 extended-range backscan. */
295 hmp->recover_stage2_seqno = seqno;
298 head = hammer_recover_scan_fwd(hmp, root_volume,
303 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
304 if (seqno != head->head.hdr_seq) {
305 scan_offset = scan_offset_save;
308 scan_offset_save = scan_offset;
314 * If the forward scan is grossly ahead of last_offset
315 * then something is wrong. last_offset is supposed
318 if (last_offset >= scan_offset) {
319 bytes = last_offset - scan_offset;
321 bytes = rootmap->alloc_offset - scan_offset +
322 (last_offset & HAMMER_OFF_LONG_MASK);
325 (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
327 hvkprintf(root_volume,
328 "recovery forward scan is "
329 "grossly beyond the last_offset in "
330 "the volume header, this can't be "
339 * Store the seqno. This will be the next seqno we lay down
340 * when generating new UNDOs.
342 hmp->undo_seqno = seqno;
344 hvkprintf(root_volume,
345 "recovery failure during seqno fwdscan\n");
/* The discontinuity point becomes the definitive end of the FIFO. */
348 last_offset = scan_offset;
349 hvkprintf(root_volume,
350 "recovery range %016jx-%016jx\n",
351 (intmax_t)first_offset,
352 (intmax_t)last_offset);
353 hvkprintf(root_volume,
354 "recovery nexto %016jx endseqno=%08x\n",
355 (intmax_t)rootmap->next_offset,
360 * Calculate the size of the active portion of the FIFO. If the
361 * FIFO is empty the filesystem is clean and no further action is
364 if (last_offset >= first_offset) {
365 bytes = last_offset - first_offset;
367 bytes = rootmap->alloc_offset - first_offset +
368 (last_offset & HAMMER_OFF_LONG_MASK);
376 hvkprintf(root_volume,
377 "recovery undo %016jx-%016jx (%jd bytes)%s\n",
378 (intmax_t)first_offset,
379 (intmax_t)last_offset,
381 (hmp->ronly ? " (RO)" : "(RW)"));
382 if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
383 hkprintf("Undo size is absurd, unable to mount\n");
389 * Scan the UNDOs backwards.
391 scan_offset = last_offset;
393 while ((int64_t)bytes > 0) {
394 KKASSERT(scan_offset != first_offset);
395 head = hammer_recover_scan_rev(hmp, root_volume,
396 &scan_offset, &error, &buffer);
403 error = hammer_recover_undo(hmp, root_volume, &head->undo);
405 hvkprintf(root_volume,
406 "UNDO record at %016jx failed\n",
407 (intmax_t)scan_offset - head->head.hdr_size);
412 * The first REDO_SYNC record encountered (scanning backwards)
413 * enables REDO processing.
415 if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
416 head->redo.redo_flags == HAMMER_REDO_SYNC) {
417 if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
418 hvkprintf(root_volume,
419 "Ignoring extra REDO_SYNC "
420 "records in UNDO/REDO FIFO.\n");
422 hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
423 hmp->recover_stage2_offset =
424 head->redo.redo_offset;
425 hvkprintf(root_volume,
426 "Found REDO_SYNC %016jx\n",
427 (intmax_t)head->redo.redo_offset);
431 bytes -= head->head.hdr_size;
434 * If too many dirty buffers have built up we have to flush'm
435 * out. As long as we do not flush out the volume header
436 * a crash here should not cause any problems.
438 * buffer must be released so the flush can assert that
439 * all buffers are idle.
441 if (hammer_flusher_meta_limit(hmp)) {
443 hammer_rel_buffer(buffer, 0);
446 if (hmp->ronly == 0) {
447 hammer_recover_flush_buffers(hmp, root_volume,
449 hvkprintf(root_volume, "Continuing recovery\n");
451 hvkprintf(root_volume,
453 "Insufficient buffer cache to hold "
454 "dirty buffers on read-only mount!\n");
460 KKASSERT(error || bytes == 0);
463 hammer_rel_buffer(buffer, 0);
468 * After completely flushing all the recovered buffers the volume
469 * header will also be flushed.
471 if (root_volume->io.recovered == 0) {
472 hammer_ref_volume(root_volume);
473 root_volume->io.recovered = 1;
477 * Finish up flushing (or discarding) recovered buffers. FIFO
478 * indices in the volume header are updated to the actual undo
479 * range but will not be collapsed until stage 2.
482 hammer_modify_volume_noundo(NULL, root_volume);
483 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
484 rootmap->first_offset = first_offset;
485 rootmap->next_offset = last_offset;
486 hammer_modify_volume_done(root_volume);
488 hammer_recover_flush_buffers(hmp, root_volume, 1);
490 hammer_recover_flush_buffers(hmp, root_volume, -1);
492 if (degenerate_case == 0) {
493 hvkprintf(root_volume, "recovery complete\n");
495 hvkprintf(root_volume, "mounted clean, no recovery needed\n");
501 * Execute redo operations
503 * This procedure is run at the end of the mount sequence, after the hammer
504 * mount structure has been completely initialized but before the filesystem
505 * goes live. It can access standard cursors, the B-Tree, flush the
506 * filesystem, and so forth.
508 * This code may only be called for read-write mounts or when a mount
509 * switches from read-only to read-write. vnodes may or may not be present.
511 * The stage1 code will have already calculated the correct FIFO range
512 * for the nominal UNDO FIFO and stored it in the rootmap. The extended
513 * range for REDO is stored in hmp->recover_stage2_offset.
/*
 * Stage 2 of recovery: execute applicable REDO operations.  Runs only on
 * RW mounts (or RO->RW transitions) after stage1 stored the REDO_SYNC
 * offset in hmp->recover_stage2_offset.  First scans the extended range
 * backwards collecting REDO_TERM_* records into an RB tree, then scans
 * forwards executing REDO_WRITE/REDO_TRUNC records with no matching TERM,
 * and finally forces a flush cycle to collapse the FIFO indices.
 *
 * NOTE(review): this listing has gaps in its original line numbering;
 * returns, braces and error paths appear to have been elided.  Consult
 * the full hammer_recover.c before modifying.
 */
516 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
518 hammer_blockmap_t rootmap;
519 hammer_buffer_t buffer;
520 hammer_off_t scan_offset;
521 hammer_off_t oscan_offset;
523 hammer_off_t ext_bytes;
524 hammer_fifo_any_t head;
525 hammer_off_t first_offset;
526 hammer_off_t last_offset;
527 hammer_off_t ext_offset;
528 struct hammer_rterm_rb_tree rterm_root;
535 * Stage 2 can only be run on a RW mount, or when the mount is
536 * switched from RO to RW.
538 KKASSERT(hmp->ronly == 0);
539 RB_INIT(&rterm_root);
/* Honor the vfs.hammer.skip_redo tunable (1 = optional, 2 = skip). */
541 if (hammer_skip_redo == 1)
542 hvkprintf(root_volume, "recovery redo marked as optional\n");
544 if (hammer_skip_redo == 2) {
545 hvkprintf(root_volume, "recovery redo skipped.\n");
550 * Examine the UNDO FIFO. If it is empty the filesystem is clean
551 * and no action need be taken.
553 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
554 first_offset = rootmap->first_offset;
555 last_offset = rootmap->next_offset;
556 if (first_offset == last_offset) {
557 KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
562 * Stage2 must only be run once, and will not be run at all
563 * if Stage1 did not find a REDO_SYNC record.
568 if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
570 hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
571 hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
572 ext_offset = hmp->recover_stage2_offset;
573 if (ext_offset == 0) {
574 hvkprintf(root_volume,
575 "REDO stage specified but no REDO_SYNC "
576 "offset, ignoring\n");
581 * Calculate nominal UNDO range (this is not yet the extended
584 if (last_offset >= first_offset) {
585 bytes = last_offset - first_offset;
587 bytes = rootmap->alloc_offset - first_offset +
588 (last_offset & HAMMER_OFF_LONG_MASK);
590 hvkprintf(root_volume,
591 "recovery redo %016jx-%016jx (%jd bytes)%s\n",
592 (intmax_t)first_offset,
593 (intmax_t)last_offset,
595 (hmp->ronly ? " (RO)" : "(RW)"));
597 if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
598 hkprintf("Undo size is absurd, unable to mount\n");
604 * Scan the REDOs backwards collecting REDO_TERM_* information.
605 * This information is only collected for the extended range,
606 * non-inclusive of any TERMs in the nominal UNDO range.
608 * If the stage2 extended range is inside the nominal undo range
609 * we have nothing to scan.
611 * This must fit in memory!
613 if (first_offset < last_offset) {
615 * [ first_offset........last_offset ]
617 if (ext_offset < first_offset) {
619 ext_bytes = first_offset - ext_offset;
620 } else if (ext_offset > last_offset) {
/* ext_offset wraps around the end of the FIFO back to first_offset. */
622 ext_bytes = (rootmap->alloc_offset - ext_offset) +
623 (first_offset & HAMMER_OFF_LONG_MASK);
/* Negative ext_bytes: extended range is inside the nominal range. */
625 ext_bytes = -(ext_offset - first_offset);
630 * [......last_offset first_offset.....]
632 if (ext_offset < last_offset) {
633 ext_bytes = -((rootmap->alloc_offset - first_offset) +
634 (ext_offset & HAMMER_OFF_LONG_MASK));
636 } else if (ext_offset > first_offset) {
637 ext_bytes = -(ext_offset - first_offset);
640 ext_bytes = first_offset - ext_offset;
646 scan_offset = first_offset;
647 hvkprintf(root_volume,
648 "Find extended redo %016jx, %jd extbytes\n",
649 (intmax_t)ext_offset,
650 (intmax_t)ext_bytes);
/* Backscan expects seqnos just below the stage1 base seqno. */
651 seqno = hmp->recover_stage2_seqno - 1;
653 head = hammer_recover_scan_rev(hmp, root_volume,
658 if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
659 if (head->head.hdr_seq != seqno) {
663 error = hammer_recover_redo_rec(
665 scan_offset, &head->redo);
668 if (scan_offset == ext_offset)
672 hvkprintf(root_volume,
673 "Find extended redo failed %d, "
674 "unable to run REDO\n",
679 hvkprintf(root_volume,
680 "Embedded extended redo %016jx, %jd extbytes\n",
681 (intmax_t)ext_offset,
682 (intmax_t)ext_bytes);
686 * Scan the REDO forwards through the entire extended range.
687 * Anything with a previously recorded matching TERM is discarded.
689 scan_offset = ext_offset;
693 * NOTE: when doing a forward scan the returned scan_offset is
694 * for the record following the returned record, so we
695 * have to play a bit.
697 while ((int64_t)bytes > 0) {
698 KKASSERT(scan_offset != last_offset);
700 oscan_offset = scan_offset;
701 head = hammer_recover_scan_fwd(hmp, root_volume,
702 &scan_offset, &error, &buffer);
706 error = hammer_recover_redo_run(hmp, &rterm_root,
707 oscan_offset, &head->redo);
709 hvkprintf(root_volume,
710 "UNDO record at %016jx failed\n",
711 (intmax_t)scan_offset - head->head.hdr_size);
714 bytes -= head->head.hdr_size;
716 KKASSERT(error || bytes == 0);
720 hammer_rel_buffer(buffer, 0);
/* Tear down the rterm RB tree and free all term-list entries. */
728 hammer_rterm_t rterm;
729 hammer_rterm_entry_t rte;
731 while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
732 RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
733 while ((rte = rterm->term_list) != NULL) {
734 rterm->term_list = rte->next;
735 kfree(rte, hmp->m_misc);
737 kfree(rterm, hmp->m_misc);
742 * Finish up flushing (or discarding) recovered buffers by executing
743 * a normal flush cycle. Setting HMNT_UNDO_DIRTY bypasses degenerate
744 * case tests and forces the flush in order to update the FIFO indices.
746 * If a crash occurs during the flush the entire undo/redo will be
747 * re-run during recovery on the next mount.
750 if (rootmap->first_offset != rootmap->next_offset)
751 hmp->hflags |= HMNT_UNDO_DIRTY;
752 hammer_flusher_sync(hmp);
755 hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
757 hvkprintf(root_volume, "End redo recovery\n");
760 if (error && hammer_skip_redo == 1)
761 hvkprintf(root_volume,
762 "recovery redo error %d, skipping.\n",
/* skip_redo != 0 tolerates redo errors: report success regardless. */
765 return (hammer_skip_redo ? 0 : error);
769 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
770 * record at *scan_offsetp or NULL if an error occurred.
772 * On return *scan_offsetp will be the offset of the returned record.
/*
 * Scan one FIFO record backwards: read and validate the tail just before
 * *scan_offsetp, derive the head from tail_size, and back *scan_offsetp
 * up by the record's hdr_size.  Handles wrap at the start of the UNDO
 * zone.  Returns the record (or NULL on error, per the comment above).
 */
775 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
776 hammer_off_t *scan_offsetp,
777 int *errorp, struct hammer_buffer **bufferp)
779 hammer_off_t scan_offset;
780 hammer_blockmap_t rootmap;
781 hammer_fifo_any_t head;
782 hammer_fifo_tail_t tail;
784 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
785 scan_offset = *scan_offsetp;
787 if (hammer_debug_general & 0x0080)
788 hdkprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
/* Wrap: scanning backwards past the zone start continues from the end. */
789 if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
790 scan_offset = rootmap->alloc_offset;
791 if (scan_offset - sizeof(*tail) <
792 HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
793 hvkprintf(root_volume,
794 "UNDO record at %016jx FIFO underflow\n",
795 (intmax_t)scan_offset);
799 tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
802 hvkprintf(root_volume,
803 "Unable to read UNDO TAIL at %016jx\n",
804 (intmax_t)scan_offset - sizeof(*tail));
808 if (hammer_check_tail_signature(tail, scan_offset) != 0) {
809 hvkprintf(root_volume,
810 "Illegal UNDO TAIL signature at %016jx\n",
811 (intmax_t)scan_offset - sizeof(*tail));
/* Head sits tail_size bytes before the end of the record. */
815 head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
816 *scan_offsetp = scan_offset - head->head.hdr_size;
822 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
825 * On return *scan_offsetp will be the offset of the record following
826 * the returned record.
/*
 * Scan one FIFO record forwards: read and validate the head at
 * *scan_offsetp and advance *scan_offsetp past the record, wrapping at
 * the end of the UNDO zone on both entry and exit.
 */
829 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
830 hammer_off_t *scan_offsetp,
831 int *errorp, struct hammer_buffer **bufferp)
833 hammer_off_t scan_offset;
834 hammer_blockmap_t rootmap;
835 hammer_fifo_any_t head;
837 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
838 scan_offset = *scan_offsetp;
840 if (hammer_debug_general & 0x0080)
841 hdkprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
842 if (scan_offset == rootmap->alloc_offset)
843 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
845 head = hammer_bread(hmp, scan_offset, errorp, bufferp);
847 hvkprintf(root_volume,
848 "Unable to read UNDO HEAD at %016jx\n",
849 (intmax_t)scan_offset);
853 if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
854 hvkprintf(root_volume,
/* NOTE(review): says "TAIL" but this is the head check -- looks like a
 * copy/paste; confirm against upstream before changing the string. */
855 "Illegal UNDO TAIL signature at %016jx\n",
856 (intmax_t)scan_offset);
860 scan_offset += head->head.hdr_size;
861 if (scan_offset == rootmap->alloc_offset)
862 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
863 *scan_offsetp = scan_offset;
869 * Helper function for hammer_check_{head,tail}_signature(). Check stuff
870 * once the head and tail has been established.
872 * This function validates the entire FIFO record wrapper.
/*
 * Validate a complete FIFO record wrapper given its head, tail, and
 * beginning offset: signatures, alignment, head/tail type and size
 * agreement, and (for non-PAD records) the CRC over head+payload.
 * NOTE(review): listing gaps suggest the return statements and some
 * closing braces were elided from this view.
 */
876 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
877 hammer_off_t beg_off)
879 hammer_off_t end_off;
884 * Check signatures. The tail signature is allowed to be the
885 * head signature only for 8-byte PADs.
887 if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
888 hkprintf("FIFO record bad head signature %04x at %016jx\n",
893 if (head->hdr_size < HAMMER_HEAD_ALIGN ||
894 (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
895 hkprintf("FIFO record unaligned or bad size %04x at %016jx\n",
900 end_off = beg_off + head->hdr_size;
/* Skip head/tail agreement checks only for the minimal 8-byte PAD. */
902 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
903 (size_t)(end_off - beg_off) != sizeof(*tail)) {
904 if (head->hdr_type != tail->tail_type) {
905 hkprintf("FIFO record head/tail type mismatch "
906 "%04x %04x at %016jx\n",
907 head->hdr_type, tail->tail_type,
911 if (head->hdr_size != tail->tail_size) {
912 hkprintf("FIFO record head/tail size mismatch "
913 "%04x %04x at %016jx\n",
914 head->hdr_size, tail->tail_size,
918 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
919 hkprintf("FIFO record bad tail signature "
921 tail->tail_signature,
928 * Non-PAD records must have a CRC and must be sized at
929 * least large enough to fit the head and tail.
931 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
/* CRC covers the head (up to the crc field) XOR'd with the payload. */
932 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
933 crc32(head + 1, head->hdr_size - sizeof(*head));
934 if (head->hdr_crc != crc) {
935 hkprintf("FIFO record CRC failed %08x %08x at %016jx\n",
940 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
941 hkprintf("FIFO record too small %04x at %016jx\n",
/* Re-derive the tail from the head and re-verify size/type agreement. */
951 bytes = head->hdr_size;
952 tail = (void *)((char *)head + bytes - sizeof(*tail));
953 if (tail->tail_size != head->hdr_size) {
954 hkprintf("Bad tail size %04x vs %04x at %016jx\n",
955 tail->tail_size, head->hdr_size,
959 if (tail->tail_type != head->hdr_type) {
960 hkprintf("Bad tail type %04x vs %04x at %016jx\n",
961 tail->tail_type, head->hdr_type,
970 * Check that the FIFO record is in-bounds given the head and the
973 * Also checks that the head and tail structures agree with each other,
974 * but does not check beyond the signature, type, and size.
/*
 * Bounds-check a FIFO record from its head: neither the head area nor
 * the full record may cross a buffer boundary.  Delegates the detailed
 * head/tail validation to _hammer_check_signature().
 */
977 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
979 hammer_fifo_tail_t tail;
980 hammer_off_t end_off;
983 * head overlaps buffer boundary. This could be a PAD so only
984 * check the minimum PAD size here.
986 if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
990 * Calculate the ending offset and make sure the record does
991 * not cross a buffer boundary.
993 end_off = beg_off + head->hdr_size;
994 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
996 tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
997 return (_hammer_check_signature(head, tail, beg_off));
1001 * Check that the FIFO record is in-bounds given the tail and the
1002 * hammer offset. The offset is pointing at the ending boundary of the
1005 * Also checks that the head and tail structures agree with each other,
1006 * but does not check beyond the signature, type, and size.
/*
 * Bounds-check a FIFO record from its tail (end_off points just past the
 * record): neither the tail area nor the full record may cross a buffer
 * boundary.  Delegates detailed validation to _hammer_check_signature().
 */
1009 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
1011 hammer_fifo_head_t head;
1012 hammer_off_t beg_off;
1015 * tail overlaps buffer boundary
1017 if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1021 * Calculate the beginning offset and make sure the record does
1022 * not cross a buffer boundary.
1024 beg_off = end_off - tail->tail_size;
1025 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1027 head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
1028 return (_hammer_check_signature(head, tail, beg_off));
/*
 * Apply a single UNDO record: validate it, then copy the saved data back
 * to its zone-1 (raw volume) or zone-2 (raw buffer) target, marking the
 * touched volume/buffer as recovered so its writeback is deferred.
 * NOTE(review): listing gaps suggest error returns, `break`s, the switch
 * header and some locals (bytes/zone/offset/vol_no/error) were elided.
 */
1032 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
1033 hammer_fifo_undo_t undo)
1035 hammer_volume_t volume;
1036 hammer_buffer_t buffer;
1037 hammer_off_t buf_offset;
1045 * Only process UNDO records. Flag if we find other records to
1046 * optimize stage2 recovery.
1048 if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
1052 * Validate the UNDO record.
1054 bytes = undo->head.hdr_size - sizeof(*undo) -
1055 sizeof(struct hammer_fifo_tail);
1056 if (bytes < 0 || undo->undo_data_bytes < 0 ||
1057 undo->undo_data_bytes > bytes) {
1058 hkprintf("Corrupt UNDO record, undo_data_bytes %d/%d\n",
1059 undo->undo_data_bytes, bytes);
1063 bytes = undo->undo_data_bytes;
1066 * The undo offset may only be a zone-1 or zone-2 offset.
1068 * Currently we only support a zone-1 offset representing the
1071 zone = HAMMER_ZONE_DECODE(undo->undo_offset);
1072 offset = undo->undo_offset & HAMMER_BUFMASK;
/* The restored span must not cross a buffer boundary. */
1074 if (offset + bytes > HAMMER_BUFSIZE) {
1075 hkprintf("Corrupt UNDO record, bad offset\n");
1080 case HAMMER_ZONE_RAW_VOLUME_INDEX:
1081 vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
1082 volume = hammer_get_volume(hmp, vol_no, &error);
1083 if (volume == NULL) {
1084 hkprintf("UNDO record, cannot access volume %d\n",
1088 hammer_modify_volume_noundo(NULL, volume);
1089 hammer_recover_copy_undo(undo->undo_offset,
1091 (char *)volume->ondisk + offset,
1093 hammer_modify_volume_done(volume);
1096 * Multiple modifications may be made to the same buffer.
1097 * Also, the volume header cannot be written out until
1098 * everything else has been flushed. This also
1099 * covers the read-only case by preventing the kernel from
1100 * flushing the buffer.
1102 if (volume->io.recovered == 0)
1103 volume->io.recovered = 1;
1105 hammer_rel_volume(volume, 0);
1107 case HAMMER_ZONE_RAW_BUFFER_INDEX:
1108 buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
1109 buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
1111 if (buffer == NULL) {
1112 hkprintf("UNDO record, cannot access buffer %016jx\n",
1113 (intmax_t)undo->undo_offset);
1116 hammer_modify_buffer_noundo(NULL, buffer);
1117 hammer_recover_copy_undo(undo->undo_offset,
1119 (char *)buffer->ondisk + offset,
1121 hammer_modify_buffer_done(buffer);
1124 * Multiple modifications may be made to the same buffer,
1125 * improve performance by delaying the flush. This also
1126 * covers the read-only case by preventing the kernel from
1127 * flushing the buffer.
1129 if (buffer->io.recovered == 0)
1130 buffer->io.recovered = 1;
1132 hammer_rel_buffer(buffer, 0);
/* Any other zone is corrupt (default case of the switch). */
1135 hkprintf("Corrupt UNDO record\n");
/*
 * Copy `bytes` of saved UNDO data from src to dst, optionally dumping
 * both the before (dst) and after (src) images when the 0x0080 debug
 * flag is set.
 */
1142 hammer_recover_copy_undo(hammer_off_t undo_offset,
1143 char *src, char *dst, int bytes)
1145 if (hammer_debug_general & 0x0080) {
1146 hdkprintf("UNDO %016jx: %d\n",
1147 (intmax_t)undo_offset, bytes);
1150 hkprintf("UNDO %016jx:", (intmax_t)undo_offset);
1151 hammer_recover_debug_dump(22, dst, bytes);
1152 kprintf("%22s", "to:");
1153 hammer_recover_debug_dump(22, src, bytes);
1155 bcopy(src, dst, bytes);
1159 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1160 * during the backwards scan of the extended UNDO/REDO FIFO. This scan
1161 * does not include the nominal UNDO range, just the extended range.
/*
 * Record a REDO_TERM_WRITE/REDO_TERM_TRUNC seen during the backwards
 * extended-range scan: insert (or find) the matching rterm RB-tree node
 * and prepend this record's FIFO offset to its term_list.  Non-REDO and
 * non-TERM records are ignored.
 */
1164 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1165 hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1167 hammer_rterm_t rterm;
1168 hammer_rterm_t nrterm;
1169 hammer_rterm_entry_t rte;
1171 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1173 if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1174 redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1178 nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1179 nrterm->redo_objid = redo->redo_objid;
1180 nrterm->redo_localization = redo->redo_localization;
1181 nrterm->redo_flags = redo->redo_flags;
1182 nrterm->redo_offset = redo->redo_offset;
/* RB_INSERT returns the existing node on collision; free the new one. */
1184 rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1186 kfree(nrterm, hmp->m_misc);
1191 hkprintf("record record %016jx objid %016jx "
1192 "offset %016jx flags %08x\n",
1193 (intmax_t)scan_offset,
1194 (intmax_t)redo->redo_objid,
1195 (intmax_t)redo->redo_offset,
1196 (int)redo->redo_flags);
1200 * Scan in reverse order, rte prepended, so the rte list will be
1203 rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1204 rte->fifo_offset = scan_offset;
1205 rte->next = rterm->term_list;
1206 rterm->term_list = rte;
1212 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1213 * the forwards scan of the entire extended UNDO/REDO FIFO range.
1215 * Records matching previously recorded TERMs have already been committed
1219 hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1220 hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1222 struct hammer_rterm rtval;
1223 hammer_rterm_t rterm;
1224 hammer_rterm_entry_t rte;
/* Skip anything that is not a REDO fifo record */
1226 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1229 switch(redo->redo_flags) {
1230 case HAMMER_REDO_WRITE:
1231 case HAMMER_REDO_TRUNC:
1233 * We hit a REDO request. The REDO request is only executed
1234 * if there is no matching TERM.
/*
 * Construct the TERM key this WRITE/TRUNC would have generated and
 * look it up in the tree built by hammer_recover_redo_rec().
 */
1236 bzero(&rtval, sizeof(rtval));
1237 rtval.redo_objid = redo->redo_objid;
1238 rtval.redo_localization = redo->redo_localization;
1239 rtval.redo_offset = redo->redo_offset;
1240 rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
1241 HAMMER_REDO_TERM_WRITE :
1242 HAMMER_REDO_TERM_TRUNC;
1244 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
/* Matching TERM found: the operation already committed, skip it */
1247 hkprintf("ignore record %016jx objid %016jx "
1248 "offset %016jx flags %08x\n",
1249 (intmax_t)scan_offset,
1250 (intmax_t)redo->redo_objid,
1251 (intmax_t)redo->redo_offset,
1252 (int)redo->redo_flags);
/* No matching TERM: this operation must be replayed */
1257 hkprintf("run record %016jx objid %016jx "
1258 "offset %016jx flags %08x\n",
1259 (intmax_t)scan_offset,
1260 (intmax_t)redo->redo_objid,
1261 (intmax_t)redo->redo_offset,
1262 (int)redo->redo_flags);
1266 * Redo stage2 can access a live filesystem, acquire the
1269 hammer_recover_redo_exec(hmp, redo);
1271 case HAMMER_REDO_TERM_WRITE:
1272 case HAMMER_REDO_TERM_TRUNC:
1274 * As we encounter TERMs in the forward scan we remove
1275 * them. Once the forward scan hits the nominal undo range
1276 * there will be no more recorded TERMs.
1278 bzero(&rtval, sizeof(rtval));
1279 rtval.redo_objid = redo->redo_objid;
1280 rtval.redo_localization = redo->redo_localization;
1281 rtval.redo_flags = redo->redo_flags;
1282 rtval.redo_offset = redo->redo_offset;
1284 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
/*
 * Pop the head of the TERM entry list; the forward scan must meet
 * TERMs at exactly the FIFO offsets recorded by the backward scan.
 */
1286 if ((rte = rterm->term_list) != NULL) {
1287 KKASSERT(rte->fifo_offset == scan_offset);
1288 rterm->term_list = rte->next;
1289 kfree(rte, hmp->m_misc);
/*
 * Replay one REDO operation against the live (mounted) filesystem:
 * locate the target inode/vnode and either re-issue the logged write
 * (HAMMER_REDO_WRITE) or re-truncate the file (HAMMER_REDO_TRUNC).
 */
1298 hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
1300 struct hammer_transaction trans;
1302 struct hammer_inode *ip;
1303 struct vnode *vp = NULL;
1306 hammer_start_transaction(&trans, hmp);
/* Find the target inode by objid/localization at the latest TID */
1308 ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
1309 HAMMER_MAX_TID, redo->redo_localization,
1312 hkprintf("unable to find objid %016jx:%08x\n",
1313 (intmax_t)redo->redo_objid, redo->redo_localization)\u003b
1316 error = hammer_get_vnode(ip, &vp);
1318 hkprintf("unable to acquire vnode for %016jx:%08x\n",
1319 (intmax_t)redo->redo_objid, redo->redo_localization);
1323 switch(redo->redo_flags) {
1324 case HAMMER_REDO_WRITE:
1325 error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
1327 hkprintf("vn_rdwr open %016jx:%08x returned %d\n",
1328 (intmax_t)redo->redo_objid,
1329 redo->redo_localization, error);
/* Re-issue the logged write; the data payload follows the redo header */
1333 error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
1334 redo->redo_data_bytes,
1335 redo->redo_offset, UIO_SYSSPACE,
1336 0, proc0.p_ucred, NULL);
/* NOTE(review): re-lock presumably needed for VOP_CLOSE below -- confirm */
1337 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1339 hkprintf("write %016jx:%08x returned %d\n",
1340 (intmax_t)redo->redo_objid,
1341 redo->redo_localization, error);
1343 VOP_CLOSE(vp, FREAD|FWRITE, NULL);
1345 case HAMMER_REDO_TRUNC:
/* Re-truncate via setattr; redo_offset carries the logged file size */
1347 va.va_size = redo->redo_offset;
1348 error = VOP_SETATTR(vp, &va, proc0.p_ucred);
1350 hkprintf("setattr offset %016jx error %d\n",
1351 (intmax_t)redo->redo_offset, error);
/* Common exit: release the inode and close out the transaction */
1357 hammer_rel_inode(ip, 0);
1359 hammer_done_transaction(&trans);
1363 * RB tree compare function. Note that REDO_TERM_TRUNC ops ignore
1366 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
/*
 * Keys are compared in order: objid, localization, flags, then offset.
 * The offset comparison is skipped for REDO_TERM_TRUNC records (see
 * the header comment above), making all TRUNC TERMs for an inode
 * compare equal regardless of offset.
 */
1369 hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
1371 if (rt1->redo_objid < rt2->redo_objid)
1373 if (rt1->redo_objid > rt2->redo_objid)
1375 if (rt1->redo_localization < rt2->redo_localization)
1377 if (rt1->redo_localization > rt2->redo_localization)
1379 if (rt1->redo_flags < rt2->redo_flags)
1381 if (rt1->redo_flags > rt2->redo_flags)
1383 if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1384 if (rt1->redo_offset < rt2->redo_offset)
1386 if (rt1->redo_offset > rt2->redo_offset)
/*
 * Debug helper: dump 'bytes' bytes of buf as space-separated hex,
 * 16 bytes per row, with each continuation row indented 'w' columns
 * so it lines up under the caller's prefix text.
 */
1395 hammer_recover_debug_dump(int w, char *buf, int bytes)
1399 for (i = 0; i < bytes; ++i) {
/* Start a new indented row every 16 bytes (but not before the first) */
1400 if (i && (i & 15) == 0)
1401 kprintf("\n%*.*s", w, w, "");
1402 kprintf(" %02x", (unsigned char)buf[i]);
1410 * Flush recovered buffers from recovery operations. The call to this
1411 * routine may be delayed if a read-only mount was made and then later
1412 * upgraded to read-write. This routine is also called when unmounting
1413 * a read-only mount to clean out recovered (dirty) buffers which we
1414 * couldn't flush (because the mount is read-only).
1416 * The volume header is always written last. The UNDO FIFO will be forced
1417 * to zero-length by setting next_offset to first_offset. This leaves the
1418 * (now stale) UNDO information used to recover the disk available for
1419 * forensic analysis.
1421 * final is typically 0 or 1. The volume header is only written if final
1422 * is 1. If final is -1 the recovered buffers are discarded instead of
1423 * written and root_volume can also be passed as NULL in that case.
1425 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
1426 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
1429 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
1433 * Flush the buffers out asynchronously, wait for all the I/O to
1434 * complete, then do it again to destroy the buffer cache buffer
1435 * so it doesn't alias something later on.
1437 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1438 hammer_recover_flush_buffer_callback, &final);
1439 hammer_io_wait_all(hmp, "hmrrcw", 1);
/* Second pass, after I/O completion, reclaims the bcache aliases */
1440 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1441 hammer_recover_flush_buffer_callback, &final);
1444 * Flush all volume headers except the root volume. If final < 0
1445 * we discard all volume headers including the root volume.
/* root_volume vs NULL as callback data selects flush vs discard mode */
1448 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1449 hammer_recover_flush_volume_callback, root_volume);
1451 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1452 hammer_recover_flush_volume_callback, NULL);
1456 * Finalize the root volume header.
1458 * No interlock is needed, volume buffers are not
1459 * messed with by bioops.
/* Root volume written last, bracketed by waits so it lands after the rest */
1461 if (root_volume && root_volume->io.recovered && final > 0) {
1462 hammer_io_wait_all(hmp, "hmrflx", 1);
1463 root_volume->io.recovered = 0;
1464 hammer_io_flush(&root_volume->io, 0);
1465 hammer_rel_volume(root_volume, 0);
1466 hammer_io_wait_all(hmp, "hmrfly", 1);
1471 * Callback to flush volume headers. If discarding data will be NULL and
1472 * all volume headers (including the root volume) will be discarded.
1473 * Otherwise data is the root_volume and we flush all volume headers
1474 * EXCEPT the root_volume.
1476 * Clear any I/O error or modified condition when discarding buffers to
1477 * clean up the reference count, otherwise the buffer may have extra refs
1482 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
1484 hammer_volume_t root_volume = data;
/* Only recovered volumes are touched; the root volume is always skipped */
1486 if (volume->io.recovered && volume != root_volume) {
1487 volume->io.recovered = 0;
1488 if (root_volume != NULL) {
1490 * No interlock is needed, volume buffers are not
1491 * messed with by bioops.
1493 hammer_io_flush(&volume->io, 0);
/* Discard mode (data == NULL): drop error/modified state instead */
1495 hammer_io_clear_error(&volume->io);
1496 hammer_io_clear_modify(&volume->io, 1);
1498 hammer_rel_volume(volume, 0);
1504 * Flush or discard recovered I/O buffers.
1506 * Clear any I/O error or modified condition when discarding buffers to
1507 * clean up the reference count, otherwise the buffer may have extra refs
1512 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
1514 int final = *(int *)data;
/* First pass: buffer still marked recovered -- flush or discard it */
1517 if (buffer->io.recovered) {
1518 buffer->io.recovered = 0;
1519 buffer->io.reclaim = 1;
/* NOTE(review): presumably the discard branch (final < 0) -- the gating
   conditional is not visible in this extraction */
1521 hammer_io_clear_error(&buffer->io);
1522 hammer_io_clear_modify(&buffer->io, 1);
/* Flush path: perform the write under the I/O write interlock */
1524 hammer_io_write_interlock(&buffer->io);
1525 hammer_io_flush(&buffer->io, 0);
1526 hammer_io_done_interlock(&buffer->io);
1528 hammer_rel_buffer(buffer, 0);
/* Second pass: no longer 'recovered'; reclaim any straggler refs */
1530 flush = hammer_ref_interlock(&buffer->io.lock);
1532 atomic_add_int(&hammer_count_refedbufs, 1);
1535 hammer_io_clear_error(&buffer->io);
1536 hammer_io_clear_modify(&buffer->io, 1);
1538 KKASSERT(hammer_oneref(&buffer->io.lock));
1539 buffer->io.reclaim = 1;
1540 hammer_rel_buffer(buffer, flush);
/* (function continues past this view) */