/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * UNDO ALGORITHM:
 *
 *      The UNDO algorithm is trivial.  The nominal UNDO range in the
 *      FIFO is determined by taking the first/next offset stored in
 *      the volume header.  The next offset may not be correct since
 *      UNDO flushes are not required to flush the volume header, so
 *      the code also scans forward until it finds a discontinuous
 *      sequence number.
 *
 *      The UNDOs are then scanned and executed in reverse order.  These
 *      UNDOs are effectively just data restorations based on HAMMER offsets.
 *
 * REDO ALGORITHM:
 *
 *      REDO records are laid down in the UNDO/REDO FIFO for nominal
 *      writes, truncations, and file extension ops.  On a per-inode
 *      basis two types of REDO records are generated, REDO_WRITE
 *      and REDO_TRUNC.
 *
 *      Essentially the recovery block will contain UNDO records backing
 *      out partial operations and REDO records to regenerate those partial
 *      operations guaranteed by the filesystem during recovery.
 *
 *      REDO generation is optional, and can also be started and then
 *      later stopped due to excessive write()s in between fsyncs, or not
 *      started at all.  Because of this the recovery code must determine
 *      when REDOs are valid and when they are not.  Additional records are
 *      generated to help figure it out.
 *
 *      The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
 *      during a flush cycle indicating which records the flush cycle
 *      has synched meta-data for, and HAMMER_REDO_SYNC is generated in
 *      each flush cycle to indicate how far back in the UNDO/REDO FIFO
 *      the recovery code must go to find the earliest applicable REDO
 *      record.  Applicable REDO records can be far outside the nominal
 *      UNDO recovery range, for example if a write() lays down a REDO but
 *      the related file is not flushed for several cycles.
 *
 *      The SYNC reference is to a point prior to the nominal UNDO FIFO
 *      range, creating an extended REDO range which must be scanned.
 *
 *      Any REDO_WRITE/REDO_TRUNC encountered within the extended range
 *      which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
 *      prior to the start of the nominal UNDO range are applicable.
 *      That is, any REDO_TERM_* records in the extended range but not in
 *      the nominal undo range will mask any redo operations for prior REDO
 *      records.  This is necessary because once the TERM is laid down
 *      followup operations may make additional changes to the related
 *      records but not necessarily record them as REDOs (because REDOs are
 *      optional).
 *
 *      REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
 *      must be ignored since they represent meta-data flushes which are
 *      undone by the UNDOs in that nominal UNDO range by the recovery
 *      code.  Only REDO_TERM_* records in the extended range but not
 *      in the nominal undo range are applicable.
 *
 *      The REDO_SYNC record itself always exists in the nominal UNDO range
 *      (this is how the extended range is determined).  For recovery
 *      purposes the most recent REDO_SYNC record is always used if several
 *      are found.
 *
 * CRASHES DURING UNDO/REDO
 *
 *      A crash during the UNDO phase requires no additional effort.  The
 *      UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
 *      remains unchanged and has no re-crash issues.
 *
 *      A crash during the REDO phase is more complex because the REDOs
 *      run normal filesystem ops and generate additional UNDO/REDO records.
 *      REDO is disabled during REDO recovery and any SYNC records generated
 *      by flushes during REDO recovery must continue to reference the
 *      original extended range.
 *
 *      If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
 *      may become impossible.  This is detected when the start of the
 *      extended range fails to have monotonically increasing sequence
 *      numbers leading into the nominal undo range.
 */
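
/*
 * Illustration of the ranges involved (hypothetical offsets and record
 * sizes, for orientation only):
 *
 *   ext_offset                        first_offset            last_offset
 *   |                                 |                       |
 *   [WRITE A][TERM A][WRITE B]........[UNDO][UNDO][REDO_SYNC->ext_offset]
 *   |<-------- extended range ------>|<--- nominal UNDO range --------->|
 *
 * During stage2 the TERM for A, found in the extended range, masks the
 * earlier WRITE A, so only WRITE B is re-executed.  A TERM inside the
 * nominal UNDO range would be ignored, since the meta-data flush it
 * represents is backed out again by the UNDOs.
 */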

#include "hammer.h"

/*
 * Specify the way we want to handle stage2 errors.
 *
 * The following values are accepted:
 *
 * 0 - Run redo recovery normally and fail to mount if
 *     the operation fails (default).
 * 1 - Run redo recovery, but don't fail to mount if the
 *     operation fails.
 * 2 - Completely skip redo recovery (only for severe error
 *     conditions and/or debugging).
 */
static int hammer_skip_redo = 0;
TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
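
/*
 * As a loader tunable this is normally set at boot time, e.g. via
 * /boot/loader.conf (illustrative):
 *
 *   vfs.hammer.skip_redo="2"
 */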

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
        struct hammer_rterm_entry *next;
        hammer_off_t            fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
        RB_ENTRY(hammer_rterm)  rb_node;
        int64_t                 redo_objid;
        uint32_t                redo_localization;
        uint32_t                redo_flags;
        hammer_off_t            redo_offset;
        hammer_rterm_entry_t    term_list;
} *hammer_rterm_t;

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
                        hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
                        hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
                        char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
                        hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *       hammer_mount structure yet, so we need to access the root volume's
 *       buffer directly.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t scan_offset_save;
        hammer_off_t bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        uint32_t seqno;
        int error;
        int degenerate_case = 0;

        /*
         * Examine the UNDO FIFO indices in the volume header.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
        buffer = NULL;
        error = 0;

        hmp->recover_stage2_offset = 0;

        if (first_offset > rootmap->alloc_offset ||
            last_offset > rootmap->alloc_offset) {
                hvkprintf(root_volume,
                        "Illegal UNDO FIFO index range "
                        "%016jx, %016jx limit %016jx\n",
                        (intmax_t)first_offset,
                        (intmax_t)last_offset,
                        (intmax_t)rootmap->alloc_offset);
                error = EIO;
                goto done;
        }

        /*
         * In HAMMER version 4+ filesystems the volume header does NOT
         * contain definitive UNDO FIFO state.  In particular, the
         * rootmap->next_offset may not be indexed completely to the
         * end of the active UNDO FIFO.
         */
        if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
                /*
                 * To find the definitive range we must first scan backwards
                 * from first_offset to locate the first real record and
                 * extract the sequence number from it.  This record is not
                 * part of the active undo space.
                 */
                scan_offset = first_offset;
                seqno = 0;

                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                seqno = head->head.hdr_seq;
                                break;
                        }
                }
                if (error) {
                        hvkprintf(root_volume,
                                "recovery failure during seqno backscan\n");
                        goto done;
                }

                /*
                 * Scan forwards from first_offset and (seqno+1) looking
                 * for a sequence space discontinuity.  This denotes the
                 * end of the active FIFO area.
                 *
                 * NOTE: For the case where the FIFO is empty the very first
                 *       record we find will be discontinuous.
                 *
                 * NOTE: Do not include trailing PADs in the scan range,
                 *       and remember the returned scan_offset after a
                 *       fwd iteration points to the end of the returned
                 *       record.
                 */
                hvkprintf(root_volume, "recovery check seqno=%08x\n", seqno);

                scan_offset = first_offset;
                scan_offset_save = scan_offset;
                ++seqno;
                hmp->recover_stage2_seqno = seqno;

                for (;;) {
                        head = hammer_recover_scan_fwd(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (seqno != head->head.hdr_seq) {
                                        scan_offset = scan_offset_save;
                                        break;
                                }
                                scan_offset_save = scan_offset;
                                ++seqno;
                        }

#if 0
                        /*
                         * If the forward scan is grossly ahead of last_offset
                         * then something is wrong.  last_offset is
                         * supposed to have been flushed out already.
                         */
                        if (last_offset >= scan_offset) {
                                bytes = last_offset - scan_offset;
                        } else {
                                bytes = rootmap->alloc_offset - scan_offset +
                                        (last_offset & HAMMER_OFF_LONG_MASK);
                        }
                        if (bytes >
                            (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
                            4 / 5) {
                                hvkprintf(root_volume,
                                        "recovery forward scan is "
                                        "grossly beyond the last_offset in "
                                        "the volume header, this can't be "
                                        "right.\n");
                                error = EIO;
                                break;
                        }
#endif
                }

                /*
                 * Store the seqno.  This will be the next seqno we lay down
                 * when generating new UNDOs.
                 */
                hmp->undo_seqno = seqno;
                if (error) {
                        hvkprintf(root_volume,
                                "recovery failure during seqno fwdscan\n");
                        goto done;
                }
                last_offset = scan_offset;
                hvkprintf(root_volume,
                        "recovery range %016jx-%016jx\n",
                        (intmax_t)first_offset,
                        (intmax_t)last_offset);
                hvkprintf(root_volume,
                        "recovery nexto %016jx endseqno=%08x\n",
                        (intmax_t)rootmap->next_offset,
                        seqno);
        }

        /*
         * Calculate the size of the active portion of the FIFO.  If the
         * FIFO is empty the filesystem is clean and no further action is
         * needed.
         */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
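        /*
         * Worked example of the wrap-around case (hypothetical sizes):
         * with a 128MB UNDO zone, first_offset 120MB into the zone and
         * last_offset wrapped around to 8MB, the active size is
         * (128MB - 120MB) + 8MB = 16MB.
         */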
        if (bytes == 0) {
                degenerate_case = 1;
                error = 0;
                goto done;
        }

        hvkprintf(root_volume,
                "recovery undo  %016jx-%016jx (%jd bytes)%s\n",
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : " (RW)"));
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                hkprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto done;
        }

        /*
         * Scan the UNDOs backwards.
         */
        scan_offset = last_offset;

        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != first_offset);
                head = hammer_recover_scan_rev(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                /*
                 * Normal UNDO
                 */
                error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error) {
                        hvkprintf(root_volume,
                                "UNDO record at %016jx failed\n",
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }

                /*
                 * The first REDO_SYNC record encountered (scanning backwards)
                 * enables REDO processing.
                 */
                if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
                    head->redo.redo_flags == HAMMER_REDO_SYNC) {
                        if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
                                hvkprintf(root_volume,
                                        "Ignoring extra REDO_SYNC "
                                        "records in UNDO/REDO FIFO.\n");
                        } else {
                                hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
                                hmp->recover_stage2_offset =
                                        head->redo.redo_offset;
                                hvkprintf(root_volume,
                                        "Found REDO_SYNC %016jx\n",
                                        (intmax_t)head->redo.redo_offset);
                        }
                }

                bytes -= head->head.hdr_size;

                /*
                 * If too many dirty buffers have built up we have to flush'm
                 * out.  As long as we do not flush out the volume header
                 * a crash here should not cause any problems.
                 *
                 * buffer must be released so the flush can assert that
                 * all buffers are idle.
                 */
                if (hammer_flusher_meta_limit(hmp)) {
                        if (buffer) {
                                hammer_rel_buffer(buffer, 0);
                                buffer = NULL;
                        }
                        if (hmp->ronly == 0) {
                                hammer_recover_flush_buffers(hmp, root_volume,
                                                             0);
                                hvkprintf(root_volume, "Continuing recovery\n");
                        } else {
                                hvkprintf(root_volume,
                                        "Recovery failure: "
                                        "Insufficient buffer cache to hold "
                                        "dirty buffers on read-only mount!\n");
                                error = EIO;
                                break;
                        }
                }
        }
        KKASSERT(error || bytes == 0);
done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * After completely flushing all the recovered buffers the volume
         * header will also be flushed.
         */
        if (root_volume->io.recovered == 0) {
                hammer_ref_volume(root_volume);
                root_volume->io.recovered = 1;
        }

        /*
         * Finish up flushing (or discarding) recovered buffers.  FIFO
         * indices in the volume header are updated to the actual undo
         * range but will not be collapsed until stage 2.
         */
        if (error == 0) {
                hammer_modify_volume_noundo(NULL, root_volume);
                rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
                rootmap->first_offset = first_offset;
                rootmap->next_offset = last_offset;
                hammer_modify_volume_done(root_volume);
                if (hmp->ronly == 0)
                        hammer_recover_flush_buffers(hmp, root_volume, 1);
        } else {
                hammer_recover_flush_buffers(hmp, root_volume, -1);
        }
        if (degenerate_case == 0) {
                hvkprintf(root_volume, "recovery complete\n");
        } else {
                hvkprintf(root_volume, "mounted clean, no recovery needed\n");
        }
        return (error);
}

/*
 * Execute redo operations
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.  It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t oscan_offset;
        hammer_off_t bytes;
        hammer_off_t ext_bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        hammer_off_t ext_offset;
        struct hammer_rterm_rb_tree rterm_root;
        uint32_t seqno;
        int error;
        int verbose = 0;
        int dorscan;

        /*
         * Stage 2 can only be run on a RW mount, or when the mount is
         * switched from RO to RW.
         */
        KKASSERT(hmp->ronly == 0);
        RB_INIT(&rterm_root);

        if (hammer_skip_redo == 1)
                hvkprintf(root_volume, "recovery redo marked as optional\n");

        if (hammer_skip_redo == 2) {
                hvkprintf(root_volume, "recovery redo skipped.\n");
                return (0);
        }

        /*
         * Examine the UNDO FIFO.  If it is empty the filesystem is clean
         * and no action need be taken.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
        if (first_offset == last_offset) {
                KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
                return(0);
        }

        /*
         * Stage2 must only be run once, and will not be run at all
         * if Stage1 did not find a REDO_SYNC record.
         */
        error = 0;
        buffer = NULL;

        if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
                goto done;
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
        hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
        ext_offset = hmp->recover_stage2_offset;
        if (ext_offset == 0) {
                hvkprintf(root_volume,
                        "REDO stage specified but no REDO_SYNC "
                        "offset, ignoring\n");
                goto done;
        }

        /*
         * Calculate nominal UNDO range (this is not yet the extended
         * range).
         */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
        hvkprintf(root_volume,
                "recovery redo  %016jx-%016jx (%jd bytes)%s\n",
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : " (RW)"));
        verbose = 1;
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                hkprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto fatal;
        }

        /*
         * Scan the REDOs backwards collecting REDO_TERM_* information.
         * This information is only collected for the extended range,
         * non-inclusive of any TERMs in the nominal UNDO range.
         *
         * If the stage2 extended range is inside the nominal undo range
         * we have nothing to scan.
         *
         * This must fit in memory!
         */
        if (first_offset < last_offset) {
                /*
                 * [      first_offset........last_offset      ]
                 */
                if (ext_offset < first_offset) {
                        dorscan = 1;
                        ext_bytes = first_offset - ext_offset;
                } else if (ext_offset > last_offset) {
                        dorscan = 1;
                        ext_bytes = (rootmap->alloc_offset - ext_offset) +
                                    (first_offset & HAMMER_OFF_LONG_MASK);
                } else {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                }
        } else {
                /*
                 * [......last_offset         first_offset.....]
                 */
                if (ext_offset < last_offset) {
                        ext_bytes = -((rootmap->alloc_offset - first_offset) +
                                    (ext_offset & HAMMER_OFF_LONG_MASK));
                        dorscan = 0;
                } else if (ext_offset > first_offset) {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                } else {
                        ext_bytes = first_offset - ext_offset;
                        dorscan = 1;
                }
        }
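
        /*
         * Recap of the cases above (E = ext_offset, illustrative only):
         * in the unwrapped layout, E outside [first_offset, last_offset]
         * means the extended range precedes the nominal range (possibly
         * wrapping) and must be rscanned.  In the wrapped layout only
         * last_offset <= E <= first_offset requires the rscan; otherwise
         * E already lies inside the nominal range and ext_bytes is
         * recorded as a negative overlap.
         */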

        if (dorscan) {
                scan_offset = first_offset;
                hvkprintf(root_volume,
                        "Find extended redo  %016jx, %jd extbytes\n",
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
                seqno = hmp->recover_stage2_seqno - 1;
                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (head->head.hdr_seq != seqno) {
                                        error = ERANGE;
                                        break;
                                }
                                error = hammer_recover_redo_rec(
                                                hmp, &rterm_root,
                                                scan_offset, &head->redo);
                                --seqno;
                        }
                        if (scan_offset == ext_offset)
                                break;
                }
                if (error) {
                        hvkprintf(root_volume,
                                "Find extended redo failed %d, "
                                "unable to run REDO\n",
                                error);
                        goto done;
                }
        } else {
                hvkprintf(root_volume,
                        "Embedded extended redo %016jx, %jd extbytes\n",
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
        }

        /*
         * Scan the REDO forwards through the entire extended range.
         * Anything with a previously recorded matching TERM is discarded.
         */
        scan_offset = ext_offset;
        bytes += ext_bytes;

        /*
         * NOTE: when doing a forward scan the returned scan_offset is
         *       for the record following the returned record, so we
         *       have to play a bit.
         */
        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != last_offset);

                oscan_offset = scan_offset;
                head = hammer_recover_scan_fwd(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                error = hammer_recover_redo_run(hmp, &rterm_root,
                                                oscan_offset, &head->redo);
                if (error) {
                        hvkprintf(root_volume,
                                "UNDO record at %016jx failed\n",
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }
                bytes -= head->head.hdr_size;
        }
        KKASSERT(error || bytes == 0);

done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * Cleanup rterm tree
         */
        {
                hammer_rterm_t rterm;
                hammer_rterm_entry_t rte;

                while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
                        RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
                        while ((rte = rterm->term_list) != NULL) {
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                        kfree(rterm, hmp->m_misc);
                }
        }

        /*
         * Finish up flushing (or discarding) recovered buffers by executing
         * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
         * case tests and forces the flush in order to update the FIFO indices.
         *
         * If a crash occurs during the flush the entire undo/redo will be
         * re-run during recovery on the next mount.
         */
        if (error == 0) {
                if (rootmap->first_offset != rootmap->next_offset)
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                hammer_flusher_sync(hmp);
        }
fatal:
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
        if (verbose) {
                hvkprintf(root_volume, "End redo recovery\n");
        }

        if (error && hammer_skip_redo == 1)
                hvkprintf(root_volume,
                        "recovery redo error %d, skipping.\n",
                        error);

        return (hammer_skip_redo ? 0 : error);
}

/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;
        hammer_fifo_tail_t tail;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                hdkprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
                scan_offset = rootmap->alloc_offset;
        if (scan_offset - sizeof(*tail) <
            HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
                hvkprintf(root_volume,
                        "UNDO record at %016jx FIFO underflow\n",
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
                            errorp, bufferp);
        if (*errorp) {
                hvkprintf(root_volume,
                        "Unable to read UNDO TAIL at %016jx\n",
                        (intmax_t)scan_offset - sizeof(*tail));
                return (NULL);
        }

        if (hammer_check_tail_signature(tail, scan_offset) != 0) {
                hvkprintf(root_volume,
                        "Illegal UNDO TAIL signature at %016jx\n",
                        (intmax_t)scan_offset - sizeof(*tail));
                *errorp = EIO;
                return (NULL);
        }
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        *scan_offsetp = scan_offset - head->head.hdr_size;

        return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                hdkprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);

        head = hammer_bread(hmp, scan_offset, errorp, bufferp);
        if (*errorp) {
                hvkprintf(root_volume,
                        "Unable to read UNDO HEAD at %016jx\n",
                        (intmax_t)scan_offset);
                return (NULL);
        }

        if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
                hvkprintf(root_volume,
                        "Illegal UNDO HEAD signature at %016jx\n",
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        scan_offset += head->head.hdr_size;
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
        *scan_offsetp = scan_offset;

        return (head);
}

/*
 * Helper function for hammer_check_{head,tail}_signature().  Check stuff
 * once the head and tail have been established.
 *
 * This function validates the entire FIFO record wrapper.
 */
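/*
 * On-media layout of a FIFO record (sizes are illustrative; see the
 * hammer_fifo_head/hammer_fifo_tail definitions for the authoritative
 * layout):
 *
 *   beg_off                                              end_off
 *   |                                                    |
 *   [ hammer_fifo_head | payload ... | hammer_fifo_tail ]
 *
 * head->hdr_size and tail->tail_size both cover the whole record,
 * which is what lets the scan code walk the FIFO in either direction.
 */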
static __inline
int
_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
                        hammer_off_t beg_off)
{
        hammer_off_t end_off;
        uint32_t crc;
        int bytes;

        /*
         * Check signatures.  The tail signature is allowed to be the
         * head signature only for 8-byte PADs.
         */
        if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
                hkprintf("FIFO record bad head signature %04x at %016jx\n",
                        head->hdr_signature,
                        (intmax_t)beg_off);
                return(2);
        }
        if (head->hdr_size < HAMMER_HEAD_ALIGN ||
            (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
                hkprintf("FIFO record unaligned or bad size %04x at %016jx\n",
                        head->hdr_size,
                        (intmax_t)beg_off);
                return(2);
        }
        end_off = beg_off + head->hdr_size;

        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
            (size_t)(end_off - beg_off) != sizeof(*tail)) {
                if (head->hdr_type != tail->tail_type) {
                        hkprintf("FIFO record head/tail type mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_type, tail->tail_type,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (head->hdr_size != tail->tail_size) {
                        hkprintf("FIFO record head/tail size mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_size, tail->tail_size,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
                        hkprintf("FIFO record bad tail signature "
                                "%04x at %016jx\n",
                                tail->tail_signature,
                                (intmax_t)beg_off);
                        return(3);
                }
        }

        /*
         * Non-PAD records must have a CRC and must be sized at
         * least large enough to fit the head and tail.
         */
        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
                crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
                      crc32(head + 1, head->hdr_size - sizeof(*head));
                if (head->hdr_crc != crc) {
                        hkprintf("FIFO record CRC failed %08x %08x at %016jx\n",
                                head->hdr_crc, crc,
                                (intmax_t)beg_off);
                        return(EIO);
                }
                if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
                        hkprintf("FIFO record too small %04x at %016jx\n",
                                head->hdr_size,
                                (intmax_t)beg_off);
                        return(EIO);
                }
        }

        /*
         * Check the tail
         */
        bytes = head->hdr_size;
        tail = (void *)((char *)head + bytes - sizeof(*tail));
        if (tail->tail_size != head->hdr_size) {
                hkprintf("Bad tail size %04x vs %04x at %016jx\n",
                        tail->tail_size, head->hdr_size,
                        (intmax_t)beg_off);
                return(EIO);
        }
        if (tail->tail_type != head->hdr_type) {
                hkprintf("Bad tail type %04x vs %04x at %016jx\n",
                        tail->tail_type, head->hdr_type,
                        (intmax_t)beg_off);
                return(EIO);
        }

        return(0);
}

/*
 * Check that the FIFO record is in-bounds given the head and the
 * hammer offset.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
        hammer_fifo_tail_t tail;
        hammer_off_t end_off;

        /*
         * head overlaps buffer boundary.  This could be a PAD so only
         * check the minimum PAD size here.
         */
        if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
                return(1);
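
        /*
         * Note: (a ^ b) & ~HAMMER_BUFMASK64 is non-zero exactly when
         * offsets a and b fall in different HAMMER_BUFSIZE buffers.
         * e.g. assuming 16KiB buffers, 0x3ff8 ^ 0x4000 = 0x7ff8 and
         * 0x7ff8 & ~0x3fff = 0x4000 != 0, so that span would cross a
         * buffer boundary.
         */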

        /*
         * Calculate the ending offset and make sure the record does
         * not cross a buffer boundary.
         */
        end_off = beg_off + head->hdr_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
        return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Check that the FIFO record is in-bounds given the tail and the
 * hammer offset.  The offset is pointing at the ending boundary of the
 * record.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
        hammer_fifo_head_t head;
        hammer_off_t beg_off;

        /*
         * tail overlaps buffer boundary
         */
        if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);

        /*
         * Calculate the beginning offset and make sure the record does
         * not cross a buffer boundary.
         */
        beg_off = end_off - tail->tail_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        return (_hammer_check_signature(head, tail, beg_off));
}

static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                    hammer_fifo_undo_t undo)
{
        hammer_volume_t volume;
        hammer_buffer_t buffer;
        hammer_off_t buf_offset;
        int zone;
        int error;
        int vol_no;
        int bytes;
        uint32_t offset;

        /*
         * Only process UNDO records.  Flag if we find other records to
         * optimize stage2 recovery.
         */
        if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
                return(0);

        /*
         * Validate the UNDO record.
         */
        bytes = undo->head.hdr_size - sizeof(*undo) -
                sizeof(struct hammer_fifo_tail);
        if (bytes < 0 || undo->undo_data_bytes < 0 ||
            undo->undo_data_bytes > bytes) {
                hkprintf("Corrupt UNDO record, undo_data_bytes %d/%d\n",
                        undo->undo_data_bytes, bytes);
                return(EIO);
        }

        bytes = undo->undo_data_bytes;

        /*
         * The undo offset may only be a zone-1 or zone-2 offset.
         *
         * Currently we only support a zone-1 offset representing the
         * volume header.
         */
        zone = HAMMER_ZONE_DECODE(undo->undo_offset);
        offset = undo->undo_offset & HAMMER_BUFMASK;

        if (offset + bytes > HAMMER_BUFSIZE) {
                hkprintf("Corrupt UNDO record, bad offset\n");
                return (EIO);
        }
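
        /*
         * For orientation (hypothetical values): a zone-2 UNDO with
         * undo_offset 0x2000000000004200 and undo_data_bytes 64 restores
         * bytes 0x200-0x23f of the 16K buffer at ...4000, using the
         * pre-image stored directly after the record header.
         */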

        switch(zone) {
        case HAMMER_ZONE_RAW_VOLUME_INDEX:
                vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
                volume = hammer_get_volume(hmp, vol_no, &error);
                if (volume == NULL) {
                        hkprintf("UNDO record, cannot access volume %d\n",
                                vol_no);
                        break;
                }
                hammer_modify_volume_noundo(NULL, volume);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)volume->ondisk + offset,
                                         bytes);
                hammer_modify_volume_done(volume);

                /*
                 * Multiple modifications may be made to the same buffer.
                 * Also, the volume header cannot be written out until
                 * everything else has been flushed.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (volume->io.recovered == 0)
                        volume->io.recovered = 1;
                else
                        hammer_rel_volume(volume, 0);
                break;
        case HAMMER_ZONE_RAW_BUFFER_INDEX:
                buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
                buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
                                           0, &error);
                if (buffer == NULL) {
                        hkprintf("UNDO record, cannot access buffer %016jx\n",
                                (intmax_t)undo->undo_offset);
                        break;
                }
                hammer_modify_buffer_noundo(NULL, buffer);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)buffer->ondisk + offset,
                                         bytes);
                hammer_modify_buffer_done(buffer);

                /*
                 * Multiple modifications may be made to the same buffer,
                 * improve performance by delaying the flush.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (buffer->io.recovered == 0)
                        buffer->io.recovered = 1;
                else
                        hammer_rel_buffer(buffer, 0);
                break;
        default:
                hkprintf("Corrupt UNDO record\n");
                error = EIO;
        }
        return (error);
}

static void
hammer_recover_copy_undo(hammer_off_t undo_offset,
                         char *src, char *dst, int bytes)
{
        if (hammer_debug_general & 0x0080) {
                hdkprintf("UNDO %016jx: %d\n",
                        (intmax_t)undo_offset, bytes);
        }
#if 0
        hkprintf("UNDO %016jx:", (intmax_t)undo_offset);
        hammer_recover_debug_dump(22, dst, bytes);
        kprintf("%22s", "to:");
        hammer_recover_debug_dump(22, src, bytes);
#endif
        bcopy(src, dst, bytes);
}

/*
 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
 * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
 * does not include the nominal UNDO range, just the extended range.
 */
int
hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        hammer_rterm_t rterm;
        hammer_rterm_t nrterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);
        if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
            redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                return(0);
        }

        nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
        nrterm->redo_objid = redo->redo_objid;
        nrterm->redo_localization = redo->redo_localization;
        nrterm->redo_flags = redo->redo_flags;
        nrterm->redo_offset = redo->redo_offset;

        rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
        if (rterm)
                kfree(nrterm, hmp->m_misc);
        else
                rterm = nrterm;

        if (bootverbose) {
                hkprintf("record record %016jx objid %016jx "
                        "offset %016jx flags %08x\n",
                        (intmax_t)scan_offset,
                        (intmax_t)redo->redo_objid,
                        (intmax_t)redo->redo_offset,
                        (int)redo->redo_flags);
        }

        /*
         * Scan in reverse order, rte prepended, so the rte list will be
         * in forward order.
         */
        rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
        rte->fifo_offset = scan_offset;
        rte->next = rterm->term_list;
        rterm->term_list = rte;

        return(0);
}
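
/*
 * Sketch of the in-memory result of the backwards scan (hypothetical
 * values): one rterm node per (objid, localization, flags[, offset])
 * key, whose term_list holds the FIFO offsets of its TERM records in
 * forward order:
 *
 *   rterm {objid 0x10, TERM_WRITE, offset 0}
 *       term_list: 0x...2000 -> 0x...6000
 *
 * The forward scan in hammer_recover_redo_run() then consumes these
 * entries front-to-back as it encounters the matching TERM records.
 */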
1210
1211 /*
1212  * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1213  * the forwards scan of the entire extended UNDO/REDO FIFO range.
1214  *
1215  * Records matching previously recorded TERMs have already been committed
1216  * and are ignored.
1217  */
1218 int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        struct hammer_rterm rtval;
        hammer_rterm_t rterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
        case HAMMER_REDO_TRUNC:
                /*
                 * We hit a REDO request.  The REDO is executed only if
                 * no matching TERM remains recorded in the rterm tree.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_offset = redo->redo_offset;
                rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
                                   HAMMER_REDO_TERM_WRITE :
                                   HAMMER_REDO_TERM_TRUNC;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if (bootverbose) {
                                hkprintf("ignore record %016jx objid %016jx "
                                        "offset %016jx flags %08x\n",
                                        (intmax_t)scan_offset,
                                        (intmax_t)redo->redo_objid,
                                        (intmax_t)redo->redo_offset,
                                        (int)redo->redo_flags);
                        }
                        break;
                }
                if (bootverbose) {
                        hkprintf("run    record %016jx objid %016jx "
                                "offset %016jx flags %08x\n",
                                (intmax_t)scan_offset,
                                (intmax_t)redo->redo_objid,
                                (intmax_t)redo->redo_offset,
                                (int)redo->redo_flags);
                }

                /*
                 * REDO stage2 can access a live filesystem; the exec
                 * code acquires the required vnode itself.
                 */
                hammer_recover_redo_exec(hmp, redo);
                break;
        case HAMMER_REDO_TERM_WRITE:
        case HAMMER_REDO_TERM_TRUNC:
                /*
                 * As we encounter TERMs in the forward scan we remove
                 * them.  Once the forward scan hits the nominal UNDO range
                 * there will be no more recorded TERMs.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_flags = redo->redo_flags;
                rtval.redo_offset = redo->redo_offset;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if ((rte = rterm->term_list) != NULL) {
                                KKASSERT(rte->fifo_offset == scan_offset);
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                }
                break;
        }
        return(0);
}
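
/*
 * Illustrative sketch only (hypothetical helper, not used by the
 * recovery code): the TERM lookup above in condensed form.  A WRITE
 * or TRUNC record is replayed only when this lookup fails.
 */
#if 0

static int
hammer_redo_is_cancelled(struct hammer_rterm_rb_tree *root,
                         hammer_fifo_redo_t redo)
{
        struct hammer_rterm rtval;

        /* construct the TERM key matching this REDO record */
        bzero(&rtval, sizeof(rtval));
        rtval.redo_objid = redo->redo_objid;
        rtval.redo_localization = redo->redo_localization;
        rtval.redo_offset = redo->redo_offset;
        rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
                           HAMMER_REDO_TERM_WRITE :
                           HAMMER_REDO_TERM_TRUNC;
        return(RB_FIND(hammer_rterm_rb_tree, root, &rtval) != NULL);
}

#endif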

static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
        struct hammer_transaction trans;
        struct vattr va;
        struct hammer_inode *ip;
        struct vnode *vp = NULL;
        int error;

        hammer_start_transaction(&trans, hmp);

        ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
                              HAMMER_MAX_TID, redo->redo_localization,
                              0, &error);
        if (ip == NULL) {
                hkprintf("unable to find objid %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done2;
        }
        error = hammer_get_vnode(ip, &vp);
        if (error) {
                hkprintf("unable to acquire vnode for %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done1;
        }

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
                error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
                if (error) {
                        hkprintf("vn_rdwr open %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                        break;
                }
                vn_unlock(vp);
                error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
                                redo->redo_data_bytes,
                                redo->redo_offset, UIO_SYSSPACE,
                                0, proc0.p_ucred, NULL);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                if (error) {
                        hkprintf("write %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                }
                VOP_CLOSE(vp, FREAD|FWRITE, NULL);
                break;
        case HAMMER_REDO_TRUNC:
                VATTR_NULL(&va);
                va.va_size = redo->redo_offset;
                error = VOP_SETATTR(vp, &va, proc0.p_ucred);
                if (error) {
                        hkprintf("setattr offset %016jx error %d\n",
                                (intmax_t)redo->redo_offset, error);
                }
                break;
        }
        vput(vp);
done1:
        hammer_rel_inode(ip, 0);
done2:
        hammer_done_transaction(&trans);
}
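
/*
 * Illustrative sketch only (hypothetical accessor): a REDO_WRITE's
 * payload is stored inline, immediately after the fixed-size FIFO
 * redo header, which is why the exec code above hands (redo + 1)
 * and redo_data_bytes straight to vn_rdwr().
 */
#if 0

static __inline void *
hammer_redo_payload(hammer_fifo_redo_t redo)
{
        return((void *)(redo + 1));
}

#endif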

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset, so all TERM_TRUNCs for an inode map to a single tree
 * node, while TERM_WRITE keys include the offset.  Repeats of the
 * same key, e.g.
 *
 *      WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
 *
 * are handled by chaining the TERMs on the node's term_list.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
        if (rt1->redo_objid < rt2->redo_objid)
                return(-1);
        if (rt1->redo_objid > rt2->redo_objid)
                return(1);
        if (rt1->redo_localization < rt2->redo_localization)
                return(-1);
        if (rt1->redo_localization > rt2->redo_localization)
                return(1);
        if (rt1->redo_flags < rt2->redo_flags)
                return(-1);
        if (rt1->redo_flags > rt2->redo_flags)
                return(1);
        if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                if (rt1->redo_offset < rt2->redo_offset)
                        return(-1);
                if (rt1->redo_offset > rt2->redo_offset)
                        return(1);
        }
        return(0);
}
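
/*
 * Illustrative sketch only (hypothetical, compiled out): two
 * TERM_TRUNC keys for the same inode compare equal even when their
 * offsets differ, which is what lets one recorded truncation cancel
 * a queued REDO_TRUNC at any offset.
 */
#if 0

static int
hammer_rterm_trunc_cmp_demo(void)
{
        struct hammer_rterm rt1;
        struct hammer_rterm rt2;

        bzero(&rt1, sizeof(rt1));
        bzero(&rt2, sizeof(rt2));
        rt1.redo_flags = rt2.redo_flags = HAMMER_REDO_TERM_TRUNC;
        rt1.redo_offset = 4096;         /* offsets intentionally differ */
        rt2.redo_offset = 8192;

        return(hammer_rterm_rb_cmp(&rt1, &rt2));        /* returns 0 */
}

#endif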

#if 0

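/*
 * Debug helper (normally compiled out): hex-dump 'bytes' bytes from
 * 'buf', 16 per line, indenting continuation lines by 'w' columns.
 */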
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
        int i;

        for (i = 0; i < bytes; ++i) {
                if (i && (i & 15) == 0)
                        kprintf("\n%*.*s", w, w, "");
                kprintf(" %02x", (unsigned char)buf[i]);
        }
        kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written, and root_volume may then be passed as NULL.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
                             int final)
{
        /*
         * Flush the buffers out asynchronously, wait for all the I/O to
         * complete, then do it again to destroy the underlying buffer
         * cache buffers so they cannot alias anything later on.
         */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
        hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);

        /*
         * Flush all volume headers except the root volume.  If final < 0
         * we discard all volume headers including the root volume.
         */
        if (final >= 0) {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, root_volume);
        } else {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, NULL);
        }

        /*
         * Finalize the root volume header.
         *
         * No interlock is needed; volume buffers are not
         * messed with by bioops.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
                hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
                hammer_io_wait_all(hmp, "hmrfly", 1);
        }
}
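
/*
 * Illustrative sketch only (hypothetical call sites, shown
 * back-to-back purely for illustration): the three 'final' modes
 * described above.
 */
#if 0

static void
hammer_recover_flush_example(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        /* write recovered buffers but hold back the volume header */
        hammer_recover_flush_buffers(hmp, root_volume, 0);

        /* commit fully, writing the root volume header last */
        hammer_recover_flush_buffers(hmp, root_volume, 1);

        /* or discard instead, e.g. tearing down a read-only mount */
        hammer_recover_flush_buffers(hmp, NULL, -1);
}

#endif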

/*
 * Callback to flush volume headers.  If discarding, data will be NULL
 * and all volume headers (including the root volume) are discarded.
 * Otherwise data is the root_volume and we flush all volume headers
 * EXCEPT the root_volume.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count; otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
{
        hammer_volume_t root_volume = data;

        if (volume->io.recovered && volume != root_volume) {
                volume->io.recovered = 0;
                if (root_volume != NULL) {
                        /*
                         * No interlock is needed; volume buffers are not
                         * messed with by bioops.
                         */
                        hammer_io_flush(&volume->io, 0);
                } else {
                        hammer_io_clear_error(&volume->io);
                        hammer_io_clear_modify(&volume->io, 1);
                }
                hammer_rel_volume(volume, 0);
        }
        return(0);
}

/*
 * Flush or discard recovered I/O buffers.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count; otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
{
        int final = *(int *)data;
        int flush;

        if (buffer->io.recovered) {
                buffer->io.recovered = 0;
                buffer->io.reclaim = 1;
                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                } else {
                        hammer_io_write_interlock(&buffer->io);
                        hammer_io_flush(&buffer->io, 0);
                        hammer_io_done_interlock(&buffer->io);
                }
                hammer_rel_buffer(buffer, 0);
        } else {
                flush = hammer_ref_interlock(&buffer->io.lock);
                if (flush)
                        atomic_add_int(&hammer_count_refedbufs, 1);

                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                }
                KKASSERT(hammer_oneref(&buffer->io.lock));
                buffer->io.reclaim = 1;
                hammer_rel_buffer(buffer, flush);
        }
        return(0);
}