sys/vfs/hammer: Adjust and cleanup _KERNEL
sys/vfs/hammer/hammer_recover.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 /*
36  * UNDO ALGORITHM:
37  *
38  *      The UNDO algorithm is trivial.  The nominal UNDO range in the
39  *      FIFO is determined by taking the first/next offset stored in
40  *      the volume header.  The next offset may not be correct since
41  *      UNDO flushes are not required to flush the volume header, so
42  *      the code also scans forward until it finds a discontinuous
43  *      sequence number.
44  *
45  *      The UNDOs are then scanned and executed in reverse order.  These
46  *      UNDOs are effectively just data restorations based on HAMMER offsets.
47  *
48  * REDO ALGORITHM:
49  *
50  *      REDO records are laid down in the UNDO/REDO FIFO for nominal
51  *      writes, truncations, and file extension ops.  On a per-inode
52  *      basis two types of REDO records are generated, REDO_WRITE
53  *      and REDO_TRUNC.
54  *
55  *      Essentially the recovery block will contain UNDO records backing
56  *      out partial operations and REDO records to regenerate those partial
57  *      operations guaranteed by the filesystem during recovery.
58  *
59  *      REDO generation is optional, and can also be started and then
60  *      later stopped due to excessive write()s in between fsyncs, or not
61  *      started at all.  Because of this the recovery code must determine
62  *      when REDOs are valid and when they are not.  Additional records are
63  *      generated to help figure it out.
64  *
65  *      The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66  *      during a flush cycle indicating which records the flush cycle
67  *      has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68  *      each flush cycle to indicate how far back in the UNDO/REDO FIFO
69  *      the recovery code must go to find the earliest applicable REDO
70  *      record.  Applicable REDO records can be far outside the nominal
71  *      UNDO recovery range, for example if a write() lays down a REDO but
72  *      the related file is not flushed for several cycles.
73  *
74  *      The SYNC reference is to a point prior to the nominal UNDO FIFO
75  *      range, creating an extended REDO range which must be scanned.
76  *
77  *      Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78  *      which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79  *      prior to the start of the nominal UNDO range are applicable.
80  *      That is, any REDO_TERM_* records in the extended range but not in
81  *      the nominal undo range will mask any redo operations for prior REDO
82  *      records.  This is necessary because once the TERM is laid down
83  *      follow-up operations may make additional changes to the related
84  *      records but not necessarily record them as REDOs (because REDOs are
85  *      optional).
86  *
87  *      REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88  *      must be ignored since they represent meta-data flushes which are
89  *      undone by the UNDOs in that nominal UNDO range by the recovery
90  *      code.  Only REDO_TERM_* records in the extended range but not
91  *      in the nominal undo range are applicable.
92  *
93  *      The REDO_SYNC record itself always exists in the nominal UNDO range
94  *      (this is how the extended range is determined).  For recovery
95  *      purposes the most recent REDO_SYNC record is always used if several
96  *      are found.
97  *
98  * CRASHES DURING UNDO/REDO
99  *
100  *      A crash during the UNDO phase requires no additional effort.  The
101  *      UNDOs will simply be re-run.  The state of the UNDO/REDO FIFO
102  *      remains unchanged and has no re-crash issues.
103  *
104  *      A crash during the REDO phase is more complex because the REDOs
105  *      run normal filesystem ops and generate additional UNDO/REDO records.
106  *      REDO is disabled during REDO recovery and any SYNC records generated
107  *      by flushes during REDO recovery must continue to reference the
108  *      original extended range.
109  *
110  *      If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111  *      may become impossible.  This is detected when the start of the
112  *      extended range fails to have monotonically increasing sequence
113  *      numbers leading into the nominal undo range.
114  */
115
116 #include "hammer.h"
117
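/*
 * A condensed sketch of the UNDO pass described above: walk the active
 * FIFO range backwards and apply each UNDO record.  The real loop in
 * hammer_recover_stage1() below adds REDO_SYNC detection and periodic
 * dirty-buffer flushing; this is illustration only, hence the #if 0.
 */
#if 0
static int
hammer_recover_undo_pass(hammer_mount_t hmp, hammer_volume_t root_volume,
                         hammer_off_t last_offset, hammer_off_t bytes)
{
        hammer_buffer_t buffer = NULL;
        hammer_off_t scan_offset = last_offset;
        hammer_fifo_any_t head;
        int error = 0;

        while ((int64_t)bytes > 0) {
                head = hammer_recover_scan_rev(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;
                error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error)
                        break;
                bytes -= head->head.hdr_size;
        }
        if (buffer)
                hammer_rel_buffer(buffer, 0);
        return (error);
}
#endif
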
118 /*
119  * Specify the way we want to handle stage2 errors.
120  *
121  * The following values are accepted:
122  *
123  * 0 - Run redo recovery normally and fail to mount if
124  *     the operation fails (default).
125  * 1 - Run redo recovery, but don't fail to mount if the
126  *     operation fails.
127  * 2 - Completely skip redo recovery (only for severe error
128  *     conditions and/or debugging).
129  */
130 static int hammer_skip_redo = 0;
131 TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
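
/*
 * As with other TUNABLE_INT knobs, this can be preset from the boot
 * loader, e.g. in /boot/loader.conf:
 *
 *      vfs.hammer.skip_redo="2"
 */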
132
133 /*
134  * Each rterm entry has a list of fifo offsets indicating termination
135  * points.  These are stripped as the scan progresses.
136  */
137 typedef struct hammer_rterm_entry {
138         struct hammer_rterm_entry *next;
139         hammer_off_t            fifo_offset;
140 } *hammer_rterm_entry_t;
141
142 /*
143  * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
144  * TRUNC entries ignore the offset.
145  */
146 typedef struct hammer_rterm {
147         RB_ENTRY(hammer_rterm)  rb_node;
148         int64_t                 redo_objid;
149         u_int32_t               redo_localization;
150         u_int32_t               redo_flags;
151         hammer_off_t            redo_offset;
152         hammer_rterm_entry_t    term_list;
153 } *hammer_rterm_t;
154
155 static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
156 struct hammer_rterm_rb_tree;
157 RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
158 RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
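
/*
 * The comparator must implement the ordering described above.  A sketch
 * of the expected logic follows; the actual hammer_rterm_rb_cmp() is
 * defined later in this file and may differ in detail.
 */
#if 0
static int
hammer_rterm_rb_cmp_sketch(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
        if (rt1->redo_objid < rt2->redo_objid)
                return(-1);
        if (rt1->redo_objid > rt2->redo_objid)
                return(1);
        if (rt1->redo_localization < rt2->redo_localization)
                return(-1);
        if (rt1->redo_localization > rt2->redo_localization)
                return(1);
        if (rt1->redo_flags < rt2->redo_flags)
                return(-1);
        if (rt1->redo_flags > rt2->redo_flags)
                return(1);
        if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                /* TRUNC entries ignore the offset */
                if (rt1->redo_offset < rt2->redo_offset)
                        return(-1);
                if (rt1->redo_offset > rt2->redo_offset)
                        return(1);
        }
        return(0);
}
#endif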
159
160 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
161                         hammer_off_t end_off);
162 static int hammer_check_head_signature(hammer_fifo_head_t head,
163                         hammer_off_t beg_off);
164 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
165                         char *src, char *dst, int bytes);
166 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
167                         hammer_volume_t root_volume,
168                         hammer_off_t *scan_offsetp,
169                         int *errorp, struct hammer_buffer **bufferp);
170 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
171                         hammer_volume_t root_volume,
172                         hammer_off_t *scan_offsetp,
173                         int *errorp, struct hammer_buffer **bufferp);
174 #if 0
175 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
176 #endif
177 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
178                         hammer_fifo_undo_t undo);
179 static int hammer_recover_redo_rec(hammer_mount_t hmp,
180                         struct hammer_rterm_rb_tree *root,
181                         hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
182 static int hammer_recover_redo_run(hammer_mount_t hmp,
183                         struct hammer_rterm_rb_tree *root,
184                         hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
185 static void hammer_recover_redo_exec(hammer_mount_t hmp,
186                         hammer_fifo_redo_t redo);
187
188 RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
189
190 /*
191  * Recover filesystem meta-data on mount.  This procedure figures out the
192  * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
193  * resynchronized by this procedure.
194  *
195  * This procedure is run near the beginning of the mount sequence, before
196  * any B-Tree or high-level accesses are enabled, and is responsible for
197  * restoring the meta-data to a consistent state.  High level HAMMER data
198  * structures (such as the B-Tree) cannot be accessed here.
199  *
200  * NOTE: No information from the root volume has been cached in the
201  *       hammer_mount structure yet, so we need to access the root volume's
202  *       buffer directly.
203  *
204  * NOTE:
205  */
206 int
207 hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
208 {
209         hammer_blockmap_t rootmap;
210         hammer_buffer_t buffer;
211         hammer_off_t scan_offset;
212         hammer_off_t scan_offset_save;
213         hammer_off_t bytes;
214         hammer_fifo_any_t head;
215         hammer_off_t first_offset;
216         hammer_off_t last_offset;
217         u_int32_t seqno;
218         int error;
219         int degenerate_case = 0;
220
221         /*
222          * Examine the UNDO FIFO indices in the volume header.
223          */
224         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
225         first_offset = rootmap->first_offset;
226         last_offset  = rootmap->next_offset;
227         buffer = NULL;
228         error = 0;
229
230         hmp->recover_stage2_offset = 0;
231
232         if (first_offset > rootmap->alloc_offset ||
233             last_offset > rootmap->alloc_offset) {
234                 kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
235                         "%016jx, %016jx limit %016jx\n",
236                         root_volume->ondisk->vol_name,
237                         (intmax_t)first_offset,
238                         (intmax_t)last_offset,
239                         (intmax_t)rootmap->alloc_offset);
240                 error = EIO;
241                 goto done;
242         }
243
244         /*
245          * In HAMMER version 4+ filesystems the volume header does NOT
246          * contain definitive UNDO FIFO state.  In particular, the
247          * rootmap->next_offset may not be indexed completely to the
248          * end of the active UNDO FIFO.
249          */
250         if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
251                 /*
252                  * To find the definitive range we must first scan backwards
253                  * from first_offset to locate the first real record and
254                  * extract the sequence number from it.  This record is not
255                  * part of the active undo space.
256                  */
257                 scan_offset = first_offset;
258                 seqno = 0;
259
260                 for (;;) {
261                         head = hammer_recover_scan_rev(hmp, root_volume,
262                                                        &scan_offset,
263                                                        &error, &buffer);
264                         if (error)
265                                 break;
266                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
267                                 seqno = head->head.hdr_seq;
268                                 break;
269                         }
270                 }
271                 if (error) {
272                         kprintf("HAMMER(%s) recovery failure "
273                                 "during seqno backscan\n",
274                                 root_volume->ondisk->vol_name);
275                         goto done;
276                 }
277
278                 /*
279                  * Scan forwards from first_offset and (seqno+1) looking
280                  * for a sequence space discontinuity.  This denotes the
281                  * end of the active FIFO area.
282                  *
283                  * NOTE: For the case where the FIFO is empty the very first
284                  *       record we find will be discontinuous.
285                  *
286                  * NOTE: Do not include trailing PADs in the scan range,
287                  *       and remember that the scan_offset returned by a
288                  *       forward iteration points to the end of the returned
289                  *       record.
290                  */
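                /*
                 * Worked example: if the backscan found seqno 104, the
                 * forward scan starts expecting 105.  Reading 105, 106,
                 * 107 and then, say, 58 stops the scan, and last_offset
                 * ends up just past the record carrying seqno 107.
                 */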
291                 kprintf("HAMMER(%s) recovery check seqno=%08x\n",
292                         root_volume->ondisk->vol_name,
293                         seqno);
294
295                 scan_offset = first_offset;
296                 scan_offset_save = scan_offset;
297                 ++seqno;
298                 hmp->recover_stage2_seqno = seqno;
299
300                 for (;;) {
301                         head = hammer_recover_scan_fwd(hmp, root_volume,
302                                                        &scan_offset,
303                                                        &error, &buffer);
304                         if (error)
305                                 break;
306                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
307                                 if (seqno != head->head.hdr_seq) {
308                                         scan_offset = scan_offset_save;
309                                         break;
310                                 }
311                                 scan_offset_save = scan_offset;
312                                 ++seqno;
313                         }
314
315 #if 0
316                         /*
317                          * If the forward scan is grossly ahead of last_offset
318                          * then something is wrong.  last_offset is supposed
319                          * to have been flushed out to the media.
320                          */
321                         if (last_offset >= scan_offset) {
322                                 bytes = last_offset - scan_offset;
323                         } else {
324                                 bytes = rootmap->alloc_offset - scan_offset +
325                                         (last_offset & HAMMER_OFF_LONG_MASK);
326                         }
327                         if (bytes >
328                             (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
329                             4 / 5) {
330                                 kprintf("HAMMER(%s) recovery forward scan is "
331                                         "grossly beyond the last_offset in "
332                                         "the volume header, this can't be "
333                                         "right.\n",
334                                         root_volume->ondisk->vol_name);
335                                 error = EIO;
336                                 break;
337                         }
338 #endif
339                 }
340
341                 /*
342                  * Store the seqno.  This will be the next seqno we lay down
343                  * when generating new UNDOs.
344                  */
345                 hmp->undo_seqno = seqno;
346                 if (error) {
347                         kprintf("HAMMER(%s) recovery failure "
348                                 "during seqno fwdscan\n",
349                                 root_volume->ondisk->vol_name);
350                         goto done;
351                 }
352                 last_offset = scan_offset;
353                 kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
354                         "HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
355                         root_volume->ondisk->vol_name,
356                         (intmax_t)first_offset,
357                         (intmax_t)last_offset,
358                         root_volume->ondisk->vol_name,
359                         (intmax_t)rootmap->next_offset,
360                         seqno);
361         }
362
363         /*
364          * Calculate the size of the active portion of the FIFO.  If the
365          * FIFO is empty the filesystem is clean and no further action is
366          * needed.
367          */
368         if (last_offset >= first_offset) {
369                 bytes = last_offset - first_offset;
370         } else {
371                 bytes = rootmap->alloc_offset - first_offset +
372                         (last_offset & HAMMER_OFF_LONG_MASK);
373         }
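        /*
         * Wraparound example (zone bits omitted for clarity): with
         * alloc_offset 0x100000, first_offset 0xe0000, and last_offset
         * 0x20000, the active range is (0x100000 - 0xe0000) + 0x20000 =
         * 0x40000 bytes.
         */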
374         if (bytes == 0) {
375                 degenerate_case = 1;
376                 error = 0;
377                 goto done;
378         }
379
380         kprintf("HAMMER(%s) recovery undo  %016jx-%016jx (%jd bytes)%s\n",
381                 root_volume->ondisk->vol_name,
382                 (intmax_t)first_offset,
383                 (intmax_t)last_offset,
384                 (intmax_t)bytes,
385                 (hmp->ronly ? " (RO)" : " (RW)"));
386         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
387                 kprintf("Undo size is absurd, unable to mount\n");
388                 error = EIO;
389                 goto done;
390         }
391
392         /*
393          * Scan the UNDOs backwards.
394          */
395         scan_offset = last_offset;
396
397         while ((int64_t)bytes > 0) {
398                 KKASSERT(scan_offset != first_offset);
399                 head = hammer_recover_scan_rev(hmp, root_volume,
400                                                &scan_offset, &error, &buffer);
401                 if (error)
402                         break;
403
404                 /*
405                  * Normal UNDO
406                  */
407                 error = hammer_recover_undo(hmp, root_volume, &head->undo);
408                 if (error) {
409                         kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
410                                 root_volume->ondisk->vol_name,
411                                 (intmax_t)scan_offset - head->head.hdr_size);
412                         break;
413                 }
414
415                 /*
416                  * The first REDO_SYNC record encountered (scanning backwards)
417                  * enables REDO processing.
418                  */
419                 if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
420                     head->redo.redo_flags == HAMMER_REDO_SYNC) {
421                         if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
422                                 kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
423                                         "records in UNDO/REDO FIFO.\n",
424                                         root_volume->ondisk->vol_name
425                                 );
426                         } else {
427                                 hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
428                                 hmp->recover_stage2_offset =
429                                         head->redo.redo_offset;
430                                 kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
431                                         root_volume->ondisk->vol_name,
432                                         (intmax_t)head->redo.redo_offset);
433                         }
434                 }
435
436                 bytes -= head->head.hdr_size;
437
438                 /*
439                  * If too many dirty buffers have built up we have to flush
440                  * them out.  As long as we do not flush out the volume header
441                  * a crash here should not cause any problems.
442                  *
443                  * The buffer must be released so the flush can assert that
444                  * all buffers are idle.
445                  */
446                 if (hammer_flusher_meta_limit(hmp)) {
447                         if (buffer) {
448                                 hammer_rel_buffer(buffer, 0);
449                                 buffer = NULL;
450                         }
451                         if (hmp->ronly == 0) {
452                                 hammer_recover_flush_buffers(hmp, root_volume,
453                                                              0);
454                                 kprintf("HAMMER(%s) Continuing recovery\n",
455                                         root_volume->ondisk->vol_name);
456                         } else {
457                                 kprintf("HAMMER(%s) Recovery failure: "
458                                         "Insufficient buffer cache to hold "
459                                         "dirty buffers on read-only mount!\n",
460                                         root_volume->ondisk->vol_name);
461                                 error = EIO;
462                                 break;
463                         }
464                 }
465         }
466         KKASSERT(error || bytes == 0);
467 done:
468         if (buffer) {
469                 hammer_rel_buffer(buffer, 0);
470                 buffer = NULL;
471         }
472
473         /*
474          * After completely flushing all the recovered buffers the volume
475          * header will also be flushed.
476          */
477         if (root_volume->io.recovered == 0) {
478                 hammer_ref_volume(root_volume);
479                 root_volume->io.recovered = 1;
480         }
481
482         /*
483          * Finish up flushing (or discarding) recovered buffers.  FIFO
484          * indices in the volume header are updated to the actual undo
485          * range but will not be collapsed until stage 2.
486          */
487         if (error == 0) {
488                 hammer_modify_volume_noundo(NULL, root_volume);
489                 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
490                 rootmap->first_offset = first_offset;
491                 rootmap->next_offset = last_offset;
492                 hammer_modify_volume_done(root_volume);
493                 if (hmp->ronly == 0)
494                         hammer_recover_flush_buffers(hmp, root_volume, 1);
495         } else {
496                 hammer_recover_flush_buffers(hmp, root_volume, -1);
497         }
498         if (degenerate_case == 0) {
499                 kprintf("HAMMER(%s) recovery complete\n",
500                         root_volume->ondisk->vol_name);
501         } else {
502                 kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
503                         root_volume->ondisk->vol_name);
504         }
505         return (error);
506 }
507
508 /*
509  * Execute redo operations
510  *
511  * This procedure is run at the end of the mount sequence, after the hammer
512  * mount structure has been completely initialized but before the filesystem
513  * goes live.  It can access standard cursors, the B-Tree, flush the
514  * filesystem, and so forth.
515  *
516  * This code may only be called for read-write mounts or when a mount
517  * switches from read-only to read-write.  vnodes may or may not be present.
518  *
519  * The stage1 code will have already calculated the correct FIFO range
520  * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
521  * range for REDO is stored in hmp->recover_stage2_offset.
522  */
523 int
524 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
525 {
526         hammer_blockmap_t rootmap;
527         hammer_buffer_t buffer;
528         hammer_off_t scan_offset;
529         hammer_off_t oscan_offset;
530         hammer_off_t bytes;
531         hammer_off_t ext_bytes;
532         hammer_fifo_any_t head;
533         hammer_off_t first_offset;
534         hammer_off_t last_offset;
535         hammer_off_t ext_offset;
536         struct hammer_rterm_rb_tree rterm_root;
537         u_int32_t seqno;
538         int error;
539         int verbose = 0;
540         int dorscan;
541
542         /*
543          * Stage 2 can only be run on a RW mount, or when the mount is
544          * switched from RO to RW.
545          */
546         KKASSERT(hmp->ronly == 0);
547         RB_INIT(&rterm_root);
548
549         if (hammer_skip_redo == 1)
550                 kprintf("HAMMER(%s) recovery redo marked as optional\n",
551                     root_volume->ondisk->vol_name);
552
553         if (hammer_skip_redo == 2) {
554                 kprintf("HAMMER(%s) recovery redo skipped.\n",
555                     root_volume->ondisk->vol_name);
556                 return (0);
557         }
558
559         /*
560          * Examine the UNDO FIFO.  If it is empty the filesystem is clean
561          * and no action need be taken.
562          */
563         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
564         first_offset = rootmap->first_offset;
565         last_offset  = rootmap->next_offset;
566         if (first_offset == last_offset) {
567                 KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
568                 return(0);
569         }
570
571         /*
572          * Stage2 must only be run once, and will not be run at all
573          * if Stage1 did not find a REDO_SYNC record.
574          */
575         error = 0;
576         buffer = NULL;
577
578         if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
579                 goto done;
580         hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
581         hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
582         ext_offset = hmp->recover_stage2_offset;
583         if (ext_offset == 0) {
584                 kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
585                         "offset, ignoring\n",
586                         root_volume->ondisk->vol_name);
587                 goto done;
588         }
589
590         /*
591          * Calculate nominal UNDO range (this is not yet the extended
592          * range).
593          */
594         if (last_offset >= first_offset) {
595                 bytes = last_offset - first_offset;
596         } else {
597                 bytes = rootmap->alloc_offset - first_offset +
598                         (last_offset & HAMMER_OFF_LONG_MASK);
599         }
600         kprintf("HAMMER(%s) recovery redo  %016jx-%016jx (%jd bytes)%s\n",
601                 root_volume->ondisk->vol_name,
602                 (intmax_t)first_offset,
603                 (intmax_t)last_offset,
604                 (intmax_t)bytes,
605                 (hmp->ronly ? " (RO)" : " (RW)"));
606         verbose = 1;
607         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
608                 kprintf("Undo size is absurd, unable to mount\n");
609                 error = EIO;
610                 goto fatal;
611         }
612
613         /*
614          * Scan the REDOs backwards collecting REDO_TERM_* information.
615          * This information is only collected for the extended range,
616          * non-inclusive of any TERMs in the nominal UNDO range.
617          *
618          * If the stage2 extended range is inside the nominal undo range
619          * we have nothing to scan.
620          *
621          * This must fit in memory!
622          */
623         if (first_offset < last_offset) {
624                 /*
625                  * [      first_offset........last_offset      ]
626                  */
627                 if (ext_offset < first_offset) {
628                         dorscan = 1;
629                         ext_bytes = first_offset - ext_offset;
630                 } else if (ext_offset > last_offset) {
631                         dorscan = 1;
632                         ext_bytes = (rootmap->alloc_offset - ext_offset) +
633                                     (first_offset & HAMMER_OFF_LONG_MASK);
634                 } else {
635                         ext_bytes = -(ext_offset - first_offset);
636                         dorscan = 0;
637                 }
638         } else {
639                 /*
640                  * [......last_offset         first_offset.....]
641                  */
642                 if (ext_offset < last_offset) {
643                         ext_bytes = -((rootmap->alloc_offset - first_offset) +
644                                     (ext_offset & HAMMER_OFF_LONG_MASK));
645                         dorscan = 0;
646                 } else if (ext_offset > first_offset) {
647                         ext_bytes = -(ext_offset - first_offset);
648                         dorscan = 0;
649                 } else {
650                         ext_bytes = first_offset - ext_offset;
651                         dorscan = 1;
652                 }
653         }
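        /*
         * Example (non-wrapped case): with first_offset 0x50000 and
         * last_offset 0x80000, an ext_offset of 0x30000 lies before the
         * nominal range, so dorscan is set and ext_bytes = 0x20000.  An
         * ext_offset of 0x60000 lies inside the nominal range, so no
         * reverse scan is needed and ext_bytes goes negative (-0x10000).
         */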
654
655         if (dorscan) {
656                 scan_offset = first_offset;
657                 kprintf("HAMMER(%s) Find extended redo  %016jx, %jd extbytes\n",
658                         root_volume->ondisk->vol_name,
659                         (intmax_t)ext_offset,
660                         (intmax_t)ext_bytes);
661                 seqno = hmp->recover_stage2_seqno - 1;
662                 for (;;) {
663                         head = hammer_recover_scan_rev(hmp, root_volume,
664                                                        &scan_offset,
665                                                        &error, &buffer);
666                         if (error)
667                                 break;
668                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
669                                 if (head->head.hdr_seq != seqno) {
670                                         error = ERANGE;
671                                         break;
672                                 }
673                                 error = hammer_recover_redo_rec(
674                                                 hmp, &rterm_root,
675                                                 scan_offset, &head->redo);
676                                 --seqno;
677                         }
678                         if (scan_offset == ext_offset)
679                                 break;
680                 }
681                 if (error) {
682                         kprintf("HAMMER(%s) Find extended redo failed %d, "
683                                 "unable to run REDO\n",
684                                 root_volume->ondisk->vol_name,
685                                 error);
686                         goto done;
687                 }
688         } else {
689                 kprintf("HAMMER(%s) Embedded extended redo %016jx, "
690                         "%jd extbytes\n",
691                         root_volume->ondisk->vol_name,
692                         (intmax_t)ext_offset,
693                         (intmax_t)ext_bytes);
694         }
695
696         /*
697          * Scan the REDO forwards through the entire extended range.
698          * Anything with a previously recorded matching TERM is discarded.
699          */
700         scan_offset = ext_offset;
701         bytes += ext_bytes;
702
703         /*
704          * NOTE: when doing a forward scan the returned scan_offset is
705          *       for the record following the returned record, so we
706          *       save each record's starting offset (oscan_offset).
707          */
708         while ((int64_t)bytes > 0) {
709                 KKASSERT(scan_offset != last_offset);
710
711                 oscan_offset = scan_offset;
712                 head = hammer_recover_scan_fwd(hmp, root_volume,
713                                                &scan_offset, &error, &buffer);
714                 if (error)
715                         break;
716
717                 error = hammer_recover_redo_run(hmp, &rterm_root,
718                                                 oscan_offset, &head->redo);
719                 if (error) {
720                         kprintf("HAMMER(%s) REDO record at %016jx failed\n",
721                                 root_volume->ondisk->vol_name,
722                                 (intmax_t)scan_offset - head->head.hdr_size);
723                         break;
724                 }
725                 bytes -= head->head.hdr_size;
726         }
727         KKASSERT(error || bytes == 0);
728
729 done:
730         if (buffer) {
731                 hammer_rel_buffer(buffer, 0);
732                 buffer = NULL;
733         }
734
735         /*
736          * Cleanup rterm tree
737          */
738         {
739                 hammer_rterm_t rterm;
740                 hammer_rterm_entry_t rte;
741
742                 while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
743                         RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
744                         while ((rte = rterm->term_list) != NULL) {
745                                 rterm->term_list = rte->next;
746                                 kfree(rte, hmp->m_misc);
747                         }
748                         kfree(rterm, hmp->m_misc);
749                 }
750         }
751
752         /*
753          * Finish up flushing (or discarding) recovered buffers by executing
754          * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
755          * case tests and forces the flush in order to update the FIFO indices.
756          *
757          * If a crash occurs during the flush the entire undo/redo will be
758          * re-run during recovery on the next mount.
759          */
760         if (error == 0) {
761                 if (rootmap->first_offset != rootmap->next_offset)
762                         hmp->hflags |= HMNT_UNDO_DIRTY;
763                 hammer_flusher_sync(hmp);
764         }
765 fatal:
766         hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
767         if (verbose) {
768                 kprintf("HAMMER(%s) End redo recovery\n",
769                         root_volume->ondisk->vol_name);
770         }
771
772         if (error && hammer_skip_redo == 1)
773                 kprintf("HAMMER(%s) recovery redo error %d, "
774                     "skipping.\n", root_volume->ondisk->vol_name,
775                     error);
776
777         return (hammer_skip_redo ? 0 : error);
778 }
779
780 /*
781  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
782  * record at *scan_offsetp, or NULL if an error occurred.
783  *
784  * On return *scan_offsetp will be the offset of the returned record.
785  */
786 hammer_fifo_any_t
787 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
788                         hammer_off_t *scan_offsetp,
789                         int *errorp, struct hammer_buffer **bufferp)
790 {
791         hammer_off_t scan_offset;
792         hammer_blockmap_t rootmap;
793         hammer_fifo_any_t head;
794         hammer_fifo_tail_t tail;
795
796         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
797         scan_offset = *scan_offsetp;
798
799         if (hammer_debug_general & 0x0080)
800                 kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
801         if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
802                 scan_offset = rootmap->alloc_offset;
803         if (scan_offset - sizeof(*tail) <
804             HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
805                 kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
806                         root_volume->ondisk->vol_name,
807                         (intmax_t)scan_offset);
808                 *errorp = EIO;
809                 return (NULL);
810         }
811         tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
812                             errorp, bufferp);
813         if (*errorp) {
814                 kprintf("HAMMER(%s) Unable to read UNDO TAIL "
815                         "at %016jx\n",
816                         root_volume->ondisk->vol_name,
817                         (intmax_t)scan_offset - sizeof(*tail));
818                 return (NULL);
819         }
820
821         if (hammer_check_tail_signature(tail, scan_offset) != 0) {
822                 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
823                         "at %016jx\n",
824                         root_volume->ondisk->vol_name,
825                         (intmax_t)scan_offset - sizeof(*tail));
826                 *errorp = EIO;
827                 return (NULL);
828         }
829         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
830         *scan_offsetp = scan_offset - head->head.hdr_size;
831
832         return (head);
833 }
834
835 /*
836  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
837  * an error occurred.
838  *
839  * On return *scan_offsetp will be the offset of the record following
840  * the returned record.
841  */
842 hammer_fifo_any_t
843 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
844                         hammer_off_t *scan_offsetp,
845                         int *errorp, struct hammer_buffer **bufferp)
846 {
847         hammer_off_t scan_offset;
848         hammer_blockmap_t rootmap;
849         hammer_fifo_any_t head;
850
851         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
852         scan_offset = *scan_offsetp;
853
854         if (hammer_debug_general & 0x0080)
855                 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
856         if (scan_offset == rootmap->alloc_offset)
857                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
858
859         head = hammer_bread(hmp, scan_offset, errorp, bufferp);
860         if (*errorp) {
861                 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
862                         root_volume->ondisk->vol_name,
863                         (intmax_t)scan_offset);
864                 return (NULL);
865         }
866
867         if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
868                 kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
869                         "at %016jx\n",
870                         root_volume->ondisk->vol_name,
871                         (intmax_t)scan_offset);
872                 *errorp = EIO;
873                 return (NULL);
874         }
875         scan_offset += head->head.hdr_size;
876         if (scan_offset == rootmap->alloc_offset)
877                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
878         *scan_offsetp = scan_offset;
879
880         return (head);
881 }
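
/*
 * Usage sketch for the two scanners: a forward walk over a known-valid
 * range.  Unlike the reverse scanner, the forward scanner advances
 * *scan_offsetp past the returned record, so the record's own offset
 * must be saved before the call.  Illustration only, hence the #if 0.
 */
#if 0
static void
hammer_recover_walk_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t beg, hammer_off_t end)
{
        hammer_buffer_t buffer = NULL;
        hammer_off_t scan_offset = beg;
        hammer_off_t rec_offset;
        hammer_fifo_any_t head;
        int error = 0;

        while (scan_offset != end) {
                rec_offset = scan_offset;
                head = hammer_recover_scan_fwd(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;
                kprintf("FIFO record type %04x at %016jx\n",
                        head->head.hdr_type, (intmax_t)rec_offset);
        }
        if (buffer)
                hammer_rel_buffer(buffer, 0);
}
#endif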
882
883 /*
884  * Helper function for hammer_check_{head,tail}_signature().  Checks the
885  * record once the head and tail have been established.
886  *
887  * This function validates the entire FIFO record wrapper.
888  */
889 static __inline
890 int
891 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
892                         hammer_off_t beg_off)
893 {
894         hammer_off_t end_off;
895         u_int32_t crc;
896         int bytes;
897
898         /*
899          * Check signatures.  The tail signature is allowed to be the
900          * head signature only for 8-byte PADs.
901          */
902         if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
903                 kprintf("HAMMER: FIFO record bad head signature "
904                         "%04x at %016jx\n",
905                         head->hdr_signature,
906                         (intmax_t)beg_off);
907                 return(2);
908         }
909         if (head->hdr_size < HAMMER_HEAD_ALIGN ||
910             (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
911                 kprintf("HAMMER: FIFO record unaligned or bad size "
912                         "%04x at %016jx\n",
913                         head->hdr_size,
914                         (intmax_t)beg_off);
915                 return(2);
916         }
917         end_off = beg_off + head->hdr_size;
918
919         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
920             (size_t)(end_off - beg_off) != sizeof(*tail)) {
921                 if (head->hdr_type != tail->tail_type) {
922                         kprintf("HAMMER: FIFO record head/tail type mismatch "
923                                 "%04x %04x at %016jx\n",
924                                 head->hdr_type, tail->tail_type,
925                                 (intmax_t)beg_off);
926                         return(2);
927                 }
928                 if (head->hdr_size != tail->tail_size) {
929                         kprintf("HAMMER: FIFO record head/tail size mismatch "
930                                 "%04x %04x at %016jx\n",
931                                 head->hdr_size, tail->tail_size,
932                                 (intmax_t)beg_off);
933                         return(2);
934                 }
935                 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
936                         kprintf("HAMMER: FIFO record bad tail signature "
937                                 "%04x at %016jx\n",
938                                 tail->tail_signature,
939                                 (intmax_t)beg_off);
940                         return(3);
941                 }
942         }
943
944         /*
945          * Non-PAD records must have a CRC and must be large enough
946          * to fit the head and tail.
947          */
948         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
949                 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
950                       crc32(head + 1, head->hdr_size - sizeof(*head));
951                 if (head->hdr_crc != crc) {
952                         kprintf("HAMMER: FIFO record CRC failed %08x %08x "
953                                 "at %016jx\n",
954                                 head->hdr_crc, crc,
955                                 (intmax_t)beg_off);
956                         return(EIO);
957                 }
958                 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
959                         kprintf("HAMMER: FIFO record too small "
960                                 "%04x at %016jx\n",
961                                 head->hdr_size,
962                                 (intmax_t)beg_off);
963                         return(EIO);
964                 }
965         }
966
967         /*
968          * Check the tail
969          */
970         bytes = head->hdr_size;
971         tail = (void *)((char *)head + bytes - sizeof(*tail));
972         if (tail->tail_size != head->hdr_size) {
973                 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
974                         tail->tail_size, head->hdr_size,
975                         (intmax_t)beg_off);
976                 return(EIO);
977         }
978         if (tail->tail_type != head->hdr_type) {
979                 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
980                         tail->tail_type, head->hdr_type,
981                         (intmax_t)beg_off);
982                 return(EIO);
983         }
984
985         return(0);
986 }
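
/*
 * On-media record layout assumed by the checks above:
 *
 *      [hammer_fifo_head][payload..................][hammer_fifo_tail]
 *      |<--------------------- head->hdr_size ----------------------->|
 *
 * For non-PAD records hdr_crc covers the head up to
 * HAMMER_FIFO_HEAD_CRCOFF, xor'd with the crc32 of everything after the
 * head structure (payload and tail).
 */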
987
988 /*
989  * Check that the FIFO record is in-bounds given the head and the
990  * hammer offset.
991  *
992  * Also checks that the head and tail structures agree with each other,
993  * but does not check beyond the signature, type, and size.
994  */
995 static int
996 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
997 {
998         hammer_fifo_tail_t tail;
999         hammer_off_t end_off;
1000
1001         /*
1002          * head overlaps buffer boundary.  This could be a PAD so only
1003          * check the minimum PAD size here.
1004          */
1005         if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
1006                 return(1);
1007
1008         /*
1009          * Calculate the ending offset and make sure the record does
1010          * not cross a buffer boundary.
1011          */
1012         end_off = beg_off + head->hdr_size;
1013         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1014                 return(1);
1015         tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
1016         return (_hammer_check_signature(head, tail, beg_off));
1017 }
1018
1019 /*
1020  * Check that the FIFO record is in-bounds given the tail and the
1021  * hammer offset.  The offset is pointing at the ending boundary of the
1022  * record.
1023  *
1024  * Also checks that the head and tail structures agree with each other,
1025  * but does not check beyond the signature, type, and size.
1026  */
1027 static int
1028 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
1029 {
1030         hammer_fifo_head_t head;
1031         hammer_off_t beg_off;
1032
1033         /*
1034          * tail overlaps buffer boundary
1035          */
1036         if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1037                 return(1);
1038
1039         /*
1040          * Calculate the beginning offset and make sure the record does
1041          * not cross a buffer boundary.
1042          */
1043         beg_off = end_off - tail->tail_size;
1044         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1045                 return(1);
1046         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
1047         return (_hammer_check_signature(head, tail, beg_off));
1048 }
1049
1050 static int
1051 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
1052                     hammer_fifo_undo_t undo)
1053 {
1054         hammer_volume_t volume;
1055         hammer_buffer_t buffer;
1056         hammer_off_t buf_offset;
1057         int zone;
1058         int error;
1059         int vol_no;
1060         int bytes;
1061         u_int32_t offset;
1062
1063         /*
1064          * Only process UNDO records.  Flag if we find other records to
1065          * optimize stage2 recovery.
1066          */
1067         if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
1068                 return(0);
1069
1070         /*
1071          * Validate the UNDO record.
1072          */
1073         bytes = undo->head.hdr_size - sizeof(*undo) -
1074                 sizeof(struct hammer_fifo_tail);
1075         if (bytes < 0 || undo->undo_data_bytes < 0 ||
1076             undo->undo_data_bytes > bytes) {
1077                 kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
1078                         undo->undo_data_bytes, bytes);
1079                 return(EIO);
1080         }
1081
1082         bytes = undo->undo_data_bytes;
1083
1084         /*
1085          * The undo offset may only be a zone-1 or zone-2 offset.
1086          *
1087          * Currently we only support a zone-1 offset representing the
1088          * volume header.
1089          */
1090         zone = HAMMER_ZONE_DECODE(undo->undo_offset);
1091         offset = undo->undo_offset & HAMMER_BUFMASK;
1092
1093         if (offset + bytes > HAMMER_BUFSIZE) {
1094                 kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
1095                 return (EIO);
1096         }
1097
1098         switch(zone) {
1099         case HAMMER_ZONE_RAW_VOLUME_INDEX:
1100                 vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
1101                 volume = hammer_get_volume(hmp, vol_no, &error);
1102                 if (volume == NULL) {
1103                         kprintf("HAMMER: UNDO record, "
1104                                 "cannot access volume %d\n", vol_no);
1105                         break;
1106                 }
1107                 hammer_modify_volume_noundo(NULL, volume);
1108                 hammer_recover_copy_undo(undo->undo_offset,
1109                                          (char *)(undo + 1),
1110                                          (char *)volume->ondisk + offset,
1111                                          bytes);
1112                 hammer_modify_volume_done(volume);
1113
1114                 /*
1115                  * Multiple modifications may be made to the same volume.
1116                  * Also, the volume header cannot be written out until
1117                  * everything else has been flushed.  This also
1118                  * covers the read-only case by preventing the kernel from
1119                  * flushing the buffer.
1120                  */
1121                 if (volume->io.recovered == 0)
1122                         volume->io.recovered = 1;
1123                 else
1124                         hammer_rel_volume(volume, 0);
1125                 break;
1126         case HAMMER_ZONE_RAW_BUFFER_INDEX:
1127                 buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
1128                 buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
1129                                            0, &error);
1130                 if (buffer == NULL) {
1131                         kprintf("HAMMER: UNDO record, "
1132                                 "cannot access buffer %016jx\n",
1133                                 (intmax_t)undo->undo_offset);
1134                         break;
1135                 }
1136                 hammer_modify_buffer_noundo(NULL, buffer);
1137                 hammer_recover_copy_undo(undo->undo_offset,
1138                                          (char *)(undo + 1),
1139                                          (char *)buffer->ondisk + offset,
1140                                          bytes);
1141                 hammer_modify_buffer_done(buffer);
1142
1143                 /*
1144                  * Multiple modifications may be made to the same buffer,
1145                  * improve performance by delaying the flush.  This also
1146                  * covers the read-only case by preventing the kernel from
1147                  * flushing the buffer.
1148                  */
1149                 if (buffer->io.recovered == 0)
1150                         buffer->io.recovered = 1;
1151                 else
1152                         hammer_rel_buffer(buffer, 0);
1153                 break;
1154         default:
1155                 kprintf("HAMMER: Corrupt UNDO record\n");
1156                 error = EIO;
1157         }
1158         return (error);
1159 }
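
/*
 * UNDO record layout implied by the validation above:
 *
 *      [hammer_fifo_undo][saved image][alignment pad][hammer_fifo_tail]
 *
 * The saved image at (undo + 1) is undo_data_bytes long and is copied
 * back to the target volume or buffer at the buffer-relative offset
 * (undo_offset & HAMMER_BUFMASK).
 */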
1160
1161 static void
1162 hammer_recover_copy_undo(hammer_off_t undo_offset,
1163                          char *src, char *dst, int bytes)
1164 {
1165         if (hammer_debug_general & 0x0080) {
1166                 kprintf("UNDO %016jx: %d\n",
1167                         (intmax_t)undo_offset, bytes);
1168         }
1169 #if 0
1170         kprintf("UNDO %016jx:", (intmax_t)undo_offset);
1171         hammer_recover_debug_dump(22, dst, bytes);
1172         kprintf("%22s", "to:");
1173         hammer_recover_debug_dump(22, src, bytes);
1174 #endif
1175         bcopy(src, dst, bytes);
1176 }
1177
1178 /*
1179  * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1180  * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
1181  * does not include the nominal UNDO range, just the extended range.
1182  */
1183 int
1184 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1185                         hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1186 {
1187         hammer_rterm_t rterm;
1188         hammer_rterm_t nrterm;
1189         hammer_rterm_entry_t rte;
1190
1191         if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1192                 return(0);
1193         if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1194             redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1195                 return(0);
1196         }
1197
1198         nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1199         nrterm->redo_objid = redo->redo_objid;
1200         nrterm->redo_localization = redo->redo_localization;
1201         nrterm->redo_flags = redo->redo_flags;
1202         nrterm->redo_offset = redo->redo_offset;
1203
1204         rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1205         if (rterm)
1206                 kfree(nrterm, hmp->m_misc);
1207         else
1208                 rterm = nrterm;
1209
1210         if (bootverbose) {
1211                 kprintf("record record %016jx objid %016jx "
1212                         "offset %016jx flags %08x\n",
1213                         (intmax_t)scan_offset,
1214                         (intmax_t)redo->redo_objid,
1215                         (intmax_t)redo->redo_offset,
1216                         (int)redo->redo_flags);
1217         }
1218
1219         /*
1220          * Scan in reverse order, rte prepended, so the rte list will be
1221          * in forward order.
1222          */
1223         rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1224         rte->fifo_offset = scan_offset;
1225         rte->next = rterm->term_list;
1226         rterm->term_list = rte;
1227
1228         return(0);
1229 }
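
/*
 * Example: TERMs for one key encountered at FIFO offsets C, then B,
 * then A during the backwards scan are each prepended, leaving
 * term_list in forward order A -> B -> C, the order in which the
 * forward scan in hammer_recover_redo_run() encounters and strips them.
 */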
1230
1231 /*
1232  * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1233  * the forwards scan of the entire extended UNDO/REDO FIFO range.
1234  *
1235  * Records matching previously recorded TERMs have already been committed
1236  * and are ignored.
1237  */
1238 int
1239 hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1240                         hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1241 {
1242         struct hammer_rterm rtval;
1243         hammer_rterm_t rterm;
1244         hammer_rterm_entry_t rte;
1245
1246         if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1247                 return(0);
1248
1249         switch(redo->redo_flags) {
1250         case HAMMER_REDO_WRITE:
1251         case HAMMER_REDO_TRUNC:
1252                 /*
1253                  * We hit a REDO request.  The REDO request is only executed
1254                  * if there is no matching TERM.
1255                  */
1256                 bzero(&rtval, sizeof(rtval));
1257                 rtval.redo_objid = redo->redo_objid;
1258                 rtval.redo_localization = redo->redo_localization;
1259                 rtval.redo_offset = redo->redo_offset;
1260                 rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
1261                                    HAMMER_REDO_TERM_WRITE :
1262                                    HAMMER_REDO_TERM_TRUNC;
1263
                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if (bootverbose) {
                                kprintf("ignore record %016jx objid %016jx "
                                        "offset %016jx flags %08x\n",
                                        (intmax_t)scan_offset,
                                        (intmax_t)redo->redo_objid,
                                        (intmax_t)redo->redo_offset,
                                        (int)redo->redo_flags);
                        }
                        break;
                }
                if (bootverbose) {
                        kprintf("run    record %016jx objid %016jx "
                                "offset %016jx flags %08x\n",
                                (intmax_t)scan_offset,
                                (intmax_t)redo->redo_objid,
                                (intmax_t)redo->redo_offset,
                                (int)redo->redo_flags);
                }

                /*
                 * Redo stage2 can access a live filesystem; the exec
                 * code acquires the vnode itself to apply the REDO.
                 */
                hammer_recover_redo_exec(hmp, redo);
                break;
        case HAMMER_REDO_TERM_WRITE:
        case HAMMER_REDO_TERM_TRUNC:
                /*
                 * As we encounter TERMs in the forward scan we remove
                 * them.  Once the forward scan hits the nominal undo range
                 * there will be no more recorded TERMs.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_flags = redo->redo_flags;
                rtval.redo_offset = redo->redo_offset;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if ((rte = rterm->term_list) != NULL) {
                                KKASSERT(rte->fifo_offset == scan_offset);
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                }
                break;
        }
        return(0);
}

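/*
 * Putting the two passes together, the recovery driver conceptually does
 * the following (a simplified sketch; the real scan loops also validate
 * the FIFO record headers and handle FIFO wrap-around):
 *
 *      reverse pass:   hammer_recover_redo_rec(hmp, &root, offset, redo)
 *                      -> record every TERM encountered
 *      forward pass:   hammer_recover_redo_run(hmp, &root, offset, redo)
 *                      -> execute REDOs with no matching TERM and
 *                         consume recorded TERMs as they are passed
 */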

static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
        struct hammer_transaction trans;
        struct vattr va;
        struct hammer_inode *ip;
        struct vnode *vp = NULL;
        int error;

        hammer_start_transaction(&trans, hmp);

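        /*
         * Look up the inode as-of HAMMER_MAX_TID, i.e. the current
         * (most recent) version of the object named by the REDO record.
         */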
        ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
                              HAMMER_MAX_TID, redo->redo_localization,
                              0, &error);
        if (ip == NULL) {
                kprintf("unable to find objid %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done2;
        }
        error = hammer_get_vnode(ip, &vp);
        if (error) {
                kprintf("unable to acquire vnode for %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done1;
        }

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
                error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
                if (error) {
                        kprintf("vn_rdwr open %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                        break;
                }
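                /*
                 * The vnode is unlocked across the vn_rdwr() since,
                 * with an ioflg of 0 (no IO_NODELOCKED), vn_rdwr()
                 * acquires the vnode lock itself.  It is re-locked
                 * afterwards for the VOP_CLOSE() and vput().
                 */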
                vn_unlock(vp);
                error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
                                redo->redo_data_bytes,
                                redo->redo_offset, UIO_SYSSPACE,
                                0, proc0.p_ucred, NULL);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                if (error) {
                        kprintf("write %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                }
                VOP_CLOSE(vp, FREAD|FWRITE, NULL);
                break;
        case HAMMER_REDO_TRUNC:
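                /*
                 * Replay the truncation (or file extension) by setting
                 * the file size via VOP_SETATTR().  For a TRUNC record
                 * redo_offset holds the target size.
                 */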
                VATTR_NULL(&va);
                va.va_size = redo->redo_offset;
                error = VOP_SETATTR(vp, &va, proc0.p_ucred);
                if (error) {
                        kprintf("setattr offset %016jx error %d\n",
                                (intmax_t)redo->redo_offset, error);
                }
                break;
        }
        vput(vp);
done1:
        hammer_rel_inode(ip, 0);
done2:
        hammer_done_transaction(&trans);
}

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset.
 *
 * e.g. WRITE@0 TERM@0 WRITE@0 .... (no TERM@0): the final WRITE@0 has
 * no matching TERM and must be replayed.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
        if (rt1->redo_objid < rt2->redo_objid)
                return(-1);
        if (rt1->redo_objid > rt2->redo_objid)
                return(1);
        if (rt1->redo_localization < rt2->redo_localization)
                return(-1);
        if (rt1->redo_localization > rt2->redo_localization)
                return(1);
        if (rt1->redo_flags < rt2->redo_flags)
                return(-1);
        if (rt1->redo_flags > rt2->redo_flags)
                return(1);
        if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                if (rt1->redo_offset < rt2->redo_offset)
                        return(-1);
                if (rt1->redo_offset > rt2->redo_offset)
                        return(1);
        }
        return(0);
}

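/*
 * For example (illustrative only): two TERM_WRITE keys for the same
 * inode compare equal only when their offsets also match, whereas two
 * TERM_TRUNC keys for the same inode always compare equal.  A single
 * rterm node therefore covers every truncation TERM for an inode and
 * the per-occurrence bookkeeping lives in its term_list.
 */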

#if 0

static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
        int i;

        for (i = 0; i < bytes; ++i) {
                if (i && (i & 15) == 0)
                        kprintf("\n%*.*s", w, w, "");
                kprintf(" %02x", (unsigned char)buf[i]);
        }
        kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written, and root_volume may be passed as NULL in that case.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
                             int final)
{
        /*
         * Flush the buffers out asynchronously, wait for all the I/O to
         * complete, then do it again to destroy the buffer cache buffer
         * so it doesn't alias something later on.
         */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
        hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);

        /*
         * Flush all volume headers except the root volume.  If final < 0
         * we discard all volume headers including the root volume.
         */
        if (final >= 0) {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, root_volume);
        } else {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, NULL);
        }

        /*
         * Finalize the root volume header.
         *
         * No interlock is needed, volume buffers are not
         * messed with by bioops.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
                hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
                hammer_io_wait_all(hmp, "hmrfly", 1);
        }
}

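/*
 * Typical call patterns (illustrative only; the callers in the mount,
 * upgrade, and unmount paths are authoritative):
 *
 *      hammer_recover_flush_buffers(hmp, root_volume, 0);
 *              intermediate flush, volume header not written
 *      hammer_recover_flush_buffers(hmp, root_volume, 1);
 *              final flush, volume header written last
 *      hammer_recover_flush_buffers(hmp, NULL, -1);
 *              read-only unmount, recovered buffers discarded
 */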

/*
 * Callback to flush volume headers.  If discarding, data will be NULL
 * and all volume headers (including the root volume) will be discarded.
 * Otherwise data is the root_volume and we flush all volume headers
 * EXCEPT the root_volume.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
{
        hammer_volume_t root_volume = data;

        if (volume->io.recovered && volume != root_volume) {
                volume->io.recovered = 0;
                if (root_volume != NULL) {
                        /*
                         * No interlock is needed, volume buffers are not
                         * messed with by bioops.
                         */
                        hammer_io_flush(&volume->io, 0);
                } else {
                        hammer_io_clear_error(&volume->io);
                        hammer_io_clear_modify(&volume->io, 1);
                }
                hammer_rel_volume(volume, 0);
        }
        return(0);
}

/*
 * Flush or discard recovered I/O buffers.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
{
        int final = *(int *)data;
        int flush;

        if (buffer->io.recovered) {
                buffer->io.recovered = 0;
                buffer->io.reclaim = 1;
                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                } else {
                        hammer_io_write_interlock(&buffer->io);
                        hammer_io_flush(&buffer->io, 0);
                        hammer_io_done_interlock(&buffer->io);
                }
                hammer_rel_buffer(buffer, 0);
        } else {
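                /*
                 * Buffer was not marked recovered.  Take a reference so
                 * the buffer can be flagged for reclaim and released;
                 * the interlock state returned by hammer_ref_interlock()
                 * is handed through to hammer_rel_buffer() below.
                 */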
                flush = hammer_ref_interlock(&buffer->io.lock);
                if (flush)
                        atomic_add_int(&hammer_count_refedbufs, 1);

                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                }
                KKASSERT(hammer_oneref(&buffer->io.lock));
                buffer->io.reclaim = 1;
                hammer_rel_buffer(buffer, flush);
        }
        return(0);
}