0b699d230665f471e5cb9909dbd1d9c702941dcd
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 /*
36  * UNDO ALGORITHM:
37  *
38  *      The UNDO algorithm is trivial.  The nominal UNDO range in the
39  *      FIFO is determined by taking the first/next offset stored in
40  *      the volume header.  The next offset may not be correct since
41  *      UNDO flushes are not required to flush the volume header, so
42  *      the code also scans forward until it finds a discontinuous
43  *      sequence number.
44  *
45  *      The UNDOs are then scanned and executed in reverse order.  These
46  *      UNDOs are effectively just data restorations based on HAMMER offsets.
47  *
48  * REDO ALGORITHM:
49  *
50  *      REDO records are laid down in the UNDO/REDO FIFO for nominal
51  *      writes, truncations, and file extension ops.  On a per-inode
52  *      basis two types of REDO records are generated, REDO_WRITE
53  *      and REDO_TRUNC.
54  *
55  *      Essentially the recovery block will contain UNDO records backing
56  *      out partial operations and REDO records to regenerate those partial
57  *      operations guaranteed by the filesystem during recovery.
58  *
59  *      REDO generation is optional, and can also be started and then
60  *      later stopped due to excessive write()s in between fsyncs, or not
61  *      started at all.  Because of this the recovery code must determine
62  *      when REDOs are valid and when they are not.  Additional records are
63  *      generated to help figure it out.
64  *
65  *      The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66  *      during a flush cycle indicating which records the flush cycle
67  *      has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68  *      each flush cycle to indicate how far back in the UNDO/REDO FIFO
69  *      the recovery code must go to find the earliest applicable REDO
70  *      record.  Applicable REDO records can be far outside the nominal
71  *      UNDO recovery range, for example if a write() lays down a REDO but
72  *      the related file is not flushed for several cycles.
73  *
74  *      The SYNC reference is to a point prior to the nominal UNDO FIFO
75  *      range, creating an extended REDO range which must be scanned.
76  *
77  *      Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78  *      which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79  *      prior to the start of the nominal UNDO range are applicable.
80  *      That is, any REDO_TERM_* records in the extended range but not in
81  *      the nominal undo range will mask any redo operations for prior REDO
82  *      records.  This is necessary because once the TERM is laid down
83  *      followup operations may make additional changes to the related
84  *      records but not necessarily record them as REDOs (because REDOs are
85  *      optional).
86  *
87  *      REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88  *      must be ignored since they represent meta-data flushes which are
89  *      undone by the UNDOs in that nominal UNDO range by the recovery
90  *      code.  Only REDO_TERM_* records in the extended range but not
91  *      in the nominal undo range are applicable.
92  *
93  *      The REDO_SYNC record itself always exists in the nominal UNDO range
94  *      (this is how the extended range is determined).  For recovery
95  *      purposes the most recent REDO_SYNC record is always used if several
96  *      are found.
97  *
98  * CRASHES DURING UNDO/REDO
99  *
100  *      A crash during the UNDO phase requires no additional effort.  The
101  *      UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
102  *      remains unchanged and has no re-crash issues.
103  *
104  *      A crash during the REDO phase is more complex because the REDOs
105  *      run normal filesystem ops and generate additional UNDO/REDO records.
106  *      REDO is disabled during REDO recovery and any SYNC records generated
107  *      by flushes during REDO recovery must continue to reference the
108  *      original extended range.
109  *
110  *      If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111  *      may become impossible.  This is detected when the start of the
112  *      extended range fails to have monotonically increasing sequence
113  *      numbers leading into the nominal undo range.
114  */
115
116 #include "hammer.h"
117
/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
	struct hammer_rterm_entry *next;	/* singly-linked list of entries */
	hammer_off_t		fifo_offset;	/* UNDO/REDO FIFO offset of a TERM record */
} *hammer_rterm_entry_t;
126
/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
	RB_ENTRY(hammer_rterm)	rb_node;	/* red-black tree linkage */
	int64_t			redo_objid;	/* object id the REDO refers to */
	u_int32_t		redo_localization;	/* localization key paired with objid */
	u_int32_t		redo_flags;	/* REDO type flags (WRITE vs TRUNC) */
	hammer_off_t		redo_offset;	/* file offset; ignored for TRUNC entries */
	hammer_rterm_entry_t	term_list;	/* matching REDO_TERM_* fifo offsets */
} *hammer_rterm_t;
139
/*
 * Forward declarations for file-local helpers, plus the RB tree type used
 * to index REDO_TERM_* records collected during the stage2 backward scan.
 */
static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/* FIFO record signature validation (tail for reverse scans, head for forward) */
static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
			hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
			hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
			char *src, char *dst, int bytes);
/* Iterators over the UNDO/REDO FIFO; advance *scan_offsetp fwd/rev */
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
/* Apply a single UNDO record (stage1) */
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_fifo_undo_t undo);
/* Record / execute REDO records (stage2) */
static int hammer_recover_redo_rec(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
			hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
174
/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *       hammer_mount structure yet, so we need to access the root volume's
 *       buffer directly.
 *
 * Returns 0 on success, otherwise an errno (EIO on a corrupt or
 * unrecoverable FIFO, or an error propagated from the scan helpers).
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t scan_offset_save;
	hammer_off_t bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	u_int32_t seqno;
	int error;
	int degenerate_case = 0;

	/*
	 * Examine the UNDO FIFO indices in the volume header.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset  = rootmap->next_offset;
	buffer = NULL;
	error = 0;

	/* No REDO extended range known yet; stage2 is a no-op unless set */
	hmp->recover_stage2_offset = 0;

	/* Reject indices pointing past the allocated FIFO space */
	if (first_offset > rootmap->alloc_offset ||
	    last_offset > rootmap->alloc_offset) {
		kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
			"%016jx, %016jx limit %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			(intmax_t)rootmap->alloc_offset);
		error = EIO;
		goto done;
	}

	/*
	 * In HAMMER version 4+ filesystems the volume header does NOT
	 * contain definitive UNDO FIFO state.  In particular, the
	 * rootmap->next_offset may not be indexed completely to the
	 * end of the active UNDO FIFO.
	 */
	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
		/*
		 * To find the definitive range we must first scan backwards
		 * from first_offset to locate the first real record and
		 * extract the sequence number from it.  This record is not
		 * part of the active undo space.
		 */
		scan_offset = first_offset;
		seqno = 0;

		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			/* skip PAD records; first real record yields the seqno */
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				seqno = head->head.hdr_seq;
				break;
			}
		}
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno backscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}

		/*
		 * Scan forwards from first_offset and (seqno+1) looking
		 * for a sequence space discontinuity.  This denotes the
		 * end of the active FIFO area.
		 *
		 * NOTE: For the case where the FIFO is empty the very first
		 *       record we find will be discontinuous.
		 *
		 * NOTE: Do not include trailing PADs in the scan range,
		 *       and remember the returned scan_offset after a
		 *       fwd iteration points to the end of the returned
		 *       record.
		 */
		kprintf("HAMMER(%s) recovery check seqno=%08x\n",
			root_volume->ondisk->vol_name,
			seqno);

		scan_offset = first_offset;
		scan_offset_save = scan_offset;
		++seqno;
		/* stage2 rev-scan later needs the first in-range seqno */
		hmp->recover_stage2_seqno = seqno;

		for (;;) {
			head = hammer_recover_scan_fwd(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				if (seqno != head->head.hdr_seq) {
					/*
					 * Discontinuity: back off to the end
					 * of the last good record.
					 */
					scan_offset = scan_offset_save;
					break;
				}
				scan_offset_save = scan_offset;
				++seqno;
			}

#if 0
			/*
			 * If the forward scan is grossly ahead of last_offset
			 * then something is wrong.  last_offset is supposed
			 * to be flushed out
			 */
			if (last_offset >= scan_offset) {
				bytes = last_offset - scan_offset;
			} else {
				bytes = rootmap->alloc_offset - scan_offset +
					(last_offset & HAMMER_OFF_LONG_MASK);
			}
			if (bytes >
			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
			    4 / 5) {
				kprintf("HAMMER(%s) recovery forward scan is "
					"grossly beyond the last_offset in "
					"the volume header, this can't be "
					"right.\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
#endif
		}

		/*
		 * Store the seqno.  This will be the next seqno we lay down
		 * when generating new UNDOs.
		 */
		hmp->undo_seqno = seqno;
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno fwdscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}
		/* definitive end of the active FIFO, superseding the header */
		last_offset = scan_offset;
		kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
			"HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			root_volume->ondisk->vol_name,
			(intmax_t)rootmap->next_offset,
			seqno);
	}

	/*
	 * Calculate the size of the active portion of the FIFO.  If the
	 * FIFO is empty the filesystem is clean and no further action is
	 * needed.
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		/* FIFO wrapped around the end of the allocated space */
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	if (bytes == 0) {
		degenerate_case = 1;
		error = 0;
		goto done;
	}

	kprintf("HAMMER(%s) recovery undo  %016jx-%016jx (%jd bytes)%s\n",
		root_volume->ondisk->vol_name,
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		kprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto done;
	}

	/*
	 * Scan the UNDOs backwards.
	 */
	scan_offset = last_offset;

	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != first_offset);
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		/*
		 * Normal UNDO
		 */
		error = hammer_recover_undo(hmp, root_volume, &head->undo);
		if (error) {
			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
				root_volume->ondisk->vol_name,
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}

		/*
		 * The first REDO_SYNC record encountered (scanning backwards)
		 * enables REDO processing.
		 */
		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
				/* only the most recent REDO_SYNC is used */
				kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
					"records in UNDO/REDO FIFO.\n",
					root_volume->ondisk->vol_name
				);
			} else {
				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
				/* start of the extended range for stage2 */
				hmp->recover_stage2_offset =
					head->redo.redo_offset;
				kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
					root_volume->ondisk->vol_name,
					(intmax_t)head->redo.redo_offset);
			}
		}

		bytes -= head->head.hdr_size;

		/*
		 * If too many dirty buffers have built up we have to flush'm
		 * out.  As long as we do not flush out the volume header
		 * a crash here should not cause any problems.
		 *
		 * buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				kprintf("HAMMER(%s) Continuing recovery\n",
					root_volume->ondisk->vol_name);
			} else {
				kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
		}
	}
	/* either we errored out or we consumed the entire range */
	KKASSERT(error || bytes == 0);
done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  FIFO
	 * indices in the volume header are updated to the actual undo
	 * range but will not be collapsed until stage 2.
	 */
	if (error == 0) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = first_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		/* discard: -1 tells the flusher to throw the buffers away */
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	if (degenerate_case == 0) {
		kprintf("HAMMER(%s) recovery complete\n",
			root_volume->ondisk->vol_name);
	} else {
		kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
			root_volume->ondisk->vol_name);
	}
	return (error);
}
490
491 /*
492  * Execute redo operations
493  *
494  * This procedure is run at the end of the mount sequence, after the hammer
495  * mount structure has been completely initialized but before the filesystem
496  * goes live.  It can access standard cursors, the B-Tree, flush the
497  * filesystem, and so forth.
498  *
499  * This code may only be called for read-write mounts or when a mount
500  * switches from read-only to read-write.  vnodes may or may not be present.
501  *
502  * The stage1 code will have already calculated the correct FIFO range
503  * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
504  * range for REDO is stored in hmp->recover_stage2_offset.
505  */
506 int
507 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
508 {
509         hammer_blockmap_t rootmap;
510         hammer_buffer_t buffer;
511         hammer_off_t scan_offset;
512         hammer_off_t oscan_offset;
513         hammer_off_t bytes;
514         hammer_off_t ext_bytes;
515         hammer_fifo_any_t head;
516         hammer_off_t first_offset;
517         hammer_off_t last_offset;
518         hammer_off_t ext_offset;
519         struct hammer_rterm_rb_tree rterm_root;
520         u_int32_t seqno;
521         int error;
522         int verbose = 0;
523         int dorscan;
524
525         /*
526          * Stage 2 can only be run on a RW mount, or when the mount is
527          * switched from RO to RW.
528          */
529         KKASSERT(hmp->ronly == 0);
530         RB_INIT(&rterm_root);
531
532         /*
533          * Examine the UNDO FIFO.  If it is empty the filesystem is clean
534          * and no action need be taken.
535          */
536         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
537         first_offset = rootmap->first_offset;
538         last_offset  = rootmap->next_offset;
539         if (first_offset == last_offset) {
540                 KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
541                 return(0);
542         }
543
544         /*
545          * Stage2 must only be run once, and will not be run at all
546          * if Stage1 did not find a REDO_SYNC record.
547          */
548         error = 0;
549         buffer = NULL;
550
551         if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
552                 goto done;
553         hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
554         hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
555         ext_offset = hmp->recover_stage2_offset;
556         if (ext_offset == 0) {
557                 kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
558                         "offset, ignoring\n",
559                         root_volume->ondisk->vol_name);
560                 goto done;
561         }
562
563         /*
564          * Calculate nominal UNDO range (this is not yet the extended
565          * range).
566          */
567         if (last_offset >= first_offset) {
568                 bytes = last_offset - first_offset;
569         } else {
570                 bytes = rootmap->alloc_offset - first_offset +
571                         (last_offset & HAMMER_OFF_LONG_MASK);
572         }
573         kprintf("HAMMER(%s) recovery redo  %016jx-%016jx (%jd bytes)%s\n",
574                 root_volume->ondisk->vol_name,
575                 (intmax_t)first_offset,
576                 (intmax_t)last_offset,
577                 (intmax_t)bytes,
578                 (hmp->ronly ? " (RO)" : "(RW)"));
579         verbose = 1;
580         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
581                 kprintf("Undo size is absurd, unable to mount\n");
582                 error = EIO;
583                 goto fatal;
584         }
585
586         /*
587          * Scan the REDOs backwards collecting REDO_TERM_* information.
588          * This information is only collected for the extended range,
589          * non-inclusive of any TERMs in the nominal UNDO range.
590          *
591          * If the stage2 extended range is inside the nominal undo range
592          * we have nothing to scan.
593          *
594          * This must fit in memory!
595          */
596         if (first_offset < last_offset) {
597                 /*
598                  * [      first_offset........last_offset      ]
599                  */
600                 if (ext_offset < first_offset) {
601                         dorscan = 1;
602                         ext_bytes = first_offset - ext_offset;
603                 } else if (ext_offset > last_offset) {
604                         dorscan = 1;
605                         ext_bytes = (rootmap->alloc_offset - ext_offset) +
606                                     (first_offset & HAMMER_OFF_LONG_MASK);
607                 } else {
608                         ext_bytes = -(ext_offset - first_offset);
609                         dorscan = 0;
610                 }
611         } else {
612                 /*
613                  * [......last_offset         first_offset.....]
614                  */
615                 if (ext_offset < last_offset) {
616                         ext_bytes = -((rootmap->alloc_offset - first_offset) +
617                                     (ext_offset & HAMMER_OFF_LONG_MASK));
618                         dorscan = 0;
619                 } else if (ext_offset > first_offset) {
620                         ext_bytes = -(ext_offset - first_offset);
621                         dorscan = 0;
622                 } else {
623                         ext_bytes = first_offset - ext_offset;
624                         dorscan = 1;
625                 }
626         }
627
628         if (dorscan) {
629                 scan_offset = first_offset;
630                 kprintf("HAMMER(%s) Find extended redo  %016jx, %jd extbytes\n",
631                         root_volume->ondisk->vol_name,
632                         (intmax_t)ext_offset,
633                         (intmax_t)ext_bytes);
634                 seqno = hmp->recover_stage2_seqno - 1;
635                 for (;;) {
636                         head = hammer_recover_scan_rev(hmp, root_volume,
637                                                        &scan_offset,
638                                                        &error, &buffer);
639                         if (error)
640                                 break;
641                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
642                                 if (head->head.hdr_seq != seqno) {
643                                         error = ERANGE;
644                                         break;
645                                 }
646                                 error = hammer_recover_redo_rec(
647                                                 hmp, &rterm_root,
648                                                 scan_offset, &head->redo);
649                                 --seqno;
650                         }
651                         if (scan_offset == ext_offset)
652                                 break;
653                 }
654                 if (error) {
655                         kprintf("HAMMER(%s) Find extended redo failed %d, "
656                                 "unable to run REDO\n",
657                                 root_volume->ondisk->vol_name,
658                                 error);
659                         goto done;
660                 }
661         } else {
662                 kprintf("HAMMER(%s) Embedded extended redo %016jx, "
663                         "%jd extbytes\n",
664                         root_volume->ondisk->vol_name,
665                         (intmax_t)ext_offset,
666                         (intmax_t)ext_bytes);
667         }
668
669         /*
670          * Scan the REDO forwards through the entire extended range.
671          * Anything with a previously recorded matching TERM is discarded.
672          */
673         scan_offset = ext_offset;
674         bytes += ext_bytes;
675
676         /*
677          * NOTE: when doing a forward scan the returned scan_offset is
678          *       for the record following the returned record, so we
679          *       have to play a bit.
680          */
681         while ((int64_t)bytes > 0) {
682                 KKASSERT(scan_offset != last_offset);
683
684                 oscan_offset = scan_offset;
685                 head = hammer_recover_scan_fwd(hmp, root_volume,
686                                                &scan_offset, &error, &buffer);
687                 if (error)
688                         break;
689
690                 error = hammer_recover_redo_run(hmp, &rterm_root,
691                                                 oscan_offset, &head->redo);
692                 if (error) {
693                         kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
694                                 root_volume->ondisk->vol_name,
695                                 (intmax_t)scan_offset - head->head.hdr_size);
696                         break;
697                 }
698                 bytes -= head->head.hdr_size;
699         }
700         KKASSERT(error || bytes == 0);
701
702 done:
703         if (buffer) {
704                 hammer_rel_buffer(buffer, 0);
705                 buffer = NULL;
706         }
707
708         /*
709          * Cleanup rterm tree
710          */
711         {
712                 hammer_rterm_t rterm;
713                 hammer_rterm_entry_t rte;
714
715                 while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
716                         RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
717                         while ((rte = rterm->term_list) != NULL) {
718                                 rterm->term_list = rte->next;
719                                 kfree(rte, hmp->m_misc);
720                         }
721                         kfree(rterm, hmp->m_misc);
722                 }
723         }
724
725         /*
726          * Finish up flushing (or discarding) recovered buffers by executing
727          * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
728          * case tests and forces the flush in order to update the FIFO indices.
729          *
730          * If a crash occurs during the flush the entire undo/redo will be
731          * re-run during recovery on the next mount.
732          */
733         if (error == 0) {
734                 if (rootmap->first_offset != rootmap->next_offset)
735                         hmp->hflags |= HMNT_UNDO_DIRTY;
736                 hammer_flusher_sync(hmp);
737         }
738 fatal:
739         hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
740         if (verbose) {
741                 kprintf("HAMMER(%s) End redo recovery\n",
742                         root_volume->ondisk->vol_name);
743         }
744         return (error);
745 }
746
747 /*
748  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
749  * record at *scan_offsetp or NULL if an error occured.
750  *
751  * On return *scan_offsetp will be the offset of the returned record.
752  */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;
	hammer_fifo_tail_t tail;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
	/*
	 * The UNDO FIFO is circular.  Stepping backwards off the base of
	 * the UNDO zone wraps to the zone's allocation limit.
	 */
	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
		scan_offset = rootmap->alloc_offset;
	/*
	 * There must be room for at least a record tail between the zone
	 * base and scan_offset, otherwise the FIFO has underflowed.
	 */
	if (scan_offset - sizeof(*tail) <
	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
		kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	/*
	 * Read the tail of the record that ends at scan_offset.  The
	 * buffer reference is returned/recycled via *bufferp.
	 */
	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
			    errorp, bufferp);
	if (*errorp) {
		kprintf("HAMMER(%s) Unable to read UNDO TAIL "
			"at %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset - sizeof(*tail));
		return (NULL);
	}

	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
		kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
			"at %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset - sizeof(*tail));
		*errorp = EIO;
		return (NULL);
	}
	/*
	 * tail_size covers the entire record, so backing up from just
	 * past the tail by tail_size lands on the record head.  Report
	 * the head's offset back to the caller.
	 */
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	*scan_offsetp = scan_offset - head->head.hdr_size;

	return (head);
}
801
802 /*
803  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
804  * an error occured.
805  *
806  * On return *scan_offsetp will be the offset of the record following
807  * the returned record.
808  */
809 hammer_fifo_any_t
810 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
811                         hammer_off_t *scan_offsetp,
812                         int *errorp, struct hammer_buffer **bufferp)
813 {
814         hammer_off_t scan_offset;
815         hammer_blockmap_t rootmap;
816         hammer_fifo_any_t head;
817
818         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
819         scan_offset = *scan_offsetp;
820
821         if (hammer_debug_general & 0x0080)
822                 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
823         if (scan_offset == rootmap->alloc_offset)
824                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
825
826         head = hammer_bread(hmp, scan_offset, errorp, bufferp);
827         if (*errorp) {
828                 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
829                         root_volume->ondisk->vol_name,
830                         (intmax_t)scan_offset);
831                 return (NULL);
832         }
833
834         if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
835                 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
836                         "at %016jx\n",
837                         root_volume->ondisk->vol_name,
838                         (intmax_t)scan_offset);
839                 *errorp = EIO;
840                 return (NULL);
841         }
842         scan_offset += head->head.hdr_size;
843         if (scan_offset == rootmap->alloc_offset)
844                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
845         *scan_offsetp = scan_offset;
846
847         return (head);
848 }
849
850 /*
851  * Helper function for hammer_check_{head,tail}_signature().  Check stuff
852  * once the head and tail has been established.
853  *
854  * This function validates the entire FIFO record wrapper.
855  */
856 static __inline
857 int
858 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
859                         hammer_off_t beg_off)
860 {
861         hammer_off_t end_off;
862         u_int32_t crc;
863         int bytes;
864
865         /*
866          * Check signatures.  The tail signature is allowed to be the
867          * head signature only for 8-byte PADs.
868          */
869         if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
870                 kprintf("HAMMER: FIFO record bad head signature "
871                         "%04x at %016jx\n",
872                         head->hdr_signature,
873                         (intmax_t)beg_off);
874                 return(2);
875         }
876         if (head->hdr_size < HAMMER_HEAD_ALIGN ||
877             (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
878                 kprintf("HAMMER: FIFO record unaligned or bad size"
879                         "%04x at %016jx\n",
880                         head->hdr_size,
881                         (intmax_t)beg_off);
882                 return(2);
883         }
884         end_off = beg_off + head->hdr_size;
885
886         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
887             (size_t)(end_off - beg_off) != sizeof(*tail)) {
888                 if (head->hdr_type != tail->tail_type) {
889                         kprintf("HAMMER: FIFO record head/tail type mismatch "
890                                 "%04x %04x at %016jx\n",
891                                 head->hdr_type, tail->tail_type,
892                                 (intmax_t)beg_off);
893                         return(2);
894                 }
895                 if (head->hdr_size != tail->tail_size) {
896                         kprintf("HAMMER: FIFO record head/tail size mismatch "
897                                 "%04x %04x at %016jx\n",
898                                 head->hdr_size, tail->tail_size,
899                                 (intmax_t)beg_off);
900                         return(2);
901                 }
902                 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
903                         kprintf("HAMMER: FIFO record bad tail signature "
904                                 "%04x at %016jx\n",
905                                 tail->tail_signature,
906                                 (intmax_t)beg_off);
907                         return(3);
908                 }
909         }
910
911         /*
912          * Non-PAD records must have a CRC and must be sized at
913          * least large enough to fit the head and tail.
914          */
915         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
916                 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
917                       crc32(head + 1, head->hdr_size - sizeof(*head));
918                 if (head->hdr_crc != crc) {
919                         kprintf("HAMMER: FIFO record CRC failed %08x %08x "
920                                 "at %016jx\n",
921                                 head->hdr_crc, crc,
922                                 (intmax_t)beg_off);
923                         return(EIO);
924                 }
925                 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
926                         kprintf("HAMMER: FIFO record too small "
927                                 "%04x at %016jx\n",
928                                 head->hdr_size,
929                                 (intmax_t)beg_off);
930                         return(EIO);
931                 }
932         }
933
934         /*
935          * Check the tail
936          */
937         bytes = head->hdr_size;
938         tail = (void *)((char *)head + bytes - sizeof(*tail));
939         if (tail->tail_size != head->hdr_size) {
940                 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
941                         tail->tail_size, head->hdr_size,
942                         (intmax_t)beg_off);
943                 return(EIO);
944         }
945         if (tail->tail_type != head->hdr_type) {
946                 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
947                         tail->tail_type, head->hdr_type,
948                         (intmax_t)beg_off);
949                 return(EIO);
950         }
951
952         return(0);
953 }
954
955 /*
956  * Check that the FIFO record is in-bounds given the head and the
957  * hammer offset.
958  *
959  * Also checks that the head and tail structures agree with each other,
960  * but does not check beyond the signature, type, and size.
961  */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
	hammer_fifo_tail_t tail;
	hammer_off_t end_off;

	/*
	 * head overlaps buffer boundary.  This could be a PAD so only
	 * check the minimum PAD size here.
	 *
	 * The XOR trick: if the first and last byte offsets of the span
	 * differ in any bit above the buffer mask they reside in
	 * different buffers, i.e. the span crosses a buffer boundary.
	 */
	if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
		return(1);

	/*
	 * Calculate the ending offset and make sure the record does
	 * not cross a buffer boundary (same XOR test over the full
	 * record as declared by hdr_size).
	 */
	end_off = beg_off + head->hdr_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	/* locate the tail at the end of the record and cross-check */
	tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
	return (_hammer_check_signature(head, tail, beg_off));
}
985
986 /*
987  * Check that the FIFO record is in-bounds given the tail and the
988  * hammer offset.  The offset is pointing at the ending boundary of the
989  * record.
990  *
991  * Also checks that the head and tail structures agree with each other,
992  * but does not check beyond the signature, type, and size.
993  */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
	hammer_fifo_head_t head;
	hammer_off_t beg_off;

	/*
	 * tail overlaps buffer boundary.  The XOR test is non-zero when
	 * the first and last byte of the tail fall in different buffers.
	 */
	if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);

	/*
	 * Calculate the begining offset and make sure the record does
	 * not cross a buffer boundary (tail_size covers the entire
	 * record).
	 */
	beg_off = end_off - tail->tail_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	/* locate the head at the start of the record and cross-check */
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	return (_hammer_check_signature(head, tail, beg_off));
}
1016
/*
 * Apply a single UNDO record: copy its saved data image back over the
 * media location encoded in undo_offset.  Non-UNDO record types are
 * skipped (returns 0).  Returns 0 on success or an errno on failure.
 */
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
		    hammer_fifo_undo_t undo)
{
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int zone;		/* zone encoded in undo_offset */
	int error;		/* set by every path that reaches the return */
	int vol_no;
	int bytes;		/* undo payload byte count */
	u_int32_t offset;	/* byte offset within the target buffer */

	/*
	 * Only process UNDO records.  Flag if we find other records to
	 * optimize stage2 recovery.
	 */
	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
		return(0);

	/*
	 * Validate the UNDO record.  undo_data_bytes must fit in the
	 * space between the fixed UNDO header and the record tail.
	 */
	bytes = undo->head.hdr_size - sizeof(*undo) -
		sizeof(struct hammer_fifo_tail);
	if (bytes < 0 || undo->undo_data_bytes < 0 ||
	    undo->undo_data_bytes > bytes) {
		kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
			undo->undo_data_bytes, bytes);
		return(EIO);
	}

	bytes = undo->undo_data_bytes;

	/*
	 * The undo offset may only be a zone-1 or zone-2 offset.
	 *
	 * Currently we only support a zone-1 offset representing the
	 * volume header.
	 */
	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
	offset = undo->undo_offset & HAMMER_BUFMASK;

	/* the copy-back must not run past the end of the target buffer */
	if (offset + bytes > HAMMER_BUFSIZE) {
		kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
		return (EIO);
	}

	switch(zone) {
	case HAMMER_ZONE_RAW_VOLUME_INDEX:
		/* error is set by hammer_get_volume() via &error */
		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);
		if (volume == NULL) {
			kprintf("HAMMER: UNDO record, "
				"cannot access volume %d\n", vol_no);
			break;
		}
		hammer_modify_volume(NULL, volume, NULL, 0);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)volume->ondisk + offset,
					 bytes);
		hammer_modify_volume_done(volume);

		/*
		 * Multiple modifications may be made to the same buffer.
		 * Also, the volume header cannot be written out until
		 * everything else has been flushed.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 *
		 * The first UNDO applied to a volume keeps the reference
		 * (recovered = 1); later UNDOs drop theirs immediately.
		 */
		if (volume->io.recovered == 0)
			volume->io.recovered = 1;
		else
			hammer_rel_volume(volume, 0);
		break;
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
		/* error is set by hammer_get_buffer() via &error */
		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
					   0, &error);
		if (buffer == NULL) {
			kprintf("HAMMER: UNDO record, "
				"cannot access buffer %016jx\n",
				(intmax_t)undo->undo_offset);
			break;
		}
		hammer_modify_buffer(NULL, buffer, NULL, 0);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)buffer->ondisk + offset,
					 bytes);
		hammer_modify_buffer_done(buffer);

		/*
		 * Multiple modifications may be made to the same buffer,
		 * improve performance by delaying the flush.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (buffer->io.recovered == 0)
			buffer->io.recovered = 1;
		else
			hammer_rel_buffer(buffer, 0);
		break;
	default:
		kprintf("HAMMER: Corrupt UNDO record\n");
		error = EIO;
	}
	return (error);
}
1127
1128 static void
1129 hammer_recover_copy_undo(hammer_off_t undo_offset, 
1130                          char *src, char *dst, int bytes)
1131 {
1132         if (hammer_debug_general & 0x0080) {
1133                 kprintf("UNDO %016jx: %d\n",
1134                         (intmax_t)undo_offset, bytes);
1135         }
1136 #if 0
1137         kprintf("UNDO %016jx:", (intmax_t)undo_offset);
1138         hammer_recover_debug_dump(22, dst, bytes);
1139         kprintf("%22s", "to:");
1140         hammer_recover_debug_dump(22, src, bytes);
1141 #endif
1142         bcopy(src, dst, bytes);
1143 }
1144
1145 /*
1146  * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1147  * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
1148  * does not include the nominal UNDO range, just the extended range.
1149  */
1150 int
1151 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1152                         hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1153 {
1154         hammer_rterm_t rterm;
1155         hammer_rterm_t nrterm;
1156         hammer_rterm_entry_t rte;
1157
1158         if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1159                 return(0);
1160         if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1161             redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1162                 return(0);
1163         }
1164
1165         nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1166         nrterm->redo_objid = redo->redo_objid;
1167         nrterm->redo_localization = redo->redo_localization;
1168         nrterm->redo_flags = redo->redo_flags;
1169         nrterm->redo_offset = redo->redo_offset;
1170
1171         rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1172         if (rterm)
1173                 kfree(nrterm, hmp->m_misc);
1174         else
1175                 rterm = nrterm;
1176
1177         if (bootverbose) {
1178                 kprintf("record record %016jx objid %016jx "
1179                         "offset %016jx flags %08x\n",
1180                         (intmax_t)scan_offset,
1181                         (intmax_t)redo->redo_objid,
1182                         (intmax_t)redo->redo_offset,
1183                         (int)redo->redo_flags);
1184         }
1185
1186         /*
1187          * Scan in reverse order, rte prepended, so the rte list will be
1188          * in forward order.
1189          */
1190         rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1191         rte->fifo_offset = scan_offset;
1192         rte->next = rterm->term_list;
1193         rterm->term_list = rte;
1194
1195         return(0);
1196 }
1197
1198 /*
1199  * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1200  * the forwards scan of the entire extended UNDO/REDO FIFO range.
1201  *
1202  * Records matching previously recorded TERMs have already been committed
1203  * and are ignored.
1204  */
int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
	struct hammer_rterm rtval;	/* stack key for RB_FIND lookups */
	hammer_rterm_t rterm;
	hammer_rterm_entry_t rte;

	/* only REDO records participate in this pass */
	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
		return(0);

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
	case HAMMER_REDO_TRUNC:
		/*
		 * We hit a REDO request.  The REDO request is only executed
		 * if there is no matching TERM.
		 *
		 * The lookup key uses the TERM flavor of the flag so it
		 * matches TERMs recorded by the backwards scan.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_offset = redo->redo_offset;
		rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
				   HAMMER_REDO_TERM_WRITE :
				   HAMMER_REDO_TERM_TRUNC;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			/* a TERM exists: the operation already committed */
			if (bootverbose) {
				kprintf("ignore record %016jx objid %016jx "
					"offset %016jx flags %08x\n",
					(intmax_t)scan_offset,
					(intmax_t)redo->redo_objid,
					(intmax_t)redo->redo_offset,
					(int)redo->redo_flags);
			}
			break;
		}
		if (bootverbose) {
			kprintf("run    record %016jx objid %016jx "
				"offset %016jx flags %08x\n",
				(intmax_t)scan_offset,
				(intmax_t)redo->redo_objid,
				(intmax_t)redo->redo_offset,
				(int)redo->redo_flags);
		}

		/*
		 * Redo stage2 can access a live filesystem, acquire the
		 * vnode.
		 */
		hammer_recover_redo_exec(hmp, redo);
		break;
	case HAMMER_REDO_TERM_WRITE:
	case HAMMER_REDO_TERM_TRUNC:
		/*
		 * As we encounter TERMs in the forward scan we remove
		 * them.  Once the forward scan hits the nominal undo range
		 * there will be no more recorded TERMs.
		 *
		 * Only the head entry of the rterm's list is popped; the
		 * list was built in forward order so it corresponds to
		 * this TERM's fifo offset.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_flags = redo->redo_flags;
		rtval.redo_offset = redo->redo_offset;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			if ((rte = rterm->term_list) != NULL) {
				KKASSERT(rte->fifo_offset == scan_offset);
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
		}
		break;
	}
	return(0);
}
1283
/*
 * Execute a single REDO_WRITE or REDO_TRUNC operation against the live
 * filesystem: look up the inode by objid/localization, acquire its
 * vnode, and either rewrite the logged data via vn_rdwr() or truncate
 * via VOP_SETATTR().  Errors are logged but not propagated; goto-based
 * cleanup releases the transaction/inode/vnode on every path.
 */
static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
	struct hammer_transaction trans;
	struct vattr va;
	struct hammer_inode *ip;
	struct vnode *vp = NULL;
	int error;

	hammer_start_transaction(&trans, hmp);

	ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
			      HAMMER_MAX_TID, redo->redo_localization,
			      0, &error);
	if (ip == NULL) {
		kprintf("unable to find objid %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done2;
	}
	error = hammer_get_vnode(ip, &vp);
	if (error) {
		kprintf("unable to acquire vnode for %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done1;
	}

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
		error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
		if (error) {
			kprintf("vn_rdwr open %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
			break;
		}
		/* drop the vnode lock around vn_rdwr(), reacquire after */
		vn_unlock(vp);
		error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
				redo->redo_data_bytes,
				redo->redo_offset, UIO_SYSSPACE,
				0, proc0.p_ucred, NULL);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (error) {
			kprintf("write %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
		}
		VOP_CLOSE(vp, FREAD|FWRITE);
		break;
	case HAMMER_REDO_TRUNC:
		/* truncate/extend the file to the logged offset */
		VATTR_NULL(&va);
		va.va_size = redo->redo_offset;
		error = VOP_SETATTR(vp, &va, proc0.p_ucred);
		if (error) {
			kprintf("setattr offset %016jx error %d\n",
				(intmax_t)redo->redo_offset, error);
		}
		break;
	}
	vput(vp);
done1:
	hammer_rel_inode(ip, 0);
done2:
	hammer_done_transaction(&trans);
}
1348
1349 /*
1350  * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
1351  * the offset.
1352  *
1353  * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
1354  */
1355 static int
1356 hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
1357 {
1358         if (rt1->redo_objid < rt2->redo_objid)
1359                 return(-1);
1360         if (rt1->redo_objid > rt2->redo_objid)
1361                 return(1);
1362         if (rt1->redo_localization < rt2->redo_localization)
1363                 return(-1);
1364         if (rt1->redo_localization > rt2->redo_localization)
1365                 return(1);
1366         if (rt1->redo_flags < rt2->redo_flags)
1367                 return(-1);
1368         if (rt1->redo_flags > rt2->redo_flags)
1369                 return(1);
1370         if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1371                 if (rt1->redo_offset < rt2->redo_offset)
1372                         return(-1);
1373                 if (rt1->redo_offset > rt2->redo_offset)
1374                         return(1);
1375         }
1376         return(0);
1377 }
1378
#if 0

/*
 * Debug helper (compiled out): hex-dump 'bytes' bytes of buf, 16 per
 * line, with continuation lines indented by 'w' columns.
 */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int i;

	for (i = 0; i < bytes; ++i) {
		if (i && (i & 15) == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[i]);
	}
	kprintf("\n");
}

#endif
1395
1396 /*
1397  * Flush recovered buffers from recovery operations.  The call to this
1398  * routine may be delayed if a read-only mount was made and then later
1399  * upgraded to read-write.  This routine is also called when unmounting
1400  * a read-only mount to clean out recovered (dirty) buffers which we
1401  * couldn't flush (because the mount is read-only).
1402  *
1403  * The volume header is always written last.  The UNDO FIFO will be forced
1404  * to zero-length by setting next_offset to first_offset.  This leaves the
1405  * (now stale) UNDO information used to recover the disk available for
1406  * forensic analysis.
1407  *
1408  * final is typically 0 or 1.  The volume header is only written if final
1409  * is 1.  If final is -1 the recovered buffers are discarded instead of
1410  * written and root_volume can also be passed as NULL in that case.
1411  */
1412 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
1413 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
1414
void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
			     int final)
{
	/*
	 * Flush the buffers out asynchronously, wait for all the I/O to
	 * complete, then do it again to destroy the buffer cache buffer
	 * so it doesn't alias something later on.
	 */
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);
	hammer_io_wait_all(hmp, "hmrrcw", 1);
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);

	/*
	 * Flush all volume headers except the root volume.  If final < 0
	 * we discard all volume headers including the root volume.
	 * (The callback distinguishes the two cases by whether its data
	 * argument is NULL.)
	 */
	if (final >= 0) {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, root_volume);
	} else {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, NULL);
	}

	/*
	 * Finalize the root volume header.  It is written only after
	 * all other I/O has completed, and a final wait ensures it is
	 * on-media before we return.
	 *
	 * No interlock is needed, volume buffers are not
	 * messed with by bioops.
	 */
	if (root_volume && root_volume->io.recovered && final > 0) {
		hammer_io_wait_all(hmp, "hmrflx", 1);
		root_volume->io.recovered = 0;
		hammer_io_flush(&root_volume->io, 0);
		hammer_rel_volume(root_volume, 0);
		hammer_io_wait_all(hmp, "hmrfly", 1);
	}
}
1456
1457 /*
1458  * Callback to flush volume headers.  If discarding, data will be NULL
1459  * and all volume headers (including the root volume) will be discarded.
1460  * Otherwise data is the root_volume and we flush all volume headers
1461  * EXCEPT the root_volume.
1462  *
1463  * Clear any I/O error or modified condition when discarding buffers to
1464  * clean up the reference count, otherwise the buffer may have extra refs
1465  * on it.
1466  */
1467 static
1468 int
1469 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
1470 {
1471         hammer_volume_t root_volume = data;
1472
1473         if (volume->io.recovered && volume != root_volume) {
1474                 volume->io.recovered = 0;
1475                 if (root_volume != NULL) {
1476                         /*
1477                          * No interlock is needed, volume buffers are not
1478                          * messed with by bioops.
1479                          */
1480                         hammer_io_flush(&volume->io, 0);
1481                 } else {
1482                         hammer_io_clear_error(&volume->io);
1483                         hammer_io_clear_modify(&volume->io, 1);
1484                 }
1485                 hammer_rel_volume(volume, 0);
1486         }
1487         return(0);
1488 }
1489
1490 /*
1491  * Flush or discard recovered I/O buffers.
1492  *
1493  * Clear any I/O error or modified condition when discarding buffers to
1494  * clean up the reference count, otherwise the buffer may have extra refs
1495  * on it.
1496  */
1497 static
1498 int
1499 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
1500 {
1501         int final = *(int *)data;
1502         int flush;
1503
1504         if (buffer->io.recovered) {
1505                 buffer->io.recovered = 0;
1506                 buffer->io.reclaim = 1;
1507                 if (final < 0) {
1508                         hammer_io_clear_error(&buffer->io);
1509                         hammer_io_clear_modify(&buffer->io, 1);
1510                 } else {
1511                         hammer_io_write_interlock(&buffer->io);
1512                         hammer_io_flush(&buffer->io, 0);
1513                         hammer_io_done_interlock(&buffer->io);
1514                 }
1515                 hammer_rel_buffer(buffer, 0);
1516         } else {
1517                 flush = hammer_ref_interlock(&buffer->io.lock);
1518                 if (flush)
1519                         atomic_add_int(&hammer_count_refedbufs, 1);
1520
1521                 if (final < 0) {
1522                         hammer_io_clear_error(&buffer->io);
1523                         hammer_io_clear_modify(&buffer->io, 1);
1524                 }
1525                 KKASSERT(hammer_oneref(&buffer->io.lock));
1526                 buffer->io.reclaim = 1;
1527                 hammer_rel_buffer(buffer, flush);
1528         }
1529         return(0);
1530 }
1531