HAMMER VFS - REDO implementation base code part 4/many
dragonfly.git: sys/vfs/hammer/hammer_recover.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.29 2008/07/26 05:36:21 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
40                         hammer_off_t end_off);
41 static int hammer_check_head_signature(hammer_fifo_head_t head,
42                         hammer_off_t beg_off);
43 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
44                         char *src, char *dst, int bytes);
45 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
46                         hammer_volume_t root_volume,
47                         hammer_off_t *scan_offsetp,
48                         int *errorp, struct hammer_buffer **bufferp);
49 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
50                         hammer_volume_t root_volume,
51                         hammer_off_t *scan_offsetp,
52                         int *errorp, struct hammer_buffer **bufferp);
53 #if 0
54 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
55 #endif
56 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
57                         hammer_fifo_undo_t undo);
58
59 /*
60  * Recover filesystem meta-data on mount.  This procedure figures out the
61  * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
62  * resynchronized by this procedure.
63  *
64  * This procedure is run near the beginning of the mount sequence, before
65  * any B-Tree or high-level accesses are enabled, and is responsible for
66  * restoring the meta-data to a consistent state.  High level HAMMER data
67  * structures (such as the B-Tree) cannot be accessed here.
68  *
69  * NOTE: No information from the root volume has been cached in the
70  *       hammer_mount structure yet, so we need to access the root volume's
71  *       buffer directly.
72  *
73  */
75 int
76 hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
77 {
78         hammer_blockmap_t rootmap;
79         hammer_buffer_t buffer;
80         hammer_off_t scan_offset;
81         hammer_off_t scan_offset_save;
82         hammer_off_t bytes;
83         hammer_fifo_any_t head;
84         hammer_off_t first_offset;
85         hammer_off_t last_offset;
86         u_int32_t seqno;
87         int error;
88
89         /*
90          * Examine the UNDO FIFO indices in the volume header.
91          */
92         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
93         first_offset = rootmap->first_offset;
94         last_offset  = rootmap->next_offset;
95         buffer = NULL;
96         error = 0;
97
98         if (first_offset > rootmap->alloc_offset ||
99             last_offset > rootmap->alloc_offset) {
100                 kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
101                         "%016jx, %016jx limit %016jx\n",
102                         root_volume->ondisk->vol_name,
103                         (intmax_t)first_offset,
104                         (intmax_t)last_offset,
105                         (intmax_t)rootmap->alloc_offset);
106                 error = EIO;
107                 goto done;
108         }
109
110         /*
111          * In HAMMER version 4+ filesystems the volume header does NOT
112          * contain definitive UNDO FIFO state.  In particular, the
113          * rootmap->next_offset may not be indexed completely to the
114          * end of the active UNDO FIFO.
115          */
116         if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
117                 /*
118                  * To find the definitive range we must first scan backwards
119                  * from first_offset to locate the first real record and
120                  * extract the sequence number from it.  This record is not
121                  * part of the active undo space.
122                  */
123                 scan_offset = first_offset;
124                 seqno = 0;
125
126                 for (;;) {
127                         head = hammer_recover_scan_rev(hmp, root_volume,
128                                                        &scan_offset,
129                                                        &error, &buffer);
130                         if (error)
131                                 break;
132                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
133                                 seqno = head->head.hdr_seq;
134                                 break;
135                         }
136                 }
137                 if (error) {
138                         kprintf("HAMMER(%s) meta-data recovery failure "
139                                 "during seqno backscan\n",
140                                 root_volume->ondisk->vol_name);
141                         goto done;
142                 }
143
144                 /*
145                  * Scan forwards from first_offset and (seqno+1) looking
146                  * for a sequence space discontinuity.  This denotes the
147                  * end of the active FIFO area.
148                  *
149                  * NOTE: For the case where the FIFO is empty the very first
150                  *       record we find will be discontinuous.
151                  *
152                  * NOTE: Do not include trailing PADs in the scan range,
153                  *       and remember the returned scan_offset after a
154                  *       fwd iteration points to the end of the returned
155                  *       record.
156                  */
157                 kprintf("HAMMER(%s) meta-data recovery check seqno=%08x\n",
158                         root_volume->ondisk->vol_name,
159                         seqno);
160
161                 scan_offset = first_offset;
162                 scan_offset_save = scan_offset;
163                 ++seqno;
164                 for (;;) {
165                         head = hammer_recover_scan_fwd(hmp, root_volume,
166                                                        &scan_offset,
167                                                        &error, &buffer);
168                         if (error)
169                                 break;
170                         if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
171                                 if (seqno != head->head.hdr_seq) {
172                                         scan_offset = scan_offset_save;
173                                         break;
174                                 }
175                                 scan_offset_save = scan_offset;
176                                 ++seqno;
177                         }
178
179 #if 0
180                         /*
181                          * If the forward scan is grossly ahead of last_offset
182                          * then something is wrong.  last_offset is supposed
183                          * to be flushed out
184                          */
185                         if (last_offset >= scan_offset) {
186                                 bytes = last_offset - scan_offset;
187                         } else {
188                                 bytes = rootmap->alloc_offset - scan_offset +
189                                         (last_offset & HAMMER_OFF_LONG_MASK);
190                         }
191                         if (bytes >
192                             (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
193                             4 / 5) {
194                                 kprintf("HAMMER(%s) meta-data forward scan is "
195                                         "grossly beyond the last_offset in "
196                                         "the volume header, this can't be "
197                                         "right.\n",
198                                         root_volume->ondisk->vol_name);
199                                 error = EIO;
200                                 break;
201                         }
202 #endif
203                 }
204
205                 /*
206                  * Store the seqno.  This will be the next seqno we lay down
207                  * when generating new UNDOs.
208                  */
209                 hmp->undo_seqno = seqno;
210                 if (error) {
211                         kprintf("HAMMER(%s) meta-data recovery failure "
212                                 "during seqno fwdscan\n",
213                                 root_volume->ondisk->vol_name);
214                         goto done;
215                 }
216                 last_offset = scan_offset;
217                 kprintf("HAMMER(%s) meta-data recovery range %016jx-%016jx "
218                         "(invol %016jx) endseqno=%08x\n",
219                         root_volume->ondisk->vol_name,
220                         (intmax_t)first_offset,
221                         (intmax_t)last_offset,
222                         (intmax_t)rootmap->next_offset,
223                         seqno);
224         }
225
226         /*
227          * Calculate the size of the active portion of the FIFO.  If the
228          * FIFO is empty the filesystem is clean and no further action is
229          * needed.
230          */
231         if (last_offset >= first_offset) {
232                 bytes = last_offset - first_offset;
233         } else {
234                 bytes = rootmap->alloc_offset - first_offset +
235                         (last_offset & HAMMER_OFF_LONG_MASK);
236         }
237         if (bytes == 0) {
238                 error = 0;
239                 goto done;
240         }
241
242         kprintf("HAMMER(%s) Start meta-data recovery %016jx - %016jx "
243                 "(%jd bytes of UNDO)%s\n",
244                 root_volume->ondisk->vol_name,
245                 (intmax_t)first_offset,
246                 (intmax_t)last_offset,
247                 (intmax_t)bytes,
248                 (hmp->ronly ? " (RO)" : " (RW)"));
249         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
250                 kprintf("Undo size is absurd, unable to mount\n");
251                 error = EIO;
252                 goto done;
253         }
254
255         /*
256          * Scan the UNDOs backwards.
257          */
258         scan_offset = last_offset;
259
260         while ((int64_t)bytes > 0) {
261                 KKASSERT(scan_offset != first_offset);
262                 head = hammer_recover_scan_rev(hmp, root_volume,
263                                                &scan_offset, &error, &buffer);
264                 if (error)
265                         break;
266                 error = hammer_recover_undo(hmp, root_volume, &head->undo);
267                 if (error) {
268                         kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
269                                 root_volume->ondisk->vol_name,
270                                 (intmax_t)scan_offset - head->head.hdr_size);
271                         break;
272                 }
273                 bytes -= head->head.hdr_size;
274
275                 /*
276                  * If too many dirty buffers have built up we have to flush
277                  * them out.  As long as we do not flush out the volume header
278                  * a crash here should not cause any problems.
279                  *
280                  * The buffer must be released so the flush can assert that
281                  * all buffers are idle.
282                  */
283                 if (hammer_flusher_meta_limit(hmp)) {
284                         if (buffer) {
285                                 hammer_rel_buffer(buffer, 0);
286                                 buffer = NULL;
287                         }
288                         if (hmp->ronly == 0) {
289                                 hammer_recover_flush_buffers(hmp, root_volume,
290                                                              0);
291                                 kprintf("HAMMER(%s) Continuing recovery\n",
292                                         root_volume->ondisk->vol_name);
293                         } else {
294                                 kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
295                                         root_volume->ondisk->vol_name);
296                                 error = EIO;
297                                 break;
298                         }
299                 }
300         }
301 done:
302         if (buffer) {
303                 hammer_rel_buffer(buffer, 0);
304                 buffer = NULL;
305         }
306
307         /*
308          * After completely flushing all the recovered buffers the volume
309          * header will also be flushed.
310          */
311         if (root_volume->io.recovered == 0) {
312                 hammer_ref_volume(root_volume);
313                 root_volume->io.recovered = 1;
314         }
315
316         /*
317          * Finish up flushing (or discarding) recovered buffers.  FIFO
318          * indices in the volume header are updated to the actual undo
319          * range but will not be collapsed until stage 2.
320          */
321         if (error == 0) {
322                 hammer_modify_volume(NULL, root_volume, NULL, 0);
323                 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
324                 rootmap->first_offset = first_offset;
325                 rootmap->next_offset = last_offset;
326                 hammer_modify_volume_done(root_volume);
327                 if (hmp->ronly == 0)
328                         hammer_recover_flush_buffers(hmp, root_volume, 1);
329         } else {
330                 hammer_recover_flush_buffers(hmp, root_volume, -1);
331         }
332         kprintf("HAMMER(%s) End meta-data recovery\n",
333                 root_volume->ondisk->vol_name);
334         return (error);
335 }
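
/*
 * Illustrative sketch (not from the HAMMER sources) of the version-4
 * range discovery performed above: starting from the sequence number of
 * the last record before first_offset, records are consumed forward
 * while their sequence numbers stay consecutive; the first gap marks the
 * end of the active FIFO, and trailing PADs are excluded.  The record
 * source is a hypothetical callback standing in for
 * hammer_recover_scan_fwd(), and struct seq_rec is a stand-in, not the
 * on-disk layout.
 */
#include <stdint.h>

struct seq_rec {
        int      is_pad;        /* PAD records carry no live sequence number */
        uint32_t seq;
};

/*
 * Fills *recp with the next record and advances *cursor past it;
 * returns 0 on success, -1 on an I/O or signature error.
 */
typedef int (*next_record_t)(uint64_t *cursor, struct seq_rec *recp);

static int
find_fifo_end(uint64_t cursor, uint32_t seqno, next_record_t next_record,
              uint64_t *end_offsetp, uint32_t *next_seqnop)
{
        uint64_t save = cursor;
        struct seq_rec rec;

        ++seqno;                        /* first expected live sequence number */
        for (;;) {
                if (next_record(&cursor, &rec) != 0)
                        return (-1);
                if (rec.is_pad)
                        continue;       /* do not include trailing PADs */
                if (rec.seq != seqno) {
                        *end_offsetp = save;    /* sequence discontinuity */
                        *next_seqnop = seqno;   /* next seqno to lay down */
                        return (0);
                }
                save = cursor;          /* end of the last in-sequence record */
                ++seqno;
        }
}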
336
337 /*
338  * Execute redo operations
339  *
340  * This procedure is run at the end of the mount sequence, after the hammer
341  * mount structure has been completely initialized but before the filesystem
342  * goes live.  It can access standard cursors, the B-Tree, flush the
343  * filesystem, and so forth.
344  *
345  * This code may only be called for read-write mounts or when a mount
346  * switches from read-only to read-write.
347  *
348  * The stage1 code will have already calculated the correct FIFO range
349  * and stored it in the rootmap.
350  */
351 int
352 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
353 {
354         hammer_blockmap_t rootmap;
355         hammer_buffer_t buffer;
356         hammer_off_t scan_offset;
357         hammer_off_t bytes;
358         hammer_fifo_any_t head;
359         hammer_off_t first_offset;
360         hammer_off_t last_offset;
361         int error;
362
363         /*
364          * Stage 2 can only be run on a RW mount, or when the mount is
365          * switched from RO to RW.  It must be run only once.
366          */
367         KKASSERT(hmp->ronly == 0);
368
369         if (hmp->hflags & HMNT_STAGE2)
370                 return(0);
371         hmp->hflags |= HMNT_STAGE2;
372
373         /*
374          * Examine the UNDO FIFO.  If it is empty the filesystem is clean
375          * and no action need be taken.
376          */
377         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
378         first_offset = rootmap->first_offset;
379         last_offset  = rootmap->next_offset;
380         if (first_offset == last_offset)
381                 return(0);
382
383         if (last_offset >= first_offset) {
384                 bytes = last_offset - first_offset;
385         } else {
386                 bytes = rootmap->alloc_offset - first_offset +
387                         (last_offset & HAMMER_OFF_LONG_MASK);
388         }
389         kprintf("HAMMER(%s) Start redo recovery %016jx - %016jx "
390                 "(%jd bytes of UNDO)%s\n",
391                 root_volume->ondisk->vol_name,
392                 (intmax_t)first_offset,
393                 (intmax_t)last_offset,
394                 (intmax_t)bytes,
395                 (hmp->ronly ? " (RO)" : " (RW)"));
396         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
397                 kprintf("Undo size is absurd, unable to mount\n");
398                 return(EIO);
399         }
400
401         /*
402          * Scan the REDOs forwards.
403          */
404         scan_offset = first_offset;
405         buffer = NULL;
406
407         while (bytes) {
408                 KKASSERT(scan_offset != last_offset);
409
410                 head = hammer_recover_scan_fwd(hmp, root_volume,
411                                                &scan_offset, &error, &buffer);
412                 if (error)
413                         break;
414
415 #if 0
416                 error = hammer_recover_redo(hmp, root_volume, &head->redo);
417 #endif
418                 if (error) {
419                         kprintf("HAMMER(%s) REDO record at %016jx failed\n",
420                                 root_volume->ondisk->vol_name,
421                                 (intmax_t)scan_offset - head->head.hdr_size);
422                         break;
423                 }
424                 bytes -= head->head.hdr_size;
425         }
426         if (buffer) {
427                 hammer_rel_buffer(buffer, 0);
428                 buffer = NULL;
429         }
430
431         /*
432          * Finish up flushing (or discarding) recovered buffers by executing
433          * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
434          * case tests and forces the flush in order to update the FIFO indices.
435          *
436          * If a crash occurs during the flush the entire undo/redo will be
437          * re-run during recovery on the next mount.
438          */
439         if (error == 0) {
440                 if (rootmap->first_offset != rootmap->next_offset)
441                         hmp->hflags |= HMNT_UNDO_DIRTY;
442                 hammer_flusher_sync(hmp);
443         }
444         kprintf("HAMMER(%s) End redo recovery\n",
445                 root_volume->ondisk->vol_name);
446         return (error);
447 }
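
/*
 * Sketch of the circular-FIFO size computation used by both recovery
 * stages above, with plain uint64_t standing in for hammer_off_t and a
 * hypothetical OFFSET_MASK in place of HAMMER_OFF_LONG_MASK.  When the
 * next (write) index has wrapped below the first (read) index the active
 * region is the tail of the zone plus the leading portion up to the next
 * index.
 */
#include <stdint.h>

#define OFFSET_MASK     0x0fffffffffffffffULL   /* assumed: strips the zone bits */

static uint64_t
undo_fifo_active_bytes(uint64_t first_offset, uint64_t next_offset,
                       uint64_t alloc_offset)
{
        if (next_offset >= first_offset)
                return (next_offset - first_offset);    /* no wrap */
        return (alloc_offset - first_offset +           /* first..end of zone */
                (next_offset & OFFSET_MASK));           /* zone base..next */
}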
448
449 /*
450  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
451  * record at *scan_offsetp or NULL if an error occurred.
452  *
453  * On return *scan_offsetp will be the offset of the returned record.
454  */
455 hammer_fifo_any_t
456 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
457                         hammer_off_t *scan_offsetp,
458                         int *errorp, struct hammer_buffer **bufferp)
459 {
460         hammer_off_t scan_offset;
461         hammer_blockmap_t rootmap;
462         hammer_fifo_any_t head;
463         hammer_fifo_tail_t tail;
464
465         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
466         scan_offset = *scan_offsetp;
467
468         if (hammer_debug_general & 0x0080)
469                 kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
470         if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
471                 scan_offset = rootmap->alloc_offset;
472         if (scan_offset - sizeof(*tail) <
473             HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
474                 kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
475                         root_volume->ondisk->vol_name,
476                         (intmax_t)scan_offset);
477                 *errorp = EIO;
478                 return (NULL);
479         }
480         tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
481                             errorp, bufferp);
482         if (*errorp) {
483                 kprintf("HAMMER(%s) Unable to read UNDO TAIL "
484                         "at %016jx\n",
485                         root_volume->ondisk->vol_name,
486                         (intmax_t)scan_offset - sizeof(*tail));
487                 return (NULL);
488         }
489
490         if (hammer_check_tail_signature(tail, scan_offset) != 0) {
491                 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
492                         "at %016jx\n",
493                         root_volume->ondisk->vol_name,
494                         (intmax_t)scan_offset - sizeof(*tail));
495                 *errorp = EIO;
496                 return (NULL);
497         }
498         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
499         *scan_offsetp = scan_offset - head->head.hdr_size;
500
501         return (head);
502 }
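
/*
 * Sketch of the tail-to-head step the reverse scan relies on: every FIFO
 * record ends in a tail whose tail_size equals the size of the whole
 * record, so the head lies tail_size bytes back from the end of the
 * record (one tail-length past the tail pointer).  The struct is a
 * hypothetical miniature, not the on-disk hammer_fifo_tail layout.
 */
#include <stdint.h>

struct fifo_tail_sketch {
        uint16_t tail_signature;
        uint16_t tail_type;
        uint32_t tail_size;     /* size of the entire record, head through tail */
};

static const void *
head_from_tail(const struct fifo_tail_sketch *tail)
{
        const char *record_end = (const char *)tail + sizeof(*tail);

        return (record_end - tail->tail_size);
}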
503
504 /*
505  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
506  * an error occurred.
507  *
508  * On return *scan_offsetp will be the offset of the record following
509  * the returned record.
510  */
511 hammer_fifo_any_t
512 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
513                         hammer_off_t *scan_offsetp,
514                         int *errorp, struct hammer_buffer **bufferp)
515 {
516         hammer_off_t scan_offset;
517         hammer_blockmap_t rootmap;
518         hammer_fifo_any_t head;
519
520         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
521         scan_offset = *scan_offsetp;
522
523         if (hammer_debug_general & 0x0080)
524                 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
525         if (scan_offset == rootmap->alloc_offset)
526                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
527
528         head = hammer_bread(hmp, scan_offset, errorp, bufferp);
529         if (*errorp) {
530                 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
531                         root_volume->ondisk->vol_name,
532                         (intmax_t)scan_offset);
533                 return (NULL);
534         }
535
536         if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
537                 kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
538                         "at %016jx\n",
539                         root_volume->ondisk->vol_name,
540                         (intmax_t)scan_offset);
541                 *errorp = EIO;
542                 return (NULL);
543         }
544         scan_offset += head->head.hdr_size;
545         if (scan_offset == rootmap->alloc_offset)
546                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
547         *scan_offsetp = scan_offset;
548
549         return (head);
550 }
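
/*
 * Sketch of the forward-scan wrap handling above: UNDO FIFO offsets are
 * circular, so a cursor that lands exactly on alloc_offset (the end of
 * the allocated FIFO space) wraps back to offset 0 of the UNDO zone.
 * The zone encoding shown (zone number in the upper bits of a 64-bit
 * offset) is an assumption mirroring HAMMER_ZONE_ENCODE, not the kernel
 * macro itself.
 */
#include <stdint.h>

#define ZONE_ENCODE(zone, off)  (((uint64_t)(zone) << 60) | (off))
#define UNDO_ZONE               3       /* assumed HAMMER_ZONE_UNDO_INDEX */

static uint64_t
advance_fifo_cursor(uint64_t scan_offset, uint32_t hdr_size,
                    uint64_t alloc_offset)
{
        scan_offset += hdr_size;
        if (scan_offset == alloc_offset)        /* hit the end of the zone */
                scan_offset = ZONE_ENCODE(UNDO_ZONE, 0);
        return (scan_offset);
}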
551
552 /*
553  * Helper function for hammer_check_{head,tail}_signature().  Performs the
554  * common validation once the head and tail have been located.
555  *
556  * This function validates the entire FIFO record wrapper.
557  */
558 static __inline
559 int
560 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
561                         hammer_off_t beg_off)
562 {
563         hammer_off_t end_off;
564         u_int32_t crc;
565         int bytes;
566
567         /*
568          * Check signatures.  The tail signature is allowed to be the
569          * head signature only for 8-byte PADs.
570          */
571         if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
572                 kprintf("HAMMER: FIFO record bad head signature "
573                         "%04x at %016jx\n",
574                         head->hdr_signature,
575                         (intmax_t)beg_off);
576                 return(2);
577         }
578         if (head->hdr_size < HAMMER_HEAD_ALIGN ||
579             (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
580                 kprintf("HAMMER: FIFO record unaligned or bad size "
581                         "%04x at %016jx\n",
582                         head->hdr_size,
583                         (intmax_t)beg_off);
584                 return(2);
585         }
586         end_off = beg_off + head->hdr_size;
587
588         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
589             (size_t)(end_off - beg_off) != sizeof(*tail)) {
590                 if (head->hdr_type != tail->tail_type) {
591                         kprintf("HAMMER: FIFO record head/tail type mismatch "
592                                 "%04x %04x at %016jx\n",
593                                 head->hdr_type, tail->tail_type,
594                                 (intmax_t)beg_off);
595                         return(2);
596                 }
597                 if (head->hdr_size != tail->tail_size) {
598                         kprintf("HAMMER: FIFO record head/tail size mismatch "
599                                 "%04x %04x at %016jx\n",
600                                 head->hdr_size, tail->tail_size,
601                                 (intmax_t)beg_off);
602                         return(2);
603                 }
604                 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
605                         kprintf("HAMMER: FIFO record bad tail signature "
606                                 "%04x at %016jx\n",
607                                 tail->tail_signature,
608                                 (intmax_t)beg_off);
609                         return(3);
610                 }
611         }
612
613         /*
614          * Non-PAD records must have a CRC and must be large enough
615          * to hold both the head and the tail.
616          */
617         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
618                 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
619                       crc32(head + 1, head->hdr_size - sizeof(*head));
620                 if (head->hdr_crc != crc) {
621                         kprintf("HAMMER: FIFO record CRC failed %08x %08x "
622                                 "at %016jx\n",
623                                 head->hdr_crc, crc,
624                                 (intmax_t)beg_off);
625                         return(EIO);
626                 }
627                 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
628                         kprintf("HAMMER: FIFO record too small "
629                                 "%04x at %016jx\n",
630                                 head->hdr_size,
631                                 (intmax_t)beg_off);
632                         return(EIO);
633                 }
634         }
635
636         /*
637          * Check the tail
638          */
639         bytes = head->hdr_size;
640         tail = (void *)((char *)head + bytes - sizeof(*tail));
641         if (tail->tail_size != head->hdr_size) {
642                 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
643                         tail->tail_size, head->hdr_size,
644                         (intmax_t)beg_off);
645                 return(EIO);
646         }
647         if (tail->tail_type != head->hdr_type) {
648                 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
649                         tail->tail_type, head->hdr_type,
650                         (intmax_t)beg_off);
651                 return(EIO);
652         }
653
654         return(0);
655 }
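
/*
 * Sketch of the CRC check applied above to non-PAD records: one CRC is
 * taken over the head bytes that precede the stored CRC field and XORed
 * with a CRC over everything that follows the head (body plus tail).
 * The struct layout and the plain CRC-32 (IEEE polynomial) below are
 * stand-ins; the on-disk hammer_fifo_head layout and the kernel's
 * crc32() may differ.
 */
#include <stddef.h>
#include <stdint.h>

struct rec_head_sketch {        /* hypothetical miniature of hammer_fifo_head */
        uint16_t hdr_signature;
        uint16_t hdr_type;
        uint32_t hdr_size;      /* size of the whole record */
        uint32_t hdr_seq;
        uint32_t hdr_crc;       /* bytes before this field are head-covered */
};

static uint32_t
crc32_buf(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t crc = ~0U;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; ++i)
                        crc = (crc >> 1) ^ ((crc & 1U) ? 0xEDB88320U : 0U);
        }
        return (~crc);
}

static uint32_t
record_crc(const struct rec_head_sketch *head)
{
        return (crc32_buf(head, offsetof(struct rec_head_sketch, hdr_crc)) ^
                crc32_buf(head + 1, head->hdr_size - sizeof(*head)));
}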
656
657 /*
658  * Check that the FIFO record is in-bounds given the head and the
659  * hammer offset.
660  *
661  * Also checks that the head and tail structures agree with each other,
662  * but does not check beyond the signature, type, and size.
663  */
664 static int
665 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
666 {
667         hammer_fifo_tail_t tail;
668         hammer_off_t end_off;
669
670         /*
671          * head overlaps buffer boundary.  This could be a PAD so only
672          * check the minimum PAD size here.
673          */
674         if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
675                 return(1);
676
677         /*
678          * Calculate the ending offset and make sure the record does
679          * not cross a buffer boundary.
680          */
681         end_off = beg_off + head->hdr_size;
682         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
683                 return(1);
684         tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
685         return (_hammer_check_signature(head, tail, beg_off));
686 }
687
688 /*
689  * Check that the FIFO record is in-bounds given the tail and the
690  * hammer offset.  The offset is pointing at the ending boundary of the
691  * record.
692  *
693  * Also checks that the head and tail structures agree with each other,
694  * but does not check beyond the signature, type, and size.
695  */
696 static int
697 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
698 {
699         hammer_fifo_head_t head;
700         hammer_off_t beg_off;
701
702         /*
703          * tail overlaps buffer boundary
704          */
705         if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
706                 return(1);
707
708         /*
709          * Calculate the beginning offset and make sure the record does
710          * not cross a buffer boundary.
711          */
712         beg_off = end_off - tail->tail_size;
713         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
714                 return(1);
715         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
716         return (_hammer_check_signature(head, tail, beg_off));
717 }
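
/*
 * Sketch of the buffer-boundary test used by both signature checks
 * above: two byte positions lie in the same fixed-size buffer exactly
 * when their buffer-aligned prefixes match, which the XOR-and-mask
 * expresses without any division.  The 16KB buffer size is an assumed
 * stand-in for HAMMER_BUFSIZE / HAMMER_BUFMASK64.
 */
#include <stdint.h>

#define BUF_SIZE        16384ULL        /* assumed HAMMER_BUFSIZE */
#define BUF_MASK64      (BUF_SIZE - 1)

static int
record_crosses_buffer(uint64_t beg_off, uint64_t end_off)
{
        /*
         * Non-zero when the first and last byte of [beg_off, end_off)
         * fall into different buffers, i.e. the record straddles a
         * buffer boundary and must be rejected.
         */
        return (((beg_off ^ (end_off - 1)) & ~BUF_MASK64) != 0);
}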
718
719 static int
720 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
721                     hammer_fifo_undo_t undo)
722 {
723         hammer_volume_t volume;
724         hammer_buffer_t buffer;
725         hammer_off_t buf_offset;
726         int zone;
727         int error;
728         int vol_no;
729         int bytes;
730         u_int32_t offset;
731
732         /*
733          * Only process UNDO records.  Flag if we find other records to
734          * optimize stage2 recovery.
735          */
736         if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) {
737                 if (undo->head.hdr_type == HAMMER_HEAD_TYPE_REDO)
738                         hmp->hflags |= HMNT_HASREDO;
739                 return(0);
740         }
741
742         /*
743          * Validate the UNDO record.
744          */
745         bytes = undo->head.hdr_size - sizeof(*undo) -
746                 sizeof(struct hammer_fifo_tail);
747         if (bytes < 0 || undo->undo_data_bytes < 0 ||
748             undo->undo_data_bytes > bytes) {
749                 kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
750                         undo->undo_data_bytes, bytes);
751                 return(EIO);
752         }
753
754         bytes = undo->undo_data_bytes;
755
756         /*
757          * The undo offset may only be a zone-1 or zone-2 offset.
758          *
759          * A zone-1 offset refers to a raw volume header; a zone-2 offset
760          * refers to a raw buffer within a volume.
761          */
762         zone = HAMMER_ZONE_DECODE(undo->undo_offset);
763         offset = undo->undo_offset & HAMMER_BUFMASK;
764
765         if (offset + bytes > HAMMER_BUFSIZE) {
766                 kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
767                 return (EIO);
768         }
769
770         switch(zone) {
771         case HAMMER_ZONE_RAW_VOLUME_INDEX:
772                 vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
773                 volume = hammer_get_volume(hmp, vol_no, &error);
774                 if (volume == NULL) {
775                         kprintf("HAMMER: UNDO record, "
776                                 "cannot access volume %d\n", vol_no);
777                         break;
778                 }
779                 hammer_modify_volume(NULL, volume, NULL, 0);
780                 hammer_recover_copy_undo(undo->undo_offset,
781                                          (char *)(undo + 1),
782                                          (char *)volume->ondisk + offset,
783                                          bytes);
784                 hammer_modify_volume_done(volume);
785
786                 /*
787                  * Multiple modifications may be made to the same buffer.
788                  * Also, the volume header cannot be written out until
789                  * everything else has been flushed.  This also
790                  * covers the read-only case by preventing the kernel from
791                  * flushing the buffer.
792                  */
793                 if (volume->io.recovered == 0)
794                         volume->io.recovered = 1;
795                 else
796                         hammer_rel_volume(volume, 0);
797                 break;
798         case HAMMER_ZONE_RAW_BUFFER_INDEX:
799                 buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
800                 buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
801                                            0, &error);
802                 if (buffer == NULL) {
803                         kprintf("HAMMER: UNDO record, "
804                                 "cannot access buffer %016jx\n",
805                                 (intmax_t)undo->undo_offset);
806                         break;
807                 }
808                 hammer_modify_buffer(NULL, buffer, NULL, 0);
809                 hammer_recover_copy_undo(undo->undo_offset,
810                                          (char *)(undo + 1),
811                                          (char *)buffer->ondisk + offset,
812                                          bytes);
813                 hammer_modify_buffer_done(buffer);
814
815                 /*
816                  * Multiple modifications may be made to the same buffer,
817                  * improve performance by delaying the flush.  This also
818                  * covers the read-only case by preventing the kernel from
819                  * flushing the buffer.
820                  */
821                 if (buffer->io.recovered == 0)
822                         buffer->io.recovered = 1;
823                 else
824                         hammer_rel_buffer(buffer, 0);
825                 break;
826         default:
827                 kprintf("HAMMER: Corrupt UNDO record\n");
828                 error = EIO;
829         }
830         return (error);
831 }
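
/*
 * Sketch of how an UNDO record's target is located before its saved
 * image is copied back over the live meta-data.  It assumes the usual
 * HAMMER offset layout of 4 zone bits, 8 volume-number bits and a
 * 52-bit byte offset, with 16KB buffers; the macro and field names are
 * stand-ins, not the kernel's.
 */
#include <stdint.h>
#include <string.h>

#define ZONE_OF(off)    ((int)((off) >> 60))
#define VOL_OF(off)     ((int)(((off) >> 52) & 0xff))
#define BUF_MASK        0x3fffULL       /* assumed 16KB - 1 */

struct undo_target {
        int      zone;          /* 1 = raw volume header, 2 = raw buffer */
        int      vol_no;
        uint64_t buf_base;      /* buffer-aligned media offset */
        uint32_t buf_offset;    /* byte offset within that buffer */
};

static void
decode_undo_target(uint64_t undo_offset, struct undo_target *tgt)
{
        tgt->zone = ZONE_OF(undo_offset);
        tgt->vol_no = VOL_OF(undo_offset);
        tgt->buf_base = undo_offset & ~BUF_MASK;
        tgt->buf_offset = (uint32_t)(undo_offset & BUF_MASK);
}

/*
 * Applying the record is then a plain overwrite of the live bytes with
 * the pre-image stored after the UNDO head.
 */
static void
apply_undo_image(void *mapped_buffer, uint32_t buf_offset,
                 const void *saved_image, uint32_t bytes)
{
        memcpy((char *)mapped_buffer + buf_offset, saved_image, bytes);
}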
832
833 static void
834 hammer_recover_copy_undo(hammer_off_t undo_offset, 
835                          char *src, char *dst, int bytes)
836 {
837         if (hammer_debug_general & 0x0080) {
838                 kprintf("UNDO %016jx: %d\n",
839                         (intmax_t)undo_offset, bytes);
840         }
841 #if 0
842         kprintf("UNDO %016jx:", (intmax_t)undo_offset);
843         hammer_recover_debug_dump(22, dst, bytes);
844         kprintf("%22s", "to:");
845         hammer_recover_debug_dump(22, src, bytes);
846 #endif
847         bcopy(src, dst, bytes);
848 }
849
850 #if 0
851
852 static void
853 hammer_recover_debug_dump(int w, char *buf, int bytes)
854 {
855         int i;
856
857         for (i = 0; i < bytes; ++i) {
858                 if (i && (i & 15) == 0)
859                         kprintf("\n%*.*s", w, w, "");
860                 kprintf(" %02x", (unsigned char)buf[i]);
861         }
862         kprintf("\n");
863 }
864
865 #endif
866
867 /*
868  * Flush recovered buffers from recovery operations.  The call to this
869  * routine may be delayed if a read-only mount was made and then later
870  * upgraded to read-write.  This routine is also called when unmounting
871  * a read-only mount to clean out recovered (dirty) buffers which we
872  * couldn't flush (because the mount is read-only).
873  *
874  * The volume header is always written last.  The UNDO FIFO will be forced
875  * to zero-length by setting next_offset to first_offset.  This leaves the
876  * (now stale) UNDO information used to recover the disk available for
877  * forensic analysis.
878  *
879  * final is typically 0 or 1.  The volume header is only written if final
880  * is 1.  If final is -1 the recovered buffers are discarded instead of
881  * written and root_volume can also be passed as NULL in that case.
882  */
883 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
884 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
885
886 void
887 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
888                              int final)
889 {
890         /*
891          * Flush the buffers out asynchronously, wait for all the I/O to
892          * complete, then do it again to destroy the buffer cache buffer
893          * so it doesn't alias something later on.
894          */
895         RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
896                 hammer_recover_flush_buffer_callback, &final);
897         hammer_io_wait_all(hmp, "hmrrcw", 1);
898         RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
899                 hammer_recover_flush_buffer_callback, &final);
900
901         /*
902          * Flush all volume headers except the root volume.  If final < 0
903          * we discard all volume headers including the root volume.
904          */
905         if (final >= 0) {
906                 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
907                         hammer_recover_flush_volume_callback, root_volume);
908         } else {
909                 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
910                         hammer_recover_flush_volume_callback, NULL);
911         }
912
913         /*
914          * Finalize the root volume header.
915          */
916         if (root_volume && root_volume->io.recovered && final > 0) {
917                 hammer_io_wait_all(hmp, "hmrflx", 1);
918                 root_volume->io.recovered = 0;
919                 hammer_io_flush(&root_volume->io, 0);
920                 hammer_rel_volume(root_volume, 0);
921                 hammer_io_wait_all(hmp, "hmrfly", 1);
922         }
923 }
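
/*
 * Sketch of the ordering contract implemented above: every recovered
 * buffer is flushed and the I/O drained before the root volume header is
 * written, so a crash in the middle still leaves a header whose UNDO
 * range covers any partially written meta-data.  The flush and wait
 * primitives are hypothetical stand-ins for the hammer_io layer.
 */
#include <stddef.h>

struct io_unit;                         /* opaque dirty buffer or volume */

typedef void (*flush_fn)(struct io_unit *);
typedef void (*wait_all_fn)(void);

static void
flush_recovered_then_header(struct io_unit **bufs, size_t nbufs,
                            struct io_unit *root_volume_header,
                            flush_fn flush, wait_all_fn wait_all)
{
        size_t i;

        for (i = 0; i < nbufs; ++i)     /* asynchronous buffer flushes */
                flush(bufs[i]);
        wait_all();                     /* all recovered meta-data on media */
        flush(root_volume_header);      /* volume header is written strictly last */
        wait_all();
}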
924
925 /*
926  * Callback to flush volume headers.  If discarding, data will be NULL and
927  * all volume headers (including the root volume) will be discarded.
928  * Otherwise data is the root_volume and we flush all volume headers
929  * EXCEPT the root_volume.
930  *
931  * Clear any I/O error or modified condition when discarding buffers to
932  * clean up the reference count, otherwise the buffer may have extra refs
933  * on it.
934  */
935 static
936 int
937 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
938 {
939         hammer_volume_t root_volume = data;
940
941         if (volume->io.recovered && volume != root_volume) {
942                 volume->io.recovered = 0;
943                 if (root_volume != NULL) {
944                         hammer_io_flush(&volume->io, 0);
945                 } else {
946                         hammer_io_clear_error(&volume->io);
947                         hammer_io_clear_modify(&volume->io, 1);
948                 }
949                 hammer_rel_volume(volume, 0);
950         }
951         return(0);
952 }
953
954 /*
955  * Flush or discard recovered I/O buffers.
956  *
957  * Clear any I/O error or modified condition when discarding buffers to
958  * clean up the reference count, otherwise the buffer may have extra refs
959  * on it.
960  */
961 static
962 int
963 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
964 {
965         int final = *(int *)data;
966
967         if (buffer->io.recovered) {
968                 buffer->io.recovered = 0;
969                 buffer->io.reclaim = 1;
970                 if (final < 0) {
971                         hammer_io_clear_error(&buffer->io);
972                         hammer_io_clear_modify(&buffer->io, 1);
973                 } else {
974                         hammer_io_flush(&buffer->io, 0);
975                 }
976                 hammer_rel_buffer(buffer, 0);
977         } else {
978                 if (buffer->io.lock.refs == 0)
979                         ++hammer_count_refedbufs;
980                 hammer_ref(&buffer->io.lock);
981                 if (final < 0) {
982                         hammer_io_clear_error(&buffer->io);
983                         hammer_io_clear_modify(&buffer->io, 1);
984                 }
985                 KKASSERT(buffer->io.lock.refs == 1);
986                 buffer->io.reclaim = 1;
987                 hammer_rel_buffer(buffer, 1);
988         }
989         return(0);
990 }
991