Merge branch 'master' into net80211-update
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.29 2008/07/26 05:36:21 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
40                         hammer_off_t end_off);
41 static int hammer_check_head_signature(hammer_fifo_head_t head,
42                         hammer_off_t beg_off);
43 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
44                         char *src, char *dst, int bytes);
45 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
46                         hammer_volume_t root_volume,
47                         hammer_off_t *scan_offsetp,
48                         int *errorp, struct hammer_buffer **bufferp);
49 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
50                         hammer_volume_t root_volume,
51                         hammer_off_t *scan_offsetp,
52                         int *errorp, struct hammer_buffer **bufferp);
53 #if 0
54 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
55 #endif
56 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
57                         hammer_fifo_undo_t undo);
58
59 /*
60  * Recover filesystem meta-data on mount.  This procedure figures out the
61  * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
62  * resynchronized by this procedure.
63  *
64  * This procedure is run near the beginning of the mount sequence, before
65  * any B-Tree or high-level accesses are enabled, and is responsible for
66  * restoring the meta-data to a consistent state.  High level HAMMER data
67  * structures (such as the B-Tree) cannot be accessed here.
68  *
69  * NOTE: No information from the root volume has been cached in the
70  *       hammer_mount structure yet, so we need to access the root volume's
71  *       buffer directly.
72  *
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;		/* current FIFO buffer, recycled by the scan helpers */
	hammer_off_t scan_offset;
	hammer_off_t scan_offset_save;
	hammer_off_t bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	u_int32_t seqno;
	int error;
	int degenerate_case = 0;	/* set when the FIFO is empty (clean mount) */

	/*
	 * Examine the UNDO FIFO indices in the volume header.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset  = rootmap->next_offset;
	buffer = NULL;
	error = 0;

	/*
	 * Both indices must fall within the allocated UNDO space or the
	 * volume header is corrupt and recovery cannot proceed.
	 */
	if (first_offset > rootmap->alloc_offset ||
	    last_offset > rootmap->alloc_offset) {
		kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
			"%016jx, %016jx limit %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			(intmax_t)rootmap->alloc_offset);
		error = EIO;
		goto done;
	}

	/*
	 * In HAMMER version 4+ filesystems the volume header does NOT
	 * contain definitive UNDO FIFO state.  In particular, the
	 * rootmap->next_offset may not be indexed completely to the
	 * end of the active UNDO FIFO.
	 */
	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
		/*
		 * To find the definitive range we must first scan backwards
		 * from first_offset to locate the first real record and
		 * extract the sequence number from it.  This record is not
		 * part of the active undo space.
		 */
		scan_offset = first_offset;
		seqno = 0;

		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			/* skip PADs; only real records carry a seqno */
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				seqno = head->head.hdr_seq;
				break;
			}
		}
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno backscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}

		/*
		 * Scan forwards from first_offset and (seqno+1) looking
		 * for a sequence space discontinuity.  This denotes the
		 * end of the active FIFO area.
		 *
		 * NOTE: For the case where the FIFO is empty the very first
		 *	 record we find will be discontinuous.
		 *
		 * NOTE: Do not include trailing PADs in the scan range,
		 *	 and remember the returned scan_offset after a
		 *	 fwd iteration points to the end of the returned
		 *	 record.
		 */
		kprintf("HAMMER(%s) recovery check seqno=%08x\n",
			root_volume->ondisk->vol_name,
			seqno);

		scan_offset = first_offset;
		scan_offset_save = scan_offset;
		++seqno;
		for (;;) {
			head = hammer_recover_scan_fwd(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				/*
				 * Discontinuity: back up to the end of the
				 * last record that was in sequence.
				 */
				if (seqno != head->head.hdr_seq) {
					scan_offset = scan_offset_save;
					break;
				}
				scan_offset_save = scan_offset;
				++seqno;
			}

#if 0
			/*
			 * If the forward scan is grossly ahead of last_offset
			 * then something is wrong.  last_offset is supposed
			 * to be flushed out
			 */
			if (last_offset >= scan_offset) {
				bytes = last_offset - scan_offset;
			} else {
				bytes = rootmap->alloc_offset - scan_offset +
					(last_offset & HAMMER_OFF_LONG_MASK);
			}
			if (bytes >
			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
			    4 / 5) {
				kprintf("HAMMER(%s) recovery forward scan is "
					"grossly beyond the last_offset in "
					"the volume header, this can't be "
					"right.\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
#endif
		}

		/*
		 * Store the seqno.  This will be the next seqno we lay down
		 * when generating new UNDOs.
		 */
		hmp->undo_seqno = seqno;
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno fwdscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}
		last_offset = scan_offset;
		kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
			"HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			root_volume->ondisk->vol_name,
			(intmax_t)rootmap->next_offset,
			seqno);
	}

	/*
	 * Calculate the size of the active portion of the FIFO.  If the
	 * FIFO is empty the filesystem is clean and no further action is
	 * needed.  The second branch handles FIFO wrap-around.
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	if (bytes == 0) {
		degenerate_case = 1;
		error = 0;
		goto done;
	}

	kprintf("HAMMER(%s) Start recovery undo %016jx - %016jx "
		"(%jd bytes of UNDO)%s\n",
		root_volume->ondisk->vol_name,
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		kprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto done;
	}

	/*
	 * Scan the UNDOs backwards.  The signed cast in the loop guard
	 * protects against a corrupt hdr_size decrementing bytes past zero.
	 */
	scan_offset = last_offset;

	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != first_offset);
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;
		error = hammer_recover_undo(hmp, root_volume, &head->undo);
		if (error) {
			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
				root_volume->ondisk->vol_name,
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}
		bytes -= head->head.hdr_size;

		/*
		 * If too many dirty buffers have built up we have to flush'm
		 * out.  As long as we do not flush out the volume header
		 * a crash here should not cause any problems.
		 *
		 * buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				kprintf("HAMMER(%s) Continuing recovery\n",
					root_volume->ondisk->vol_name);
			} else {
				kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
		}
	}
done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  FIFO
	 * indices in the volume header are updated to the actual undo
	 * range but will not be collapsed until stage 2.
	 */
	if (error == 0) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = first_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		/* on error discard everything we queued up */
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	if (degenerate_case == 0) {
		kprintf("HAMMER(%s) recovery complete\n",
			root_volume->ondisk->vol_name);
	} else {
		kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
			root_volume->ondisk->vol_name);
	}
	return (error);
}
344
345 /*
346  * Execute redo operations
347  *
348  * This procedure is run at the end of the mount sequence, after the hammer
349  * mount structure has been completely initialized but before the filesystem
350  * goes live.  It can access standard cursors, the B-Tree, flush the
351  * filesystem, and so forth.
352  *
353  * This code may only be called for read-write mounts or when a mount
354  * switches from read-only to read-write.  vnodes may or may not be present.
355  *
356  * The stage1 code will have already calculated the correct FIFO range
357  * and stored it in the rootmap.
358  */
359 int
360 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
361 {
362         hammer_blockmap_t rootmap;
363         hammer_buffer_t buffer;
364         hammer_off_t scan_offset;
365         hammer_off_t bytes;
366         hammer_fifo_any_t head;
367         hammer_off_t first_offset;
368         hammer_off_t last_offset;
369         int error;
370
371         /*
372          * Stage 2 can only be run on a RW mount, or when the mount is
373          * switched from RO to RW.  It must be run only once.
374          */
375         KKASSERT(hmp->ronly == 0);
376
377         if (hmp->hflags & HMNT_STAGE2)
378                 return(0);
379         hmp->hflags |= HMNT_STAGE2;
380
381         /*
382          * Examine the UNDO FIFO.  If it is empty the filesystem is clean
383          * and no action need be taken.
384          */
385         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
386         first_offset = rootmap->first_offset;
387         last_offset  = rootmap->next_offset;
388         if (first_offset == last_offset)
389                 return(0);
390
391         if (last_offset >= first_offset) {
392                 bytes = last_offset - first_offset;
393         } else {
394                 bytes = rootmap->alloc_offset - first_offset +
395                         (last_offset & HAMMER_OFF_LONG_MASK);
396         }
397         kprintf("HAMMER(%s) Start recovery redo %016jx - %016jx "
398                 "(%jd bytes of REDO)%s\n",
399                 root_volume->ondisk->vol_name,
400                 (intmax_t)first_offset,
401                 (intmax_t)last_offset,
402                 (intmax_t)bytes,
403                 (hmp->ronly ? " (RO)" : "(RW)"));
404         if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
405                 kprintf("Undo size is absurd, unable to mount\n");
406                 return(EIO);
407         }
408
409         /*
410          * Scan the REDOs forwards.
411          */
412         scan_offset = first_offset;
413         buffer = NULL;
414
415         while (bytes) {
416                 KKASSERT(scan_offset != last_offset);
417
418                 head = hammer_recover_scan_fwd(hmp, root_volume,
419                                                &scan_offset, &error, &buffer);
420                 if (error)
421                         break;
422
423 #if 0
424                 error = hammer_recover_redo(hmp, root_volume, &head->redo);
425 #endif
426                 if (error) {
427                         kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
428                                 root_volume->ondisk->vol_name,
429                                 (intmax_t)scan_offset - head->head.hdr_size);
430                         break;
431                 }
432                 bytes -= head->head.hdr_size;
433         }
434         if (buffer) {
435                 hammer_rel_buffer(buffer, 0);
436                 buffer = NULL;
437         }
438
439         /*
440          * Finish up flushing (or discarding) recovered buffers by executing
441          * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
442          * case tests and forces the flush in order to update the FIFO indices.
443          *
444          * If a crash occurs during the flush the entire undo/redo will be
445          * re-run during recovery on the next mount.
446          */
447         if (error == 0) {
448                 if (rootmap->first_offset != rootmap->next_offset)
449                         hmp->hflags |= HMNT_UNDO_DIRTY;
450                 hammer_flusher_sync(hmp);
451         }
452         kprintf("HAMMER(%s) End redo recovery\n",
453                 root_volume->ondisk->vol_name);
454         return (error);
455 }
456
457 /*
458  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
459  * record at *scan_offsetp or NULL if an error occured.
460  *
461  * On return *scan_offsetp will be the offset of the returned record.
462  */
463 hammer_fifo_any_t
464 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
465                         hammer_off_t *scan_offsetp,
466                         int *errorp, struct hammer_buffer **bufferp)
467 {
468         hammer_off_t scan_offset;
469         hammer_blockmap_t rootmap;
470         hammer_fifo_any_t head;
471         hammer_fifo_tail_t tail;
472
473         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
474         scan_offset = *scan_offsetp;
475
476         if (hammer_debug_general & 0x0080)
477                 kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
478         if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
479                 scan_offset = rootmap->alloc_offset;
480         if (scan_offset - sizeof(*tail) <
481             HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
482                 kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
483                         root_volume->ondisk->vol_name,
484                         (intmax_t)scan_offset);
485                 *errorp = EIO;
486                 return (NULL);
487         }
488         tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
489                             errorp, bufferp);
490         if (*errorp) {
491                 kprintf("HAMMER(%s) Unable to read UNDO TAIL "
492                         "at %016jx\n",
493                         root_volume->ondisk->vol_name,
494                         (intmax_t)scan_offset - sizeof(*tail));
495                 return (NULL);
496         }
497
498         if (hammer_check_tail_signature(tail, scan_offset) != 0) {
499                 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
500                         "at %016jx\n",
501                         root_volume->ondisk->vol_name,
502                         (intmax_t)scan_offset - sizeof(*tail));
503                 *errorp = EIO;
504                 return (NULL);
505         }
506         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
507         *scan_offsetp = scan_offset - head->head.hdr_size;
508
509         return (head);
510 }
511
512 /*
513  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
514  * an error occured.
515  *
516  * On return *scan_offsetp will be the offset of the record following
517  * the returned record.
518  */
519 hammer_fifo_any_t
520 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
521                         hammer_off_t *scan_offsetp,
522                         int *errorp, struct hammer_buffer **bufferp)
523 {
524         hammer_off_t scan_offset;
525         hammer_blockmap_t rootmap;
526         hammer_fifo_any_t head;
527
528         rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
529         scan_offset = *scan_offsetp;
530
531         if (hammer_debug_general & 0x0080)
532                 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
533         if (scan_offset == rootmap->alloc_offset)
534                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
535
536         head = hammer_bread(hmp, scan_offset, errorp, bufferp);
537         if (*errorp) {
538                 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
539                         root_volume->ondisk->vol_name,
540                         (intmax_t)scan_offset);
541                 return (NULL);
542         }
543
544         if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
545                 kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
546                         "at %016jx\n",
547                         root_volume->ondisk->vol_name,
548                         (intmax_t)scan_offset);
549                 *errorp = EIO;
550                 return (NULL);
551         }
552         scan_offset += head->head.hdr_size;
553         if (scan_offset == rootmap->alloc_offset)
554                 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
555         *scan_offsetp = scan_offset;
556
557         return (head);
558 }
559
560 /*
561  * Helper function for hammer_check_{head,tail}_signature().  Check stuff
562  * once the head and tail has been established.
563  *
564  * This function validates the entire FIFO record wrapper.
565  */
566 static __inline
567 int
568 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
569                         hammer_off_t beg_off)
570 {
571         hammer_off_t end_off;
572         u_int32_t crc;
573         int bytes;
574
575         /*
576          * Check signatures.  The tail signature is allowed to be the
577          * head signature only for 8-byte PADs.
578          */
579         if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
580                 kprintf("HAMMER: FIFO record bad head signature "
581                         "%04x at %016jx\n",
582                         head->hdr_signature,
583                         (intmax_t)beg_off);
584                 return(2);
585         }
586         if (head->hdr_size < HAMMER_HEAD_ALIGN ||
587             (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
588                 kprintf("HAMMER: FIFO record unaligned or bad size"
589                         "%04x at %016jx\n",
590                         head->hdr_size,
591                         (intmax_t)beg_off);
592                 return(2);
593         }
594         end_off = beg_off + head->hdr_size;
595
596         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
597             (size_t)(end_off - beg_off) != sizeof(*tail)) {
598                 if (head->hdr_type != tail->tail_type) {
599                         kprintf("HAMMER: FIFO record head/tail type mismatch "
600                                 "%04x %04x at %016jx\n",
601                                 head->hdr_type, tail->tail_type,
602                                 (intmax_t)beg_off);
603                         return(2);
604                 }
605                 if (head->hdr_size != tail->tail_size) {
606                         kprintf("HAMMER: FIFO record head/tail size mismatch "
607                                 "%04x %04x at %016jx\n",
608                                 head->hdr_size, tail->tail_size,
609                                 (intmax_t)beg_off);
610                         return(2);
611                 }
612                 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
613                         kprintf("HAMMER: FIFO record bad tail signature "
614                                 "%04x at %016jx\n",
615                                 tail->tail_signature,
616                                 (intmax_t)beg_off);
617                         return(3);
618                 }
619         }
620
621         /*
622          * Non-PAD records must have a CRC and must be sized at
623          * least large enough to fit the head and tail.
624          */
625         if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
626                 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
627                       crc32(head + 1, head->hdr_size - sizeof(*head));
628                 if (head->hdr_crc != crc) {
629                         kprintf("HAMMER: FIFO record CRC failed %08x %08x "
630                                 "at %016jx\n",
631                                 head->hdr_crc, crc,
632                                 (intmax_t)beg_off);
633                         return(EIO);
634                 }
635                 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
636                         kprintf("HAMMER: FIFO record too small "
637                                 "%04x at %016jx\n",
638                                 head->hdr_size,
639                                 (intmax_t)beg_off);
640                         return(EIO);
641                 }
642         }
643
644         /*
645          * Check the tail
646          */
647         bytes = head->hdr_size;
648         tail = (void *)((char *)head + bytes - sizeof(*tail));
649         if (tail->tail_size != head->hdr_size) {
650                 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
651                         tail->tail_size, head->hdr_size,
652                         (intmax_t)beg_off);
653                 return(EIO);
654         }
655         if (tail->tail_type != head->hdr_type) {
656                 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
657                         tail->tail_type, head->hdr_type,
658                         (intmax_t)beg_off);
659                 return(EIO);
660         }
661
662         return(0);
663 }
664
665 /*
666  * Check that the FIFO record is in-bounds given the head and the
667  * hammer offset.
668  *
669  * Also checks that the head and tail structures agree with each other,
670  * but does not check beyond the signature, type, and size.
671  */
672 static int
673 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
674 {
675         hammer_fifo_tail_t tail;
676         hammer_off_t end_off;
677
678         /*
679          * head overlaps buffer boundary.  This could be a PAD so only
680          * check the minimum PAD size here.
681          */
682         if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
683                 return(1);
684
685         /*
686          * Calculate the ending offset and make sure the record does
687          * not cross a buffer boundary.
688          */
689         end_off = beg_off + head->hdr_size;
690         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
691                 return(1);
692         tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
693         return (_hammer_check_signature(head, tail, beg_off));
694 }
695
696 /*
697  * Check that the FIFO record is in-bounds given the tail and the
698  * hammer offset.  The offset is pointing at the ending boundary of the
699  * record.
700  *
701  * Also checks that the head and tail structures agree with each other,
702  * but does not check beyond the signature, type, and size.
703  */
704 static int
705 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
706 {
707         hammer_fifo_head_t head;
708         hammer_off_t beg_off;
709
710         /*
711          * tail overlaps buffer boundary
712          */
713         if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
714                 return(1);
715
716         /*
717          * Calculate the begining offset and make sure the record does
718          * not cross a buffer boundary.
719          */
720         beg_off = end_off - tail->tail_size;
721         if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
722                 return(1);
723         head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
724         return (_hammer_check_signature(head, tail, beg_off));
725 }
726
/*
 * Apply a single UNDO fifo record during recovery.
 *
 * Non-UNDO records are skipped (returning 0); a REDO record sets
 * HMNT_HASREDO on the mount so later recovery stages know REDO
 * records are present.
 *
 * For a valid UNDO record the saved image is copied back over the
 * target volume header (zone-1) or raw buffer (zone-2) in memory.
 * The first modification to a given volume/buffer leaves io.recovered
 * set with a reference held so the dirty image can be flushed (or
 * discarded) later by hammer_recover_flush_buffers().
 *
 * Returns 0 on success or EIO if the record is corrupt.
 *
 * NOTE(review): root_volume is not referenced in this function —
 * presumably retained for call-signature symmetry; confirm against
 * callers before removing.
 */
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                    hammer_fifo_undo_t undo)
{
        hammer_volume_t volume;
        hammer_buffer_t buffer;
        hammer_off_t buf_offset;
        int zone;
        int error;
        int vol_no;
        int bytes;
        u_int32_t offset;

        /*
         * Only process UNDO records.  Flag if we find other records to
         * optimize stage2 recovery.
         */
        if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) {
                if (undo->head.hdr_type == HAMMER_HEAD_TYPE_REDO)
                        hmp->hflags |= HMNT_HASREDO;
                return(0);
        }

        /*
         * Validate the UNDO record.  hdr_size covers the undo header,
         * the payload, and the trailing fifo tail, so the payload
         * (undo_data_bytes) must fit in what remains after the fixed
         * structures.
         */
        bytes = undo->head.hdr_size - sizeof(*undo) -
                sizeof(struct hammer_fifo_tail);
        if (bytes < 0 || undo->undo_data_bytes < 0 ||
            undo->undo_data_bytes > bytes) {
                kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
                        undo->undo_data_bytes, bytes);
                return(EIO);
        }

        bytes = undo->undo_data_bytes;

        /*
         * The undo offset may only be a zone-1 or zone-2 offset.
         *
         * Currently we only support a zone-1 offset representing the
         * volume header.
         */
        zone = HAMMER_ZONE_DECODE(undo->undo_offset);
        offset = undo->undo_offset & HAMMER_BUFMASK;

        /* target range must lie wholly within one buffer */
        if (offset + bytes > HAMMER_BUFSIZE) {
                kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
                return (EIO);
        }

        switch(zone) {
        case HAMMER_ZONE_RAW_VOLUME_INDEX:
                /* zone-1: undo applies to a volume header */
                vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
                volume = hammer_get_volume(hmp, vol_no, &error);
                if (volume == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access volume %d\n", vol_no);
                        break;
                }
                hammer_modify_volume(NULL, volume, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)volume->ondisk + offset,
                                         bytes);
                hammer_modify_volume_done(volume);

                /*
                 * Multiple modifications may be made to the same buffer.
                 * Also, the volume header cannot be written out until
                 * everything else has been flushed.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (volume->io.recovered == 0)
                        volume->io.recovered = 1;       /* keep the ref */
                else
                        hammer_rel_volume(volume, 0);
                break;
        case HAMMER_ZONE_RAW_BUFFER_INDEX:
                /* zone-2: undo applies to a raw buffer */
                buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
                buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
                                           0, &error);
                if (buffer == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access buffer %016jx\n",
                                (intmax_t)undo->undo_offset);
                        break;
                }
                hammer_modify_buffer(NULL, buffer, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)buffer->ondisk + offset,
                                         bytes);
                hammer_modify_buffer_done(buffer);

                /*
                 * Multiple modifications may be made to the same buffer,
                 * improve performance by delaying the flush.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (buffer->io.recovered == 0)
                        buffer->io.recovered = 1;       /* keep the ref */
                else
                        hammer_rel_buffer(buffer, 0);
                break;
        default:
                kprintf("HAMMER: Corrupt UNDO record\n");
                error = EIO;
        }
        return (error);
}
840
841 static void
842 hammer_recover_copy_undo(hammer_off_t undo_offset, 
843                          char *src, char *dst, int bytes)
844 {
845         if (hammer_debug_general & 0x0080) {
846                 kprintf("UNDO %016jx: %d\n",
847                         (intmax_t)undo_offset, bytes);
848         }
849 #if 0
850         kprintf("UNDO %016jx:", (intmax_t)undo_offset);
851         hammer_recover_debug_dump(22, dst, bytes);
852         kprintf("%22s", "to:");
853         hammer_recover_debug_dump(22, src, bytes);
854 #endif
855         bcopy(src, dst, bytes);
856 }
857
#if 0

/*
 * Debug-only helper (currently compiled out): hex-dump 'bytes' bytes
 * from buf, 16 bytes per line, indenting continuation lines by 'w'
 * columns to line up with a caller-printed prefix.
 */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
        int i;

        for (i = 0; i < bytes; ++i) {
                if (i && (i & 15) == 0)
                        kprintf("\n%*.*s", w, w, "");
                kprintf(" %02x", (unsigned char)buf[i]);
        }
        kprintf("\n");
}

#endif
874
875 /*
876  * Flush recovered buffers from recovery operations.  The call to this
877  * routine may be delayed if a read-only mount was made and then later
878  * upgraded to read-write.  This routine is also called when unmounting
879  * a read-only mount to clean out recovered (dirty) buffers which we
880  * couldn't flush (because the mount is read-only).
881  *
882  * The volume header is always written last.  The UNDO FIFO will be forced
883  * to zero-length by setting next_offset to first_offset.  This leaves the
884  * (now stale) UNDO information used to recover the disk available for
885  * forensic analysis.
886  *
887  * final is typically 0 or 1.  The volume header is only written if final
888  * is 1.  If final is -1 the recovered buffers are discarded instead of
889  * written and root_volume can also be passed as NULL in that case.
890  */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

/*
 * Flush (final >= 0) or discard (final < 0) all buffers and volume
 * headers dirtied by recovery.  See the block comment above for the
 * full contract; ordering matters: data buffers first, then non-root
 * volume headers, then (only if final > 0) the root volume header last.
 */
void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
                             int final)
{
        /*
         * Flush the buffers out asynchronously, wait for all the I/O to
         * complete, then do it again to destroy the buffer cache buffer
         * so it doesn't alias something later on.
         */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
        hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);

        /*
         * Flush all volume headers except the root volume.  If final < 0
         * we discard all volume headers including the root volume.
         */
        if (final >= 0) {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, root_volume);
        } else {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, NULL);
        }

        /*
         * Finalize the root volume header.  Wait for all prior I/O to
         * complete before writing it, and again after, so the header
         * only hits the media once everything else is stable.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
                hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
                hammer_io_wait_all(hmp, "hmrfly", 1);
        }
}
932
933 /*
934  * Callback to flush volume headers.  If discarding data will be NULL and
935  * all volume headers (including the root volume) will be discarded.
936  * Otherwise data is the root_volume and we flush all volume headers
937  * EXCEPT the root_volume.
938  *
939  * Clear any I/O error or modified condition when discarding buffers to
940  * clean up the reference count, otherwise the buffer may have extra refs
941  * on it.
942  */
943 static
944 int
945 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
946 {
947         hammer_volume_t root_volume = data;
948
949         if (volume->io.recovered && volume != root_volume) {
950                 volume->io.recovered = 0;
951                 if (root_volume != NULL) {
952                         hammer_io_flush(&volume->io, 0);
953                 } else {
954                         hammer_io_clear_error(&volume->io);
955                         hammer_io_clear_modify(&volume->io, 1);
956                 }
957                 hammer_rel_volume(volume, 0);
958         }
959         return(0);
960 }
961
962 /*
963  * Flush or discard recovered I/O buffers.
964  *
965  * Clear any I/O error or modified condition when discarding buffers to
966  * clean up the reference count, otherwise the buffer may have extra refs
967  * on it.
968  */
969 static
970 int
971 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
972 {
973         int final = *(int *)data;
974         int flush;
975
976         if (buffer->io.recovered) {
977                 buffer->io.recovered = 0;
978                 buffer->io.reclaim = 1;
979                 if (final < 0) {
980                         hammer_io_clear_error(&buffer->io);
981                         hammer_io_clear_modify(&buffer->io, 1);
982                 } else {
983                         hammer_io_flush(&buffer->io, 0);
984                 }
985                 hammer_rel_buffer(buffer, 0);
986         } else {
987                 flush = hammer_ref_interlock(&buffer->io.lock);
988                 if (flush)
989                         ++hammer_count_refedbufs;
990
991                 if (final < 0) {
992                         hammer_io_clear_error(&buffer->io);
993                         hammer_io_clear_modify(&buffer->io, 1);
994                 }
995                 KKASSERT(hammer_oneref(&buffer->io.lock));
996                 buffer->io.reclaim = 1;
997                 hammer_rel_buffer(buffer, flush);
998         }
999         return(0);
1000 }
1001