Merge branch 'vendor/TCPDUMP'
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.29 2008/07/26 05:36:21 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
40                         hammer_off_t end_off);
41 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
42                         char *src, char *dst, int bytes);
43 #if 0
44 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
45 #endif
46 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
47                         hammer_fifo_undo_t undo, int bytes);
48
/*
 * Recover a filesystem on mount
 *
 * Scans the UNDO FIFO backwards from next_offset to first_offset,
 * applying each undo record to roll the media back to the last
 * consistent state, then truncates the FIFO to zero length.
 *
 * NOTE: No information from the root volume has been cached in the
 * hammer_mount structure yet, so we need to access the root volume's
 * buffer directly.
 *
 * Returns 0 on success or EIO if the FIFO is corrupt or I/O fails.
 */
int
hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t bytes;
	hammer_fifo_tail_t tail;
	hammer_fifo_undo_t undo;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	int error;

	/*
	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
	 * and no action need be taken.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];

	if (rootmap->first_offset == rootmap->next_offset)
		return(0);

	first_offset = rootmap->first_offset;
	last_offset  = rootmap->next_offset;

	/*
	 * Compute the number of FIFO bytes to process.  The second case
	 * handles a FIFO that has wrapped around the end of the undo zone.
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	kprintf("HAMMER(%s) Start Recovery %016llx - %016llx "
		"(%lld bytes of UNDO)%s\n",
		root_volume->ondisk->vol_name,
		(long long)first_offset,
		(long long)last_offset,
		(long long)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		kprintf("Undo size is absurd, unable to mount\n");
		return(EIO);
	}

	/*
	 * Scan the UNDOs backwards.
	 */
	scan_offset = last_offset;
	buffer = NULL;
	if (scan_offset > rootmap->alloc_offset) {
		kprintf("HAMMER(%s) UNDO record at %016llx FIFO overflow\n",
			root_volume->ondisk->vol_name,
			(long long)scan_offset);
		error = EIO;
		goto done;
	}

	while ((int64_t)bytes > 0) {
		if (hammer_debug_general & 0x0080)
			kprintf("scan_offset %016llx\n",
				(long long)scan_offset);

		/*
		 * Backwards scan hit the bottom of the undo zone: wrap
		 * around to the top (alloc_offset) and keep going.
		 */
		if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
			scan_offset = rootmap->alloc_offset;
			continue;
		}
		if (scan_offset - sizeof(*tail) <
		    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
			kprintf("HAMMER(%s) UNDO record at %016llx FIFO "
				"underflow\n",
				root_volume->ondisk->vol_name,
				(long long)scan_offset);
			error = EIO;
			break;
		}

		/*
		 * Every FIFO record ends with a tail.  Read the tail that
		 * ends at scan_offset so we can locate and size the record
		 * that precedes it.
		 */
		tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
				    &error, &buffer);
		if (error) {
			kprintf("HAMMER(%s) Unable to read UNDO TAIL "
				"at %016llx\n",
				root_volume->ondisk->vol_name,
				(long long)scan_offset - sizeof(*tail));
			break;
		}

		if (hammer_check_tail_signature(tail, scan_offset) != 0) {
			kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
				"at %016llx\n",
				root_volume->ondisk->vol_name,
				(long long)scan_offset - sizeof(*tail));
			error = EIO;
			break;
		}

		/*
		 * Back up from the tail to the record header and apply
		 * the undo.  The byte count passed in is bounded by the
		 * end of the buffer the record lives in.
		 */
		undo = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);

		error = hammer_recover_undo(hmp, root_volume, undo,
				HAMMER_BUFSIZE -
				(int)((char *)undo - (char *)buffer->ondisk));
		if (error) {
			kprintf("HAMMER(%s) UNDO record at %016llx failed\n",
				root_volume->ondisk->vol_name,
				(long long)scan_offset - tail->tail_size);
			break;
		}
		scan_offset -= tail->tail_size;
		bytes -= tail->tail_size;

		/*
		 * If too many dirty buffers have built up we have to flush'm
		 * out.  As long as we do not flush out the volume header
		 * a crash here should not cause any problems.
		 *
		 * buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				kprintf("HAMMER(%s) Continuing recovery\n",
					root_volume->ondisk->vol_name);
			} else {
				kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
		}
	}
done:
	if (buffer)
		hammer_rel_buffer(buffer, 0);

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.  Force the UNDO FIFO to 0-length.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  On
	 * success the FIFO is collapsed (first == next == last_offset);
	 * on failure every recovered buffer is discarded instead.
	 */
	if (error == 0) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = last_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	kprintf("HAMMER(%s) End Recovery\n", root_volume->ondisk->vol_name);
	return (error);
}
217
218 static int
219 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
220 {
221         int max_bytes;
222
223         max_bytes = ((end_off - sizeof(*tail)) & HAMMER_BUFMASK);
224         max_bytes += sizeof(*tail);
225
226         /*
227          * tail overlaps buffer boundary
228          */
229         if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) {
230                 return(1);
231         }
232
233         /*
234          * signature check, the tail signature is allowed to be the head
235          * signature only for 8-byte PADs.
236          */
237         switch(tail->tail_signature) {
238         case HAMMER_TAIL_SIGNATURE:
239                 break;
240         case HAMMER_HEAD_SIGNATURE:
241                 if (tail->tail_type != HAMMER_HEAD_TYPE_PAD ||
242                     tail->tail_size != sizeof(*tail)) {
243                         return(2);
244                 }
245                 break;
246         }
247
248         /*
249          * The undo structure must not overlap a buffer boundary.
250          */
251         if (tail->tail_size < sizeof(*tail) || tail->tail_size > max_bytes) {
252                 return(3);
253         }
254         return(0);
255 }
256
/*
 * Validate and apply a single UNDO record.
 *
 * hmp          - the mount being recovered
 * root_volume  - root volume (kept for interface symmetry with callers)
 * undo         - pointer to the record header within a FIFO buffer
 * bytes        - bytes available from undo to the end of that buffer
 *
 * PAD records and non-UNDO record types are validated and skipped.
 * Returns 0 on success or EIO if any sanity check fails.
 */
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
		    hammer_fifo_undo_t undo, int bytes)
{
	hammer_fifo_tail_t tail;
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int zone;
	int error;
	int vol_no;
	int max_bytes;
	u_int32_t offset;
	u_int32_t crc;

	/*
	 * Basic sanity checks
	 */
	if (bytes < HAMMER_HEAD_ALIGN) {
		kprintf("HAMMER: Undo alignment error (%d)\n", bytes);
		return(EIO);
	}
	if (undo->head.hdr_signature != HAMMER_HEAD_SIGNATURE) {
		kprintf("HAMMER: Bad head signature %04x\n", 
			undo->head.hdr_signature);
		return(EIO);
	}
	if (undo->head.hdr_size < HAMMER_HEAD_ALIGN ||
	    undo->head.hdr_size > bytes) {
		kprintf("HAMMER: Bad size %d\n", bytes);
		return(EIO);
	}

	/*
	 * Skip PAD records.  Note that PAD records also do not require
	 * a tail and may have a truncated structure.
	 */
	if (undo->head.hdr_type == HAMMER_HEAD_TYPE_PAD)
		return(0);

	/*
	 * Check the CRC.  The stored CRC covers the header (excluding the
	 * crc field itself) xor'd with the CRC of the record body.
	 */
	crc = crc32(undo, HAMMER_FIFO_HEAD_CRCOFF) ^
	      crc32(&undo->head + 1, undo->head.hdr_size - sizeof(undo->head));
	if (undo->head.hdr_crc != crc) {
		kprintf("HAMMER: Undo record CRC failed %08x %08x\n",
			undo->head.hdr_crc, crc);
		return(EIO);
	}


	/*
	 * Check the tail.  Its size and type must mirror the head.
	 */
	bytes = undo->head.hdr_size;
	tail = (void *)((char *)undo + bytes - sizeof(*tail));
	if (tail->tail_size != undo->head.hdr_size) {
		kprintf("HAMMER: Bad tail size %d\n", tail->tail_size);
		return(EIO);
	}
	if (tail->tail_type != undo->head.hdr_type) {
		kprintf("HAMMER: Bad tail type %d\n", tail->tail_type);
		return(EIO);
	}

	/*
	 * Only process UNDO records
	 */
	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
		return(0);

	/*
	 * Validate the UNDO record.  The payload must fit between the
	 * undo header and the tail.
	 */
	max_bytes = undo->head.hdr_size - sizeof(*undo) - sizeof(*tail);
	if (undo->undo_data_bytes < 0 || undo->undo_data_bytes > max_bytes) {
		kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
			undo->undo_data_bytes, max_bytes);
		return(EIO);
	}

	/*
	 * The undo offset may only be a zone-1 or zone-2 offset.
	 *
	 * Currently we only support a zone-1 offset representing the
	 * volume header.
	 */
	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
	offset = undo->undo_offset & HAMMER_BUFMASK;

	/* the restored span must not cross a buffer boundary */
	if (offset + undo->undo_data_bytes > HAMMER_BUFSIZE) {
		kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
		return (EIO);
	}

	switch(zone) {
	case HAMMER_ZONE_RAW_VOLUME_INDEX:
		/* undo applies to a volume header */
		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);
		if (volume == NULL) {
			kprintf("HAMMER: UNDO record, "
				"cannot access volume %d\n", vol_no);
			break;
		}
		hammer_modify_volume(NULL, volume, NULL, 0);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)volume->ondisk + offset,
					 undo->undo_data_bytes);
		hammer_modify_volume_done(volume);

		/*
		 * Multiple modifications may be made to the same buffer.
		 * Also, the volume header cannot be written out until
		 * everything else has been flushed.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (volume->io.recovered == 0)
			volume->io.recovered = 1;
		else
			hammer_rel_volume(volume, 0);
		break;
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
		/* undo applies to a raw data buffer */
		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
					   0, &error);
		if (buffer == NULL) {
			kprintf("HAMMER: UNDO record, "
				"cannot access buffer %016llx\n",
				(long long)undo->undo_offset);
			break;
		}
		hammer_modify_buffer(NULL, buffer, NULL, 0);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)buffer->ondisk + offset,
					 undo->undo_data_bytes);
		hammer_modify_buffer_done(buffer);

		/*
		 * Multiple modifications may be made to the same buffer,
		 * improve performance by delaying the flush.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (buffer->io.recovered == 0)
			buffer->io.recovered = 1;
		else
			hammer_rel_buffer(buffer, 0);
		break;
	default:
		kprintf("HAMMER: Corrupt UNDO record\n");
		error = EIO;
	}
	return (error);
}
415
416 static void
417 hammer_recover_copy_undo(hammer_off_t undo_offset, 
418                          char *src, char *dst, int bytes)
419 {
420         if (hammer_debug_general & 0x0080) {
421                 kprintf("UNDO %016llx: %d\n",
422                         (long long)undo_offset, bytes);
423         }
424 #if 0
425         kprintf("UNDO %016llx:", (long long)undo_offset);
426         hammer_recover_debug_dump(22, dst, bytes);
427         kprintf("%22s", "to:");
428         hammer_recover_debug_dump(22, src, bytes);
429 #endif
430         bcopy(src, dst, bytes);
431 }
432
#if 0

/*
 * Debug helper: hex-dump bytes bytes from buf, 16 per line, with
 * continuation lines padded to width w.  (Compiled out.)
 */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int n;

	for (n = 0; n < bytes; ++n) {
		/* start a fresh padded line every 16 bytes */
		if (n && (n & 15) == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[n]);
	}
	kprintf("\n");
}

#endif
449
/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written and root_volume can also be passed as NULL in that case.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
			     int final)
{
	/*
	 * Flush the buffers out asynchronously, wait for all the I/O to
	 * complete, then do it again to destroy the buffer cache buffer
	 * so it doesn't alias something later on.
	 */
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);
	hammer_io_wait_all(hmp, "hmrrcw");
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);

	/*
	 * Flush all volume headers except the root volume.  If final < 0
	 * we discard all volume headers including the root volume.
	 */
	if (final >= 0) {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, root_volume);
	} else {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, NULL);
	}

	/*
	 * Finalize the root volume header.  Wait for all other I/O to
	 * drain first so the header hits the media strictly last.
	 */
	if (root_volume && root_volume->io.recovered && final > 0) {
		crit_enter();
		while (hmp->io_running_space > 0)
			tsleep(&hmp->io_running_space, 0, "hmrflx", 0);
		crit_exit();
		root_volume->io.recovered = 0;
		hammer_io_flush(&root_volume->io);
		hammer_rel_volume(root_volume, 0);
	}
}
507
508 /*
509  * Callback to flush volume headers.  If discarding data will be NULL and
510  * all volume headers (including the root volume) will be discarded.
511  * Otherwise data is the root_volume and we flush all volume headers
512  * EXCEPT the root_volume.
513  */
514 static
515 int
516 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
517 {
518         hammer_volume_t root_volume = data;
519
520         if (volume->io.recovered && volume != root_volume) {
521                 volume->io.recovered = 0;
522                 if (root_volume != NULL)
523                         hammer_io_flush(&volume->io);
524                 else
525                         hammer_io_clear_modify(&volume->io, 1);
526                 hammer_rel_volume(volume, 0);
527         }
528         return(0);
529 }
530
531 static
532 int
533 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
534 {
535         int final = *(int *)data;
536
537         if (buffer->io.recovered) {
538                 buffer->io.recovered = 0;
539                 buffer->io.reclaim = 1;
540                 if (final < 0)
541                         hammer_io_clear_modify(&buffer->io, 1);
542                 else
543                         hammer_io_flush(&buffer->io);
544                 hammer_rel_buffer(buffer, 0);
545         } else {
546                 KKASSERT(buffer->io.lock.refs == 0);
547                 ++hammer_count_refedbufs;
548                 hammer_ref(&buffer->io.lock);
549                 buffer->io.reclaim = 1;
550                 hammer_rel_buffer(buffer, 1);
551         }
552         return(0);
553 }
554