sys/vfs/hammer/hammer_recover.c
/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * UNDO ALGORITHM:
 *
 *	The UNDO algorithm is trivial.  The nominal UNDO range in the
 *	FIFO is determined by taking the first/next offset stored in
 *	the volume header.  The next offset may not be correct since
 *	UNDO flushes are not required to flush the volume header, so
 *	the code also scans forward until it finds a discontinuous
 *	sequence number.
 *
 *	The UNDOs are then scanned and executed in reverse order.  These
 *	UNDOs are effectively just data restorations based on HAMMER offsets.
 *
 * REDO ALGORITHM:
 *
 *	REDO records are laid down in the UNDO/REDO FIFO for nominal
 *	writes, truncations, and file extension ops.  On a per-inode
 *	basis two types of REDO records are generated, REDO_WRITE
 *	and REDO_TRUNC.
 *
 *	Essentially the recovery block will contain UNDO records backing
 *	out partial operations and REDO records to regenerate those partial
 *	operations guaranteed by the filesystem during recovery.
 *
 *	REDO generation is optional, and can also be started and then
 *	later stopped due to excessive write()s in between fsyncs, or not
 *	started at all.  Because of this the recovery code must determine
 *	when REDOs are valid and when they are not.  Additional records are
 *	generated to help figure it out.
 *
 *	The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
 *	during a flush cycle indicating which records the flush cycle
 *	has synched meta-data for, and HAMMER_REDO_SYNC is generated in
 *	each flush cycle to indicate how far back in the UNDO/REDO FIFO
 *	the recovery code must go to find the earliest applicable REDO
 *	record.  Applicable REDO records can be far outside the nominal
 *	UNDO recovery range, for example if a write() lays down a REDO but
 *	the related file is not flushed for several cycles.
 *
 *	The SYNC reference is to a point prior to the nominal UNDO FIFO
 *	range, creating an extended REDO range which must be scanned.
 *
 *	Any REDO_WRITE/REDO_TRUNC encountered within the extended range
 *	which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
 *	prior to the start of the nominal UNDO range are applicable.
 *	That is, any REDO_TERM_* records in the extended range but not in
 *	the nominal undo range will mask any redo operations for prior REDO
 *	records.  This is necessary because once the TERM is laid down
 *	followup operations may make additional changes to the related
 *	records but not necessarily record them as REDOs (because REDOs are
 *	optional).
 *
 *	REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
 *	must be ignored since they represent meta-data flushes which are
 *	undone by the UNDOs in that nominal UNDO range by the recovery
 *	code.  Only REDO_TERM_* records in the extended range but not
 *	in the nominal undo range are applicable.
 *
 *	The REDO_SYNC record itself always exists in the nominal UNDO range
 *	(this is how the extended range is determined).  For recovery
 *	purposes the most recent REDO_SYNC record is always used if several
 *	are found.
 *
 * CRASHES DURING UNDO/REDO
 *
 *	A crash during the UNDO phase requires no additional effort.  The
 *	UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
 *	remains unchanged and has no re-crash issues.
 *
 *	A crash during the REDO phase is more complex because the REDOs
 *	run normal filesystem ops and generate additional UNDO/REDO records.
 *	REDO is disabled during REDO recovery and any SYNC records generated
 *	by flushes during REDO recovery must continue to reference the
 *	original extended range.
 *
 *	If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
 *	may become impossible.  This is detected when the start of the
 *	extended range fails to have monotonically increasing sequence
 *	numbers leading into the nominal undo range.
 */
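
/*
 * Illustration of the two recovery ranges (assuming no FIFO wrap):
 *
 *	           extended REDO range         nominal UNDO range
 *	       |<------------------------>|<------------------------>|
 *	  ext_offset                 first_offset                last_offset
 *
 * ext_offset comes from the most recent REDO_SYNC record found in the
 * nominal range.  Stage1 executes the UNDOs backwards over the nominal
 * range; stage2 scans REDO records forwards over the entire extended
 * range.
 */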

#include "hammer.h"

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
        struct hammer_rterm_entry *next;
        hammer_off_t            fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
        RB_ENTRY(hammer_rterm)  rb_node;
        int64_t                 redo_objid;
        u_int32_t               redo_localization;
        u_int32_t               redo_flags;
        hammer_off_t            redo_offset;
        hammer_rterm_entry_t    term_list;
} *hammer_rterm_t;
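
/*
 * Example of how these are used: the reverse scan creates one
 * hammer_rterm node per (objid, localization, flags[, offset]) key it
 * encounters for REDO_TERM_WRITE/REDO_TERM_TRUNC records, pushing each
 * TERM's FIFO offset onto term_list.  Because the scan runs backwards
 * and prepends, the list ends up in forward FIFO order, and the forward
 * scan pops entries as it re-encounters the TERM records.
 */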

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
                        hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
                        hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
                        char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
                        hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *       hammer_mount structure yet, so we need to access the root volume's
 *       buffer directly.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t scan_offset_save;
        hammer_off_t bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        u_int32_t seqno;
        int error;
        int degenerate_case = 0;

        /*
         * Examine the UNDO FIFO indices in the volume header.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset = rootmap->next_offset;
        buffer = NULL;
        error = 0;

        hmp->recover_stage2_offset = 0;

        if (first_offset > rootmap->alloc_offset ||
            last_offset > rootmap->alloc_offset) {
                kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
                        "%016jx, %016jx limit %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)first_offset,
                        (intmax_t)last_offset,
                        (intmax_t)rootmap->alloc_offset);
                error = EIO;
                goto done;
        }

        /*
         * In HAMMER version 4+ filesystems the volume header does NOT
         * contain definitive UNDO FIFO state.  In particular, the
         * rootmap->next_offset may not be indexed completely to the
         * end of the active UNDO FIFO.
         */
        if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
                /*
                 * To find the definitive range we must first scan backwards
                 * from first_offset to locate the first real record and
                 * extract the sequence number from it.  This record is not
                 * part of the active undo space.
                 */
                scan_offset = first_offset;
                seqno = 0;

                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                seqno = head->head.hdr_seq;
                                break;
                        }
                }
                if (error) {
                        kprintf("HAMMER(%s) recovery failure "
                                "during seqno backscan\n",
                                root_volume->ondisk->vol_name);
                        goto done;
                }

                /*
                 * Scan forwards from first_offset and (seqno+1) looking
                 * for a sequence space discontinuity.  This denotes the
                 * end of the active FIFO area.
                 *
                 * NOTE: For the case where the FIFO is empty the very first
                 *       record we find will be discontinuous.
                 *
                 * NOTE: Do not include trailing PADs in the scan range,
                 *       and remember the returned scan_offset after a
                 *       fwd iteration points to the end of the returned
                 *       record.
                 */
                kprintf("HAMMER(%s) recovery check seqno=%08x\n",
                        root_volume->ondisk->vol_name,
                        seqno);

                scan_offset = first_offset;
                scan_offset_save = scan_offset;
                ++seqno;
                hmp->recover_stage2_seqno = seqno;

                for (;;) {
                        head = hammer_recover_scan_fwd(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (seqno != head->head.hdr_seq) {
                                        scan_offset = scan_offset_save;
                                        break;
                                }
                                scan_offset_save = scan_offset;
                                ++seqno;
                        }

#if 0
                        /*
                         * If the forward scan is grossly ahead of last_offset
                         * then something is wrong.  last_offset is supposed
                         * to be flushed out
                         */
                        if (last_offset >= scan_offset) {
                                bytes = last_offset - scan_offset;
                        } else {
                                bytes = rootmap->alloc_offset - scan_offset +
                                        (last_offset & HAMMER_OFF_LONG_MASK);
                        }
                        if (bytes >
                            (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
                            4 / 5) {
                                kprintf("HAMMER(%s) recovery forward scan is "
                                        "grossly beyond the last_offset in "
                                        "the volume header, this can't be "
                                        "right.\n",
                                        root_volume->ondisk->vol_name);
                                error = EIO;
                                break;
                        }
#endif
                }

                /*
                 * Store the seqno.  This will be the next seqno we lay down
                 * when generating new UNDOs.
                 */
                hmp->undo_seqno = seqno;
                if (error) {
                        kprintf("HAMMER(%s) recovery failure "
                                "during seqno fwdscan\n",
                                root_volume->ondisk->vol_name);
                        goto done;
                }
                last_offset = scan_offset;
                kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
                        "HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)first_offset,
                        (intmax_t)last_offset,
                        root_volume->ondisk->vol_name,
                        (intmax_t)rootmap->next_offset,
                        seqno);
        }

        /*
         * Calculate the size of the active portion of the FIFO.  If the
         * FIFO is empty the filesystem is clean and no further action is
         * needed.
         */
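        /*
         * Illustration of the wrapped case handled below: when the FIFO
         * has wrapped, last_offset is numerically below first_offset, so
         * the active size is the tail piece (alloc_offset - first_offset)
         * plus the head piece (last_offset with its zone bits stripped by
         * HAMMER_OFF_LONG_MASK, i.e. its distance from the zone base).
         */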
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
        if (bytes == 0) {
                degenerate_case = 1;
                error = 0;
                goto done;
        }

        kprintf("HAMMER(%s) recovery undo %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto done;
        }

        /*
         * Scan the UNDOs backwards.
         */
        scan_offset = last_offset;

        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != first_offset);
                head = hammer_recover_scan_rev(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                /*
                 * Normal UNDO
                 */
                error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error) {
                        kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }

                /*
                 * The first REDO_SYNC record encountered (scanning backwards)
                 * enables REDO processing.
                 */
                if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
                    head->redo.redo_flags == HAMMER_REDO_SYNC) {
                        if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
                                kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
                                        "records in UNDO/REDO FIFO.\n",
                                        root_volume->ondisk->vol_name);
                        } else {
                                hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
                                hmp->recover_stage2_offset =
                                        head->redo.redo_offset;
                                kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
                                        root_volume->ondisk->vol_name,
                                        (intmax_t)head->redo.redo_offset);
                        }
                }

                bytes -= head->head.hdr_size;

                /*
                 * If too many dirty buffers have built up we have to flush
                 * them out.  As long as we do not flush out the volume header
                 * a crash here should not cause any problems.
                 *
                 * buffer must be released so the flush can assert that
                 * all buffers are idle.
                 */
                if (hammer_flusher_meta_limit(hmp)) {
                        if (buffer) {
                                hammer_rel_buffer(buffer, 0);
                                buffer = NULL;
                        }
                        if (hmp->ronly == 0) {
                                hammer_recover_flush_buffers(hmp, root_volume,
                                                             0);
                                kprintf("HAMMER(%s) Continuing recovery\n",
                                        root_volume->ondisk->vol_name);
                        } else {
                                kprintf("HAMMER(%s) Recovery failure: "
                                        "Insufficient buffer cache to hold "
                                        "dirty buffers on read-only mount!\n",
                                        root_volume->ondisk->vol_name);
                                error = EIO;
                                break;
                        }
                }
        }
        KKASSERT(error || bytes == 0);
done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * After completely flushing all the recovered buffers the volume
         * header will also be flushed.
         */
        if (root_volume->io.recovered == 0) {
                hammer_ref_volume(root_volume);
                root_volume->io.recovered = 1;
        }

        /*
         * Finish up flushing (or discarding) recovered buffers.  FIFO
         * indices in the volume header are updated to the actual undo
         * range but will not be collapsed until stage 2.
         */
        if (error == 0) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
                rootmap->first_offset = first_offset;
                rootmap->next_offset = last_offset;
                hammer_modify_volume_done(root_volume);
                if (hmp->ronly == 0)
                        hammer_recover_flush_buffers(hmp, root_volume, 1);
        } else {
                hammer_recover_flush_buffers(hmp, root_volume, -1);
        }
        if (degenerate_case == 0) {
                kprintf("HAMMER(%s) recovery complete\n",
                        root_volume->ondisk->vol_name);
        } else {
                kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
                        root_volume->ondisk->vol_name);
        }
        return (error);
}

/*
 * Execute redo operations.
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.  It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t oscan_offset;
        hammer_off_t bytes;
        hammer_off_t ext_bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        hammer_off_t ext_offset;
        struct hammer_rterm_rb_tree rterm_root;
        u_int32_t seqno;
        int error;
        int verbose = 0;
        int dorscan;

        /*
         * Stage 2 can only be run on a RW mount, or when the mount is
         * switched from RO to RW.
         */
        KKASSERT(hmp->ronly == 0);
        RB_INIT(&rterm_root);

        /*
         * Examine the UNDO FIFO.  If it is empty the filesystem is clean
         * and no action need be taken.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset = rootmap->next_offset;
        if (first_offset == last_offset) {
                KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
                return(0);
        }

        /*
         * Stage2 must only be run once, and will not be run at all
         * if Stage1 did not find a REDO_SYNC record.
         */
        error = 0;
        buffer = NULL;

        if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
                goto done;
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
        hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
        ext_offset = hmp->recover_stage2_offset;
        if (ext_offset == 0) {
                kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
                        "offset, ignoring\n",
                        root_volume->ondisk->vol_name);
                goto done;
        }

        /*
         * Calculate nominal UNDO range (this is not yet the extended
         * range).
         */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
        kprintf("HAMMER(%s) recovery redo %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
        verbose = 1;
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto fatal;
        }

        /*
         * Scan the REDOs backwards collecting REDO_TERM_* information.
         * This information is only collected for the extended range,
         * non-inclusive of any TERMs in the nominal UNDO range.
         *
         * If the stage2 extended range is inside the nominal undo range
         * we have nothing to scan.
         *
         * This must fit in memory!
         */
        if (first_offset < last_offset) {
                /*
                 * [ first_offset........last_offset ]
                 */
                if (ext_offset < first_offset) {
                        dorscan = 1;
                        ext_bytes = first_offset - ext_offset;
                } else if (ext_offset > last_offset) {
                        dorscan = 1;
                        ext_bytes = (rootmap->alloc_offset - ext_offset) +
                                    (first_offset & HAMMER_OFF_LONG_MASK);
                } else {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                }
        } else {
                /*
                 * [......last_offset first_offset.....]
                 */
                if (ext_offset < last_offset) {
                        ext_bytes = -((rootmap->alloc_offset - first_offset) +
                                      (ext_offset & HAMMER_OFF_LONG_MASK));
                        dorscan = 0;
                } else if (ext_offset > first_offset) {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                } else {
                        ext_bytes = first_offset - ext_offset;
                        dorscan = 1;
                }
        }
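
        /*
         * Illustration, for the unwrapped layout handled in the first
         * branch above: with a nominal range [first_offset, last_offset),
         * an ext_offset below first_offset (or wrapped around past
         * last_offset) means the REDO_SYNC target lies outside the
         * nominal range and the extension must be reverse-scanned
         * (dorscan = 1).  An ext_offset falling inside the nominal range
         * yields a negative ext_bytes and no extra reverse scan.
         */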

        if (dorscan) {
                scan_offset = first_offset;
                kprintf("HAMMER(%s) Find extended redo %016jx, %jd extbytes\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
                seqno = hmp->recover_stage2_seqno - 1;
                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (head->head.hdr_seq != seqno) {
                                        error = ERANGE;
                                        break;
                                }
                                error = hammer_recover_redo_rec(
                                                hmp, &rterm_root,
                                                scan_offset, &head->redo);
                                --seqno;
                        }
                        if (scan_offset == ext_offset)
                                break;
                }
                if (error) {
                        kprintf("HAMMER(%s) Find extended redo failed %d, "
                                "unable to run REDO\n",
                                root_volume->ondisk->vol_name,
                                error);
                        goto done;
                }
        } else {
                kprintf("HAMMER(%s) Embedded extended redo %016jx, "
                        "%jd extbytes\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
        }

        /*
         * Scan the REDO forwards through the entire extended range.
         * Anything with a previously recorded matching TERM is discarded.
         */
        scan_offset = ext_offset;
        bytes += ext_bytes;

        /*
         * NOTE: when doing a forward scan the returned scan_offset is
         *       for the record following the returned record, so we
         *       have to play a bit.
         */
        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != last_offset);

                oscan_offset = scan_offset;
                head = hammer_recover_scan_fwd(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                error = hammer_recover_redo_run(hmp, &rterm_root,
                                                oscan_offset, &head->redo);
                if (error) {
                        kprintf("HAMMER(%s) REDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }
                bytes -= head->head.hdr_size;
        }
        KKASSERT(error || bytes == 0);

done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * Cleanup rterm tree
         */
        {
                hammer_rterm_t rterm;
                hammer_rterm_entry_t rte;

                while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
                        RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
                        while ((rte = rterm->term_list) != NULL) {
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                        kfree(rterm, hmp->m_misc);
                }
        }

        /*
         * Finish up flushing (or discarding) recovered buffers by executing
         * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
         * case tests and forces the flush in order to update the FIFO indices.
         *
         * If a crash occurs during the flush the entire undo/redo will be
         * re-run during recovery on the next mount.
         */
        if (error == 0) {
                if (rootmap->first_offset != rootmap->next_offset)
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                hammer_flusher_sync(hmp);
        }
fatal:
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
        if (verbose) {
                kprintf("HAMMER(%s) End redo recovery\n",
                        root_volume->ondisk->vol_name);
        }
        return (error);
}

/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;
        hammer_fifo_tail_t tail;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
                scan_offset = rootmap->alloc_offset;
        if (scan_offset - sizeof(*tail) <
            HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
                kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
                            errorp, bufferp);
        if (*errorp) {
                kprintf("HAMMER(%s) Unable to read UNDO TAIL "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset - sizeof(*tail));
                return (NULL);
        }

        if (hammer_check_tail_signature(tail, scan_offset) != 0) {
                kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset - sizeof(*tail));
                *errorp = EIO;
                return (NULL);
        }
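        /*
         * The tail occupies the final sizeof(*tail) bytes of the record
         * and tail_size is the size of the whole record, so backing up
         * (tail_size - sizeof(*tail)) bytes from the tail locates the
         * record's head.
         */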
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        *scan_offsetp = scan_offset - head->head.hdr_size;

        return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);

        head = hammer_bread(hmp, scan_offset, errorp, bufferp);
        if (*errorp) {
                kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                return (NULL);
        }

        if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
                kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        scan_offset += head->head.hdr_size;
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
        *scan_offsetp = scan_offset;

        return (head);
}

/*
 * Helper function for hammer_check_{head,tail}_signature().  Check stuff
 * once the head and tail have been established.
 *
 * This function validates the entire FIFO record wrapper.
 */
static __inline
int
_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
                        hammer_off_t beg_off)
{
        hammer_off_t end_off;
        u_int32_t crc;
        int bytes;

        /*
         * Check signatures.  The tail signature is allowed to be the
         * head signature only for 8-byte PADs.
         */
        if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
                kprintf("HAMMER: FIFO record bad head signature "
                        "%04x at %016jx\n",
                        head->hdr_signature,
                        (intmax_t)beg_off);
                return(2);
        }
        if (head->hdr_size < HAMMER_HEAD_ALIGN ||
            (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
                kprintf("HAMMER: FIFO record unaligned or bad size "
                        "%04x at %016jx\n",
                        head->hdr_size,
                        (intmax_t)beg_off);
                return(2);
        }
        end_off = beg_off + head->hdr_size;

        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
            (size_t)(end_off - beg_off) != sizeof(*tail)) {
                if (head->hdr_type != tail->tail_type) {
                        kprintf("HAMMER: FIFO record head/tail type mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_type, tail->tail_type,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (head->hdr_size != tail->tail_size) {
                        kprintf("HAMMER: FIFO record head/tail size mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_size, tail->tail_size,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
                        kprintf("HAMMER: FIFO record bad tail signature "
                                "%04x at %016jx\n",
                                tail->tail_signature,
                                (intmax_t)beg_off);
                        return(3);
                }
        }

        /*
         * Non-PAD records must have a CRC and must be sized at
         * least large enough to fit the head and tail.
         */
        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
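                /*
                 * The CRC is computed in two pieces: the head structure up
                 * to (but not including) the hdr_crc field, XOR'd with the
                 * CRC of the remainder of the record following the head.
                 */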
                crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
                      crc32(head + 1, head->hdr_size - sizeof(*head));
                if (head->hdr_crc != crc) {
                        kprintf("HAMMER: FIFO record CRC failed %08x %08x "
                                "at %016jx\n",
                                head->hdr_crc, crc,
                                (intmax_t)beg_off);
                        return(EIO);
                }
                if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
                        kprintf("HAMMER: FIFO record too small "
                                "%04x at %016jx\n",
                                head->hdr_size,
                                (intmax_t)beg_off);
                        return(EIO);
                }
        }

        /*
         * Check the tail
         */
        bytes = head->hdr_size;
        tail = (void *)((char *)head + bytes - sizeof(*tail));
        if (tail->tail_size != head->hdr_size) {
                kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
                        tail->tail_size, head->hdr_size,
                        (intmax_t)beg_off);
                return(EIO);
        }
        if (tail->tail_type != head->hdr_type) {
                kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
                        tail->tail_type, head->hdr_type,
                        (intmax_t)beg_off);
                return(EIO);
        }

        return(0);
}

/*
 * Check that the FIFO record is in-bounds given the head and the
 * hammer offset.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
        hammer_fifo_tail_t tail;
        hammer_off_t end_off;

        /*
         * head overlaps buffer boundary.  This could be a PAD so only
         * check the minimum PAD size here.
         */
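        /*
         * Boundary test: two byte offsets lie in the same underlying
         * buffer exactly when they agree in all bits above the buffer
         * mask, so XORing the first and last byte of a span and masking
         * with ~HAMMER_BUFMASK64 is non-zero only when the span crosses
         * a buffer boundary.
         */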
        if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
                return(1);

        /*
         * Calculate the ending offset and make sure the record does
         * not cross a buffer boundary.
         */
        end_off = beg_off + head->hdr_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
        return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Check that the FIFO record is in-bounds given the tail and the
 * hammer offset.  The offset is pointing at the ending boundary of the
 * record.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
        hammer_fifo_head_t head;
        hammer_off_t beg_off;

        /*
         * tail overlaps buffer boundary
         */
        if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);

        /*
         * Calculate the beginning offset and make sure the record does
         * not cross a buffer boundary.
         */
        beg_off = end_off - tail->tail_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        return (_hammer_check_signature(head, tail, beg_off));
}

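/*
 * Apply a single UNDO record: copy the saved data image back over the
 * media.  The undo_offset may reference the root volume header (zone-1)
 * or a raw big-block buffer (zone-2); non-UNDO record types pass
 * through untouched.
 */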
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                    hammer_fifo_undo_t undo)
{
        hammer_volume_t volume;
        hammer_buffer_t buffer;
        hammer_off_t buf_offset;
        int zone;
        int error;
        int vol_no;
        int bytes;
        u_int32_t offset;

        /*
         * Only process UNDO records.  Flag if we find other records to
         * optimize stage2 recovery.
         */
        if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
                return(0);

        /*
         * Validate the UNDO record.
         */
        bytes = undo->head.hdr_size - sizeof(*undo) -
                sizeof(struct hammer_fifo_tail);
        if (bytes < 0 || undo->undo_data_bytes < 0 ||
            undo->undo_data_bytes > bytes) {
                kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
                        undo->undo_data_bytes, bytes);
                return(EIO);
        }

        bytes = undo->undo_data_bytes;

        /*
         * The undo offset may only be a zone-1 or zone-2 offset.
         *
         * Currently we only support a zone-1 offset representing the
         * volume header.
         */
        zone = HAMMER_ZONE_DECODE(undo->undo_offset);
        offset = undo->undo_offset & HAMMER_BUFMASK;

        if (offset + bytes > HAMMER_BUFSIZE) {
                kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
                return (EIO);
        }

        switch(zone) {
        case HAMMER_ZONE_RAW_VOLUME_INDEX:
                vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
                volume = hammer_get_volume(hmp, vol_no, &error);
                if (volume == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access volume %d\n", vol_no);
                        break;
                }
                hammer_modify_volume(NULL, volume, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)volume->ondisk + offset,
                                         bytes);
                hammer_modify_volume_done(volume);

                /*
                 * Multiple modifications may be made to the same buffer.
                 * Also, the volume header cannot be written out until
                 * everything else has been flushed.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (volume->io.recovered == 0)
                        volume->io.recovered = 1;
                else
                        hammer_rel_volume(volume, 0);
                break;
        case HAMMER_ZONE_RAW_BUFFER_INDEX:
                buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
                buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
                                           0, &error);
                if (buffer == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access buffer %016jx\n",
                                (intmax_t)undo->undo_offset);
                        break;
                }
                hammer_modify_buffer(NULL, buffer, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)buffer->ondisk + offset,
                                         bytes);
                hammer_modify_buffer_done(buffer);

                /*
                 * Multiple modifications may be made to the same buffer,
                 * improve performance by delaying the flush.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (buffer->io.recovered == 0)
                        buffer->io.recovered = 1;
                else
                        hammer_rel_buffer(buffer, 0);
                break;
        default:
                kprintf("HAMMER: Corrupt UNDO record\n");
                error = EIO;
        }
        return (error);
}

static void
hammer_recover_copy_undo(hammer_off_t undo_offset,
                         char *src, char *dst, int bytes)
{
        if (hammer_debug_general & 0x0080) {
                kprintf("UNDO %016jx: %d\n",
                        (intmax_t)undo_offset, bytes);
        }
#if 0
        kprintf("UNDO %016jx:", (intmax_t)undo_offset);
        hammer_recover_debug_dump(22, dst, bytes);
        kprintf("%22s", "to:");
        hammer_recover_debug_dump(22, src, bytes);
#endif
        bcopy(src, dst, bytes);
}

/*
 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
 * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
 * does not include the nominal UNDO range, just the extended range.
 */
int
hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        hammer_rterm_t rterm;
        hammer_rterm_t nrterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);
        if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
            redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                return(0);
        }

        nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
        nrterm->redo_objid = redo->redo_objid;
        nrterm->redo_localization = redo->redo_localization;
        nrterm->redo_flags = redo->redo_flags;
        nrterm->redo_offset = redo->redo_offset;

        rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
        if (rterm)
                kfree(nrterm, hmp->m_misc);
        else
                rterm = nrterm;

        if (bootverbose) {
                kprintf("record record %016jx objid %016jx "
                        "offset %016jx flags %08x\n",
                        (intmax_t)scan_offset,
                        (intmax_t)redo->redo_objid,
                        (intmax_t)redo->redo_offset,
                        (int)redo->redo_flags);
        }

        /*
         * Scan in reverse order, rte prepended, so the rte list will be
         * in forward order.
         */
        rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
        rte->fifo_offset = scan_offset;
        rte->next = rterm->term_list;
        rterm->term_list = rte;

        return(0);
}

/*
 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
 * the forwards scan of the entire extended UNDO/REDO FIFO range.
 *
 * Records matching previously recorded TERMs have already been committed
 * and are ignored.
 */
int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        struct hammer_rterm rtval;
        hammer_rterm_t rterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
        case HAMMER_REDO_TRUNC:
                /*
                 * We hit a REDO request.  The REDO request is only executed
                 * if there is no matching TERM.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_offset = redo->redo_offset;
                rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
                                   HAMMER_REDO_TERM_WRITE :
                                   HAMMER_REDO_TERM_TRUNC;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if (bootverbose) {
                                kprintf("ignore record %016jx objid %016jx "
                                        "offset %016jx flags %08x\n",
                                        (intmax_t)scan_offset,
                                        (intmax_t)redo->redo_objid,
                                        (intmax_t)redo->redo_offset,
                                        (int)redo->redo_flags);
                        }
                        break;
                }
                if (bootverbose) {
                        kprintf("run record %016jx objid %016jx "
                                "offset %016jx flags %08x\n",
                                (intmax_t)scan_offset,
                                (intmax_t)redo->redo_objid,
                                (intmax_t)redo->redo_offset,
                                (int)redo->redo_flags);
                }

                /*
                 * Redo stage2 can access a live filesystem, acquire the
                 * vnode.
                 */
                hammer_recover_redo_exec(hmp, redo);
                break;
        case HAMMER_REDO_TERM_WRITE:
        case HAMMER_REDO_TERM_TRUNC:
                /*
                 * As we encounter TERMs in the forward scan we remove
                 * them.  Once the forward scan hits the nominal undo range
                 * there will be no more recorded TERMs.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_flags = redo->redo_flags;
                rtval.redo_offset = redo->redo_offset;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if ((rte = rterm->term_list) != NULL) {
                                KKASSERT(rte->fifo_offset == scan_offset);
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                }
                break;
        }
        return(0);
}

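/*
 * Replay a single REDO record against the live filesystem: look up the
 * inode by objid, acquire its vnode, then either rewrite the logged data
 * (REDO_WRITE, via vn_rdwr()) or redo the truncation (REDO_TRUNC, via
 * VOP_SETATTR()).  Errors are reported but do not abort the scan.
 */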
static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
        struct hammer_transaction trans;
        struct vattr va;
        struct hammer_inode *ip;
        struct vnode *vp = NULL;
        int error;

        hammer_start_transaction(&trans, hmp);

        ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
                              HAMMER_MAX_TID, redo->redo_localization,
                              0, &error);
        if (ip == NULL) {
                kprintf("unable to find objid %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done2;
        }
        error = hammer_get_vnode(ip, &vp);
        if (error) {
                kprintf("unable to acquire vnode for %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done1;
        }

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
                error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
                if (error) {
                        kprintf("vn_rdwr open %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                        break;
                }
                vn_unlock(vp);
                error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
                                redo->redo_data_bytes,
                                redo->redo_offset, UIO_SYSSPACE,
                                0, proc0.p_ucred, NULL);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                if (error) {
                        kprintf("write %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                }
                VOP_CLOSE(vp, FREAD|FWRITE);
                break;
        case HAMMER_REDO_TRUNC:
                VATTR_NULL(&va);
                va.va_size = redo->redo_offset;
                error = VOP_SETATTR(vp, &va, proc0.p_ucred);
                if (error) {
                        kprintf("setattr offset %016jx error %d\n",
                                (intmax_t)redo->redo_offset, error);
                }
                break;
        }
        vput(vp);
done1:
        hammer_rel_inode(ip, 0);
done2:
        hammer_done_transaction(&trans);
}

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset.
 *
 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
        if (rt1->redo_objid < rt2->redo_objid)
                return(-1);
        if (rt1->redo_objid > rt2->redo_objid)
                return(1);
        if (rt1->redo_localization < rt2->redo_localization)
                return(-1);
        if (rt1->redo_localization > rt2->redo_localization)
                return(1);
        if (rt1->redo_flags < rt2->redo_flags)
                return(-1);
        if (rt1->redo_flags > rt2->redo_flags)
                return(1);
        if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                if (rt1->redo_offset < rt2->redo_offset)
                        return(-1);
                if (rt1->redo_offset > rt2->redo_offset)
                        return(1);
        }
        return(0);
}

#if 0

static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
        int i;

        for (i = 0; i < bytes; ++i) {
                if (i && (i & 15) == 0)
                        kprintf("\n%*.*s", w, w, "");
                kprintf(" %02x", (unsigned char)buf[i]);
        }
        kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written and root_volume can also be passed as NULL in that case.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
                             int final)
{
        /*
         * Flush the buffers out asynchronously, wait for all the I/O to
         * complete, then do it again to destroy the buffer cache buffer
         * so it doesn't alias something later on.
         */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
        hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);

        /*
         * Flush all volume headers except the root volume.  If final < 0
         * we discard all volume headers including the root volume.
         */
        if (final >= 0) {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, root_volume);
        } else {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, NULL);
        }

        /*
         * Finalize the root volume header.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
                hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
                hammer_io_wait_all(hmp, "hmrfly", 1);
        }
}

/*
 * Callback to flush volume headers.  If discarding, data will be NULL and
 * all volume headers (including the root volume) will be discarded.
 * Otherwise data is the root_volume and we flush all volume headers
 * EXCEPT the root_volume.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
{
        hammer_volume_t root_volume = data;

        if (volume->io.recovered && volume != root_volume) {
                volume->io.recovered = 0;
                if (root_volume != NULL) {
                        hammer_io_flush(&volume->io, 0);
                } else {
                        hammer_io_clear_error(&volume->io);
                        hammer_io_clear_modify(&volume->io, 1);
                }
                hammer_rel_volume(volume, 0);
        }
        return(0);
}

/*
 * Flush or discard recovered I/O buffers.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
{
        int final = *(int *)data;
        int flush;

        if (buffer->io.recovered) {
                buffer->io.recovered = 0;
                buffer->io.reclaim = 1;
                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                } else {
                        hammer_io_flush(&buffer->io, 0);
                }
                hammer_rel_buffer(buffer, 0);
        } else {
                flush = hammer_ref_interlock(&buffer->io.lock);
                if (flush)
                        ++hammer_count_refedbufs;

                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                }
                KKASSERT(hammer_oneref(&buffer->io.lock));
                buffer->io.reclaim = 1;
                hammer_rel_buffer(buffer, flush);
        }
        return(0);
}