HAMMER VFS - Add HAMMER_OFF_BAD
[dragonfly.git] / sys / vfs / hammer / hammer_redo.c
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 /*
36  * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
37  *
38  * See also hammer_undo.c
39  */
40
41 #include "hammer.h"
42
/*
 * Instantiate the red-black tree support code for hammer_redo_rb_tree.
 *
 * Elements are hammer_inode structures linked through their rb_redonode
 * field and ordered by hammer_redo_rb_compare().  The trailing two
 * arguments name the key type (hammer_off_t) and the key field
 * (redo_fifo_start).  The tree root used by this file is
 * hmp->rb_redo_root; inodes are inserted and removed by
 * hammer_generate_redo() and hammer_redo_fifo_end_flush().
 *
 * NOTE(review): presumably the comparator sorts by redo_fifo_start so
 * that RB_FIRST() yields the inode with the earliest REDO record --
 * confirm against hammer_redo_rb_compare().
 */
43 RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
44              hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
45
46 /*
47  * HAMMER version 4+ REDO support.
48  *
49  * REDO records are used to improve fsync() performance.  Instead of having
50  * to go through a complete double-flush cycle involving at least two disk
51  * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
52  * the related REDO records, which is a single synchronization requiring
53  * no track seeking.  If a recovery becomes necessary the recovery code
54  * will generate logical data writes based on the REDO records encountered.
55  * That is, the recovery code will UNDO any partial meta-data/data writes
56  * at the raw disk block level and then REDO the data writes at the logical
57  * level.
58  */
59 int
60 hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
61                      hammer_off_t file_off, u_int32_t flags,
62                      void *base, int len)
63 {
64         hammer_mount_t hmp;
65         hammer_volume_t root_volume;
66         hammer_blockmap_t undomap;
67         hammer_buffer_t buffer = NULL;
68         hammer_fifo_redo_t redo;
69         hammer_fifo_tail_t tail;
70         hammer_off_t next_offset;
71         int error;
72         int bytes;
73         int n;
74
75         /*
76          * Setup
77          */
78         hmp = trans->hmp;
79
80         root_volume = trans->rootvol;
81         undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
82
83         /*
84          * No undo recursion when modifying the root volume
85          */
86         hammer_modify_volume(NULL, root_volume, NULL, 0);
87         hammer_lock_ex(&hmp->undo_lock);
88
89         /* undo had better not roll over (loose test) */
90         if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
91                 panic("hammer: insufficient undo FIFO space!");
92
93         /*
94          * Loop until the undo for the entire range has been laid down.
95          * Loop at least once (len might be 0 as a degenerate case).
96          */
97         for (;;) {
98                 /*
99                  * Fetch the layout offset in the UNDO FIFO, wrap it as
100                  * necessary.
101                  */
102                 if (undomap->next_offset == undomap->alloc_offset) {
103                         undomap->next_offset =
104                                 HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
105                 }
106                 next_offset = undomap->next_offset;
107
108                 /*
109                  * This is a tail-chasing FIFO, when we hit the start of a new
110                  * buffer we don't have to read it in.
111                  */
112                 if ((next_offset & HAMMER_BUFMASK) == 0) {
113                         redo = hammer_bnew(hmp, next_offset, &error, &buffer);
114                         hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
115                 } else {
116                         redo = hammer_bread(hmp, next_offset, &error, &buffer);
117                 }
118                 if (error)
119                         break;
120                 hammer_modify_buffer(NULL, buffer, NULL, 0);
121
122                 /*
123                  * Calculate how big a media structure fits up to the next
124                  * alignment point and how large a data payload we can
125                  * accomodate.
126                  *
127                  * If n calculates to 0 or negative there is no room for
128                  * anything but a PAD.
129                  */
130                 bytes = HAMMER_UNDO_ALIGN -
131                         ((int)next_offset & HAMMER_UNDO_MASK);
132                 n = bytes -
133                     (int)sizeof(struct hammer_fifo_redo) -
134                     (int)sizeof(struct hammer_fifo_tail);
135
136                 /*
137                  * If available space is insufficient for any payload
138                  * we have to lay down a PAD.
139                  *
140                  * The minimum PAD is 8 bytes and the head and tail will
141                  * overlap each other in that case.  PADs do not have
142                  * sequence numbers or CRCs.
143                  *
144                  * A PAD may not start on a boundary.  That is, every
145                  * 512-byte block in the UNDO/REDO FIFO must begin with
146                  * a record containing a sequence number.
147                  */
148                 if (n <= 0) {
149                         KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
150                         KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
151                         tail = (void *)((char *)redo + bytes - sizeof(*tail));
152                         if ((void *)redo != (void *)tail) {
153                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
154                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
155                                 tail->tail_size = bytes;
156                         }
157                         redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
158                         redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
159                         redo->head.hdr_size = bytes;
160                         /* NO CRC OR SEQ NO */
161                         undomap->next_offset += bytes;
162                         hammer_modify_buffer_done(buffer);
163                         hammer_stats_redo += bytes;
164                         continue;
165                 }
166
167                 /*
168                  * When generating an inode-related REDO record we track
169                  * the point in the UNDO/REDO FIFO containing the inode's
170                  * earliest REDO record.  See hammer_generate_redo_sync().
171                  *
172                  * redo_fifo_next is cleared when an inode is staged to
173                  * the backend and then used to determine how to reassign
174                  * redo_fifo_start after the inode flush completes.
175                  */
176                 if (ip) {
177                         redo->redo_objid = ip->obj_id;
178                         redo->redo_localization = ip->obj_localization;
179                         if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
180                                 ip->redo_fifo_start = next_offset;
181                                 if (RB_INSERT(hammer_redo_rb_tree,
182                                               &hmp->rb_redo_root, ip)) {
183                                         panic("hammer_generate_redo: "
184                                               "cannot insert inode %p on "
185                                               "redo FIFO", ip);
186                                 }
187                                 ip->flags |= HAMMER_INODE_RDIRTY;
188                         }
189                         if (ip->redo_fifo_next == 0)
190                                 ip->redo_fifo_next = next_offset;
191                 } else {
192                         redo->redo_objid = 0;
193                         redo->redo_localization = 0;
194                 }
195
196                 /*
197                  * Calculate the actual payload and recalculate the size
198                  * of the media structure as necessary.  If no data buffer
199                  * is supplied there is no payload.
200                  */
201                 if (base == NULL) {
202                         n = 0;
203                 } else if (n > len) {
204                         n = len;
205                 }
206                 bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
207                          ~HAMMER_HEAD_ALIGN_MASK) +
208                         (int)sizeof(struct hammer_fifo_redo) +
209                         (int)sizeof(struct hammer_fifo_tail);
210                 if (hammer_debug_general & 0x0080) {
211                         kprintf("redo %016llx %d %d\n",
212                                 (long long)next_offset, bytes, n);
213                 }
214
215                 redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
216                 redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
217                 redo->head.hdr_size = bytes;
218                 redo->head.hdr_seq = hmp->undo_seqno++;
219                 redo->head.hdr_crc = 0;
220                 redo->redo_mtime = trans->time;
221                 redo->redo_offset = file_off;
222                 redo->redo_flags = flags;
223
224                 /*
225                  * Incremental payload.  If no payload we throw the entire
226                  * len into redo_data_bytes and will not loop.
227                  */
228                 if (base) {
229                         redo->redo_data_bytes = n;
230                         bcopy(base, redo + 1, n);
231                         len -= n;
232                         base = (char *)base + n;
233                         file_off += n;
234                 } else {
235                         redo->redo_data_bytes = len;
236                         file_off += len;
237                         len = 0;
238                 }
239
240                 tail = (void *)((char *)redo + bytes - sizeof(*tail));
241                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
242                 tail->tail_type = HAMMER_HEAD_TYPE_REDO;
243                 tail->tail_size = bytes;
244
245                 KKASSERT(bytes >= sizeof(redo->head));
246                 redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
247                              crc32(&redo->head + 1, bytes - sizeof(redo->head));
248                 undomap->next_offset += bytes;
249                 hammer_stats_redo += bytes;
250
251                 /*
252                  * Before we finish off the buffer we have to deal with any
253                  * junk between the end of the media structure we just laid
254                  * down and the UNDO alignment boundary.  We do this by laying
255                  * down a dummy PAD.  Even though we will probably overwrite
256                  * it almost immediately we have to do this so recovery runs
257                  * can iterate the UNDO space without having to depend on
258                  * the indices in the volume header.
259                  *
260                  * This dummy PAD will be overwritten on the next undo so
261                  * we do not adjust undomap->next_offset.
262                  */
263                 bytes = HAMMER_UNDO_ALIGN -
264                         ((int)undomap->next_offset & HAMMER_UNDO_MASK);
265                 if (bytes != HAMMER_UNDO_ALIGN) {
266                         KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
267                         redo = (void *)(tail + 1);
268                         tail = (void *)((char *)redo + bytes - sizeof(*tail));
269                         if ((void *)redo != (void *)tail) {
270                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
271                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
272                                 tail->tail_size = bytes;
273                         }
274                         redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
275                         redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
276                         redo->head.hdr_size = bytes;
277                         /* NO CRC OR SEQ NO */
278                 }
279                 hammer_modify_buffer_done(buffer);
280                 if (len == 0)
281                         break;
282         }
283         hammer_modify_volume_done(root_volume);
284         hammer_unlock(&hmp->undo_lock);
285
286         if (buffer)
287                 hammer_rel_buffer(buffer, 0);
288
289         /*
290          * Make sure the nominal undo span contains at least one REDO_SYNC,
291          * otherwise the REDO recovery will not be triggered.
292          */
293         if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
294             flags != HAMMER_REDO_SYNC) {
295                 hammer_generate_redo_sync(trans);
296         }
297
298         return(error);
299 }
300
301 /*
302  * Generate a REDO SYNC record.  At least one such record must be generated
303  * in the nominal recovery span for the recovery code to be able to run
304  * REDOs outside of the span.
305  *
306  * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
307  * for all inodes with active REDOs.  This changes dynamically as inodes
308  * get flushed.
309  *
310  * During recovery stage2 any new flush cycles must specify the original
311  * redo sync offset.  That way a crash will re-run the REDOs, at least
312  * up to the point where the UNDO FIFO does not overwrite the area.
313  */
314 void
315 hammer_generate_redo_sync(hammer_transaction_t trans)
316 {
317         hammer_mount_t hmp = trans->hmp;
318         hammer_inode_t ip;
319         hammer_off_t redo_fifo_start;
320
321         if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
322                 ip = NULL;
323                 redo_fifo_start = hmp->recover_stage2_offset;
324         } else {
325                 ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
326                 if (ip)
327                         redo_fifo_start = ip->redo_fifo_start;
328                 else
329                         redo_fifo_start = 0;
330         }
331         if (redo_fifo_start) {
332                 if (hammer_debug_io & 0x0004) {
333                         kprintf("SYNC IP %p %016jx\n",
334                                 ip, (intmax_t)redo_fifo_start);
335                 }
336                 hammer_generate_redo(trans, NULL, redo_fifo_start,
337                                      HAMMER_REDO_SYNC, NULL, 0);
338                 trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
339         }
340 }
341
342 /*
343  * This is called when an inode is queued to the backend.
344  */
345 void
346 hammer_redo_fifo_start_flush(hammer_inode_t ip)
347 {
348         ip->redo_fifo_next = 0;
349 }
350
351 /*
352  * This is called when an inode backend flush is finished.  We have to make
353  * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
354  * can get destroyed through operations such as truncations and leave
355  * us with a stale redo_fifo_next.
356  */
357 void
358 hammer_redo_fifo_end_flush(hammer_inode_t ip)
359 {
360         hammer_mount_t hmp = ip->hmp;
361
362         if (ip->flags & HAMMER_INODE_RDIRTY) {
363                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
364                 ip->flags &= ~HAMMER_INODE_RDIRTY;
365         }
366         if ((ip->flags & HAMMER_INODE_BUFS) == 0)
367                 ip->redo_fifo_next = 0;
368         if (ip->redo_fifo_next) {
369                 ip->redo_fifo_start = ip->redo_fifo_next;
370                 if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
371                         panic("hammer_generate_redo: cannot reinsert "
372                               "inode %p on redo FIFO",
373                               ip);
374                 }
375                 ip->flags |= HAMMER_INODE_RDIRTY;
376         }
377 }