Merge from vendor branch NETGRAPH:
[dragonfly.git] / sys / vfs / hammer / hammer_mirror.c
CommitLineData
dd94f1b1
MD
1/*
2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
c82af904 34 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.2 2008/06/26 04:06:23 dillon Exp $
dd94f1b1
MD
35 */
36/*
37 * HAMMER mirroring ioctls - serialize and deserialize modifications made
38 * to a filesystem.
39 */
40
41#include "hammer.h"
42
c82af904
MD
43static int hammer_mirror_check(hammer_cursor_t cursor,
44 struct hammer_ioc_mrecord *mrec);
45static int hammer_mirror_update(hammer_cursor_t cursor,
46 struct hammer_ioc_mrecord *mrec);
47static int hammer_mirror_write(hammer_cursor_t cursor,
48 struct hammer_ioc_mrecord *mrec,
49 char *udata);
50static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
51 hammer_btree_leaf_elm_t leaf);
52
53/*
54 * All B-Tree records within the specified key range which also conform
55 * to the transaction id range are returned. Mirroring code keeps track
56 * of the last transaction id fully scanned and can efficiently pick up
57 * where it left off if interrupted.
58 */
dd94f1b1
MD
59int
60hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
61 struct hammer_ioc_mirror_rw *mirror)
62{
63 struct hammer_cursor cursor;
c82af904
MD
64 struct hammer_ioc_mrecord mrec;
65 hammer_btree_leaf_elm_t elm;
66 const int head_size = HAMMER_MREC_HEADSIZE;
67 const int crc_start = HAMMER_MREC_CRCOFF;
68 char *uptr;
dd94f1b1 69 int error;
c82af904
MD
70 int data_len;
71 int bytes;
dd94f1b1
MD
72
73 if ((mirror->key_beg.localization | mirror->key_end.localization) &
74 HAMMER_LOCALIZE_PSEUDOFS_MASK) {
75 return(EINVAL);
76 }
77 if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
78 return(EINVAL);
79
80 mirror->key_cur = mirror->key_beg;
81 mirror->key_cur.localization += ip->obj_localization;
c82af904 82 bzero(&mrec, sizeof(mrec));
dd94f1b1
MD
83
84retry:
85 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
86 if (error) {
87 hammer_done_cursor(&cursor);
88 goto failed;
89 }
90 cursor.key_beg = mirror->key_cur;
91 cursor.key_end = mirror->key_end;
92 cursor.key_end.localization += ip->obj_localization;
93
94 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
95 cursor.flags |= HAMMER_CURSOR_BACKEND;
96
97 /*
c82af904
MD
98 * This flag filters the search to only return elements whos create
99 * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid
100 * field stored with internal and leaf nodes to shortcut the scan.
dd94f1b1 101 */
c82af904
MD
102 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
103 cursor.mirror_tid = mirror->tid_beg;
dd94f1b1
MD
104
105 error = hammer_btree_first(&cursor);
106 while (error == 0) {
107 /*
c82af904
MD
108 * Leaf node. Only return elements modified in the range
109 * requested by userland.
dd94f1b1 110 */
c82af904
MD
111 KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
112 elm = &cursor.node->ondisk->elms[cursor.index].leaf;
113
114 if (elm->base.create_tid < mirror->tid_beg ||
115 elm->base.create_tid >= mirror->tid_end) {
116 if (elm->base.delete_tid < mirror->tid_beg ||
117 elm->base.delete_tid >= mirror->tid_end) {
118 goto skip;
119 }
120 }
121
122 mirror->key_cur = elm->base;
dd94f1b1
MD
123
124 /*
125 * Yield to more important tasks
126 */
127 if ((error = hammer_signal_check(trans->hmp)) != 0)
128 break;
129 if (trans->hmp->sync_lock.wanted) {
130 tsleep(trans, 0, "hmrslo", hz / 10);
131 }
132 if (trans->hmp->locked_dirty_count +
133 trans->hmp->io_running_count > hammer_limit_dirtybufs) {
134 hammer_flusher_async(trans->hmp);
135 tsleep(trans, 0, "hmrslo", hz / 10);
136 }
137
dd94f1b1 138 /*
c82af904
MD
139 * The core code exports the data to userland.
140 */
141 data_len = (elm->data_offset) ? elm->data_len : 0;
142 if (data_len) {
143 error = hammer_btree_extract(&cursor,
144 HAMMER_CURSOR_GET_DATA);
145 if (error)
146 break;
147 }
148 bytes = offsetof(struct hammer_ioc_mrecord, data[data_len]);
149 bytes = (bytes + HAMMER_HEAD_ALIGN_MASK) &
150 ~HAMMER_HEAD_ALIGN_MASK;
151 if (mirror->count + bytes > mirror->size)
152 break;
153
154 /*
155 * Construct the record for userland and copyout.
dd94f1b1 156 *
c82af904
MD
157 * The user is asking for a snapshot, if the record was
158 * deleted beyond the user-requested ending tid, the record
159 * is not considered deleted from the point of view of
160 * userland and delete_tid is cleared.
dd94f1b1 161 */
c82af904
MD
162 mrec.signature = HAMMER_IOC_MIRROR_SIGNATURE;
163 mrec.rec_size = bytes;
164 mrec.leaf = *elm;
165 if (elm->base.delete_tid >= mirror->tid_end)
166 mrec.leaf.base.delete_tid = 0;
167 mrec.rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
168 uptr = (char *)mirror->ubuf + mirror->count;
169 error = copyout(&mrec, uptr, head_size);
170 if (data_len && error == 0) {
171 error = copyout(cursor.data, uptr + head_size,
172 data_len);
173 }
174 if (error == 0)
175 mirror->count += bytes;
176skip:
dd94f1b1
MD
177 if (error == 0) {
178 cursor.flags |= HAMMER_CURSOR_ATEDISK;
179 error = hammer_btree_iterate(&cursor);
180 }
181 }
c82af904
MD
182 if (error == ENOENT) {
183 mirror->key_cur = mirror->key_end;
dd94f1b1 184 error = 0;
c82af904 185 }
dd94f1b1
MD
186 hammer_done_cursor(&cursor);
187 if (error == EDEADLK)
188 goto retry;
189 if (error == EINTR) {
c82af904 190 mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
dd94f1b1
MD
191 error = 0;
192 }
193failed:
194 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
195 return(error);
196}
197
c82af904
MD
198/*
199 * Copy records from userland to the target mirror. Records which already
200 * exist may only have their delete_tid updated.
201 */
202int
203hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
204 struct hammer_ioc_mirror_rw *mirror)
205{
206 struct hammer_cursor cursor;
207 struct hammer_ioc_mrecord mrec;
208 const int head_size = HAMMER_MREC_HEADSIZE;
209 const int crc_start = HAMMER_MREC_CRCOFF;
210 u_int32_t rec_crc;
211 int error;
212 char *uptr;
213
214 if (mirror->size < 0 || mirror->size > 0x70000000)
215 return(EINVAL);
216
217 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
218retry:
219 hammer_normalize_cursor(&cursor);
220
221 while (error == 0 && mirror->count + head_size <= mirror->size) {
222 /*
223 * Acquire and validate header
224 */
225 uptr = (char *)mirror->ubuf + mirror->count;
226 error = copyin(uptr, &mrec, head_size);
227 if (error)
228 break;
229 rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
230 if (mrec.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
231 error = EINVAL;
232 break;
233 }
234 if (rec_crc != mrec.rec_crc) {
235 error = EINVAL;
236 break;
237 }
238 if (mrec.rec_size < head_size ||
239 mrec.rec_size > head_size + HAMMER_XBUFSIZE + 16 ||
240 mirror->count + mrec.rec_size > mirror->size) {
241 error = EINVAL;
242 break;
243 }
244 if (mrec.leaf.data_len < 0 ||
245 mrec.leaf.data_len > HAMMER_XBUFSIZE ||
246 offsetof(struct hammer_ioc_mrecord, data[mrec.leaf.data_len]) > mrec.rec_size) {
247 error = EINVAL;
248 }
249
250 /*
251 * Re-localize for target. relocalization of data is handled
252 * by hammer_mirror_write().
253 */
254 mrec.leaf.base.localization &= HAMMER_LOCALIZE_MASK;
255 mrec.leaf.base.localization += ip->obj_localization;
256
257 /*
258 * Locate the record.
259 *
260 * If the record exists only the delete_tid may be updated.
261 *
262 * If the record does not exist we create it. For now we
263 * ignore records with a non-zero delete_tid. Note that
264 * mirror operations are effective an as-of operation and
265 * delete_tid can be 0 for mirroring purposes even if it is
266 * not actually 0 at the originator.
267 */
268 hammer_normalize_cursor(&cursor);
269 cursor.key_beg = mrec.leaf.base;
270 cursor.flags |= HAMMER_CURSOR_BACKEND;
271 cursor.flags &= ~HAMMER_CURSOR_INSERT;
272 error = hammer_btree_lookup(&cursor);
273
274 if (error == 0 && hammer_mirror_check(&cursor, &mrec)) {
275 hammer_sync_lock_sh(trans);
276 error = hammer_mirror_update(&cursor, &mrec);
277 hammer_sync_unlock(trans);
278 } else if (error == ENOENT && mrec.leaf.base.delete_tid == 0) {
279 hammer_sync_lock_sh(trans);
280 error = hammer_mirror_write(&cursor, &mrec,
281 uptr + head_size);
282 hammer_sync_unlock(trans);
283 }
284
285 /*
286 * Setup for loop
287 */
288 if (error == EDEADLK) {
289 hammer_done_cursor(&cursor);
290 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
291 goto retry;
292 }
293 if (error == 0) {
294 mirror->count += mrec.rec_size;
295 }
296 }
297 hammer_done_cursor(&cursor);
298 return(0);
299}
300
301/*
302 * Check whether an update is needed in the case where a match already
303 * exists on the target. The only type of update allowed in this case
304 * is an update of the delete_tid.
305 *
306 * Return non-zero if the update should proceed.
307 */
308static
309int
310hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
311{
312 hammer_btree_leaf_elm_t leaf = cursor->leaf;
313
314 if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
315 if (leaf->base.delete_tid != 0)
316 return(1);
317 }
318 return(0);
319}
320
321/*
322 * Update a record in-place. Only the delete_tid can change.
323 */
324static
325int
326hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
327{
328 hammer_btree_leaf_elm_t elm;
329
330 elm = cursor->leaf;
331 KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid);
332 hammer_modify_node(cursor->trans, cursor->node, elm, sizeof(*elm));
333 elm->base.delete_tid = mrec->leaf.base.delete_tid;
334 elm->delete_ts = mrec->leaf.delete_ts;
335 hammer_modify_node_done(cursor->node);
336 return(0);
337}
338
339/*
340 * Write out a new record.
341 *
342 * XXX this is messy.
343 */
344static
345int
346hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
347 char *udata)
348{
349 hammer_buffer_t data_buffer = NULL;
350 hammer_off_t ndata_offset;
351 void *ndata;
352 int error;
353 int wanted_skip = 0;
354
355 if (mrec->leaf.data_len && mrec->leaf.data_offset) {
356 ndata = hammer_alloc_data(cursor->trans, mrec->leaf.data_len,
357 mrec->leaf.base.rec_type,
358 &ndata_offset, &data_buffer, &error);
359 if (ndata == NULL)
360 return(error);
361 mrec->leaf.data_offset = ndata_offset;
362 hammer_modify_buffer(cursor->trans, data_buffer, NULL, 0);
363 error = copyin(udata, ndata, mrec->leaf.data_len);
364 if (error == 0) {
365 if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
366 kprintf("data crc mismatch on pipe\n");
367 error = EINVAL;
368 } else {
369 error = hammer_mirror_localize_data(
370 ndata, &mrec->leaf);
371 if (error)
372 wanted_skip = 1;
373 }
374 }
375 hammer_modify_buffer_done(data_buffer);
376 } else {
377 mrec->leaf.data_offset = 0;
378 error = 0;
379 ndata = NULL;
380 }
381 if (error)
382 goto failed;
383 cursor->flags |= HAMMER_CURSOR_INSERT;
384 error = hammer_btree_lookup(cursor);
385 if (error != ENOENT) {
386 if (error == 0)
387 error = EALREADY;
388 goto failed;
389 }
390 error = 0;
391
392 /*
393 * Physical insertion
394 */
395 error = hammer_btree_insert(cursor, &mrec->leaf);
396
397failed:
398 /*
399 * Cleanup
400 */
401 if (error && mrec->leaf.data_offset) {
402 hammer_blockmap_free(cursor->trans,
403 mrec->leaf.data_offset,
404 mrec->leaf.data_len);
405 }
406 if (data_buffer)
407 hammer_rel_buffer(data_buffer, 0);
408 if (wanted_skip)
409 error = 0;
410 return(error);
411}
412
413/*
414 * Localize the data payload. Directory entries may need their
415 * localization adjusted.
416 *
417 * Pseudo-fs directory entries must be skipped entirely (EBADF).
418 *
419 * The root inode must be skipped, it will exist on the target with a
420 * different create_tid so updating it would result in a duplicate. This
421 * also prevents inode updates on the root directory (aka mtime, ctime, etc)
422 * from mirroring, which is ok.
423 *
424 * XXX Root directory inode updates - parent_obj_localization is broken.
425 */
426static
427int
428hammer_mirror_localize_data(hammer_data_ondisk_t data,
429 hammer_btree_leaf_elm_t leaf)
430{
431 int modified = 0;
432 int error = 0;
433 u_int32_t localization;
434
435 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
436 localization = leaf->base.localization &
437 HAMMER_LOCALIZE_PSEUDOFS_MASK;
438 if (data->entry.localization != localization) {
439 data->entry.localization = localization;
440 modified = 1;
441 }
442 if (data->entry.obj_id == 1)
443 error = EBADF;
444 }
445 if (leaf->base.rec_type == HAMMER_RECTYPE_INODE) {
446 if (leaf->base.obj_id == HAMMER_OBJID_ROOT)
447 error = EBADF;
448 }
449 if (modified)
450 hammer_crc_set_leaf(data, leaf);
451 return(error);
452}
453