Commit | Line | Data |
---|---|---|
dd94f1b1 MD |
1 | /* |
2 | * Copyright (c) 2008 The DragonFly Project. All rights reserved. | |
745703c7 | 3 | * |
dd94f1b1 MD |
4 | * This code is derived from software contributed to The DragonFly Project |
5 | * by Matthew Dillon <dillon@backplane.com> | |
745703c7 | 6 | * |
dd94f1b1 MD |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
745703c7 | 10 | * |
dd94f1b1 MD |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
745703c7 | 20 | * |
dd94f1b1 MD |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
dd94f1b1 MD |
33 | */ |
34 | /* | |
35 | * HAMMER mirroring ioctls - serialize and deserialize modifications made | |
36 | * to a filesystem. | |
37 | */ | |
38 | ||
39 | #include "hammer.h" | |
40 | ||
c82af904 | 41 | static int hammer_mirror_check(hammer_cursor_t cursor, |
4c038e17 | 42 | struct hammer_ioc_mrecord_rec *mrec); |
c82af904 | 43 | static int hammer_mirror_update(hammer_cursor_t cursor, |
4c038e17 | 44 | struct hammer_ioc_mrecord_rec *mrec); |
4c038e17 MD |
45 | static int hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, |
46 | struct hammer_ioc_mrecord_rec *mrec, | |
47 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 48 | uint32_t localization, |
4c038e17 MD |
49 | char *uptr); |
50 | static int hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, | |
51 | struct hammer_ioc_mrecord_rec *mrec, | |
52 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 53 | uint32_t localization); |
4c038e17 MD |
54 | static int hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, |
55 | struct hammer_ioc_mrecord_skip *mrec, | |
56 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 57 | uint32_t localization); |
842e7a70 | 58 | static int hammer_mirror_delete_to(hammer_cursor_t cursor, |
4c038e17 | 59 | struct hammer_ioc_mirror_rw *mirror); |
5f532f10 | 60 | static int hammer_mirror_nomirror(hammer_base_elm_t base); |
c82af904 MD |
61 | |
62 | /* | |
63 | * All B-Tree records within the specified key range which also conform | |
64 | * to the transaction id range are returned. Mirroring code keeps track | |
65 | * of the last transaction id fully scanned and can efficiently pick up | |
66 | * where it left off if interrupted. | |
ea434b6f MD |
67 | * |
68 | * The PFS is identified in the mirror structure. The passed ip is just | |
69 | * some directory in the overall HAMMER filesystem and has nothing to | |
70 | * do with the PFS. | |
c82af904 | 71 | */ |
dd94f1b1 MD |
72 | int |
73 | hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip, | |
74 | struct hammer_ioc_mirror_rw *mirror) | |
75 | { | |
4c038e17 | 76 | struct hammer_cmirror cmirror; |
dd94f1b1 | 77 | struct hammer_cursor cursor; |
4c038e17 | 78 | union hammer_ioc_mrecord_any mrec; |
c82af904 | 79 | hammer_btree_leaf_elm_t elm; |
c82af904 | 80 | char *uptr; |
dd94f1b1 | 81 | int error; |
c82af904 MD |
82 | int data_len; |
83 | int bytes; | |
4c038e17 | 84 | int eatdisk; |
4c286c36 | 85 | int mrec_flags; |
46137e17 | 86 | uint32_t localization; |
17b150c6 | 87 | hammer_crc_t rec_crc; |
ea434b6f | 88 | |
20cf2291 | 89 | localization = pfs_to_lo(mirror->pfs_id); |
dd94f1b1 MD |
90 | |
91 | if ((mirror->key_beg.localization | mirror->key_end.localization) & | |
92 | HAMMER_LOCALIZE_PSEUDOFS_MASK) { | |
93 | return(EINVAL); | |
94 | } | |
95 | if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0) | |
96 | return(EINVAL); | |
97 | ||
98 | mirror->key_cur = mirror->key_beg; | |
4c038e17 | 99 | mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 100 | mirror->key_cur.localization |= localization; |
c82af904 | 101 | bzero(&mrec, sizeof(mrec)); |
4c038e17 | 102 | bzero(&cmirror, sizeof(cmirror)); |
dd94f1b1 | 103 | |
4c286c36 MD |
104 | /* |
105 | * Make CRC errors non-fatal (at least on data), causing an EDOM | |
106 | * error instead of EIO. | |
107 | */ | |
108 | trans->flags |= HAMMER_TRANSF_CRCDOM; | |
109 | ||
dd94f1b1 MD |
110 | retry: |
111 | error = hammer_init_cursor(trans, &cursor, NULL, NULL); | |
112 | if (error) { | |
113 | hammer_done_cursor(&cursor); | |
114 | goto failed; | |
115 | } | |
116 | cursor.key_beg = mirror->key_cur; | |
117 | cursor.key_end = mirror->key_end; | |
4c038e17 | 118 | cursor.key_end.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 119 | cursor.key_end.localization |= localization; |
dd94f1b1 MD |
120 | |
121 | cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; | |
122 | cursor.flags |= HAMMER_CURSOR_BACKEND; | |
123 | ||
124 | /* | |
c82af904 MD |
125 | * This flag filters the search to only return elements whos create |
126 | * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid | |
127 | * field stored with internal and leaf nodes to shortcut the scan. | |
dd94f1b1 | 128 | */ |
c82af904 | 129 | cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED; |
4c038e17 MD |
130 | cursor.cmirror = &cmirror; |
131 | cmirror.mirror_tid = mirror->tid_beg; | |
dd94f1b1 MD |
132 | |
133 | error = hammer_btree_first(&cursor); | |
134 | while (error == 0) { | |
93291532 MD |
135 | /* |
136 | * Yield to more important tasks | |
137 | */ | |
138 | if (error == 0) { | |
139 | error = hammer_signal_check(trans->hmp); | |
140 | if (error) | |
141 | break; | |
142 | } | |
143 | ||
dd94f1b1 | 144 | /* |
4c038e17 MD |
145 | * An internal node can be returned in mirror-filtered |
146 | * mode and indicates that the scan is returning a skip | |
147 | * range in the cursor->cmirror structure. | |
148 | */ | |
149 | uptr = (char *)mirror->ubuf + mirror->count; | |
150 | if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { | |
151 | /* | |
152 | * Check space | |
153 | */ | |
154 | mirror->key_cur = cmirror.skip_beg; | |
155 | bytes = sizeof(mrec.skip); | |
156 | if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > | |
157 | mirror->size) { | |
158 | break; | |
159 | } | |
160 | ||
161 | /* | |
162 | * Fill mrec | |
163 | */ | |
164 | mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; | |
165 | mrec.head.type = HAMMER_MREC_TYPE_SKIP; | |
166 | mrec.head.rec_size = bytes; | |
167 | mrec.skip.skip_beg = cmirror.skip_beg; | |
168 | mrec.skip.skip_end = cmirror.skip_end; | |
d8fe5ece | 169 | hammer_crc_set_mrec_head(&mrec.head, bytes); |
4c038e17 MD |
170 | error = copyout(&mrec, uptr, bytes); |
171 | eatdisk = 0; | |
172 | goto didwrite; | |
173 | } | |
174 | ||
175 | /* | |
176 | * Leaf node. In full-history mode we could filter out | |
177 | * elements modified outside the user-requested TID range. | |
178 | * | |
179 | * However, such elements must be returned so the writer | |
f96881ff | 180 | * can compare them against the target to determine what |
4c038e17 MD |
181 | * needs to be deleted on the target, particular for |
182 | * no-history mirrors. | |
dd94f1b1 | 183 | */ |
c82af904 MD |
184 | KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF); |
185 | elm = &cursor.node->ondisk->elms[cursor.index].leaf; | |
4c038e17 | 186 | mirror->key_cur = elm->base; |
c82af904 | 187 | |
3324b8cd MD |
188 | /* |
189 | * If the record was created after our end point we just | |
190 | * ignore it. | |
191 | */ | |
192 | if (elm->base.create_tid > mirror->tid_end) { | |
193 | error = 0; | |
194 | bytes = 0; | |
195 | eatdisk = 1; | |
196 | goto didwrite; | |
197 | } | |
198 | ||
e469566b MD |
199 | /* |
200 | * Determine if we should generate a PASS or a REC. PASS | |
201 | * records are records without any data payload. Such | |
202 | * records will be generated if the target is already expected | |
203 | * to have the record, allowing it to delete the gaps. | |
204 | * | |
205 | * A PASS record is also used to perform deletions on the | |
206 | * target. | |
207 | * | |
208 | * Such deletions are needed if the master or files on the | |
209 | * master are no-history, or if the slave is so far behind | |
210 | * the master has already been pruned. | |
211 | */ | |
3324b8cd | 212 | if (elm->base.create_tid < mirror->tid_beg) { |
4c038e17 MD |
213 | bytes = sizeof(mrec.rec); |
214 | if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > | |
215 | mirror->size) { | |
216 | break; | |
c82af904 | 217 | } |
c82af904 | 218 | |
4c038e17 | 219 | /* |
e469566b | 220 | * Fill mrec. |
4c038e17 MD |
221 | */ |
222 | mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; | |
223 | mrec.head.type = HAMMER_MREC_TYPE_PASS; | |
224 | mrec.head.rec_size = bytes; | |
225 | mrec.rec.leaf = *elm; | |
d8fe5ece | 226 | hammer_crc_set_mrec_head(&mrec.head, bytes); |
4c038e17 MD |
227 | error = copyout(&mrec, uptr, bytes); |
228 | eatdisk = 1; | |
229 | goto didwrite; | |
4c038e17 | 230 | } |
dd94f1b1 | 231 | |
dd94f1b1 | 232 | /* |
c82af904 | 233 | * The core code exports the data to userland. |
4c286c36 MD |
234 | * |
235 | * CRC errors on data are reported but passed through, | |
236 | * but the data must be washed by the user program. | |
54ee5a26 MD |
237 | * |
238 | * If userland just wants the btree records it can | |
239 | * request that bulk data not be returned. This is | |
240 | * use during mirror-stream histogram generation. | |
c82af904 | 241 | */ |
4c286c36 | 242 | mrec_flags = 0; |
c82af904 | 243 | data_len = (elm->data_offset) ? elm->data_len : 0; |
54ee5a26 MD |
244 | if (data_len && |
245 | (mirror->head.flags & HAMMER_IOC_MIRROR_NODATA)) { | |
246 | data_len = 0; | |
247 | mrec_flags |= HAMMER_MRECF_NODATA; | |
248 | } | |
c82af904 | 249 | if (data_len) { |
40962009 | 250 | error = hammer_btree_extract_data(&cursor); |
4c286c36 MD |
251 | if (error) { |
252 | if (error != EDOM) | |
253 | break; | |
254 | mrec_flags |= HAMMER_MRECF_CRC_ERROR | | |
255 | HAMMER_MRECF_DATA_CRC_BAD; | |
256 | } | |
c82af904 | 257 | } |
4c038e17 MD |
258 | |
259 | bytes = sizeof(mrec.rec) + data_len; | |
260 | if (mirror->count + HAMMER_HEAD_DOALIGN(bytes) > mirror->size) | |
c82af904 MD |
261 | break; |
262 | ||
263 | /* | |
264 | * Construct the record for userland and copyout. | |
dd94f1b1 | 265 | * |
c82af904 MD |
266 | * The user is asking for a snapshot, if the record was |
267 | * deleted beyond the user-requested ending tid, the record | |
268 | * is not considered deleted from the point of view of | |
269 | * userland and delete_tid is cleared. | |
dd94f1b1 | 270 | */ |
4c038e17 | 271 | mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE; |
4c286c36 | 272 | mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags; |
4c038e17 MD |
273 | mrec.head.rec_size = bytes; |
274 | mrec.rec.leaf = *elm; | |
4c286c36 | 275 | |
4889cbd4 | 276 | if (elm->base.delete_tid > mirror->tid_end) |
4c038e17 | 277 | mrec.rec.leaf.base.delete_tid = 0; |
d8fe5ece | 278 | rec_crc = hammer_crc_get_mrec_head(&mrec.head, sizeof(mrec.rec)); |
4c038e17 MD |
279 | if (data_len) |
280 | rec_crc = crc32_ext(cursor.data, data_len, rec_crc); | |
281 | mrec.head.rec_crc = rec_crc; | |
282 | error = copyout(&mrec, uptr, sizeof(mrec.rec)); | |
c82af904 | 283 | if (data_len && error == 0) { |
4c038e17 | 284 | error = copyout(cursor.data, uptr + sizeof(mrec.rec), |
c82af904 MD |
285 | data_len); |
286 | } | |
4c038e17 MD |
287 | eatdisk = 1; |
288 | ||
289 | /* | |
290 | * eatdisk controls whether we skip the current cursor | |
291 | * position on the next scan or not. If doing a SKIP | |
292 | * the cursor is already positioned properly for the next | |
293 | * scan and eatdisk will be 0. | |
294 | */ | |
295 | didwrite: | |
dd94f1b1 | 296 | if (error == 0) { |
4c038e17 MD |
297 | mirror->count += HAMMER_HEAD_DOALIGN(bytes); |
298 | if (eatdisk) | |
299 | cursor.flags |= HAMMER_CURSOR_ATEDISK; | |
300 | else | |
301 | cursor.flags &= ~HAMMER_CURSOR_ATEDISK; | |
dd94f1b1 MD |
302 | error = hammer_btree_iterate(&cursor); |
303 | } | |
304 | } | |
c82af904 MD |
305 | if (error == ENOENT) { |
306 | mirror->key_cur = mirror->key_end; | |
dd94f1b1 | 307 | error = 0; |
c82af904 | 308 | } |
dd94f1b1 MD |
309 | hammer_done_cursor(&cursor); |
310 | if (error == EDEADLK) | |
311 | goto retry; | |
312 | if (error == EINTR) { | |
c82af904 | 313 | mirror->head.flags |= HAMMER_IOC_HEAD_INTR; |
dd94f1b1 MD |
314 | error = 0; |
315 | } | |
316 | failed: | |
317 | mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; | |
318 | return(error); | |
319 | } | |
320 | ||
c82af904 | 321 | /* |
4c038e17 | 322 | * Copy records from userland to the target mirror. |
602c6cb8 | 323 | * |
ea434b6f MD |
324 | * The PFS is identified in the mirror structure. The passed ip is just |
325 | * some directory in the overall HAMMER filesystem and has nothing to | |
326 | * do with the PFS. In fact, there might not even be a root directory for | |
327 | * the PFS yet! | |
c82af904 MD |
328 | */ |
329 | int | |
330 | hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip, | |
331 | struct hammer_ioc_mirror_rw *mirror) | |
332 | { | |
4c038e17 | 333 | union hammer_ioc_mrecord_any mrec; |
c82af904 | 334 | struct hammer_cursor cursor; |
46137e17 | 335 | uint32_t localization; |
93291532 | 336 | int checkspace_count = 0; |
c82af904 | 337 | int error; |
4c038e17 | 338 | int bytes; |
c82af904 | 339 | char *uptr; |
93291532 | 340 | int seq; |
ea434b6f | 341 | |
20cf2291 | 342 | localization = pfs_to_lo(mirror->pfs_id); |
e86903d8 | 343 | seq = trans->hmp->flusher.done; |
c82af904 | 344 | |
4c038e17 MD |
345 | /* |
346 | * Validate the mirror structure and relocalize the tracking keys. | |
347 | */ | |
c82af904 MD |
348 | if (mirror->size < 0 || mirror->size > 0x70000000) |
349 | return(EINVAL); | |
4c038e17 | 350 | mirror->key_beg.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 351 | mirror->key_beg.localization |= localization; |
4c038e17 | 352 | mirror->key_end.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 353 | mirror->key_end.localization |= localization; |
4c038e17 | 354 | mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 355 | mirror->key_cur.localization |= localization; |
c82af904 | 356 | |
4c038e17 MD |
357 | /* |
358 | * Set up our tracking cursor for the loop. The tracking cursor | |
359 | * is used to delete records that are no longer present on the | |
360 | * master. The last handled record at key_cur must be skipped. | |
361 | */ | |
c82af904 | 362 | error = hammer_init_cursor(trans, &cursor, NULL, NULL); |
c82af904 | 363 | |
4c038e17 MD |
364 | cursor.key_beg = mirror->key_cur; |
365 | cursor.key_end = mirror->key_end; | |
366 | cursor.flags |= HAMMER_CURSOR_BACKEND; | |
367 | error = hammer_btree_first(&cursor); | |
368 | if (error == 0) | |
369 | cursor.flags |= HAMMER_CURSOR_ATEDISK; | |
370 | if (error == ENOENT) | |
371 | error = 0; | |
372 | ||
373 | /* | |
374 | * Loop until our input buffer has been exhausted. | |
375 | */ | |
376 | while (error == 0 && | |
93291532 MD |
377 | mirror->count + sizeof(mrec.head) <= mirror->size) { |
378 | ||
379 | /* | |
380 | * Don't blow out the buffer cache. Leave room for frontend | |
381 | * cache as well. | |
c9ce54d6 MD |
382 | * |
383 | * WARNING: See warnings in hammer_unlock_cursor() function. | |
93291532 | 384 | */ |
15e75dab MD |
385 | while (hammer_flusher_meta_halflimit(trans->hmp) || |
386 | hammer_flusher_undo_exhausted(trans, 2)) { | |
982be4bf | 387 | hammer_unlock_cursor(&cursor); |
93291532 | 388 | hammer_flusher_wait(trans->hmp, seq); |
982be4bf | 389 | hammer_lock_cursor(&cursor); |
15e75dab | 390 | seq = hammer_flusher_async_one(trans->hmp); |
93291532 MD |
391 | } |
392 | ||
393 | /* | |
394 | * If there is insufficient free space it may be due to | |
a981af19 | 395 | * reserved big-blocks, which flushing might fix. |
93291532 MD |
396 | */ |
397 | if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_MIRROR)) { | |
398 | if (++checkspace_count == 10) { | |
399 | error = ENOSPC; | |
400 | break; | |
401 | } | |
982be4bf | 402 | hammer_unlock_cursor(&cursor); |
93291532 | 403 | hammer_flusher_wait(trans->hmp, seq); |
982be4bf | 404 | hammer_lock_cursor(&cursor); |
7a61b85d | 405 | seq = hammer_flusher_async(trans->hmp, NULL); |
93291532 MD |
406 | } |
407 | ||
4c038e17 | 408 | |
c82af904 MD |
409 | /* |
410 | * Acquire and validate header | |
411 | */ | |
4c038e17 MD |
412 | if ((bytes = mirror->size - mirror->count) > sizeof(mrec)) |
413 | bytes = sizeof(mrec); | |
c82af904 | 414 | uptr = (char *)mirror->ubuf + mirror->count; |
4c038e17 | 415 | error = copyin(uptr, &mrec, bytes); |
c82af904 MD |
416 | if (error) |
417 | break; | |
4c038e17 | 418 | if (mrec.head.signature != HAMMER_IOC_MIRROR_SIGNATURE) { |
c82af904 MD |
419 | error = EINVAL; |
420 | break; | |
421 | } | |
4c038e17 MD |
422 | if (mrec.head.rec_size < sizeof(mrec.head) || |
423 | mrec.head.rec_size > sizeof(mrec) + HAMMER_XBUFSIZE || | |
424 | mirror->count + mrec.head.rec_size > mirror->size) { | |
5fa5c92f MD |
425 | error = EINVAL; |
426 | break; | |
427 | } | |
4c038e17 | 428 | |
4c286c36 | 429 | switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) { |
4c038e17 MD |
430 | case HAMMER_MREC_TYPE_SKIP: |
431 | if (mrec.head.rec_size != sizeof(mrec.skip)) | |
432 | error = EINVAL; | |
433 | if (error == 0) | |
434 | error = hammer_ioc_mirror_write_skip(&cursor, &mrec.skip, mirror, localization); | |
c82af904 | 435 | break; |
4c038e17 MD |
436 | case HAMMER_MREC_TYPE_REC: |
437 | if (mrec.head.rec_size < sizeof(mrec.rec)) | |
438 | error = EINVAL; | |
439 | if (error == 0) | |
440 | error = hammer_ioc_mirror_write_rec(&cursor, &mrec.rec, mirror, localization, uptr + sizeof(mrec.rec)); | |
c82af904 | 441 | break; |
54ee5a26 | 442 | case HAMMER_MREC_TYPE_REC_NODATA: |
4c286c36 MD |
443 | case HAMMER_MREC_TYPE_REC_BADCRC: |
444 | /* | |
445 | * Records with bad data payloads are ignored XXX. | |
54ee5a26 MD |
446 | * Records with no data payload have to be skipped |
447 | * (they shouldn't have been written in the first | |
448 | * place). | |
4c286c36 MD |
449 | */ |
450 | if (mrec.head.rec_size < sizeof(mrec.rec)) | |
451 | error = EINVAL; | |
452 | break; | |
4c038e17 MD |
453 | case HAMMER_MREC_TYPE_PASS: |
454 | if (mrec.head.rec_size != sizeof(mrec.rec)) | |
455 | error = EINVAL; | |
456 | if (error == 0) | |
457 | error = hammer_ioc_mirror_write_pass(&cursor, &mrec.rec, mirror, localization); | |
458 | break; | |
459 | default: | |
c82af904 | 460 | error = EINVAL; |
4c038e17 | 461 | break; |
c82af904 MD |
462 | } |
463 | ||
464 | /* | |
4c038e17 MD |
465 | * Retry the current record on deadlock, otherwise setup |
466 | * for the next loop. | |
c82af904 | 467 | */ |
4c038e17 MD |
468 | if (error == EDEADLK) { |
469 | while (error == EDEADLK) { | |
f3a4893b | 470 | hammer_sync_lock_sh(trans); |
4c038e17 MD |
471 | hammer_recover_cursor(&cursor); |
472 | error = hammer_cursor_upgrade(&cursor); | |
f3a4893b | 473 | hammer_sync_unlock(trans); |
4c038e17 MD |
474 | } |
475 | } else { | |
476 | if (error == EALREADY) | |
477 | error = 0; | |
478 | if (error == 0) { | |
745703c7 | 479 | mirror->count += |
4c038e17 MD |
480 | HAMMER_HEAD_DOALIGN(mrec.head.rec_size); |
481 | } | |
482 | } | |
483 | } | |
484 | hammer_done_cursor(&cursor); | |
c82af904 | 485 | |
4c038e17 | 486 | /* |
745703c7 | 487 | * cumulative error |
4c038e17 MD |
488 | */ |
489 | if (error) { | |
490 | mirror->head.flags |= HAMMER_IOC_HEAD_ERROR; | |
491 | mirror->head.error = error; | |
492 | } | |
493 | ||
494 | /* | |
495 | * ioctls don't update the RW data structure if an error is returned, | |
496 | * always return 0. | |
497 | */ | |
498 | return(0); | |
499 | } | |
500 | ||
501 | /* | |
502 | * Handle skip records. | |
503 | * | |
504 | * We must iterate from the last resolved record position at mirror->key_cur | |
3324b8cd | 505 | * to skip_beg non-inclusive and delete any records encountered. |
4c038e17 MD |
506 | * |
507 | * mirror->key_cur must be carefully set when we succeed in processing | |
508 | * this mrec. | |
509 | */ | |
510 | static int | |
511 | hammer_ioc_mirror_write_skip(hammer_cursor_t cursor, | |
512 | struct hammer_ioc_mrecord_skip *mrec, | |
513 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 514 | uint32_t localization) |
4c038e17 MD |
515 | { |
516 | int error; | |
517 | ||
518 | /* | |
519 | * Relocalize the skip range | |
520 | */ | |
521 | mrec->skip_beg.localization &= HAMMER_LOCALIZE_MASK; | |
7e52af60 | 522 | mrec->skip_beg.localization |= localization; |
4c038e17 | 523 | mrec->skip_end.localization &= HAMMER_LOCALIZE_MASK; |
7e52af60 | 524 | mrec->skip_end.localization |= localization; |
4c038e17 MD |
525 | |
526 | /* | |
527 | * Iterate from current position to skip_beg, deleting any records | |
3324b8cd MD |
528 | * we encounter. The record at skip_beg is not included (it is |
529 | * skipped). | |
4c038e17 MD |
530 | */ |
531 | cursor->key_end = mrec->skip_beg; | |
3324b8cd | 532 | cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; |
4c038e17 | 533 | cursor->flags |= HAMMER_CURSOR_BACKEND; |
842e7a70 | 534 | error = hammer_mirror_delete_to(cursor, mirror); |
4c038e17 MD |
535 | |
536 | /* | |
537 | * Now skip past the skip (which is the whole point point of | |
538 | * having a skip record). The sender has not sent us any records | |
539 | * for the skip area so we wouldn't know what to keep and what | |
540 | * to delete anyway. | |
541 | * | |
542 | * Clear ATEDISK because skip_end is non-inclusive, so we can't | |
543 | * count an exact match if we happened to get one. | |
544 | */ | |
545 | if (error == 0) { | |
546 | mirror->key_cur = mrec->skip_end; | |
547 | cursor->key_beg = mrec->skip_end; | |
548 | error = hammer_btree_lookup(cursor); | |
549 | cursor->flags &= ~HAMMER_CURSOR_ATEDISK; | |
550 | if (error == ENOENT) | |
551 | error = 0; | |
552 | } | |
553 | return(error); | |
554 | } | |
555 | ||
556 | /* | |
557 | * Handle B-Tree records. | |
558 | * | |
559 | * We must iterate to mrec->base.key (non-inclusively), and then process | |
560 | * the record. We are allowed to write a new record or delete an existing | |
561 | * record, but cannot replace an existing record. | |
562 | * | |
563 | * mirror->key_cur must be carefully set when we succeed in processing | |
564 | * this mrec. | |
565 | */ | |
566 | static int | |
567 | hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, | |
568 | struct hammer_ioc_mrecord_rec *mrec, | |
569 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 570 | uint32_t localization, |
4c038e17 MD |
571 | char *uptr) |
572 | { | |
4c038e17 MD |
573 | int error; |
574 | ||
745703c7 | 575 | if (mrec->leaf.data_len < 0 || |
4c038e17 MD |
576 | mrec->leaf.data_len > HAMMER_XBUFSIZE || |
577 | mrec->leaf.data_len + sizeof(*mrec) > mrec->head.rec_size) { | |
578 | return(EINVAL); | |
579 | } | |
580 | ||
581 | /* | |
582 | * Re-localize for target. relocalization of data is handled | |
913505ff | 583 | * by hammer_create_at_cursor(). |
4c038e17 MD |
584 | */ |
585 | mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK; | |
7e52af60 | 586 | mrec->leaf.base.localization |= localization; |
4c038e17 MD |
587 | |
588 | /* | |
589 | * Delete records through until we reach (non-inclusively) the | |
590 | * target record. | |
591 | */ | |
592 | cursor->key_end = mrec->leaf.base; | |
593 | cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; | |
594 | cursor->flags |= HAMMER_CURSOR_BACKEND; | |
842e7a70 | 595 | error = hammer_mirror_delete_to(cursor, mirror); |
4c038e17 | 596 | |
83f2a3aa MD |
597 | /* |
598 | * Certain records are not part of the mirroring operation | |
599 | */ | |
bbb01e14 | 600 | if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base)) |
83f2a3aa MD |
601 | return(0); |
602 | ||
4c038e17 MD |
603 | /* |
604 | * Locate the record. | |
605 | * | |
606 | * If the record exists only the delete_tid may be updated. | |
607 | * | |
e469566b MD |
608 | * If the record does not exist we can create it only if the |
609 | * create_tid is not too old. If the create_tid is too old | |
610 | * it may have already been destroyed on the slave from pruning. | |
611 | * | |
612 | * Note that mirror operations are effectively as-of operations | |
613 | * and delete_tid can be 0 for mirroring purposes even if it is | |
4c038e17 | 614 | * not actually 0 at the originator. |
98da6d8c MD |
615 | * |
616 | * These functions can return EDEADLK | |
4c038e17 | 617 | */ |
bbb01e14 MD |
618 | if (error == 0) { |
619 | cursor->key_beg = mrec->leaf.base; | |
620 | cursor->flags |= HAMMER_CURSOR_BACKEND; | |
621 | cursor->flags &= ~HAMMER_CURSOR_INSERT; | |
622 | error = hammer_btree_lookup(cursor); | |
623 | } | |
4c038e17 MD |
624 | |
625 | if (error == 0 && hammer_mirror_check(cursor, mrec)) { | |
4c038e17 | 626 | error = hammer_mirror_update(cursor, mrec); |
4c038e17 | 627 | } else if (error == ENOENT) { |
83f2a3aa MD |
628 | if (mrec->leaf.base.create_tid >= mirror->tid_beg) { |
629 | error = hammer_create_at_cursor( | |
630 | cursor, &mrec->leaf, | |
631 | uptr, HAMMER_CREATE_MODE_UMIRROR); | |
632 | } else { | |
e469566b | 633 | error = 0; |
83f2a3aa | 634 | } |
4c038e17 MD |
635 | } |
636 | if (error == 0 || error == EALREADY) | |
637 | mirror->key_cur = mrec->leaf.base; | |
638 | return(error); | |
639 | } | |
640 | ||
641 | /* | |
642 | * This works like write_rec but no write or update is necessary, | |
643 | * and no data payload is included so we couldn't do a write even | |
644 | * if we wanted to. | |
645 | * | |
646 | * We must still iterate for deletions, and we can validate the | |
647 | * record header which is a good way to test for corrupted mirror | |
648 | * targets XXX. | |
649 | * | |
650 | * mirror->key_cur must be carefully set when we succeed in processing | |
651 | * this mrec. | |
652 | */ | |
653 | static | |
654 | int | |
655 | hammer_ioc_mirror_write_pass(hammer_cursor_t cursor, | |
656 | struct hammer_ioc_mrecord_rec *mrec, | |
657 | struct hammer_ioc_mirror_rw *mirror, | |
46137e17 | 658 | uint32_t localization) |
4c038e17 | 659 | { |
4c038e17 MD |
660 | int error; |
661 | ||
4c038e17 MD |
662 | /* |
663 | * Re-localize for target. Relocalization of data is handled | |
913505ff | 664 | * by hammer_create_at_cursor(). |
4c038e17 MD |
665 | */ |
666 | mrec->leaf.base.localization &= HAMMER_LOCALIZE_MASK; | |
7e52af60 | 667 | mrec->leaf.base.localization |= localization; |
4c038e17 MD |
668 | |
669 | /* | |
670 | * Delete records through until we reach (non-inclusively) the | |
671 | * target record. | |
672 | */ | |
673 | cursor->key_end = mrec->leaf.base; | |
674 | cursor->flags &= ~HAMMER_CURSOR_END_INCLUSIVE; | |
675 | cursor->flags |= HAMMER_CURSOR_BACKEND; | |
842e7a70 | 676 | error = hammer_mirror_delete_to(cursor, mirror); |
4c038e17 | 677 | |
83f2a3aa MD |
678 | /* |
679 | * Certain records are not part of the mirroring operation | |
680 | */ | |
681 | if (hammer_mirror_nomirror(&mrec->leaf.base)) | |
682 | return(0); | |
683 | ||
4c038e17 | 684 | /* |
e469566b MD |
685 | * Locate the record and get past it by setting ATEDISK. Perform |
686 | * any necessary deletions. We have no data payload and cannot | |
687 | * create a new record. | |
4c038e17 MD |
688 | */ |
689 | if (error == 0) { | |
690 | mirror->key_cur = mrec->leaf.base; | |
691 | cursor->key_beg = mrec->leaf.base; | |
692 | cursor->flags |= HAMMER_CURSOR_BACKEND; | |
693 | cursor->flags &= ~HAMMER_CURSOR_INSERT; | |
694 | error = hammer_btree_lookup(cursor); | |
e469566b MD |
695 | if (error == 0) { |
696 | if (hammer_mirror_check(cursor, mrec)) | |
697 | error = hammer_mirror_update(cursor, mrec); | |
4c038e17 | 698 | cursor->flags |= HAMMER_CURSOR_ATEDISK; |
e469566b | 699 | } else { |
4c038e17 | 700 | cursor->flags &= ~HAMMER_CURSOR_ATEDISK; |
e469566b | 701 | } |
4c038e17 MD |
702 | if (error == ENOENT) |
703 | error = 0; | |
704 | } | |
705 | return(error); | |
706 | } | |
707 | ||
708 | /* | |
709 | * As part of the mirror write we iterate across swaths of records | |
710 | * on the target which no longer exist on the source, and mark them | |
711 | * deleted. | |
842e7a70 MD |
712 | * |
713 | * The caller has indexed the cursor and set up key_end. We iterate | |
714 | * through to key_end. | |
f96881ff MD |
715 | * |
716 | * There is an edge case where the master has deleted a record whos | |
717 | * create_tid exactly matches our end_tid. We cannot delete this | |
718 | * record on the slave yet because we cannot assign delete_tid == create_tid. | |
719 | * The deletion should be picked up on the next sequence since in order | |
720 | * to have been deleted on the master a transaction must have occured with | |
721 | * a TID greater then the create_tid of the record. | |
3324b8cd MD |
722 | * |
723 | * To support incremental re-mirroring, just for robustness, we do not | |
724 | * touch any records created beyond (or equal to) mirror->tid_end. | |
4c038e17 MD |
725 | */ |
726 | static | |
727 | int | |
842e7a70 MD |
728 | hammer_mirror_delete_to(hammer_cursor_t cursor, |
729 | struct hammer_ioc_mirror_rw *mirror) | |
4c038e17 | 730 | { |
842e7a70 | 731 | hammer_btree_leaf_elm_t elm; |
98da6d8c MD |
732 | int error; |
733 | ||
842e7a70 MD |
734 | error = hammer_btree_iterate(cursor); |
735 | while (error == 0) { | |
736 | elm = &cursor->node->ondisk->elms[cursor->index].leaf; | |
737 | KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD); | |
4889cbd4 | 738 | cursor->flags |= HAMMER_CURSOR_ATEDISK; |
3324b8cd | 739 | |
83f2a3aa MD |
740 | /* |
741 | * Certain records are not part of the mirroring operation | |
742 | */ | |
743 | if (hammer_mirror_nomirror(&elm->base)) { | |
744 | error = hammer_btree_iterate(cursor); | |
745 | continue; | |
746 | } | |
747 | ||
3324b8cd MD |
748 | /* |
749 | * Note: Must still delete records with create_tid < tid_beg, | |
750 | * as record may have been pruned-away on source. | |
751 | */ | |
f96881ff | 752 | if (elm->base.delete_tid == 0 && |
3324b8cd | 753 | elm->base.create_tid < mirror->tid_end) { |
842e7a70 MD |
754 | error = hammer_delete_at_cursor(cursor, |
755 | HAMMER_DELETE_ADJUST, | |
756 | mirror->tid_end, | |
757 | time_second, | |
758 | 1, NULL); | |
c82af904 | 759 | } |
842e7a70 MD |
760 | if (error == 0) |
761 | error = hammer_btree_iterate(cursor); | |
c82af904 | 762 | } |
842e7a70 MD |
763 | if (error == ENOENT) |
764 | error = 0; | |
765 | return(error); | |
c82af904 MD |
766 | } |
767 | ||
768 | /* | |
769 | * Check whether an update is needed in the case where a match already | |
770 | * exists on the target. The only type of update allowed in this case | |
771 | * is an update of the delete_tid. | |
772 | * | |
773 | * Return non-zero if the update should proceed. | |
774 | */ | |
775 | static | |
776 | int | |
4c038e17 | 777 | hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord_rec *mrec) |
c82af904 MD |
778 | { |
779 | hammer_btree_leaf_elm_t leaf = cursor->leaf; | |
780 | ||
781 | if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) { | |
ea434b6f | 782 | if (mrec->leaf.base.delete_tid != 0) |
c82af904 MD |
783 | return(1); |
784 | } | |
785 | return(0); | |
786 | } | |
787 | ||
83f2a3aa MD |
788 | /* |
789 | * Filter out records which are never mirrored, such as configuration space | |
790 | * records (for hammer cleanup). | |
791 | * | |
792 | * NOTE: We currently allow HAMMER_RECTYPE_SNAPSHOT records to be mirrored. | |
793 | */ | |
794 | static | |
795 | int | |
5f532f10 | 796 | hammer_mirror_nomirror(hammer_base_elm_t base) |
83f2a3aa MD |
797 | { |
798 | /* | |
799 | * Certain types of records are never updated when mirroring. | |
800 | * Slaves have their own configuration space. | |
801 | */ | |
802 | if (base->rec_type == HAMMER_RECTYPE_CONFIG) | |
803 | return(1); | |
804 | return(0); | |
805 | } | |
806 | ||
807 | ||
c82af904 | 808 | /* |
842e7a70 MD |
809 | * Update a record in-place. Only the delete_tid can change, and |
810 | * only from zero to non-zero. | |
c82af904 MD |
811 | */ |
812 | static | |
813 | int | |
4c038e17 MD |
814 | hammer_mirror_update(hammer_cursor_t cursor, |
815 | struct hammer_ioc_mrecord_rec *mrec) | |
c82af904 | 816 | { |
98da6d8c MD |
817 | int error; |
818 | ||
4c038e17 | 819 | /* |
842e7a70 | 820 | * This case shouldn't occur. |
4c038e17 | 821 | */ |
842e7a70 MD |
822 | if (mrec->leaf.base.delete_tid == 0) |
823 | return(0); | |
4c038e17 | 824 | |
adf01747 | 825 | /* |
842e7a70 | 826 | * Mark the record deleted on the mirror target. |
adf01747 | 827 | */ |
842e7a70 MD |
828 | error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST, |
829 | mrec->leaf.base.delete_tid, | |
830 | mrec->leaf.delete_ts, | |
831 | 1, NULL); | |
832 | cursor->flags |= HAMMER_CURSOR_ATEDISK; | |
833 | return(error); | |
c82af904 | 834 | } |