HAMMER VFS - Fix degenerate stall condition in flusher during unmount
[dragonfly.git] / sys / vfs / hammer / hammer_pfs.c
CommitLineData
842e7a70
MD
/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_pfs.c,v 1.5 2008/07/31 04:42:04 dillon Exp $
 */
/*
 * HAMMER PFS ioctls - Manage pseudo-fs configurations
 */
39
40#include "hammer.h"
41
42static int hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs,
43 hammer_inode_t ip);
44static int hammer_pfs_rollback(hammer_transaction_t trans,
45 hammer_pseudofs_inmem_t pfsm,
46 hammer_tid_t trunc_tid);
47static int hammer_pfs_delete_at_cursor(hammer_cursor_t cursor,
48 hammer_tid_t trunc_tid);
49
50/*
51 * Get mirroring/pseudo-fs information
52 *
53 * NOTE: The ip used for ioctl is not necessarily related to the PFS
54 */
55int
56hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
57 struct hammer_ioc_pseudofs_rw *pfs)
58{
59 hammer_pseudofs_inmem_t pfsm;
60 u_int32_t localization;
61 int error;
62
63 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
64 return(error);
65 localization = (u_int32_t)pfs->pfs_id << 16;
66 pfs->bytes = sizeof(struct hammer_pseudofs_data);
67 pfs->version = HAMMER_IOC_PSEUDOFS_VERSION;
68
69 pfsm = hammer_load_pseudofs(trans, localization, &error);
70 if (error) {
71 hammer_rel_pseudofs(trans->hmp, pfsm);
72 return(error);
73 }
74
75 /*
76 * If the PFS is a master the sync tid is set by normal operation
1ee600fb 77 * rather than the mirroring code, and will always track the
842e7a70 78 * real HAMMER filesystem.
4889cbd4
MD
79 *
80 * We use flush_tid1, which is the highest fully committed TID.
81 * flush_tid2 is the TID most recently flushed, but the UNDO hasn't
82 * caught up to it yet so a crash will roll us back to flush_tid1.
842e7a70 83 */
f437a2ab 84 if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0)
4889cbd4 85 pfsm->pfsd.sync_end_tid = trans->hmp->flush_tid1;
842e7a70
MD
86
87 /*
88 * Copy out to userland.
89 */
90 error = 0;
91 if (pfs->ondisk && error == 0)
92 error = copyout(&pfsm->pfsd, pfs->ondisk, sizeof(pfsm->pfsd));
93 hammer_rel_pseudofs(trans->hmp, pfsm);
94 return(error);
95}
96
97/*
98 * Set mirroring/pseudo-fs information
99 *
100 * NOTE: The ip used for ioctl is not necessarily related to the PFS
101 */
102int
103hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
104 struct ucred *cred, struct hammer_ioc_pseudofs_rw *pfs)
105{
106 hammer_pseudofs_inmem_t pfsm;
107 u_int32_t localization;
108 int error;
109
110 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
111 return(error);
112 localization = (u_int32_t)pfs->pfs_id << 16;
113 if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION)
114 error = EINVAL;
115 localization = (u_int32_t)pfs->pfs_id << 16;
116
117 if (error == 0 && pfs->ondisk) {
118 /*
119 * Load the PFS so we can modify our in-core copy. Ignore
120 * ENOENT errors.
121 */
122 pfsm = hammer_load_pseudofs(trans, localization, &error);
123 error = copyin(pfs->ondisk, &pfsm->pfsd, sizeof(pfsm->pfsd));
124
125 /*
126 * Save it back, create a root inode if we are in master
127 * mode and no root exists.
9a032a4a
MD
128 *
129 * We do not create root inodes for slaves, the root inode
130 * must be mirrored from the master.
842e7a70 131 */
9a032a4a
MD
132 if (error == 0 &&
133 (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0) {
842e7a70 134 error = hammer_mkroot_pseudofs(trans, cred, pfsm);
9a032a4a 135 }
842e7a70
MD
136 if (error == 0)
137 error = hammer_save_pseudofs(trans, pfsm);
4889cbd4
MD
138
139 /*
140 * Wakeup anyone waiting for a TID update for this PFS
141 */
142 wakeup(&pfsm->pfsd.sync_end_tid);
842e7a70
MD
143 hammer_rel_pseudofs(trans->hmp, pfsm);
144 }
145 return(error);
146}
147
148/*
149 * Upgrade a slave to a master
150 *
151 * This is fairly easy to do, but we must physically undo any partial syncs
152 * for transaction ids > sync_end_tid. Effective, we must do a partial
153 * rollback.
154 *
155 * NOTE: The ip used for ioctl is not necessarily related to the PFS
156 */
157int
158hammer_ioc_upgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
159 struct hammer_ioc_pseudofs_rw *pfs)
160{
161 hammer_pseudofs_inmem_t pfsm;
162 u_int32_t localization;
163 int error;
164
165 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
166 return(error);
167 localization = (u_int32_t)pfs->pfs_id << 16;
168 if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
169 return(error);
170
171 /*
172 * A master id must be set when upgrading
173 */
174 pfsm = hammer_load_pseudofs(trans, localization, &error);
842e7a70
MD
175 if (error == 0) {
176 if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) != 0) {
177 error = hammer_pfs_rollback(trans, pfsm,
178 pfsm->pfsd.sync_end_tid + 1);
179 if (error == 0) {
180 pfsm->pfsd.mirror_flags &= ~HAMMER_PFSD_SLAVE;
181 error = hammer_save_pseudofs(trans, pfsm);
182 }
183 }
184 }
185 hammer_rel_pseudofs(trans->hmp, pfsm);
186 if (error == EINTR) {
187 pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
188 error = 0;
189 }
190 return (error);
191}
192
193/*
194 * Downgrade a master to a slave
195 *
1ee600fb 196 * This is really easy to do, just set the SLAVE flag and update sync_end_tid.
842e7a70 197 *
1ee600fb
MD
198 * We previously did not update sync_end_tid in consideration for a slave
199 * upgraded to a master and then downgraded again, but this completely breaks
200 * the case where one starts with a master and then downgrades to a slave,
201 * then upgrades again.
842e7a70
MD
202 *
203 * NOTE: The ip used for ioctl is not necessarily related to the PFS
204 */
205int
206hammer_ioc_downgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
207 struct hammer_ioc_pseudofs_rw *pfs)
208{
1ee600fb 209 hammer_mount_t hmp = trans->hmp;
842e7a70
MD
210 hammer_pseudofs_inmem_t pfsm;
211 u_int32_t localization;
212 int error;
213
214 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
215 return(error);
216 localization = (u_int32_t)pfs->pfs_id << 16;
217 if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
218 return(error);
219
220 pfsm = hammer_load_pseudofs(trans, localization, &error);
221 if (error == 0) {
222 if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0) {
223 pfsm->pfsd.mirror_flags |= HAMMER_PFSD_SLAVE;
1ee600fb
MD
224 if (pfsm->pfsd.sync_end_tid < hmp->flush_tid1)
225 pfsm->pfsd.sync_end_tid = hmp->flush_tid1;
842e7a70
MD
226 error = hammer_save_pseudofs(trans, pfsm);
227 }
228 }
229 hammer_rel_pseudofs(trans->hmp, pfsm);
230 return (error);
231}
232
233/*
234 * Destroy a PFS
235 *
236 * We can destroy a PFS by scanning and deleting all of its records in the
237 * B-Tree. The hammer utility will delete the softlink in the primary
238 * filesystem.
239 *
240 * NOTE: The ip used for ioctl is not necessarily related to the PFS
241 */
242int
243hammer_ioc_destroy_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
244 struct hammer_ioc_pseudofs_rw *pfs)
245{
246 hammer_pseudofs_inmem_t pfsm;
247 u_int32_t localization;
248 int error;
249
250 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
251 return(error);
252 localization = (u_int32_t)pfs->pfs_id << 16;
253
254 if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
255 return(error);
256
257 pfsm = hammer_load_pseudofs(trans, localization, &error);
258 if (error == 0) {
259 error = hammer_pfs_rollback(trans, pfsm, 0);
260 if (error == 0) {
261 pfsm->pfsd.mirror_flags |= HAMMER_PFSD_DELETED;
262 error = hammer_save_pseudofs(trans, pfsm);
263 }
264 }
265 hammer_rel_pseudofs(trans->hmp, pfsm);
266 if (error == EINTR) {
267 pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
268 error = 0;
269 }
270 return(error);
271}
272
273/*
4889cbd4
MD
274 * Wait for the PFS to sync past the specified TID
275 */
276int
277hammer_ioc_wait_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
278 struct hammer_ioc_pseudofs_rw *pfs)
279{
280 hammer_pseudofs_inmem_t pfsm;
281 struct hammer_pseudofs_data pfsd;
282 u_int32_t localization;
283 hammer_tid_t tid;
284 void *waitp;
285 int error;
286
287 if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
288 return(error);
289 localization = (u_int32_t)pfs->pfs_id << 16;
290
291 if ((error = copyin(pfs->ondisk, &pfsd, sizeof(pfsd))) != 0)
292 return(error);
293
294 pfsm = hammer_load_pseudofs(trans, localization, &error);
295 if (error == 0) {
296 if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
297 tid = pfsm->pfsd.sync_end_tid;
298 waitp = &pfsm->pfsd.sync_end_tid;
299 } else {
300 tid = trans->hmp->flush_tid1;
301 waitp = &trans->hmp->flush_tid1;
302 }
303 if (tid <= pfsd.sync_end_tid)
304 tsleep(waitp, PCATCH, "hmrmwt", 0);
305 }
306 hammer_rel_pseudofs(trans->hmp, pfsm);
307 if (error == EINTR) {
308 pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
309 error = 0;
310 }
311 return(error);
312}
313
314
315/*
842e7a70
MD
316 * Auto-detect the pseudofs and do basic bounds checking.
317 */
318static
319int
320hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs, hammer_inode_t ip)
321{
322 int error = 0;
323
324 if (pfs->pfs_id == -1)
325 pfs->pfs_id = (int)(ip->obj_localization >> 16);
326 if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS)
327 error = EINVAL;
328 if (pfs->bytes < sizeof(struct hammer_pseudofs_data))
329 error = EINVAL;
330 return(error);
331}
332
333/*
334 * Rollback the specified PFS to (trunc_tid - 1), removing everything
335 * greater or equal to trunc_tid. The PFS must not have been in no-mirror
336 * mode or the MIRROR_FILTERED scan will not work properly.
337 *
338 * This is typically used to remove any partial syncs when upgrading a
339 * slave to a master. It can theoretically also be used to rollback
340 * any PFS, including PFS#0, BUT ONLY TO POINTS THAT HAVE NOT YET BEEN
341 * PRUNED, and to points that are older only if they are on a retained
342 * (pruning softlink) boundary.
343 *
344 * Rollbacks destroy information. If you don't mind inode numbers changing
345 * a better way would be to cpdup a snapshot back onto the master.
346 */
347static
348int
349hammer_pfs_rollback(hammer_transaction_t trans,
350 hammer_pseudofs_inmem_t pfsm,
351 hammer_tid_t trunc_tid)
352{
353 struct hammer_cmirror cmirror;
354 struct hammer_cursor cursor;
355 struct hammer_base_elm key_cur;
356 int error;
ce0138a6 357 int seq;
842e7a70
MD
358
359 bzero(&cmirror, sizeof(cmirror));
360 bzero(&key_cur, sizeof(key_cur));
361 key_cur.localization = HAMMER_MIN_LOCALIZATION + pfsm->localization;
362 key_cur.obj_id = HAMMER_MIN_OBJID;
363 key_cur.key = HAMMER_MIN_KEY;
364 key_cur.create_tid = 1;
365 key_cur.rec_type = HAMMER_MIN_RECTYPE;
366
e86903d8 367 seq = trans->hmp->flusher.done;
ce0138a6 368
842e7a70
MD
369retry:
370 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
371 if (error) {
372 hammer_done_cursor(&cursor);
373 goto failed;
374 }
375 cursor.key_beg = key_cur;
376 cursor.key_end.localization = HAMMER_MAX_LOCALIZATION +
377 pfsm->localization;
378 cursor.key_end.obj_id = HAMMER_MAX_OBJID;
379 cursor.key_end.key = HAMMER_MAX_KEY;
380 cursor.key_end.create_tid = HAMMER_MAX_TID;
381 cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
382
383 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
384 cursor.flags |= HAMMER_CURSOR_BACKEND;
385
386 /*
387 * Do an optimized scan of only records created or modified
388 * >= trunc_tid, so we can fix up those records. We must
389 * still check the TIDs but this greatly reduces the size of
390 * the scan.
391 */
392 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
393 cursor.cmirror = &cmirror;
394 cmirror.mirror_tid = trunc_tid;
395
396 error = hammer_btree_first(&cursor);
397 while (error == 0) {
398 /*
399 * Abort the rollback.
400 */
401 if (error == 0) {
402 error = hammer_signal_check(trans->hmp);
403 if (error)
404 break;
405 }
406
407 /*
408 * We only care about leafs. Internal nodes can be returned
409 * in mirror-filtered mode (they are used to generate SKIP
410 * mrecords), but we don't need them for this code.
c9ce54d6
MD
411 *
412 * WARNING: See warnings in hammer_unlock_cursor() function.
842e7a70 413 */
732a1697 414 cursor.flags |= HAMMER_CURSOR_ATEDISK;
842e7a70
MD
415 if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF) {
416 key_cur = cursor.node->ondisk->elms[cursor.index].base;
417 error = hammer_pfs_delete_at_cursor(&cursor, trunc_tid);
418 }
419
ce0138a6
MD
420 while (hammer_flusher_meta_halflimit(trans->hmp) ||
421 hammer_flusher_undo_exhausted(trans, 2)) {
982be4bf 422 hammer_unlock_cursor(&cursor);
ce0138a6 423 hammer_flusher_wait(trans->hmp, seq);
982be4bf 424 hammer_lock_cursor(&cursor);
ce0138a6
MD
425 seq = hammer_flusher_async_one(trans->hmp);
426 }
427
842e7a70
MD
428 if (error == 0)
429 error = hammer_btree_iterate(&cursor);
430 }
431 if (error == ENOENT)
432 error = 0;
433 hammer_done_cursor(&cursor);
434 if (error == EDEADLK)
435 goto retry;
436failed:
437 return(error);
438}
439
440/*
441 * Helper function - perform rollback on a B-Tree element given trunc_tid.
442 *
443 * If create_tid >= trunc_tid the record is physically destroyed.
444 * If delete_tid >= trunc_tid it will be set to 0, undeleting the record.
445 */
446static
447int
448hammer_pfs_delete_at_cursor(hammer_cursor_t cursor, hammer_tid_t trunc_tid)
449{
450 hammer_btree_leaf_elm_t elm;
451 hammer_transaction_t trans;
452 int error;
453
454 elm = &cursor->node->ondisk->elms[cursor->index].leaf;
455 if (elm->base.create_tid < trunc_tid &&
456 elm->base.delete_tid < trunc_tid) {
457 return(0);
458 }
459 trans = cursor->trans;
460
461 if (elm->base.create_tid >= trunc_tid) {
462 error = hammer_delete_at_cursor(
463 cursor, HAMMER_DELETE_DESTROY,
464 cursor->trans->tid, cursor->trans->time32,
465 1, NULL);
466 } else if (elm->base.delete_tid >= trunc_tid) {
467 error = hammer_delete_at_cursor(
468 cursor, HAMMER_DELETE_ADJUST,
469 0, 0,
470 1, NULL);
471 } else {
472 error = 0;
473 }
474 return(error);
475}
476