/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.41 2008/04/27 00:45:37 dillon Exp $
 */
static int hammer_unload_inode(struct hammer_inode *ip);
static void hammer_flush_inode_copysync(hammer_inode_t ip);
static int hammer_mark_record_callback(hammer_record_t rec, void *data);

/*
 * The kernel is not actively referencing this vnode but is still holding
 * it open for recycling.
 *
 * This is called from the frontend.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);

	/*
	 * If the inode no longer has any references we recover its
	 * in-memory resources immediately.
	 *
	 * NOTE: called from frontend, use ino_rec instead of sync_ino_rec.
	 */
	if (ip->ino_rec.ino_nlinks == 0)
		vrecycle(ap->a_vp);
	return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode and will flush the inode to the
 * media.
 *
 * XXX Currently our sync code only runs through inodes with vnode
 * associations, so we depend on hammer_rel_inode() to sync any inode
 * record data to the block device prior to losing the association.
 * Otherwise transactions that the user expected to be distinct by
 * doing a manual sync may be merged.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct hammer_inode *ip;
	struct vnode *vp;

	vp = ap->a_vp;

	if ((ip = vp->v_data) != NULL) {
		vp->v_data = NULL;
		ip->vp = NULL;
		/*
		 * Don't let too many dependencies build up on unreferenced
		 * inodes or we could run ourselves out of memory.
		 */
		if (TAILQ_FIRST(&ip->depend_list)) {
			ip->hmp->reclaim_count += ip->depend_count;
			if (ip->hmp->reclaim_count > 256) {
				ip->hmp->reclaim_count = 0;
				hammer_flusher_async(ip->hmp);
			}
		}
		hammer_rel_inode(ip, 1);
	}
	return(0);
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
 */
int
hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp)
{
	struct vnode *vp;
	int error = 0;

	for (;;) {
		if ((vp = ip->vp) == NULL) {
			error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
			if (error)
				break;
			hammer_lock_ex(&ip->lock);
			if (ip->vp != NULL) {
				/* lost a race, dispose of the new vnode */
				hammer_unlock(&ip->lock);
				vp = *vpp;
				vp->v_type = VBAD;
				vx_put(vp);
				continue;
			}
			hammer_ref(&ip->lock);
			vp = *vpp;
			ip->vp = vp;
			vp->v_type = hammer_get_vnode_type(
					ip->ino_rec.base.base.obj_type);
			switch(ip->ino_rec.base.base.obj_type) {
			case HAMMER_OBJTYPE_CDEV:
			case HAMMER_OBJTYPE_BDEV:
				vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
				addaliasu(vp, ip->ino_data.rmajor,
					  ip->ino_data.rminor);
				break;
			case HAMMER_OBJTYPE_FIFO:
				vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
				break;
			default:
				break;
			}

			/*
			 * Only mark as the root vnode if the ip is not
			 * historical, otherwise the VFS cache will get
			 * confused.  The other half of the special handling
			 * is in hammer_vop_nlookupdotdot().
			 */
			if (ip->obj_id == HAMMER_OBJID_ROOT &&
			    ip->obj_asof == ip->hmp->asof) {
				vp->v_flag |= VROOT;
			}
			vp->v_data = (void *)ip;
			/* vnode locked by getnewvnode() */
			/* make related vnode dirty if inode dirty? */
			hammer_unlock(&ip->lock);
			if (vp->v_type == VREG)
				vinitvmio(vp, ip->ino_rec.ino_size);
			break;
		}

		/*
		 * loop if the vget fails (aka races), or if the vp
		 * no longer matches ip->vp.
		 */
		if (vget(vp, LK_EXCLUSIVE) == 0) {
			if (vp == ip->vp)
				break;
			vput(vp);
		}
	}
	return(error);
}

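/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a typical frontend path pairs hammer_get_inode() with hammer_get_vnode()
 * roughly as below.  The helper name example_lookup_vp() is hypothetical
 * and error handling is abbreviated.
 */
#if 0
static int
example_lookup_vp(hammer_transaction_t trans, u_int64_t obj_id,
		  hammer_tid_t asof, struct vnode **vpp)
{
	struct hammer_inode *ip;
	int error;

	ip = hammer_get_inode(trans, NULL, obj_id, asof, 0, &error);
	if (ip == NULL)
		return (error);
	error = hammer_get_vnode(ip, LK_EXCLUSIVE, vpp);
	hammer_rel_inode(ip, 0);	/* the vnode holds its own ref */
	return (error);
}
#endif
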
/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These
 * functions do not attach or detach the related vnode (use
 * hammer_get_vnode() for that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
		 u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_inode_info iinfo;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;

	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
loop:
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		hammer_ref(&ip->lock);
		*errorp = 0;
		return(ip);
	}

	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->hmp = hmp;
	ip->flags = flags & HAMMER_INODE_RO;
	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	if (hmp->ronly)
		ip->flags |= HAMMER_INODE_RO;
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->bio_list);
	TAILQ_INIT(&ip->bio_alt_list);
	TAILQ_INIT(&ip->depend_list);

	/*
	 * Locate the on-disk inode.
	 */
retry:
	hammer_init_cursor(trans, &cursor, cache);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;
	cursor.asof = iinfo.obj_asof;
	cursor.flags = HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_GET_DATA |
		       HAMMER_CURSOR_ASOF;

	*errorp = hammer_btree_lookup(&cursor);
	if (*errorp == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}
	/*
	 * On success the B-Tree lookup will hold the appropriate
	 * buffer cache buffers and provide a pointer to the requested
	 * information.  Copy the information to the in-memory inode
	 * and cache the B-Tree node to improve future operations.
	 */
	if (*errorp == 0) {
		ip->ino_rec = cursor.record->inode;
		ip->ino_data = cursor.data->inode;
		hammer_cache_node(cursor.node, &ip->cache[0]);
		if (cache)
			hammer_cache_node(cursor.node, cache);
	}

	/*
	 * On success load the inode's record and data and insert the
	 * inode into the B-Tree.  It is possible to race another lookup
	 * insertion of the same inode so deal with that condition too.
	 *
	 * The cursor's locked node interlocks against others creating and
	 * destroying ip while we were blocked.
	 */
	if (*errorp == 0) {
		hammer_ref(&ip->lock);
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_uncache_node(&ip->cache[0]);
			hammer_uncache_node(&ip->cache[1]);
			KKASSERT(ip->lock.refs == 1);
			--hammer_count_inodes;
			kfree(ip, M_HAMMER);
			hammer_done_cursor(&cursor);
			goto loop;
		}
		ip->flags |= HAMMER_INODE_ONDISK;
	} else {
		--hammer_count_inodes;
		kfree(ip, M_HAMMER);
		ip = NULL;
	}
	hammer_done_cursor(&cursor);
	return (ip);
}

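/*
 * Illustrative sketch (editor's addition): the in-memory inode cache is
 * keyed on (obj_id, obj_asof), so the same object may be instantiated once
 * per as-of timestamp.  A cache probe looks roughly like this; the helper
 * name example_probe_cache() is hypothetical.
 */
#if 0
static struct hammer_inode *
example_probe_cache(hammer_mount_t hmp, u_int64_t obj_id, hammer_tid_t asof)
{
	struct hammer_inode_info iinfo;

	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	return (hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root,
						  &iinfo));
}
#endif
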
/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced and also marked HAMMER_INODE_NEW,
 * preventing it from being synchronized too early.  The caller must
 * call hammer_finalize_inode() to make it available for media sync.
 *
 * The inode is created in-memory.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
		    struct ucred *cred, hammer_inode_t dip,
		    struct hammer_inode **ipp)
{
	hammer_mount_t hmp;
	struct hammer_inode *ip;
	uid_t xuid;

	hmp = trans->hmp;
	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	ip->obj_id = hammer_alloc_tid(trans);
	KKASSERT(ip->obj_id != 0);
	ip->obj_asof = hmp->asof;
	ip->hmp = hmp;
	ip->flush_state = HAMMER_FST_IDLE;
	ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
		    HAMMER_INODE_ITIMES;
	ip->flags |= HAMMER_INODE_NEW;

	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->bio_list);
	TAILQ_INIT(&ip->bio_alt_list);
	TAILQ_INIT(&ip->depend_list);

	ip->ino_rec.ino_atime = trans->time;
	ip->ino_rec.ino_mtime = trans->time;
	ip->ino_rec.ino_size = 0;
	ip->ino_rec.ino_nlinks = 0;

	ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	ip->ino_rec.base.base.obj_id = ip->obj_id;
	ip->ino_rec.base.base.key = 0;
	ip->ino_rec.base.base.create_tid = 0;
	ip->ino_rec.base.base.delete_tid = 0;
	ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
	ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);

	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	ip->ino_data.mode = vap->va_mode;
	ip->ino_data.ctime = trans->time;
	ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;

	switch(ip->ino_rec.base.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		ip->ino_data.rmajor = vap->va_rmajor;
		ip->ino_data.rminor = vap->va_rminor;
		break;
	default:
		break;
	}

	/*
	 * Calculate default uid/gid and overwrite with information from
	 * the vap.
	 */
	xuid = hammer_to_unix_xid(&dip->ino_data.uid);
	ip->ino_data.gid = dip->ino_data.gid;
	xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
				     &vap->va_mode);
	ip->ino_data.mode = vap->va_mode;

	if (vap->va_vaflags & VA_UID_UUID_VALID)
		ip->ino_data.uid = vap->va_uid_uuid;
	else if (vap->va_uid != (uid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
	if (vap->va_vaflags & VA_GID_UUID_VALID)
		ip->ino_data.gid = vap->va_gid_uuid;
	else if (vap->va_gid != (gid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);

	hammer_ref(&ip->lock);
	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
		hammer_unref(&ip->lock);
		panic("hammer_create_inode: duplicate obj_id %llx",
		      ip->obj_id);
	}
	*ipp = ip;
	return(0);
}

/*
 * Finalize a newly created inode, allowing it to be synchronized to the
 * media.  If an error occurred make sure the inode has been cleaned up and
 * will not be synchronized to the media.
 */
void
hammer_finalize_inode(hammer_transaction_t trans, hammer_inode_t ip, int error)
{
	if (error) {
		ip->flags &= ~HAMMER_INODE_MODMASK;

		KASSERT(ip->lock.refs == 1,
			("hammer_finalize_inode: %d refs\n", ip->lock.refs));
		KKASSERT(ip->vp == NULL);
		KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
		KKASSERT(ip->cursor_ip_refs == 0);
		KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

		KKASSERT(RB_EMPTY(&ip->rec_tree));
		KKASSERT(TAILQ_EMPTY(&ip->bio_list));
		KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
	}
	ip->flags &= ~HAMMER_INODE_NEW;
}

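/*
 * Illustrative sketch (editor's addition): inode creation is a two-step
 * protocol.  hammer_create_inode() returns a referenced in-memory inode
 * marked HAMMER_INODE_NEW, and hammer_finalize_inode() clears that flag
 * once the caller has finished wiring the object up, making it eligible
 * for media sync.  A caller might look roughly like this; the directory
 * entry helper named below is hypothetical.
 */
#if 0
	error = hammer_create_inode(&trans, vap, cred, dip, &nip);
	if (error == 0)
		error = example_add_entry(&trans, dip, name, nip);
	hammer_finalize_inode(&trans, nip, error);	/* cleans up on error */
#endif
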
/*
 * Called by hammer_sync_inode().
 */
int
hammer_update_inode(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_record_t record;
	int error;

	/*
	 * Locate the record on-disk and mark it as deleted.  Both the B-Tree
	 * node and the record must be marked deleted.  The record may or
	 * may not be physically deleted, depending on the retention policy.
	 *
	 * If the inode has already been deleted on-disk we have nothing
	 * else to do.
	 *
	 * XXX Update the inode record and data in-place if the retention
	 * policy allows it.
	 */
retry:
	error = 0;

	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_init_cursor(trans, &cursor, &ip->cache[0]);
		cursor.key_beg.obj_id = ip->obj_id;
		cursor.key_beg.key = 0;
		cursor.key_beg.create_tid = 0;
		cursor.key_beg.delete_tid = 0;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor.key_beg.obj_type = 0;
		cursor.asof = ip->obj_asof;
		cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
		cursor.flags |= HAMMER_CURSOR_BACKEND;

		error = hammer_btree_lookup(&cursor);
		if (error) {
			kprintf("error %d\n", error);
			Debugger("hammer_update_inode");
		}

		if (error == 0) {
			error = hammer_ip_delete_record(&cursor, trans->tid);
			if (error && error != EDEADLK) {
				kprintf("error %d\n", error);
				Debugger("hammer_update_inode2");
			}
			if (error == 0)
				ip->flags |= HAMMER_INODE_DELONDISK;
			hammer_cache_node(cursor.node, &ip->cache[0]);
		}
		hammer_done_cursor(&cursor);
		if (error == EDEADLK)
			goto retry;
	}

	/*
	 * Write out a new record if the in-memory inode is not marked
	 * as having been deleted.  Update our inode statistics if this
	 * is the first application of the inode on-disk.
	 *
	 * If the inode has been deleted permanently, HAMMER_INODE_DELONDISK
	 * will remain set and prevent further updates.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
		record = hammer_alloc_mem_record(ip);
		record->state = HAMMER_FST_FLUSH;
		record->rec.inode = ip->sync_ino_rec;
		record->rec.inode.base.base.create_tid = trans->tid;
		record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
		record->data = (void *)&ip->sync_ino_data;
		record->flags |= HAMMER_RECF_INTERLOCK_BE;
		error = hammer_ip_sync_record(trans, record);
		if (error) {
			kprintf("error %d\n", error);
			Debugger("hammer_update_inode3");
		}

		/*
		 * The record isn't managed by the inode's record tree,
		 * destroy it whether we succeed or fail.
		 */
		record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
		record->flags |= HAMMER_RECF_DELETED_FE;
		record->state = HAMMER_FST_IDLE;
		KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);
		hammer_rel_mem_record(record);

		if (error == 0) {
			ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
					    HAMMER_INODE_DDIRTY |
					    HAMMER_INODE_ITIMES);
			ip->flags &= ~HAMMER_INODE_DELONDISK;
			if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
				hammer_modify_volume(trans, trans->rootvol,
						     NULL, 0);
				++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans->rootvol);
				ip->flags |= HAMMER_INODE_ONDISK;
			}
		}
	}
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
		/*
		 * Clean out any left-over flags if the inode has been
		 * completely deleted.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
				    HAMMER_INODE_DDIRTY |
				    HAMMER_INODE_ITIMES);
	}
	return(error);
}

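/*
 * Illustrative note (editor's addition): hammer_update_inode() preserves
 * history by deleting the old B-Tree inode record (bounding it with a
 * delete_tid) and inserting a fresh one, conceptually:
 *
 *	old record: create_tid = T1, delete_tid = T2	(historical)
 *	new record: create_tid = T2, delete_tid = 0	(current)
 *
 * As-of lookups with asof < T2 continue to see the old record.
 */
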
/*
 * Update only the itimes fields.  This is done non-historically.  The
 * record is updated in-place on the disk.
 */
int
hammer_update_itimes(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	struct hammer_inode_record *rec;
	int error;

retry:
	error = 0;
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_init_cursor(trans, &cursor, &ip->cache[0]);
		cursor.key_beg.obj_id = ip->obj_id;
		cursor.key_beg.key = 0;
		cursor.key_beg.create_tid = 0;
		cursor.key_beg.delete_tid = 0;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor.key_beg.obj_type = 0;
		cursor.asof = ip->obj_asof;
		cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
		cursor.flags |= HAMMER_CURSOR_BACKEND;

		error = hammer_btree_lookup(&cursor);
		if (error) {
			kprintf("error %d\n", error);
			Debugger("hammer_update_itimes1");
		}
		if (error == 0) {
			/*
			 * Do not generate UNDO records for atime/mtime
			 * updates.
			 */
			rec = &cursor.record->inode;
			hammer_modify_buffer(cursor.trans,
					     cursor.record_buffer, NULL, 0);
			rec->ino_atime = ip->sync_ino_rec.ino_atime;
			rec->ino_mtime = ip->sync_ino_rec.ino_mtime;
			hammer_modify_buffer_done(cursor.record_buffer);
			ip->sync_flags &= ~HAMMER_INODE_ITIMES;
			/* XXX recalculate crc */
			hammer_cache_node(cursor.node, &ip->cache[0]);
		}
		hammer_done_cursor(&cursor);
		if (error == EDEADLK)
			goto retry;
	}
	return(error);
}

/*
 * Release a reference on an inode.  If asked to flush, the last release
 * will flush the inode.
 *
 * On the last reference we queue the inode to the flusher for its final
 * disposition.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
	/*
	 * Handle disposition when dropping the last ref.
	 */
	while (ip->lock.refs == 1) {
		if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
			hammer_unload_inode(ip);
			return;
		}

		/*
		 * Hand the inode over to the flusher, which will
		 * add another ref to it.
		 */
		if (++ip->hmp->reclaim_count > 256) {
			ip->hmp->reclaim_count = 0;
			hammer_flush_inode(ip, HAMMER_FLUSH_FORCE |
					       HAMMER_FLUSH_SIGNAL);
		} else {
			hammer_flush_inode(ip, HAMMER_FLUSH_FORCE);
		}
	}

	/*
	 * The inode still has multiple refs, drop one ref.  If a flush was
	 * requested make sure the flusher sees it.  New inodes which have
	 * not been finalized cannot be flushed.
	 */
	if (flush && ip->flush_state == HAMMER_FST_IDLE &&
	    (ip->flags & HAMMER_INODE_NEW) == 0) {
		hammer_flush_inode(ip, HAMMER_FLUSH_RELEASE);
	} else {
		hammer_unref(&ip->lock);
	}
}

/*
 * Unload and destroy the specified inode.  Must be called with one remaining
 * reference.  The reference is disposed of.
 *
 * This can only be called in the context of the flusher.
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
	KASSERT(ip->lock.refs == 1,
		("hammer_unload_inode: %d refs\n", ip->lock.refs));
	KKASSERT(ip->vp == NULL);
	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	KKASSERT(ip->cursor_ip_refs == 0);
	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

	KKASSERT(RB_EMPTY(&ip->rec_tree));
	KKASSERT(TAILQ_EMPTY(&ip->bio_list));
	KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));

	RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);

	hammer_uncache_node(&ip->cache[0]);
	hammer_uncache_node(&ip->cache[1]);
	--hammer_count_inodes;
	kfree(ip, M_HAMMER);

	return(0);
}

/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_RDIRTY:	Inode record has been updated
 * HAMMER_INODE_DDIRTY: Inode data has been updated
 * HAMMER_INODE_XDIRTY: Dirty frontend buffer cache buffer strategized
 * HAMMER_INODE_DELETED: Inode record/data must be deleted
 * HAMMER_INODE_ITIMES: mtime/atime has been updated
 */
void
hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
{
	KKASSERT((ip->flags & HAMMER_INODE_RO) == 0 ||
		 (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
			   HAMMER_INODE_XDIRTY|
			   HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);

	ip->flags |= flags;
}

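/*
 * Illustrative sketch (editor's addition): a frontend VOP that changes
 * inode state updates the in-memory copy and then records the matching
 * dirty bits, e.g. after a hypothetical mode change:
 */
#if 0
	ip->ino_data.mode = new_mode;	/* new_mode is hypothetical */
	ip->ino_data.ctime = trans->time;
	hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
#endif
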
/*
 * Flush an inode.  If the inode is already being flushed wait for
 * it to complete, then flush it again.  The interlock is against
 * front-end transactions, the backend flusher does not hold the lock.
 *
 * The flusher must distinguish between the records that are part of the
 * flush and any new records created in parallel with the flush.  The
 * inode data and truncation fields are also copied.  BIOs are a bit more
 * troublesome because some dirty buffers may not have been queued yet.
 */
void
hammer_flush_inode(hammer_inode_t ip, int flags)
{
	KKASSERT((ip->flags & HAMMER_INODE_NEW) == 0);
	if (ip->flush_state != HAMMER_FST_IDLE &&
	    (ip->flags & HAMMER_INODE_MODMASK)) {
		ip->flags |= HAMMER_INODE_REFLUSH;
		if (flags & HAMMER_FLUSH_RELEASE) {
			hammer_unref(&ip->lock);
			KKASSERT(ip->lock.refs > 0);
		}
		return;
	}
	if (ip->flush_state == HAMMER_FST_IDLE) {
		if ((ip->flags & HAMMER_INODE_MODMASK) ||
		    (flags & HAMMER_FLUSH_FORCE)) {
			/*
			 * Add a reference to represent the inode being queued
			 * to the flusher.  If the caller wants us to
			 * release a reference the two cancel each other out.
			 */
			if ((flags & HAMMER_FLUSH_RELEASE) == 0)
				hammer_ref(&ip->lock);

			hammer_flush_inode_copysync(ip);
			/*
			 * Move the inode to the flush list and add a ref to
			 * it representing it on the list.
			 */
			TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip,
					  flush_entry);
			if (flags & HAMMER_FLUSH_SIGNAL)
				hammer_flusher_async(ip->hmp);
		}
	}
}

/*
 * Helper routine to copy the frontend synchronization state to the backend.
 * This routine may be called by either the frontend or the backend.
 */
static void
hammer_flush_inode_copysync(hammer_inode_t ip)
{
	int error;
	int count;

	/*
	 * Prevent anyone else from trying to do the same thing.
	 */
	ip->flush_state = HAMMER_FST_SETUP;

	/*
	 * Sync the buffer cache.  This will queue the BIOs.  If called
	 * from the context of the flusher the BIOs are thrown into bio_list
	 * regardless of ip->flush_state.
	 */
	if (ip->vp != NULL)
		error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
	else
		error = 0;

	/*
	 * This freezes strategy writes, any further BIOs will be
	 * queued to alt_bio (unless we are called from the context
	 * of the flusher).
	 */
	ip->flush_state = HAMMER_FST_FLUSH;

	/*
	 * Snapshot the state of the inode for the backend flusher.
	 *
	 * The truncation must be retained in the frontend until after
	 * we've actually performed the record deletion.
	 */
	ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
	ip->sync_trunc_off = ip->trunc_off;
	ip->sync_ino_rec = ip->ino_rec;
	ip->sync_ino_data = ip->ino_data;
	ip->flags &= ~HAMMER_INODE_MODMASK |
		     HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS;

	/*
	 * Fix up the dirty buffer status.
	 */
	if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL)
		ip->flags &= ~HAMMER_INODE_BUFS;
	if (TAILQ_FIRST(&ip->bio_list))
		ip->sync_flags |= HAMMER_INODE_BUFS;
	else
		ip->sync_flags &= ~HAMMER_INODE_BUFS;

	/*
	 * Set the state for the inode's in-memory records.  If some records
	 * could not be marked for backend flush (i.e. deleted records),
	 * re-set the XDIRTY flag.
	 */
	count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
			hammer_mark_record_callback, NULL);
	if (count)
		ip->flags |= HAMMER_INODE_XDIRTY;
}

/*
 * Mark records for backend flush, accumulate a count of the number of
 * records which could not be marked.  Records marked for deletion
 * by the frontend never make it to the media.  It is possible for
 * a record queued to the backend to wind up with FE set after the
 * fact, as long as BE has not yet been set.  The backend deals with
 * this race by syncing the record as if FE had not been set, and
 * then converting the record to a delete-on-disk record.
 */
static int
hammer_mark_record_callback(hammer_record_t rec, void *data)
{
	if (rec->state == HAMMER_FST_FLUSH) {
		return(0);
	} else if ((rec->flags & HAMMER_RECF_DELETED_FE) == 0) {
		rec->state = HAMMER_FST_FLUSH;
		hammer_ref(&rec->lock);
		return(0);
	} else {
		return(1);
	}
}

/*
 * Wait for a previously queued flush to complete.
 */
void
hammer_wait_inode(hammer_inode_t ip)
{
	while (ip->flush_state == HAMMER_FST_FLUSH) {
		ip->flags |= HAMMER_INODE_FLUSHW;
		tsleep(&ip->flags, 0, "hmrwin", 0);
	}
}

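/*
 * Illustrative sketch (editor's addition): pairing a signalled flush with
 * hammer_wait_inode() yields fsync-like behavior, assuming the inode has
 * modifications pending (HAMMER_INODE_MODMASK).
 */
#if 0
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	hammer_wait_inode(ip);	/* sleeps until flush_state leaves
				 * HAMMER_FST_FLUSH */
#endif
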
/*
 * Called by the backend code when a flush has been completed.
 * The inode has already been removed from the flush list.
 *
 * A pipelined flush can occur, in which case we must re-enter the
 * inode on the list and re-copy its fields.
 */
void
hammer_flush_inode_done(hammer_inode_t ip)
{
	struct bio *bio;

	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);

	if (ip->sync_flags)
		kprintf("ip %p leftover sync_flags %08x\n",
			ip, ip->sync_flags);
	ip->flags |= ip->sync_flags;
	ip->flush_state = HAMMER_FST_IDLE;

	/*
	 * Reflush any BIOs that wound up in the alt list.  Our inode will
	 * also wind up at the end of the flusher's list.
	 */
	while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
		TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
		TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
		ip->flags |= HAMMER_INODE_XDIRTY;
		ip->flags |= HAMMER_INODE_REFLUSH;
		kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip,
			bio->bio_offset, bio->bio_buf->b_bufsize);
	}

	/*
	 * If the frontend made more changes and requested another flush,
	 * do it now.
	 */
	if (ip->flags & HAMMER_INODE_REFLUSH) {
		ip->flags &= ~HAMMER_INODE_REFLUSH;
		hammer_flush_inode(ip, 0);
	}
	if (ip->flags & HAMMER_INODE_FLUSHW) {
		ip->flags &= ~HAMMER_INODE_FLUSHW;
		wakeup(&ip->flags);
	}
	hammer_rel_inode(ip, 0);
}

/*
 * Called from hammer_sync_inode() to synchronize in-memory records
 * to the media.
 */
static int
hammer_sync_record_callback(hammer_record_t record, void *data)
{
	hammer_transaction_t trans = data;
	int error;

	/*
	 * Skip records that do not belong to the current flush.  Records
	 * belonging to the flush will have been referenced for us.
	 */
	if (record->state != HAMMER_FST_FLUSH)
		return(0);

	/*
	 * Interlock the record using the BE flag.  Once BE is set the
	 * frontend cannot change the state of FE.
	 *
	 * NOTE: If FE is set prior to us setting BE we still sync the
	 * record out, but the flush completion code converts it to
	 * a delete-on-disk record instead of destroying it.
	 */
	hammer_lock_ex(&record->lock);
	if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
		hammer_unlock(&record->lock);
		return(0);
	}
	record->flags |= HAMMER_RECF_INTERLOCK_BE;

	/*
	 * If DELETED_FE is set we may have already sent dependent pieces
	 * to the disk and we must flush the record as if it hadn't been
	 * deleted.  This creates a bit of a mess because we have to
	 * have ip_sync_record convert the record to DELETE_ONDISK before
	 * it inserts the B-Tree record.  Otherwise the media sync might
	 * be visible to the frontend.
	 */
	if (record->flags & HAMMER_RECF_DELETED_FE)
		record->flags |= HAMMER_RECF_CONVERT_DELETE_ONDISK;

	/*
	 * Assign the create_tid for new records.  Deletions already
	 * have the record's entire key properly set up.
	 */
	if ((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0)
		record->rec.inode.base.base.create_tid = trans->tid;
	error = hammer_ip_sync_record(trans, record);

	if (error) {
		error = -error;
		if (error != -ENOSPC) {
			kprintf("hammer_sync_record_callback: sync failed rec "
				"%p, error %d\n", record, error);
			Debugger("sync failed rec");
		}
	}
	hammer_flush_record_done(record, error);
	return(error);
}

int
hammer_sync_inode(hammer_inode_t ip, int handle_delete)
{
	struct hammer_transaction trans;
	struct bio *bio;
	hammer_depend_t depend;
	int error, tmp_error;

	if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0 &&
	    handle_delete == 0) {
		return(0);
	}

	hammer_start_transaction_fls(&trans, ip->hmp);

	/*
	 * Any (directory) records this inode depends on must also be
	 * synchronized.  The directory itself only needs to be flushed
	 * if its inode is not already on-disk.
	 */
	while ((depend = TAILQ_FIRST(&ip->depend_list)) != NULL) {
		hammer_record_t record;

		record = depend->record;
		TAILQ_REMOVE(&depend->record->depend_list, depend, rec_entry);
		TAILQ_REMOVE(&ip->depend_list, depend, ip_entry);

		if (record->state != HAMMER_FST_FLUSH) {
			record->state = HAMMER_FST_FLUSH;
			/* add ref (steal ref from dependency) */
		} else {
			/* remove ref related to dependency */
			/* record still has at least one ref from state */
			hammer_unref(&record->lock);
			KKASSERT(record->lock.refs > 0);
		}
		if (record->ip->flags & HAMMER_INODE_ONDISK) {
			hammer_sync_record_callback(record, &trans);
		} else {
			KKASSERT((record->ip->flags & HAMMER_INODE_NEW) == 0);
			hammer_flush_inode(record->ip, 0);
		}
		hammer_unref(&ip->lock);
		KKASSERT(ip->lock.refs > 0);
		kfree(depend, M_HAMMER);
	}

	/*
	 * Sync inode deletions and truncations.
	 */
	if (ip->sync_ino_rec.ino_nlinks == 0 && handle_delete &&
	    (ip->flags & HAMMER_INODE_GONE) == 0) {
		/*
		 * Handle the case where the inode has been completely deleted
		 * and is no longer referenceable from the filesystem
		 * namespace.
		 *
		 * NOTE: We do not set the RDIRTY flag when updating the
		 * delete_tid, setting HAMMER_INODE_DELETED takes care of it.
		 */
		ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED;
		ip->flags &= ~HAMMER_INODE_TRUNCATED;
		ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
		if (ip->vp)
			vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
		error = hammer_ip_delete_range_all(&trans, ip);
		if (error)
			Debugger("hammer_ip_delete_range_all errored");

		/*
		 * Sanity check.  The only records that remain should be
		 * marked for back-end deletion.
		 */
		{
			hammer_record_t rec;

			RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) {
				KKASSERT(rec->state == HAMMER_FST_FLUSH);
			}
		}

		/*
		 * Set delete_tid in both the frontend and backend
		 * copy of the inode record.
		 */
		ip->ino_rec.base.base.delete_tid = trans.tid;
		ip->sync_ino_rec.base.base.delete_tid = trans.tid;

		/*
		 * Indicate that the inode has/is-being deleted.
		 */
		ip->flags |= HAMMER_INODE_DELETED;
		hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY);
		hammer_modify_volume(&trans, trans.rootvol, NULL, 0);
		--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
		hammer_modify_volume_done(trans.rootvol);
	} else if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
		/*
		 * Interlock trunc_off.  The VOP front-end may continue to
		 * make adjustments to it while we are blocked.
		 */
		off_t trunc_off;
		off_t aligned_trunc_off;

		trunc_off = ip->sync_trunc_off;
		aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
				    ~(int64_t)HAMMER_BUFMASK;

		/*
		 * Delete any whole blocks on-media.  The front-end has
		 * already cleaned out any partial block and made it
		 * pending.  The front-end may have updated trunc_off
		 * while we were blocked so do not just unconditionally
		 * set it to the maximum offset.
		 */
		kprintf("sync truncation range @ %016llx\n",
			aligned_trunc_off);
		error = hammer_ip_delete_range(&trans, ip,
					       aligned_trunc_off,
					       0x7FFFFFFFFFFFFFFFLL);
		if (error)
			Debugger("hammer_ip_delete_range errored");
		ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
		if (ip->trunc_off >= trunc_off) {
			ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
			ip->flags &= ~HAMMER_INODE_TRUNCATED;
		}
	} else {
		error = 0; /* XXX vfsync used to be here */
	}

	/*
	 * Flush any queued BIOs.
	 */
	while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
		TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
		kprintf("dowrite %016llx ip %p bio %p @ %016llx\n",
			trans.tid, ip, bio, bio->bio_offset);
		tmp_error = hammer_dowrite(&trans, ip, bio);
		if (tmp_error)
			error = tmp_error;
	}
	ip->sync_flags &= ~HAMMER_INODE_BUFS;

	/*
	 * Now sync related records.
	 */
	tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
			    hammer_sync_record_callback, &trans);
	KKASSERT(tmp_error <= 0);
	if (tmp_error)
		error = -tmp_error;

	/*
	 * XDIRTY represents rec_tree and bio_list.  However, rec_tree may
	 * contain new front-end records so short of scanning it we can't
	 * just test whether it is empty or not.
	 *
	 * If no error occurred assume we succeeded.
	 */
	if (error == 0)
		ip->sync_flags &= ~HAMMER_INODE_XDIRTY;

	if (error)
		Debugger("RB_SCAN errored");

	/*
	 * Now update the inode's on-disk inode-data and/or on-disk record.
	 * DELETED and ONDISK are managed only in ip->flags.
	 */
	switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
	case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
		/*
		 * If deleted and on-disk, don't set any additional flags.
		 * The delete flag takes care of things.
		 */
		break;
	case HAMMER_INODE_DELETED:
		/*
		 * Take care of the case where a deleted inode was never
		 * flushed to the disk in the first place.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
				    HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES);
		while (RB_ROOT(&ip->rec_tree)) {
			hammer_record_t record = RB_ROOT(&ip->rec_tree);
			hammer_ref(&record->lock);
			KKASSERT(record->lock.refs == 1);
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
			hammer_cleardep_mem_record(record);
			hammer_rel_mem_record(record);
		}
		break;
	case HAMMER_INODE_ONDISK:
		/*
		 * If already on-disk, do not set any additional flags.
		 */
		break;
	default:
		/*
		 * If not on-disk and not deleted, set both dirty flags
		 * to force an initial record to be written.  Also set
		 * the create_tid for the inode.
		 *
		 * Set create_tid in both the frontend and backend
		 * copy of the inode record.
		 */
		ip->ino_rec.base.base.create_tid = trans.tid;
		ip->sync_ino_rec.base.base.create_tid = trans.tid;
		ip->sync_flags |= HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY;
		break;
	}

	/*
	 * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
	 * is already on-disk the old record is marked as deleted.
	 *
	 * If DELETED is set hammer_update_inode() will delete the existing
	 * record without writing out a new one.
	 *
	 * If *ONLY* the ITIMES flag is set we can update the record in-place.
	 */
	if (ip->flags & HAMMER_INODE_DELETED) {
		error = hammer_update_inode(&trans, ip);
	} else
	if ((ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
			       HAMMER_INODE_ITIMES)) == HAMMER_INODE_ITIMES) {
		error = hammer_update_itimes(&trans, ip);
	} else
	if (ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
			      HAMMER_INODE_ITIMES)) {
		error = hammer_update_inode(&trans, ip);
	}
	if (error)
		Debugger("hammer_update_itimes/inode errored");

	/*
	 * Save the TID we used to sync the inode with to make sure we
	 * do not improperly reuse it.
	 */
	hammer_done_transaction(&trans);
	return(error);
}

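/*
 * Illustrative sketch (editor's addition): the backend flusher drains
 * hmp->flush_list, syncing each inode and completing the flush.  This is
 * a rough approximation of that loop, not the flusher's actual code.
 */
#if 0
	while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
		TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
		hammer_sync_inode(ip, 1);	/* handle_delete=1 shown for
						 * illustration only */
		hammer_flush_inode_done(ip);	/* may requeue on REFLUSH */
	}
#endif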