From 1f07f68660b83c5d87d56552b65ddc489b520a53 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 2 May 2008 01:00:42 +0000 Subject: [PATCH] HAMMER 40A/Many: Inode/link-count sequencer. * Remove the hammer_depend structure and build the dependancies directly into the hammer_record structure. * Attempt to implement layout rules to ensure connectivity is maintained. This means, for example, that before HAMMER can flush a newly created file it will make sure the file has namespace connectivity to the directory it was created it, recursively to the root. NOTE: 40A destabilizes the filesystem a bit, it's going to take a few passes to get everything working properly. There are numerous issues with this commit. --- sys/vfs/hammer/hammer.h | 98 ++-- sys/vfs/hammer/hammer_btree.c | 4 +- sys/vfs/hammer/hammer_cursor.h | 3 +- sys/vfs/hammer/hammer_flusher.c | 108 ++-- sys/vfs/hammer/hammer_inode.c | 905 ++++++++++++++++++++++---------- sys/vfs/hammer/hammer_object.c | 246 +++++---- sys/vfs/hammer/hammer_undo.c | 37 +- sys/vfs/hammer/hammer_vfsops.c | 3 +- sys/vfs/hammer/hammer_vnops.c | 35 +- 9 files changed, 935 insertions(+), 504 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index a0a88333fe..1eb5fcf23b 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 The DragonFly Project. All rights reserved. + * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.54 2008/04/29 01:10:37 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.55 2008/05/02 01:00:42 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -135,16 +135,15 @@ hammer_lock_excl_owned(struct hammer_lock *lock, thread_t td) } /* - * inode->inode dependancy + * Flush state, used by various structures */ -typedef struct hammer_depend { - TAILQ_ENTRY(hammer_depend) ip_entry; - TAILQ_ENTRY(hammer_depend) rec_entry; - struct hammer_inode *ip; - struct hammer_record *record; -} *hammer_depend_t; +typedef enum hammer_inode_state { + HAMMER_FST_IDLE, + HAMMER_FST_SETUP, + HAMMER_FST_FLUSH +} hammer_inode_state_t; -TAILQ_HEAD(hammer_depend_list, hammer_depend); +TAILQ_HEAD(hammer_record_list, hammer_record); /* * Cache object ids. 
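To keep the new sequencer model straight while reading the diff: the hammer_depend structure is gone and both inodes and records now carry the three-state flush_state defined above. The fragment below is a minimal userspace sketch of that lifecycle, not HAMMER code (the model_* names are invented): an object in SETUP has unresolved dependancies, entering FLUSH assigns it to a numbered flush group, and completion drops it back to IDLE or SETUP depending on what is left behind.

#include <stdio.h>

/* Simplified model of the flush lifecycle; state names mirror the patch. */
enum flush_state { FST_IDLE, FST_SETUP, FST_FLUSH };

struct model_inode {
    enum flush_state flush_state;
    int flush_group;        /* valid only while in FST_FLUSH */
    int ndepend;            /* stand-in for a non-empty target_list/rec_tree */
};

/* Move an idle or setup inode into the flush group being built. */
static void model_start_flush(struct model_inode *ip, int next_group)
{
    if (ip->flush_state != FST_FLUSH) {
        ip->flush_state = FST_FLUSH;
        ip->flush_group = next_group;
    }
}

/* Backend finished: fall back to IDLE or SETUP based on leftovers. */
static void model_flush_done(struct model_inode *ip)
{
    ip->flush_state = ip->ndepend ? FST_SETUP : FST_IDLE;
}

int main(void)
{
    struct model_inode ino = { FST_SETUP, 0, 1 };

    model_start_flush(&ino, 1);
    model_flush_done(&ino);                 /* still has a dependancy */
    printf("state after flush: %d\n", ino.flush_state); /* FST_SETUP */
    return 0;
}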
A fixed number of objid cache structures are @@ -194,24 +193,18 @@ RB_PROTOTYPEX(hammer_rec_rb_tree, INFO, hammer_record, rb_node, TAILQ_HEAD(hammer_node_list, hammer_node); -typedef enum hammer_inode_state { - HAMMER_FST_IDLE, - HAMMER_FST_SETUP, - HAMMER_FST_FLUSH -} hammer_inode_state_t; - struct hammer_inode { - RB_ENTRY(hammer_inode) rb_node; - hammer_inode_state_t flush_state; + RB_ENTRY(hammer_inode) rb_node; + hammer_inode_state_t flush_state; + int flush_group; TAILQ_ENTRY(hammer_inode) flush_entry; - struct hammer_depend_list depend_list; + struct hammer_record_list target_list; /* target of dependant recs */ u_int64_t obj_id; /* (key) object identifier */ hammer_tid_t obj_asof; /* (key) snapshot or 0 */ struct hammer_mount *hmp; hammer_objid_cache_t objid_cache; int flags; int error; /* flush error */ - int depend_count; int cursor_ip_refs; /* sanity */ struct vnode *vp; struct lockf advlock; @@ -243,7 +236,7 @@ typedef struct hammer_inode *hammer_inode_t; #define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */ #define HAMMER_INODE_RDIRTY 0x0002 /* in-memory ino_rec is dirty */ #define HAMMER_INODE_ITIMES 0x0004 /* in-memory mtime/atime modified */ -#define HAMMER_INODE_XDIRTY 0x0008 /* in-memory records/flsbufs present */ +#define HAMMER_INODE_XDIRTY 0x0008 /* in-memory records */ #define HAMMER_INODE_ONDISK 0x0010 /* inode is on-disk (else not yet) */ #define HAMMER_INODE_FLUSH 0x0020 /* flush on last ref */ #define HAMMER_INODE_DELETED 0x0080 /* inode ready for deletion */ @@ -253,44 +246,60 @@ typedef struct hammer_inode *hammer_inode_t; #define HAMMER_INODE_DONDISK 0x0800 /* data records may be on disk */ #define HAMMER_INODE_BUFS 0x1000 /* dirty high level bps present */ #define HAMMER_INODE_REFLUSH 0x2000 /* pipelined flush during flush */ -#define HAMMER_INODE_UNUSED4000 0x4000 +#define HAMMER_INODE_WRITE_ALT 0x4000 /* strategy writes to alt bioq */ #define HAMMER_INODE_FLUSHW 0x8000 /* Someone waiting for flush */ #define HAMMER_INODE_TRUNCATED 0x00010000 -#define HAMMER_INODE_NEW 0x00020000 +#define HAMMER_INODE_DELETING 0x00020000 /* Destroy the inode on-disk */ #define HAMMER_INODE_MODMASK (HAMMER_INODE_DDIRTY|HAMMER_INODE_RDIRTY| \ HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS| \ - HAMMER_INODE_ITIMES|HAMMER_INODE_TRUNCATED) + HAMMER_INODE_ITIMES|HAMMER_INODE_TRUNCATED|\ + HAMMER_INODE_DELETING) + +#define HAMMER_INODE_MODMASK_NOXDIRTY \ + (HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY) #define HAMMER_MAX_INODE_CURSORS 4 #define HAMMER_FLUSH_SIGNAL 0x0001 #define HAMMER_FLUSH_FORCE 0x0002 -#define HAMMER_FLUSH_RELEASE 0x0004 +#define HAMMER_FLUSH_RECURSION 0x0004 /* - * Structure used to represent an unsynchronized record in-memory. This - * structure is orgranized in a per-inode RB-tree. If the inode is not + * Structure used to represent an unsynchronized record in-memory. These + * records typically represent directory entries. Only non-historical + * records are kept in-memory. + * + * Records are organized as a per-inode RB-Tree. If the inode is not * on disk then neither are any records and the in-memory record tree * represents the entire contents of the inode. If the inode is on disk * then the on-disk B-Tree is scanned in parallel with the in-memory * RB-Tree to synthesize the current state of the file. * - * Only current (delete_tid == 0) unsynchronized records are kept in-memory. - * - * blocked is the count of the number of cursors (ip_first/ip_next) blocked - * on the record waiting for a synchronization to complete. 
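The structure changes above replace the per-inode depend_list with a target_list: a directory-entry record still lives under the directory inode that owns it (record->ip), but now also points at the inode the entry creates or deletes (record->target_ip) and is queued on that inode's target_list. Below is a rough stand-alone sketch of that double linkage, using a plain singly linked list in place of the kernel TAILQ; the m_* names are invented for illustration.

#include <stddef.h>
#include <stdio.h>

enum rec_type { MEM_RECORD_ADD, MEM_RECORD_DEL };

struct m_inode;

/* A directory-entry record: owned by the directory, pointing at the file. */
struct m_record {
    enum rec_type type;
    struct m_inode *ip;           /* directory the entry lives in */
    struct m_inode *target_ip;    /* inode the entry creates or deletes */
    struct m_record *target_next; /* stand-in for the target_list linkage */
};

struct m_inode {
    long obj_id;
    struct m_record *target_list; /* records that target this inode */
};

/* Bind a new directory entry to its target inode. */
static void m_add_directory(struct m_inode *dir, struct m_inode *ip,
                            struct m_record *rec)
{
    rec->type = MEM_RECORD_ADD;
    rec->ip = dir;
    rec->target_ip = ip;
    rec->target_next = ip->target_list;
    ip->target_list = rec;
}

int main(void)
{
    struct m_inode dir = { 100, NULL }, file = { 101, NULL };
    struct m_record entry;

    m_add_directory(&dir, &file, &entry);
    printf("file %ld has %s dependant record\n", file.obj_id,
           file.target_list ? "a" : "no");
    return 0;
}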
+ * Records are also used to enforce the ordering of directory create/delete + * operations. A new inode will not be flushed to disk unless its related + * directory entry is also being flushed at the same time. A directory entry + * will not be removed unless its related inode is also being removed at the + * same time. */ +typedef enum hammer_record_type { + HAMMER_MEM_RECORD_ADD, /* positive memory cache record */ + HAMMER_MEM_RECORD_DEL /* negative delete-on-disk record */ +} hammer_record_type_t; + struct hammer_record { RB_ENTRY(hammer_record) rb_node; - hammer_inode_state_t state; + TAILQ_ENTRY(hammer_record) target_entry; + hammer_inode_state_t flush_state; + int flush_group; + hammer_record_type_t type; struct hammer_lock lock; struct hammer_inode *ip; + struct hammer_inode *target_ip; union hammer_record_ondisk rec; union hammer_data_ondisk *data; int flags; - struct hammer_depend_list depend_list; }; typedef struct hammer_record *hammer_record_t; @@ -306,8 +315,7 @@ typedef struct hammer_record *hammer_record_t; #define HAMMER_RECF_INBAND 0x0010 #define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */ #define HAMMER_RECF_WANTED 0x0040 -#define HAMMER_RECF_DELETE_ONDISK 0x0080 -#define HAMMER_RECF_CONVERT_DELETE_ONDISK 0x0100 /* special case */ +#define HAMMER_RECF_CONVERT_DELETE 0x0100 /* special case */ /* * In-memory structures representing on-disk structures. @@ -510,8 +518,10 @@ struct hammer_mount { int ronly; int nvolumes; int volume_iterator; - int flusher_seq; - int flusher_act; + int flusher_signal; /* flusher thread sequencer */ + int flusher_act; /* currently active flush group */ + int flusher_done; /* set to act when complete */ + int flusher_next; /* next flush group */ int flusher_exiting; int reclaim_count; thread_t flusher_td; @@ -533,11 +543,9 @@ struct hammer_mount { struct netexport export; struct hammer_lock sync_lock; struct lock blockmap_lock; - hammer_inode_t flusher_demark; struct hammer_blockmap blockmap[HAMMER_MAX_ZONES]; struct hammer_holes holes[HAMMER_MAX_ZONES]; TAILQ_HEAD(, hammer_inode) flush_list; - TAILQ_HEAD(, hammer_inode) flush_alt_list; TAILQ_HEAD(, hammer_objid_cache) objid_cache_list; }; @@ -571,6 +579,7 @@ extern int hammer_count_buffers; extern int hammer_count_nodes; extern int hammer_count_dirtybufs; extern int hammer_limit_dirtybufs; +extern int hammer_bio_count; extern int64_t hammer_contention_count; int hammer_vop_inactive(struct vop_inactive_args *); @@ -596,7 +605,7 @@ int hammer_ip_resolve_data(hammer_cursor_t cursor); int hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid); int hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes); int hammer_ip_check_directory_empty(hammer_transaction_t trans, - hammer_inode_t ip); + hammer_cursor_t parent_cursor, hammer_inode_t ip); int hammer_sync_hmp(hammer_mount_t hmp, int waitfor); hammer_record_t @@ -604,7 +613,6 @@ hammer_record_t void hammer_flush_record_done(hammer_record_t record, int error); void hammer_wait_mem_record(hammer_record_t record); void hammer_rel_mem_record(hammer_record_t record); -void hammer_cleardep_mem_record(struct hammer_record *record); int hammer_cursor_up(hammer_cursor_t cursor); int hammer_cursor_down(hammer_cursor_t cursor); @@ -731,6 +739,9 @@ hammer_off_t hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off, int *errorp); hammer_off_t hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t bmap_off, int *errorp); +int64_t hammer_undo_space(hammer_mount_t hmp); +int64_t hammer_undo_max(hammer_mount_t hmp); + void 
hammer_start_transaction(struct hammer_transaction *trans, struct hammer_mount *hmp); @@ -749,10 +760,9 @@ void hammer_wait_inode(hammer_inode_t ip); int hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap, struct ucred *cred, struct hammer_inode *dip, struct hammer_inode **ipp); -void hammer_finalize_inode(hammer_transaction_t trans, hammer_inode_t ip, - int error); void hammer_rel_inode(hammer_inode_t ip, int flush); -int hammer_sync_inode(hammer_inode_t ip, int handle_delete); +int hammer_sync_inode(hammer_inode_t ip); +void hammer_test_inode(hammer_inode_t ip); int hammer_ip_add_directory(struct hammer_transaction *trans, hammer_inode_t dip, struct namecache *ncp, diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 73205a459b..42314ac0a8 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.39 2008/04/26 19:08:14 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.40 2008/05/02 01:00:42 dillon Exp $ */ /* @@ -655,7 +655,7 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags) * called. * * The caller may depend on the cursor's exclusive lock after return to - * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE_ONDISK). + * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE). * * ENOSPC is returned if there is no room to insert a new record. */ diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h index 3bd8eb4f11..3dac4cf80d 100644 --- a/sys/vfs/hammer/hammer_cursor.h +++ b/sys/vfs/hammer/hammer_cursor.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.15 2008/04/24 21:20:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.16 2008/05/02 01:00:42 dillon Exp $ */ /* @@ -120,7 +120,6 @@ typedef struct hammer_cursor *hammer_cursor_t; #define HAMMER_CURSOR_DELETE_VISIBILITY 0x0010 /* special del-on-disk recs */ #define HAMMER_CURSOR_END_INCLUSIVE 0x0020 /* key_end is inclusive */ #define HAMMER_CURSOR_END_EXCLUSIVE 0x0040 /* key_end is exclusive (def) */ -#define HAMMER_CURSOR_UNUSED0080 0x0080 #define HAMMER_CURSOR_ATEDISK 0x0100 #define HAMMER_CURSOR_ATEMEM 0x0200 diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index d2c698ebde..b777579f3c 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.8 2008/04/29 04:43:08 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.9 2008/05/02 01:00:42 dillon Exp $ */ /* * HAMMER dependancy flusher thread @@ -55,10 +55,13 @@ hammer_flusher_sync(hammer_mount_t hmp) int seq; if (hmp->flusher_td) { - seq = ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); - while ((int)(seq - hmp->flusher_act) > 0) - tsleep(&hmp->flusher_act, 0, "hmrfls", 0); + seq = hmp->flusher_next; + if (hmp->flusher_signal == 0) { + hmp->flusher_signal = 1; + wakeup(&hmp->flusher_signal); + } + while ((int)(seq - hmp->flusher_done) > 0) + tsleep(&hmp->flusher_done, 0, "hmrfls", 0); } } @@ -66,14 +69,20 @@ void hammer_flusher_async(hammer_mount_t hmp) { if (hmp->flusher_td) { - ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); + if (hmp->flusher_signal == 0) { + hmp->flusher_signal = 1; + wakeup(&hmp->flusher_signal); + } } } void hammer_flusher_create(hammer_mount_t hmp) { + hmp->flusher_signal = 0; + hmp->flusher_act = 0; + hmp->flusher_done = 0; + hmp->flusher_next = 1; lwkt_create(hammer_flusher_thread, hmp, &hmp->flusher_td, NULL, 0, -1, "hammer"); } @@ -83,10 +92,11 @@ hammer_flusher_destroy(hammer_mount_t hmp) { if (hmp->flusher_td) { hmp->flusher_exiting = 1; - ++hmp->flusher_seq; - wakeup(&hmp->flusher_seq); - while (hmp->flusher_td) + while (hmp->flusher_td) { + hmp->flusher_signal = 1; + wakeup(&hmp->flusher_signal); tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0); + } } } @@ -94,34 +104,31 @@ static void hammer_flusher_thread(void *arg) { hammer_mount_t hmp = arg; - int seq; - - hmp->flusher_demark = kmalloc(sizeof(struct hammer_inode), - M_HAMMER, M_WAITOK | M_ZERO); - TAILQ_INSERT_TAIL(&hmp->flush_list, hmp->flusher_demark, flush_entry); for (;;) { - seq = hmp->flusher_seq; + hmp->flusher_act = hmp->flusher_next; + ++hmp->flusher_next; + kprintf("F"); hammer_flusher_clean_loose_ios(hmp); hammer_flusher_flush(hmp); hammer_flusher_clean_loose_ios(hmp); - hmp->flusher_act = seq; - wakeup(&hmp->flusher_act); + hmp->flusher_done = hmp->flusher_act; + + wakeup(&hmp->flusher_done); /* - * Loop if more got queued after our demark. + * Wait for activity. */ - if (TAILQ_NEXT(hmp->flusher_demark, flush_entry)) - continue; - - if (hmp->flusher_exiting) + if (hmp->flusher_exiting && TAILQ_EMPTY(&hmp->flush_list)) break; - while (hmp->flusher_seq == hmp->flusher_act) - tsleep(&hmp->flusher_seq, 0, "hmrwwa", 0); + kprintf("E"); + + while (hmp->flusher_signal == 0 && + TAILQ_EMPTY(&hmp->flush_list)) { + tsleep(&hmp->flusher_signal, 0, "hmrwwa", 0); + } + hmp->flusher_signal = 0; } - TAILQ_REMOVE(&hmp->flush_list, hmp->flusher_demark, flush_entry); - kfree(hmp->flusher_demark, M_HAMMER); - hmp->flusher_demark = NULL; hmp->flusher_td = NULL; wakeup(&hmp->flusher_exiting); lwkt_exit(); @@ -164,22 +171,27 @@ hammer_flusher_flush(hammer_mount_t hmp) rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX]; start_offset = rootmap->next_offset; - if (hammer_debug_general & 0x00010000) - kprintf("x"); - - TAILQ_REMOVE(&hmp->flush_list, hmp->flusher_demark, flush_entry); - TAILQ_INSERT_TAIL(&hmp->flush_list, hmp->flusher_demark, flush_entry); + while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) { + /* + * Stop when we hit a different flush group + */ + if (ip->flush_group != hmp->flusher_act) + break; - while ((ip = TAILQ_FIRST(&hmp->flush_list)) != hmp->flusher_demark) { + /* + * Remove the inode from the flush list and inherit + * its reference, sync, and clean-up. 
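The single flusher_seq counter is replaced by four fields: flusher_next numbers the group currently being filled, the flusher thread claims that group into flusher_act, flusher_done trails behind as groups complete, and flusher_signal only wakes the thread. The following is a compressed, single-threaded model of just the counter arithmetic, with no real sleep/wakeup; the m_* names are invented.

#include <stdio.h>

/* Counter model of the flush-group sequencer. */
struct m_flusher {
    int signal;   /* set by callers to wake the thread */
    int act;      /* group currently being flushed */
    int done;     /* last group fully flushed */
    int next;     /* group new work is being queued into */
};

/* What hammer_flusher_sync() waits for: work queued before the call. */
static int m_sync_target(struct m_flusher *f)
{
    f->signal = 1;          /* would wakeup() the flusher thread */
    return f->next;         /* caller sleeps until (target - done) <= 0 */
}

/* One pass of the flusher thread loop, reduced to the bookkeeping. */
static void m_flusher_pass(struct m_flusher *f)
{
    f->act = f->next++;     /* claim the group and open a new one */
    /* ... flush every inode whose flush_group == f->act ... */
    f->done = f->act;       /* wakeup(&done) so waiters can re-check */
    f->signal = 0;
}

int main(void)
{
    struct m_flusher f = { 0, 0, 0, 1 };
    int target = m_sync_target(&f);

    while ((int)(target - f.done) > 0)
        m_flusher_pass(&f);
    printf("flush group %d completed\n", f.done);
    return 0;
}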
+ */ TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry); + kprintf("s"); + ip->error = hammer_sync_inode(ip); + hammer_flush_inode_done(ip); /* - * We inherit the inode ref from the flush list + * XXX this breaks atomicy */ - ip->error = hammer_sync_inode(ip, (ip->vp ? 0 : 1)); - hammer_flush_inode_done(ip); - if (hmp->locked_dirty_count > 64 || - hammer_must_finalize_undo(hmp)) { + if (hammer_must_finalize_undo(hmp)) { + Debugger("Too many undos!!"); hammer_flusher_finalize(hmp, root_volume, start_offset); start_offset = rootmap->next_offset; } @@ -197,22 +209,12 @@ static int hammer_must_finalize_undo(hammer_mount_t hmp) { - hammer_blockmap_t rootmap; - int bytes; - int max_bytes; - - rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX]; - - if (rootmap->first_offset <= rootmap->next_offset) { - bytes = (int)(rootmap->next_offset - rootmap->first_offset); + if (hammer_undo_space(hmp) < hammer_undo_max(hmp) / 2) { + kprintf("*"); + return(1); } else { - bytes = (int)(rootmap->alloc_offset - rootmap->first_offset + - rootmap->next_offset); + return(0); } - max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK); - if (bytes > max_bytes / 2) - kprintf("*"); - return (bytes > max_bytes / 2); } /* diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index d9cbef2bc2..f461a04e44 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.44 2008/04/29 04:43:08 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.45 2008/05/02 01:00:42 dillon Exp $ */ #include "hammer.h" @@ -39,8 +39,10 @@ #include static int hammer_unload_inode(struct hammer_inode *ip); -static void hammer_flush_inode_copysync(hammer_inode_t ip); -static int hammer_mark_record_callback(hammer_record_t rec, void *data); +static void hammer_flush_inode_core(hammer_inode_t ip, int flags); +static int hammer_setup_child_callback(hammer_record_t rec, void *data); +static int hammer_inode_unloadable_check(hammer_inode_t ip); +static int hammer_setup_parent_inodes(hammer_record_t record); /* * The kernel is not actively referencing this vnode but is still holding @@ -62,26 +64,21 @@ hammer_vop_inactive(struct vop_inactive_args *ap) } /* - * If the inode no longer has any references we recover its - * in-memory resources immediately. - * - * NOTE: called from frontend, use ino_rec instead of sync_ino_rec. + * If the inode no longer has visibility in the filesystem and is + * fairly clean, try to recycle it immediately. This can deadlock + * in vfsync() if we aren't careful. */ - if (ip->ino_rec.ino_nlinks == 0) + if (hammer_inode_unloadable_check(ip) && ip->ino_rec.ino_nlinks == 0) vrecycle(ap->a_vp); return(0); } /* * Release the vnode association. This is typically (but not always) - * the last reference on the inode and will flush the inode to the - * buffer cache. + * the last reference on the inode. * - * XXX Currently our sync code only runs through inodes with vnode - * associations, so we depend on hammer_rel_inode() to sync any inode - * record data to the block device prior to losing the association. - * Otherwise transactions that the user expected to be distinct by - * doing a manual sync may be merged. + * Once the association is lost we are on our own with regards to + * flushing the inode. 
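On the hammer_must_finalize_undo() change a few hunks above: the open-coded byte accounting is deferred to hammer_undo_space() and hammer_undo_max(), and the flusher finalizes as soon as less than half of the undo area remains free. A tiny numeric illustration, with made-up sizes:

#include <stdio.h>
#include <stdint.h>

/* Finalize once free undo space drops below half of the total. */
static int must_finalize_undo(int64_t undo_space, int64_t undo_max)
{
    return undo_space < undo_max / 2;
}

int main(void)
{
    int64_t undo_max = 64LL << 20;      /* hypothetical 64MB undo FIFO */

    printf("%d\n", must_finalize_undo(40LL << 20, undo_max)); /* 0: plenty */
    printf("%d\n", must_finalize_undo(20LL << 20, undo_max)); /* 1: finalize */
    return 0;
}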
*/ int hammer_vop_reclaim(struct vop_reclaim_args *ap) @@ -94,18 +91,6 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap) if ((ip = vp->v_data) != NULL) { vp->v_data = NULL; ip->vp = NULL; - - /* - * Don't let too many dependancies build up on unreferenced - * inodes or we could run ourselves out of memory. - */ - if (TAILQ_FIRST(&ip->depend_list)) { - ip->hmp->reclaim_count += ip->depend_count; - if (ip->hmp->reclaim_count > 256) { - ip->hmp->reclaim_count = 0; - hammer_flusher_async(ip->hmp); - } - } hammer_rel_inode(ip, 1); } return(0); @@ -235,7 +220,7 @@ loop: RB_INIT(&ip->rec_tree); TAILQ_INIT(&ip->bio_list); TAILQ_INIT(&ip->bio_alt_list); - TAILQ_INIT(&ip->depend_list); + TAILQ_INIT(&ip->target_list); /* * Locate the on-disk inode. @@ -303,9 +288,7 @@ retry: /* * Create a new filesystem object, returning the inode in *ipp. The - * returned inode will be referenced and also marked HAMMER_INODE_NEW, - * preventing it from being synchronized too early. The caller must - * call hammer_finalize_inode() to make it available for media sync. + * returned inode will be referenced. * * The inode is created in-memory. */ @@ -328,13 +311,12 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, ip->flush_state = HAMMER_FST_IDLE; ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY | HAMMER_INODE_ITIMES; - ip->flags |= HAMMER_INODE_NEW; ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; RB_INIT(&ip->rec_tree); TAILQ_INIT(&ip->bio_list); TAILQ_INIT(&ip->bio_alt_list); - TAILQ_INIT(&ip->depend_list); + TAILQ_INIT(&ip->target_list); ip->ino_rec.ino_atime = trans->time; ip->ino_rec.ino_mtime = trans->time; @@ -392,31 +374,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, return(0); } -/* - * Finalize a newly created inode, allowing it to be synchronized to the - * media. If an error occured make sure the inode has been cleaned up and - * will not be synchronized to the media. - */ -void -hammer_finalize_inode(hammer_transaction_t trans, hammer_inode_t ip, int error) -{ - if (error) { - ip->flags &= ~HAMMER_INODE_MODMASK; - - KASSERT(ip->lock.refs == 1, - ("hammer_unload_inode: %d refs\n", ip->lock.refs)); - KKASSERT(ip->vp == NULL); - KKASSERT(ip->flush_state == HAMMER_FST_IDLE); - KKASSERT(ip->cursor_ip_refs == 0); - KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0); - - KKASSERT(RB_EMPTY(&ip->rec_tree)); - KKASSERT(TAILQ_EMPTY(&ip->bio_list)); - KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list)); - } - ip->flags &= ~HAMMER_INODE_NEW; -} - /* * Called by hammer_sync_inode(). 
*/ @@ -460,15 +417,16 @@ retry: Debugger("hammer_update_inode"); } - if (error == 0) { error = hammer_ip_delete_record(&cursor, trans->tid); if (error && error != EDEADLK) { kprintf("error %d\n", error); Debugger("hammer_update_inode2"); } - if (error == 0) + if (error == 0) { ip->flags |= HAMMER_INODE_DELONDISK; + ip->sync_flags &= ~HAMMER_INODE_DELETING; + } hammer_cache_node(cursor.node, &ip->cache[0]); } hammer_done_cursor(&cursor); @@ -486,7 +444,7 @@ retry: */ if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) { record = hammer_alloc_mem_record(ip); - record->state = HAMMER_FST_FLUSH; + record->flush_state = HAMMER_FST_FLUSH; record->rec.inode = ip->sync_ino_rec; record->rec.inode.base.base.create_tid = trans->tid; record->rec.inode.base.data_len = sizeof(ip->sync_ino_data); @@ -504,8 +462,7 @@ retry: */ record->flags &= ~HAMMER_RECF_INTERLOCK_BE; record->flags |= HAMMER_RECF_DELETED_FE; - record->state = HAMMER_FST_IDLE; - KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); + record->flush_state = HAMMER_FST_IDLE; hammer_rel_mem_record(record); if (error == 0) { @@ -513,6 +470,10 @@ retry: HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES); ip->flags &= ~HAMMER_INODE_DELONDISK; + + /* + * Root volume count of inodes + */ if ((ip->flags & HAMMER_INODE_ONDISK) == 0) { hammer_modify_volume(trans, trans->rootvol, NULL, 0); @@ -588,8 +549,7 @@ retry: } /* - * Release a reference on an inode. If asked to flush the last release - * will flush the inode. + * Release a reference on an inode, flush as requested. * * On the last reference we queue the inode to the flusher for its final * disposition. @@ -597,39 +557,57 @@ retry: void hammer_rel_inode(struct hammer_inode *ip, int flush) { + hammer_mount_t hmp = ip->hmp; + /* * Handle disposition when dropping the last ref. */ - while (ip->lock.refs == 1) { - if ((ip->flags & HAMMER_INODE_MODMASK) == 0) { - hammer_unload_inode(ip); - return; - } - - /* - * Hand the inode over to the flusher, which will - * add another ref to it. - */ - if (++ip->hmp->reclaim_count > 256) { - ip->hmp->reclaim_count = 0; - hammer_flush_inode(ip, HAMMER_FLUSH_FORCE | - HAMMER_FLUSH_SIGNAL); + for (;;) { + if (ip->lock.refs == 1) { + /* + * Determine whether on-disk action is needed for + * the inode's final disposition. + */ + if (hammer_inode_unloadable_check(ip)) { + hammer_unload_inode(ip); + break; + } + hammer_flush_inode(ip, 0); } else { - hammer_flush_inode(ip, HAMMER_FLUSH_FORCE); + /* + * We gotta flush inodes which do not have vnode + * associations. + */ +#if 0 + if (ip->vp == NULL) { + kprintf("v%d:%04x\n", ip->flush_state, ip->flags); + hammer_flush_inode(ip, 0); + } else +#endif + if (flush) { + hammer_flush_inode(ip, 0); + } + /* + * The inode still has multiple refs, try to drop + * one ref. + */ + KKASSERT(ip->lock.refs >= 1); + if (ip->lock.refs > 1) { + hammer_unref(&ip->lock); + break; + } } - /* retry */ } /* - * The inode still has multiple refs, drop one ref. If a flush was - * requested make sure the flusher sees it. New inodes which have - * not been finalized cannot be flushed. + * XXX bad hack until I add code to track inodes in SETUP. We + * can queue a lot of inodes to the syncer but if we don't wake + * it up the undo sets will be too large or too many unflushed + * records will build up and blow our malloc limit. 
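hammer_rel_inode() now loops on the last reference: if hammer_inode_unloadable_check() says the inode needs no further media action and is clean, it is torn down on the spot, otherwise it is queued to the flusher and retried. The check itself reduces to the sketch below; the ONDISK and DELETING bit values match the header diff, the other masks are simplified placeholders.

#include <stdio.h>

#define INODE_ONDISK    0x0010        /* matches the header diff */
#define INODE_DELONDISK 0x0400        /* placeholder */
#define INODE_DELETING  0x00020000    /* matches the header diff */
#define INODE_MODMASK   (0x0001 | INODE_DELETING)   /* "any dirty state" */

/* Reduced hammer_inode_unloadable_check(): 1 = safe to unload right now. */
static int unloadable_check(int *flags, int nlinks, int refs)
{
    if (nlinks == 0 &&
        (*flags & (INODE_ONDISK | INODE_DELONDISK)) == INODE_ONDISK)
        *flags |= INODE_DELETING;     /* must destroy the on-media inode */
    else
        *flags &= ~INODE_DELETING;

    return (refs == 1 && (*flags & INODE_MODMASK) == 0);
}

int main(void)
{
    int flags = INODE_ONDISK;

    /* Unlinked, on-disk inode: not unloadable, a flush must delete it. */
    printf("unload now: %d\n", unloadable_check(&flags, 0, 1));
    return 0;
}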
*/ - if (flush && ip->flush_state == HAMMER_FST_IDLE && - (ip->flags & HAMMER_INODE_NEW) == 0) { - hammer_flush_inode(ip, HAMMER_FLUSH_RELEASE); - } else { - hammer_unref(&ip->lock); + if (++hmp->reclaim_count > 256) { + hmp->reclaim_count = 0; + hammer_flusher_async(hmp); } } @@ -650,6 +628,7 @@ hammer_unload_inode(struct hammer_inode *ip) KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0); KKASSERT(RB_EMPTY(&ip->rec_tree)); + KKASSERT(TAILQ_EMPTY(&ip->target_list)); KKASSERT(TAILQ_EMPTY(&ip->bio_list)); KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list)); @@ -671,7 +650,8 @@ hammer_unload_inode(struct hammer_inode *ip) * * HAMMER_INODE_RDIRTY: Inode record has been updated * HAMMER_INODE_DDIRTY: Inode data has been updated - * HAMMER_INODE_XDIRTY: Dirty frontend buffer cache buffer strategized + * HAMMER_INODE_XDIRTY: Dirty in-memory records + * HAMMER_INODE_BUFS: Dirty front-end buffer cache buffers * HAMMER_INODE_DELETED: Inode record/data must be deleted * HAMMER_INODE_ITIMES: mtime/atime has been updated */ @@ -680,152 +660,421 @@ hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags) { KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 || (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY| - HAMMER_INODE_XDIRTY| + HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS| HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0); ip->flags |= flags; } /* - * Flush an inode. If the inode is already being flushed wait for - * it to complete, then flush it again. The interlock is against - * front-end transactions, the backend flusher does not hold the lock. + * Request that an inode be flushed. This whole mess cannot block and may + * recurse. Once requested HAMMER will attempt to actively flush it until + * the flush can be done. * - * The flusher must distinguish between the records that are part of the - * flush and any new records created in parallel with the flush. The - * inode data and truncation fields are also copied. BIOs are a bit more - * troublesome because some dirty buffers may not have been queued yet. + * The inode may already be flushing, or may be in a setup state. We can + * place the inode in a flushing state if it is currently idle and flag it + * to reflush if it is currently flushing. */ void hammer_flush_inode(hammer_inode_t ip, int flags) { - KKASSERT((ip->flags & HAMMER_INODE_NEW) == 0); - if (ip->flush_state != HAMMER_FST_IDLE && - (ip->flags & HAMMER_INODE_MODMASK)) { - if ((ip->flags & HAMMER_INODE_REFLUSH) == 0) { - ip->flags |= HAMMER_INODE_REFLUSH; - if (flags & HAMMER_FLUSH_RELEASE) { - hammer_unref(&ip->lock); - KKASSERT(ip->lock.refs > 0); - } - if (flags & HAMMER_FLUSH_SIGNAL) - hammer_flusher_async(ip->hmp); + hammer_record_t depend; + int r, good; + + /* + * Trivial 'nothing to flush' case. If the inode is ina SETUP + * state we have to put it back into an IDLE state so we can + * drop the extra ref. + */ + if ((ip->flags & HAMMER_INODE_MODMASK) == 0 && + (flags & HAMMER_FLUSH_FORCE) == 0) { + if (ip->flush_state == HAMMER_FST_SETUP) { + ip->flush_state = HAMMER_FST_IDLE; + hammer_rel_inode(ip, 0); } return; } - if (ip->flush_state == HAMMER_FST_IDLE) { - if ((ip->flags & HAMMER_INODE_MODMASK) || - (flags & HAMMER_FLUSH_FORCE)) { - /* - * Add a reference to represent the inode being queued - * to the flusher. If the caller wants us to - * release a reference the two cancel each other out. 
- */ - if ((flags & HAMMER_FLUSH_RELEASE) == 0) - hammer_ref(&ip->lock); - hammer_flush_inode_copysync(ip); - /* - * Move the inode to the flush list and add a ref to - * it representing it on the list. - */ - TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); - if (flags & HAMMER_FLUSH_SIGNAL) - hammer_flusher_async(ip->hmp); + /* + * Our flush action will depend on the current state. + */ + switch(ip->flush_state) { + case HAMMER_FST_IDLE: + /* + * We have no dependancies and can flush immediately. Some + * our children may not be flushable so we have to re-test + * with that additional knowledge. + */ + hammer_flush_inode_core(ip, flags); + break; + case HAMMER_FST_SETUP: + /* + * Recurse upwards through dependancies via target_list + * and start their flusher actions going if possible. + * + * 'good' is our connectivity. -1 means we have none and + * can't flush, 0 means there weren't any dependancies, and + * 1 means we have good connectivity. + */ + good = 0; + TAILQ_FOREACH(depend, &ip->target_list, target_entry) { + r = hammer_setup_parent_inodes(depend); + if (r < 0 && good == 0) + good = -1; + if (r > 0) + good = 1; + } + + /* + * We can continue if good >= 0. Determine how many records + * under our inode can be flushed (and mark them). + */ + kprintf("g%d", good); + if (good >= 0) { + hammer_flush_inode_core(ip, flags); + } else { + ip->flags |= HAMMER_INODE_REFLUSH; + } + break; + default: + /* + * We are already flushing, flag the inode to reflush + * if needed after it completes its current flush. + */ + if ((ip->flags & HAMMER_INODE_REFLUSH) == 0) + ip->flags |= HAMMER_INODE_REFLUSH; + break; + } +} + +/* + * We are asked to recurse upwards and convert the record from SETUP + * to FLUSH if possible. record->ip is a parent of the caller's inode, + * and record->target_ip is the caller's inode. + * + * Return 1 if the record gives us connectivity + * + * Return 0 if the record is not relevant + * + * Return -1 if we can't resolve the dependancy and there is no connectivity. + */ +static int +hammer_setup_parent_inodes(hammer_record_t record) +{ + hammer_mount_t hmp = record->ip->hmp; + hammer_record_t depend; + hammer_inode_t ip; + int r, good; + + KKASSERT(record->flush_state != HAMMER_FST_IDLE); + ip = record->ip; + + /* + * If the record is already flushing, is it in our flush group? + * + * If it is in our flush group but it is a delete-on-disk, it + * does not improve our connectivity (return 0), and if the + * target inode is not trying to destroy itself we can't allow + * the operation yet anyway (the second return -1). + */ + if (record->flush_state == HAMMER_FST_FLUSH) { + if (record->flush_group != hmp->flusher_next) { + ip->flags |= HAMMER_INODE_REFLUSH; + return(-1); } + if (record->type == HAMMER_MEM_RECORD_ADD) + return(1); + return(0); + } + + /* + * It must be a setup record. Try to resolve the setup dependancies + * by recursing upwards so we can place ip on the flush list. + */ + KKASSERT(record->flush_state == HAMMER_FST_SETUP); + + good = 0; + TAILQ_FOREACH(depend, &ip->target_list, target_entry) { + r = hammer_setup_parent_inodes(depend); + if (r < 0 && good == 0) + good = -1; + if (r > 0) + good = 1; + } + + /* + * We can't flush ip because it has no connectivity (XXX also check + * nlinks for pre-existing connectivity!). Flag it so any resolution + * recurses back down. 
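The return convention of hammer_setup_parent_inodes() (-1 blocked, 0 not relevant, +1 gives connectivity) is folded together in hammer_flush_inode()'s SETUP case, and the inode may join the flush group as long as the combined result is not negative. The fold, extracted into a stand-alone function for clarity (combine_connectivity is an invented name):

#include <stdio.h>

/*
 * Combine per-record results the way the SETUP case does: start at 0,
 * any +1 wins, a -1 only sticks while nothing has given connectivity.
 */
static int combine_connectivity(const int *results, int n)
{
    int good = 0, i;

    for (i = 0; i < n; ++i) {
        if (results[i] < 0 && good == 0)
            good = -1;
        if (results[i] > 0)
            good = 1;
    }
    return good;    /* >= 0 means the inode may enter the flush group */
}

int main(void)
{
    int a[] = { 0, -1, 1 };   /* one blocked parent, one good one */
    int b[] = { -1, 0 };      /* blocked, nothing gives connectivity */

    printf("%d %d\n", combine_connectivity(a, 3), combine_connectivity(b, 2));
    return 0;
}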
+ */ + if (good < 0) { + ip->flags |= HAMMER_INODE_REFLUSH; + return(good); + } + + /* + * We are go, place the parent inode in a flushing state so we can + * place its record in a flushing state. Note that the parent + * may already be flushing. The record must be in the same flush + * group as the parent. + */ + if (ip->flush_state != HAMMER_FST_FLUSH) + hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION); + KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); + KKASSERT(record->flush_state == HAMMER_FST_SETUP); + +#if 0 + if (record->type == HAMMER_MEM_RECORD_DEL && + (record->target_ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELONDISK)) == 0) { + /* + * Regardless of flushing state we cannot sync this path if the + * record represents a delete-on-disk but the target inode + * is not ready to sync its own deletion. + * + * XXX need to count effective nlinks to determine whether + * the flush is ok, otherwise removing a hardlink will + * just leave the DEL record to rot. + */ + record->target_ip->flags |= HAMMER_INODE_REFLUSH; + return(-1); + } else +#endif + if (ip->flush_group == ip->hmp->flusher_next) { + /* + * This is the record we wanted to synchronize. + */ + record->flush_state = HAMMER_FST_FLUSH; + record->flush_group = ip->flush_group; + hammer_ref(&record->lock); + if (record->type == HAMMER_MEM_RECORD_ADD) + return(1); + + /* + * The record is a delete-n-disk. It does not contribute + * to our visibility. We can still flush it. + */ + return(0); + } else { + /* + * We couldn't resolve the dependancies, request that the + * inode be flushed when the dependancies can be resolved. + */ + ip->flags |= HAMMER_INODE_REFLUSH; + return(-1); } } /* - * Helper routine to copy the frontend synchronization state to the backend. - * This routine may be called by either the frontend or the backend. + * This is the core routine placing an inode into the FST_FLUSH state. */ static void -hammer_flush_inode_copysync(hammer_inode_t ip) +hammer_flush_inode_core(hammer_inode_t ip, int flags) { + int go_count; int error; - int count; + + KKASSERT(ip->flush_state != HAMMER_FST_FLUSH); + if (ip->flush_state == HAMMER_FST_IDLE) + hammer_ref(&ip->lock); + ip->flush_state = HAMMER_FST_FLUSH; + ip->flush_group = ip->hmp->flusher_next; /* - * Prevent anyone else from trying to do the same thing. + * Figure out how many in-memory records we can actually flush + * (not including inode meta-data, buffers, etc). */ - ip->flush_state = HAMMER_FST_SETUP; + if (flags & HAMMER_FLUSH_RECURSION) { + go_count = 1; + } else { + go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, + hammer_setup_child_callback, NULL); + } /* - * Sync the buffer cache. This will queue the BIOs. If called - * from the context of the flusher the BIO's are thrown into bio_list - * regardless of ip->flush_state. + * This is a more involved test that includes go_count. If we + * can't flush, flag the inode and return. If go_count is 0 we + * were are unable to flush any records in our rec_tree and + * must ignore the XDIRTY flag. */ - if (ip->vp != NULL) - error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL); - else - error = 0; + if (go_count == 0) { + if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) { + ip->flags |= HAMMER_INODE_REFLUSH; + ip->flush_state = HAMMER_FST_SETUP; + return; + } + } /* - * This freezes strategy writes, any further BIOs will be - * queued to alt_bio (unless we are + * Inodes not in an IDLE state get an extra reference. 
+ * + * Place the inode in a flush state and sync all frontend + * information to the backend. */ - ip->flush_state = HAMMER_FST_FLUSH; + + if ((flags & HAMMER_FLUSH_RECURSION) == 0) { + if (ip->vp != NULL) + error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL); + else + error = 0; + } + + /* + * Any further strategy calls will go into the inode's alternative + * bioq. + */ + ip->flags |= HAMMER_INODE_WRITE_ALT; /* * Snapshot the state of the inode for the backend flusher. * * The truncation must be retained in the frontend until after * we've actually performed the record deletion. + * + * NOTE: The DELETING flag is a mod flag, but it is also sticky, + * and stays in ip->flags. Once set, it stays set until the + * inode is destroyed. */ ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK); + ip->sync_flags &= ~HAMMER_INODE_DELETING; ip->sync_trunc_off = ip->trunc_off; ip->sync_ino_rec = ip->ino_rec; ip->sync_ino_data = ip->ino_data; ip->flags &= ~HAMMER_INODE_MODMASK | - HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS; + HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS | + HAMMER_INODE_DELETING; /* * Fix up the dirty buffer status. */ - if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL) - ip->flags &= ~HAMMER_INODE_BUFS; + if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL) { + if (TAILQ_FIRST(&ip->bio_alt_list) == NULL) + ip->flags &= ~HAMMER_INODE_BUFS; + } if (TAILQ_FIRST(&ip->bio_list)) ip->sync_flags |= HAMMER_INODE_BUFS; else ip->sync_flags &= ~HAMMER_INODE_BUFS; /* - * Set the state for the inode's in-memory records. If some records - * could not be marked for backend flush (i.e. deleted records), - * re-set the XDIRTY flag. + * The flusher inherits our inode and reference. */ - count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, - hammer_mark_record_callback, NULL); - if (count) - ip->flags |= HAMMER_INODE_XDIRTY; + TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry); + + if (flags & HAMMER_FLUSH_SIGNAL) + hammer_flusher_async(ip->hmp); } /* - * Mark records for backend flush, accumulate a count of the number of - * records which could not be marked. Records marked for deletion - * by the frontend never make it to the media. It is possible for - * a record queued to the backend to wind up with FE set after the - * fact, as long as BE has not yet been set. The backend deals with - * this race by syncing the record as if FE had not been set, and - * then converting the record to a delete-on-disk record. + * Callback for scan of ip->rec_tree. Try to include each record in our + * flush. ip->flush_group has been set but the inode has not yet been + * moved into a flushing state. + * + * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on + * both inodes. + * + * We return 1 for any record placed or found in FST_FLUSH, which prevents + * the caller from shortcutting the flush. */ static int -hammer_mark_record_callback(hammer_record_t rec, void *data) +hammer_setup_child_callback(hammer_record_t rec, void *data) { - if (rec->state == HAMMER_FST_FLUSH) { + hammer_inode_t target_ip; + hammer_inode_t ip; + int r; + + /* + * If the record has been deleted by the backend (it's being held + * by the frontend in a race), just ignore it. + */ + if (rec->flags & HAMMER_RECF_DELETED_BE) return(0); - } else if ((rec->flags & HAMMER_RECF_DELETED_FE) == 0) { - rec->state = HAMMER_FST_FLUSH; + + /* + * If the record is in an idle state it has no dependancies and + * can be flushed. 
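The child callback that follows makes a per-record decision: an IDLE record has no dependancy and always joins, a SETUP record joins if its target inode is already flushing in our group or can be pulled into it, and a record already in FLUSH is expected to belong to our group. A reduced decision table under those simplified inputs (child_joins_group is an invented name):

#include <stdio.h>

enum flush_state { FST_IDLE, FST_SETUP, FST_FLUSH };

/* Returns 1 if the child record joins our flush group, 0 if it must wait. */
static int child_joins_group(enum flush_state rec_state,
                             enum flush_state target_state,
                             int target_group, int our_group)
{
    switch (rec_state) {
    case FST_IDLE:
        return 1;                       /* no setup dependancy, take it */
    case FST_SETUP:
        if (target_state == FST_FLUSH)
            return target_group == our_group;  /* same group or wait */
        return 1;                       /* pull the target into our group */
    case FST_FLUSH:
        return 1;                       /* already ours (asserted in patch) */
    }
    return 0;
}

int main(void)
{
    /* Target stuck in an older flush group: the record has to wait. */
    printf("%d\n", child_joins_group(FST_SETUP, FST_FLUSH, 3, 4));
    printf("%d\n", child_joins_group(FST_IDLE, FST_IDLE, 0, 4));
    return 0;
}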
+ */ + ip = rec->ip; + r = 0; + + switch(rec->flush_state) { + case HAMMER_FST_IDLE: + /* + * Record has no setup dependancy, we can flush it. + */ + KKASSERT(rec->target_ip == NULL); + rec->flush_state = HAMMER_FST_FLUSH; + rec->flush_group = ip->flush_group; hammer_ref(&rec->lock); - return(0); - } else { - return(1); + r = 1; + break; + case HAMMER_FST_SETUP: + /* + * Record has a setup dependancy. Try to include the + * target ip in the flush. + * + * We have to be careful here, if we do not do the right + * thing we can lose track of dirty inodes and the system + * will lockup trying to allocate buffers. + */ + target_ip = rec->target_ip; + KKASSERT(target_ip != NULL); + KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE); + if (target_ip->flush_state == HAMMER_FST_FLUSH) { + /* + * If the target IP is already flushing in our group + * we are golden, otherwise make sure the target + * reflushes. + */ + if (target_ip->flush_group == ip->flush_group) { + rec->flush_state = HAMMER_FST_FLUSH; + rec->flush_group = ip->flush_group; + hammer_ref(&rec->lock); + r = 1; + } else { + target_ip->flags |= HAMMER_INODE_REFLUSH; + } + } else if (rec->type == HAMMER_MEM_RECORD_ADD) { + /* + * If the target IP is not flushing we can force + * it to flush, even if it is unable to write out + * any of its own records we have at least one in + * hand that we CAN deal with. + */ + rec->flush_state = HAMMER_FST_FLUSH; + rec->flush_group = ip->flush_group; + hammer_ref(&rec->lock); + hammer_flush_inode_core(target_ip, + HAMMER_FLUSH_RECURSION); + r = 1; + } else { + /* + * XXX this needs help. We have a delete-on-disk + * which could disconnect the target. If the target + * has its own dependancies they really need to + * be flushed. + * + * XXX + */ + rec->flush_state = HAMMER_FST_FLUSH; + rec->flush_group = ip->flush_group; + hammer_ref(&rec->lock); + hammer_flush_inode_core(target_ip, + HAMMER_FLUSH_RECURSION); + r = 1; + } + break; + case HAMMER_FST_FLUSH: + /* + * Record already associated with a flush group. It had + * better be ours. + */ + KKASSERT(rec->flush_group == ip->flush_group); + r = 1; + break; } + return(r); } - - /* * Wait for a previously queued flush to complete */ @@ -849,13 +1098,32 @@ void hammer_flush_inode_done(hammer_inode_t ip) { struct bio *bio; + int dorel = 0; KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); - if (ip->sync_flags) - kprintf("ip %p leftover sync_flags %08x\n", ip, ip->sync_flags); + /* + * Allow BIOs to queue to the inode's primary bioq again. + */ + ip->flags &= ~HAMMER_INODE_WRITE_ALT; + + /* + * Merge left-over flags back into the frontend and fix the state. + */ ip->flags |= ip->sync_flags; - ip->flush_state = HAMMER_FST_IDLE; + if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) { + ip->flush_state = HAMMER_FST_IDLE; + dorel = 1; + } else { + ip->flush_state = HAMMER_FST_SETUP; + } + + /* + * The backend may have adjusted nlinks, so if the adjusted nlinks + * does not match the fronttend set the frontend's RDIRTY flag again. + */ + if (ip->ino_rec.ino_nlinks != ip->sync_ino_rec.ino_nlinks) + ip->flags |= HAMMER_INODE_RDIRTY; /* * Reflush any BIOs that wound up in the alt list. 
Our inode will @@ -864,9 +1132,19 @@ hammer_flush_inode_done(hammer_inode_t ip) while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) { TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act); TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); + kprintf("d"); + ip->flags |= HAMMER_INODE_BUFS; + ip->flags |= HAMMER_INODE_REFLUSH; + } + + /* + * Re-set the XDIRTY flag if some of the inode's in-memory records + * could not be flushed. + */ + if (RB_ROOT(&ip->rec_tree)) { ip->flags |= HAMMER_INODE_XDIRTY; ip->flags |= HAMMER_INODE_REFLUSH; - kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip, bio->bio_offset, bio->bio_buf->b_bufsize); + kprintf("e"); } /* @@ -875,7 +1153,7 @@ hammer_flush_inode_done(hammer_inode_t ip) */ if (ip->flags & HAMMER_INODE_REFLUSH) { ip->flags &= ~HAMMER_INODE_REFLUSH; - hammer_flush_inode(ip, 0); + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); if (ip->flush_state == HAMMER_FST_IDLE) { if (ip->flags & HAMMER_INODE_FLUSHW) { ip->flags &= ~HAMMER_INODE_FLUSHW; @@ -888,7 +1166,8 @@ hammer_flush_inode_done(hammer_inode_t ip) wakeup(&ip->flags); } } - hammer_rel_inode(ip, 0); + if (dorel) + hammer_rel_inode(ip, 0); } /* @@ -902,11 +1181,19 @@ hammer_sync_record_callback(hammer_record_t record, void *data) int error; /* - * Skip records that do not belong to the current flush. Records - * belonging to the flush will have been referenced for us. + * Skip records that do not belong to the current flush. */ - if (record->state != HAMMER_FST_FLUSH) + if (record->flush_state != HAMMER_FST_FLUSH) return(0); + KKASSERT((record->flags & HAMMER_RECF_DELETED_BE) == 0); +#if 1 + if (record->flush_group != record->ip->flush_group) { + kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group); + Debugger("blah2"); + return(0); + } +#endif + KKASSERT(record->flush_group == record->ip->flush_group); /* * Interlock the record using the BE flag. Once BE is set the @@ -916,9 +1203,8 @@ hammer_sync_record_callback(hammer_record_t record, void *data) * record out, but the flush completion code converts it to * a delete-on-disk record instead of destroying it. */ - hammer_lock_ex(&record->lock); if (record->flags & HAMMER_RECF_INTERLOCK_BE) { - hammer_unlock(&record->lock); + hammer_flush_record_done(record, 0); return(0); } record->flags |= HAMMER_RECF_INTERLOCK_BE; @@ -927,18 +1213,20 @@ hammer_sync_record_callback(hammer_record_t record, void *data) * If DELETED_FE is set we may have already sent dependant pieces * to the disk and we must flush the record as if it hadn't been * deleted. This creates a bit of a mess because we have to - * have ip_sync_record convert the record to DELETE_ONDISK before + * have ip_sync_record convert the record to MEM_RECORD_DEL before * it inserts the B-Tree record. Otherwise the media sync might * be visible to the frontend. */ - if (record->flags & HAMMER_RECF_DELETED_FE) - record->flags |= HAMMER_RECF_CONVERT_DELETE_ONDISK; + if (record->flags & HAMMER_RECF_DELETED_FE) { + KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); + record->flags |= HAMMER_RECF_CONVERT_DELETE; + } /* * Assign the create_tid for new records. Deletions already * have the record's entire key properly set up. 
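The DELETED_FE / INTERLOCK_BE handling in hammer_sync_record_callback() is the frontend/backend race in miniature: the backend interlocks a record before writing it, and if the frontend has deleted it in the meantime the record is still written but flagged HAMMER_RECF_CONVERT_DELETE so that completion turns it into a delete-on-disk rather than quietly dropping it. A flag-level sketch; only the INTERLOCK_BE and CONVERT_DELETE bit values are taken from the header diff, the DELETED_* values are placeholders.

#include <stdio.h>

#define RECF_DELETED_FE     0x0004   /* placeholder bit values */
#define RECF_DELETED_BE     0x0008
#define RECF_INTERLOCK_BE   0x0020   /* matches the header diff */
#define RECF_CONVERT_DELETE 0x0100   /* matches the header diff */

/* Backend view of one record during the sync callback. */
static int backend_sync_record(int *flags)
{
    if (*flags & RECF_INTERLOCK_BE)
        return 0;                        /* already owned by the backend */
    *flags |= RECF_INTERLOCK_BE;
    if (*flags & RECF_DELETED_FE)
        *flags |= RECF_CONVERT_DELETE;   /* sync it, then delete-on-disk */
    /* ... the real code writes the B-Tree record here ... */
    return 1;
}

int main(void)
{
    int flags = RECF_DELETED_FE;         /* frontend deleted it under us */

    backend_sync_record(&flags);
    printf("convert to delete-on-disk: %s\n",
           (flags & RECF_CONVERT_DELETE) ? "yes" : "no");
    return 0;
}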
*/ - if ((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0) + if (record->type != HAMMER_MEM_RECORD_DEL) record->rec.inode.base.base.create_tid = trans->tid; error = hammer_ip_sync_record(trans, record); @@ -958,105 +1246,70 @@ hammer_sync_record_callback(hammer_record_t record, void *data) * XXX error handling */ int -hammer_sync_inode(hammer_inode_t ip, int handle_delete) +hammer_sync_inode(hammer_inode_t ip) { struct hammer_transaction trans; struct bio *bio; - hammer_depend_t depend; + hammer_record_t depend; + hammer_record_t next; int error, tmp_error; + u_int64_t nlinks; - if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0 && - handle_delete == 0) { + if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0) return(0); - } hammer_start_transaction_fls(&trans, ip->hmp); /* - * Any (directory) records this inode depends on must also be - * synchronized. The directory itself only needs to be flushed - * if its inode is not already on-disk. + * Any directory records referencing this inode which are not in + * our current flush group must adjust our nlink count for the + * purposes of synchronization to disk. + * + * Records which are in our flush group can be unlinked from our + * inode now, allowing the inode to be physically deleted. */ - while ((depend = TAILQ_FIRST(&ip->depend_list)) != NULL) { - hammer_record_t record; - - record = depend->record; - TAILQ_REMOVE(&depend->record->depend_list, depend, rec_entry); - TAILQ_REMOVE(&ip->depend_list, depend, ip_entry); - --ip->depend_count; - if (record->state != HAMMER_FST_FLUSH) { - record->state = HAMMER_FST_FLUSH; - /* add ref (steal ref from dependancy) */ - } else { - /* remove ref related to dependancy */ - /* record still has at least one ref from state */ - hammer_unref(&record->lock); - KKASSERT(record->lock.refs > 0); - } - if (record->ip->flags & HAMMER_INODE_ONDISK) { - kprintf("I"); - hammer_sync_record_callback(record, &trans); - } else { - kprintf("J"); - KKASSERT((record->ip->flags & HAMMER_INODE_NEW) == 0); - hammer_flush_inode(record->ip, 0); + nlinks = ip->ino_rec.ino_nlinks; + next = TAILQ_FIRST(&ip->target_list); + while ((depend = next) != NULL) { + next = TAILQ_NEXT(depend, target_entry); + if (depend->flush_state == HAMMER_FST_FLUSH && + depend->flush_group == ip->hmp->flusher_act) { + TAILQ_REMOVE(&ip->target_list, depend, target_entry); + depend->target_ip = NULL; + /* no need to signal target_ip, it is us */ + } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) { + switch(depend->type) { + case HAMMER_MEM_RECORD_ADD: + --nlinks; + break; + case HAMMER_MEM_RECORD_DEL: + ++nlinks; + break; + } } - hammer_unref(&ip->lock); - KKASSERT(ip->lock.refs > 0); - kfree(depend, M_HAMMER); } - /* - * Sync inode deletions and truncations. + * Set dirty if we had to modify the link count. */ - if (ip->sync_ino_rec.ino_nlinks == 0 && handle_delete && - (ip->flags & HAMMER_INODE_GONE) == 0) { - /* - * Handle the case where the inode has been completely deleted - * and is no longer referenceable from the filesystem - * namespace. - * - * NOTE: We do not set the RDIRTY flag when updating the - * delete_tid, setting HAMMER_INODE_DELETED takes care of it. 
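The nlinks loop added to hammer_sync_inode() computes the link count the media should see for this flush group: a dependant ADD record that is not being synced in this group means its directory entry does not exist on media yet, and an unsynced DEL record means the on-media entry has not been removed yet. Extracted as a stand-alone calculation (m_* names invented):

#include <stdio.h>
#include <stdint.h>

enum rec_type { MEM_RECORD_ADD, MEM_RECORD_DEL };

struct m_depend {
    enum rec_type type;
    int in_this_flush_group;   /* FLUSH state and group == flusher_act */
    int deleted_fe;
};

/*
 * Dependant records that are not being synced in this group have not
 * happened yet as far as the media is concerned.
 */
static uint64_t effective_nlinks(uint64_t frontend_nlinks,
                                 const struct m_depend *deps, int n)
{
    uint64_t nlinks = frontend_nlinks;
    int i;

    for (i = 0; i < n; ++i) {
        if (deps[i].in_this_flush_group || deps[i].deleted_fe)
            continue;
        if (deps[i].type == MEM_RECORD_ADD)
            --nlinks;          /* entry not on media yet */
        else
            ++nlinks;          /* on-media entry not yet removed */
    }
    return nlinks;
}

int main(void)
{
    /* One new entry synced now, one new entry left for a later group. */
    struct m_depend deps[] = {
        { MEM_RECORD_ADD, 1, 0 },
        { MEM_RECORD_ADD, 0, 0 },
    };

    printf("media nlinks: %llu\n",
           (unsigned long long)effective_nlinks(2, deps, 2));   /* 1 */
    return 0;
}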
- */ + if (ip->sync_ino_rec.ino_nlinks != nlinks) { + KKASSERT((int64_t)nlinks >= 0); + ip->sync_ino_rec.ino_nlinks = nlinks; + ip->sync_flags |= HAMMER_INODE_RDIRTY; + } - ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED; - ip->flags &= ~HAMMER_INODE_TRUNCATED; - ip->sync_flags &= ~HAMMER_INODE_TRUNCATED; - if (ip->vp) - vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE); + /* + * If the inode has been unlinked and no longer has a vnode + * ref, destroy its data. + * + * Otherwise if there is a trunction queued destroy any data past + * the (aligned) truncation point. Userland will have dealt with + * the buffer containing the truncation point for us. + */ + if (ip->sync_ino_rec.ino_nlinks == 0 && ip->vp == NULL) { error = hammer_ip_delete_range_all(&trans, ip); if (error) Debugger("hammer_ip_delete_range_all errored"); - - /* - * Sanity check. The only records that remain should be - * marked for back-end deletion. - */ - { - hammer_record_t rec; - - RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) { - KKASSERT(rec->state == HAMMER_FST_FLUSH); - } - } - - /* - * Set delete_tid in both the frontend and backend - * copy of the inode record. - */ - ip->ino_rec.base.base.delete_tid = trans.tid; - ip->sync_ino_rec.base.base.delete_tid = trans.tid; - - /* - * Indicate that the inode has/is-being deleted. - */ - ip->flags |= HAMMER_NODE_DELETED; - hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY); - hammer_modify_volume(&trans, trans.rootvol, NULL, 0); - --ip->hmp->rootvol->ondisk->vol0_stat_inodes; - hammer_modify_volume_done(trans.rootvol); } else if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { /* * Interlock trunc_off. The VOP front-end may continue to @@ -1076,7 +1329,6 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) * while we were blocked so do not just unconditionally * set it to the maximum offset. */ - kprintf("sync truncation range @ %016llx\n", aligned_trunc_off); error = hammer_ip_delete_range(&trans, ip, aligned_trunc_off, 0x7FFFFFFFFFFFFFFFLL); @@ -1087,14 +1339,78 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; ip->flags &= ~HAMMER_INODE_TRUNCATED; } + } else { + error = 0; } - error = 0; /* XXX vfsync used to be here */ + /* + * Now sync related records. These will typically be directory + * entries or delete-on-disk records. + */ + if (error == 0) { + tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, + hammer_sync_record_callback, &trans); + if (tmp_error < 0) + tmp_error = -error; + if (tmp_error) + error = tmp_error; + } + + /* + * Sync inode deletions, with certain restrictions. + * + * - Nlinks must be 0 for both the frontend and the backend. + * - All related directory entries and our own records must + * be synchronized. + * + * In the latter case a directory containing numerous directory + * entries may not be able to sync those entries due to topological + * recursion. If this is the case those records would not have + * been marked for flush action and ip->rec_tree will not be empty. + */ + if (ip->sync_ino_rec.ino_nlinks == 0 && + ip->ino_rec.ino_nlinks == 0 && + TAILQ_FIRST(&ip->target_list) == NULL && + RB_ROOT(&ip->rec_tree) == NULL && + (ip->flags & HAMMER_INODE_GONE) == 0) { + /* + * Handle the case where the inode has been completely deleted + * and is no longer referenceable from the filesystem + * namespace. + * + * NOTE: We do not set the RDIRTY flag when updating the + * delete_tid, setting HAMMER_INODE_DELETED takes care of it. 
+ */ + kprintf("Y"); + + ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED; + ip->flags &= ~HAMMER_INODE_TRUNCATED; + ip->sync_flags &= ~HAMMER_INODE_TRUNCATED; + if (ip->vp) + vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE); + + /* + * Set delete_tid in both the frontend and backend + * copy of the inode record. + */ + ip->ino_rec.base.base.delete_tid = trans.tid; + ip->sync_ino_rec.base.base.delete_tid = trans.tid; + + /* + * Indicate that the inode has/is-being deleted. + */ + ip->flags |= HAMMER_NODE_DELETED; + hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY); + hammer_modify_volume(&trans, trans.rootvol, NULL, 0); + --ip->hmp->rootvol->ondisk->vol0_stat_inodes; + hammer_modify_volume_done(trans.rootvol); + } /* * Flush any queued BIOs. */ while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) { + KKASSERT((ip->flags & HAMMER_INODE_DELETED) == 0); TAILQ_REMOVE(&ip->bio_list, bio, bio_act); #if 0 kprintf("dowrite %016llx ip %p bio %p @ %016llx\n", trans.tid, ip, bio, bio->bio_offset); @@ -1106,18 +1422,11 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) ip->sync_flags &= ~HAMMER_INODE_BUFS; /* - * Now sync related records. + * We better have nothing left if the inode has been deleted. If it + * hasn't the frontend may have queued more stuff, which would be ok. */ - for (;;) { - tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, - hammer_sync_record_callback, &trans); - KKASSERT(error <= 0); - if (tmp_error < 0) - tmp_error = -error; - if (tmp_error) - error = tmp_error; - break; - } + KKASSERT((ip->flags & HAMMER_INODE_DELETED) == 0 || + RB_ROOT(&ip->rec_tree) == NULL); /* * XDIRTY represents rec_tree and bio_list. However, rec_tree may @@ -1156,7 +1465,6 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) KKASSERT(record->lock.refs == 1); record->flags |= HAMMER_RECF_DELETED_FE; record->flags |= HAMMER_RECF_DELETED_BE; - hammer_cleardep_mem_record(record); hammer_rel_mem_record(record); } break; @@ -1211,3 +1519,46 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete) return(error); } +/* + * This routine is called when the OS is no longer actively referencing + * the inode (but might still be keeping it cached), or when releasing + * the last reference to an inode. + * + * At this point if the inode's nlinks count is zero we want to destroy + * it, which may mean destroying it on-media too. + */ +static int +hammer_inode_unloadable_check(hammer_inode_t ip) +{ + /* + * If the inode is on-media and the link count is 0 we MUST delete + * it on-media. + */ + if (ip->ino_rec.ino_nlinks == 0 && + (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) == + HAMMER_INODE_ONDISK) { + ip->flags |= HAMMER_INODE_DELETING; + } else { + ip->flags &= ~HAMMER_INODE_DELETING; + } + + /* + * If only one ref remains and the inode is not dirty, telling + * the caller that he can dispose of the inode. + */ + if (ip->lock.refs == 1 && (ip->flags & HAMMER_INODE_MODMASK) == 0) + return(1); + return(0); +} + +void +hammer_test_inode(hammer_inode_t ip) +{ + if (ip->flags & HAMMER_INODE_REFLUSH) { + ip->flags &= ~HAMMER_INODE_REFLUSH; + hammer_ref(&ip->lock); + hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); + hammer_rel_inode(ip, 0); + } +} + diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 08b352cb00..a2fb4af76e 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.47 2008/04/27 21:07:15 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.48 2008/05/02 01:00:42 dillon Exp $ */ #include "hammer.h" @@ -166,10 +166,9 @@ hammer_alloc_mem_record(hammer_inode_t ip) ++hammer_count_records; record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO); - record->state = HAMMER_FST_IDLE; + record->flush_state = HAMMER_FST_IDLE; record->ip = ip; record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD; - TAILQ_INIT(&record->depend_list); hammer_ref(&record->lock); return (record); } @@ -177,7 +176,7 @@ hammer_alloc_mem_record(hammer_inode_t ip) void hammer_wait_mem_record(hammer_record_t record) { - while (record->state == HAMMER_FST_FLUSH) { + while (record->flush_state == HAMMER_FST_FLUSH) { record->flags |= HAMMER_RECF_WANTED; tsleep(record, 0, "hmrrc2", 0); } @@ -194,7 +193,10 @@ hammer_wait_mem_record(hammer_record_t record) void hammer_flush_record_done(hammer_record_t record, int error) { - KKASSERT(record->state == HAMMER_FST_FLUSH); + hammer_inode_t target_ip; + int cleanup = 0; + + KKASSERT(record->flush_state == HAMMER_FST_FLUSH); KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); if (error) { @@ -202,7 +204,8 @@ hammer_flush_record_done(hammer_record_t record, int error) * An error occured, the backend was unable to sync the * record to its media. Leave the record intact. */ - } else if (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK) { + Debugger("flush_record_done error"); + } else if (record->flags & HAMMER_RECF_CONVERT_DELETE) { /* * deleted-record to delete-on-disk conversion, occurs when * we sync a record to disk which is marked deleted by the @@ -211,9 +214,9 @@ hammer_flush_record_done(hammer_record_t record, int error) */ if (record->flags & HAMMER_RECF_DELETED_BE) { record->flags |= HAMMER_RECF_DELETED_FE; - hammer_cleardep_mem_record(record); + cleanup = 1; } else { - KKASSERT(record->flags & HAMMER_RECF_DELETE_ONDISK); + KKASSERT(record->type == HAMMER_MEM_RECORD_DEL); } } else { /* @@ -221,12 +224,26 @@ hammer_flush_record_done(hammer_record_t record, int error) * having been synchronized to the media). */ record->flags |= HAMMER_RECF_DELETED_FE; - hammer_cleardep_mem_record(record); + record->flags |= HAMMER_RECF_DELETED_BE; + cleanup = 1; } - record->state = HAMMER_FST_IDLE; + if (cleanup) { + if ((target_ip = record->target_ip) != NULL) { + TAILQ_REMOVE(&target_ip->target_list, record, + target_entry); + record->target_ip = NULL; + hammer_test_inode(target_ip); + } + record->flush_state = HAMMER_FST_IDLE; + } else { + if (record->target_ip) + record->flush_state = HAMMER_FST_SETUP; + else + record->flush_state = HAMMER_FST_IDLE; + } + record->flags &= ~HAMMER_RECF_INTERLOCK_BE; - record->flags &= ~HAMMER_RECF_CONVERT_DELETE_ONDISK; - hammer_unlock(&record->lock); + record->flags &= ~HAMMER_RECF_CONVERT_DELETE; if (record->flags & HAMMER_RECF_WANTED) { record->flags &= ~HAMMER_RECF_WANTED; wakeup(record); @@ -234,28 +251,6 @@ hammer_flush_record_done(hammer_record_t record, int error) hammer_rel_mem_record(record); } -/* - * Clear dependancies associated with a memory record. 
- */ -void -hammer_cleardep_mem_record(struct hammer_record *record) -{ - hammer_depend_t depend; - - while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) { - TAILQ_REMOVE(&record->depend_list, depend, - rec_entry); - TAILQ_REMOVE(&depend->ip->depend_list, depend, - ip_entry); - --depend->ip->depend_count; - /* NOTE: inode is not flushed */ - hammer_rel_inode(depend->ip, 0); - hammer_unref(&record->lock); - KKASSERT(record->lock.refs > 0); - kfree(depend, M_HAMMER); - } -} - /* * Release a memory record. Records marked for deletion are immediately * removed from the RB-Tree but otherwise left intact until the last ref @@ -264,12 +259,21 @@ hammer_cleardep_mem_record(struct hammer_record *record) void hammer_rel_mem_record(struct hammer_record *record) { + hammer_inode_t ip, target_ip; + hammer_unref(&record->lock); if (record->flags & HAMMER_RECF_DELETED_FE) { if (record->lock.refs == 0) { - KKASSERT(record->state == HAMMER_FST_IDLE); - KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); + KKASSERT(record->flush_state != HAMMER_FST_FLUSH); + + ip = record->ip; + if ((target_ip = record->target_ip) != NULL) { + TAILQ_REMOVE(&target_ip->target_list, + record, target_entry); + record->target_ip = NULL; + hammer_test_inode(target_ip); + } if (record->flags & HAMMER_RECF_ONRBTREE) { RB_REMOVE(hammer_rec_rb_tree, @@ -360,7 +364,7 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data) #warning "This deadlocks" #if 0 - if (rec->state == HAMMER_FST_FLUSH) + if (rec->flush_state == HAMMER_FST_FLUSH) hammer_wait_mem_record(rec); #endif @@ -509,17 +513,16 @@ hammer_ip_add_directory(struct hammer_transaction *trans, struct hammer_inode *ip) { hammer_record_t record; - hammer_depend_t depend; int error; int bytes; record = hammer_alloc_mem_record(dip); - depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO); bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */ if (++trans->hmp->namekey_iterator == 0) ++trans->hmp->namekey_iterator; + record->type = HAMMER_MEM_RECORD_ADD; record->rec.entry.base.base.obj_id = dip->obj_id; record->rec.entry.base.base.key = hammer_directory_namekey(ncp->nc_name, bytes); @@ -531,20 +534,24 @@ hammer_ip_add_directory(struct hammer_transaction *trans, record->rec.entry.base.data_len = bytes; ++ip->ino_rec.ino_nlinks; hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); - /* NOTE: copies record->data */ /* - * If the inode gets synced cause the directory entry - * to be synced as well, or vise-versa. + * The target inode and the directory entry are bound together. + */ + record->target_ip = ip; + record->flush_state = HAMMER_FST_SETUP; + TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry); + + /* + * The inode now has a dependancy and must be taken out of the idle + * state. An inode not in an idle state is given an extra reference. 
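
With hammer_depend gone, a directory-entry record now carries the dependency itself: record->target_ip points at the inode the entry names, the record is queued on that inode's target_list, and both the record and (if idle) the inode move to HAMMER_FST_SETUP, the inode picking up an extra reference. The sketch below is a simplified user-space model of that lifecycle, including the hammer_flush_record_done() disposition; the structures and the single-inode setup are assumptions for illustration, and the CONVERT_DELETE path is omitted. Only the IDLE/SETUP/FLUSH transitions follow the patch.

/*
 * User-space model of the record <-> target-inode dependency lifecycle.
 */
#include <stdio.h>
#include <sys/queue.h>

enum fst { FST_IDLE, FST_SETUP, FST_FLUSH };

TAILQ_HEAD(rec_list, toy_record);

struct toy_inode {
	enum fst	 flush_state;
	int		 refs;
	struct rec_list	 target_list;	/* records that point at this inode */
};

struct toy_record {
	enum fst	  flush_state;
	struct toy_inode *target_ip;
	TAILQ_ENTRY(toy_record) target_entry;
};

/* Like hammer_ip_add_directory(): bind a new entry to its target inode. */
static void
bind_record(struct toy_record *rec, struct toy_inode *ip)
{
	rec->target_ip = ip;
	rec->flush_state = FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, rec, target_entry);
	if (ip->flush_state == FST_IDLE) {
		++ip->refs;			/* extra ref while not idle */
		ip->flush_state = FST_SETUP;
	}
}

/*
 * Like hammer_flush_record_done(): on success the dependency is severed
 * and the record returns to IDLE; on failure it drops back to SETUP
 * (or IDLE if it never had a target).
 */
static void
flush_done(struct toy_record *rec, int error)
{
	if (error == 0 && rec->target_ip != NULL) {
		TAILQ_REMOVE(&rec->target_ip->target_list, rec, target_entry);
		rec->target_ip = NULL;
		rec->flush_state = FST_IDLE;
	} else {
		rec->flush_state = rec->target_ip ? FST_SETUP : FST_IDLE;
	}
}

int
main(void)
{
	struct toy_inode ip = { FST_IDLE, 1 };
	struct toy_record rec = { FST_IDLE, NULL };

	TAILQ_INIT(&ip.target_list);
	bind_record(&rec, &ip);
	rec.flush_state = FST_FLUSH;		/* flusher picks it up */
	flush_done(&rec, 0);
	printf("record state=%d target_list empty=%d\n",
	       rec.flush_state, TAILQ_EMPTY(&ip.target_list));
	return (0);
}
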
*/ - hammer_ref(&record->lock); /* for depend entry */ - hammer_ref(&ip->lock); /* for depend entry */ - depend->ip = ip; - depend->record = record; - TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry); - TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry); - ++ip->depend_count; + if (ip->flush_state == HAMMER_FST_IDLE) { + hammer_ref(&ip->lock); + ip->flush_state = HAMMER_FST_SETUP; + } + /* NOTE: copies record->data */ error = hammer_mem_add(trans, record); return(error); } @@ -565,7 +572,6 @@ hammer_ip_del_directory(struct hammer_transaction *trans, struct hammer_inode *ip) { hammer_record_t record; - hammer_depend_t depend; int error; if (cursor->record == &cursor->iprec->rec) { @@ -585,8 +591,8 @@ hammer_ip_del_directory(struct hammer_transaction *trans, cursor->deadlk_rec = record; error = EDEADLK; } else { + KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); record->flags |= HAMMER_RECF_DELETED_FE; - hammer_cleardep_mem_record(record); error = 0; } } else { @@ -595,24 +601,24 @@ hammer_ip_del_directory(struct hammer_transaction *trans, * the record's key. This also causes lookups to skip the * record. */ - depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO); - record = hammer_alloc_mem_record(dip); + record->type = HAMMER_MEM_RECORD_DEL; record->rec.entry.base.base = cursor->record->base.base; hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); - record->flags |= HAMMER_RECF_DELETE_ONDISK; + + record->target_ip = ip; + record->flush_state = HAMMER_FST_SETUP; + TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry); /* - * If the inode gets synced cause the directory entry - * to be synced as well, or vise-versa. + * The inode now has a dependancy and must be taken out of + * the idle state. An inode not in an idle state is given + * an extra reference. */ - hammer_ref(&ip->lock); /* for depend entry */ - hammer_ref(&record->lock); /* for depend entry */ - depend->ip = ip; - depend->record = record; - TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry); - TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry); - ++ip->depend_count; + if (ip->flush_state == HAMMER_FST_IDLE) { + hammer_ref(&ip->lock); + ip->flush_state = HAMMER_FST_SETUP; + } error = hammer_mem_add(trans, record); } @@ -666,6 +672,7 @@ hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type; hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); + /* NOTE: copies record->data */ error = hammer_mem_add(trans, record); return(error); @@ -788,8 +795,6 @@ done: * Sync an in-memory record to the disk. This is called by the backend. * This code is responsible for actually writing a record out to the disk. * - * Any inode dependancies will queue the inode to the backend. - * * This routine can only be called by the backend and the record * must have been interlocked with BE. It will remain interlocked on * return. The caller is responsible for the record's disposition. @@ -800,35 +805,13 @@ hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record) struct hammer_cursor cursor; hammer_record_ondisk_t rec; union hammer_btree_elm elm; - hammer_depend_t depend; hammer_off_t rec_offset; void *bdata; int error; - KKASSERT(record->state == HAMMER_FST_FLUSH); + KKASSERT(record->flush_state == HAMMER_FST_FLUSH); KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); - /* - * XXX A record with a dependancy is typically a directory record. - * The related inode must also be synchronized. 
This code is not - * currently synchronizing the inode atomically. XXX - * - * XXX Additional dependancies from the frontend might be added while - * the backend is syncing the record? - */ - while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) { - TAILQ_REMOVE(&record->depend_list, depend, rec_entry); - TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry); - --depend->ip->depend_count; - kprintf("S"); - KKASSERT((depend->ip->flags & HAMMER_INODE_NEW) == 0); - hammer_flush_inode(depend->ip, 0); - hammer_rel_inode(depend->ip, 0); - hammer_unref(&record->lock); - KKASSERT(record->lock.refs > 0); - kfree(depend, M_HAMMER); - } - retry: /* * Get a cursor, we will either be inserting or deleting. @@ -842,7 +825,7 @@ retry: /* * If we are deleting an exact match must be found on-disk. */ - if (record->flags & HAMMER_RECF_DELETE_ONDISK) { + if (record->type == HAMMER_MEM_RECORD_DEL) { error = hammer_btree_lookup(&cursor); if (error == 0) error = hammer_ip_delete_record(&cursor, trans->tid); @@ -953,12 +936,19 @@ retry: * still sync the record to the media as if it were not deleted, * but must interlock with the frontend to ensure that the * synchronized record is not visible to the frontend, which means - * converted the 'deleted' record to a delete-on-disk record. + * converting it from an ADD record to a DEL record. + * + * The DEL record then masks the record synced to disk until another + * round can delete it for real. */ - if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK)) { - KKASSERT((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0); - record->flags |= HAMMER_RECF_DELETE_ONDISK; + if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) { + KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); record->flags &= ~HAMMER_RECF_DELETED_FE; + record->type = HAMMER_MEM_RECORD_DEL; + if (record->flush_state == HAMMER_FST_SETUP) { + hammer_test_inode(record->ip); + hammer_test_inode(record->target_ip); + } } /* @@ -1035,7 +1025,6 @@ hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record) while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) { if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){ record->flags |= HAMMER_RECF_DELETED_FE; - KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL); hammer_rel_mem_record(record); return (EEXIST); } @@ -1268,17 +1257,26 @@ next_memory: } /* - * If the entries match the memory entry must specify - * an on-disk deletion. Eat both entries unless the - * caller wants visibility into the special records. + * If the entries match exactly the memory entry typically + * specifies an on-disk deletion and we eat both entries. + * + * If the in-memory record is not an on-disk deletion we + * probably caught the syncer while it was syncing it to + * the media. Since we hold a shared lock on the cursor, + * the in-memory record had better be marked deleted at + * this point. 
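
The merged iteration above leans on the new record types: when an on-disk entry and an in-memory entry compare equal, a HAMMER_MEM_RECORD_DEL record masks the on-disk entry (both are skipped unless delete visibility was requested), while a matching non-DEL record means the flusher is busy writing it out, so the memory copy is skipped and the on-disk copy is returned. Below is a stand-alone sketch of that merge rule applied to two sorted arrays; the data layout is an illustrative assumption, and only the handling of the r == 0 case follows the patch.

/*
 * Toy merge of a sorted on-disk key list with sorted in-memory records,
 * where a DEL record with a matching key hides the on-disk entry.
 */
#include <stdio.h>

enum rec_type { REC_ADD, REC_DEL };

struct mem_rec {
	long		key;
	enum rec_type	type;
};

static void
iterate(const long *disk, int ndisk, const struct mem_rec *mem, int nmem)
{
	int di = 0, mi = 0;

	while (di < ndisk || mi < nmem) {
		if (di >= ndisk) {			/* memory only */
			printf("mem  %ld\n", mem[mi++].key);
		} else if (mi >= nmem) {		/* disk only */
			printf("disk %ld\n", disk[di++]);
		} else if (disk[di] < mem[mi].key) {
			printf("disk %ld\n", disk[di++]);
		} else if (disk[di] > mem[mi].key) {
			printf("mem  %ld\n", mem[mi++].key);
		} else if (mem[mi].type == REC_DEL) {
			/*
			 * Matching DEL record masks the on-disk entry:
			 * eat both, emit neither (no delete visibility).
			 */
			++di;
			++mi;
		} else {
			/*
			 * Matching ADD record: the syncer owns it; skip
			 * the memory copy, the disk copy is returned on
			 * a later pass through the loop.
			 */
			++mi;
		}
	}
}

int
main(void)
{
	long disk[] = { 10, 20, 30 };
	struct mem_rec mem[] = {
		{ 15, REC_ADD },	/* not yet on disk */
		{ 20, REC_DEL },	/* masks on-disk key 20 */
	};

	iterate(disk, 3, mem, 2);	/* expect: disk 10, mem 15, disk 30 */
	return (0);
}
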
*/ if (r == 0) { - KKASSERT(cursor->iprec->flags & - HAMMER_RECF_DELETE_ONDISK); - if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { - cursor->flags |= HAMMER_CURSOR_ATEDISK; + if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) { + if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { + cursor->flags |= HAMMER_CURSOR_ATEDISK; + cursor->flags |= HAMMER_CURSOR_ATEMEM; + goto next_btree; + } + } else { + KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0); cursor->flags |= HAMMER_CURSOR_ATEMEM; - goto next_btree; + goto next_memory; } } /* fall through to the memory entry */ @@ -1290,7 +1288,7 @@ next_memory: */ cursor->record = &cursor->iprec->rec; cursor->flags |= HAMMER_CURSOR_ATEMEM; - if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) { + if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) { if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) goto next_memory; } @@ -1487,8 +1485,9 @@ retry: } /* - * Delete all records associated with an inode except the inode record - * itself. + * Delete all user records associated with an inode except the inode record + * itself. Directory entries are not deleted (they must be properly disposed + * of or nlinks would get upset). */ int hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) @@ -1535,8 +1534,12 @@ retry: * data if the retention policy dictates. The function * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next() * uses to perform a fixup. + * + * Directory entries (and delete-on-disk directory entries) + * must be synced and cannot be deleted. */ - error = hammer_ip_delete_record(&cursor, trans->tid); + if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) + error = hammer_ip_delete_record(&cursor, trans->tid); if (error) break; error = hammer_ip_next(&cursor); @@ -1574,14 +1577,11 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) * only occurs in range iterations since all other records are * individually synchronized. Thus there should be no confusion with * the interlock. - * - * */ if (cursor->record == &cursor->iprec->rec) { KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); cursor->iprec->flags |= HAMMER_RECF_DELETED_FE; cursor->iprec->flags |= HAMMER_RECF_DELETED_BE; - hammer_cleardep_mem_record(cursor->iprec); return(0); } @@ -1690,15 +1690,37 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes) } /* - * Determine whether a directory is empty or not. Returns 0 if the directory - * is empty, ENOTEMPTY if it isn't, plus other possible errors. + * Determine whether we can remove a directory. This routine checks whether + * a directory is empty or not and enforces flush connectivity. + * + * Flush connectivity requires that we block if the target directory is + * currently flushing, otherwise it may not end up in the same flush group. + * + * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure. 
*/ int -hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) +hammer_ip_check_directory_empty(hammer_transaction_t trans, + hammer_cursor_t parent_cursor, hammer_inode_t ip) { struct hammer_cursor cursor; int error; +#if 0 + /* + * Check flush connectivity + */ + if (ip->flush_state != HAMMER_FST_IDLE) { + kprintf("FWAIT\n"); + hammer_done_cursor(parent_cursor); + hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL); + hammer_wait_inode(ip); + return (EDEADLK); + } +#endif + + /* + * Check directory empty + */ hammer_init_cursor(trans, &cursor, &ip->cache[0]); cursor.key_beg.obj_id = ip->obj_id; diff --git a/sys/vfs/hammer/hammer_undo.c b/sys/vfs/hammer/hammer_undo.c index 012e599922..5541c20852 100644 --- a/sys/vfs/hammer/hammer_undo.c +++ b/sys/vfs/hammer/hammer_undo.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.7 2008/04/29 01:10:37 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.8 2008/05/02 01:00:42 dillon Exp $ */ /* @@ -97,6 +97,8 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io, /* no undo recursion */ hammer_modify_volume(NULL, root_volume, NULL, 0); + kprintf("u"); + again: /* * Allocate space in the FIFO @@ -104,6 +106,8 @@ again: bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) + sizeof(struct hammer_fifo_undo) + sizeof(struct hammer_fifo_tail); + if (hammer_undo_space(trans->hmp) < bytes + HAMMER_BUFSIZE*2) + panic("hammer: insufficient undo FIFO space!"); next_offset = undomap->next_offset; @@ -188,3 +192,34 @@ again: return(error); } +int64_t +hammer_undo_space(hammer_mount_t hmp) +{ + hammer_blockmap_t rootmap; + int64_t bytes; + int64_t max_bytes; + + rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX]; + + if (rootmap->first_offset <= rootmap->next_offset) { + bytes = (int)(rootmap->next_offset - rootmap->first_offset); + } else { + bytes = (int)(rootmap->alloc_offset - rootmap->first_offset + + rootmap->next_offset); + } + max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK); + return(max_bytes - bytes); +} + +int64_t +hammer_undo_max(hammer_mount_t hmp) +{ + hammer_blockmap_t rootmap; + int64_t max_bytes; + + rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX]; + max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK); + + return(max_bytes); +} + diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index 50f16f6302..351a863a78 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.30 2008/04/29 01:10:37 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.31 2008/05/02 01:00:42 dillon Exp $ */ #include @@ -61,6 +61,7 @@ int hammer_count_buffers; int hammer_count_nodes; int hammer_count_dirtybufs; /* global */ int hammer_limit_dirtybufs = 100; /* per-mount */ +int hammer_bio_count; int64_t hammer_contention_count; int64_t hammer_zone_limit; diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index ad311a6352..bc48c36713 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
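
The new hammer_undo_space() treats the UNDO zone as a ring: when first_offset <= next_offset the bytes in use are simply the gap between the two, otherwise the FIFO has wrapped and the used space is the tail of the zone (alloc_offset - first_offset) plus the head up to next_offset; free space is the zone size minus that. The real code additionally masks the zone bits out of the offsets with HAMMER_OFF_SHORT_MASK. Below is a user-space rendition of the same arithmetic kept entirely in 64 bits; the offsets in main() are made-up values, not real HAMMER offsets.

/*
 * Ring-buffer arithmetic behind hammer_undo_space(), modeled in user space.
 * alloc_offset is the size of the UNDO zone, first_offset the oldest live
 * undo record, next_offset where the next record will be appended.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static int64_t
undo_free_space(int64_t first_offset, int64_t next_offset,
		int64_t alloc_offset)
{
	int64_t used;

	if (first_offset <= next_offset) {
		/* Not wrapped: live records sit between first and next. */
		used = next_offset - first_offset;
	} else {
		/* Wrapped: tail of the zone plus the head up to next. */
		used = (alloc_offset - first_offset) + next_offset;
	}
	return (alloc_offset - used);
}

int
main(void)
{
	/* 64MB zone that has wrapped: first at 60MB, next at 8MB. */
	printf("free=%" PRId64 "\n",
	       undo_free_space(60 << 20, 8 << 20, 64 << 20));
	return (0);
}

With the wrapped example above, 4MB of tail plus 8MB of head are in use, leaving 52MB free, which is the figure hammer_generate_undo() now checks against before panicking on FIFO exhaustion.
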
* - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.42 2008/04/27 21:07:15 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.43 2008/05/02 01:00:42 dillon Exp $ */ #include @@ -503,7 +503,6 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) * bump the inode's link count. */ error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip); - hammer_finalize_inode(&trans, nip, error); if (error) kprintf("hammer_ip_add_directory error %d\n", error); hammer_unlock(&dip->lock); @@ -877,7 +876,6 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) hammer_lock_sh(&nip->lock); hammer_lock_sh(&dip->lock); error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip); - hammer_finalize_inode(&trans, nip, error); hammer_unlock(&dip->lock); hammer_unlock(&nip->lock); if (error) @@ -946,7 +944,6 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) hammer_lock_sh(&nip->lock); hammer_lock_sh(&dip->lock); error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip); - hammer_finalize_inode(&trans, nip, error); hammer_unlock(&dip->lock); hammer_unlock(&nip->lock); @@ -1584,7 +1581,6 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) */ hammer_lock_sh(&nip->lock); hammer_lock_sh(&dip->lock); - error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip); /* * Add a record representing the symlink. symlink stores the link @@ -1609,7 +1605,8 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY); } } - hammer_finalize_inode(&trans, nip, error); + if (error == 0) + error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip); hammer_unlock(&dip->lock); hammer_unlock(&nip->lock); @@ -1903,13 +1900,13 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) * records in the database. */ BUF_KERNPROC(bp); - if (ip->flush_state == HAMMER_FST_FLUSH) + if (ip->flags & HAMMER_INODE_WRITE_ALT) TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act); else TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act); - hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY); - hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); - kprintf("a"); + ++hammer_bio_count; + hammer_modify_inode(NULL, ip, HAMMER_INODE_BUFS); + hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL); return(0); } @@ -1980,6 +1977,7 @@ hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio) bp->b_resid = 0; } biodone(bio); + --hammer_bio_count; return(error); } @@ -2063,13 +2061,26 @@ retry: kprintf("obj_id %016llx\n", rec->entry.obj_id); Debugger("ENOENT unlinking object that should exist"); } + + /* + * If we are trying to remove a directory the directory must + * be empty. + * + * WARNING: hammer_ip_check_directory_empty() may have to + * terminate the cursor to avoid a deadlock. It is ok to + * call hammer_done_cursor() twice. + */ if (error == 0 && ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) { - error = hammer_ip_check_directory_empty(trans, ip); + error = hammer_ip_check_directory_empty(trans, &cursor, + ip); } + /* + * Delete the directory entry. + * * WARNING: hammer_ip_del_directory() may have to terminate - * the cursor to avoid a lock recursion. It's ok to call + * the cursor to avoid a deadlock. It is ok to call * hammer_done_cursor() twice. */ if (error == 0) { -- 2.41.0
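
The strategy-write change above hands dirty buffers to the backend through two queues: ordinarily a write bio is appended to bio_list, but while HAMMER_INODE_WRITE_ALT is set (the flusher owns the current list) new bios are diverted to bio_alt_list so they join a later flush group instead of racing the one in progress. The sketch below is a minimal single-threaded model of that hand-off; the promotion of the alternate list at the end of a flush is an assumption about the flusher side based on the list's purpose, and the queue contents are illustrative.

/*
 * Toy model of the bio_list / bio_alt_list hand-off: writes issued while a
 * flush owns the primary list are parked on the alternate list and become
 * the primary list of the next flush group.
 */
#include <stdio.h>
#include <sys/queue.h>

struct toy_bio {
	int			offset;
	TAILQ_ENTRY(toy_bio)	act;
};
TAILQ_HEAD(bio_q, toy_bio);

struct toy_inode {
	int		write_alt;	/* HAMMER_INODE_WRITE_ALT analogue */
	struct bio_q	bio_list;	/* owned by the current flush */
	struct bio_q	bio_alt_list;	/* accumulates the next group */
};

/* Strategy-write path: queue to whichever list the frontend may touch. */
static void
queue_write(struct toy_inode *ip, struct toy_bio *bio)
{
	if (ip->write_alt)
		TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, act);
	else
		TAILQ_INSERT_TAIL(&ip->bio_list, bio, act);
}

/* Backend flush: drain the primary list, then promote the alternate list. */
static void
flush(struct toy_inode *ip)
{
	struct toy_bio *bio;

	ip->write_alt = 1;		/* divert writes issued mid-flush */
	while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
		TAILQ_REMOVE(&ip->bio_list, bio, act);
		printf("flushed offset %d\n", bio->offset);
	}
	while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
		TAILQ_REMOVE(&ip->bio_alt_list, bio, act);
		TAILQ_INSERT_TAIL(&ip->bio_list, bio, act);
	}
	ip->write_alt = 0;
}

int
main(void)
{
	struct toy_inode ip = { 0 };
	struct toy_bio a = { 100 }, b = { 200 };

	TAILQ_INIT(&ip.bio_list);
	TAILQ_INIT(&ip.bio_alt_list);
	queue_write(&ip, &a);
	flush(&ip);			/* flushes a */
	queue_write(&ip, &b);
	flush(&ip);			/* flushes b */
	return (0);
}
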