HAMMER 40A/Many: Inode/link-count sequencer.
author Matthew Dillon <dillon@dragonflybsd.org>
Fri, 2 May 2008 01:00:42 +0000 (01:00 +0000)
committer Matthew Dillon <dillon@dragonflybsd.org>
Fri, 2 May 2008 01:00:42 +0000 (01:00 +0000)
* Remove the hammer_depend structure and build the dependencies directly
  into the hammer_record structure.

* Attempt to implement layout rules to ensure connectivity is maintained.
  This means, for example, that before HAMMER can flush a newly created
  file it will make sure the file has namespace connectivity to the
  directory it was created in, recursively up to the root (see the sketch
  below).

NOTE: 40A destabilizes the filesystem a bit; it is going to take a few
passes to get everything working properly.  There are numerous issues
with this commit.
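
The connectivity rule can be pictured with a small standalone sketch.  The
names below are hypothetical and the code is not part of this commit; the
real logic lives in hammer_inode.c (hammer_flush_inode(),
hammer_setup_parent_inodes() and hammer_setup_child_callback()) in the
diff below.  A record models a directory entry being added or deleted
(HAMMER_MEM_RECORD_ADD/DEL in the diff), and an inode may only enter a
flush group if walking upward through the records that target it finds
ADD connectivity toward the root:

    /* Standalone C sketch of the connectivity walk (hypothetical names). */
    enum sk_rec_type { SK_REC_ADD, SK_REC_DEL };

    struct sk_inode;

    struct sk_record {
        struct sk_inode  *dir;          /* directory holding the entry (record->ip) */
        struct sk_inode  *target;       /* inode the entry names (record->target_ip) */
        enum sk_rec_type  type;
        struct sk_record *next_target;  /* next record targeting the same inode */
    };

    struct sk_inode {
        struct sk_record *target_list;  /* records whose target is this inode */
        int               on_disk;      /* already has connectivity on the media */
    };

    static int sk_setup_parent(struct sk_record *rec);

    /*
     * Aggregate connectivity over all records targeting ip:
     * -1 = blocked, 0 = no dependencies, 1 = connected.
     * The inode may be flushed when the result is >= 0.
     */
    static int
    sk_check_inode(struct sk_inode *ip)
    {
        struct sk_record *rec;
        int good = 0;
        int r;

        for (rec = ip->target_list; rec; rec = rec->next_target) {
            r = sk_setup_parent(rec);
            if (r < 0 && good == 0)
                good = -1;
            if (r > 0)
                good = 1;
        }
        return (good);
    }

    /*
     * 1 = this record provides connectivity (a directory-entry ADD whose
     *     parent directory is itself connected or already on disk),
     * 0 = not relevant (a DEL does not contribute connectivity),
     * -1 = the parent directory itself cannot be connected.
     */
    static int
    sk_setup_parent(struct sk_record *rec)
    {
        if (rec->dir->on_disk == 0 && sk_check_inode(rec->dir) < 0)
            return (-1);
        return (rec->type == SK_REC_ADD ? 1 : 0);
    }

The real code is more involved: flush-group membership, the DEL case, and
inodes already in the FLUSH state all factor into the decision, and a
failed walk sets HAMMER_INODE_REFLUSH so the flush is retried later.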

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

index a0a8833..1eb5fcf 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
  * 
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.54 2008/04/29 01:10:37 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.55 2008/05/02 01:00:42 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -135,16 +135,15 @@ hammer_lock_excl_owned(struct hammer_lock *lock, thread_t td)
 }
 
 /*
- * inode->inode dependancy
+ * Flush state, used by various structures
  */
-typedef struct hammer_depend {
-       TAILQ_ENTRY(hammer_depend) ip_entry;
-       TAILQ_ENTRY(hammer_depend) rec_entry;
-       struct hammer_inode *ip;
-       struct hammer_record *record;
-} *hammer_depend_t;
+typedef enum hammer_inode_state {
+       HAMMER_FST_IDLE,
+       HAMMER_FST_SETUP,
+       HAMMER_FST_FLUSH
+} hammer_inode_state_t;
 
-TAILQ_HEAD(hammer_depend_list, hammer_depend);
+TAILQ_HEAD(hammer_record_list, hammer_record);
 
 /*
  * Cache object ids.  A fixed number of objid cache structures are
@@ -194,24 +193,18 @@ RB_PROTOTYPEX(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
 
 TAILQ_HEAD(hammer_node_list, hammer_node);
 
-typedef enum hammer_inode_state {
-       HAMMER_FST_IDLE,
-       HAMMER_FST_SETUP,
-       HAMMER_FST_FLUSH
-} hammer_inode_state_t;
-
 struct hammer_inode {
-       RB_ENTRY(hammer_inode) rb_node;
-       hammer_inode_state_t flush_state;
+       RB_ENTRY(hammer_inode)  rb_node;
+       hammer_inode_state_t    flush_state;
+       int                     flush_group;
        TAILQ_ENTRY(hammer_inode) flush_entry;
-       struct hammer_depend_list depend_list;
+       struct hammer_record_list target_list;  /* target of dependant recs */
        u_int64_t               obj_id;         /* (key) object identifier */
        hammer_tid_t            obj_asof;       /* (key) snapshot or 0 */
        struct hammer_mount     *hmp;
        hammer_objid_cache_t    objid_cache;
        int                     flags;
        int                     error;          /* flush error */
-       int                     depend_count;
        int                     cursor_ip_refs; /* sanity */
        struct vnode            *vp;
        struct lockf            advlock;
@@ -243,7 +236,7 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_DDIRTY    0x0001  /* in-memory ino_data is dirty */
 #define HAMMER_INODE_RDIRTY    0x0002  /* in-memory ino_rec is dirty */
 #define HAMMER_INODE_ITIMES    0x0004  /* in-memory mtime/atime modified */
-#define HAMMER_INODE_XDIRTY    0x0008  /* in-memory records/flsbufs present */
+#define HAMMER_INODE_XDIRTY    0x0008  /* in-memory records */
 #define HAMMER_INODE_ONDISK    0x0010  /* inode is on-disk (else not yet) */
 #define HAMMER_INODE_FLUSH     0x0020  /* flush on last ref */
 #define HAMMER_INODE_DELETED   0x0080  /* inode ready for deletion */
@@ -253,44 +246,60 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_DONDISK   0x0800  /* data records may be on disk */
 #define HAMMER_INODE_BUFS      0x1000  /* dirty high level bps present */
 #define HAMMER_INODE_REFLUSH   0x2000  /* pipelined flush during flush */
-#define HAMMER_INODE_UNUSED4000        0x4000
+#define HAMMER_INODE_WRITE_ALT 0x4000  /* strategy writes to alt bioq */
 #define HAMMER_INODE_FLUSHW    0x8000  /* Someone waiting for flush */
 
 #define HAMMER_INODE_TRUNCATED 0x00010000
-#define HAMMER_INODE_NEW       0x00020000
+#define HAMMER_INODE_DELETING  0x00020000 /* Destroy the inode on-disk */
 
 #define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|HAMMER_INODE_RDIRTY| \
                                 HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|   \
-                                HAMMER_INODE_ITIMES|HAMMER_INODE_TRUNCATED)
+                                HAMMER_INODE_ITIMES|HAMMER_INODE_TRUNCATED|\
+                                HAMMER_INODE_DELETING)
+
+#define HAMMER_INODE_MODMASK_NOXDIRTY \
+                               (HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
 
 #define HAMMER_MAX_INODE_CURSORS       4
 
 #define HAMMER_FLUSH_SIGNAL    0x0001
 #define HAMMER_FLUSH_FORCE     0x0002
-#define HAMMER_FLUSH_RELEASE   0x0004
+#define HAMMER_FLUSH_RECURSION 0x0004
 
 /*
- * Structure used to represent an unsynchronized record in-memory.  This
- * structure is orgranized in a per-inode RB-tree.  If the inode is not
+ * Structure used to represent an unsynchronized record in-memory.  These
+ * records typically represent directory entries.  Only non-historical
+ * records are kept in-memory.
+ *
+ * Records are organized as a per-inode RB-Tree.  If the inode is not
  * on disk then neither are any records and the in-memory record tree
  * represents the entire contents of the inode.  If the inode is on disk
  * then the on-disk B-Tree is scanned in parallel with the in-memory
  * RB-Tree to synthesize the current state of the file.
  *
- * Only current (delete_tid == 0) unsynchronized records are kept in-memory.
- *
- * blocked is the count of the number of cursors (ip_first/ip_next) blocked
- * on the record waiting for a synchronization to complete.
+ * Records are also used to enforce the ordering of directory create/delete
+ * operations.  A new inode will not be flushed to disk unless its related
+ * directory entry is also being flushed at the same time.  A directory entry
+ * will not be removed unless its related inode is also being removed at the
+ * same time.
  */
+typedef enum hammer_record_type {
+       HAMMER_MEM_RECORD_ADD,          /* positive memory cache record */
+       HAMMER_MEM_RECORD_DEL           /* negative delete-on-disk record */
+} hammer_record_type_t;
+
 struct hammer_record {
        RB_ENTRY(hammer_record)         rb_node;
-       hammer_inode_state_t            state;
+       TAILQ_ENTRY(hammer_record)      target_entry;
+       hammer_inode_state_t            flush_state;
+       int                             flush_group;
+       hammer_record_type_t            type;
        struct hammer_lock              lock;
        struct hammer_inode             *ip;
+       struct hammer_inode             *target_ip;
        union hammer_record_ondisk      rec;
        union hammer_data_ondisk        *data;
        int                             flags;
-       struct hammer_depend_list       depend_list;
 };
 
 typedef struct hammer_record *hammer_record_t;
@@ -306,8 +315,7 @@ typedef struct hammer_record *hammer_record_t;
 #define HAMMER_RECF_INBAND             0x0010
 #define HAMMER_RECF_INTERLOCK_BE       0x0020  /* backend interlock */
 #define HAMMER_RECF_WANTED             0x0040
-#define HAMMER_RECF_DELETE_ONDISK      0x0080
-#define HAMMER_RECF_CONVERT_DELETE_ONDISK 0x0100 /* special case */
+#define HAMMER_RECF_CONVERT_DELETE     0x0100 /* special case */
 
 /*
  * In-memory structures representing on-disk structures.
@@ -510,8 +518,10 @@ struct hammer_mount {
        int     ronly;
        int     nvolumes;
        int     volume_iterator;
-       int     flusher_seq;
-       int     flusher_act;
+       int     flusher_signal; /* flusher thread sequencer */
+       int     flusher_act;    /* currently active flush group */
+       int     flusher_done;   /* set to act when complete */
+       int     flusher_next;   /* next flush group */
        int     flusher_exiting;
        int     reclaim_count;
        thread_t flusher_td;
@@ -533,11 +543,9 @@ struct hammer_mount {
        struct netexport export;
        struct hammer_lock sync_lock;
        struct lock blockmap_lock;
-       hammer_inode_t  flusher_demark;
        struct hammer_blockmap  blockmap[HAMMER_MAX_ZONES];
        struct hammer_holes holes[HAMMER_MAX_ZONES];
        TAILQ_HEAD(, hammer_inode) flush_list;
-       TAILQ_HEAD(, hammer_inode) flush_alt_list;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
 };
 
@@ -571,6 +579,7 @@ extern int hammer_count_buffers;
 extern int hammer_count_nodes;
 extern int hammer_count_dirtybufs;
 extern int hammer_limit_dirtybufs;
+extern int hammer_bio_count;
 extern int64_t hammer_contention_count;
 
 int    hammer_vop_inactive(struct vop_inactive_args *);
@@ -596,7 +605,7 @@ int hammer_ip_resolve_data(hammer_cursor_t cursor);
 int    hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid);
 int    hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes);
 int    hammer_ip_check_directory_empty(hammer_transaction_t trans,
-                       hammer_inode_t ip);
+                       hammer_cursor_t parent_cursor, hammer_inode_t ip);
 int    hammer_sync_hmp(hammer_mount_t hmp, int waitfor);
 
 hammer_record_t
@@ -604,7 +613,6 @@ hammer_record_t
 void   hammer_flush_record_done(hammer_record_t record, int error);
 void   hammer_wait_mem_record(hammer_record_t record);
 void   hammer_rel_mem_record(hammer_record_t record);
-void   hammer_cleardep_mem_record(struct hammer_record *record);
 
 int    hammer_cursor_up(hammer_cursor_t cursor);
 int    hammer_cursor_down(hammer_cursor_t cursor);
@@ -731,6 +739,9 @@ hammer_off_t hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off,
                        int *errorp);
 hammer_off_t hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t bmap_off,
                        int *errorp);
+int64_t hammer_undo_space(hammer_mount_t hmp);
+int64_t hammer_undo_max(hammer_mount_t hmp);
+
 
 void hammer_start_transaction(struct hammer_transaction *trans,
                              struct hammer_mount *hmp);
@@ -749,10 +760,9 @@ void hammer_wait_inode(hammer_inode_t ip);
 int  hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap,
                        struct ucred *cred, struct hammer_inode *dip,
                        struct hammer_inode **ipp);
-void  hammer_finalize_inode(hammer_transaction_t trans, hammer_inode_t ip,
-                       int error);
 void hammer_rel_inode(hammer_inode_t ip, int flush);
-int hammer_sync_inode(hammer_inode_t ip, int handle_delete);
+int hammer_sync_inode(hammer_inode_t ip);
+void hammer_test_inode(hammer_inode_t ip);
 
 int  hammer_ip_add_directory(struct hammer_transaction *trans,
                        hammer_inode_t dip, struct namecache *ncp,
index 73205a4..42314ac 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.39 2008/04/26 19:08:14 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.40 2008/05/02 01:00:42 dillon Exp $
  */
 
 /*
@@ -655,7 +655,7 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags)
  * called.
  *
  * The caller may depend on the cursor's exclusive lock after return to
- * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE_ONDISK).
+ * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
  *
  * ENOSPC is returned if there is no room to insert a new record.
  */
index 3bd8eb4..3dac4cf 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.15 2008/04/24 21:20:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.16 2008/05/02 01:00:42 dillon Exp $
  */
 
 /*
@@ -120,7 +120,6 @@ typedef struct hammer_cursor *hammer_cursor_t;
 #define HAMMER_CURSOR_DELETE_VISIBILITY        0x0010  /* special del-on-disk recs */
 #define HAMMER_CURSOR_END_INCLUSIVE    0x0020  /* key_end is inclusive */
 #define HAMMER_CURSOR_END_EXCLUSIVE    0x0040  /* key_end is exclusive (def) */
-#define HAMMER_CURSOR_UNUSED0080       0x0080
 
 #define HAMMER_CURSOR_ATEDISK          0x0100
 #define HAMMER_CURSOR_ATEMEM           0x0200
index d2c698e..b777579 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.8 2008/04/29 04:43:08 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.9 2008/05/02 01:00:42 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -55,10 +55,13 @@ hammer_flusher_sync(hammer_mount_t hmp)
        int seq;
 
        if (hmp->flusher_td) {
-               seq = ++hmp->flusher_seq;
-               wakeup(&hmp->flusher_seq);
-               while ((int)(seq - hmp->flusher_act) > 0)
-                       tsleep(&hmp->flusher_act, 0, "hmrfls", 0);
+               seq = hmp->flusher_next;
+               if (hmp->flusher_signal == 0) {
+                       hmp->flusher_signal = 1;
+                       wakeup(&hmp->flusher_signal);
+               }
+               while ((int)(seq - hmp->flusher_done) > 0)
+                       tsleep(&hmp->flusher_done, 0, "hmrfls", 0);
        }
 }
 
@@ -66,14 +69,20 @@ void
 hammer_flusher_async(hammer_mount_t hmp)
 {
        if (hmp->flusher_td) {
-               ++hmp->flusher_seq;
-               wakeup(&hmp->flusher_seq);
+               if (hmp->flusher_signal == 0) {
+                       hmp->flusher_signal = 1;
+                       wakeup(&hmp->flusher_signal);
+               }
        }
 }
 
 void
 hammer_flusher_create(hammer_mount_t hmp)
 {
+       hmp->flusher_signal = 0;
+       hmp->flusher_act = 0;
+       hmp->flusher_done = 0;
+       hmp->flusher_next = 1;
        lwkt_create(hammer_flusher_thread, hmp, &hmp->flusher_td, NULL,
                    0, -1, "hammer");
 }
@@ -83,10 +92,11 @@ hammer_flusher_destroy(hammer_mount_t hmp)
 {
        if (hmp->flusher_td) {
                hmp->flusher_exiting = 1;
-               ++hmp->flusher_seq;
-               wakeup(&hmp->flusher_seq);
-               while (hmp->flusher_td)
+               while (hmp->flusher_td) {
+                       hmp->flusher_signal = 1;
+                       wakeup(&hmp->flusher_signal);
                        tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0);
+               }
        }
 }
 
@@ -94,34 +104,31 @@ static void
 hammer_flusher_thread(void *arg)
 {
        hammer_mount_t hmp = arg;
-       int seq;
-
-       hmp->flusher_demark = kmalloc(sizeof(struct hammer_inode),
-                                     M_HAMMER, M_WAITOK | M_ZERO);
-       TAILQ_INSERT_TAIL(&hmp->flush_list, hmp->flusher_demark, flush_entry);
 
        for (;;) {
-               seq = hmp->flusher_seq;
+               hmp->flusher_act = hmp->flusher_next;
+               ++hmp->flusher_next;
+               kprintf("F");
                hammer_flusher_clean_loose_ios(hmp);
                hammer_flusher_flush(hmp);
                hammer_flusher_clean_loose_ios(hmp);
-               hmp->flusher_act = seq;
-               wakeup(&hmp->flusher_act);
+               hmp->flusher_done = hmp->flusher_act;
+
+               wakeup(&hmp->flusher_done);
 
                /*
-                * Loop if more got queued after our demark.
+                * Wait for activity.
                 */
-               if (TAILQ_NEXT(hmp->flusher_demark, flush_entry))
-                       continue;
-
-               if (hmp->flusher_exiting)
+               if (hmp->flusher_exiting && TAILQ_EMPTY(&hmp->flush_list))
                        break;
-               while (hmp->flusher_seq == hmp->flusher_act)
-                       tsleep(&hmp->flusher_seq, 0, "hmrwwa", 0);
+               kprintf("E");
+
+               while (hmp->flusher_signal == 0 &&
+                      TAILQ_EMPTY(&hmp->flush_list)) {
+                       tsleep(&hmp->flusher_signal, 0, "hmrwwa", 0);
+               }
+               hmp->flusher_signal = 0;
        }
-       TAILQ_REMOVE(&hmp->flush_list, hmp->flusher_demark, flush_entry);
-       kfree(hmp->flusher_demark, M_HAMMER);
-       hmp->flusher_demark = NULL;
        hmp->flusher_td = NULL;
        wakeup(&hmp->flusher_exiting);
        lwkt_exit();
@@ -164,22 +171,27 @@ hammer_flusher_flush(hammer_mount_t hmp)
        rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
        start_offset = rootmap->next_offset;
 
-       if (hammer_debug_general & 0x00010000)
-               kprintf("x");
-
-       TAILQ_REMOVE(&hmp->flush_list, hmp->flusher_demark, flush_entry);
-       TAILQ_INSERT_TAIL(&hmp->flush_list, hmp->flusher_demark, flush_entry);
+       while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
+               /*
+                * Stop when we hit a different flush group
+                */
+               if (ip->flush_group != hmp->flusher_act)
+                       break;
 
-       while ((ip = TAILQ_FIRST(&hmp->flush_list)) != hmp->flusher_demark) {
+               /*
+                * Remove the inode from the flush list and inherit
+                * its reference, sync, and clean-up.
+                */
                TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
+               kprintf("s");
+               ip->error = hammer_sync_inode(ip);
+               hammer_flush_inode_done(ip);
 
                /*
-                * We inherit the inode ref from the flush list
+                * XXX this breaks atomicy
                 */
-               ip->error = hammer_sync_inode(ip, (ip->vp ? 0 : 1));
-               hammer_flush_inode_done(ip);
-               if (hmp->locked_dirty_count > 64 ||
-                   hammer_must_finalize_undo(hmp)) {
+               if (hammer_must_finalize_undo(hmp)) {
+                       Debugger("Too many undos!!");
                        hammer_flusher_finalize(hmp, root_volume, start_offset);
                        start_offset = rootmap->next_offset;
                }
@@ -197,22 +209,12 @@ static
 int
 hammer_must_finalize_undo(hammer_mount_t hmp)
 {
-       hammer_blockmap_t rootmap;
-       int bytes;
-       int max_bytes;
-
-       rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
-
-       if (rootmap->first_offset <= rootmap->next_offset) {
-               bytes = (int)(rootmap->next_offset - rootmap->first_offset);
+       if (hammer_undo_space(hmp) < hammer_undo_max(hmp) / 2) {
+               kprintf("*");
+               return(1);
        } else {
-               bytes = (int)(rootmap->alloc_offset - rootmap->first_offset +
-                             rootmap->next_offset);
+               return(0);
        }
-       max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK);
-       if (bytes > max_bytes / 2)
-               kprintf("*");
-       return (bytes > max_bytes / 2);
 }
 
 /*
index d9cbef2..f461a04 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.44 2008/04/29 04:43:08 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.45 2008/05/02 01:00:42 dillon Exp $
  */
 
 #include "hammer.h"
 #include <sys/buf2.h>
 
 static int hammer_unload_inode(struct hammer_inode *ip);
-static void hammer_flush_inode_copysync(hammer_inode_t ip);
-static int hammer_mark_record_callback(hammer_record_t rec, void *data);
+static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
+static int hammer_setup_child_callback(hammer_record_t rec, void *data);
+static int hammer_inode_unloadable_check(hammer_inode_t ip);
+static int hammer_setup_parent_inodes(hammer_record_t record);
 
 /*
  * The kernel is not actively referencing this vnode but is still holding
@@ -62,26 +64,21 @@ hammer_vop_inactive(struct vop_inactive_args *ap)
        }
 
        /*
-        * If the inode no longer has any references we recover its
-        * in-memory resources immediately.
-        *
-        * NOTE: called from frontend, use ino_rec instead of sync_ino_rec.
+        * If the inode no longer has visibility in the filesystem and is
+        * fairly clean, try to recycle it immediately.  This can deadlock
+        * in vfsync() if we aren't careful.
         */
-       if (ip->ino_rec.ino_nlinks == 0)
+       if (hammer_inode_unloadable_check(ip) && ip->ino_rec.ino_nlinks == 0)
                vrecycle(ap->a_vp);
        return(0);
 }
 
 /*
  * Release the vnode association.  This is typically (but not always)
- * the last reference on the inode and will flush the inode to the
- * buffer cache.
+ * the last reference on the inode.
  *
- * XXX Currently our sync code only runs through inodes with vnode
- * associations, so we depend on hammer_rel_inode() to sync any inode
- * record data to the block device prior to losing the association.
- * Otherwise transactions that the user expected to be distinct by
- * doing a manual sync may be merged.
+ * Once the association is lost we are on our own with regards to
+ * flushing the inode.
  */
 int
 hammer_vop_reclaim(struct vop_reclaim_args *ap)
@@ -94,18 +91,6 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap)
        if ((ip = vp->v_data) != NULL) {
                vp->v_data = NULL;
                ip->vp = NULL;
-
-               /*
-                * Don't let too many dependancies build up on unreferenced
-                * inodes or we could run ourselves out of memory.
-                */
-               if (TAILQ_FIRST(&ip->depend_list)) {
-                       ip->hmp->reclaim_count += ip->depend_count;
-                       if (ip->hmp->reclaim_count > 256) {
-                               ip->hmp->reclaim_count = 0;
-                               hammer_flusher_async(ip->hmp);
-                       }
-               }
                hammer_rel_inode(ip, 1);
        }
        return(0);
@@ -235,7 +220,7 @@ loop:
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->bio_list);
        TAILQ_INIT(&ip->bio_alt_list);
-       TAILQ_INIT(&ip->depend_list);
+       TAILQ_INIT(&ip->target_list);
 
        /*
         * Locate the on-disk inode.
@@ -303,9 +288,7 @@ retry:
 
 /*
  * Create a new filesystem object, returning the inode in *ipp.  The
- * returned inode will be referenced and also marked HAMMER_INODE_NEW,
- * preventing it from being synchronized too early.  The caller must
- * call hammer_finalize_inode() to make it available for media sync.
+ * returned inode will be referenced.
  *
  * The inode is created in-memory.
  */
@@ -328,13 +311,12 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        ip->flush_state = HAMMER_FST_IDLE;
        ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
                    HAMMER_INODE_ITIMES;
-       ip->flags |= HAMMER_INODE_NEW;
 
        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->bio_list);
        TAILQ_INIT(&ip->bio_alt_list);
-       TAILQ_INIT(&ip->depend_list);
+       TAILQ_INIT(&ip->target_list);
 
        ip->ino_rec.ino_atime = trans->time;
        ip->ino_rec.ino_mtime = trans->time;
@@ -392,31 +374,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        return(0);
 }
 
-/*
- * Finalize a newly created inode, allowing it to be synchronized to the
- * media.  If an error occured make sure the inode has been cleaned up and
- * will not be synchronized to the media.
- */
-void
-hammer_finalize_inode(hammer_transaction_t trans, hammer_inode_t ip, int error)
-{
-       if (error) {
-               ip->flags &= ~HAMMER_INODE_MODMASK;
-
-               KASSERT(ip->lock.refs == 1,
-                       ("hammer_unload_inode: %d refs\n", ip->lock.refs));
-               KKASSERT(ip->vp == NULL);
-               KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
-               KKASSERT(ip->cursor_ip_refs == 0);
-               KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
-
-               KKASSERT(RB_EMPTY(&ip->rec_tree));
-               KKASSERT(TAILQ_EMPTY(&ip->bio_list));
-               KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
-       }
-       ip->flags &= ~HAMMER_INODE_NEW;
-}
-
 /*
  * Called by hammer_sync_inode().
  */
@@ -460,15 +417,16 @@ retry:
                        Debugger("hammer_update_inode");
                }
 
-
                if (error == 0) {
                        error = hammer_ip_delete_record(&cursor, trans->tid);
                        if (error && error != EDEADLK) {
                                kprintf("error %d\n", error);
                                Debugger("hammer_update_inode2");
                        }
-                       if (error == 0)
+                       if (error == 0) {
                                ip->flags |= HAMMER_INODE_DELONDISK;
+                               ip->sync_flags &= ~HAMMER_INODE_DELETING;
+                       }
                        hammer_cache_node(cursor.node, &ip->cache[0]);
                }
                hammer_done_cursor(&cursor);
@@ -486,7 +444,7 @@ retry:
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) { 
                record = hammer_alloc_mem_record(ip);
-               record->state = HAMMER_FST_FLUSH;
+               record->flush_state = HAMMER_FST_FLUSH;
                record->rec.inode = ip->sync_ino_rec;
                record->rec.inode.base.base.create_tid = trans->tid;
                record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
@@ -504,8 +462,7 @@ retry:
                 */
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
                record->flags |= HAMMER_RECF_DELETED_FE;
-               record->state = HAMMER_FST_IDLE;
-               KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);
+               record->flush_state = HAMMER_FST_IDLE;
                hammer_rel_mem_record(record);
 
                if (error == 0) {
@@ -513,6 +470,10 @@ retry:
                                            HAMMER_INODE_DDIRTY |
                                            HAMMER_INODE_ITIMES);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
+
+                       /*
+                        * Root volume count of inodes
+                        */
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume(trans, trans->rootvol,
                                                     NULL, 0);
@@ -588,8 +549,7 @@ retry:
 }
 
 /*
- * Release a reference on an inode.  If asked to flush the last release
- * will flush the inode.
+ * Release a reference on an inode, flush as requested.
  *
  * On the last reference we queue the inode to the flusher for its final
  * disposition.
@@ -597,39 +557,57 @@ retry:
 void
 hammer_rel_inode(struct hammer_inode *ip, int flush)
 {
+       hammer_mount_t hmp = ip->hmp;
+
        /*
         * Handle disposition when dropping the last ref.
         */
-       while (ip->lock.refs == 1) {
-               if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
-                       hammer_unload_inode(ip);
-                       return;
-               }
-
-               /*
-                * Hand the inode over to the flusher, which will
-                * add another ref to it.
-                */
-               if (++ip->hmp->reclaim_count > 256) {
-                       ip->hmp->reclaim_count = 0;
-                       hammer_flush_inode(ip, HAMMER_FLUSH_FORCE |
-                                               HAMMER_FLUSH_SIGNAL);
+       for (;;) {
+               if (ip->lock.refs == 1) {
+                       /*
+                        * Determine whether on-disk action is needed for
+                        * the inode's final disposition.
+                        */
+                       if (hammer_inode_unloadable_check(ip)) {
+                               hammer_unload_inode(ip);
+                               break;
+                       }
+                       hammer_flush_inode(ip, 0);
                } else {
-                       hammer_flush_inode(ip, HAMMER_FLUSH_FORCE);
+                       /*
+                        * We gotta flush inodes which do not have vnode
+                        * associations.
+                        */
+#if 0
+                       if (ip->vp == NULL) {
+                               kprintf("v%d:%04x\n", ip->flush_state, ip->flags);
+                               hammer_flush_inode(ip, 0);
+                       } else 
+#endif
+                       if (flush) {
+                               hammer_flush_inode(ip, 0);
+                       }
+                       /*
+                        * The inode still has multiple refs, try to drop
+                        * one ref.
+                        */
+                       KKASSERT(ip->lock.refs >= 1);
+                       if (ip->lock.refs > 1) {
+                               hammer_unref(&ip->lock);
+                               break;
+                       }
                }
-               /* retry */
        }
 
        /*
-        * The inode still has multiple refs, drop one ref.  If a flush was
-        * requested make sure the flusher sees it.  New inodes which have
-        * not been finalized cannot be flushed.
+        * XXX bad hack until I add code to track inodes in SETUP.  We
+        * can queue a lot of inodes to the syncer but if we don't wake
+        * it up the undo sets will be too large or too many unflushed
+        * records will build up and blow our malloc limit.
         */
-       if (flush && ip->flush_state == HAMMER_FST_IDLE && 
-           (ip->flags & HAMMER_INODE_NEW) == 0) {
-               hammer_flush_inode(ip, HAMMER_FLUSH_RELEASE);
-       } else {
-               hammer_unref(&ip->lock);
+       if (++hmp->reclaim_count > 256) {
+               hmp->reclaim_count = 0;
+               hammer_flusher_async(hmp);
        }
 }
 
@@ -650,6 +628,7 @@ hammer_unload_inode(struct hammer_inode *ip)
        KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
 
        KKASSERT(RB_EMPTY(&ip->rec_tree));
+       KKASSERT(TAILQ_EMPTY(&ip->target_list));
        KKASSERT(TAILQ_EMPTY(&ip->bio_list));
        KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
 
@@ -671,7 +650,8 @@ hammer_unload_inode(struct hammer_inode *ip)
  *
  * HAMMER_INODE_RDIRTY:        Inode record has been updated
  * HAMMER_INODE_DDIRTY: Inode data has been updated
- * HAMMER_INODE_XDIRTY: Dirty frontend buffer cache buffer strategized
+ * HAMMER_INODE_XDIRTY: Dirty in-memory records
+ * HAMMER_INODE_BUFS:   Dirty front-end buffer cache buffers
  * HAMMER_INODE_DELETED: Inode record/data must be deleted
  * HAMMER_INODE_ITIMES: mtime/atime has been updated
  */
@@ -680,152 +660,421 @@ hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
 {
        KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 ||
                  (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
-                  HAMMER_INODE_XDIRTY|
+                  HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|
                   HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);
 
        ip->flags |= flags;
 }
 
 /*
- * Flush an inode.  If the inode is already being flushed wait for
- * it to complete, then flush it again.  The interlock is against
- * front-end transactions, the backend flusher does not hold the lock.
+ * Request that an inode be flushed.  This whole mess cannot block and may
+ * recurse.  Once requested HAMMER will attempt to actively flush it until
+ * the flush can be done.
  *
- * The flusher must distinguish between the records that are part of the
- * flush and any new records created in parallel with the flush.  The
- * inode data and truncation fields are also copied.  BIOs are a bit more
- * troublesome because some dirty buffers may not have been queued yet.
+ * The inode may already be flushing, or may be in a setup state.  We can
+ * place the inode in a flushing state if it is currently idle and flag it
+ * to reflush if it is currently flushing.
  */
 void
 hammer_flush_inode(hammer_inode_t ip, int flags)
 {
-       KKASSERT((ip->flags & HAMMER_INODE_NEW) == 0);
-       if (ip->flush_state != HAMMER_FST_IDLE &&
-           (ip->flags & HAMMER_INODE_MODMASK)) {
-               if ((ip->flags & HAMMER_INODE_REFLUSH) == 0) {
-                       ip->flags |= HAMMER_INODE_REFLUSH;
-                       if (flags & HAMMER_FLUSH_RELEASE) {
-                               hammer_unref(&ip->lock);
-                               KKASSERT(ip->lock.refs > 0);
-                       }
-                       if (flags & HAMMER_FLUSH_SIGNAL)
-                               hammer_flusher_async(ip->hmp);
+       hammer_record_t depend;
+       int r, good;
+
+       /*
+        * Trivial 'nothing to flush' case.  If the inode is ina SETUP
+        * state we have to put it back into an IDLE state so we can
+        * drop the extra ref.
+        */
+       if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
+           (flags & HAMMER_FLUSH_FORCE) == 0) {
+               if (ip->flush_state == HAMMER_FST_SETUP) {
+                       ip->flush_state = HAMMER_FST_IDLE;
+                       hammer_rel_inode(ip, 0);
                }
                return;
        }
-       if (ip->flush_state == HAMMER_FST_IDLE) {
-               if ((ip->flags & HAMMER_INODE_MODMASK) ||
-                   (flags & HAMMER_FLUSH_FORCE)) {
-                       /*
-                        * Add a reference to represent the inode being queued
-                        * to the flusher.  If the caller wants us to 
-                        * release a reference the two cancel each other out.
-                        */
-                       if ((flags & HAMMER_FLUSH_RELEASE) == 0)
-                               hammer_ref(&ip->lock);
 
-                       hammer_flush_inode_copysync(ip);
-                       /*
-                        * Move the inode to the flush list and add a ref to
-                        * it representing it on the list.
-                        */
-                       TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
-                       if (flags & HAMMER_FLUSH_SIGNAL)
-                               hammer_flusher_async(ip->hmp);
+       /*
+        * Our flush action will depend on the current state.
+        */
+       switch(ip->flush_state) {
+       case HAMMER_FST_IDLE:
+               /*
+                * We have no dependancies and can flush immediately.  Some
+                * our children may not be flushable so we have to re-test
+                * with that additional knowledge.
+                */
+               hammer_flush_inode_core(ip, flags);
+               break;
+       case HAMMER_FST_SETUP:
+               /*
+                * Recurse upwards through dependancies via target_list
+                * and start their flusher actions going if possible.
+                *
+                * 'good' is our connectivity.  -1 means we have none and
+                * can't flush, 0 means there weren't any dependancies, and
+                * 1 means we have good connectivity.
+                */
+               good = 0;
+               TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
+                       r = hammer_setup_parent_inodes(depend);
+                       if (r < 0 && good == 0)
+                               good = -1;
+                       if (r > 0)
+                               good = 1;
+               }
+
+               /*
+                * We can continue if good >= 0.  Determine how many records
+                * under our inode can be flushed (and mark them).
+                */
+               kprintf("g%d", good);
+               if (good >= 0) {
+                       hammer_flush_inode_core(ip, flags);
+               } else {
+                       ip->flags |= HAMMER_INODE_REFLUSH;
+               }
+               break;
+       default:
+               /*
+                * We are already flushing, flag the inode to reflush
+                * if needed after it completes its current flush.
+                */
+               if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
+                       ip->flags |= HAMMER_INODE_REFLUSH;
+               break;
+       }
+}
+
+/*
+ * We are asked to recurse upwards and convert the record from SETUP
+ * to FLUSH if possible.  record->ip is a parent of the caller's inode,
+ * and record->target_ip is the caller's inode.
+ *
+ * Return 1 if the record gives us connectivity
+ *
+ * Return 0 if the record is not relevant 
+ *
+ * Return -1 if we can't resolve the dependancy and there is no connectivity.
+ */
+static int
+hammer_setup_parent_inodes(hammer_record_t record)
+{
+       hammer_mount_t hmp = record->ip->hmp;
+       hammer_record_t depend;
+       hammer_inode_t ip;
+       int r, good;
+
+       KKASSERT(record->flush_state != HAMMER_FST_IDLE);
+       ip = record->ip;
+
+       /*
+        * If the record is already flushing, is it in our flush group?
+        *
+        * If it is in our flush group but it is a delete-on-disk, it
+        * does not improve our connectivity (return 0), and if the
+        * target inode is not trying to destroy itself we can't allow
+        * the operation yet anyway (the second return -1).
+        */
+       if (record->flush_state == HAMMER_FST_FLUSH) {
+               if (record->flush_group != hmp->flusher_next) {
+                       ip->flags |= HAMMER_INODE_REFLUSH;
+                       return(-1);
                }
+               if (record->type == HAMMER_MEM_RECORD_ADD)
+                       return(1);
+               return(0);
+       }
+
+       /*
+        * It must be a setup record.  Try to resolve the setup dependancies
+        * by recursing upwards so we can place ip on the flush list.
+        */
+       KKASSERT(record->flush_state == HAMMER_FST_SETUP);
+
+       good = 0;
+       TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
+               r = hammer_setup_parent_inodes(depend);
+               if (r < 0 && good == 0)
+                       good = -1;
+               if (r > 0)
+                       good = 1;
+       }
+
+       /*
+        * We can't flush ip because it has no connectivity (XXX also check
+        * nlinks for pre-existing connectivity!).  Flag it so any resolution
+        * recurses back down.
+        */
+       if (good < 0) {
+               ip->flags |= HAMMER_INODE_REFLUSH;
+               return(good);
+       }
+
+       /*
+        * We are go, place the parent inode in a flushing state so we can
+        * place its record in a flushing state.  Note that the parent
+        * may already be flushing.  The record must be in the same flush
+        * group as the parent.
+        */
+       if (ip->flush_state != HAMMER_FST_FLUSH)
+               hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
+       KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
+       KKASSERT(record->flush_state == HAMMER_FST_SETUP);
+
+#if 0
+       if (record->type == HAMMER_MEM_RECORD_DEL &&
+           (record->target_ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELONDISK)) == 0) {
+               /*
+                * Regardless of flushing state we cannot sync this path if the
+                * record represents a delete-on-disk but the target inode
+                * is not ready to sync its own deletion.
+                *
+                * XXX need to count effective nlinks to determine whether
+                * the flush is ok, otherwise removing a hardlink will
+                * just leave the DEL record to rot.
+                */
+               record->target_ip->flags |= HAMMER_INODE_REFLUSH;
+               return(-1);
+       } else
+#endif
+       if (ip->flush_group == ip->hmp->flusher_next) {
+               /*
+                * This is the record we wanted to synchronize.
+                */
+               record->flush_state = HAMMER_FST_FLUSH;
+               record->flush_group = ip->flush_group;
+               hammer_ref(&record->lock);
+               if (record->type == HAMMER_MEM_RECORD_ADD)
+                       return(1);
+
+               /*
+                * The record is a delete-n-disk.  It does not contribute
+                * to our visibility.  We can still flush it.
+                */
+               return(0);
+       } else {
+               /*
+                * We couldn't resolve the dependancies, request that the
+                * inode be flushed when the dependancies can be resolved.
+                */
+               ip->flags |= HAMMER_INODE_REFLUSH;
+               return(-1);
        }
 }
 
 /*
- * Helper routine to copy the frontend synchronization state to the backend.
- * This routine may be called by either the frontend or the backend.
+ * This is the core routine placing an inode into the FST_FLUSH state.
  */
 static void
-hammer_flush_inode_copysync(hammer_inode_t ip)
+hammer_flush_inode_core(hammer_inode_t ip, int flags)
 {
+       int go_count;
        int error;
-       int count;
+
+       KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
+       if (ip->flush_state == HAMMER_FST_IDLE)
+               hammer_ref(&ip->lock);
+       ip->flush_state = HAMMER_FST_FLUSH;
+       ip->flush_group = ip->hmp->flusher_next;
 
        /*
-        * Prevent anyone else from trying to do the same thing.
+        * Figure out how many in-memory records we can actually flush
+        * (not including inode meta-data, buffers, etc).
         */
-       ip->flush_state = HAMMER_FST_SETUP;
+       if (flags & HAMMER_FLUSH_RECURSION) {
+               go_count = 1;
+       } else {
+               go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
+                                  hammer_setup_child_callback, NULL);
+       }
 
        /*
-        * Sync the buffer cache.  This will queue the BIOs.  If called
-        * from the context of the flusher the BIO's are thrown into bio_list
-        * regardless of ip->flush_state.
+        * This is a more involved test that includes go_count.  If we
+        * can't flush, flag the inode and return.  If go_count is 0 we
+        * were are unable to flush any records in our rec_tree and
+        * must ignore the XDIRTY flag.
         */
-       if (ip->vp != NULL)
-               error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
-       else
-               error = 0;
+       if (go_count == 0) {
+               if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
+                       ip->flags |= HAMMER_INODE_REFLUSH;
+                       ip->flush_state = HAMMER_FST_SETUP;
+                       return;
+               }
+       }
 
        /*
-        * This freezes strategy writes, any further BIOs will be
-        * queued to alt_bio (unless we are 
+        * Inodes not in an IDLE state get an extra reference.
+        *
+        * Place the inode in a flush state and sync all frontend
+        * information to the backend.
         */
-       ip->flush_state = HAMMER_FST_FLUSH;
+
+       if ((flags & HAMMER_FLUSH_RECURSION) == 0)  {
+               if (ip->vp != NULL)
+                       error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
+               else
+                       error = 0;
+       }
+
+       /*
+        * Any further strategy calls will go into the inode's alternative
+        * bioq.
+        */
+       ip->flags |= HAMMER_INODE_WRITE_ALT;
 
        /*
         * Snapshot the state of the inode for the backend flusher.
         *
         * The truncation must be retained in the frontend until after
         * we've actually performed the record deletion.
+        *
+        * NOTE: The DELETING flag is a mod flag, but it is also sticky,
+        * and stays in ip->flags.  Once set, it stays set until the
+        * inode is destroyed.
         */
        ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
+       ip->sync_flags &= ~HAMMER_INODE_DELETING;
        ip->sync_trunc_off = ip->trunc_off;
        ip->sync_ino_rec = ip->ino_rec;
        ip->sync_ino_data = ip->ino_data;
        ip->flags &= ~HAMMER_INODE_MODMASK |
-                    HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS;
+                    HAMMER_INODE_TRUNCATED | HAMMER_INODE_BUFS |
+                    HAMMER_INODE_DELETING;
 
        /*
         * Fix up the dirty buffer status.
         */
-       if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL)
-               ip->flags &= ~HAMMER_INODE_BUFS;
+       if (ip->vp == NULL || RB_ROOT(&ip->vp->v_rbdirty_tree) == NULL) {
+               if (TAILQ_FIRST(&ip->bio_alt_list) == NULL)
+                       ip->flags &= ~HAMMER_INODE_BUFS;
+       }
        if (TAILQ_FIRST(&ip->bio_list))
                ip->sync_flags |= HAMMER_INODE_BUFS;
        else
                ip->sync_flags &= ~HAMMER_INODE_BUFS;
 
        /*
-        * Set the state for the inode's in-memory records.  If some records
-        * could not be marked for backend flush (i.e. deleted records),
-        * re-set the XDIRTY flag.
+        * The flusher inherits our inode and reference.
         */
-       count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
-                       hammer_mark_record_callback, NULL);
-       if (count)
-               ip->flags |= HAMMER_INODE_XDIRTY;
+       TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
+
+       if (flags & HAMMER_FLUSH_SIGNAL)
+               hammer_flusher_async(ip->hmp);
 }
 
 /*
- * Mark records for backend flush, accumulate a count of the number of
- * records which could not be marked.  Records marked for deletion
- * by the frontend never make it to the media.  It is possible for
- * a record queued to the backend to wind up with FE set after the
- * fact, as long as BE has not yet been set.  The backend deals with
- * this race by syncing the record as if FE had not been set, and
- * then converting the record to a delete-on-disk record.
+ * Callback for scan of ip->rec_tree.  Try to include each record in our
+ * flush.  ip->flush_group has been set but the inode has not yet been
+ * moved into a flushing state.
+ *
+ * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
+ * both inodes.
+ *
+ * We return 1 for any record placed or found in FST_FLUSH, which prevents
+ * the caller from shortcutting the flush.
  */
 static int
-hammer_mark_record_callback(hammer_record_t rec, void *data)
+hammer_setup_child_callback(hammer_record_t rec, void *data)
 {
-       if (rec->state == HAMMER_FST_FLUSH) {
+       hammer_inode_t target_ip;
+       hammer_inode_t ip;
+       int r;
+
+       /*
+        * If the record has been deleted by the backend (it's being held
+        * by the frontend in a race), just ignore it.
+        */
+       if (rec->flags & HAMMER_RECF_DELETED_BE)
                return(0);
-       } else if ((rec->flags & HAMMER_RECF_DELETED_FE) == 0) {
-               rec->state = HAMMER_FST_FLUSH;
+
+       /*
+        * If the record is in an idle state it has no dependancies and
+        * can be flushed.
+        */
+       ip = rec->ip;
+       r = 0;
+
+       switch(rec->flush_state) {
+       case HAMMER_FST_IDLE:
+               /*
+                * Record has no setup dependancy, we can flush it.
+                */
+               KKASSERT(rec->target_ip == NULL);
+               rec->flush_state = HAMMER_FST_FLUSH;
+               rec->flush_group = ip->flush_group;
                hammer_ref(&rec->lock);
-               return(0);
-       } else {
-               return(1);
+               r = 1;
+               break;
+       case HAMMER_FST_SETUP:
+               /*
+                * Record has a setup dependancy.  Try to include the
+                * target ip in the flush. 
+                *
+                * We have to be careful here, if we do not do the right
+                * thing we can lose track of dirty inodes and the system
+                * will lockup trying to allocate buffers.
+                */
+               target_ip = rec->target_ip;
+               KKASSERT(target_ip != NULL);
+               KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
+               if (target_ip->flush_state == HAMMER_FST_FLUSH) {
+                       /*
+                        * If the target IP is already flushing in our group
+                        * we are golden, otherwise make sure the target
+                        * reflushes.
+                        */
+                       if (target_ip->flush_group == ip->flush_group) {
+                               rec->flush_state = HAMMER_FST_FLUSH;
+                               rec->flush_group = ip->flush_group;
+                               hammer_ref(&rec->lock);
+                               r = 1;
+                       } else {
+                               target_ip->flags |= HAMMER_INODE_REFLUSH;
+                       }
+               } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
+                       /*
+                        * If the target IP is not flushing we can force
+                        * it to flush, even if it is unable to write out
+                        * any of its own records we have at least one in
+                        * hand that we CAN deal with.
+                        */
+                       rec->flush_state = HAMMER_FST_FLUSH;
+                       rec->flush_group = ip->flush_group;
+                       hammer_ref(&rec->lock);
+                       hammer_flush_inode_core(target_ip,
+                                               HAMMER_FLUSH_RECURSION);
+                       r = 1;
+               } else {
+                       /*
+                        * XXX this needs help.  We have a delete-on-disk
+                        * which could disconnect the target.  If the target
+                        * has its own dependancies they really need to
+                        * be flushed.
+                        *
+                        * XXX
+                        */
+                       rec->flush_state = HAMMER_FST_FLUSH;
+                       rec->flush_group = ip->flush_group;
+                       hammer_ref(&rec->lock);
+                       hammer_flush_inode_core(target_ip,
+                                               HAMMER_FLUSH_RECURSION);
+                       r = 1;
+               }
+               break;
+       case HAMMER_FST_FLUSH:
+               /* 
+                * Record already associated with a flush group.  It had
+                * better be ours.
+                */
+               KKASSERT(rec->flush_group == ip->flush_group);
+               r = 1;
+               break;
        }
+       return(r);
 }
 
-
-
 /*
  * Wait for a previously queued flush to complete
  */
@@ -849,13 +1098,32 @@ void
 hammer_flush_inode_done(hammer_inode_t ip)
 {
        struct bio *bio;
+       int dorel = 0;
 
        KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
 
-       if (ip->sync_flags)
-               kprintf("ip %p leftover sync_flags %08x\n", ip, ip->sync_flags);
+       /*
+        * Allow BIOs to queue to the inode's primary bioq again.
+        */
+       ip->flags &= ~HAMMER_INODE_WRITE_ALT;
+
+       /*
+        * Merge left-over flags back into the frontend and fix the state.
+        */
        ip->flags |= ip->sync_flags;
-       ip->flush_state = HAMMER_FST_IDLE;
+       if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
+               ip->flush_state = HAMMER_FST_IDLE;
+               dorel = 1;
+       } else {
+               ip->flush_state = HAMMER_FST_SETUP;
+       }
+
+       /*
+        * The backend may have adjusted nlinks, so if the adjusted nlinks
+        * does not match the fronttend set the frontend's RDIRTY flag again.
+        */
+       if (ip->ino_rec.ino_nlinks != ip->sync_ino_rec.ino_nlinks)
+               ip->flags |= HAMMER_INODE_RDIRTY;
 
        /*
         * Reflush any BIOs that wound up in the alt list.  Our inode will
@@ -864,9 +1132,19 @@ hammer_flush_inode_done(hammer_inode_t ip)
        while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
                TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
                TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
+               kprintf("d");
+               ip->flags |= HAMMER_INODE_BUFS;
+               ip->flags |= HAMMER_INODE_REFLUSH;
+       }
+
+       /*
+        * Re-set the XDIRTY flag if some of the inode's in-memory records
+        * could not be flushed.
+        */
+       if (RB_ROOT(&ip->rec_tree)) {
                ip->flags |= HAMMER_INODE_XDIRTY;
                ip->flags |= HAMMER_INODE_REFLUSH;
-               kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip, bio->bio_offset, bio->bio_buf->b_bufsize);
+               kprintf("e");
        }
 
        /*
@@ -875,7 +1153,7 @@ hammer_flush_inode_done(hammer_inode_t ip)
         */
        if (ip->flags & HAMMER_INODE_REFLUSH) {
                ip->flags &= ~HAMMER_INODE_REFLUSH;
-               hammer_flush_inode(ip, 0);
+               hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                if (ip->flush_state == HAMMER_FST_IDLE) {
                        if (ip->flags & HAMMER_INODE_FLUSHW) {
                                ip->flags &= ~HAMMER_INODE_FLUSHW;
@@ -888,7 +1166,8 @@ hammer_flush_inode_done(hammer_inode_t ip)
                        wakeup(&ip->flags);
                }
        }
-       hammer_rel_inode(ip, 0);
+       if (dorel)
+               hammer_rel_inode(ip, 0);
 }
 
 /*
@@ -902,11 +1181,19 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
        int error;
 
        /*
-        * Skip records that do not belong to the current flush.  Records
-        * belonging to the flush will have been referenced for us.
+        * Skip records that do not belong to the current flush.
         */
-       if (record->state != HAMMER_FST_FLUSH)
+       if (record->flush_state != HAMMER_FST_FLUSH)
                return(0);
+       KKASSERT((record->flags & HAMMER_RECF_DELETED_BE) == 0);
+#if 1
+       if (record->flush_group != record->ip->flush_group) {
+               kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
+               Debugger("blah2");
+               return(0);
+       }
+#endif
+       KKASSERT(record->flush_group == record->ip->flush_group);
 
        /*
         * Interlock the record using the BE flag.  Once BE is set the
@@ -916,9 +1203,8 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
         * record out, but the flush completion code converts it to 
         * a delete-on-disk record instead of destroying it.
         */
-       hammer_lock_ex(&record->lock);
        if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
-               hammer_unlock(&record->lock);
+               hammer_flush_record_done(record, 0);
                return(0);
        }
        record->flags |= HAMMER_RECF_INTERLOCK_BE;
@@ -927,18 +1213,20 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
         * If DELETED_FE is set we may have already sent dependant pieces
         * to the disk and we must flush the record as if it hadn't been
         * deleted.  This creates a bit of a mess because we have to
-        * have ip_sync_record convert the record to DELETE_ONDISK before
+        * have ip_sync_record convert the record to MEM_RECORD_DEL before
         * it inserts the B-Tree record.  Otherwise the media sync might
         * be visible to the frontend.
         */
-       if (record->flags & HAMMER_RECF_DELETED_FE)
-               record->flags |= HAMMER_RECF_CONVERT_DELETE_ONDISK;
+       if (record->flags & HAMMER_RECF_DELETED_FE) {
+               KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
+               record->flags |= HAMMER_RECF_CONVERT_DELETE;
+       }
 
        /*
         * Assign the create_tid for new records.  Deletions already
         * have the record's entire key properly set up.
         */
-       if ((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0)
+       if (record->type != HAMMER_MEM_RECORD_DEL)
                record->rec.inode.base.base.create_tid = trans->tid;
        error = hammer_ip_sync_record(trans, record);
 
@@ -958,105 +1246,70 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
  * XXX error handling
  */
 int
-hammer_sync_inode(hammer_inode_t ip, int handle_delete)
+hammer_sync_inode(hammer_inode_t ip)
 {
        struct hammer_transaction trans;
        struct bio *bio;
-       hammer_depend_t depend;
+       hammer_record_t depend;
+       hammer_record_t next;
        int error, tmp_error;
+       u_int64_t nlinks;
 
-       if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0 &&
-           handle_delete == 0) {
+       if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
                return(0);
-       }
 
        hammer_start_transaction_fls(&trans, ip->hmp);
 
        /*
-        * Any (directory) records this inode depends on must also be
-        * synchronized.  The directory itself only needs to be flushed
-        * if its inode is not already on-disk.
+        * Any directory records referencing this inode which are not in
+        * our current flush group must be accounted for by adjusting the
+        * nlink count we synchronize to disk.
+        *
+        * Records which are in our flush group can be unlinked from our
+        * inode now, allowing the inode to be physically deleted.
         */
-       while ((depend = TAILQ_FIRST(&ip->depend_list)) != NULL) {
-               hammer_record_t record;
-
-               record = depend->record;
-                TAILQ_REMOVE(&depend->record->depend_list, depend, rec_entry);
-                TAILQ_REMOVE(&ip->depend_list, depend, ip_entry);
-               --ip->depend_count;
-               if (record->state != HAMMER_FST_FLUSH) {
-                       record->state = HAMMER_FST_FLUSH;
-                       /* add ref (steal ref from dependancy) */
-               } else {
-                       /* remove ref related to dependancy */
-                       /* record still has at least one ref from state */
-                       hammer_unref(&record->lock);
-                       KKASSERT(record->lock.refs > 0);
-               }
-               if (record->ip->flags & HAMMER_INODE_ONDISK) {
-                       kprintf("I");
-                       hammer_sync_record_callback(record, &trans);
-               } else {
-                       kprintf("J");
-                       KKASSERT((record->ip->flags & HAMMER_INODE_NEW) == 0);
-                       hammer_flush_inode(record->ip, 0);
+       nlinks = ip->ino_rec.ino_nlinks;
+       next = TAILQ_FIRST(&ip->target_list);
+       while ((depend = next) != NULL) {
+               next = TAILQ_NEXT(depend, target_entry);
+               if (depend->flush_state == HAMMER_FST_FLUSH &&
+                   depend->flush_group == ip->hmp->flusher_act) {
+                       TAILQ_REMOVE(&ip->target_list, depend, target_entry);
+                       depend->target_ip = NULL;
+                       /* no need to signal target_ip, it is us */
+               } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
+                       switch(depend->type) {
+                       case HAMMER_MEM_RECORD_ADD:
+                               --nlinks;
+                               break;
+                       case HAMMER_MEM_RECORD_DEL:
+                               ++nlinks;
+                               break;
+                       }
                }
-               hammer_unref(&ip->lock);
-               KKASSERT(ip->lock.refs > 0);
-                kfree(depend, M_HAMMER);
        }
 
-
        /*
-        * Sync inode deletions and truncations.
+        * Set dirty if we had to modify the link count.
         */
-       if (ip->sync_ino_rec.ino_nlinks == 0 && handle_delete && 
-           (ip->flags & HAMMER_INODE_GONE) == 0) {
-               /*
-                * Handle the case where the inode has been completely deleted
-                * and is no longer referenceable from the filesystem
-                * namespace.
-                *
-                * NOTE: We do not set the RDIRTY flag when updating the
-                * delete_tid, setting HAMMER_INODE_DELETED takes care of it.
-                */
+       if (ip->sync_ino_rec.ino_nlinks != nlinks) {
+               KKASSERT((int64_t)nlinks >= 0);
+               ip->sync_ino_rec.ino_nlinks = nlinks;
+               ip->sync_flags |= HAMMER_INODE_RDIRTY;
+       }
 
-               ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED;
-               ip->flags &= ~HAMMER_INODE_TRUNCATED;
-               ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
-               if (ip->vp)
-                       vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
+       /*
+        * If the inode has been unlinked and no longer has a vnode
+        * ref, destroy its data.
+        *
+        * Otherwise, if a truncation is queued, destroy any data past
+        * the (aligned) truncation point.  Userland will have dealt with
+        * the buffer containing the truncation point for us.
+        */
+       if (ip->sync_ino_rec.ino_nlinks == 0 && ip->vp == NULL) {
                error = hammer_ip_delete_range_all(&trans, ip);
                if (error)
                        Debugger("hammer_ip_delete_range_all errored");
-
-               /*
-                * Sanity check.  The only records that remain should be
-                * marked for back-end deletion.
-                */
-               {
-                       hammer_record_t rec;
-
-                       RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) {
-                               KKASSERT(rec->state == HAMMER_FST_FLUSH);
-                       }
-               }
-
-               /*
-                * Set delete_tid in both the frontend and backend
-                * copy of the inode record.
-                */
-               ip->ino_rec.base.base.delete_tid = trans.tid;
-               ip->sync_ino_rec.base.base.delete_tid = trans.tid;
-
-               /*
-                * Indicate that the inode has/is-being deleted.
-                */
-               ip->flags |= HAMMER_NODE_DELETED;
-               hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY);
-               hammer_modify_volume(&trans, trans.rootvol, NULL, 0);
-               --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
-               hammer_modify_volume_done(trans.rootvol);
        } else if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
                /*
                 * Interlock trunc_off.  The VOP front-end may continue to
@@ -1076,7 +1329,6 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete)
                 * while we were blocked so do not just unconditionally
                 * set it to the maximum offset.
                 */
-               kprintf("sync truncation range @ %016llx\n", aligned_trunc_off);
                error = hammer_ip_delete_range(&trans, ip,
                                                aligned_trunc_off,
                                                0x7FFFFFFFFFFFFFFFLL);
@@ -1087,14 +1339,78 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete)
                        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
                        ip->flags &= ~HAMMER_INODE_TRUNCATED;
                }
+       } else {
+               error = 0;
        }
 
-       error = 0;      /* XXX vfsync used to be here */
+       /*
+        * Now sync related records.  These will typically be directory
+        * entries or delete-on-disk records.
+        */
+       if (error == 0) {
+               tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
+                                   hammer_sync_record_callback, &trans);
+               if (tmp_error < 0)
+                       tmp_error = -tmp_error;
+               if (tmp_error)
+                       error = tmp_error;
+       }
+
+       /*
+        * Sync inode deletions, with certain restrictions.
+        *
+        * - Nlinks must be 0 for both the frontend and the backend.
+        * - All related directory entries and our own records must
+        *   be synchronized.
+        *
+        * In the latter case a directory containing numerous directory
+        * entries may not be able to sync those entries due to topological
+        * recursion.  If this is the case, those records will not have
+        * been marked for flush action and ip->rec_tree will not be empty.
+        */
+       if (ip->sync_ino_rec.ino_nlinks == 0 && 
+           ip->ino_rec.ino_nlinks == 0 &&
+           TAILQ_FIRST(&ip->target_list) == NULL &&
+           RB_ROOT(&ip->rec_tree) == NULL &&
+           (ip->flags & HAMMER_INODE_GONE) == 0) {
+               /*
+                * Handle the case where the inode has been completely deleted
+                * and is no longer referenceable from the filesystem
+                * namespace.
+                *
+                * NOTE: We do not set the RDIRTY flag when updating the
+                * delete_tid, setting HAMMER_INODE_DELETED takes care of it.
+                */
+               kprintf("Y");
+
+               ip->flags |= HAMMER_INODE_GONE | HAMMER_INODE_DELETED;
+               ip->flags &= ~HAMMER_INODE_TRUNCATED;
+               ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
+               if (ip->vp)
+                       vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
+
+               /*
+                * Set delete_tid in both the frontend and backend
+                * copy of the inode record.
+                */
+               ip->ino_rec.base.base.delete_tid = trans.tid;
+               ip->sync_ino_rec.base.base.delete_tid = trans.tid;
+
+               /*
+                * Indicate that the inode has/is-being deleted.
+                */
+               ip->flags |= HAMMER_NODE_DELETED;
+               hammer_modify_inode(&trans, ip, HAMMER_INODE_RDIRTY);
+               hammer_modify_volume(&trans, trans.rootvol, NULL, 0);
+               --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
+               hammer_modify_volume_done(trans.rootvol);
+       }
 
        /*
         * Flush any queued BIOs.
         */
        while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
+               KKASSERT((ip->flags & HAMMER_INODE_DELETED) == 0);
                TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
 #if 0
                kprintf("dowrite %016llx ip %p bio %p @ %016llx\n", trans.tid, ip, bio, bio->bio_offset);
@@ -1106,18 +1422,11 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete)
        ip->sync_flags &= ~HAMMER_INODE_BUFS;
 
        /*
-        * Now sync related records.
+        * We had better have nothing left if the inode has been deleted.  If
+        * it hasn't been, the frontend may have queued more work, which is ok.
         */
-       for (;;) {
-               tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
-                               hammer_sync_record_callback, &trans);
-               KKASSERT(error <= 0);
-               if (tmp_error < 0)
-                       tmp_error = -error;
-               if (tmp_error)
-                       error = tmp_error;
-               break;
-       }
+       KKASSERT((ip->flags & HAMMER_INODE_DELETED) == 0 || 
+                RB_ROOT(&ip->rec_tree) == NULL);
 
        /*
         * XDIRTY represents rec_tree and bio_list.  However, rec_tree may
@@ -1156,7 +1465,6 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete)
                        KKASSERT(record->lock.refs == 1);
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
-                       hammer_cleardep_mem_record(record);
                        hammer_rel_mem_record(record);
                }
                break;
@@ -1211,3 +1519,46 @@ hammer_sync_inode(hammer_inode_t ip, int handle_delete)
        return(error);
 }
 
+/*
+ * This routine is called when the OS is no longer actively referencing
+ * the inode (but might still be keeping it cached), or when releasing
+ * the last reference to an inode.
+ *
+ * At this point if the inode's nlinks count is zero we want to destroy
+ * it, which may mean destroying it on-media too.
+ */
+static int
+hammer_inode_unloadable_check(hammer_inode_t ip)
+{
+       /*
+        * If the inode is on-media and the link count is 0 we MUST delete
+        * it on-media.
+        */
+       if (ip->ino_rec.ino_nlinks == 0 &&
+           (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
+           HAMMER_INODE_ONDISK) {
+               ip->flags |= HAMMER_INODE_DELETING;
+       } else {
+               ip->flags &= ~HAMMER_INODE_DELETING;
+       }
+
+       /*
+        * If only one ref remains and the inode is not dirty, tell
+        * the caller that it can dispose of the inode.
+        */
+       if (ip->lock.refs == 1 && (ip->flags & HAMMER_INODE_MODMASK) == 0)
+               return(1);
+       return(0);
+}
+
+void
+hammer_test_inode(hammer_inode_t ip)
+{
+       if (ip->flags & HAMMER_INODE_REFLUSH) {
+               ip->flags &= ~HAMMER_INODE_REFLUSH;
+               hammer_ref(&ip->lock);
+               hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+               hammer_rel_inode(ip, 0);
+       }
+}
+
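
As an illustration of the link-count sequencing above, the adjustment
hammer_sync_inode() performs can be sketched as a standalone helper.  The
helper name is invented, the active flush group is passed in rather than
read from ip->hmp->flusher_act, and the side effect of unlinking same-group
records from target_list is omitted:

static u_int64_t
sketch_sync_nlinks(hammer_inode_t ip, int flush_group)
{
	hammer_record_t rec;
	u_int64_t nlinks = ip->ino_rec.ino_nlinks;	/* frontend count */

	TAILQ_FOREACH(rec, &ip->target_list, target_entry) {
		if (rec->flush_state == HAMMER_FST_FLUSH &&
		    rec->flush_group == flush_group)
			continue;	/* flushes with us, already counted */
		if (rec->flags & HAMMER_RECF_DELETED_FE)
			continue;	/* deleted by the frontend, ignore */
		if (rec->type == HAMMER_MEM_RECORD_ADD)
			--nlinks;	/* directory entry not yet on-media */
		else if (rec->type == HAMMER_MEM_RECORD_DEL)
			++nlinks;	/* deletion not yet on-media */
	}
	return(nlinks);
}

The result is the nlink count the backend writes to the media: namespace
operations that are not part of the current flush group are backed out so
the on-disk copy never claims connectivity that has not actually been
flushed.
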
index 08b352c..a2fb4af 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.47 2008/04/27 21:07:15 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.48 2008/05/02 01:00:42 dillon Exp $
  */
 
 #include "hammer.h"
@@ -166,10 +166,9 @@ hammer_alloc_mem_record(hammer_inode_t ip)
 
        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
-       record->state = HAMMER_FST_IDLE;
+       record->flush_state = HAMMER_FST_IDLE;
        record->ip = ip;
        record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
-       TAILQ_INIT(&record->depend_list);
        hammer_ref(&record->lock);
        return (record);
 }
@@ -177,7 +176,7 @@ hammer_alloc_mem_record(hammer_inode_t ip)
 void
 hammer_wait_mem_record(hammer_record_t record)
 {
-       while (record->state == HAMMER_FST_FLUSH) {
+       while (record->flush_state == HAMMER_FST_FLUSH) {
                record->flags |= HAMMER_RECF_WANTED;
                tsleep(record, 0, "hmrrc2", 0);
        }
@@ -194,7 +193,10 @@ hammer_wait_mem_record(hammer_record_t record)
 void
 hammer_flush_record_done(hammer_record_t record, int error)
 {
-       KKASSERT(record->state == HAMMER_FST_FLUSH);
+       hammer_inode_t target_ip;
+       int cleanup = 0;
+
+       KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
 
        if (error) {
@@ -202,7 +204,8 @@ hammer_flush_record_done(hammer_record_t record, int error)
                 * An error occured, the backend was unable to sync the
                 * record to its media.  Leave the record intact.
                 */
-       } else if (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK) {
+               Debugger("flush_record_done error");
+       } else if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
                /*
                 * deleted-record to delete-on-disk conversion, occurs when
                 * we sync a record to disk which is marked deleted by the
@@ -211,9 +214,9 @@ hammer_flush_record_done(hammer_record_t record, int error)
                 */
                if (record->flags & HAMMER_RECF_DELETED_BE) {
                        record->flags |= HAMMER_RECF_DELETED_FE;
-                       hammer_cleardep_mem_record(record);
+                       cleanup = 1;
                } else {
-                       KKASSERT(record->flags & HAMMER_RECF_DELETE_ONDISK);
+                       KKASSERT(record->type == HAMMER_MEM_RECORD_DEL);
                }
        } else {
                /*
@@ -221,12 +224,26 @@ hammer_flush_record_done(hammer_record_t record, int error)
                 * having been synchronized to the media).
                 */
                record->flags |= HAMMER_RECF_DELETED_FE;
-               hammer_cleardep_mem_record(record);
+               record->flags |= HAMMER_RECF_DELETED_BE;
+               cleanup = 1;
        }
-       record->state = HAMMER_FST_IDLE;
+       if (cleanup) {
+               if ((target_ip = record->target_ip) != NULL) {
+                       TAILQ_REMOVE(&target_ip->target_list, record,
+                                    target_entry);
+                       record->target_ip = NULL;
+                       hammer_test_inode(target_ip);
+               }
+               record->flush_state = HAMMER_FST_IDLE;
+       } else {
+               if (record->target_ip)
+                       record->flush_state = HAMMER_FST_SETUP;
+               else
+                       record->flush_state = HAMMER_FST_IDLE;
+       }
+
        record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
-       record->flags &= ~HAMMER_RECF_CONVERT_DELETE_ONDISK;
-       hammer_unlock(&record->lock);
+       record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
@@ -234,28 +251,6 @@ hammer_flush_record_done(hammer_record_t record, int error)
        hammer_rel_mem_record(record);
 }
 
-/*
- * Clear dependancies associated with a memory record.
- */
-void
-hammer_cleardep_mem_record(struct hammer_record *record)
-{
-       hammer_depend_t depend;
-
-       while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
-               TAILQ_REMOVE(&record->depend_list, depend,
-                            rec_entry);
-               TAILQ_REMOVE(&depend->ip->depend_list, depend,
-                            ip_entry);
-               --depend->ip->depend_count;
-               /* NOTE: inode is not flushed */
-               hammer_rel_inode(depend->ip, 0);
-               hammer_unref(&record->lock);
-               KKASSERT(record->lock.refs > 0);
-               kfree(depend, M_HAMMER);
-       }
-}
-
 /*
  * Release a memory record.  Records marked for deletion are immediately
  * removed from the RB-Tree but otherwise left intact until the last ref
@@ -264,12 +259,21 @@ hammer_cleardep_mem_record(struct hammer_record *record)
 void
 hammer_rel_mem_record(struct hammer_record *record)
 {
+       hammer_inode_t ip, target_ip;
+
        hammer_unref(&record->lock);
 
        if (record->flags & HAMMER_RECF_DELETED_FE) {
                if (record->lock.refs == 0) {
-                       KKASSERT(record->state == HAMMER_FST_IDLE);
-                       KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);
+                       KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
+
+                       ip = record->ip;
+                       if ((target_ip = record->target_ip) != NULL) {
+                               TAILQ_REMOVE(&target_ip->target_list,
+                                            record, target_entry);
+                               record->target_ip = NULL;
+                               hammer_test_inode(target_ip);
+                       }
 
                        if (record->flags & HAMMER_RECF_ONRBTREE) {
                                RB_REMOVE(hammer_rec_rb_tree,
@@ -360,7 +364,7 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data)
 
 #warning "This deadlocks"
 #if 0
-       if (rec->state == HAMMER_FST_FLUSH)
+       if (rec->flush_state == HAMMER_FST_FLUSH)
                hammer_wait_mem_record(rec);
 #endif
 
@@ -509,17 +513,16 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *ip)
 {
        hammer_record_t record;
-       hammer_depend_t depend;
        int error;
        int bytes;
 
        record = hammer_alloc_mem_record(dip);
-       depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);
 
        bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;
 
+       record->type = HAMMER_MEM_RECORD_ADD;
        record->rec.entry.base.base.obj_id = dip->obj_id;
        record->rec.entry.base.base.key =
                hammer_directory_namekey(ncp->nc_name, bytes);
@@ -531,20 +534,24 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
        record->rec.entry.base.data_len = bytes;
        ++ip->ino_rec.ino_nlinks;
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
-       /* NOTE: copies record->data */
 
        /*
-        * If the inode gets synced cause the directory entry
-        * to be synced as well, or vise-versa.
+        * The target inode and the directory entry are bound together.
+        */
+       record->target_ip = ip;
+       record->flush_state = HAMMER_FST_SETUP;
+       TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
+
+       /*
+        * The inode now has a dependancy and must be taken out of the idle
+        * state.  An inode not in an idle state is given an extra reference.
         */
-       hammer_ref(&record->lock);      /* for depend entry */
-       hammer_ref(&ip->lock);          /* for depend entry */
-       depend->ip = ip;
-       depend->record = record;
-       TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
-       TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
-       ++ip->depend_count;
+       if (ip->flush_state == HAMMER_FST_IDLE) {
+               hammer_ref(&ip->lock);
+               ip->flush_state = HAMMER_FST_SETUP;
+       }
 
+       /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
 }
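
Both hammer_ip_add_directory() above and hammer_ip_del_directory() below now
bind the new in-memory record to its target inode the same way.  A minimal
sketch of that shared pattern (the helper is hypothetical; both functions
open-code it):

static void
sketch_bind_record_to_target(hammer_record_t record, hammer_inode_t ip)
{
	/* Tie the directory-entry record and its target inode together. */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * An inode with a dependancy may not remain idle.  A non-idle
	 * inode carries an extra reference so it cannot be reclaimed
	 * while records still point at it.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
	}
}
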
@@ -565,7 +572,6 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
                     struct hammer_inode *ip)
 {
        hammer_record_t record;
-       hammer_depend_t depend;
        int error;
 
        if (cursor->record == &cursor->iprec->rec) {
@@ -585,8 +591,8 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
                        cursor->deadlk_rec = record;
                        error = EDEADLK;
                } else {
+                       KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags |= HAMMER_RECF_DELETED_FE;
-                       hammer_cleardep_mem_record(record);
                        error = 0;
                }
        } else {
@@ -595,24 +601,24 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
                 * the record's key.  This also causes lookups to skip the
                 * record.
                 */
-               depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);
-
                record = hammer_alloc_mem_record(dip);
+               record->type = HAMMER_MEM_RECORD_DEL;
                record->rec.entry.base.base = cursor->record->base.base;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
-               record->flags |= HAMMER_RECF_DELETE_ONDISK;
+
+               record->target_ip = ip;
+               record->flush_state = HAMMER_FST_SETUP;
+               TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
 
                /*
-                * If the inode gets synced cause the directory entry
-                * to be synced as well, or vise-versa.
+                * The inode now has a dependancy and must be taken out of
+                * the idle state.  An inode not in an idle state is given
+                * an extra reference.
                 */
-               hammer_ref(&ip->lock);          /* for depend entry */
-               hammer_ref(&record->lock);      /* for depend entry */
-               depend->ip = ip;
-               depend->record = record;
-               TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
-               TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
-               ++ip->depend_count;
+               if (ip->flush_state == HAMMER_FST_IDLE) {
+                       hammer_ref(&ip->lock);
+                       ip->flush_state = HAMMER_FST_SETUP;
+               }
 
                error = hammer_mem_add(trans, record);
        }
@@ -666,6 +672,7 @@ hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
        record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
 
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
+
        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
@@ -788,8 +795,6 @@ done:
  * Sync an in-memory record to the disk.  This is called by the backend.
  * This code is responsible for actually writing a record out to the disk.
  *
- * Any inode dependancies will queue the inode to the backend.
- *
  * This routine can only be called by the backend and the record
  * must have been interlocked with BE.  It will remain interlocked on
  * return.  The caller is responsible for the record's disposition.
@@ -800,35 +805,13 @@ hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
-       hammer_depend_t depend;
        hammer_off_t rec_offset;
        void *bdata;
        int error;
 
-       KKASSERT(record->state == HAMMER_FST_FLUSH);
+       KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
 
-       /*
-        * XXX A record with a dependancy is typically a directory record.
-        * The related inode must also be synchronized.  This code is not
-        * currently synchronizing the inode atomically. XXX
-        *
-        * XXX Additional dependancies from the frontend might be added while
-        * the backend is syncing the record?
-        */
-       while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
-               TAILQ_REMOVE(&record->depend_list, depend, rec_entry);
-               TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry);
-               --depend->ip->depend_count;
-               kprintf("S");
-               KKASSERT((depend->ip->flags & HAMMER_INODE_NEW) == 0);
-               hammer_flush_inode(depend->ip, 0);
-               hammer_rel_inode(depend->ip, 0);
-               hammer_unref(&record->lock);
-               KKASSERT(record->lock.refs > 0);
-               kfree(depend, M_HAMMER);
-       }
-
 retry:
        /*
         * Get a cursor, we will either be inserting or deleting.
@@ -842,7 +825,7 @@ retry:
        /*
         * If we are deleting an exact match must be found on-disk.
         */
-       if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
+       if (record->type == HAMMER_MEM_RECORD_DEL) {
                error = hammer_btree_lookup(&cursor);
                if (error == 0)
                        error = hammer_ip_delete_record(&cursor, trans->tid);
@@ -953,12 +936,19 @@ retry:
         * still sync the record to the media as if it were not deleted,
         * but must interlock with the frontend to ensure that the 
         * synchronized record is not visible to the frontend, which means
-        * converted the 'deleted' record to a delete-on-disk record.
+        * converting it from an ADD record to a DEL record.
+        *
+        * The DEL record then masks the record synced to disk until another
+        * round can delete it for real.
         */
-       if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK)) {
-                KKASSERT((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0);
-                record->flags |= HAMMER_RECF_DELETE_ONDISK;
+       if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) {
+               KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                 record->flags &= ~HAMMER_RECF_DELETED_FE;
+               record->type = HAMMER_MEM_RECORD_DEL;
+               if (record->flush_state == HAMMER_FST_SETUP) {
+                       hammer_test_inode(record->ip);
+                       hammer_test_inode(record->target_ip);
+               }
        }
 
        /*
@@ -1035,7 +1025,6 @@ hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
        while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
                        record->flags |= HAMMER_RECF_DELETED_FE;
-                       KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);
                        hammer_rel_mem_record(record);
                        return (EEXIST);
                }
@@ -1268,17 +1257,26 @@ next_memory:
                }
 
                /*
-                * If the entries match the memory entry must specify
-                * an on-disk deletion.  Eat both entries unless the
-                * caller wants visibility into the special records.
+                * If the entries match exactly the memory entry typically
+                * specifies an on-disk deletion and we eat both entries.
+                *
+                * If the in-memory record is not an on-disk deletion we
+                * probably caught the syncer while it was syncing it to
+                * the media.  Since we hold a shared lock on the cursor,
+                * the in-memory record had better be marked deleted at
+                * this point.
                 */
                if (r == 0) {
-                       KKASSERT(cursor->iprec->flags & 
-                                HAMMER_RECF_DELETE_ONDISK);
-                       if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
-                               cursor->flags |= HAMMER_CURSOR_ATEDISK;
+                       if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
+                               if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
+                                       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+                                       cursor->flags |= HAMMER_CURSOR_ATEMEM;
+                                       goto next_btree;
+                               }
+                       } else {
+                               KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0);
                                cursor->flags |= HAMMER_CURSOR_ATEMEM;
-                               goto next_btree;
+                               goto next_memory;
                        }
                }
                /* fall through to the memory entry */
@@ -1290,7 +1288,7 @@ next_memory:
                 */
                cursor->record = &cursor->iprec->rec;
                cursor->flags |= HAMMER_CURSOR_ATEMEM;
-               if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
+               if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
                        if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
                                goto next_memory;
                }
@@ -1487,8 +1485,9 @@ retry:
 }
 
 /*
- * Delete all records associated with an inode except the inode record
- * itself.
+ * Delete all user records associated with an inode except the inode record
+ * itself.  Directory entries are not deleted (they must be properly disposed
+ * of or nlinks would get upset).
  */
 int
 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
@@ -1535,8 +1534,12 @@ retry:
                 * data if the retention policy dictates.  The function
                 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
                 * uses to perform a fixup.
+                *
+                * Directory entries (and delete-on-disk directory entries)
+                * must be synced and cannot be deleted.
                 */
-               error = hammer_ip_delete_record(&cursor, trans->tid);
+               if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY)
+                       error = hammer_ip_delete_record(&cursor, trans->tid);
                if (error)
                        break;
                error = hammer_ip_next(&cursor);
@@ -1574,14 +1577,11 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
         * only occurs in range iterations since all other records are
         * individually synchronized.  Thus there should be no confusion with
         * the interlock.
-        *
-        * 
         */
        if (cursor->record == &cursor->iprec->rec) {
                KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
                cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
                cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
-               hammer_cleardep_mem_record(cursor->iprec);
                return(0);
        }
 
@@ -1690,15 +1690,37 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
 }
 
 /*
- * Determine whether a directory is empty or not.  Returns 0 if the directory
- * is empty, ENOTEMPTY if it isn't, plus other possible errors.
+ * Determine whether we can remove a directory.  This routine checks whether
+ * a directory is empty or not and enforces flush connectivity.
+ *
+ * Flush connectivity requires that we block if the target directory is
+ * currently flushing, otherwise it may not end up in the same flush group.
+ *
+ * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
  */
 int
-hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
+hammer_ip_check_directory_empty(hammer_transaction_t trans,
+                       hammer_cursor_t parent_cursor, hammer_inode_t ip)
 {
        struct hammer_cursor cursor;
        int error;
 
+#if 0
+       /*
+        * Check flush connectivity
+        */
+       if (ip->flush_state != HAMMER_FST_IDLE) {
+               kprintf("FWAIT\n");
+               hammer_done_cursor(parent_cursor);
+               hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
+               hammer_wait_inode(ip);
+               return (EDEADLK);
+       }
+#endif
+
+       /*
+        * Check directory empty
+        */
        hammer_init_cursor(trans, &cursor, &ip->cache[0]);
 
        cursor.key_beg.obj_id = ip->obj_id;
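
hammer_ip_check_directory_empty() now takes the caller's cursor so it can
tear it down (see the #if 0 flush-connectivity block above) and return
EDEADLK; hammer_ip_del_directory() follows the same convention.  A rough
sketch of the caller side, with the directory-entry lookup omitted and the
helper name hypothetical:

static int
sketch_can_remove_directory(hammer_transaction_t trans, hammer_inode_t dip,
			    hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	int error;

retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[0]);
	/* (the B-Tree lookup of the entry being removed is omitted) */
	error = hammer_ip_check_directory_empty(trans, &cursor, ip);
	hammer_done_cursor(&cursor);	/* ok even if the callee already did */
	if (error == EDEADLK)
		goto retry;
	return(error);
}

Because hammer_done_cursor() is explicitly safe to call twice, the caller
does not need to know whether the callee already terminated the cursor
before returning EDEADLK.
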
index 012e599..5541c20 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.7 2008/04/29 01:10:37 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.8 2008/05/02 01:00:42 dillon Exp $
  */
 
 /*
@@ -97,6 +97,8 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
        /* no undo recursion */
        hammer_modify_volume(NULL, root_volume, NULL, 0);
 
+       kprintf("u");
+
 again:
        /*
         * Allocate space in the FIFO
@@ -104,6 +106,8 @@ again:
        bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) +
                sizeof(struct hammer_fifo_undo) +
                sizeof(struct hammer_fifo_tail);
+       if (hammer_undo_space(trans->hmp) < bytes + HAMMER_BUFSIZE*2)
+               panic("hammer: insufficient undo FIFO space!");
 
        next_offset = undomap->next_offset;
 
@@ -188,3 +192,34 @@ again:
        return(error);
 }
 
+int64_t
+hammer_undo_space(hammer_mount_t hmp)
+{
+       hammer_blockmap_t rootmap;
+       int64_t bytes;
+       int64_t max_bytes;
+
+       rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+
+       if (rootmap->first_offset <= rootmap->next_offset) {
+               bytes = (int)(rootmap->next_offset - rootmap->first_offset);
+       } else {
+               bytes = (int)(rootmap->alloc_offset - rootmap->first_offset +
+                             rootmap->next_offset);
+       }
+       max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK);
+       return(max_bytes - bytes);
+}
+
+int64_t
+hammer_undo_max(hammer_mount_t hmp)
+{
+       hammer_blockmap_t rootmap;
+       int64_t max_bytes;
+
+       rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+       max_bytes = (int)(rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK);
+
+       return(max_bytes);
+}
+
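
The new hammer_undo_space() treats the undo zone as a ring: if first_offset
has not wrapped past next_offset the live region is the simple difference,
otherwise it wraps around the end of the zone.  A standalone worked example
of the same arithmetic using plain byte offsets (the real code works on
encoded hammer_off_t values and masks out the zone bits with
HAMMER_OFF_SHORT_MASK):

#include <stdint.h>
#include <stdio.h>

/*
 * Free space in a ring of max_bytes, given the oldest live undo offset
 * (first) and the append offset (next).  Mirrors hammer_undo_space().
 */
static int64_t
undo_free(int64_t first, int64_t next, int64_t max_bytes)
{
	int64_t used;

	if (first <= next)
		used = next - first;			/* no wrap */
	else
		used = (max_bytes - first) + next;	/* wrapped */
	return (max_bytes - used);
}

int
main(void)
{
	/* 1MB undo zone; live region wraps: first=768K, next=512K */
	printf("%jd\n", (intmax_t)undo_free(768 * 1024, 512 * 1024,
	    1024 * 1024));
	return (0);
}

With 768K of live undo in a 1MB zone this prints 262144; the new check in
hammer_generate_undo() panics once the remaining space drops below the
requested allocation plus two buffers of slack.
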
index 50f16f6..351a863 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.30 2008/04/29 01:10:37 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.31 2008/05/02 01:00:42 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -61,6 +61,7 @@ int hammer_count_buffers;
 int hammer_count_nodes;
 int hammer_count_dirtybufs;            /* global */
 int hammer_limit_dirtybufs = 100;      /* per-mount */
+int hammer_bio_count;
 int64_t hammer_contention_count;
 int64_t hammer_zone_limit;
 
index ad311a6..bc48c36 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.42 2008/04/27 21:07:15 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.43 2008/05/02 01:00:42 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -503,7 +503,6 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap)
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
-       hammer_finalize_inode(&trans, nip, error);
        if (error)
                kprintf("hammer_ip_add_directory error %d\n", error);
        hammer_unlock(&dip->lock);
@@ -877,7 +876,6 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
-       hammer_finalize_inode(&trans, nip, error);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);
        if (error)
@@ -946,7 +944,6 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap)
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
-       hammer_finalize_inode(&trans, nip, error);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);
 
@@ -1584,7 +1581,6 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
         */
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);
-       error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
 
        /*
         * Add a record representing the symlink.  symlink stores the link
@@ -1609,7 +1605,8 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
                        hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY);
                }
        }
-       hammer_finalize_inode(&trans, nip, error);
+       if (error == 0)
+               error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);
 
@@ -1903,13 +1900,13 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
         * records in the database.
         */
        BUF_KERNPROC(bp);
-       if (ip->flush_state == HAMMER_FST_FLUSH)
+       if (ip->flags & HAMMER_INODE_WRITE_ALT)
                TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
        else
                TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
-       hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY);
-       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
-       kprintf("a");
+       ++hammer_bio_count;
+       hammer_modify_inode(NULL, ip, HAMMER_INODE_BUFS);
+       hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
        return(0);
 }
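
The write path above now defers everything to the flusher: the BIO is parked
on one of two per-inode queues depending on HAMMER_INODE_WRITE_ALT and the
inode is signalled for flush.  A sketch of that decision (the queue choice
and accounting are taken from the diff; how WRITE_ALT itself is toggled is
not shown in this commit and is left out here):

static void
sketch_queue_strategy_write(hammer_inode_t ip, struct bio *bio)
{
	if (ip->flags & HAMMER_INODE_WRITE_ALT)
		TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
	else
		TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);

	++hammer_bio_count;		/* balanced in hammer_dowrite() */
	hammer_modify_inode(NULL, ip, HAMMER_INODE_BUFS);
	hammer_flush_inode(ip, HAMMER_FLUSH_FORCE | HAMMER_FLUSH_SIGNAL);
}
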
 
@@ -1980,6 +1977,7 @@ hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio)
                bp->b_resid = 0;
        }
        biodone(bio);
+       --hammer_bio_count;
        return(error);
 }
 
@@ -2063,13 +2061,26 @@ retry:
                        kprintf("obj_id %016llx\n", rec->entry.obj_id);
                        Debugger("ENOENT unlinking object that should exist");
                }
+
+               /*
+                * If we are trying to remove a directory the directory must
+                * be empty.
+                *
+                * WARNING: hammer_ip_check_directory_empty() may have to
+                * terminate the cursor to avoid a deadlock.  It is ok to
+                * call hammer_done_cursor() twice.
+                */
                if (error == 0 && ip->ino_rec.base.base.obj_type ==
                                  HAMMER_OBJTYPE_DIRECTORY) {
-                       error = hammer_ip_check_directory_empty(trans, ip);
+                       error = hammer_ip_check_directory_empty(trans, &cursor,
+                                                               ip);
                }
+
                /*
+                * Delete the directory entry.
+                *
                 * WARNING: hammer_ip_del_directory() may have to terminate
-                * the cursor to avoid a lock recursion.  It's ok to call
+                * the cursor to avoid a deadlock.  It is ok to call
                 * hammer_done_cursor() twice.
                 */
                if (error == 0) {