HAMMER 38D/Many: Undo/Synchronization and crash recovery
author: Matthew Dillon <dillon@dragonflybsd.org>
Sat, 26 Apr 2008 02:54:00 +0000 (02:54 +0000)
committer: Matthew Dillon <dillon@dragonflybsd.org>
Sat, 26 Apr 2008 02:54:00 +0000 (02:54 +0000)
* The flusher now waits for I/O to complete at the appropriate points.

* Implement instant crash recovery.  The UNDO FIFO is scanned backwards
  and reapplied to the filesystem on mount.  There is still more work
  to do here, inode<->inode associations (e.g. directory entry vs file)
  are not yet bound together.

* Clean up I/O sequencing a lot and get rid of a ton of unnecessary flusher
  wakeups.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_recover.c
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

index af82916..83ce4a3 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.49 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.50 2008/04/26 02:54:00 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -231,6 +231,9 @@ typedef struct hammer_inode *hammer_inode_t;
 
 #define HAMMER_MAX_INODE_CURSORS       4
 
+#define HAMMER_FLUSH_SIGNAL    0x0001
+#define HAMMER_FLUSH_FORCE     0x0002
+
 /*
  * Structure used to represent an unsynchronized record in-memory.  This
  * structure is orgranized in a per-inode RB-tree.  If the inode is not
@@ -468,6 +471,7 @@ struct hammer_mount {
        int     flusher_seq;
        int     flusher_act;
        int     flusher_exiting;
+       int     reclaim_count;
        thread_t flusher_td;
        u_int   check_interrupt;
        uuid_t  fsid;
@@ -478,6 +482,7 @@ struct hammer_mount {
        struct hammer_io_list meta_list;        /* dirty meta bufs    */
        struct hammer_io_list lose_list;        /* loose buffers      */
        int     locked_dirty_count;             /* meta/volu count    */
+       int     io_running_count;
        hammer_tid_t asof;
        hammer_off_t next_tid;
        u_int32_t namekey_iterator;
@@ -685,7 +690,7 @@ void hammer_done_transaction(struct hammer_transaction *trans);
 
 void hammer_modify_inode(struct hammer_transaction *trans,
                        hammer_inode_t ip, int flags);
-void hammer_flush_inode(hammer_inode_t ip, int forceit);
+void hammer_flush_inode(hammer_inode_t ip, int flags);
 void hammer_flush_inode_done(hammer_inode_t ip);
 void hammer_wait_inode(hammer_inode_t ip);
 
@@ -745,6 +750,8 @@ void hammer_flusher_destroy(hammer_mount_t hmp);
 void hammer_flusher_sync(hammer_mount_t hmp);
 void hammer_flusher_async(hammer_mount_t hmp);
 
+int hammer_recover(hammer_mount_t hmp, hammer_volume_t rootvol);
+
 #endif
 
 static __inline void
index 332fb36..451695c 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.3 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.4 2008/04/26 02:54:00 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -53,17 +53,21 @@ hammer_flusher_sync(hammer_mount_t hmp)
 {
        int seq;
 
-       seq = ++hmp->flusher_seq;
-       wakeup(&hmp->flusher_seq);
-       while ((int)(seq - hmp->flusher_act) > 0)
-               tsleep(&hmp->flusher_act, 0, "hmrfls", 0);
+       if (hmp->flusher_td) {
+               seq = ++hmp->flusher_seq;
+               wakeup(&hmp->flusher_seq);
+               while ((int)(seq - hmp->flusher_act) > 0)
+                       tsleep(&hmp->flusher_act, 0, "hmrfls", 0);
+       }
 }
 
 void
 hammer_flusher_async(hammer_mount_t hmp)
 {
-       ++hmp->flusher_seq;
-       wakeup(&hmp->flusher_seq);
+       if (hmp->flusher_td) {
+               ++hmp->flusher_seq;
+               wakeup(&hmp->flusher_seq);
+       }
 }
 
 void
@@ -76,11 +80,13 @@ hammer_flusher_create(hammer_mount_t hmp)
 void
 hammer_flusher_destroy(hammer_mount_t hmp)
 {
-       hmp->flusher_exiting = 1;
-       ++hmp->flusher_seq;
-       wakeup(&hmp->flusher_seq);
-       while (hmp->flusher_td)
-               tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0);
+       if (hmp->flusher_td) {
+               hmp->flusher_exiting = 1;
+               ++hmp->flusher_seq;
+               wakeup(&hmp->flusher_seq);
+               while (hmp->flusher_td)
+                       tsleep(&hmp->flusher_exiting, 0, "hmrwex", 0);
+       }
 }
 
 static void
@@ -122,7 +128,6 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
                TAILQ_REMOVE(io->mod_list, io, mod_entry);
                io->mod_list = NULL;
                hammer_ref(&io->lock);
-               kprintf("DELETE LOOSE %p\n", io);
                buffer = (void *)io;
                hammer_rel_buffer(buffer, 0);
        }
@@ -144,6 +149,9 @@ hammer_flusher_flush(hammer_mount_t hmp)
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        start_offset = rootmap->next_offset;
 
+       if (hammer_debug_general & 0x00010000)
+               kprintf("x");
+
        while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
                TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
 
@@ -177,8 +185,6 @@ hammer_flusher_finalize(hammer_mount_t hmp, hammer_volume_t root_volume,
        hammer_blockmap_t rootmap;
        hammer_io_t io;
 
-       kprintf("FINALIZE %d\n", hmp->locked_dirty_count);
-
        /*
         * Flush undo bufs
         */
@@ -202,26 +208,34 @@ hammer_flusher_finalize(hammer_mount_t hmp, hammer_volume_t root_volume,
        }
 
        /*
-        * XXX wait for I/O's to complete
+        * Wait for I/O to complete
         */
+       crit_enter();
+       while (hmp->io_running_count) {
+               kprintf("WAIT1 %d\n", hmp->io_running_count);
+               tsleep(&hmp->io_running_count, 0, "hmrfl1", 0);
+       }
+       crit_exit();
 
        /*
         * Update the volume header
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        if (rootmap->first_offset != start_offset) {
-               kprintf("FINALIZE: ACTIVE VOLUME STAGE 1\n");
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                rootmap->first_offset = start_offset;
                hammer_modify_volume_done(root_volume);
                hammer_io_flush(&root_volume->io);
-       } else {
-               kprintf("FINALIZE: ACTIVE VOLUME STAGE 2\n");
        }
 
        /*
-        * XXX wait for I/O to complete
+        * Wait for I/O to complete
         */
+       crit_enter();
+       while (hmp->io_running_count) {
+               tsleep(&hmp->io_running_count, 0, "hmrfl2", 0);
+       }
+       crit_exit();
 
        /*
         * Flush meta-data
index 3f5b7b7..56f7be0 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.37 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.38 2008/04/26 02:54:00 dillon Exp $
  */
 
 #include "hammer.h"
@@ -422,7 +422,7 @@ retry:
 
                if (error == 0) {
                        error = hammer_ip_delete_record(&cursor, trans->tid);
-                       if (error) {
+                       if (error && error != EDEADLK) {
                                kprintf("error %d\n", error);
                                Debugger("hammer_update_inode2");
                        }
@@ -471,6 +471,15 @@ retry:
                        }
                }
        }
+       if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
+               /*
+                * Clean out any left-over flags if the inode has been
+                * destroyed.
+                */
+               ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
+                                   HAMMER_INODE_DDIRTY |
+                                   HAMMER_INODE_ITIMES);
+       }
        return(error);
 }
 
@@ -537,28 +546,56 @@ retry:
 void
 hammer_rel_inode(struct hammer_inode *ip, int flush)
 {
-       if (ip->lock.refs == 1) {
+       /*
+        * Handle disposition when dropping the last ref.
+        */
+       while (ip->lock.refs == 1) {
                if (curthread == ip->hmp->flusher_td) {
                        /*
-                        * We are the flusher, actually dispose of the inode.
-                        * The unload routine inherits our (last) reference.
+                        * We are the flusher, do any required flushes
+                        * before unloading the inode.
                         */
+                       int error = 0;
+
                        KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
-                       KKASSERT(ip->cursor_ip_refs == 0);
+                       while (error == 0 &&
+                              (ip->flags & HAMMER_INODE_MODMASK)) {
+                               hammer_ref(&ip->lock);
+                               hammer_flush_inode_copysync(ip);
+                               error = hammer_sync_inode(ip, 1);
+                               hammer_flush_inode_done(ip);
+                       }
+                       if (error)
+                               kprintf("hammer_sync_inode failed error %d\n",
+                                       error);
+                       if (ip->lock.refs > 1)
+                               continue;
                        hammer_unload_inode(ip, (void *)MNT_NOWAIT);
+                       return;
+               }
+               if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
+                       hammer_unload_inode(ip, (void *)MNT_NOWAIT);
+                       return;
+               }
+
+               /*
+                * Hand the inode over to the flusher, which will
+                * add another ref to it.
+                */
+               if (++ip->hmp->reclaim_count > 256) {
+                       ip->hmp->reclaim_count = 0;
+                       hammer_flush_inode(ip, HAMMER_FLUSH_FORCE |
+                                               HAMMER_FLUSH_SIGNAL);
                } else {
-                       /*
-                        * flush_list inherits our last reference.
-                        *
-                        * Only the flusher can actually destroy the inode,
-                        * there had better still be a ref on it if we aren't
-                        * it.
-                        */
-                       hammer_flush_inode(ip, 1);
-                       KKASSERT(ip->lock.refs > 1);
-                       hammer_unref(&ip->lock);
+                       hammer_flush_inode(ip, HAMMER_FLUSH_FORCE);
                }
-       } else if (flush && ip->flush_state == HAMMER_FST_IDLE &&
+               /* retry */
+       }
+
+       /*
+        * Inode still has multiple refs
+        */
+       if (flush && ip->flush_state == HAMMER_FST_IDLE &&
                   curthread != ip->hmp->flusher_td) {
                /*
                 * Flush requested, make the inode visible to the flusher.
@@ -589,32 +626,25 @@ hammer_rel_inode(struct hammer_inode *ip, int flush)
 static int
 hammer_unload_inode(struct hammer_inode *ip, void *data)
 {
-       int error;
 
        KASSERT(ip->lock.refs == 1,
                ("hammer_unload_inode: %d refs\n", ip->lock.refs));
        KKASSERT(ip->vp == NULL);
+       KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
+       KKASSERT(ip->cursor_ip_refs == 0);
+       KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
 
-       do {
-               hammer_flush_inode_copysync(ip);
-               error = hammer_sync_inode(ip, 1);
-       } while (error == 0 && (ip->flags & HAMMER_INODE_MODMASK));
+       KKASSERT(RB_EMPTY(&ip->rec_tree));
+       KKASSERT(TAILQ_EMPTY(&ip->bio_list));
+       KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
+
+       RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);
+
+       hammer_uncache_node(&ip->cache[0]);
+       hammer_uncache_node(&ip->cache[1]);
+       --hammer_count_inodes;
+       kfree(ip, M_HAMMER);
 
-       if (error)
-               kprintf("hammer_sync_inode failed error %d\n", error);
-       if (ip->lock.refs == 1) {
-               KKASSERT(RB_EMPTY(&ip->rec_tree));
-               KKASSERT(TAILQ_EMPTY(&ip->bio_list));
-               KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
-               RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);
-
-               hammer_uncache_node(&ip->cache[0]);
-               hammer_uncache_node(&ip->cache[1]);
-               --hammer_count_inodes;
-               kfree(ip, M_HAMMER);
-       } else {
-               hammer_flush_inode_done(ip);
-       }
        return(0);
 }
 
@@ -650,7 +680,7 @@ hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
  * troublesome because some dirty buffers may not have been queued yet.
  */
 void
-hammer_flush_inode(hammer_inode_t ip, int forceit)
+hammer_flush_inode(hammer_inode_t ip, int flags)
 {
        if (ip->flush_state != HAMMER_FST_IDLE &&
            (ip->flags & HAMMER_INODE_MODMASK)) {
@@ -658,17 +688,20 @@ hammer_flush_inode(hammer_inode_t ip, int forceit)
                return;
        }
        hammer_lock_ex(&ip->lock);
-       if (ip->flush_state == HAMMER_FST_IDLE &&
-           ((ip->flags & HAMMER_INODE_MODMASK) || forceit)) {
-               hammer_ref(&ip->lock);
+       if (ip->flush_state == HAMMER_FST_IDLE) {
+               if ((ip->flags & HAMMER_INODE_MODMASK) ||
+                   (flags & HAMMER_FLUSH_FORCE)) {
+                       hammer_ref(&ip->lock);
 
-               hammer_flush_inode_copysync(ip);
-               /*
-                * Move the inode to the flush list and add a ref to it
-                * representing it on the list.
-                */
-               TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
-               hammer_flusher_async(ip->hmp);
+                       hammer_flush_inode_copysync(ip);
+                       /*
+                        * Move the inode to the flush list and add a ref to
+                        * it representing it on the list.
+                        */
+                       TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
+                       if (flags & HAMMER_FLUSH_SIGNAL)
+                               hammer_flusher_async(ip->hmp);
+               }
        }
        hammer_unlock(&ip->lock);
 }
@@ -780,6 +813,7 @@ hammer_flush_inode_done(hammer_inode_t ip)
        while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
                TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
                TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
+               ip->flags |= HAMMER_INODE_XDIRTY;
                ip->flags |= HAMMER_INODE_REFLUSH;
                kprintf("rebio %p ip %p @%016llx,%d\n", bio, ip, bio->bio_offset, bio->bio_buf->b_bufsize);
        }
@@ -790,7 +824,6 @@ hammer_flush_inode_done(hammer_inode_t ip)
         */
        if (ip->flags & HAMMER_INODE_REFLUSH) {
                ip->flags &= ~HAMMER_INODE_REFLUSH;
-               kprintf("reflush %p\n", ip);
                hammer_flush_inode(ip, 0);
        } else {
                if (ip->flags & HAMMER_INODE_FLUSHW) {
index 2384a56..8f5c382 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.26 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.27 2008/04/26 02:54:00 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -383,6 +383,7 @@ hammer_io_flush(struct hammer_io *io)
         */
        io->released = 1;
        io->running = 1;
+       ++io->hmp->io_running_count;
        bawrite(bp);
 }
 
@@ -562,12 +563,18 @@ hammer_io_complete(struct buf *bp)
 
        KKASSERT(iou->io.released == 1);
 
+       if (iou->io.running) {
+               if (--iou->io.hmp->io_running_count == 0)
+                       wakeup(&iou->io.hmp->io_running_count);
+               KKASSERT(iou->io.hmp->io_running_count >= 0);
+               iou->io.running = 0;
+       }
+
        /*
         * If no lock references remain and we can acquire the IO lock and
         * someone at some point wanted us to flush (B_LOCKED test), then
         * try to dispose of the IO.
         */
-       iou->io.running = 0;
        if (iou->io.waiting) {
                iou->io.waiting = 0;
                wakeup(iou);
@@ -617,7 +624,6 @@ hammer_io_deallocate(struct buf *bp)
                hammer_io_disassociate(iou, 0);
                if (iou->io.bp == NULL && 
                    iou->io.type != HAMMER_STRUCTURE_VOLUME) {
-                       kprintf("ADD LOOSE %p\n", &iou->io);
                        KKASSERT(iou->io.mod_list == NULL);
                        iou->io.mod_list = &iou->io.hmp->lose_list;
                        TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry);
@@ -685,6 +691,13 @@ hammer_io_checkwrite(struct buf *bp)
                io->mod_list = NULL;
                io->modified = 0;
        }
+
+       /*
+        * The kernel is going to start the IO, set io->running.
+        */
+       KKASSERT(io->running == 0);
+       io->running = 1;
+       ++io->hmp->io_running_count;
        return(0);
 }
 
index 681231a..e50cd08 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.42 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.43 2008/04/26 02:54:00 dillon Exp $
  */
 
 #include "hammer.h"
@@ -723,10 +723,8 @@ retry:
         */
        if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
                error = hammer_btree_lookup(&cursor);
-               kprintf("DELETE MEM ENTRY1 %d\n", error);
                if (error == 0)
                        error = hammer_ip_delete_record(&cursor, trans->tid);
-               kprintf("DELETE MEM ENTRY2 %d\n", error);
                if (error == 0)
                        record->flags |= HAMMER_RECF_DELETED_FE;
                goto done;
@@ -1154,7 +1152,6 @@ next_memory:
                        if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
                                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                                cursor->flags |= HAMMER_CURSOR_ATEMEM;
-                               kprintf("SKIP MEM ENTRY\n");
                                goto next_btree;
                        }
                }
index 39ca421..b5cc2a8 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.38 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.39 2008/04/26 02:54:00 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -496,13 +496,25 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
        zoneX_offset = buf_offset;
        zone = HAMMER_ZONE_DECODE(buf_offset);
 
-       if (zone == HAMMER_ZONE_LARGE_DATA_INDEX ||
-           zone == HAMMER_ZONE_SMALL_DATA_INDEX) {
+       /*
+        * What is the buffer class?
+        */
+       switch(zone) {
+       case HAMMER_ZONE_LARGE_DATA_INDEX:
+       case HAMMER_ZONE_SMALL_DATA_INDEX:
                iotype = HAMMER_STRUCTURE_DATA_BUFFER;
-       } else {
+               break;
+       case HAMMER_ZONE_UNDO_INDEX:
+               iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
+               break;
+       default:
                iotype = HAMMER_STRUCTURE_META_BUFFER;
+               break;
        }
 
+       /*
+        * Handle blockmap offset translations
+        */
        if (zone >= HAMMER_ZONE_BTREE_INDEX) {
                buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
                KKASSERT(*errorp == 0);
@@ -510,6 +522,10 @@ hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
                buf_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
                KKASSERT(*errorp == 0);
        }
+
+       /*
+        * Locate the buffer given its zone-2 offset.
+        */
        buf_offset &= ~HAMMER_BUFMASK64;
        KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
                 HAMMER_ZONE_RAW_BUFFER);
@@ -715,6 +731,7 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush)
        hammer_unref(&buffer->io.lock);
        crit_exit();
        if (freeme) {
+               KKASSERT(buffer->io.mod_list == NULL);
                --hammer_count_buffers;
                kfree(buffer, M_HAMMER);
        }
index 1a8cbcd..a410416 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.9 2008/03/18 05:19:16 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.10 2008/04/26 02:54:00 dillon Exp $
  */
 
 #include "hammer.h"
 
-#if 0
-
-static int hammer_recover_buffer_stage2(hammer_cluster_t cluster,
-                               int32_t buf_no);
-static int hammer_recover_record(hammer_cluster_t cluster,
-                               hammer_buffer_t buffer, int32_t rec_offset,
-                               hammer_record_ondisk_t rec);
-static int hammer_recover_btree(hammer_cluster_t cluster,
-                               hammer_buffer_t buffer, int32_t rec_offset,
-                               hammer_record_ondisk_t rec);
+static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
+                       hammer_off_t end_off);
+static void hammer_recover_copy_undo(hammer_off_t undo_offset,
+                       char *src, char *dst, int bytes);
+static void hammer_recover_debug_dump(int w, char *buf, int bytes);
+static int hammer_recover_undo(hammer_mount_t hmp, hammer_fifo_undo_t undo,
+                       int bytes);
 
 /*
- * Recover a cluster.  The caller has referenced and locked the cluster.
- * 
- * Generally returns 0 on success and EIO if the recovery was unsuccessful.
- *
- * WARNING!  The cluster being recovered must not have any cached buffers
- * (and hence no cached b-tree nodes).  Any cached nodes will become seriously
- * corrupted since we rip it all up and regenerate the B-Tree.
+ * Recover a filesystem on mount
  */
 int
-hammer_recover(hammer_cluster_t cluster)
+hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
 {
-       int buf_no;
-       int rec_no;
-       int maxblk;
-       int nbuffers;
-       int buffer_count;
-       int record_count;
-
-       kprintf("HAMMER_RECOVER %d:%d\n",
-               cluster->volume->vol_no, cluster->clu_no);
-       /*Debugger("RECOVER");*/
-       KKASSERT(cluster->ondisk->synchronized_rec_id);
-       if (RB_ROOT(&cluster->rb_bufs_root)) {
-               panic("hammer_recover: cluster %d:%d has cached buffers!",
-                       cluster->volume->vol_no,
-                       cluster->clu_no);
-       }
-
-       if (hammer_alist_find(&cluster->volume->alist, cluster->clu_no,
-                             cluster->clu_no + 1, 0) != cluster->clu_no) {
-               Debugger("hammer_recover: cluster not allocated!");
-       }
-
-       nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
-       hammer_modify_cluster(cluster);
-
-       /*
-        * Clear statistics.
-        */
-       cluster->ondisk->stat_inodes = 0;
-       cluster->ondisk->stat_records = 0;
-       cluster->ondisk->stat_data_bufs = 0;
-       cluster->ondisk->stat_rec_bufs = 0;
-       cluster->ondisk->stat_idx_bufs = 0;
-
-       /*
-        * Reset allocation heuristics.
-        */
-       cluster->ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
-       cluster->ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
-       cluster->ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
+       hammer_blockmap_t rootmap;
+       hammer_buffer_t buffer;
+       hammer_off_t scan_offset;
+       hammer_off_t bytes;
+       hammer_fifo_tail_t tail;
+       hammer_fifo_undo_t undo;
+       int error;
 
        /*
-        * Re-initialize the master, B-Tree, and mdata A-lists, and
-        * recover the record A-list.
+        * Examine the UNDO FIFO.  If it is empty the filesystem is clean
+        * and no action need be taken.
         */
-       hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
-                         HAMMER_ASTATE_FREE);
-       hammer_alist_init(&cluster->alist_btree,
-                         HAMMER_FSBUF_MAXBLKS,
-                         (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
-                         HAMMER_ASTATE_ALLOC);
-       hammer_alist_init(&cluster->alist_mdata,
-                         HAMMER_FSBUF_MAXBLKS,
-                         (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
-                         HAMMER_ASTATE_ALLOC);
-       hammer_alist_recover(&cluster->alist_record,
-                         0,
-                         HAMMER_FSBUF_MAXBLKS,
-                         (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS);
-       kprintf("\n");
+       rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
+       if (rootmap->first_offset == rootmap->next_offset)
+               return(0);
 
-       kprintf("hammer_recover(1): cluster_free %d\n",
-               cluster->alist_master.meta->bm_alist_freeblks);
+       /*
+        * If the FIFO has wrapped, the UNDO space in use runs from
+        * first_offset to the end of the allocation and then from the
+        * start of the zone to next_offset; otherwise it is the simple
+        * difference.  (Without the else the wrap-case result was
+        * unconditionally overwritten by the second assignment.)
+        */
+       if (rootmap->next_offset < rootmap->first_offset) {
+               bytes = rootmap->alloc_offset - rootmap->first_offset +
+                       rootmap->next_offset;
+       } else {
+               bytes = (rootmap->next_offset - rootmap->first_offset);
+       }
+       kprintf("HAMMER(%s) Start Recovery (%lld bytes of UNDO)\n",
+               root_volume->ondisk->vol_name, bytes);
 
        /*
-        * The cluster is now in good enough shape that general allocations
-        * are possible.  Construct an empty B-Tree root.
+        * Scan the UNDOs backwards.
         */
-       {
-               hammer_node_t croot;
-               int error;
+       scan_offset = rootmap->next_offset;
+       buffer = NULL;
+       if (scan_offset > rootmap->alloc_offset) {
+               kprintf("HAMMER(%s) UNDO record at %016llx FIFO overflow\n",
+                       root_volume->ondisk->vol_name,
+                       scan_offset);
+               error = EIO;
+               goto failed;
+       }
 
-               croot = hammer_alloc_btree(cluster, &error);
-               if (error == 0) {
-                       hammer_modify_node_noundo(croot);
-                       bzero(croot->ondisk, sizeof(*croot->ondisk));
-                       croot->ondisk->count = 0;
-                       croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
-                       cluster->ondisk->clu_btree_root = croot->node_offset;
-                       hammer_rel_node(croot);
+       while ((int64_t)bytes > 0) {
+               kprintf("scan_offset %016llx\n", scan_offset);
+               if (scan_offset - sizeof(*tail) <
+                   HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
+                       kprintf("HAMMER(%s) UNDO record at %016llx FIFO "
+                               "underflow\n",
+                               root_volume->ondisk->vol_name,
+                               scan_offset);
+                       error = EIO;
+                       break;
+               }
+               if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
+                       scan_offset = rootmap->alloc_offset;
+                       continue;
+               }
+               tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
+                                   &error, &buffer);
+               if (error) {
+                       kprintf("HAMMER(%s) Unable to read UNDO TAIL "
+                               "at %016llx\n",
+                               root_volume->ondisk->vol_name,
+                               scan_offset - sizeof(*tail));
+                       break;
                }
-               KKASSERT(error == 0);
-       }
-       kprintf("hammer_recover(2): cluster_free %d\n",
-               cluster->alist_master.meta->bm_alist_freeblks);
 
-       /*
-        * Scan the cluster's recovered record A-list.  Just get the meta
-        * blocks and ignore all-allocated/uninitialized sections (which
-        * we use to indicate reserved areas not assigned to record buffers).
-        *
-        * The all-free sections are initialized and this is indicated by
-        * the alist config's bl_inverted flag being set.  These sections
-        * will be returned for recovery purposes.
-        */
-       buffer_count = 0;
-       record_count = 0;
+               if (hammer_check_tail_signature(tail, scan_offset) != 0) {
+                       kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
+                               "at %016llx\n",
+                               root_volume->ondisk->vol_name,
+                               scan_offset - sizeof(*tail));
+                       error = EIO;
+                       break;
+               }
+               undo = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
 
-       rec_no = HAMMER_FSBUF_MAXBLKS;
-       maxblk = nbuffers * HAMMER_FSBUF_MAXBLKS;
-       for (;;) {
-               rec_no = hammer_alist_find(&cluster->alist_record,
-                                          rec_no,
-                                          maxblk,
-                                          HAMMER_ALIST_FIND_NOSTACK |
-                                          HAMMER_ALIST_FIND_INITONLY);
-               if (rec_no == HAMMER_ALIST_BLOCK_NONE)
+               error = hammer_recover_undo(hmp, undo,
+                               HAMMER_BUFSIZE -
+                               (int)((char *)undo - (char *)buffer->ondisk));
+               if (error) {
+                       kprintf("HAMMER(%s) UNDO record at %016llx failed\n",
+                               root_volume->ondisk->vol_name,
+                               scan_offset - tail->tail_size);
                        break;
-               buf_no = rec_no / HAMMER_FSBUF_MAXBLKS;
-               KKASSERT(buf_no > 0 && buf_no <= nbuffers);
-               ++buffer_count;
-               kprintf("(%d)", buf_no);
-               record_count += hammer_recover_buffer_stage2(cluster, buf_no);
-               rec_no += HAMMER_FSBUF_MAXBLKS;
+               }
+               scan_offset -= tail->tail_size;
+               bytes -= tail->tail_size;
        }
-       kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
-               cluster->volume->vol_no, cluster->clu_no,
-               buffer_count, record_count);
-
-       /*
-        * Validate the parent cluster pointer. XXX
-        */
-
-       /*
-        * On successful recovery mark the cluster validated.
-        */
-       cluster->io.validated = 1;
-       return(0);
+failed:
+       if (buffer)
+               hammer_rel_buffer(buffer, 0);
+       return (error);
 }
 
-/*
- * This is used in the alist callback and must return a negative error
- * code or a positive free block count.
- */
-int
-buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
+static int
+hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
 {
-       hammer_cluster_t cluster;
-       hammer_record_ondisk_t rec;
-       hammer_buffer_t buffer;
-       int32_t buf_no;
-       int32_t rec_no;
-       int32_t rec_offset;
-       int32_t r;
-       int error;
-       int xcount;
+       int max_bytes;
 
-       /*
-        * Extract cluster and buffer number to recover
-        */
-       cluster = info;
-       buf_no = blk / HAMMER_FSBUF_MAXBLKS;
-
-       kprintf("(%d)", buf_no);
-       buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
-       if (error) {
-               /*
-                * If we are unable to access the buffer leave it in a
-                * reserved state on the master alist.
-                */
-               kprintf("hammer_recover_buffer_stage1: error "
-                       "recovering %d:%d:%d\n",
-                       cluster->volume->vol_no, cluster->clu_no, buf_no);
-               r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
-               KKASSERT(r == buf_no);
-               return(-error);
-       }
-       KKASSERT(buffer->buf_type == HAMMER_FSBUF_RECORDS);
+       max_bytes = ((end_off - sizeof(*tail)) & HAMMER_BUFMASK);
+       max_bytes += sizeof(*tail);
 
        /*
-        * If the buffer contains no allocated records tell our parent to
-        * mark it as all-allocated/uninitialized and do not reserve it
-        * in the master list.
+        * tail overlaps buffer boundary
         */
-       if (hammer_alist_find(&buffer->alist, 0, HAMMER_RECORD_NODES, 0) ==
-           HAMMER_ALIST_BLOCK_NONE) {
-               kprintf("GENERAL RECOVERY BUFFER %d\n",
-                       blk / HAMMER_FSBUF_MAXBLKS);
-               hammer_rel_buffer(buffer, 0);
-               return(-EDOM);
+       if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) {
+               return(1);
        }
 
-
        /*
-        * Mark the buffer as allocated in the cluster's master A-list.
+        * signature check, the tail signature is allowed to be the head
+        * signature only for 8-byte PADs.
         */
-       r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
-       KKASSERT(r == buf_no);
-       ++cluster->ondisk->stat_rec_bufs;
-
-       kprintf("recover buffer1 %d:%d:%d cluster_free %d\n",
-               cluster->volume->vol_no,
-               cluster->clu_no, buf_no,
-               cluster->alist_master.meta->bm_alist_freeblks);
+       switch(tail->tail_signature) {
+       case HAMMER_TAIL_SIGNATURE:
+               break;
+       case HAMMER_HEAD_SIGNATURE:
+               if (tail->tail_type != HAMMER_HEAD_TYPE_PAD ||
+                   tail->tail_size != sizeof(*tail)) {
+                       return(2);
+               }
+               break;
+       }
 
        /*
-        * Recover the buffer, scan and validate allocated records.  Records
-        * which cannot be recovered are freed.
-        *
-        * The parent a-list must be properly adjusted so don't just call
-        * hammer_alist_recover() on the underlying buffer.  Go through the
-        * parent.
+        * The undo structure must not overlap a buffer boundary.
         */
-       hammer_modify_buffer(buffer);
-       count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
-       xcount = 0;
-       kprintf("hammer_recover_buffer count1 %d/%d\n",
-               HAMMER_RECORD_NODES - count, HAMMER_RECORD_NODES);
-       rec_no = 0;
-       for (;;) {
-               rec_no = hammer_alist_find(&buffer->alist, rec_no,
-                                          HAMMER_RECORD_NODES, 0);
-               if (rec_no == HAMMER_ALIST_BLOCK_NONE)
-                       break;
-#if 0
-               kprintf("recover record %d:%d:%d %d\n",
-                       cluster->volume->vol_no,
-                       cluster->clu_no, buf_no, rec_no);
-#endif
-               rec_offset = offsetof(union hammer_fsbuf_ondisk,
-                                     record.recs[rec_no]);
-               rec_offset += buf_no * HAMMER_BUFSIZE;
-               rec = &buffer->ondisk->record.recs[rec_no];
-               error = hammer_recover_record(cluster, buffer, rec_offset, rec);
-               if (error) {
-                       kprintf("hammer_recover_record: failed %d:%d@%d\n",
-                               cluster->clu_no, buffer->buf_no, rec_offset);
-                       hammer_alist_free(&buffer->alist, rec_no, 1);
-                       if (hammer_debug_recover_faults)
-                               Debugger("FAILED");
-                       ++count;        /* free count */
-                       --xcount;
-               }
-               ++rec_no;
-               ++xcount;
+       if (tail->tail_size < 0 || tail->tail_size > max_bytes) {
+               return(3);
        }
-       kprintf("hammer_recover_buffer count2 %d/%d/%d\n",
-               HAMMER_RECORD_NODES - count, xcount, HAMMER_RECORD_NODES);
-       KKASSERT(HAMMER_RECORD_NODES - count == xcount);
-       hammer_rel_buffer(buffer, 0);
-       return(count);
+       return(0);
 }
 
-/*
- * Recover a record, at least into a state that doesn't blow up the
- * filesystem.  Returns 0 on success, non-zero if the record is
- * unrecoverable.
- */
 static int
-hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
-                            int32_t rec_offset, hammer_record_ondisk_t rec)
+hammer_recover_undo(hammer_mount_t hmp, hammer_fifo_undo_t undo, int bytes)
 {
-       hammer_buffer_t dbuf;
-       u_int64_t syncid = cluster->ondisk->synchronized_rec_id;
-       int32_t data_offset;
-       int32_t data_len;
-       int32_t nblks;
-       int32_t dbuf_no;
-       int32_t dblk_no;
-       int32_t base_blk;
-       int32_t r;
-       int error = 0;
+       hammer_fifo_tail_t tail;
+       hammer_volume_t volume;
+       hammer_buffer_t buffer;
+       int zone;
+       int error;
+       int vol_no;
+       int max_bytes;
+       u_int32_t offset;
 
        /*
-        * We have to discard any records with rec_id's greater then the
-        * last sync of the cluster header (which guarenteed all related
-        * buffers had been synced).  Otherwise the record may reference
-        * information that was never synced to disk.
+        * Basic sanity checks
         */
-       if (rec->base.rec_id >= syncid) {
-               kprintf("recover record: syncid too large %016llx/%016llx\n",
-                       rec->base.rec_id, syncid);
-               if (hammer_debug_recover_faults)
-                       Debugger("DebugSyncid");
-               return(EINVAL);
+       if (bytes < HAMMER_HEAD_ALIGN) {
+               kprintf("HAMMER: Undo alignment error (%d)\n", bytes);
+               return(EIO);
        }
-
-#if 0
-       /* XXX undo incomplete deletions */
-       if (rec->base.base.delete_tid > syncid)
-               rec->base.base.delete_tid = 0;
-#endif
-
-       /*
-        * Validate the record's B-Tree key
-        */
-       KKASSERT(rec->base.base.rec_type != 0);
-       if (rec->base.base.rec_type != HAMMER_RECTYPE_CLUSTER) {
-               if (hammer_btree_cmp(&rec->base.base,
-                                    &cluster->ondisk->clu_btree_beg) < 0)  {
-                       kprintf("recover record: range low\n");
-                       Debugger("RANGE LOW");
-                       return(EINVAL);
-               }
-               if (hammer_btree_cmp(&rec->base.base,
-                                    &cluster->ondisk->clu_btree_end) >= 0)  {
-                       kprintf("recover record: range high\n");
-                       Debugger("RANGE HIGH");
-                       return(EINVAL);
-               }
+       if (undo->head.hdr_signature != HAMMER_HEAD_SIGNATURE) {
+               kprintf("HAMMER: Bad head signature %04x\n", 
+                       undo->head.hdr_signature);
+               return(EIO);
+       }
+       if (undo->head.hdr_size < HAMMER_HEAD_ALIGN ||
+           undo->head.hdr_size > bytes) {
+               kprintf("HAMMER: Bad size %d\n", bytes);
+               return(EIO);
        }
 
        /*
-        * Validate the record's data.  If the offset is 0 there is no data
-        * (or it is zero-fill) and we can return success immediately.
-        * Otherwise make sure everything is ok.
+        * Skip PAD records.  Note that PAD records also do not require
+        * a tail.
         */
-       data_offset = rec->base.data_offset;
-       data_len = rec->base.data_len;
-
-       if (data_len == 0)
-               rec->base.data_offset = data_offset = 0;
-       if (data_offset == 0)
-               goto done;
+       if (undo->head.hdr_type == HAMMER_HEAD_TYPE_PAD)
+               return(0);
 
        /*
-        * Non-zero data offset, recover the data
+        * Check the tail
         */
-       if (data_offset < HAMMER_BUFSIZE ||
-           data_offset >= cluster->ondisk->clu_limit ||
-           data_len < 0 || data_len > HAMMER_MAXDATA ||
-           data_offset + data_len > cluster->ondisk->clu_limit) {
-               kprintf("recover record: bad offset/len %d/%d\n",
-                       data_offset, data_len);
-               Debugger("BAD OFFSET");
-               return(EINVAL);
+       bytes = undo->head.hdr_size;
+       tail = (void *)((char *)undo + bytes - sizeof(*tail));
+       if (tail->tail_size != undo->head.hdr_size) {
+               kprintf("HAMMER: Bad tail size %d\n", tail->tail_size);
+               return(EIO);
+       }
+       if (tail->tail_type != undo->head.hdr_type) {
+               kprintf("HAMMER: Bad tail type %d\n", tail->tail_type);
+               return(EIO);
        }
 
        /*
-        * Check data_offset relative to rec_offset
+        * Only process UNDO records
         */
-       if (data_offset < rec_offset && data_offset + data_len > rec_offset) {
-               kprintf("recover record: bad offset: overlapping1\n");
-               Debugger("BAD OFFSET - OVERLAP1");
-               return(EINVAL);
-       }
-       if (data_offset >= rec_offset &&
-           data_offset < rec_offset + sizeof(struct hammer_base_record)) {
-               kprintf("recover record: bad offset: overlapping2\n");
-               Debugger("BAD OFFSET - OVERLAP2");
-               return(EINVAL);
-       }
+       if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
+               return(0);
 
        /*
-        * Check for data embedded in the record
+        * Validate the UNDO record.
         */
-       if (data_offset >= rec_offset &&
-           data_offset < rec_offset + HAMMER_RECORD_SIZE) {
-               if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE) {
-                       kprintf("recover record: bad offset: overlapping3\n");
-                       Debugger("BAD OFFSET - OVERLAP3");
-                       return(EINVAL);
-               }
-               goto done;
+       max_bytes = undo->head.hdr_size - sizeof(*undo) - sizeof(*tail);
+       if (undo->undo_data_bytes < 0 || undo->undo_data_bytes > max_bytes) {
+               kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
+                       undo->undo_data_bytes, max_bytes);
+               return(EIO);
        }
 
-       KKASSERT(cluster->io.modified);
        /*
-        * Recover the allocated data either out of the cluster's master alist
-        * or as a buffer sub-allocation.
+        * The undo offset may only be a zone-1 or zone-2 offset.
+        *
+        * Currently we only support a zone-1 offset representing the
+        * volume header.
         */
-       if ((data_len & HAMMER_BUFMASK) == 0) {
-               if (data_offset & HAMMER_BUFMASK) {
-                       kprintf("recover record: bad offset: unaligned\n");
-                       Debugger("BAD OFFSET - UNALIGNED");
-                       return(EINVAL);
-               }
-               nblks = data_len / HAMMER_BUFSIZE;
-               dbuf_no = data_offset / HAMMER_BUFSIZE;
-               /* XXX power-of-2 check data_len */
+       zone = HAMMER_ZONE_DECODE(undo->undo_offset);
+       offset = undo->undo_offset & HAMMER_BUFMASK;
 
-               r = hammer_alist_alloc_fwd(&cluster->alist_master,
-                                          nblks, dbuf_no);
-               if (r == HAMMER_ALIST_BLOCK_NONE) {
-                       kprintf("recover record: cannot recover offset1\n");
-                       Debugger("CANNOT ALLOC DATABUFFER");
-                       return(EINVAL);
-               }
-               if (r != dbuf_no) {
-                       kprintf("recover record: cannot recover offset2\n");
-                       hammer_alist_free(&cluster->alist_master, r, nblks);
-                       KKASSERT(0);
-                       return(EINVAL);
-               }
-               ++cluster->ondisk->stat_data_bufs;
-       } else {
-               if ((data_offset & ~HAMMER_BUFMASK) !=
-                   ((data_offset + data_len - 1) & ~HAMMER_BUFMASK)) {
-                       kprintf("recover record: overlaps multiple bufs\n");
-                       Debugger("OVERLAP MULT");
-                       return(EINVAL);
-               }
-               if ((data_offset & HAMMER_BUFMASK) <
-                   sizeof(struct hammer_fsbuf_head)) {
-                       kprintf("recover record: data in header area\n");
-                       Debugger("DATA IN HEADER AREA");
-                       return(EINVAL);
-               }
-               if (data_offset & HAMMER_DATA_BLKMASK) {
-                       kprintf("recover record: data blk unaligned\n");
-                       Debugger("DATA BLK UNALIGNED");
-                       return(EINVAL);
-               }
-
-               /*
-                * Ok, recover the space in the data buffer.
-                */
-               dbuf_no = data_offset / HAMMER_BUFSIZE;
-               r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
-               if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
-                       hammer_alist_free(&cluster->alist_master, r, 1);
-               if (r == dbuf_no) {
-                       /*
-                        * This is the first time we've tried to recover
-                        * data in this data buffer, reinit it (but don't
-                        * zero it out, obviously).
-                        *
-                        * Calling initbuffer marks the data blocks within
-                        * the buffer as being all-allocated.  We have to
-                        * mark it free.
-                        */
-                       dbuf = hammer_get_buffer(cluster, dbuf_no,
-                                                0, &error);
-                       if (error == 0) {
-                               KKASSERT(dbuf->buf_type == HAMMER_FSBUF_DATA);
-                               hammer_modify_buffer(dbuf);
-                               hammer_initbuffer(&dbuf->alist, 
-                                                 &dbuf->ondisk->head,
-                                                 HAMMER_FSBUF_DATA);
-                               /*dbuf->buf_type = HAMMER_FSBUF_DATA;*/
-                               base_blk = dbuf_no * HAMMER_FSBUF_MAXBLKS;
-                               hammer_alist_free(&cluster->alist_mdata,
-                                                 base_blk,
-                                                 HAMMER_DATA_NODES);
-                               kprintf("FREE DATA %d/%d\n", base_blk, HAMMER_DATA_NODES);
-                               ++cluster->ondisk->stat_data_bufs;
-                       }
-               } else {
-                       /*
-                        * We've seen this data buffer before.
-                        */
-                       dbuf = hammer_get_buffer(cluster, dbuf_no,
-                                                0, &error);
-               }
-               if (error) {
-                       kprintf("recover record: data: getbuf failed\n");
-                       KKASSERT(0);
-                       return(EINVAL);
-               }
-
-               if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
-                       hammer_rel_buffer(dbuf, 0);
-                       kprintf("recover record: data: wrong buffer type\n");
-                       KKASSERT(0);
-                       return(EINVAL);
-               }
+       if (offset + undo->undo_data_bytes > HAMMER_BUFSIZE) {
+               kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
+               return (EIO);
+       }
 
-               /*
-                * Figure out the data block number and number of blocks.
-                */
-               nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
-               nblks /= HAMMER_DATA_BLKSIZE;
-               dblk_no = ((data_offset & HAMMER_BUFMASK) - offsetof(union hammer_fsbuf_ondisk, data.data)) / HAMMER_DATA_BLKSIZE;
-               if ((data_offset & HAMMER_BUFMASK) != offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
-                       kprintf("dblk_no %d does not match data_offset %d/%d\n",
-                               dblk_no,
-                               offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no]),
-                               (data_offset & HAMMER_BUFMASK));
-                       hammer_rel_buffer(dbuf, 0);
-                       kprintf("recover record: data: not block aligned\n");
-                       Debugger("bad data");
-                       return(EINVAL);
+       switch(zone) {
+       case HAMMER_ZONE_RAW_VOLUME_INDEX:
+               vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
+               volume = hammer_get_volume(hmp, vol_no, &error);
+               if (volume == NULL) {
+                       kprintf("HAMMER: UNDO record, "
+                               "cannot access volume %d\n", vol_no);
+                       break;
                }
-               hammer_modify_buffer(dbuf);
-               dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS;
-               r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks, dblk_no);
-               if (r != dblk_no) {
-                       if (r != HAMMER_ALIST_BLOCK_NONE)
-                               hammer_alist_free(&cluster->alist_mdata, r, nblks);
-                       hammer_rel_buffer(dbuf, 0);
-                       kprintf("recover record: data: unable to realloc dbuf %d dblk %d\n", dbuf_no, dblk_no % HAMMER_FSBUF_MAXBLKS);
-                       KKASSERT(0);
-                       return(EINVAL);
+               hammer_modify_volume(NULL, volume, NULL, 0);
+               hammer_recover_copy_undo(undo->undo_offset,
+                                        (char *)(undo + 1),
+                                        (char *)volume->ondisk + offset,
+                                        undo->undo_data_bytes);
+               hammer_modify_volume_done(volume);
+               hammer_io_flush(&volume->io);
+               hammer_rel_volume(volume, 0);
+               break;
+       case HAMMER_ZONE_RAW_BUFFER_INDEX:
+               buffer = hammer_get_buffer(hmp, undo->undo_offset, 0, &error);
+               if (buffer == NULL) {
+                       kprintf("HAMMER: UNDO record, "
+                               "cannot access buffer %016llx\n",
+                               undo->undo_offset);
+                       break;
                }
-               hammer_rel_buffer(dbuf, 0);
+               hammer_modify_buffer(NULL, buffer, NULL, 0);
+               hammer_recover_copy_undo(undo->undo_offset,
+                                        (char *)(undo + 1),
+                                        (char *)buffer->ondisk + offset,
+                                        undo->undo_data_bytes);
+               hammer_modify_buffer_done(buffer);
+               hammer_io_flush(&buffer->io);
+               hammer_rel_buffer(buffer, 0);
+               break;
+       default:
+               kprintf("HAMMER: Corrupt UNDO record\n");
+               error = EIO;
        }
-done:
-       return(0);
+       return (error);
 }
 
-/*
- * Rebuild the B-Tree for the records residing in the specified buffer.
- *
- * Return the number of records recovered.
- */
-static int
-hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
+static void
+hammer_recover_copy_undo(hammer_off_t undo_offset, 
+                        char *src, char *dst, int bytes)
 {
-       hammer_record_ondisk_t rec;
-       hammer_buffer_t buffer;
-       int32_t rec_no;
-       int32_t rec_offset;
-       int record_count = 0;
-       int error;
-
-       buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
-       if (error) {
-               /*
-                * If we are unable to access the buffer leave it in a
-                * reserved state on the master alist.
-                */
-               kprintf("hammer_recover_buffer_stage2: error "
-                       "recovering %d:%d:%d\n",
-                       cluster->volume->vol_no, cluster->clu_no, buf_no);
-               Debugger("RECOVER BUFFER STAGE2 FAIL");
-               return(0);
-       }
-
-       /*
-        * Recover the buffer, scan and validate allocated records.  Records
-        * which cannot be recovered are freed.
-        */
-       rec_no = 0;
-       for (;;) {
-               rec_no = hammer_alist_find(&buffer->alist, rec_no,
-                                          HAMMER_RECORD_NODES, 0);
-               if (rec_no == HAMMER_ALIST_BLOCK_NONE)
-                       break;
-               rec_offset = offsetof(union hammer_fsbuf_ondisk,
-                                     record.recs[rec_no]);
-               rec_offset += buf_no * HAMMER_BUFSIZE;
-               rec = &buffer->ondisk->record.recs[rec_no];
-               error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
-               if (error) {
-                       kprintf("hammer_recover_btree: failed %d:%d@%08x "
-                               "error %d buffer %p rec %p rec_no %d "
-                               " cluster_free %d\n",
-                               cluster->clu_no, buffer->buf_no, rec_offset,
-                               error, buffer, rec, rec_no,
-                               cluster->alist_master.meta->bm_alist_freeblks
-                       );
-                       Debugger("recover_btree failed");
-                       /* XXX free the record and its data? */
-                       /*hammer_alist_free(&buffer->alist, rec_no, 1);*/
-               } else {
-                       ++record_count;
-               }
-               ++rec_no;
-       }
-       hammer_rel_buffer(buffer, 0);
-       return(record_count);
+       kprintf("UNDO %016llx:", undo_offset);
+       hammer_recover_debug_dump(22, dst, bytes);
+       kprintf("%22s", "to:");
+       hammer_recover_debug_dump(22, src, bytes);
+       bcopy(src, dst, bytes);
 }
 
-/*
- * Enter a single record into the B-Tree.
- */
-static int
-hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
-                     int32_t rec_offset, hammer_record_ondisk_t rec)
/*
 * Dump bytes as hex for recovery debugging, 16 bytes per line.
 * Continuation lines are indented by w columns so they line up under
 * the caller's label.
 */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int n;

	for (n = 0; n < bytes; ++n) {
		/* break the line every 16 bytes, except before the first */
		if (n != 0 && n % 16 == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[n]);
	}
	kprintf("\n");
}
 
-#endif
index c90a9fe..f070fc0 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.5 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.6 2008/04/26 02:54:00 dillon Exp $
  */
 
 /*
@@ -71,11 +71,11 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp)
 
 /*
  * Generate an UNDO record for the block of data at the specified zone1
- * offset.
+ * or zone2 offset.
  */
 int
 hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
-                    hammer_off_t zone1_off, void *base, int len)
+                    hammer_off_t zone_off, void *base, int len)
 {
        hammer_volume_t root_volume;
        hammer_volume_ondisk_t ondisk;
@@ -90,9 +90,6 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
        int error;
        int bytes;
 
-       bytes = ((len + 7) & ~7) + sizeof(struct hammer_fifo_undo) +
-               sizeof(struct hammer_fifo_tail);
-
        root_volume = trans->rootvol;
        ondisk = root_volume->ondisk;
        undomap = &ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
@@ -100,10 +97,14 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
        /* no undo recursion */
        hammer_modify_volume(NULL, root_volume, NULL, 0);
 
+again:
        /*
         * Allocate space in the FIFO
         */
-again:
+       bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) +
+               sizeof(struct hammer_fifo_undo) +
+               sizeof(struct hammer_fifo_tail);
+
        next_offset = undomap->next_offset;
 
        /*
@@ -159,13 +160,19 @@ again:
         * We're good, create the entry.
         */
        undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-       undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
+       undo->head.hdr_type = HAMMER_HEAD_TYPE_UNDO;
        undo->head.hdr_size = bytes;
        undo->head.reserved01 = 0;
        undo->head.hdr_crc = 0;
-       undo->undo_offset = zone1_off;
+       undo->undo_offset = zone_off;
        undo->undo_data_bytes = len;
        bcopy(base, undo + 1, len);
+
+       tail = (void *)((char *)undo + bytes - sizeof(*tail));
+       tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+       tail->tail_type = HAMMER_HEAD_TYPE_UNDO;
+       tail->tail_size = bytes;
+
        undo->head.hdr_crc = crc32(undo, bytes);
        hammer_modify_buffer_done(buffer);
 
index c5e9641..8c7bdd7 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.27 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.28 2008/04/26 02:54:00 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -289,7 +289,20 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
         */
        rootvol = hammer_get_root_volume(hmp, &error);
        if (error)
+               goto failed;
+
+       /*
+        * Perform any necessary UNDO operations
+        */
+       error = hammer_recover(hmp, rootvol);
+       if (error) {
+               kprintf("Failed to recover HAMMER filesystem on mount\n");
                goto done;
+       }
+
+       /*
+        * Finish setup now that we have a good root volume
+        */
        ksnprintf(mp->mnt_stat.f_mntfromname,
                  sizeof(mp->mnt_stat.f_mntfromname), "%s",
                  rootvol->ondisk->vol_name);
@@ -301,8 +314,6 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
        hmp->next_tid = rootvol->ondisk->vol0_next_tid;
        kprintf("on-disk next_tid %016llx\n", hmp->next_tid);
 
-       hammer_rel_volume(rootvol, 0);
-
        hammer_flusher_create(hmp);
 
        /*
@@ -319,6 +330,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
        /*vn_unlock(hmp->rootvp);*/
 
 done:
+       hammer_rel_volume(rootvol, 0);
+failed:
        /*
         * Cleanup and return.
         */
index da92b60..fcdb888 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.39 2008/04/25 21:49:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.40 2008/04/26 02:54:00 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -172,7 +172,7 @@ hammer_vop_fsync(struct vop_fsync_args *ap)
 {
        hammer_inode_t ip = VTOI(ap->a_vp);
 
-       hammer_flush_inode(ip, 0);
+       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        return (ip->error);
@@ -1896,7 +1896,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
        else
                TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
        hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY);
-       hammer_flush_inode(ip, 0);
+       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+       kprintf("a");
        return(0);
 }