HAMMER 59B/Many: Stabilization pass - fixes for large file issues
author     Matthew Dillon <dillon@dragonflybsd.org>
           Fri, 27 Jun 2008 20:56:59 +0000
committer  Matthew Dillon <dillon@dragonflybsd.org>
           Fri, 27 Jun 2008 20:56:59 +0000
* The flusher was trying to flush whole inodes in a single pass.  If a
  large file (in the 100GB range) was deleted or truncated, that single
  pass could blow out the UNDO FIFO and cause a panic.

  Detect the case and only do a partial flush, then requeue the inode to
  the next flush group and reflush.  This continues until the file is
  completely eradicated or the truncation has completed.
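
  A minimal standalone sketch of that requeue pattern (all names and limits
  below are illustrative stand-ins, not the real HAMMER structures; the
  actual logic lives in hammer_sync_inode() and hammer_flush_inode_done()):

      #include <errno.h>
      #include <stdio.h>

      #define FAKE_META_LIMIT 100     /* stand-in for hammer_limit_dirtybufs */

      struct fake_inode {
              long records_left;      /* work remaining for this inode */
              int  wouldblock;        /* models HAMMER_INODE_WOULDBLOCK */
      };

      static int dirty_bufs;  /* models locked_dirty_count + io_running_count */

      /* Delete records until done or until the dirty-buffer budget is hit. */
      static int
      fake_delete_range(struct fake_inode *ip)
      {
              while (ip->records_left > 0) {
                      --ip->records_left;
                      ++dirty_bufs;   /* each deletion dirties meta-data */
                      if (dirty_bufs > FAKE_META_LIMIT)
                              return (EWOULDBLOCK);   /* finish later */
              }
              return (0);
      }

      int
      main(void)
      {
              struct fake_inode ip = { 350, 0 };
              int passes = 0;

              /* Each pass models one flush group; EWOULDBLOCK requeues. */
              do {
                      dirty_bufs = 0; /* the flusher finalizes between passes */
                      ip.wouldblock =
                          (fake_delete_range(&ip) == EWOULDBLOCK);
                      ++passes;
              } while (ip.wouldblock);
              printf("file eradicated in %d flush passes\n", passes);
              return (0);
      }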

* In addition, the same flusher issue could exhaust the kernel's buffer
  cache with unflushable dirty buffers.  The same fix applies.
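
  The buffer-cache test behind this fix is the new hammer_flusher_meta_limit()
  added to hammer_flusher.c below; condensed, it is just a budget comparison
  (the field names follow the patch, the helper name here is illustrative):

      static int
      meta_limit_hit(int locked_dirty_count, int io_running_count,
                     int limit_dirtybufs)
      {
              return (locked_dirty_count + io_running_count >
                      limit_dirtybufs);
      }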

* The UNDO FIFO code was calculating available space based on the next
  flush's start position instead of the last flush's start position,
  allowing areas of the UNDO FIFO still subject to a recovery pass
  (if a crash were to occur right then) to be overwritten too soon.
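
  The fix measures used space from the on-disk first_offset (the start of
  the last finalized flush) to the in-memory next_offset.  A simplified
  standalone version of the ring arithmetic, with offsets reduced to plain
  byte counts (the real code masks zone-encoded hammer_off_t values):

      #include <assert.h>
      #include <stdint.h>

      static int64_t
      undo_used(int64_t disk_first, int64_t mem_next, int64_t alloc_size)
      {
              int64_t bytes;

              if (disk_first <= mem_next)
                      bytes = mem_next - disk_first;              /* no wrap */
              else
                      bytes = alloc_size - disk_first + mem_next; /* wrapped */
              assert(bytes <= alloc_size);
              return (bytes);
      }

      /* e.g. undo_used(900, 100, 1024) == 224 across the wrap point */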

* The pruner and reblocker were doing insufficient UNDO FIFO space checks
  and could blow out the UNDO FIFO.  Fixed.
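
  Both utilities now bail out with EWOULDBLOCK at the 2/4 threshold, let
  the flusher drain, and retry where they left off.  A self-contained
  model of that back-off loop (all numbers are illustrative only):

      #include <errno.h>
      #include <stdint.h>
      #include <stdio.h>

      #define UNDO_MAX 1000

      static int64_t undo_space = UNDO_MAX;  /* free UNDO bytes (simplified) */

      static int
      undo_exhausted(int quarter)
      {
              return (undo_space < (int64_t)UNDO_MAX * quarter / 4);
      }

      int
      main(void)
      {
              int work = 5000;
              int error;

      retry:
              error = 0;
              while (work > 0) {
                      if (undo_exhausted(2)) { /* stay out of emergency space */
                              error = EWOULDBLOCK;
                              break;
                      }
                      --work;
                      --undo_space;   /* each deletion consumes UNDO space */
              }
              if (error == EWOULDBLOCK) {
                      undo_space = UNDO_MAX;  /* models hammer_flusher_sync() */
                      printf("backed off, %d elements left\n", work);
                      goto retry;
              }
              printf("prune complete\n");
              return (0);
      }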

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_recover.c
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 40e5785..5e01f33 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.93 2008/06/26 04:06:22 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.94 2008/06/27 20:56:59 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -286,7 +286,8 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_DELETING  0x00020000 /* inode delete request (frontend)*/
 #define HAMMER_INODE_RESIGNAL  0x00040000 /* re-signal on re-flush */
 #define HAMMER_INODE_ATIME     0x00100000 /* in-memory atime modified */
-#define HAMMER_INODE_MTIME     0x00100000 /* in-memory mtime modified */
+#define HAMMER_INODE_MTIME     0x00200000 /* in-memory mtime modified */
+#define HAMMER_INODE_WOULDBLOCK 0x00400000 /* re-issue to new flush group */
 
 #define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|                       \
                                 HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|     \
@@ -932,8 +933,8 @@ hammer_off_t hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off,
                        int *errorp);
 hammer_off_t hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t bmap_off,
                        int *errorp);
-int64_t hammer_undo_used(hammer_mount_t hmp);
-int64_t hammer_undo_space(hammer_mount_t hmp);
+int64_t hammer_undo_used(hammer_transaction_t trans);
+int64_t hammer_undo_space(hammer_transaction_t trans);
 int64_t hammer_undo_max(hammer_mount_t hmp);
 
 void hammer_start_transaction(struct hammer_transaction *trans,
@@ -1023,10 +1024,12 @@ void hammer_flusher_create(hammer_mount_t hmp);
 void hammer_flusher_destroy(hammer_mount_t hmp);
 void hammer_flusher_sync(hammer_mount_t hmp);
 void hammer_flusher_async(hammer_mount_t hmp);
+int  hammer_flusher_meta_limit(hammer_mount_t hmp);
+int  hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter);
 
 int hammer_recover(hammer_mount_t hmp, hammer_volume_t rootvol);
 void hammer_recover_flush_buffers(hammer_mount_t hmp,
-                       hammer_volume_t root_volume);
+                       hammer_volume_t root_volume, int final);
 
 void hammer_crc_set_blockmap(hammer_blockmap_t blockmap);
 void hammer_crc_set_volume(hammer_volume_ondisk_t ondisk);
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index f333640..6765f44 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.29 2008/06/23 07:31:14 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.30 2008/06/27 20:56:59 dillon Exp $
  */
 /*
 * HAMMER dependency flusher thread
@@ -48,7 +48,6 @@ static void hammer_flusher_clean_loose_ios(hammer_mount_t hmp);
 static void hammer_flusher_flush(hammer_mount_t hmp);
 static void hammer_flusher_flush_inode(hammer_inode_t ip,
                                        hammer_transaction_t trans);
-static int hammer_must_finalize_undo(hammer_mount_t hmp);
 static void hammer_flusher_finalize(hammer_transaction_t trans, int final);
 
 /*
@@ -293,6 +292,18 @@ hammer_flusher_flush(hammer_mount_t hmp)
        hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
 
        /*
+        * If the previous flush cycle just about exhausted our UNDO space
+        * we may have to do a dummy cycle to move the first_offset up
+        * before actually digging into a new cycle, or the new cycle will
+        * not have sufficient undo space.
+        */
+       if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3)) {
+               hammer_lock_ex(&hmp->flusher.finalize_lock);
+               hammer_flusher_finalize(&hmp->flusher.trans, 0);
+               hammer_unlock(&hmp->flusher.finalize_lock);
+       }
+
+       /*
         * Start work threads.
         */
        i = 0;
@@ -333,20 +344,28 @@ hammer_flusher_flush(hammer_mount_t hmp)
 
 /*
  * Flush a single inode that is part of a flush group.
+ *
+ * NOTE!  The sync code can return EWOULDBLOCK if the flush operation
+ * would otherwise blow out the buffer cache.  hammer_flush_inode_done()
+ * will re-queue the inode for the next flush sequence and force the
+ * flusher to run again if this occurs.
  */
 static
 void
 hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
 {
        hammer_mount_t hmp = ip->hmp;
+       int error;
 
        hammer_lock_sh(&hmp->flusher.finalize_lock);
-       ip->error = hammer_sync_inode(ip);
+       error = hammer_sync_inode(ip);
+       if (error != EWOULDBLOCK)
+               ip->error = error;
        hammer_flush_inode_done(ip);
        hammer_unlock(&hmp->flusher.finalize_lock);
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
-       if (hammer_must_finalize_undo(hmp)) {
+       if (hammer_flusher_undo_exhausted(trans, 1)) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                kprintf("HAMMER: Warning: UNDO area too small!\n");
@@ -354,8 +373,7 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
                hammer_unlock(&hmp->flusher.finalize_lock);
                hmp->flusher.finalize_want = 0;
                wakeup(&hmp->flusher.finalize_want);
-       } else if (trans->hmp->locked_dirty_count +
-                  trans->hmp->io_running_count > hammer_limit_dirtybufs) {
+       } else if (hammer_flusher_meta_limit(trans->hmp)) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                hammer_flusher_finalize(trans, 0);
@@ -366,16 +384,25 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
 }
 
 /*
- * If the UNDO area gets over half full we have to flush it.  We can't
- * afford the UNDO area becoming completely full as that would break
- * the crash recovery atomicy.
+ * Return non-zero if the UNDO area has less than (quarter / 4) of its
+ * space left.
+ *
+ * 1/4 - Emergency free undo space level.  Below this point the flusher
+ *      will finalize even if directory dependencies have not been resolved.
+ *
+ * 2/4 - Used by the pruning and reblocking code.  These functions may be
+ *      running in parallel with a flush and cannot be allowed to drop
+ *      available undo space to emergency levels.
+ *
+ * 3/4 - Used at the beginning of a flush to force-sync the volume header
+ *      to give the flush plenty of runway to work in.
  */
-static
 int
-hammer_must_finalize_undo(hammer_mount_t hmp)
+hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
 {
-       if (hammer_undo_space(hmp) < hammer_undo_max(hmp) / 2) {
-               hkprintf("*");
+       if (hammer_undo_space(trans) <
+           hammer_undo_max(trans->hmp) * quarter / 4) {
+               kprintf("%c", '0' + quarter);
                return(1);
        } else {
                return(0);
@@ -477,6 +504,11 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         * an UNDO.  However, because our TID is generated before we get
         * the sync lock another sync may have beat us to the punch.
         *
+        * This also has the side effect of updating first_offset based on
+        * a prior finalization when the first finalization of the next flush
+        * cycle occurs, removing any undo info from the prior finalization
+        * from consideration.
+        *
         * The volume header will be flushed out synchronously.
         */
        dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
@@ -527,8 +559,10 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
        /*
         * If this is the final finalization for the flush group set
         * up for the next sequence by setting a new first_offset in
-        * our cached blockmap and
-        * clearing the undo history.
+        * our cached blockmap and clearing the undo history.
+        *
+        * Even though we have updated our cached first_offset, the on-disk
+        * first_offset still governs available-undo-space calculations.
         */
        if (final) {
                cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
@@ -539,3 +573,19 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
        hammer_sync_unlock(trans);
 }
 
+/*
+ * Return non-zero if too many dirty meta-data buffers have built up.
+ *
+ * Since such buffers cannot be allowed to flush until we have dealt with
+ * the related UNDOs, letting too many build up risks deadlocking the
+ * kernel's buffer cache.
+ */
+int
+hammer_flusher_meta_limit(hammer_mount_t hmp)
+{
+       if (hmp->locked_dirty_count + hmp->io_running_count >
+           hammer_limit_dirtybufs) {
+               return(1);
+       }
+       return(0);
+}
+
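
The quarter argument above selects one of three watermarks: 1/4 is the
flusher's per-inode emergency level, 2/4 the pruner/reblocker back-off
level, and 3/4 the start-of-flush level.  A standalone check of that
arithmetic, using an illustrative 1GB UNDO area with 600MB free:

    #include <stdint.h>
    #include <stdio.h>

    static int
    undo_exhausted(int64_t space, int64_t max, int quarter)
    {
            return (space < max * quarter / 4);
    }

    int
    main(void)
    {
            const int64_t max = 1024LL << 20;       /* 1GB UNDO area */
            const int64_t space = 600LL << 20;      /* 600MB still free */
            int q;

            for (q = 1; q <= 3; ++q)
                    printf("quarter %d: %s\n", q,
                           undo_exhausted(space, max, q) ?
                           "exhausted" : "ok");
            /* prints: 1 ok, 2 ok, 3 exhausted (600MB < 768MB) */
            return (0);
    }
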
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 565faa1..8100d80 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.85 2008/06/26 04:06:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.86 2008/06/27 20:56:59 dillon Exp $
  */
 
 #include "hammer.h"
@@ -599,6 +599,7 @@ hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
        hammer_transaction_t trans = cursor->trans;
        hammer_record_t record;
        int error;
+       int redirty;
 
 retry:
        error = 0;
@@ -680,6 +681,21 @@ retry:
                record->leaf.create_ts = trans->time32;
                record->data = (void *)&ip->sync_ino_data;
                record->flags |= HAMMER_RECF_INTERLOCK_BE;
+
+               /*
+                * If this flag is set we cannot sync the new file size
+                * because we haven't finished related truncations.  The
+                * inode will be flushed in another flush group to finish
+                * the job.
+                */
+               if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
+                   ip->sync_ino_data.size != ip->ino_data.size) {
+                       redirty = 1;
+                       ip->sync_ino_data.size = ip->ino_data.size;
+               } else {
+                       redirty = 0;
+               }
+
                for (;;) {
                        error = hammer_ip_sync_record_cursor(cursor, record);
                        if (hammer_debug_inode)
@@ -719,6 +735,8 @@ retry:
                                            HAMMER_INODE_ATIME |
                                            HAMMER_INODE_MTIME);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
+                       if (redirty)
+                               ip->sync_flags |= HAMMER_INODE_DDIRTY;
 
                        /*
                         * Root volume count of inodes
@@ -1440,10 +1458,18 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                break;
        case HAMMER_FST_FLUSH:
                /* 
-                * Record already associated with a flush group.  It had
-                * better be ours.
+                * If the WOULDBLOCK flag is set records may have been left
+                * over from a previous flush attempt and should be moved
+                * to the current flush group.  If it is not set then all
+                * such records had better have been flushed already or
+                * already associated with the current flush group.
                 */
-               KKASSERT(rec->flush_group == ip->flush_group);
+               if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
+                       kprintf("b");
+                       rec->flush_group = ip->flush_group;
+               } else {
+                       KKASSERT(rec->flush_group == ip->flush_group);
+               }
                r = 1;
                break;
        }
@@ -1467,7 +1493,6 @@ hammer_wait_inode(hammer_inode_t ip)
        waitcount = (ip->flags & HAMMER_INODE_REFLUSH) ? 2 : 1;
 
        if (ip->flush_state == HAMMER_FST_SETUP) {
-               kprintf("X");
                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        }
        /* XXX can we make this != FST_IDLE ? check SETUP depends */
@@ -1541,9 +1566,20 @@ hammer_flush_inode_done(hammer_inode_t ip)
        /*
         * Adjust flush_state.  The target state (idle or setup) shouldn't
         * be terribly important since we will reflush if we really need
-        * to do anything. XXX
+        * to do anything.
+        *
+        * If the WOULDBLOCK flag is set we must re-flush immediately
+        * to continue a potentially large deletion.  The flag also causes
+        * the hammer_setup_child_callback() to move records in the old
+        * flush group to the new one.
         */
-       if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
+       if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
+               kprintf("B");
+               ip->flush_state = HAMMER_FST_IDLE;
+               hammer_flush_inode_core(ip, HAMMER_FLUSH_SIGNAL);
+               ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
+               dorel = 1;
+       } else if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
                ip->flush_state = HAMMER_FST_IDLE;
                dorel = 1;
        } else {
@@ -1854,10 +1890,19 @@ hammer_sync_inode(hammer_inode_t ip)
                 * already cleaned out any partial block and made it
                 * pending.  The front-end may have updated trunc_off
                 * while we were blocked so we only use sync_trunc_off.
+                *
+                * This operation can blow out the buffer cache; EWOULDBLOCK
+                * means we were unable to complete the deletion.
                 */
                error = hammer_ip_delete_range(&cursor, ip,
                                                aligned_trunc_off,
-                                               0x7FFFFFFFFFFFFFFFLL, 1);
+                                               0x7FFFFFFFFFFFFFFFLL, 2);
+               if (error == EWOULDBLOCK) {
+                       ip->flags |= HAMMER_INODE_WOULDBLOCK;
+                       error = 0;
+                       goto defer_buffer_flush;
+               }
+
                if (error)
                        Debugger("hammer_ip_delete_range errored");
 
@@ -1922,9 +1967,9 @@ hammer_sync_inode(hammer_inode_t ip)
            (ip->flags & HAMMER_INODE_DELETED) == 0) {
                int count1 = 0;
 
-               ip->flags |= HAMMER_INODE_DELETED;
                error = hammer_ip_delete_range_all(&cursor, ip, &count1);
                if (error == 0) {
+                       ip->flags |= HAMMER_INODE_DELETED;
                        ip->sync_flags &= ~HAMMER_INODE_DELETING;
                        ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
                        KKASSERT(RB_EMPTY(&ip->rec_tree));
@@ -1950,8 +1995,11 @@ hammer_sync_inode(hammer_inode_t ip)
                                --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans.rootvol);
                        }
+               } else if (error == EWOULDBLOCK) {
+                       ip->flags |= HAMMER_INODE_WOULDBLOCK;
+                       error = 0;
+                       goto defer_buffer_flush;
                } else {
-                       ip->flags &= ~HAMMER_INODE_DELETED;
                        Debugger("hammer_ip_delete_range_all errored");
                }
        }
@@ -1961,9 +2009,14 @@ hammer_sync_inode(hammer_inode_t ip)
        if (error)
                Debugger("RB_SCAN errored");
 
+defer_buffer_flush:
        /*
         * Now update the inode's on-disk inode-data and/or on-disk record.
         * DELETED and ONDISK are managed only in ip->flags.
+        *
+        * In the case of a deferred buffer flush we still update the on-disk
+        * inode to satisfy visibility requirements if there happen to be
+        * directory dependencies.
         */
        switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
        case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c
index af09097..654a8d2 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.2 2008/06/26 04:06:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.3 2008/06/27 20:56:59 dillon Exp $
  */
 /*
  * HAMMER mirroring ioctls - serialize and deserialize modifications made
@@ -328,6 +328,14 @@ hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
        hammer_btree_leaf_elm_t elm;
 
        elm = cursor->leaf;
+
+       if (mrec->leaf.base.delete_tid == 0) {
+               kprintf("mirror_write: object %016llx:%016llx deleted on "
+                       "target, not deleted on source\n",
+                       elm->base.obj_id, elm->base.key);
+               return(0);
+       }
+
        KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid);
        hammer_modify_node(cursor->trans, cursor->node, elm, sizeof(*elm));
        elm->base.delete_tid = mrec->leaf.base.delete_tid;
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index 66f8206..e429a66 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.76 2008/06/26 04:06:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.77 2008/06/27 20:56:59 dillon Exp $
  */
 
 #include "hammer.h"
@@ -619,10 +619,14 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
         * Find an unused namekey.  Both the in-memory record tree and
         * the B-Tree are checked.  Exact matches also match create_tid
         * so use an ASOF search to (mostly) ignore it.
+        *
+        * delete-visibility is set so pending deletions do not give us
+        * a false-negative on our ability to use an iterator.
         */
        hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg = record->leaf.base;
        cursor.flags |= HAMMER_CURSOR_ASOF;
+       cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
        cursor.asof = ip->obj_asof;
 
        count = 0;
@@ -1572,7 +1576,10 @@ hammer_ip_resolve_data(hammer_cursor_t cursor)
  * Backend truncation / record replacement - delete records in range.
  *
  * Delete all records within the specified range for inode ip.  In-memory
- * records still associated with the frontend are ignored.
+ * records still associated with the frontend are ignored. 
+ *
+ * If truncating is non-zero, in-memory records associated with the back-end
+ * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
  *
  * NOTE: An unaligned range will cause new records to be added to cover
  * the edge cases. (XXX not implemented yet).
@@ -1693,9 +1700,18 @@ retry:
                 * data if the retention policy dictates.  The function
                 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
                 * uses to perform a fixup.
+                *
+                * If we have built up too many meta-buffers we risk
+                * deadlocking the kernel and must stop.  This can occur
+                * when deleting ridiculously huge files.
                 */
-               if (truncating == 0 || hammer_cursor_ondisk(cursor))
+               if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
+                       if (truncating > 1 && error == 0 &&
+                           hammer_flusher_meta_limit(ip->hmp)) {
+                               error = EWOULDBLOCK;
+                       }
+               }
                if (error)
                        break;
                error = hammer_ip_next(cursor);
@@ -1720,6 +1736,8 @@ retry:
  * Delete all user records associated with an inode except the inode record
  * itself.  Directory entries are not deleted (they must be properly disposed
  * of or nlinks would get upset).
+ *
+ * This function can return EWOULDBLOCK.
  */
 int
 hammer_ip_delete_range_all(hammer_cursor_t cursor, hammer_inode_t ip,
@@ -1774,6 +1792,8 @@ retry:
                if (leaf->base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
                        ++*countp;
+                       if (error == 0 && hammer_flusher_meta_limit(ip->hmp))
+                               error = EWOULDBLOCK;
                }
                if (error)
                        break;
diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c
index 0940ab4..47f86de 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.8 2008/06/26 04:06:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.9 2008/06/27 20:56:59 dillon Exp $
  */
 
 #include "hammer.h"
  * created during the iteration due to alignments.  This also allows us
  * to adjust alignments without blowing up the B-Tree.
  */
-static int check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm);
+static int prune_should_delete(struct hammer_ioc_prune *prune,
+                              hammer_btree_leaf_elm_t elm);
+static void prune_check_nlinks(hammer_cursor_t cursor,
+                              hammer_btree_leaf_elm_t elm);
 
 int
 hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_prune *prune)
 {
        struct hammer_cursor cursor;
-       hammer_btree_elm_t elm;
+       hammer_btree_leaf_elm_t elm;
        struct hammer_ioc_prune_elm *copy_elms;
        struct hammer_ioc_prune_elm *user_elms;
        int error;
@@ -128,6 +131,12 @@ retry:
 
        while (error == 0) {
                /*
+                * Check for work
+                */
+               elm = &cursor.node->ondisk->elms[cursor.index].leaf;
+               prune->key_cur = elm->base;
+
+               /*
                 * Yield to more important tasks
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
@@ -137,22 +146,14 @@ retry:
                        tsleep(trans, 0, "hmrslo", hz / 10);
                        hammer_sync_lock_sh(trans);
                }
-               if (trans->hmp->locked_dirty_count +
-                   trans->hmp->io_running_count > hammer_limit_dirtybufs) {
-                       hammer_sync_unlock(trans);
-                       hammer_flusher_async(trans->hmp);
-                       tsleep(trans, 0, "hmrslo", hz / 10);
-                       hammer_sync_lock_sh(trans);
+               if (hammer_flusher_meta_limit(trans->hmp) ||
+                   hammer_flusher_undo_exhausted(trans, 2)) {
+                       error = EWOULDBLOCK;
+                       break;
                }
 
-               /*
-                * Check for work
-                */
-               elm = &cursor.node->ondisk->elms[cursor.index];
-               prune->key_cur = elm->base;
-
-               if (prune->stat_oldest_tid > elm->leaf.base.create_tid)
-                       prune->stat_oldest_tid = elm->leaf.base.create_tid;
+               if (prune->stat_oldest_tid > elm->base.create_tid)
+                       prune->stat_oldest_tid = elm->base.create_tid;
 
                if (hammer_debug_general & 0x0200) {
                        kprintf("check %016llx %016llx cre=%016llx del=%016llx\n",
@@ -162,7 +163,7 @@ retry:
                                        elm->base.delete_tid);
                }
                                
-               if (check_prune(prune, elm) == 0) {
+               if (prune_should_delete(prune, elm)) {
                        if (hammer_debug_general & 0x0200) {
                                kprintf("check %016llx %016llx: DELETE\n",
                                        elm->base.obj_id, elm->base.key);
@@ -194,6 +195,11 @@ retry:
                         */
                        cursor.flags |= HAMMER_CURSOR_ATEDISK;
                } else {
+                       /*
+                        * Nothing to delete, but we may have to check other
+                        * things.
+                        */
+                       prune_check_nlinks(&cursor, elm);
                        cursor.flags |= HAMMER_CURSOR_ATEDISK;
                        if (hammer_debug_general & 0x0100) {
                                kprintf("check %016llx %016llx: SKIP\n",
@@ -207,6 +213,10 @@ retry:
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
+       if (error == EWOULDBLOCK) {
+               hammer_flusher_sync(trans->hmp);
+               goto retry;
+       }
        if (error == EDEADLK)
                goto retry;
        if (error == EINTR) {
@@ -222,9 +232,11 @@ failed:
 
 /*
  * Check pruning list.  The list must be sorted in descending order.
+ *
+ * Return non-zero if the record should be deleted.
  */
 static int
-check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm)
+prune_should_delete(struct hammer_ioc_prune *prune, hammer_btree_leaf_elm_t elm)
 {
        struct hammer_ioc_prune_elm *scan;
        int i;
@@ -235,31 +247,15 @@ check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm)
         */
        if (prune->head.flags & HAMMER_IOC_PRUNE_ALL) {
                if (elm->base.delete_tid != 0)
-                       return(0);
-               return(-1);
+                       return(1);
+               return(0);
        }
 
        for (i = 0; i < prune->nelms; ++i) {
                scan = &prune->elms[i];
 
-#if 0
                /*
-                * Locate the scan index covering the create and delete TIDs.
-                */
-               if (*realign_cre < 0 &&
-                   elm->base.create_tid >= scan->beg_tid &&
-                   elm->base.create_tid < scan->end_tid) {
-                       *realign_cre = i;
-               }
-               if (*realign_del < 0 && elm->base.delete_tid &&
-                   elm->base.delete_tid > scan->beg_tid &&
-                   elm->base.delete_tid <= scan->end_tid) {
-                       *realign_del = i;
-               }
-#endif
-
-               /*
-                * Now check for loop termination.
+                * Check for loop termination.
                 */
                if (elm->base.create_tid >= scan->end_tid ||
                    elm->base.delete_tid > scan->end_tid) {
@@ -267,17 +263,32 @@ check_prune(struct hammer_ioc_prune *prune, hammer_btree_elm_t elm)
                }
 
                /*
-                * Now determine if we can delete the record.
+                * Determine if we can delete the record.
                 */
                if (elm->base.delete_tid &&
                    elm->base.create_tid >= scan->beg_tid &&
                    elm->base.delete_tid <= scan->end_tid &&
                    (elm->base.create_tid - scan->beg_tid) / scan->mod_tid ==
                    (elm->base.delete_tid - scan->beg_tid) / scan->mod_tid) {
-                       return(0);
+                       return(1);
                }
        }
-       return(-1);
+       return(0);
+}
+
+static
+void
+prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
+{
+       if (elm->base.rec_type != HAMMER_RECTYPE_INODE)
+               return;
+       if (elm->base.delete_tid != 0)
+               return;
+       if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA))
+               return;
+       if (cursor->data->inode.nlinks)
+               return;
+       kprintf("found disconnected inode %016llx\n", elm->base.obj_id);
 }
 
 #if 0
diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c
index a39dbb5..d5c24a0 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.21 2008/06/24 17:38:17 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.22 2008/06/27 20:56:59 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -126,10 +126,10 @@ retry:
                if (trans->hmp->sync_lock.wanted) {
                        tsleep(trans, 0, "hmrslo", hz / 10);
                }
-               if (trans->hmp->locked_dirty_count +
-                   trans->hmp->io_running_count > hammer_limit_dirtybufs) {
-                       hammer_flusher_async(trans->hmp);
-                       tsleep(trans, 0, "hmrslo", hz / 10);
+               if (hammer_flusher_meta_limit(trans->hmp) ||
+                   hammer_flusher_undo_exhausted(trans, 2)) {
+                       error = EWOULDBLOCK;
+                       break;
                }
 
                /*
@@ -149,6 +149,10 @@ retry:
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
+       if (error == EWOULDBLOCK) {
+               hammer_flusher_sync(trans->hmp);
+               goto retry;
+       }
        if (error == EDEADLK)
                goto retry;
        if (error == EINTR) {
diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c
index 6c3d645..a0364f9 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.25 2008/06/20 05:38:26 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.26 2008/06/27 20:56:59 dillon Exp $
  */
 
 #include "hammer.h"
@@ -65,6 +65,7 @@ hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        int error;
+       int reported = 0;
 
        /*
         * Examine the UNDO FIFO.  If it is empty the filesystem is clean
@@ -155,6 +156,22 @@ hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
                }
                scan_offset -= tail->tail_size;
                bytes -= tail->tail_size;
+
+               /*
+                * If too many dirty buffers have built up, flush them out.
+                */
+               if (hammer_flusher_meta_limit(hmp)) {
+                       if (hmp->ronly == 0) {
+                               hammer_recover_flush_buffers(hmp, root_volume,
+                                                            0);
+                               kprintf("HAMMER(%s) Continuing recovery\n",
+                                       root_volume->ondisk->vol_name);
+                       } else if (reported == 0) {
+                               reported = 1;
+                               kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
+                                       root_volume->ondisk->vol_name);
+                       }
+               }
        }
 done:
        if (buffer)
@@ -180,7 +197,7 @@ done:
         * be flushed out last.
         */
        if (hmp->ronly == 0 && error == 0)
-               hammer_recover_flush_buffers(hmp, root_volume);
+               hammer_recover_flush_buffers(hmp, root_volume, 1);
        kprintf("HAMMER(%s) End Recovery\n", root_volume->ondisk->vol_name);
        return (error);
 }
@@ -429,7 +446,8 @@ static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
 
 void
-hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume)
+hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
+                            int final)
 {
         /*
          * Flush the buffers out asynchronously, wait for all the I/O to
@@ -448,7 +466,7 @@ hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume)
        /*
         * Finally, deal with the volume header.
         */
-       if (root_volume->io.recovered) {
+       if (root_volume->io.recovered && final) {
                crit_enter();
                while (hmp->io_running_count)
                        tsleep(&hmp->io_running_count, 0, "hmrflx", 0);
diff --git a/sys/vfs/hammer/hammer_undo.c b/sys/vfs/hammer/hammer_undo.c
index 8041822..64b793e 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.17 2008/06/17 04:02:38 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.18 2008/06/27 20:56:59 dillon Exp $
  */
 
 /*
@@ -117,7 +117,7 @@ again:
        bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) +
                sizeof(struct hammer_fifo_undo) +
                sizeof(struct hammer_fifo_tail);
-       if (hammer_undo_space(hmp) < bytes + HAMMER_BUFSIZE*2)
+       if (hammer_undo_space(trans) < bytes + HAMMER_BUFSIZE*2)
                panic("hammer: insufficient undo FIFO space!");
 
        next_offset = undomap->next_offset;
@@ -251,37 +251,47 @@ hammer_clear_undo_history(hammer_mount_t hmp)
 }
 
 /*
- * Misc helper routines.  Return available space and total space.
+ * Return how much of the undo FIFO has been used.
+ *
+ * The calculation includes undo FIFO space still reserved from a previous
+ * flush (because it will still be run on recovery if a crash occurs and
+ * we can't overwrite it yet).
  */
 int64_t
-hammer_undo_used(hammer_mount_t hmp)
+hammer_undo_used(hammer_transaction_t trans)
 {
-       hammer_blockmap_t rootmap;
+       hammer_blockmap_t cundomap;
+       hammer_blockmap_t dundomap;
        int64_t max_bytes;
        int64_t bytes;
 
-       rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+       cundomap = &trans->hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+       dundomap = &trans->rootvol->ondisk->
+                               vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
 
-       if (rootmap->first_offset <= rootmap->next_offset) {
-               bytes = rootmap->next_offset - rootmap->first_offset;
+       if (dundomap->first_offset <= cundomap->next_offset) {
+               bytes = cundomap->next_offset - dundomap->first_offset;
        } else {
-               bytes = rootmap->alloc_offset - rootmap->first_offset +
-                       (rootmap->next_offset & HAMMER_OFF_LONG_MASK);
+               bytes = cundomap->alloc_offset - dundomap->first_offset +
+                       (cundomap->next_offset & HAMMER_OFF_LONG_MASK);
        }
-       max_bytes = rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK;
+       max_bytes = cundomap->alloc_offset & HAMMER_OFF_SHORT_MASK;
        KKASSERT(bytes <= max_bytes);
        return(bytes);
 }
 
+/*
+ * Return how much of the undo FIFO is available for new records.
+ */
 int64_t
-hammer_undo_space(hammer_mount_t hmp)
+hammer_undo_space(hammer_transaction_t trans)
 {
        hammer_blockmap_t rootmap;
        int64_t max_bytes;
 
-       rootmap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+       rootmap = &trans->hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
        max_bytes = rootmap->alloc_offset & HAMMER_OFF_SHORT_MASK;
-       return(max_bytes - hammer_undo_used(hmp));
+       return(max_bytes - hammer_undo_used(trans));
 }
 
 int64_t
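
To see why measuring from the on-disk first_offset matters, take a worked
example on a simplified 1024-byte ring (illustrative numbers): the last
finalized flush began at on-disk first_offset 100, the cached in-memory
first_offset has already advanced to 400, and next_offset is 600.  The
old calculation reported only 200 bytes used, even though a crash at that
moment would replay UNDOs starting at offset 100; the fixed calculation
reports 500 bytes used, protecting the [100,400) region:

    #include <stdint.h>
    #include <stdio.h>

    static int64_t
    used(int64_t first, int64_t next, int64_t alloc)
    {
            return (first <= next ? next - first : alloc - first + next);
    }

    int
    main(void)
    {
            printf("cached  first_offset: %lld used\n",
                   (long long)used(400, 600, 1024));   /* 200: unsafe */
            printf("on-disk first_offset: %lld used\n",
                   (long long)used(100, 600, 1024));   /* 500: safe  */
            return (0);
    }
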
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 49da550..4e859a3 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.54 2008/06/26 04:06:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.55 2008/06/27 20:56:59 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -320,7 +320,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                                hammer_adjust_volume_mode, NULL);
                        rootvol = hammer_get_root_volume(hmp, &error);
                        if (rootvol) {
-                               hammer_recover_flush_buffers(hmp, rootvol);
+                               hammer_recover_flush_buffers(hmp, rootvol, 1);
                                bcopy(rootvol->ondisk->vol0_blockmap,
                                      hmp->blockmap,
                                      sizeof(hmp->blockmap));