HAMMER 62/Many: Stabilization, performance, and cleanup
authorMatthew Dillon <dillon@dragonflybsd.org>
Wed, 16 Jul 2008 18:30:59 +0000 (18:30 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Wed, 16 Jul 2008 18:30:59 +0000 (18:30 +0000)
* Fix 'hammer synctid'.  It was blocking until the next filesystem sync in
  some cases.  Also redo the flush sequencing for synctid so the inodes
  are flushed in large groups instead of individually, improving performance.

* Fix a case where reclaimed inodes were sometimes not being reclaimed on
  the backend in a timely fashion.  There are still some performance issues
  here.

* Fix a race in the buffer invalidation code that could cause an assertion.

* Remove a short-cut in hammer_checkspace() that was not taking into
  account reserved big-blocks and allowing operations to exhaust all
  free space on smaller media and then assert, instead of returning ENOSPC.

* Clean up the flush_group append code by tracking the append point.

* Clean up documentation in the inode flush path.

* Allow the reblocker to dig deeper into available free space when run
  with a low fill level (<= 20%).

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_ioctl.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_undo.c

index e2907e6..2145db5 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.118 2008/07/14 20:27:54 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.119 2008/07/16 18:30:59 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -312,7 +312,7 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_DDIRTY    0x0001  /* in-memory ino_data is dirty */
                                        /* (not including atime/mtime) */
 #define HAMMER_INODE_RSV_INODES        0x0002  /* hmp->rsv_inodes bumped */
-#define HAMMER_INODE_UNUSED0004        0x0004
+#define HAMMER_INODE_CONN_DOWN 0x0004  /* include in downward recursion */
 #define HAMMER_INODE_XDIRTY    0x0008  /* in-memory records */
 #define HAMMER_INODE_ONDISK    0x0010  /* inode is on-disk (else not yet) */
 #define HAMMER_INODE_FLUSH     0x0020  /* flush on last ref */
@@ -731,6 +731,7 @@ struct hammer_mount {
        TAILQ_HEAD(, hammer_undo)  undo_lru_list;
        TAILQ_HEAD(, hammer_reserve) delay_list;
        struct hammer_flush_group_list  flush_group_list;
+       hammer_flush_group_t    next_flush_group;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
        TAILQ_HEAD(, hammer_reclaim) reclaim_list;
 };
@@ -754,6 +755,7 @@ struct hammer_sync_info {
 #define HAMMER_CHKSPC_WRITE    20
 #define HAMMER_CHKSPC_CREATE   20
 #define HAMMER_CHKSPC_REMOVE   10
+#define HAMMER_CHKSPC_EMERGENCY        0
 
 #if defined(_KERNEL)
 
@@ -1051,7 +1053,7 @@ int hammer_reload_inode(hammer_inode_t ip, void *arg __unused);
 int hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2);
 
 int hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip);
-void hammer_test_inode(hammer_inode_t ip);
+void hammer_test_inode(hammer_inode_t dip);
 void hammer_inode_unloadable_check(hammer_inode_t ip, int getvp);
 
 int  hammer_ip_add_directory(struct hammer_transaction *trans,
index 6d117d6..54fe2a3 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.24 2008/07/14 03:20:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.25 2008/07/16 18:30:59 dillon Exp $
  */
 
 /*
@@ -1080,29 +1080,19 @@ hammer_checkspace(hammer_mount_t hmp, int slop)
        const int rec_size = (sizeof(union hammer_btree_elm) * 2);
        int64_t usedbytes;
 
-       /*
-        * Hopefully a quick and fast check.
-        */
-       if (hmp->copy_stat_freebigblocks * HAMMER_LARGEBLOCK_SIZE >=
-           (int64_t)hidirtybufspace * 4 + 10 * HAMMER_LARGEBLOCK_SIZE) {
-               hammer_count_extra_space_used = -1;
-               return(0);
-       }
-
-       /*
-        * Do a more involved check
-        */
        usedbytes = hmp->rsv_inodes * in_size +
                    hmp->rsv_recs * rec_size +
                    hmp->rsv_databytes +
-                   hmp->rsv_fromdelay * HAMMER_LARGEBLOCK_SIZE +
-                   hidirtybufspace +
-                   slop * HAMMER_LARGEBLOCK_SIZE;
+                   ((int64_t)hmp->rsv_fromdelay << HAMMER_LARGEBLOCK_BITS) +
+                   ((int64_t)hidirtybufspace << 2) +
+                   (slop << HAMMER_LARGEBLOCK_BITS);
 
-       hammer_count_extra_space_used = usedbytes;
+       hammer_count_extra_space_used = usedbytes;      /* debugging */
 
-       if (hmp->copy_stat_freebigblocks >= usedbytes / HAMMER_LARGEBLOCK_SIZE)
+       if (hmp->copy_stat_freebigblocks >=
+           (usedbytes >> HAMMER_LARGEBLOCK_BITS)) {
                return(0);
+       }
        return (ENOSPC);
 }
 
index fbdd389..d78ba72 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.41 2008/07/14 20:27:54 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.42 2008/07/16 18:30:59 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -275,10 +275,17 @@ hammer_flusher_flush(hammer_mount_t hmp)
                        hammer_flusher_finalize(&hmp->flusher.trans, 0);
 
                /*
+                * Ok, we are running this flush group now (this prevents new
+                * additions to it).
+                */
+               flg->running = 1;
+               if (hmp->next_flush_group == flg)
+                       hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+
+               /*
                 * Iterate the inodes in the flg's flush_list and assign
                 * them to slaves.
                 */
-               flg->running = 1;
                slave_index = 0;
                info = TAILQ_FIRST(&hmp->flusher.ready_list);
                next_ip = TAILQ_FIRST(&flg->flush_list);
index bcdbb39..081fde7 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103 2008/07/14 03:20:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.104 2008/07/16 18:30:59 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1311,23 +1311,21 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
        int good;
 
        /*
-        * Setup a flush group.  It remains cached so it is ok if we
-        * wind up not flushing the inode.
+        * next_flush_group is the first flush group we can place the inode
+        * in.  It may be NULL.  If it becomes full we append a new flush
+        * group and make that the next_flush_group.
         */
        hmp = ip->hmp;
-       flg = TAILQ_LAST(&ip->hmp->flush_group_list, hammer_flush_group_list);
-
-       if (flg) {
-               if (flg->running) {
-                       flg = NULL;
-               } else if (flg->total_count + flg->refs >
-                          ip->hmp->undo_rec_limit) {
-                       hammer_flusher_async(ip->hmp, flg);
-                       flg = NULL;
-               }
+       while ((flg = hmp->next_flush_group) != NULL) {
+               KKASSERT(flg->running == 0);
+               if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
+                       break;
+               hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+               hammer_flusher_async(ip->hmp, flg);
        }
        if (flg == NULL) {
                flg = kmalloc(sizeof(*flg), M_HAMMER, M_WAITOK|M_ZERO);
+               hmp->next_flush_group = flg;
                TAILQ_INIT(&flg->flush_list);
                TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
        }
@@ -1373,21 +1371,26 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                 */
                good = hammer_setup_parent_inodes(ip, flg);
 
-               /*
-                * We can continue if good >= 0.  Determine how many records
-                * under our inode can be flushed (and mark them).
-                */
                if (good >= 0) {
+                       /*
+                        * We can continue if good >= 0.  Determine how 
+                        * many records under our inode can be flushed (and
+                        * mark them).
+                        */
                        hammer_flush_inode_core(ip, flg, flags);
                } else {
-                       ip->flags |= HAMMER_INODE_REFLUSH;
+                       /*
+                        * parent has no connectivity, tell it to flush
+                        * us as soon as it does.
+                        */
+                       ip->flags |= HAMMER_INODE_CONN_DOWN;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp, flg);
                        }
                }
                break;
-       default:
+       case HAMMER_FST_FLUSH:
                /*
                 * We are already flushing, flag the inode to reflush
                 * if needed after it completes its current flush.
@@ -1466,13 +1469,23 @@ hammer_setup_parent_inodes_helper(hammer_record_t record,
         * allow the operation yet anyway (the second return -1).
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
+               /*
+                * If not in our flush group ask the parent to reflush
+                * us as soon as possible.
+                */
                if (record->flush_group != flg) {
                        pip->flags |= HAMMER_INODE_REFLUSH;
+                       record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                        return(-1);
                }
+
+               /*
+                * If in our flush group everything is already set up,
+                * just return whether the record will improve our
+                * visibility or not.
+                */
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
-               /* GENERAL or DEL */
                return(0);
        }
 
@@ -1485,12 +1498,14 @@ hammer_setup_parent_inodes_helper(hammer_record_t record,
        good = hammer_setup_parent_inodes(pip, flg);
 
        /*
-        * We can't flush ip because it has no connectivity (XXX also check
-        * nlinks for pre-existing connectivity!).  Flag it so any resolution
-        * recurses back down.
+        * If good < 0 the parent has no connectivity and we cannot safely
+        * flush the directory entry, which also means we can't flush our
+        * ip.  Flag the parent and us for downward recursion once the
+        * parent's connectivity is resolved.
         */
        if (good < 0) {
-               pip->flags |= HAMMER_INODE_REFLUSH;
+               /* pip->flags |= HAMMER_INODE_CONN_DOWN; set by recursion */
+               record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                return(good);
        }
 
@@ -1523,32 +1538,34 @@ hammer_setup_parent_inodes_helper(hammer_record_t record,
 #endif
        if (pip->flush_group == flg) {
                /*
-                * This is the record we wanted to synchronize.  If the
-                * record went into a flush state while we blocked it 
-                * had better be in the correct flush group.
+                * If the parent is in the same flush group as us we can
+                * just set the record to a flushing state and we are
+                * done.
                 */
-               if (record->flush_state != HAMMER_FST_FLUSH) {
-                       record->flush_state = HAMMER_FST_FLUSH;
-                       record->flush_group = pip->flush_group;
-                       ++record->flush_group->refs;
-                       hammer_ref(&record->lock);
-               } else {
-                       KKASSERT(record->flush_group == pip->flush_group);
-               }
-               if (record->type == HAMMER_MEM_RECORD_ADD)
-                       return(1);
+               record->flush_state = HAMMER_FST_FLUSH;
+               record->flush_group = flg;
+               ++record->flush_group->refs;
+               hammer_ref(&record->lock);
 
                /*
-                * A general or delete-on-disk record does not contribute
-                * to our visibility.  We can still flush it, however.
+                * A general directory-add contributes to our visibility.
+                *
+                * Otherwise it is probably a directory-delete or 
+                * delete-on-disk record and does not contribute to our
+                * visibility (but we can still flush it).
                 */
+               if (record->type == HAMMER_MEM_RECORD_ADD)
+                       return(1);
                return(0);
        } else {
                /*
-                * We couldn't resolve the dependancies, request that the
-                * inode be flushed when the dependancies can be resolved.
+                * If the parent is not in our flush group we cannot
+                * flush this record yet, there is no visibility.
+                * We tell the parent to reflush and mark ourselves
+                * so the parent knows it should flush us too.
                 */
                pip->flags |= HAMMER_INODE_REFLUSH;
+               record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                return(-1);
        }
 }
@@ -1772,8 +1789,10 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                /*
                 * The record has a setup dependancy.  These are typically
                 * directory entry adds and deletes.  Such entries will be
-                * flushed when their inodes are flushed so we do not have
-                * to add them to the flush here.
+                * flushed when their inodes are flushed so we do not
+                * usually have to add them to the flush here.  However,
+                * if the target_ip has set HAMMER_INODE_CONN_DOWN then
+                * it is asking us to flush this record (and it).
                 */
                target_ip = rec->target_ip;
                KKASSERT(target_ip != NULL);
@@ -1800,15 +1819,26 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                /*
                 * Target IP is not yet flushing.  This can get complex
                 * because we have to be careful about the recursion.
+                *
+                * Directories create an issue for us in that if a flush
+                * of a directory is requested the expectation is to flush
+                * any pending directory entries, but this will cause the
+                * related inodes to recursively flush as well.  We can't
+                * really defer the operation so just get as many as we
+                * can and
                 */
+#if 0
                if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
-                   (target_ip->flags & HAMMER_INODE_REFLUSH) == 0) {
+                   (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
                        /*
-                        * We aren't reclaiming or trying to flush target_ip.
-                        * Let the record flush with the target.
+                        * We aren't reclaiming and the target ip was not
+                        * previously prevented from flushing due to this
+                        * record dependancy.  Do not flush this record.
                         */
                        /*r = 0;*/
-               } else if (flg->total_count + flg->refs >
+               } else
+#endif
+               if (flg->total_count + flg->refs >
                           ip->hmp->undo_rec_limit) {
                        /*
                         * Our flush group is over-full and we risk blowing
@@ -2029,6 +2059,12 @@ hammer_flush_inode_done(hammer_inode_t ip)
        }
 
        /*
+        * If we have no parent dependancies we can clear CONN_DOWN
+        */
+       if (TAILQ_EMPTY(&ip->target_list))
+               ip->flags &= ~HAMMER_INODE_CONN_DOWN;
+
+       /*
         * If the inode is now clean drop the space reservation.
         */
        if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
@@ -2568,8 +2604,8 @@ hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
 }
 
 /*
- * Re-test an inode when a dependancy had gone away to see if we
- * can chain flush it.
+ * After potentially resolving a dependancy the inode is tested
+ * to determine whether it needs to be reflushed.
  */
 void
 hammer_test_inode(hammer_inode_t ip)
index 6820de0..364dc18 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.28 2008/07/12 23:04:50 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.29 2008/07/16 18:30:59 dillon Exp $
  */
 
 #include "hammer.h"
@@ -363,8 +363,8 @@ hammer_ioc_synctid(hammer_transaction_t trans, hammer_inode_t ip,
                break;
        case HAMMER_SYNCTID_ASYNC:
                hammer_queue_inodes_flusher(hmp, MNT_NOWAIT);
-               std->tid = hmp->flusher.tid;    /* inaccurate */
                hammer_flusher_async(hmp, NULL);
+               std->tid = hmp->flusher.tid;    /* inaccurate */
                break;
        case HAMMER_SYNCTID_SYNC1:
                hammer_queue_inodes_flusher(hmp, MNT_WAIT);
index 8df8525..c259e70 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.69 2008/07/14 03:20:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.70 2008/07/16 18:30:59 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -686,12 +686,14 @@ hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
                buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
                                   base_offset);
                if (buffer) {
-                       KKASSERT(buffer->zone2_offset == zone2_offset);
-                       hammer_io_clear_modify(&buffer->io, 1);
-                       buffer->io.reclaim = 1;
-                       KKASSERT(buffer->volume == volume);
-                       if (buffer->io.lock.refs == 0)
-                               hammer_unload_buffer(buffer, NULL);
+                       error = hammer_ref_buffer(buffer);
+                       if (error == 0) {
+                               KKASSERT(buffer->zone2_offset == zone2_offset);
+                               hammer_io_clear_modify(&buffer->io, 1);
+                               buffer->io.reclaim = 1;
+                               KKASSERT(buffer->volume == volume);
+                               hammer_rel_buffer(buffer, 0);
+                       }
                } else {
                        hammer_io_inval(volume, zone2_offset);
                }
@@ -1383,6 +1385,9 @@ hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
 
 /*
  * Sync dirty buffers to the media and clean-up any loose ends.
+ *
+ * These functions do not start the flusher going, they simply
+ * queue everything up to the flusher.
  */
 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
@@ -1460,7 +1465,7 @@ hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
             RB_EMPTY(&vp->v_rbdirty_tree))) {
                return(0);
        }
-       error = VOP_FSYNC(vp, info->waitfor);
+       error = VOP_FSYNC(vp, MNT_NOWAIT);
        if (error)
                info->error = error;
        return(0);
index 727bdea..8e3c38b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.32 2008/07/14 03:20:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.33 2008/07/16 18:30:59 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -64,6 +64,16 @@ hammer_ioc_reblock(hammer_transaction_t trans, hammer_inode_t ip,
        int checkspace_count;
        int error;
        int seq;
+       int slop;
+
+       /*
+        * A fill level <= 20% is considered an emergency.  free_level is
+        * inverted from fill_level.
+        */
+       if (reblock->free_level >= HAMMER_LARGEBLOCK_SIZE * 8 / 10)
+               slop = HAMMER_CHKSPC_EMERGENCY;
+       else
+               slop = HAMMER_CHKSPC_REBLOCK;
 
        if ((reblock->key_beg.localization | reblock->key_end.localization) &
            HAMMER_LOCALIZE_PSEUDOFS_MASK) {
@@ -134,7 +144,7 @@ retry:
                 * If there is insufficient free space it may be due to
                 * reserved bigblocks, which flushing might fix.
                 */
-               if (hammer_checkspace(trans->hmp, HAMMER_CHKSPC_REBLOCK)) {
+               if (hammer_checkspace(trans->hmp, slop)) {
                        if (++checkspace_count == 10) {
                                error = ENOSPC;
                                break;
@@ -143,6 +153,7 @@ retry:
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
                        seq = hammer_flusher_async(trans->hmp, NULL);
+                       continue;
                }
 
                /*
index 64b793e..0aa60bf 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.18 2008/06/27 20:56:59 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_undo.c,v 1.19 2008/07/16 18:30:59 dillon Exp $
  */
 
 /*
@@ -128,7 +128,6 @@ again:
        if (undomap->next_offset == undomap->alloc_offset) {
                next_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
                undomap->next_offset = next_offset;
-               hkprintf("undo zone's next_offset wrapped\n");
        }
 
        /*