HAMMER 55: Performance tuning and bug fixes - MEDIA STRUCTURES CHANGED!
author: Matthew Dillon <dillon@dragonflybsd.org>
Sat, 14 Jun 2008 01:42:13 +0000 (01:42 +0000)
committer: Matthew Dillon <dillon@dragonflybsd.org>
Sat, 14 Jun 2008 01:42:13 +0000 (01:42 +0000)
* BUG-FIX: Fix a race in hammer_rel_mem_record() which could result in a
  machine lockup.  The code could block at an inappropriate time with both
  the record and a dependency inode pointer left unprotected.

* BUG-FIX: The direct-write code could assert on (*error != 0) due to an
  incorrect conditional in the in-memory record scanning code.

* Inode data and directory entry data has been given its own zone as a
  stop-gap until the low level allocator can be rewritten.

* Increase the directory object-id cache from 128 entries to 1024 entries.

* General cleanup.

* Introduce a separate reblocking domain for directories: 'hammer reblock-dirs'.

12 files changed:
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_ioctl.h
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_vnops.c

index ef1c5f7..872ccba 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.83 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.84 2008/06/14 01:42:12 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -155,7 +155,7 @@ TAILQ_HEAD(hammer_record_list, hammer_record);
  * directories to retain fairly localized object ids which in turn
  * improves reblocking performance and layout.
  */
-#define OBJID_CACHE_SIZE       128
+#define OBJID_CACHE_SIZE       1024
 #define OBJID_CACHE_BULK       100000
 
 typedef struct hammer_objid_cache {
@@ -865,7 +865,7 @@ void hammer_dup_buffer(struct hammer_buffer **bufferp,
                        struct hammer_buffer *buffer);
 hammer_node_t hammer_alloc_btree(hammer_transaction_t trans, int *errorp);
 void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
-                       hammer_off_t *data_offsetp,
+                       u_int16_t rec_type, hammer_off_t *data_offsetp,
                        struct hammer_buffer **data_bufferp, int *errorp);
 
 int hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
index 664ca7d..2940523 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.52 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.53 2008/06/14 01:42:13 dillon Exp $
  */
 
 /*
@@ -528,7 +528,7 @@ hammer_btree_lookup(hammer_cursor_t cursor)
        } else {
                error = btree_search(cursor, 0);
        }
-       if (error == 0 && cursor->flags)
+       if (error == 0)
                error = hammer_btree_extract(cursor, cursor->flags);
        return(error);
 }
@@ -602,7 +602,6 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags)
        elm = &node->elms[cursor->index];
        cursor->data = NULL;
        hmp = cursor->node->hmp;
-       flags |= cursor->flags & HAMMER_CURSOR_DATAEXTOK;
 
        /*
         * There is nothing to extract for an internal element.
index 0041466..c6829af 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.20 2008/06/07 07:41:51 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.21 2008/06/14 01:42:13 dillon Exp $
  */
 
 /*
@@ -126,7 +126,7 @@ typedef struct hammer_cursor *hammer_cursor_t;
 #define HAMMER_CURSOR_DISKEOF          0x0400
 #define HAMMER_CURSOR_MEMEOF           0x0800
 #define HAMMER_CURSOR_DELBTREE         0x1000  /* ip_delete from b-tree */
-#define HAMMER_CURSOR_DATAEXTOK                0x2000  /* allow data extension */
+#define HAMMER_CURSOR_UNUSED2000       0x2000
 #define HAMMER_CURSOR_ASOF             0x4000  /* as-of lookup */
 #define HAMMER_CURSOR_CREATE_CHECK     0x8000  /* as-of lookup */
 
index 3c1994d..eff5a99 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.35 2008/06/07 07:41:51 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.36 2008/06/14 01:42:13 dillon Exp $
  */
 
 #ifndef VFS_HAMMER_DISK_H_
@@ -132,7 +132,7 @@ typedef u_int32_t hammer_crc_t;
 #define HAMMER_ZONE_RESERVED06         0x6000000000000000ULL
 #define HAMMER_ZONE_RESERVED07         0x7000000000000000ULL
 #define HAMMER_ZONE_BTREE              0x8000000000000000ULL
-#define HAMMER_ZONE_RECORD             0x9000000000000000ULL
+#define HAMMER_ZONE_META               0x9000000000000000ULL
 #define HAMMER_ZONE_LARGE_DATA         0xA000000000000000ULL
 #define HAMMER_ZONE_SMALL_DATA         0xB000000000000000ULL
 #define HAMMER_ZONE_RESERVED0C         0xC000000000000000ULL
@@ -145,7 +145,7 @@ typedef u_int32_t hammer_crc_t;
 #define HAMMER_ZONE_UNDO_INDEX         3
 #define HAMMER_ZONE_FREEMAP_INDEX      4
 #define HAMMER_ZONE_BTREE_INDEX                8
-#define HAMMER_ZONE_RECORD_INDEX       9
+#define HAMMER_ZONE_META_INDEX         9
 #define HAMMER_ZONE_LARGE_DATA_INDEX   10
 #define HAMMER_ZONE_SMALL_DATA_INDEX   11
 
@@ -528,7 +528,7 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t;
 #define HAMMER_RECTYPE_LOWEST          1       /* lowest record type avail */
 #define HAMMER_RECTYPE_INODE           1       /* inode in obj_id space */
 #define HAMMER_RECTYPE_PSEUDO_INODE    2       /* pseudo filesysem */
-#define HAMMER_RECTYPE_CLUSTER         3       /* inter-cluster reference */
+#define HAMMER_RECTYPE_UNUSED03                3       /* inter-cluster reference */
 #define HAMMER_RECTYPE_DATA            0x0010
 #define HAMMER_RECTYPE_DIRENTRY                0x0011
 #define HAMMER_RECTYPE_DB              0x0012
index 9acbe16..268bc8d 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.26 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.27 2008/06/14 01:42:13 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -247,7 +247,7 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
        while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                KKASSERT(--panic_count > 0);
                KKASSERT(io->mod_list == &hmp->lose_list);
-               TAILQ_REMOVE(io->mod_list, io, mod_entry);
+               TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
                io->mod_list = NULL;
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
index fb74ed5..bdbc00b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.74 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.75 2008/06/14 01:42:13 dillon Exp $
  */
 
 #include "hammer.h"
@@ -42,7 +42,8 @@
 static int     hammer_unload_inode(struct hammer_inode *ip);
 static void    hammer_flush_inode_core(hammer_inode_t ip, int flags);
 static int     hammer_setup_child_callback(hammer_record_t rec, void *data);
-static int     hammer_setup_parent_inodes(hammer_record_t record);
+static int     hammer_setup_parent_inodes(hammer_inode_t ip);
+static int     hammer_setup_parent_inodes_helper(hammer_record_t record);
 static void    hammer_inode_wakereclaims(hammer_inode_t ip);
 
 #ifdef DEBUG_TRUNCATE
@@ -836,8 +837,7 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
 void
 hammer_flush_inode(hammer_inode_t ip, int flags)
 {
-       hammer_record_t depend;
-       int r, good;
+       int good;
 
        /*
         * Trivial 'nothing to flush' case.  If the inode is ina SETUP
@@ -873,14 +873,7 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                 * can't flush, 0 means there weren't any dependancies, and
                 * 1 means we have good connectivity.
                 */
-               good = 0;
-               TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
-                       r = hammer_setup_parent_inodes(depend);
-                       if (r < 0 && good == 0)
-                               good = -1;
-                       if (r > 0)
-                               good = 1;
-               }
+               good = hammer_setup_parent_inodes(ip);
 
                /*
                 * We can continue if good >= 0.  Determine how many records
@@ -912,9 +905,78 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
 }
 
 /*
+ * Scan ip->target_list, which is a list of records owned by PARENTS to our
+ * ip which reference our ip.
+ *
+ * XXX This is a huge mess of recursive code, but not one bit of it blocks
+ *     so for now do not ref/deref the structures.  Note that if we use the
+ *     ref/rel code later, the rel CAN block.
+ */
+static int
+hammer_setup_parent_inodes(hammer_inode_t ip)
+{
+       hammer_record_t depend;
+#if 0
+       hammer_record_t next;
+       hammer_inode_t  pip;
+#endif
+       int good;
+       int r;
+
+       good = 0;
+       TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
+               r = hammer_setup_parent_inodes_helper(depend);
+               KKASSERT(depend->target_ip == ip);
+               if (r < 0 && good == 0)
+                       good = -1;
+               if (r > 0)
+                       good = 1;
+       }
+       return(good);
+
+#if 0
+retry:
+       good = 0;
+       next = TAILQ_FIRST(&ip->target_list);
+       if (next) {
+               hammer_ref(&next->lock);
+               hammer_ref(&next->ip->lock);
+       }
+       while ((depend = next) != NULL) {
+               if (depend->target_ip == NULL) {
+                       pip = depend->ip;
+                       hammer_rel_mem_record(depend);
+                       hammer_rel_inode(pip, 0);
+                       goto retry;
+               }
+               KKASSERT(depend->target_ip == ip);
+               next = TAILQ_NEXT(depend, target_entry);
+               if (next) {
+                       hammer_ref(&next->lock);
+                       hammer_ref(&next->ip->lock);
+               }
+               r = hammer_setup_parent_inodes_helper(depend);
+               if (r < 0 && good == 0)
+                       good = -1;
+               if (r > 0)
+                       good = 1;
+               pip = depend->ip;
+               hammer_rel_mem_record(depend);
+               hammer_rel_inode(pip, 0);
+       }
+       return(good);
+#endif
+}
+
+/*
+ * This helper function takes a record representing the dependancy between
+ * the parent inode and child inode.
+ *
+ * record->ip          = parent inode
+ * record->target_ip   = child inode
+ * 
  * We are asked to recurse upwards and convert the record from SETUP
- * to FLUSH if possible.  record->ip is a parent of the caller's inode,
- * and record->target_ip is the caller's inode.
+ * to FLUSH if possible.
  *
  * Return 1 if the record gives us connectivity
  *
@@ -923,15 +985,15 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
  * Return -1 if we can't resolve the dependancy and there is no connectivity.
  */
 static int
-hammer_setup_parent_inodes(hammer_record_t record)
+hammer_setup_parent_inodes_helper(hammer_record_t record)
 {
-       hammer_mount_t hmp = record->ip->hmp;
-       hammer_record_t depend;
-       hammer_inode_t ip;
-       int r, good;
+       hammer_mount_t hmp;
+       hammer_inode_t pip;
+       int good;
 
        KKASSERT(record->flush_state != HAMMER_FST_IDLE);
-       ip = record->ip;
+       pip = record->ip;
+       hmp = pip->hmp;
 
        /*
         * If the record is already flushing, is it in our flush group?
@@ -943,7 +1005,7 @@ hammer_setup_parent_inodes(hammer_record_t record)
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
                if (record->flush_group != hmp->flusher.next) {
-                       ip->flags |= HAMMER_INODE_REFLUSH;
+                       pip->flags |= HAMMER_INODE_REFLUSH;
                        return(-1);
                }
                if (record->type == HAMMER_MEM_RECORD_ADD)
@@ -958,14 +1020,7 @@ hammer_setup_parent_inodes(hammer_record_t record)
         */
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 
-       good = 0;
-       TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
-               r = hammer_setup_parent_inodes(depend);
-               if (r < 0 && good == 0)
-                       good = -1;
-               if (r > 0)
-                       good = 1;
-       }
+       good = hammer_setup_parent_inodes(pip);
 
        /*
         * We can't flush ip because it has no connectivity (XXX also check
@@ -973,7 +1028,7 @@ hammer_setup_parent_inodes(hammer_record_t record)
         * recurses back down.
         */
        if (good < 0) {
-               ip->flags |= HAMMER_INODE_REFLUSH;
+               pip->flags |= HAMMER_INODE_REFLUSH;
                return(good);
        }
 
@@ -983,9 +1038,9 @@ hammer_setup_parent_inodes(hammer_record_t record)
         * may already be flushing.  The record must be in the same flush
         * group as the parent.
         */
-       if (ip->flush_state != HAMMER_FST_FLUSH)
-               hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
-       KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
+       if (pip->flush_state != HAMMER_FST_FLUSH)
+               hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
+       KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 
 #if 0
@@ -1004,13 +1059,19 @@ hammer_setup_parent_inodes(hammer_record_t record)
                return(-1);
        } else
 #endif
-       if (ip->flush_group == ip->hmp->flusher.next) {
+       if (pip->flush_group == pip->hmp->flusher.next) {
                /*
-                * This is the record we wanted to synchronize.
+                * This is the record we wanted to synchronize.  If the
+                * record went into a flush state while we blocked it 
+                * had better be in the correct flush group.
                 */
-               record->flush_state = HAMMER_FST_FLUSH;
-               record->flush_group = ip->flush_group;
-               hammer_ref(&record->lock);
+               if (record->flush_state != HAMMER_FST_FLUSH) {
+                       record->flush_state = HAMMER_FST_FLUSH;
+                       record->flush_group = pip->flush_group;
+                       hammer_ref(&record->lock);
+               } else {
+                       KKASSERT(record->flush_group == pip->flush_group);
+               }
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
 
@@ -1024,7 +1085,7 @@ hammer_setup_parent_inodes(hammer_record_t record)
                 * We couldn't resolve the dependancies, request that the
                 * inode be flushed when the dependancies can be resolved.
                 */
-               ip->flags |= HAMMER_INODE_REFLUSH;
+               pip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        }
 }
@@ -1571,6 +1632,8 @@ hammer_sync_inode(hammer_inode_t ip)
         * Records which are in our flush group can be unlinked from our
         * inode now, potentially allowing the inode to be physically
         * deleted.
+        *
+        * This cannot block.
         */
        nlinks = ip->ino_data.nlinks;
        next = TAILQ_FIRST(&ip->target_list);
index 5baeef1..d25783f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.40 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.41 2008/06/14 01:42:13 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -68,7 +68,7 @@ hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type)
 
 /*
  * Helper routine to disassociate a buffer cache buffer from an I/O
- * structure.  Called with the io structure exclusively locked.
+ * structure.
  *
  * The io may have 0 or 1 references depending on who called us.  The
  * caller is responsible for dealing with the refs.
@@ -95,8 +95,8 @@ hammer_io_disassociate(hammer_io_structure_t iou, int elseit)
        }
 
        /*
-        * elseit is 0 when called from the kernel path, the caller is
-        * holding the buffer locked and will deal with its final disposition.
+        * elseit is 0 when called from the kernel path when the io
+        * might have no references.
         */
        if (elseit) {
                KKASSERT(iou->io.released == 0);
@@ -684,6 +684,9 @@ hammer_io_complete(struct buf *bp)
 
        KKASSERT(iou->io.released == 1);
 
+       /*
+        * Deal with people waiting for I/O to drain
+        */
        if (iou->io.running) {
                --hammer_count_io_running_write;
                if (--iou->io.hmp->io_running_count == 0)
@@ -692,18 +695,14 @@ hammer_io_complete(struct buf *bp)
                iou->io.running = 0;
        }
 
-       /*
-        * If no lock references remain and we can acquire the IO lock and
-        * someone at some point wanted us to flush (B_LOCKED test), then
-        * try to dispose of the IO.
-        */
        if (iou->io.waiting) {
                iou->io.waiting = 0;
                wakeup(iou);
        }
 
        /*
-        * Someone wanted us to flush, try to clean out the buffer. 
+        * If B_LOCKED is set someone wanted to deallocate the bp at some
+        * point, do it now if refs has become zero.
         */
        if ((bp->b_flags & B_LOCKED) && iou->io.lock.refs == 0) {
                KKASSERT(iou->io.modified == 0);
index 859c94a..868ddd5 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.10 2008/05/31 18:37:57 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.11 2008/06/14 01:42:13 dillon Exp $
  */
 /*
  * HAMMER ioctl's.  This file can be #included from userland
@@ -59,10 +59,12 @@ struct hammer_ioc_head {
 #define HAMMER_IOC_DO_BTREE    0x00020000      /* reblocker */
 #define HAMMER_IOC_DO_INODES   0x00040000      /* reblocker */
 #define HAMMER_IOC_DO_DATA     0x00080000      /* reblocker */
+#define HAMMER_IOC_DO_DIRS     0x00100000      /* reblocker */
 
 #define HAMMER_IOC_DO_FLAGS    (HAMMER_IOC_DO_BTREE |  \
                                 HAMMER_IOC_DO_INODES | \
-                                HAMMER_IOC_DO_DATA)
+                                HAMMER_IOC_DO_DATA |   \
+                                HAMMER_IOC_DO_DIRS)
 
 /*
  * HAMMERIOC_PRUNE
index 4ea25b2..0795b22 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.67 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.68 2008/06/14 01:42:13 dillon Exp $
  */
 
 #include "hammer.h"
@@ -63,27 +63,14 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
        if (rec1->leaf.base.key > rec2->leaf.base.key)
                return(1);
 
-#if 0
-       /*
-        * XXX create_tid is set during sync, memory records are always
-        * current.  Do not match against create_tid.
-        */
-       if (rec1->leaf.base.create_tid == 0) {
-               if (rec2->leaf.base.create_tid == 0)
-                       return(0);
-               return(1);
-       }
-       if (rec2->leaf.base.create_tid == 0)
-               return(-1);
-
-       if (rec1->leaf.base.create_tid < rec2->leaf.base.create_tid)
-               return(-1);
-       if (rec1->leaf.base.create_tid > rec2->leaf.base.create_tid)
-               return(1);
-#endif
-
        /*
         * Never match against an item deleted by the front-end.
+        *
+        * rec1 is greater then rec2 if rec1 is marked deleted.
+        * rec1 is less then rec2 if rec2 is marked deleted.
+        *
+        * Multiple deleted records may be present, do not return 0
+        * if both are marked deleted.
         */
        if (rec1->flags & HAMMER_RECF_DELETED_FE)
                return(1);
@@ -109,28 +96,12 @@ hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
         if (elm->key > rec->leaf.base.key)
                 return(2);
 
-#if 0
-       /*
-        * XXX create_tid is set during sync, memory records are always
-        * current.  Do not match against create_tid.
-        */
-       if (elm->create_tid == 0) {
-               if (rec->leaf.base.create_tid == 0)
-                       return(0);
-               return(1);
-       }
-       if (rec->leaf.base.create_tid == 0)
-               return(-1);
-       if (elm->create_tid < rec->leaf.base.create_tid)
-               return(-1);
-       if (elm->create_tid > rec->leaf.base.create_tid)
-               return(1);
-#endif
        /*
         * Never match against an item deleted by the front-end.
+        * elm is less then rec if rec is marked deleted.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE)
-               return(1);
+               return(-1);
         return(0);
 }
 
@@ -164,24 +135,12 @@ hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec)
                        return(2);
        }
 
-#if 0
-       if (leaf->base.create_tid == 0) {
-               if (rec->leaf.base.create_tid == 0)
-                       return(0);
-               return(1);
-       }
-       if (rec->leaf.base.create_tid == 0)
-               return(-1);
-       if (leaf->base.create_tid < rec->leaf.base.create_tid)
-               return(-1);
-       if (leaf->base.create_tid > rec->leaf.base.create_tid)
-               return(1);
-#endif
        /*
         * Never match against an item deleted by the front-end.
+        * leaf is less then rec if rec is marked deleted.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE)
-               return(1);
+               return(-1);
         return(0);
 }
 
@@ -372,6 +331,9 @@ hammer_rel_mem_record(struct hammer_record *record)
                 * Upon release of the last reference wakeup any waiters.
                 * The record structure may get destroyed so callers will
                 * loop up and do a relookup.
+                *
+                * WARNING!  Record must be removed from RB-TREE before we
+                * might possibly block.  hammer_test_inode() can block!
                 */
                ip = record->ip;
 
@@ -380,13 +342,19 @@ hammer_rel_mem_record(struct hammer_record *record)
                 * is destroyed.
                 */
                if (record->flags & HAMMER_RECF_DELETED_FE) {
+                       KKASSERT(ip->lock.refs > 0);
                        KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
 
+                       /*
+                        * target_ip may have zero refs, we have to ref it
+                        * to prevent it from being ripped out from under
+                        * us.
+                        */
                        if ((target_ip = record->target_ip) != NULL) {
                                TAILQ_REMOVE(&target_ip->target_list,
                                             record, target_entry);
                                record->target_ip = NULL;
-                               hammer_test_inode(target_ip);
+                               hammer_ref(&target_ip->lock);
                        }
 
                        if (record->flags & HAMMER_RECF_ONRBTREE) {
@@ -410,6 +378,15 @@ hammer_rel_mem_record(struct hammer_record *record)
                                        hammer_test_inode(record->ip);
                                }
                        }
+
+                       /*
+                        * Do this test after removing record from the B-Tree.
+                        */
+                       if (target_ip) {
+                               hammer_test_inode(target_ip);
+                               hammer_rel_inode(target_ip, 0);
+                       }
+
                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
@@ -808,6 +785,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
        hammer_record_t record;
        hammer_record_t conflict;
        int zone;
+       int flags;
 
        /*
         * Deal with conflicting in-memory records.  We cannot have multiple
@@ -823,10 +801,9 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
                if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
                        conflict->flags |= HAMMER_RECF_WANTED;
                        tsleep(conflict, 0, "hmrrc3", 0);
-                       hammer_rel_mem_record(conflict);
-                       continue;
+               } else {
+                       conflict->flags |= HAMMER_RECF_DELETED_FE;
                }
-               conflict->flags |= HAMMER_RECF_DELETED_FE;
                hammer_rel_mem_record(conflict);
        }
 
@@ -859,6 +836,7 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
        record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
        record->leaf.data_len = bytes;
        record->leaf.data_crc = crc32(data, bytes);
+       flags = record->flags;
 
        hammer_ref(&record->lock);      /* mem_add eats a reference */
        *errorp = hammer_mem_add(record);
@@ -909,201 +887,6 @@ hammer_rec_trunc_callback(hammer_record_t record, void *data __unused)
        return(0);
 }
 
-
-/*
- * Backend code
- *
- * Sync data from a buffer cache buffer (typically) to the filesystem.  This
- * is called via the strategy called from a cached data source.  This code
- * is responsible for actually writing a data record out to the disk.
- *
- * This can only occur non-historically (i.e. 'current' data only).
- *
- * The file offset must be HAMMER_BUFSIZE aligned but the data length
- * can be truncated.  The record (currently) always represents a BUFSIZE
- * swath of space whether the data is truncated or not.
- */
-int
-hammer_ip_sync_data(hammer_cursor_t cursor, hammer_inode_t ip,
-                      int64_t offset, void *data, int bytes)
-{
-       hammer_transaction_t trans = cursor->trans;
-       struct hammer_btree_leaf_elm elm;
-       hammer_off_t data_offset;
-       void *bdata;
-       int error;
-       int aligned_bytes;
-
-       KKASSERT((offset & HAMMER_BUFMASK) == 0);
-       KKASSERT(trans->type == HAMMER_TRANS_FLS);
-       KKASSERT(bytes != 0);
-
-       /*
-        * We don't have to do this but it's probably a good idea to
-        * align data allocations to 64-byte boundaries for future
-        * expansion.
-        */
-       aligned_bytes = (bytes + 15) & ~15;
-retry:
-       hammer_normalize_cursor(cursor);
-       cursor->key_beg.localization = HAMMER_LOCALIZE_MISC;
-       cursor->key_beg.obj_id = ip->obj_id;
-       cursor->key_beg.key = offset + aligned_bytes;
-       cursor->key_beg.create_tid = trans->tid;
-       cursor->key_beg.delete_tid = 0;
-       cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
-       cursor->asof = trans->tid;
-       cursor->flags &= ~HAMMER_CURSOR_INITMASK;
-       cursor->flags |= HAMMER_CURSOR_INSERT;
-       cursor->flags |= HAMMER_CURSOR_BACKEND;
-
-       /*
-        * Issue a lookup to position the cursor.
-        */
-       error = hammer_btree_lookup(cursor);
-       if (error == 0) {
-               kprintf("hammer_ip_sync_data: duplicate data at "
-                       "(%lld,%d) tid %016llx\n",
-                       offset, aligned_bytes, trans->tid);
-               hammer_print_btree_elm(&cursor->node->ondisk->
-                                       elms[cursor->index],
-                                      HAMMER_BTREE_TYPE_LEAF, cursor->index);
-               panic("Duplicate data");
-               error = EIO;
-       }
-       if (error != ENOENT)
-               goto done;
-
-       /*
-        * Allocate our data.  The data buffer is not marked modified (yet)
-        */
-       bdata = hammer_alloc_data(trans, aligned_bytes, &data_offset,
-                                 &cursor->data_buffer, &error);
-
-       if (bdata == NULL)
-               goto done;
-
-       /*
-        * Fill everything in and insert our B-Tree node.
-        *
-        * NOTE: hammer_alloc_data() has already marked the data buffer
-        * as modified.  If we do it again we will generate unnecessary
-        * undo elements.
-        */
-       elm.base.btype = HAMMER_BTREE_TYPE_RECORD;
-       elm.base.localization = HAMMER_LOCALIZE_MISC;
-       elm.base.obj_id = ip->obj_id;
-       elm.base.key = offset + aligned_bytes;
-       elm.base.create_tid = trans->tid;
-       elm.base.delete_tid = 0;
-       elm.base.rec_type = HAMMER_RECTYPE_DATA;
-       elm.atime = 0;
-       elm.data_offset = data_offset;
-       elm.data_len = aligned_bytes;
-
-       /*
-        * Copy the data to the allocated buffer.  Since we are aligning
-        * the record size as specified in elm.data_len, make sure to zero
-        * out any extranious bytes.
-        */
-       hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
-       bcopy(data, bdata, bytes);
-       if (aligned_bytes > bytes)
-               bzero((char *)bdata + bytes, aligned_bytes - bytes);
-       hammer_modify_buffer_done(cursor->data_buffer);
-       elm.data_crc = crc32(bdata, aligned_bytes);
-
-       /*
-        * Data records can wind up on-disk before the inode itself is
-        * on-disk.  One must assume data records may be on-disk if either
-        * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
-        */
-       ip->flags |= HAMMER_INODE_DONDISK;
-
-       error = hammer_btree_insert(cursor, &elm);
-       if (error == 0)
-               goto done;
-
-       hammer_blockmap_free(trans, data_offset, aligned_bytes);
-done:
-       if (error == EDEADLK) {
-               hammer_done_cursor(cursor);
-               error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
-               if (error == 0)
-                       goto retry;
-       }
-       return(error);
-}
-
-#if 0
-
-/*
- * Backend code which actually performs the write to the media.  This
- * routine is typically called from the flusher.  The bio will be disposed
- * of (biodone'd) by this routine.
- *
- * Iterate the related records and mark for deletion.  If existing edge
- * records (left and right side) overlap our write they have to be marked
- * deleted and new records created, usually referencing a portion of the
- * original data.  Then add a record to represent the buffer.
- */
-int
-hammer_dowrite(hammer_cursor_t cursor, hammer_inode_t ip,
-              off_t file_offset, void *data, int bytes)
-{
-       int error;
-
-       KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
-
-       /*
-        * If the inode is going or gone, just throw away any frontend
-        * buffers.
-        */
-       if (ip->flags & HAMMER_INODE_DELETED)
-               return(0);
-
-       /*
-        * Delete any records overlapping our range.  This function will
-        * (eventually) properly truncate partial overlaps.
-        */
-       if (ip->sync_ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
-               error = hammer_ip_delete_range(cursor, ip, file_offset,
-                                              file_offset, 0);
-       } else {
-               error = hammer_ip_delete_range(cursor, ip, file_offset,
-                                              file_offset + bytes - 1, 0);
-       }
-
-       /*
-        * Add a single record to cover the write.  We can write a record
-        * with only the actual file data - for example, a small 200 byte
-        * file does not have to write out a 16K record.
-        *
-        * While the data size does not have to be aligned, we still do it
-        * to reduce fragmentation in a future allocation model.
-        */
-       if (error == 0) {
-               int limit_size;
-
-               if (ip->sync_ino_data.size - file_offset > bytes) {
-                           limit_size = bytes;
-               } else {
-                       limit_size = (int)(ip->sync_ino_data.size -
-                                          file_offset);
-                       KKASSERT(limit_size >= 0);
-               }
-               if (limit_size) {
-                       error = hammer_ip_sync_data(cursor, ip, file_offset,
-                                                   data, limit_size);
-               }
-       }
-       if (error)
-               Debugger("hammer_dowrite: error");
-       return(error);
-}
-
-#endif
-
 /*
  * Backend code.  Sync a record to the media.
  */
@@ -1240,6 +1023,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                 * Wholely cached record, with data.  Allocate the data.
                 */
                bdata = hammer_alloc_data(trans, record->leaf.data_len,
+                                         record->leaf.base.rec_type,
                                          &record->leaf.data_offset,
                                          &cursor->data_buffer, &error);
                if (bdata == NULL)
@@ -1576,10 +1360,8 @@ next_memory:
                        int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
                        int64_t base2 = cursor->iprec->leaf.base.key -
                                        cursor->iprec->leaf.data_len;
-                       if (base1 == base2) {
-                               kprintf("G");
+                       if (base1 == base2)
                                r = 0;
-                       }
                }
 
                if (r < 0) {
@@ -2030,6 +1812,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
                switch(data_offset & HAMMER_OFF_ZONE_MASK) {
                case HAMMER_ZONE_LARGE_DATA:
                case HAMMER_ZONE_SMALL_DATA:
+               case HAMMER_ZONE_META:
                        hammer_blockmap_free(cursor->trans,
                                             data_offset, data_len);
                        break;
index e232453..7c696bd 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.56 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.57 2008/06/14 01:42:13 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -529,6 +529,8 @@ again:
                 * cannot become loose once it gains a ref.  Loose
                 * buffers will never be in a modified state.  This should
                 * only occur on the 0->1 transition of refs.
+                *
+                * lose_list can be modified via a biodone() interrupt.
                 */
                if (buffer->io.mod_list == &hmp->lose_list) {
                        crit_enter();   /* biodone race against list */
@@ -549,6 +551,7 @@ again:
        switch(zone) {
        case HAMMER_ZONE_LARGE_DATA_INDEX:
        case HAMMER_ZONE_SMALL_DATA_INDEX:
+       case HAMMER_ZONE_META_INDEX:  /* meta-data isn't a meta-buffer */
                iotype = HAMMER_STRUCTURE_DATA_BUFFER;
                break;
        case HAMMER_ZONE_UNDO_INDEX:
@@ -739,10 +742,14 @@ hammer_ref_buffer(hammer_buffer_t buffer)
        hammer_ref(&buffer->io.lock);
 
        /*
+        * At this point a biodone() will not touch the buffer other than
+        * incidental bits.  However, lose_list can be modified via
+        * a biodone() interrupt.
+        *
         * No longer loose
         */
        if (buffer->io.mod_list == &buffer->io.hmp->lose_list) {
-               crit_enter();   /* biodone race against list */
+               crit_enter();
                TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry);
                buffer->io.mod_list = NULL;
                crit_exit();
@@ -1264,24 +1271,39 @@ hammer_alloc_btree(hammer_transaction_t trans, int *errorp)
  */
 void *
 hammer_alloc_data(hammer_transaction_t trans, int32_t data_len, 
-                 hammer_off_t *data_offsetp,
+                 u_int16_t rec_type, hammer_off_t *data_offsetp,
                  struct hammer_buffer **data_bufferp, int *errorp)
 {
        void *data;
+       int zone;
 
        /*
         * Allocate data
         */
        if (data_len) {
-               if (data_len < HAMMER_BUFSIZE) {
-                       *data_offsetp = hammer_blockmap_alloc(trans,
-                                               HAMMER_ZONE_SMALL_DATA_INDEX,
-                                               data_len, errorp);
-               } else {
-                       *data_offsetp = hammer_blockmap_alloc(trans,
-                                               HAMMER_ZONE_LARGE_DATA_INDEX,
-                                               data_len, errorp);
+               switch(rec_type) {
+               case HAMMER_RECTYPE_INODE:
+               case HAMMER_RECTYPE_PSEUDO_INODE:
+               case HAMMER_RECTYPE_DIRENTRY:
+               case HAMMER_RECTYPE_EXT:
+               case HAMMER_RECTYPE_FIX:
+                       zone = HAMMER_ZONE_META_INDEX;
+                       break;
+               case HAMMER_RECTYPE_DATA:
+               case HAMMER_RECTYPE_DB:
+                       if (data_len <= HAMMER_BUFSIZE / 2)
+                               zone = HAMMER_ZONE_SMALL_DATA_INDEX;
+                       else
+                               zone = HAMMER_ZONE_LARGE_DATA_INDEX;
+                       break;
+               default:
+                       panic("hammer_alloc_data: rec_type %04x unknown",
+                             rec_type);
+                       zone = 0;       /* NOT REACHED */
+                       break;
                }
+               *data_offsetp = hammer_blockmap_alloc(trans, zone,
+                                                     data_len, errorp);
        } else {
                *data_offsetp = 0;
        }
index 59c6d45..cc90465 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.17 2008/06/09 04:19:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.18 2008/06/14 01:42:13 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -165,9 +165,9 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock,
 {
        hammer_off_t tmp_offset;
        int error;
-       int zone;
        int bytes;
        int cur;
+       int iocflags;
 
        error = 0;
 
@@ -181,17 +181,42 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock,
        if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
                return(0);
        tmp_offset = elm->leaf.data_offset;
-       zone = HAMMER_ZONE_DECODE(tmp_offset);          /* can be 0 */
-       if ((zone == HAMMER_ZONE_SMALL_DATA_INDEX ||
-            zone == HAMMER_ZONE_LARGE_DATA_INDEX) &&
-           error == 0 && (reblock->head.flags & (HAMMER_IOC_DO_DATA | HAMMER_IOC_DO_INODES))) {
+       if (tmp_offset == 0)
+               goto skip;
+       if (error)
+               goto skip;
+
+       /*
+        * NOTE: Localization restrictions may also have been set-up, we can't
+        * just set the match flags willy-nilly here.
+        */
+       switch(elm->leaf.base.rec_type) {
+       case HAMMER_RECTYPE_INODE:
+       case HAMMER_RECTYPE_PSEUDO_INODE:
+               iocflags = HAMMER_IOC_DO_INODES;
+               break;
+       case HAMMER_RECTYPE_EXT:
+       case HAMMER_RECTYPE_FIX:
+       case HAMMER_RECTYPE_DIRENTRY:
+               iocflags = HAMMER_IOC_DO_DIRS;
+               break;
+       case HAMMER_RECTYPE_DATA:
+       case HAMMER_RECTYPE_DB:
+               iocflags = HAMMER_IOC_DO_DATA;
+               break;
+       default:
+               iocflags = 0;
+               break;
+       }
+       if (reblock->head.flags & iocflags) {
                ++reblock->data_count;
                reblock->data_byte_count += elm->leaf.data_len;
                bytes = hammer_blockmap_getfree(cursor->trans->hmp, tmp_offset,
                                                &cur, &error);
                if (hammer_debug_general & 0x4000)
                        kprintf("D %6d/%d\n", bytes, reblock->free_level);
-               if (error == 0 && cur == 0 && bytes >= reblock->free_level) {
+               if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
+                   bytes >= reblock->free_level) {
                        error = hammer_cursor_upgrade(cursor);
                        if (error == 0) {
                                error = hammer_reblock_data(reblock,
@@ -209,15 +234,15 @@ skip:
         * Reblock a B-Tree internal or leaf node.
         */
        tmp_offset = cursor->node->node_offset;
-       zone = HAMMER_ZONE_DECODE(tmp_offset);
-       if (zone == HAMMER_ZONE_BTREE_INDEX && cursor->index == 0 &&
+       if (cursor->index == 0 &&
            error == 0 && (reblock->head.flags & HAMMER_IOC_DO_BTREE)) {
                ++reblock->btree_count;
                bytes = hammer_blockmap_getfree(cursor->trans->hmp, tmp_offset,
                                                &cur, &error);
                if (hammer_debug_general & 0x4000)
                        kprintf("B %6d/%d\n", bytes, reblock->free_level);
-               if (error == 0 && cur == 0 && bytes >= reblock->free_level) {
+               if (error == 0 && (cur == 0 || reblock->free_level == 0) &&
+                   bytes >= reblock->free_level) {
                        error = hammer_cursor_upgrade(cursor);
                        if (error == 0) {
                                if (cursor->parent)
@@ -265,6 +290,7 @@ hammer_reblock_data(struct hammer_ioc_reblock *reblock,
        if (error)
                return (error);
        ndata = hammer_alloc_data(cursor->trans, elm->leaf.data_len,
+                                 elm->leaf.base.rec_type,
                                  &ndata_offset, &data_buffer, &error);
        if (error)
                goto done;
index 37a6d4f..5a995b6 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.69 2008/06/13 00:25:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.70 2008/06/14 01:42:13 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -1891,7 +1891,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = bio->bio_offset + 1;
        cursor.asof = ip->obj_asof;
-       cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
+       cursor.flags |= HAMMER_CURSOR_ASOF;
 
        cursor.key_end = cursor.key_beg;
        KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
@@ -2122,7 +2122,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
        if (cursor.key_beg.key < 0)
                cursor.key_beg.key = 0;
        cursor.asof = ip->obj_asof;
-       cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
+       cursor.flags |= HAMMER_CURSOR_ASOF;
 
        cursor.key_end = cursor.key_beg;
        KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);