hammer2 - bug fixes
author     Matthew Dillon <dillon@apollo.backplane.com>
Mon, 31 Aug 2015 02:55:53 +0000 (19:55 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Mon, 31 Aug 2015 03:46:27 +0000 (20:46 -0700)
* When a dirty inode is reclaimed, meta-data changes stored in the inode
  structure could sometimes be lost.  These changes must be synchronized
  to the chains in the underlying cluster nodes.

  Repurpose the unlinkq into a more general 'sideq' that handles both
  deletion-on-reclaim and dirty-meta-data-on-reclaim.

  Add a flag to the inode to indicate when it is on the sideq, preventing
  it from being queued twice.
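
  The pattern is roughly: the reclaim path queues the inode (once, guarded
  by the new flag) and the next flush drains the queue, destroying unlinked
  inodes and syncing dirty ones.  Below is a minimal userspace sketch of
  that idea; the types, names, and locking are simplified stand-ins, not
  the actual hammer2 kernel code.

    /*
     * Minimal sketch of the sideq pattern (illustrative only).
     * hammer2's real hammer2_inode_t, spinlocks, and transactions are
     * replaced with trivial stand-ins; names here are hypothetical.
     */
    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define INODE_ISUNLINKED        0x0040
    #define INODE_ONSIDEQ           0x0100  /* already queued */

    struct inode {
            int     flags;
            int     inum;
    };

    struct inode_sideq {
            TAILQ_ENTRY(inode_sideq) entry;
            struct inode *ip;
    };
    TAILQ_HEAD(sideq_list, inode_sideq);

    static struct sideq_list sideq = TAILQ_HEAD_INITIALIZER(sideq);

    /* reclaim path: queue the inode exactly once */
    static void
    reclaim_queue(struct inode *ip)
    {
            struct inode_sideq *ipul;

            if (ip->flags & INODE_ONSIDEQ)  /* flag prevents double add */
                    return;
            ipul = calloc(1, sizeof(*ipul));
            ipul->ip = ip;
            ip->flags |= INODE_ONSIDEQ;
            TAILQ_INSERT_TAIL(&sideq, ipul, entry);
    }

    /* flush path: drain the queue, dispatching on the reason */
    static void
    run_sideq(void)
    {
            struct inode_sideq *ipul;

            while ((ipul = TAILQ_FIRST(&sideq)) != NULL) {
                    TAILQ_REMOVE(&sideq, ipul, entry);
                    ipul->ip->flags &= ~INODE_ONSIDEQ;
                    if (ipul->ip->flags & INODE_ISUNLINKED)
                            printf("inode %d: destroy\n", ipul->ip->inum);
                    else
                            printf("inode %d: sync meta-data\n", ipul->ip->inum);
                    free(ipul);
            }
    }

    int
    main(void)
    {
            struct inode a = { .flags = INODE_ISUNLINKED, .inum = 1 };
            struct inode b = { .flags = 0, .inum = 2 };

            reclaim_queue(&a);
            reclaim_queue(&a);      /* no-op: ONSIDEQ already set */
            reclaim_queue(&b);
            run_sideq();
            return 0;
    }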

* Enable the free->allocated transition in the bulkfree code.  Add
  additional statistics and an underflow/overflow check for
  hammer2_bmap_data->avail.  Also adjust the volume free space
  in both directions.  Do not update any live field if 'nofree' is
  set for the chain (~1GB granularity representing possible
  frontend/backend races).
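
  To make the new transition concrete: each storage block is tracked by a
  2-bit code (00 free, 10 staged/possibly-free, 11 allocated), and the sync
  pass compares the in-memory result against the live freemap two bits at a
  time, clamping 'avail' so it can neither underflow nor exceed the level-0
  size.  The sketch below is hedged: the constants, the nofree handling, and
  the omission of the chain/class/linear bookkeeping are simplifications,
  not the real freemap layout.

    /*
     * Sketch of the two-bit freemap adjustment with the added
     * free->allocated direction and the avail clamps (illustrative
     * only; BLOCK_SIZE/LEVEL0_SIZE are stand-in values).
     */
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK_SIZE      16384                   /* per 2-bit entry */
    #define LEVEL0_SIZE     (BLOCK_SIZE * 32)       /* one 64-bit word */

    /* 2-bit states: 00 free, 10 staged (possibly free), 11 allocated */
    static void
    sync_adjust(uint64_t *live_bitmap, uint32_t *live_avail,
                uint64_t mem_bitmap, int nofree)
    {
            uint64_t lmask = *live_bitmap;
            uint64_t mmask = mem_bitmap;
            int scount;

            for (scount = 0; scount < 64; scount += 2) {
                    if ((mmask & 3) == 0 && nofree == 0) {
                            /* in-memory free: live 11->10, 10->00 */
                            if ((lmask & 3) == 2) {
                                    *live_bitmap &= ~((uint64_t)3 << scount);
                                    *live_avail += BLOCK_SIZE;
                                    if (*live_avail > LEVEL0_SIZE)
                                            *live_avail = LEVEL0_SIZE;
                            } else if ((lmask & 3) == 3) {
                                    *live_bitmap &= ~((uint64_t)1 << scount);
                            }
                    } else if ((mmask & 3) == 3) {
                            /* in-memory allocated: remark live as 11 */
                            if ((lmask & 3) == 0) {
                                    *live_avail -= BLOCK_SIZE;
                                    if ((int32_t)*live_avail < 0)
                                            *live_avail = 0;
                            }
                            *live_bitmap |= (uint64_t)3 << scount;
                    }
                    mmask >>= 2;
                    lmask >>= 2;
            }
    }

    int
    main(void)
    {
            uint64_t live = 0x0;                    /* entry 0 free */
            uint32_t avail = LEVEL0_SIZE;

            /* in-memory says entry 0 is allocated: live 00->11, avail drops */
            sync_adjust(&live, &avail, 0x3, 0);
            printf("live=%#jx avail=%u\n", (uintmax_t)live, avail);
            return 0;
    }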

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_bulkfree.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_strategy.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index a700df3..4109a07 100644
@@ -729,6 +729,7 @@ typedef struct hammer2_inode hammer2_inode_t;
 #define HAMMER2_INODE_ISDELETED                0x0020  /* deleted, not in ihidden */
 #define HAMMER2_INODE_ISUNLINKED       0x0040
 #define HAMMER2_INODE_METAGOOD         0x0080  /* inode meta-data good */
+#define HAMMER2_INODE_ONSIDEQ          0x0100  /* on side processing queue */
 
 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
@@ -737,13 +738,13 @@ RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
 /*
  * inode-unlink side-structure
  */
-struct hammer2_inode_unlink {
-       TAILQ_ENTRY(hammer2_inode_unlink) entry;
+struct hammer2_inode_sideq {
+       TAILQ_ENTRY(hammer2_inode_sideq) entry;
        hammer2_inode_t *ip;
 };
-TAILQ_HEAD(h2_unlk_list, hammer2_inode_unlink);
+TAILQ_HEAD(h2_sideq_list, hammer2_inode_sideq);
 
-typedef struct hammer2_inode_unlink hammer2_inode_unlink_t;
+typedef struct hammer2_inode_sideq hammer2_inode_sideq_t;
 
 /*
  * Transaction management sub-structure under hammer2_pfs
@@ -1139,7 +1140,7 @@ struct hammer2_pfs {
        uint32_t                inmem_dirty_chains;
        int                     count_lwinprog; /* logical write in prog */
        struct spinlock         list_spin;
-       struct h2_unlk_list     unlinkq;        /* last-close unlink */
+       struct h2_sideq_list    sideq;          /* last-close dirty/unlink */
        hammer2_thread_t        sync_thrs[HAMMER2_MAXCLUSTER];
        uint32_t                cluster_flags;  /* cached cluster flags */
        int                     has_xop_threads;
@@ -1327,7 +1328,7 @@ void hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
 void hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
                        int idx);
 void hammer2_inode_modify(hammer2_inode_t *ip);
-void hammer2_inode_run_unlinkq(hammer2_pfs_t *pmp);
+void hammer2_inode_run_sideq(hammer2_pfs_t *pmp);
 
 hammer2_inode_t *hammer2_inode_create(hammer2_inode_t *dip,
                        struct vattr *vap, struct ucred *cred,
diff --git a/sys/vfs/hammer2/hammer2_bulkfree.c b/sys/vfs/hammer2/hammer2_bulkfree.c
index 2b1cb11..059f534 100644
@@ -65,9 +65,11 @@ typedef struct hammer2_bulkfree_info {
        hammer2_off_t           sstop;
        hammer2_bmap_data_t     *bmap;
        int                     depth;
-       long                    count_10_00;
-       long                    count_11_10;
-       long                    count_10_11;
+       long                    count_10_00;    /* staged->free      */
+       long                    count_11_10;    /* allocated->staged */
+       long                    count_00_11;    /* (should not happen) */
+       long                    count_01_11;    /* (should not happen) */
+       long                    count_10_11;    /* staged->allocated */
        long                    count_l0cleans;
        long                    count_linadjusts;
        long                    count_inodes_scanned;
@@ -410,7 +412,9 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 
        kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
        kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
-       kprintf("    raced on           %ld\n", cbinfo.count_10_11);
+       kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
+       kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
+       kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
        kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
        kprintf("    linear adjusts     %ld\n", cbinfo.count_linadjusts);
        kprintf("    dedup factor       %ld\n", cbinfo.count_dedup_factor);
@@ -591,6 +595,10 @@ h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
        live_chain = NULL;
        nofree = 1;     /* safety */
 
+       /*
+        * Iterate each hammer2_bmap_data_t line (128 bytes) managing
+        * 4MB of storage.
+        */
        while (data_off < cbinfo->sstop) {
                /*
                 * The freemap is not used below allocator_beg or beyond
@@ -619,7 +627,8 @@ h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
                                            HAMMER2_LOOKUP_ALWAYS);
                        /*
                         * If recent allocations were made we avoid races by
-                        * not freeing any blocks.
+                        * not staging or freeing any blocks.  We can still
+                        * remark blocks as fully allocated.
                         */
                        if (live_chain) {
                                kprintf("live_chain %016jx\n", (intmax_t)key);
@@ -636,6 +645,11 @@ h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
                                        
                }
                if (live_chain == NULL) {
+                       /*
+                        * XXX if we implement a full recovery mode we need
+                        * to create/recreate missing freemap chains if our
+                        * bmap has any allocated blocks.
+                        */
                        if (bmap->class &&
                            bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
                                kprintf("hammer2_bulkfree: cannot locate "
@@ -661,13 +675,10 @@ h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
                live = &live_chain->data->bmdata[bmapindex];
 
                /*
-                * For now just handle the 11->10, 10->00, and 10->11
-                * transitions.
+                * TODO - we could shortcut this by testing that both
+                * live->class and bmap->class are 0, and both avails are
+                * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
                 */
-               if (live->class == 0 ||
-                   live->avail == HAMMER2_FREEMAP_LEVEL0_SIZE) {
-                       goto next;
-               }
                if (bcmp(live->bitmapq, bmap->bitmapq,
                         sizeof(bmap->bitmapq)) == 0) {
                        goto next;
@@ -677,6 +688,7 @@ h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
                        data_off, bmapindex, live->class, live->avail);
 
                hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);
+
                h2_bulkfree_sync_adjust(cbinfo, live, bmap, nofree);
 next:
                data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
@@ -722,6 +734,10 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                                /*
                                 * in-memory 00         live 11 -> 10
                                 *                      live 10 -> 00
+                                *
+                                * Storage might be marked allocated or
+                                * staged and must be remarked staged or
+                                * free.
                                 */
                                switch (lmask & 3) {
                                case 0: /* 00 */
@@ -737,6 +753,11 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                                            ~((hammer2_bitmap_t)2 << scount);
                                        live->avail +=
                                                HAMMER2_FREEMAP_BLOCK_SIZE;
+                                       if (live->avail >
+                                           HAMMER2_FREEMAP_LEVEL0_SIZE) {
+                                               live->avail =
+                                                   HAMMER2_FREEMAP_LEVEL0_SIZE;
+                                       }
                                        cbinfo->adj_free +=
                                                HAMMER2_FREEMAP_BLOCK_SIZE;
                                        ++cbinfo->count_10_00;
@@ -749,28 +770,36 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                                        ++cbinfo->count_11_10;
                                        break;
                                }
-                       } else if ((lmask & 3) == 3) {
+                       } else if ((mmask & 3) == 3) {
                                /*
                                 * in-memory 11         live 10 -> 11
                                 *                      live ** -> 11
+                                *
+                                * Storage might be incorrectly marked free
+                                * or staged and must be remarked fully
+                                * allocated.
                                 */
                                switch (lmask & 3) {
                                case 0: /* 00 */
-                                       kprintf("hammer2_bulkfree: cannot "
-                                               "transition m=11/l=00\n");
+                                       ++cbinfo->count_00_11;
+                                       cbinfo->adj_free -=
+                                               HAMMER2_FREEMAP_BLOCK_SIZE;
+                                       live->avail -=
+                                               HAMMER2_FREEMAP_BLOCK_SIZE;
+                                       if ((int32_t)live->avail < 0)
+                                               live->avail = 0;
                                        break;
                                case 1: /* 01 */
-                                       kprintf("hammer2_bulkfree: cannot "
-                                               "transition m=11/l=01\n");
+                                       ++cbinfo->count_01_11;
                                        break;
                                case 2: /* 10 -> 11 */
-                                       live->bitmapq[bindex] |=
-                                               ((hammer2_bitmap_t)1 << scount);
                                        ++cbinfo->count_10_11;
                                        break;
                                case 3: /* 11 */
                                        break;
                                }
+                               live->bitmapq[bindex] |=
+                                       ((hammer2_bitmap_t)3 << scount);
                        }
                        mmask >>= 2;
                        lmask >>= 2;
@@ -786,7 +815,9 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                if (live->bitmapq[bindex] != 0)
                        break;
        }
-       if (bindex < 0) {
+       if (nofree) {
+               /* do nothing */
+       } else if (bindex < 0) {
                live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
                live->class = 0;
                live->linear = 0;
@@ -797,6 +828,16 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
                        live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
                        ++cbinfo->count_linadjusts;
                }
+
+               /*
+                * XXX this fine-grained measure still has some issues.
+                */
+               if (live->linear < bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
+                       live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
+                       ++cbinfo->count_linadjusts;
+               }
+       } else {
+               live->linear = HAMMER2_SEGSIZE;
        }
 
 #if 0
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
index 38fa0c6..5fde737 100644
@@ -1342,39 +1342,62 @@ hammer2_inode_chain_sync(hammer2_inode_t *ip)
 }
 
 /*
- * This handles unlinked open files after the vnode is finally dereferenced.
- * To avoid deadlocks it cannot be called from the normal vnode recycling
- * path, so we call it (1) after a unlink, rmdir, or rename, (2) on every
- * flush, and (3) on umount.
+ * The normal filesystem sync no longer has visibility to an inode structure
+ * after its vnode has been reclaimed.  In this situation an unlinked-but-open
+ * inode or a dirty inode may require additional processing to synchronize
+ * ip->meta to its underlying cluster nodes.
+ *
+ * In particular, reclaims can occur in almost any state (for example, when
+ * doing operations on unrelated vnodes) and flushing the reclaimed inode
+ * in the reclaim path itself is a non-starter.
  *
  * Caller must be in a transaction.
  */
 void
-hammer2_inode_run_unlinkq(hammer2_pfs_t *pmp)
+hammer2_inode_run_sideq(hammer2_pfs_t *pmp)
 {
        hammer2_xop_destroy_t *xop;
-       hammer2_inode_unlink_t *ipul;
+       hammer2_inode_sideq_t *ipul;
        hammer2_inode_t *ip;
        int error;
 
-       if (TAILQ_EMPTY(&pmp->unlinkq))
+       if (TAILQ_EMPTY(&pmp->sideq))
                return;
 
        LOCKSTART;
        hammer2_spin_ex(&pmp->list_spin);
-       while ((ipul = TAILQ_FIRST(&pmp->unlinkq)) != NULL) {
-               TAILQ_REMOVE(&pmp->unlinkq, ipul, entry);
-               hammer2_spin_unex(&pmp->list_spin);
+       while ((ipul = TAILQ_FIRST(&pmp->sideq)) != NULL) {
+               TAILQ_REMOVE(&pmp->sideq, ipul, entry);
                ip = ipul->ip;
+               KKASSERT(ip->flags & HAMMER2_INODE_ONSIDEQ);
+               atomic_clear_int(&ip->flags, HAMMER2_INODE_ONSIDEQ);
+               hammer2_spin_unex(&pmp->list_spin);
                kfree(ipul, pmp->minode);
 
                hammer2_inode_lock(ip, 0);
-               xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
-               hammer2_xop_start(&xop->head, hammer2_inode_xop_destroy);
-               error = hammer2_xop_collect(&xop->head, 0);
-               hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+               if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
+                       /*
+                        * The inode was unlinked while open, causing H2
+                        * to relink it to a hidden directory to allow
+                        * cluster operations to continue until close.
+                        *
+                        * The inode must be deleted and destroyed.
+                        */
+                       xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
+                       hammer2_xop_start(&xop->head,
+                                         hammer2_inode_xop_destroy);
+                       error = hammer2_xop_collect(&xop->head, 0);
+                       hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
 
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_ISDELETED);
+                       atomic_clear_int(&ip->flags, HAMMER2_INODE_ISDELETED);
+               } else {
+                       /*
+                        * The inode was dirty as-of the reclaim, requiring
+                        * synchronization of ip->meta with its underlying
+                        * chains.
+                        */
+                       hammer2_inode_chain_sync(ip);
+               }
 
                hammer2_inode_unlock(ip);
                hammer2_inode_drop(ip);                 /* ipul ref */
@@ -1458,7 +1481,7 @@ fail:
 /*
  * Inode delete helper (backend, threaded)
  *
- * Generally used by hammer2_run_unlinkq()
+ * Generally used by hammer2_run_sideq()
  */
 void
 hammer2_inode_xop_destroy(hammer2_xop_t *arg, int clindex)
diff --git a/sys/vfs/hammer2/hammer2_strategy.c b/sys/vfs/hammer2/hammer2_strategy.c
index 7ec8e31..b707d7a 100644
 /*
  * This module handles low level logical file I/O (strategy) which backs
  * the logical buffer cache.
+ *
+ * [De]compression, zero-block, check codes, and buffer cache operations
+ * for file data is handled here.
+ *
+ * Live dedup makes its home here as well.
  */
 
 #include <sys/param.h>
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index b29b0ae..a0ab286 100644
@@ -357,7 +357,7 @@ hammer2_pfsalloc(hammer2_chain_t *chain, const hammer2_inode_data_t *ripdata,
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                spin_init(&pmp->xop_spin, "h2xop");
                RB_INIT(&pmp->inum_tree);
-               TAILQ_INIT(&pmp->unlinkq);
+               TAILQ_INIT(&pmp->sideq);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");
 
                /*
@@ -1434,7 +1434,7 @@ again:
        /*
         * Cycle the volume data lock as a safety (probably not needed any
         * more).  To ensure everything is out we need to flush at least
-        * three times.  (1) The running of the unlinkq can dirty the
+        * three times.  (1) The running of the sideq can dirty the
         * filesystem, (2) A normal flush can dirty the freemap, and
         * (3) ensure that the freemap is fully synchronized.
         *
@@ -1991,7 +1991,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         */
        hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH |
                                HAMMER2_TRANS_PREFLUSH);
-       hammer2_inode_run_unlinkq(pmp);
+       hammer2_inode_run_sideq(pmp);
 
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c
index c9333a8..ffdb560 100644
@@ -105,7 +105,7 @@ hammer2_vop_inactive(struct vop_inactive_args *ap)
         * so we do not waste time trying to flush them.
         *
         * Note that deleting the file block chains under the inode chain
-        * would just be a waste of time.
+        * would just be a waste of energy, so don't do it.
         *
         * WARNING: nvtruncbuf() can only be safely called without the inode
         *          lock held due to the way our write thread works.
@@ -170,23 +170,37 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         * An unlinked inode may have been relinked to the ihidden directory.
         * This occurs if the inode was unlinked while open.  Reclamation of
         * these inodes requires processing we cannot safely do here so add
-        * the inode to the unlinkq in that situation.
+        * the inode to the sideq in that situation.
+        *
+        * A modified inode may require chain synchronization which will no
+        * longer be driven by a sync or fsync without the vnode, also use
+        * the sideq for that.
         *
         * A reclaim can occur at any time so we cannot safely start a
         * transaction to handle reclamation of unlinked files.  Instead,
         * the ip is left with a reference and placed on a linked list and
         * handled later on.
         */
-       if ((ip->flags & HAMMER2_INODE_ISUNLINKED) &&
+       if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
+                         HAMMER2_INODE_MODIFIED |
+                         HAMMER2_INODE_RESIZED)) &&
            (ip->flags & HAMMER2_INODE_ISDELETED) == 0) {
-               hammer2_inode_unlink_t *ipul;
+               hammer2_inode_sideq_t *ipul;
 
                ipul = kmalloc(sizeof(*ipul), pmp->minode, M_WAITOK | M_ZERO);
                ipul->ip = ip;
 
                hammer2_spin_ex(&pmp->list_spin);
-               TAILQ_INSERT_TAIL(&pmp->unlinkq, ipul, entry);
-               hammer2_spin_unex(&pmp->list_spin);
+               if ((ip->flags & HAMMER2_INODE_ONSIDEQ) == 0) {
+                       /* ref -> sideq */
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_ONSIDEQ);
+                       TAILQ_INSERT_TAIL(&pmp->sideq, ipul, entry);
+                       hammer2_spin_unex(&pmp->list_spin);
+               } else {
+                       hammer2_spin_unex(&pmp->list_spin);
+                       kfree(ipul, pmp->minode);
+                       hammer2_inode_drop(ip);         /* vp ref */
+               }
                /* retain ref from vp for ipul */
        } else {
                hammer2_inode_drop(ip);                 /* vp ref */
@@ -1604,7 +1618,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        }
 
-       hammer2_inode_run_unlinkq(dip->pmp);
+       hammer2_inode_run_sideq(dip->pmp);
        hammer2_trans_done(dip->pmp);
        if (error == 0)
                cache_unlink(ap->a_nch);
@@ -1664,7 +1678,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
        } else {
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        }
-       hammer2_inode_run_unlinkq(dip->pmp);
+       hammer2_inode_run_sideq(dip->pmp);
        hammer2_trans_done(dip->pmp);
        if (error == 0)
                cache_unlink(ap->a_nch);
@@ -1888,7 +1902,7 @@ done2:
        hammer2_inode_unlock(cdip);
        hammer2_inode_drop(ip);
        hammer2_inode_drop(cdip);
-       hammer2_inode_run_unlinkq(fdip->pmp);
+       hammer2_inode_run_sideq(fdip->pmp);
        hammer2_trans_done(tdip->pmp);
 
        /*