hammer2 - cleanup, write path work
author: Matthew Dillon <dillon@apollo.backplane.com>
Tue, 1 Oct 2013 05:48:20 +0000 (22:48 -0700)
committer: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 2 Oct 2013 00:01:25 +0000 (17:01 -0700)
* Code synchronous and asynchronous b*write() in the write path based on
  request flags, instead of just using bdwrite().

* Document a bunch of stuff.

* Remove dead code.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index beacd55..fa040cc 100644 (file)
@@ -137,7 +137,7 @@ typedef struct hammer2_chain_layer hammer2_chain_layer_t;
 struct hammer2_chain_core {
        int             good;
        struct ccms_cst cst;
-       struct h2_core_list ownerq;     /* chain's which own this core */
+       struct h2_core_list ownerq;     /* all chains sharing this core */
        struct h2_layer_list layerq;
        u_int           chain_count;    /* total chains in layers */
        u_int           sharecnt;
@@ -289,21 +289,57 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
  *
  * There is an in-memory representation of all on-media data structure.
- *
- * When accessed read-only the data will be mapped to the related buffer
- * cache buffer.
- *
- * When accessed read-write (marked modified) a kmalloc()'d copy of the
- * is created which can then be modified.  The copy is destroyed when a
- * filesystem block is allocated to replace it.
- *
- * Active inodes (those with vnodes attached) will maintain the kmalloc()'d
- * copy for both the read-only and the read-write case.  The combination of
- * (bp) and (data) determines whether (data) was allocated or not.
+ * Basically everything is represented by a hammer2_chain structure
+ * in-memory and other higher-level structures map to chains.
+ *
+ * A great deal of data is accessed simply via its buffer cache buffer,
+ * which is mapped for the duration of the chain's lock.  However, because
+ * chains may represent blocks smaller than the 16KB minimum we impose
+ * on buffer cache buffers, we cannot hold related buffer cache buffers
+ * locked for smaller blocks.  In these situations we kmalloc() a copy
+ * of the block.
+ *
+ * When modifications are made to a chain a new filesystem block must be
+ * allocated.  Multiple modifications do not necessarily allocate new
+ * blocks.  However, when a flush occurs a flush synchronization point
+ * is created and any new modifications made after this point will allocate
+ * a new block even if the chain is already in a modified state.
  *
  * The in-memory representation may remain cached (for example in order to
  * placemark clustering locks) even after the related data has been
  * detached.
+ *
+ *                             CORE SHARING
+ *
+ * In order to support concurrent flushes a flush synchronization point
+ * is created represented by a transaction id.  Among other things,
+ * operations may move filesystem objects from one part of the topology
+ * to another (for example, if you rename a file or when indirect blocks
+ * are created or destroyed, and a few other things).  When this occurs
+ * across a flush synchronization point the flusher needs to be able to
+ * recurse down BOTH the 'before' version of the topology and the 'after'
+ * version.
+ *
+ * To facilitate this modifications to chains do what is called a
+ * DELETE-DUPLICATE operation.  Chains are not actually moved in-memory.
+ * Instead the chain we wish to move is deleted and a new chain is created
+ * at the target location in the topology.  ANY SUBCHAINS PLACED UNDER THE
+ * CHAIN BEING MOVED HAVE TO EXIST IN BOTH PLACES.  To make this work
+ * all sub-chains are managed by the hammer2_chain_core structure.  This
+ * structure can be multi-homed, meaning that it can have more than one
+ * chain as its parent.  When a chain is delete-duplicated the chain's core
+ * becomes shared under both the old and new chain.
+ *
+ *                             STALE CHAINS
+ *
+ * When a chain is delete-duplicated the old chain typically becomes stale.
+ * This is detected via the HAMMER2_CHAIN_DUPLICATED flag in chain->flags.
+ * To avoid executing live filesystem operations on stale chains, the inode
+ * locking code will follow stale chains via core->ownerq until it finds
+ * the live chain.  The lock prevents ripups by other threads.  Lookups
+ * must properly order locking operations to prevent other threads from
+ * racing the lookup operation and will also follow stale chains when
+ * required.
  */
 
 RB_HEAD(hammer2_inode_tree, hammer2_inode);
index 98aa86f..1b41a2e 100644 (file)
@@ -674,13 +674,15 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
+                * Net is -0 or -1 ref depending.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
-                * If we were MODIFIED we inherit the ref from clearing
-                * that bit, otherwise we need another ref.
+                * Drop the ref from the MODIFIED bit we cleared and
+                * set a ref for the MOVED bit we are setting.  Net
+                * is +0 or +1 ref depending.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
index ea65067..fbf8ff0 100644 (file)
@@ -1481,9 +1481,10 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                        }
 
                        /*
-                        * Final drop of embedded freemap root chain to clean up
-                        * fchain.core (fchain structure is not flagged ALLOCATED
-                        * so it is cleaned out and then left to rot).
+                        * Final drop of embedded freemap root chain to
+                        * clean up fchain.core (fchain structure is not
+                        * flagged ALLOCATED so it is cleaned out and then
+                        * left to rot).
                         */
                        hammer2_chain_drop(&hmp->fchain);
 
@@ -1657,6 +1658,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 
        hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
 
+       /*
+        * vfsync the vnodes. XXX
+        */
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vmntvnodescan(mp, flags | VMSC_NOWAIT,
@@ -1681,6 +1685,15 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        for (i = 0; i < pmp->cluster.nchains; ++i) {
                hmp = pmp->cluster.chains[i]->hmp;
 
+               /*
+                * Media mounts have two 'roots', vchain for the topology
+                * and fchain for the free block table.  Flush both.
+                *
+                * Note that the topology and free block table are handled
+                * independently, so the free block table can wind up being
+                * ahead of the topology.  We depend on the bulk free scan
+                * code to deal with any loose ends.
+                */
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                          HAMMER2_CHAIN_SUBMODIFIED)) {
@@ -1688,19 +1701,13 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                }
                hammer2_chain_unlock(&hmp->vchain);
 
-#if 1
-               /*
-                * Rollup flush.  The fsyncs above basically just flushed
-                * data blocks.  The flush below gets all the meta-data.
-                */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
-                       /* this will modify vchain as a side effect */
+                       /* this will also modify vchain as a side effect */
                        hammer2_chain_flush(&info.trans, &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
-#endif
 
                error = 0;
 
index c53fd41..cd51082 100644 (file)
@@ -1019,10 +1019,10 @@ hammer2_write_file(hammer2_inode_t *ip,
        while (uio->uio_resid > 0) {
                hammer2_key_t lbase;
                int trivial;
+               int endofblk;
                int lblksize;
                int loff;
                int n;
-               int rem_size;
 
                /*
                 * Don't allow the buffer build to blow out the buffer
@@ -1041,13 +1041,6 @@ hammer2_write_file(hammer2_inode_t *ip,
                                                &lbase, NULL);
                loff = (int)(uio->uio_offset - lbase);
                
-               if (uio->uio_resid < lblksize) {
-                       rem_size = (int)uio->uio_resid;
-               }
-               else {
-                       rem_size = 0;
-               }
-               
                KKASSERT(lblksize <= 65536);
 
                /*
@@ -1060,8 +1053,11 @@ hammer2_write_file(hammer2_inode_t *ip,
                        n = uio->uio_resid;
                        if (loff == lbase && uio->uio_offset + n == new_eof)
                                trivial = 1;
-               } else if (loff == 0) {
-                       trivial = 1;
+                       endofblk = 0;
+               } else {
+                       if (loff == 0)
+                               trivial = 1;
+                       endofblk = 1;
                }
 
                /*
@@ -1117,9 +1113,21 @@ hammer2_write_file(hammer2_inode_t *ip,
                        brelse(bp);
                        break;
                }
-               bdwrite(bp);
-               if (error)
-                       break;
+
+               /*
+                * WARNING: Pageout daemon will issue UIO_NOCOPY writes
+                *          with IO_SYNC or IO_ASYNC set.  These writes
+                *          must be handled as the pageout daemon expects.
+                */
+               if (ap->a_ioflag & IO_SYNC) {
+                       bwrite(bp);
+               } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
+                       bawrite(bp);
+               } else if (ap->a_ioflag & IO_ASYNC) {
+                       bawrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
        }
 
        /*
@@ -1389,6 +1397,9 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
  * request, in bytes.
  *
  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
+ *
+ * Basically disabled, the logical buffer write thread has to deal with
+ * buffers one-at-a-time.
  */
 static
 int
@@ -1400,100 +1411,6 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
        if (ap->a_runb)
                *ap->a_runb = 0;
        return (EOPNOTSUPP);
-#if 0
-       struct vnode *vp;
-       hammer2_inode_t *ip;
-       hammer2_chain_t *parent;
-       hammer2_chain_t *chain;
-       hammer2_key_t key_next;
-       hammer2_key_t lbeg;
-       hammer2_key_t lend;
-       hammer2_off_t pbeg;
-       hammer2_off_t pbytes;
-       hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
-       int loff;
-       int ai;
-       int cache_index;
-
-       /*
-        * Only supported on regular files
-        *
-        * Only supported for read operations (required for cluster_read).
-        * The block allocation is delayed for write operations.
-        */
-       vp = ap->a_vp;
-       if (vp->v_type != VREG)
-               return (EOPNOTSUPP);
-       if (ap->a_cmd != BUF_CMD_READ)
-               return (EOPNOTSUPP);
-
-       ip = VTOI(vp);
-       bzero(array, sizeof(array));
-
-       /*
-        * Calculate logical range
-        */
-       KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
-       lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
-       lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
-       if (lend < lbeg)
-               lend = lbeg;
-       loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
-
-       parent = hammer2_inode_lock_sh(ip);
-       chain = hammer2_chain_lookup(&parent, &key_next,
-                                    lbeg, lend,
-                                    &cache_index,
-                                    HAMMER2_LOOKUP_NODATA |
-                                    HAMMER2_LOOKUP_SHARED);
-       if (chain == NULL) {
-               *ap->a_doffsetp = ZFOFFSET;
-               hammer2_inode_unlock_sh(ip, parent);
-               return (0);
-       }
-
-       while (chain) {
-               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
-                       ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
-                       KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
-                       array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
-                       array[ai][1] = chain->bytes;
-               }
-               chain = hammer2_chain_next(&parent, chain, &key_next,
-                                          key_next, lend,
-                                          &cache_index,
-                                          HAMMER2_LOOKUP_NODATA |
-                                          HAMMER2_LOOKUP_SHARED);
-       }
-       hammer2_inode_unlock_sh(ip, parent);
-
-       /*
-        * If the requested loffset is not mappable physically we can't
-        * bmap.  The caller will have to access the file data via a
-        * device buffer.
-        */
-       if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_MINIOSIZE) {
-               *ap->a_doffsetp = NOOFFSET;
-               return (0);
-       }
-
-       /*
-        * Calculate the physical disk offset range for array[0]
-        */
-       pbeg = array[0][0] + loff;
-       pbytes = array[0][1] - loff;
-
-       for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
-               if (array[ai][0] != pbeg + pbytes)
-                       break;
-               pbytes += array[ai][1];
-       }
-
-       *ap->a_doffsetp = pbeg;
-       if (ap->a_runp)
-               *ap->a_runp = pbytes;
-       return (0);
-#endif
 }
 
 static