struct hammer2_chain_core {
int good;
struct ccms_cst cst;
- struct h2_core_list ownerq; /* chain's which own this core */
+ struct h2_core_list ownerq; /* all chains sharing this core */
struct h2_layer_list layerq;
u_int chain_count; /* total chains in layers */
u_int sharecnt;
* HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
*
 * There is an in-memory representation of all on-media data structures.
- *
- * When accessed read-only the data will be mapped to the related buffer
- * cache buffer.
- *
- * When accessed read-write (marked modified) a kmalloc()'d copy of the
- * is created which can then be modified. The copy is destroyed when a
- * filesystem block is allocated to replace it.
- *
- * Active inodes (those with vnodes attached) will maintain the kmalloc()'d
- * copy for both the read-only and the read-write case. The combination of
- * (bp) and (data) determines whether (data) was allocated or not.
+ * Basically everything is represented by a hammer2_chain structure
+ * in-memory; other higher-level structures map to chains.
+ *
+ * A great deal of data is accessed simply via its buffer cache buffer,
+ * which is mapped for the duration of the chain's lock. However, because
+ * chains may represent blocks smaller than the 16KB minimum we impose
+ * on buffer cache buffers, we cannot hold related buffer cache buffers
+ * locked for smaller blocks. In these situations we kmalloc() a copy
+ * of the block.
+ *
+ * When a chain is first modified a new filesystem block must be
+ * allocated. Further modifications reuse that block and do not
+ * necessarily allocate new ones. However, when a flush occurs a flush
+ * synchronization point is created, and any new modification made after
+ * this point allocates a new block even if the chain is already in a
+ * modified state (see the sketch following this comment).
*
* The in-memory representation may remain cached (for example in order to
* placemark clustering locks) even after the related data has been
* detached.
+ *
+ * CORE SHARING
+ *
+ * In order to support concurrent flushes, a flush synchronization
+ * point, represented by a transaction id, is created. Operations may
+ * move filesystem objects from one part of the topology to another
+ * (for example, when a file is renamed or when indirect blocks are
+ * created or destroyed). When such a move occurs across a flush
+ * synchronization point the flusher needs to be able to recurse down
+ * BOTH the 'before' version of the topology and the 'after' version.
+ *
+ * To facilitate this, modifications to chains do what is called a
+ * DELETE-DUPLICATE operation. Chains are not actually moved in-memory.
+ * Instead, the chain we wish to move is deleted and a new chain is
+ * created at the target location in the topology. ANY SUBCHAINS PLACED
+ * UNDER THE CHAIN BEING MOVED HAVE TO EXIST IN BOTH PLACES. To make
+ * this work all sub-chains are managed by the hammer2_chain_core
+ * structure. This structure can be multi-homed, meaning that it can
+ * have more than one chain as its parent. When a chain is
+ * delete-duplicated the chain's core becomes shared under both the old
+ * and the new chain.
+ *
+ * STALE CHAINS
+ *
+ * When a chain is delete-duplicated the old chain typically becomes stale.
+ * This is detected via the HAMMER2_CHAIN_DUPLICATED flag in chain->flags.
+ * To avoid executing live filesystem operations on stale chains, the
+ * inode locking code follows stale chains via core->ownerq until it
+ * finds the live chain (sketched after this comment). The lock
+ * prevents ripups by other threads. Lookups must properly order their
+ * locking operations to prevent other threads from racing the lookup,
+ * and will also follow stale chains when required.
*/
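
/*
 * Illustrative sketch only, not part of this change: the flush
 * synchronization rule described in the comment above.  The field
 * names modify_tid and sync_tid are assumptions for illustration;
 * the real test lives in the chain modify path and is more involved.
 */
static int
chain_must_allocate_new_block(hammer2_chain_t *chain, hammer2_trans_t *trans)
{
	/* first modification always allocates a new block */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		return (1);
	/*
	 * Already modified, but last modified prior to the most recent
	 * flush synchronization point: allocate again so the flusher
	 * can still see the pre-flush-point block.
	 */
	if (chain->modify_tid < trans->sync_tid)
		return (1);
	return (0);
}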
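
/*
 * Illustrative sketch only, not part of this change: following a
 * shared core's ownerq to locate the live chain, per the STALE
 * CHAINS section above.  The helper and the list linkage name
 * (core_entry) are hypothetical; the real inode locking code also
 * handles lock ordering and re-checks after blocking.
 */
static hammer2_chain_t *
find_live_chain(hammer2_chain_core_t *core)
{
	hammer2_chain_t *chain;

	TAILQ_FOREACH(chain, &core->ownerq, core_entry) {
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
			return (chain);		/* live chain */
	}
	return (NULL);				/* all owners are stale */
}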
RB_HEAD(hammer2_inode_tree, hammer2_inode);
chain == &hmp->fchain) {
/*
* Drop the ref from the MODIFIED bit we cleared.
+ * Net is -0 or -1 ref depending on whether the
+ * chain was previously modified.
*/
if (wasmodified)
hammer2_chain_drop(chain);
} else {
/*
- * If we were MODIFIED we inherit the ref from clearing
- * that bit, otherwise we need another ref.
+ * Drop the ref from the MODIFIED bit we cleared and
+ * set a ref for the MOVED bit we are setting. Net
+ * is +0 or +1 ref depending on whether the chain
+ * was previously modified.
*/
if (wasmodified == 0)
hammer2_chain_ref(chain);
}
/*
- * Final drop of embedded freemap root chain to clean up
- * fchain.core (fchain structure is not flagged ALLOCATED
- * so it is cleaned out and then left to rot).
+ * Final drop of embedded freemap root chain to
+ * clean up fchain.core (fchain structure is not
+ * flagged ALLOCATED so it is cleaned out and then
+ * left to rot).
*/
hammer2_chain_drop(&hmp->fchain);
hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
+ /*
+ * vfsync the vnodes. XXX
+ */
info.error = 0;
info.waitfor = MNT_NOWAIT;
vmntvnodescan(mp, flags | VMSC_NOWAIT,
for (i = 0; i < pmp->cluster.nchains; ++i) {
hmp = pmp->cluster.chains[i]->hmp;
+ /*
+ * Media mounts have two 'roots', vchain for the topology
+ * and fchain for the free block table. Flush both.
+ *
+ * Note that the topology and free block table are handled
+ * independently, so the free block table can wind up being
+ * ahead of the topology. We depend on the bulk free scan
+ * code to deal with any loose ends.
+ */
hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
HAMMER2_CHAIN_SUBMODIFIED)) {
}
hammer2_chain_unlock(&hmp->vchain);
-#if 1
- /*
- * Rollup flush. The fsyncs above basically just flushed
- * data blocks. The flush below gets all the meta-data.
- */
hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
HAMMER2_CHAIN_SUBMODIFIED)) {
- /* this will modify vchain as a side effect */
+ /* this will also modify vchain as a side effect */
hammer2_chain_flush(&info.trans, &hmp->fchain);
}
hammer2_chain_unlock(&hmp->fchain);
-#endif
error = 0;
while (uio->uio_resid > 0) {
hammer2_key_t lbase;
int trivial;
+ int endofblk;
int lblksize;
int loff;
int n;
- int rem_size;
/*
* Don't allow the buffer build to blow out the buffer
&lbase, NULL);
loff = (int)(uio->uio_offset - lbase);
- if (uio->uio_resid < lblksize) {
- rem_size = (int)uio->uio_resid;
- }
- else {
- rem_size = 0;
- }
-
KKASSERT(lblksize <= 65536);
/*
n = uio->uio_resid;
if (loff == lbase && uio->uio_offset + n == new_eof)
trivial = 1;
- } else if (loff == 0) {
- trivial = 1;
+ endofblk = 0;
+ } else {
+ if (loff == 0)
+ trivial = 1;
+ endofblk = 1;
}
/*
brelse(bp);
break;
}
- bdwrite(bp);
- if (error)
- break;
+
+ /*
+ * WARNING: Pageout daemon will issue UIO_NOCOPY writes
+ * with IO_SYNC or IO_ASYNC set. These writes
+ * must be handled as the pageout daemon expects.
+ */
+ if (ap->a_ioflag & IO_SYNC) {
+ bwrite(bp);
+ } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
+ bawrite(bp);
+ } else if (ap->a_ioflag & IO_ASYNC) {
+ bawrite(bp);
+ } else {
+ bdwrite(bp);
+ }
}
/*
* request, in bytes.
*
* (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
+ *
+ * Basically disabled; the logical buffer write thread has to deal
+ * with buffers one at a time.
*/
static
int
if (ap->a_runb)
*ap->a_runb = 0;
return (EOPNOTSUPP);
-#if 0
- struct vnode *vp;
- hammer2_inode_t *ip;
- hammer2_chain_t *parent;
- hammer2_chain_t *chain;
- hammer2_key_t key_next;
- hammer2_key_t lbeg;
- hammer2_key_t lend;
- hammer2_off_t pbeg;
- hammer2_off_t pbytes;
- hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
- int loff;
- int ai;
- int cache_index;
-
- /*
- * Only supported on regular files
- *
- * Only supported for read operations (required for cluster_read).
- * The block allocation is delayed for write operations.
- */
- vp = ap->a_vp;
- if (vp->v_type != VREG)
- return (EOPNOTSUPP);
- if (ap->a_cmd != BUF_CMD_READ)
- return (EOPNOTSUPP);
-
- ip = VTOI(vp);
- bzero(array, sizeof(array));
-
- /*
- * Calculate logical range
- */
- KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
- lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
- lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
- if (lend < lbeg)
- lend = lbeg;
- loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
-
- parent = hammer2_inode_lock_sh(ip);
- chain = hammer2_chain_lookup(&parent, &key_next,
- lbeg, lend,
- &cache_index,
- HAMMER2_LOOKUP_NODATA |
- HAMMER2_LOOKUP_SHARED);
- if (chain == NULL) {
- *ap->a_doffsetp = ZFOFFSET;
- hammer2_inode_unlock_sh(ip, parent);
- return (0);
- }
-
- while (chain) {
- if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
- ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
- KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
- array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
- array[ai][1] = chain->bytes;
- }
- chain = hammer2_chain_next(&parent, chain, &key_next,
- key_next, lend,
- &cache_index,
- HAMMER2_LOOKUP_NODATA |
- HAMMER2_LOOKUP_SHARED);
- }
- hammer2_inode_unlock_sh(ip, parent);
-
- /*
- * If the requested loffset is not mappable physically we can't
- * bmap. The caller will have to access the file data via a
- * device buffer.
- */
- if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_MINIOSIZE) {
- *ap->a_doffsetp = NOOFFSET;
- return (0);
- }
-
- /*
- * Calculate the physical disk offset range for array[0]
- */
- pbeg = array[0][0] + loff;
- pbytes = array[0][1] - loff;
-
- for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
- if (array[ai][0] != pbeg + pbytes)
- break;
- pbytes += array[ai][1];
- }
-
- *ap->a_doffsetp = pbeg;
- if (ap->a_runp)
- *ap->a_runp = pbytes;
- return (0);
-#endif
}
static