From d001f460bbf1795b8f37fce7ccc3243202a5d5f6 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 3 May 2013 21:31:31 -0700 Subject: [PATCH] hammer2 - flush sequencing part 1 * Rip out the jerry-rigged flush sequencer and start work on a real one. * Sync and fsync calls create synchronization points and will be serialized against each other. * Modifying operations occurring before a synchronization point will stall modifying operations occurring after the synchronization point until they complete. This will need to be optimized. * However, the synchronization points are coded such that modifying operations occurring after a synchronization point will be able to run concurrently with the disk flush related to that synchronization point. So if there is only one synchronization point (sync or fsync or background sync) active, modifying operations will generally not stall. At least not for very long. --- sys/vfs/hammer2/hammer2.h | 58 ++++++++++++--- sys/vfs/hammer2/hammer2_chain.c | 50 +++++-------- sys/vfs/hammer2/hammer2_flush.c | 121 +++++++++++++++++++++---------- sys/vfs/hammer2/hammer2_ioctl.c | 4 +- sys/vfs/hammer2/hammer2_vfsops.c | 23 +++++- sys/vfs/hammer2/hammer2_vnops.c | 42 +++++++---- 6 files changed, 197 insertions(+), 101 deletions(-) diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 3775e545d2..213e4ff127 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -277,21 +277,56 @@ RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp, hammer2_tid_t); /* - * A hammer2 transaction placeholder. + * A hammer2 transaction and flush sequencing structure. * - * This structure is required for all modifying operations, including - * flushes. It holds the transaction id allocated for the modifying - * operation and is also used to interlock flushes and snapshots. + * This global structure is tied into hammer2_mount and is used + * to sequence modifying operations and flushes. + * + * (a) Any modifying operations with sync_tid >= flush_tid will stall until + * all modifying operating with sync_tid < flush_tid complete. + * + * The flush related to flush_tid stalls until all modifying operations + * with sync_tid < flush_tid complete. + * + * (b) Once unstalled, modifying operations with sync_tid > flush_tid are + * allowed to run. All modifications cause modify/duplicate operations + * to occur on the related chains. Note that most INDIRECT blocks will + * be unaffected because the modifications just overload the RBTREE + * structurally instead of actually modifying the indirect blocks. + * + * (c) The actual flush unstalls and RUNS CONCURRENTLY with (b), but only + * utilizes the chain structures with sync_tid <= flush_tid. The + * flush will modify related indirect blocks and inodes in-place + * (rather than duplicate) since the adjustments are compatible with + * (b)'s RBTREE overloading + * + * SPECIAL NOTE: Inode modifications have to also propagate along any + * modify/duplicate chains. File writes detect the flush + * and force out the conflicting buffer cache buffer(s) + * before reusing them. + * + * TODO: Flush merging. When fsync() is called on multiple discrete files + * concurrently there is no reason to stall the second fsync. + * The final flush that reaches to root can cover both fsync()s. + * + * The chains typically terminate as they fly onto the disk. The flush + * ultimately reaches the volume header. */ struct hammer2_trans { + TAILQ_ENTRY(hammer2_trans) entry; struct hammer2_mount *hmp; hammer2_tid_t sync_tid; + thread_t td; + int flags; + int blocked; uint8_t inodes_created; uint8_t dummy[7]; }; typedef struct hammer2_trans hammer2_trans_t; +#define HAMMER2_TRANS_ISFLUSH 0x0001 + /* * XXX */ @@ -305,6 +340,8 @@ typedef struct hammer2_freecache hammer2_freecache_t; /* * Global (per device) mount structure for device (aka vp->v_mount->hmp) */ +TAILQ_HEAD(hammer2_trans_queue, hammer2_trans); + struct hammer2_mount { struct vnode *devvp; /* device vnode */ int ronly; /* read-only mount */ @@ -323,9 +360,10 @@ struct hammer2_mount { hammer2_inode_t *sroot; /* super-root inode */ struct lock alloclk; /* lockmgr lock */ struct lock voldatalk; /* lockmgr lock */ - hammer2_tid_t flush_tid; /* (voldata locked, flush running) */ - thread_t flush_td; /* vfs_sync cycle owner */ - int flush_wait; + struct hammer2_trans_queue transq; /* all in-progress transactions */ + hammer2_trans_t *curflush; /* current flush in progress */ + int flushcnt; /* #of flush trans on the list */ + int volhdrno; /* last volhdrno written */ hammer2_volume_data_t voldata; hammer2_volume_data_t volsync; /* synchronized voldata */ @@ -532,10 +570,8 @@ void hammer2_chain_parent_setsubmod(hammer2_chain_t *chain); /* * hammer2_trans.c */ -void hammer2_trans_init_flush(hammer2_mount_t *hmp, hammer2_trans_t *trans, - int master); -void hammer2_trans_done_flush(hammer2_trans_t *trans, int master); -void hammer2_trans_init(hammer2_mount_t *hmp, hammer2_trans_t *trans); +void hammer2_trans_init(hammer2_mount_t *hmp, hammer2_trans_t *trans, + int flags); void hammer2_trans_done(hammer2_trans_t *trans); /* diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index 80e7ae490b..b8cecc109d 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -1822,36 +1822,26 @@ again2: /* * Create and return a new hammer2 system memory structure of the specified - * key, type and size and insert it RELATIVE TO (PARENT). - * - * (parent) is typically either an inode or an indirect block, acquired - * acquired as a side effect of issuing a prior failed lookup. parent - * must be locked and held. Do not pass the inode chain to this function - * unless that is the chain returned by the failed lookup. - * - * (*chainp) is either NULL, a newly allocated chain, or a chain allocated - * via hammer2_chain_duplicate(). When not NULL, the passed-in chain must - * NOT be attached to any parent, and will be attached by this function. - * This mechanic is used by the rename code. - * - * Non-indirect types will automatically allocate indirect blocks as required - * if the new item does not fit in the current (parent). - * - * Indirect types will move a portion of the existing blockref array in - * (parent) into the new indirect type and then use one of the free slots - * to emplace the new indirect type. - * - * A new locked chain element is returned of the specified type. The - * element may or may not have a data area associated with it: - * - * VOLUME not allowed here - * INODE kmalloc()'d data area is set up - * INDIRECT not allowed here - * DATA no data area will be set-up (caller is expected - * to have logical buffers, we don't want to alias - * the data onto device buffers!). - * - * Requires an exclusively locked parent. + * key, type and size and insert it under (*parentp). This is a full + * insertion, based on the supplied key/keybits, and may involve creating + * indirect blocks and moving other chains around via delete/duplicate. + * + * (*parentp) must be exclusive locked and may be replaced on return + * depending on how much work the function had to do. + * + * (*chainp) usually starts out NULL and returns the newly created chain, + * but if the caller desires the caller may allocate a disconnected chain + * and pass it in instead. (It is also possible for the caller to use + * chain_duplicate() to create a disconnected chain, manipulate it, then + * pass it into this function to insert it). + * + * This function should NOT be used to insert INDIRECT blocks. It is + * typically used to create/insert inodes and data blocks. + * + * Caller must pass-in an exclusively locked parent the new chain is to + * be inserted under, and optionally pass-in a disconnected, exclusively + * locked chain to insert (else we create a new chain). The function will + * adjust (*parentp) as necessary and return the existing or new chain. */ int hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index 6a040a1cad..34c0b841fa 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -74,6 +74,14 @@ static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data); * don't bother marking the volume header MODIFIED. Instead, the volume * will be synchronized at a later time as part of a larger flush sequence. * + * Non-flush transactions can typically run concurrently. However if + * there are non-flush transaction both before AND after a flush trans, + * the transactions after stall until the ones before finish. + * + * Non-flush transactions occuring after a flush pointer can run concurrently + * with that flush. They only have to wait for transactions prior to the + * flush trans to complete before they unstall. + * * WARNING! Modifications to the root volume cannot dup the root volume * header to handle synchronization points, so alloc_tid can * wind up (harmlessly) more advanced on flush. @@ -83,70 +91,103 @@ static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data); * collisions (which key off of delete_tid). */ void -hammer2_trans_init(hammer2_mount_t *hmp, hammer2_trans_t *trans) -{ - bzero(trans, sizeof(*trans)); - trans->hmp = hmp; - hammer2_voldata_lock(hmp); - trans->sync_tid = hmp->voldata.alloc_tid++; - hammer2_voldata_unlock(hmp, 0); -} - -void -hammer2_trans_init_flush(hammer2_mount_t *hmp, hammer2_trans_t *trans, - int master) +hammer2_trans_init(hammer2_mount_t *hmp, hammer2_trans_t *trans, int flags) { - thread_t td = curthread; + hammer2_trans_t *scan; bzero(trans, sizeof(*trans)); trans->hmp = hmp; hammer2_voldata_lock(hmp); - if (master) { + trans->sync_tid = hmp->voldata.alloc_tid++; + trans->flags = flags; + trans->td = curthread; + TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); + + if (flags & HAMMER2_TRANS_ISFLUSH) { /* - * New master flush (sync). + * If we are a flush we have to wait for all transactions + * prior to our flush synchronization point to complete + * before we can start our flush. */ - while (hmp->flush_td) { - hmp->flush_wait = 1; - lksleep(&hmp->flush_wait, &hmp->voldatalk, - 0, "h2sync", hz); + ++hmp->flushcnt; + if (hmp->curflush == NULL) + hmp->curflush = trans; + while (TAILQ_FIRST(&hmp->transq) != trans) { + lksleep(&trans->sync_tid, &hmp->voldatalk, + 0, "h2syncw", hz); } - hmp->flush_td = td; - hmp->flush_tid = hmp->voldata.alloc_tid++; - trans->sync_tid = hmp->flush_tid; - } else if (hmp->flush_td == td) { + /* - * Part of a running master flush (sync->fsync) + * Once we become the running flush we can wakeup anyone + * who blocked on us. */ - trans->sync_tid = hmp->flush_tid; - KKASSERT(trans->sync_tid != 0); + scan = trans; + while ((scan = TAILQ_NEXT(scan, entry)) != NULL) { + if (scan->flags & HAMMER2_TRANS_ISFLUSH) + break; + if (scan->blocked == 0) + break; + scan->blocked = 0; + wakeup(&scan->blocked); + } } else { /* - * Independent flush request, make sure the sync_tid - * covers all modifications made to date. + * If we are not a flush but our sync_tid is after a + * stalled flush, we have to wait until that flush unstalls + * (that is, all transactions prior to that flush complete), + * but then we can run concurrently with that flush. + * + * (flushcnt check only good as pre-condition, otherwise it + * may represent elements queued after us after we block). */ - trans->sync_tid = hmp->voldata.alloc_tid++; + if (hmp->flushcnt > 1 || + (hmp->curflush && + TAILQ_FIRST(&hmp->transq) != hmp->curflush)) { + trans->blocked = 1; + while (trans->blocked) { + lksleep(&trans->blocked, &hmp->voldatalk, + 0, "h2trans", hz); + } + } } hammer2_voldata_unlock(hmp, 0); } void hammer2_trans_done(hammer2_trans_t *trans) -{ - trans->hmp = NULL; -} - -void -hammer2_trans_done_flush(hammer2_trans_t *trans, int master) { hammer2_mount_t *hmp = trans->hmp; + hammer2_trans_t *scan; hammer2_voldata_lock(hmp); - if (master) { - hmp->flush_td = NULL; - if (hmp->flush_wait) { - hmp->flush_wait = 0; - wakeup(&hmp->flush_wait); + TAILQ_REMOVE(&hmp->transq, trans, entry); + if (trans->flags & HAMMER2_TRANS_ISFLUSH) { + /* + * If we were a flush we have to adjust curflush to the + * next flush. + */ + --hmp->flushcnt; + if (hmp->flushcnt) { + TAILQ_FOREACH(scan, &hmp->transq, entry) { + if (scan->flags & HAMMER2_TRANS_ISFLUSH) + break; + } + hmp->curflush = scan; + } else { + hmp->curflush = NULL; + } + } else { + /* + * If we are not a flush but a flush is now at the head + * of the queue and we were previously blocking it, + * we can now unblock it. + */ + if (hmp->flushcnt && + (scan = TAILQ_FIRST(&hmp->transq)) != NULL && + trans->sync_tid < scan->sync_tid && + (scan->flags & HAMMER2_TRANS_ISFLUSH)) { + wakeup(&scan->sync_tid); } } hammer2_voldata_unlock(hmp, 0); diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c index a8c1c77265..1c74998b8e 100644 --- a/sys/vfs/hammer2/hammer2_ioctl.c +++ b/sys/vfs/hammer2/hammer2_ioctl.c @@ -458,7 +458,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data) pfs->name[sizeof(pfs->name) - 1] = 0; /* ensure 0-termination */ - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL, pfs->name, strlen(pfs->name), &error); @@ -484,7 +484,7 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data) hammer2_trans_t trans; int error; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); error = hammer2_unlink_file(&trans, hmp->sroot, pfs->name, strlen(pfs->name), 0, NULL); diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index 44fa6e8dbe..b4ac832fa7 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -388,6 +388,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, */ lockinit(&hmp->alloclk, "h2alloc", 0, 0); lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE); + TAILQ_INIT(&hmp->transq); /* * vchain setup. vchain.data is special cased to NULL. @@ -817,7 +818,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) if (waitfor & MNT_LAZY) flags |= VMSC_ONEPASS; - hammer2_trans_init_flush(hmp, &info.trans, 1); + hammer2_trans_init(hmp, &info.trans, HAMMER2_TRANS_ISFLUSH); info.error = 0; info.waitfor = MNT_NOWAIT; vmntvnodescan(mp, flags | VMSC_NOWAIT, @@ -906,7 +907,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) bawrite(bp); hmp->volhdrno = i; } - hammer2_trans_done_flush(&info.trans, 1); + hammer2_trans_done(&info.trans); return (error); } @@ -946,7 +947,25 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data) RB_EMPTY(&vp->v_rbdirty_tree))) { return(0); } + + /* + * VOP_FSYNC will start a new transaction so replicate some code + * here to do it inline (see hammer2_vop_fsync()). + */ + hammer2_inode_lock_ex(ip); + if (ip->vp) + vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL); + if (ip->flags & HAMMER2_INODE_DIRTYEMBED) { + atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED); + atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); + hammer2_chain_modify(&info->trans, ip->chain, 0); + } + hammer2_chain_flush(&info->trans, ip->chain); + hammer2_inode_unlock_ex(ip); + error = 0; +#if 0 error = VOP_FSYNC(vp, MNT_NOWAIT, 0); +#endif if (error) info->error = error; return(0); diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c index 0686d844f1..3c4577e0ba 100644 --- a/sys/vfs/hammer2/hammer2_vnops.c +++ b/sys/vfs/hammer2/hammer2_vnops.c @@ -85,9 +85,9 @@ int hammer2_vop_inactive(struct vop_inactive_args *ap) { hammer2_inode_t *ip; - hammer2_trans_t trans; struct vnode *vp; #if 0 + hammer2_trans_t trans; struct hammer2_mount *hmp; #endif @@ -109,13 +109,16 @@ hammer2_vop_inactive(struct vop_inactive_args *ap) */ hammer2_inode_lock_ex(ip); KKASSERT(ip->chain); +#if 0 + /* XXX lock order reversal on inode/trans */ if (ip->flags & HAMMER2_INODE_DIRTYEMBED) { atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED); atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); - hammer2_trans_init(ip->hmp, &trans); + hammer2_trans_init(ip->hmp, &trans, 0); hammer2_chain_modify(&trans, ip->chain, 0); hammer2_trans_done(&trans); } +#endif /* * Check for deleted inodes and recycle immediately. @@ -200,9 +203,9 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap) if (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_DELETED | HAMMER2_CHAIN_SUBMODIFIED)) { - hammer2_trans_init_flush(ip->hmp, &trans, 0); + hammer2_trans_init(ip->hmp, &trans, HAMMER2_TRANS_ISFLUSH); hammer2_chain_flush(&trans, chain); - hammer2_trans_done_flush(&trans, 0); + hammer2_trans_done(&trans); } #endif if (ip->refs > 2) /* (our lock + vp ref) */ @@ -235,7 +238,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap) ip = VTOI(vp); hmp = ip->hmp; - hammer2_trans_init_flush(hmp, &trans, 0); + hammer2_trans_init(hmp, &trans, HAMMER2_TRANS_ISFLUSH); hammer2_inode_lock_ex(ip); vfsync(vp, ap->a_waitfor, 1, NULL, NULL); @@ -264,7 +267,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap) hammer2_chain_flush(&trans, ip->chain); } hammer2_inode_unlock_ex(ip); - hammer2_trans_done_flush(&trans, 0); + hammer2_trans_done(&trans); return (0); } @@ -361,7 +364,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap) if (hmp->ronly) return(EROFS); - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); hammer2_inode_lock_ex(ip); ipdata = &ip->chain->data->ipdata; error = 0; @@ -766,8 +769,8 @@ hammer2_vop_write(struct vop_write_args *ap) * ip must be marked modified, particularly because the write * might wind up being copied into the embedded data area. */ + hammer2_trans_init(ip->hmp, &trans, 0); hammer2_inode_lock_ex(ip); - hammer2_trans_init(ip->hmp, &trans); error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount); hammer2_inode_unlock_ex(ip); hammer2_trans_done(&trans); @@ -1558,7 +1561,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap) kprintf("hammer2: need to unconsolidate hardlink for %s\n", chain->data->ipdata.filename); /* XXX retain shared lock on dip? (currently not held) */ - hammer2_trans_init(dip->hmp, &trans); + hammer2_trans_init(dip->hmp, &trans, 0); hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain); hammer2_trans_done(&trans); } @@ -1655,7 +1658,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred, name, name_len, &error); if (error) { @@ -1837,7 +1840,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap) ncp = ap->a_nch->ncp; name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); /* * ip represents the file being hardlinked. The file could be a @@ -1908,7 +1911,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap) ncp = ap->a_nch->ncp; name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred, name, name_len, &error); @@ -1952,7 +1955,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap) ncp = ap->a_nch->ncp; name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); ap->a_vap->va_type = VLNK; /* enforce type */ @@ -2038,7 +2041,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap) ncp = ap->a_nch->ncp; name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL); hammer2_trans_done(&trans); if (error == 0) { @@ -2071,7 +2074,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL); hammer2_trans_done(&trans); if (error == 0) { @@ -2122,7 +2125,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap) tname = tncp->nc_name; tname_len = tncp->nc_nlen; - hammer2_trans_init(hmp, &trans); + hammer2_trans_init(hmp, &trans, 0); /* * ip is the inode being renamed. If this is a hardlink then @@ -2210,6 +2213,13 @@ done: return (error); } +/* + * Strategy code + * + * WARNING: The strategy code cannot safely use hammer2 transactions + * as this can deadlock against vfs_sync's vfsync() call + * if multiple flushes are queued. + */ static int hammer2_strategy_read(struct vop_strategy_args *ap); static int hammer2_strategy_write(struct vop_strategy_args *ap); -- 2.41.0