From b37f18d6eedb01b25a3ddf16efed48064dd9074a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 23 Aug 2010 21:50:29 -0700 Subject: [PATCH 01/16] kernel - Add additional fields to kinfo_cputime * Add a message field and address to allow the kernel to report contention points on the cpus to userland. * Enhance the mplock and token subsystems to record contention points. * Enhance the scheduler to record contention information in the per-cpu cpu_time structure. --- .../linux/i386/linprocfs/linprocfs_subr.c | 2 +- sys/emulation/ndis/kern_ndis.c | 2 +- sys/emulation/ndis/subr_ntoskrnl.c | 2 +- sys/kern/kern_mplock.c | 2 + sys/kern/lwkt_thread.c | 26 +++++++++++++++++-- sys/kern/lwkt_token.c | 15 +++++++---- sys/kern/subr_disk.c | 2 +- sys/kern/subr_kobj.c | 2 +- sys/kern/subr_rman.c | 4 +- sys/kern/sys_pipe.c | 4 +- sys/kern/vfs_lock.c | 2 +- sys/kern/vfs_mount.c | 8 +++--- sys/kern/vfs_subr.c | 2 +- sys/kern/vfs_sync.c | 2 +- sys/sys/kinfo.h | 5 ++++ sys/sys/thread.h | 11 +++++--- sys/vfs/gnu/ext2fs/ext2_ihash.c | 2 +- sys/vfs/hpfs/hpfs_hash.c | 2 +- sys/vfs/hpfs/hpfs_vfsops.c | 2 +- sys/vfs/isofs/cd9660/cd9660_node.c | 2 +- sys/vfs/msdosfs/msdosfs_denode.c | 2 +- sys/vfs/ntfs/ntfs_ihash.c | 2 +- sys/vfs/udf/udf_vfsops.c | 2 +- sys/vfs/ufs/ufs_ihash.c | 2 +- sys/vm/vm_vmspace.c | 2 +- sys/vm/vm_zeroidle.c | 1 - 26 files changed, 71 insertions(+), 39 deletions(-) diff --git a/sys/emulation/linux/i386/linprocfs/linprocfs_subr.c b/sys/emulation/linux/i386/linprocfs/linprocfs_subr.c index 8108b59..48a0f22 100644 --- a/sys/emulation/linux/i386/linprocfs/linprocfs_subr.c +++ b/sys/emulation/linux/i386/linprocfs/linprocfs_subr.c @@ -403,7 +403,7 @@ vfs_findname(vfs_namemap_t *nm, char *buf, int buflen) void linprocfs_init(void) { - lwkt_token_init(&pfs_token, 1); + lwkt_token_init(&pfs_token, 1, "linprocfs"); } void diff --git a/sys/emulation/ndis/kern_ndis.c b/sys/emulation/ndis/kern_ndis.c index 2f9efdd..7778125 100644 --- a/sys/emulation/ndis/kern_ndis.c +++ 
b/sys/emulation/ndis/kern_ndis.c @@ -246,7 +246,7 @@ ndis_create_kthreads(void) struct ndis_req *r; int i, error = 0; - lwkt_token_init(&ndis_thr_token, 1); + lwkt_token_init(&ndis_thr_token, 1, "ndis"); STAILQ_INIT(&ndis_ttodo); STAILQ_INIT(&ndis_itodo); diff --git a/sys/emulation/ndis/subr_ntoskrnl.c b/sys/emulation/ndis/subr_ntoskrnl.c index 2889a11..8c69c14 100644 --- a/sys/emulation/ndis/subr_ntoskrnl.c +++ b/sys/emulation/ndis/subr_ntoskrnl.c @@ -184,7 +184,7 @@ static MALLOC_DEFINE(M_NDIS, "ndis", "ndis emulation"); int ntoskrnl_libinit(void) { - lwkt_token_init(&ntoskrnl_dispatchtoken, 1); + lwkt_token_init(&ntoskrnl_dispatchtoken, 1, "ndiskrnl"); ntoskrnl_init_lock(&ntoskrnl_global); TAILQ_INIT(&ntoskrnl_reflist); return(0); diff --git a/sys/kern/kern_mplock.c b/sys/kern/kern_mplock.c index 14e2a5a..12c2705 100644 --- a/sys/kern/kern_mplock.c +++ b/sys/kern/kern_mplock.c @@ -109,6 +109,7 @@ _get_mplock_contested(const char *file, int line) globaldata_t gd = mycpu; int ov; int nv; + const void **stkframe = (const void **)&file; ++mplock_contention_count; for (;;) { @@ -120,6 +121,7 @@ _get_mplock_contested(const char *file, int line) if (atomic_cmpset_int(&mp_lock, ov, gd->gd_cpuid)) break; } else { + gd->gd_curthread->td_mplock_stallpc = stkframe[-1]; loggiant(beg); lwkt_switch(); loggiant(end); diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 48c498c..e273cbb 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -501,6 +502,8 @@ lwkt_switch(void) int mpheld; #endif int didaccumulate; + const char *lmsg; /* diagnostic - 'systat -pv 1' */ + const void *laddr; /* * Switching from within a 'fast' (non thread switched) interrupt or IPI @@ -654,6 +657,8 @@ lwkt_switch(void) continue; } } + cpu_time.cp_msg[0] = 0; + cpu_time.cp_stallpc = 0; goto haveidle; } @@ -664,7 +669,7 @@ lwkt_switch(void) #ifdef SMP (ntd->td_mpcount == 0 || mpheld || 
cpu_try_mplock()) && #endif - (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr)) ) { #ifdef SMP clr_mplock_contention_mask(gd); @@ -672,9 +677,16 @@ lwkt_switch(void) goto havethread; } + lmsg = NULL; + laddr = NULL; + #ifdef SMP /* Reload mpheld (it become stale after mplock/token ops) */ mpheld = MP_LOCK_HELD(); + if (ntd->td_mpcount && mpheld == 0) { + lmsg = "mplock"; + laddr = ntd->td_mplock_stallpc; + } #endif /* @@ -743,7 +755,10 @@ lwkt_switch(void) * the (almost) top. */ if (didaccumulate) - break; + break; /* try again from the top, almost */ + if (lmsg) + strlcpy(cpu_time.cp_msg, lmsg, sizeof(cpu_time.cp_msg)); + cpu_time.cp_stallpc = (uintptr_t)laddr; goto haveidle; } @@ -755,7 +770,7 @@ lwkt_switch(void) #ifdef SMP (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && #endif - (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd, &lmsg, &laddr)) ) { #ifdef SMP clr_mplock_contention_mask(gd); @@ -765,6 +780,11 @@ lwkt_switch(void) #ifdef SMP /* Reload mpheld (it become stale after mplock/token ops) */ mpheld = MP_LOCK_HELD(); + if (ntd->td_mpcount && mpheld == 0) { + lmsg = "mplock"; + laddr = ntd->td_mplock_stallpc; + } + if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0) nquserok = 0; #endif diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c index ccb251f..aed1be2 100644 --- a/sys/kern/lwkt_token.c +++ b/sys/kern/lwkt_token.c @@ -227,7 +227,7 @@ _lwkt_tokref_init(lwkt_tokref_t ref, lwkt_token_t tok, thread_t td) * Called from a critical section. */ int -lwkt_getalltokens(thread_t td) +lwkt_getalltokens(thread_t td, const char **msgp, const void **addrp) { lwkt_tokref_t scan; lwkt_tokref_t ref; @@ -268,6 +268,8 @@ lwkt_getalltokens(thread_t td) * Otherwise we failed to acquire all the tokens. * Undo and return. 
*/ + *msgp = tok->t_desc; + *addrp = scan->tr_stallpc; atomic_add_long(&tok->t_collisions, 1); lwkt_relalltokens(td); return(FALSE); @@ -391,7 +393,7 @@ _lwkt_trytokref(lwkt_tokref_t ref, thread_t td) */ static __inline void -_lwkt_gettokref(lwkt_tokref_t ref, thread_t td) +_lwkt_gettokref(lwkt_tokref_t ref, thread_t td, const void **stkframe) { if ((ref->tr_flags & LWKT_TOKEN_MPSAFE) == 0) get_mplock(); @@ -407,6 +409,7 @@ _lwkt_gettokref(lwkt_tokref_t ref, thread_t td) * return tr_tok->t_ref should be assigned to this specific * ref. */ + ref->tr_stallpc = stkframe[-1]; atomic_add_long(&ref->tr_tok->t_collisions, 1); logtoken(fail, ref); lwkt_switch(); @@ -425,7 +428,7 @@ lwkt_gettoken(lwkt_token_t tok) KKASSERT(ref < &td->td_toks_end); _lwkt_tokref_init(ref, tok, td); ++td->td_toks_stop; - _lwkt_gettokref(ref, td); + _lwkt_gettokref(ref, td, (const void **)&tok); } lwkt_token_t @@ -440,7 +443,7 @@ lwkt_getpooltoken(void *ptr) tok = _lwkt_token_pool_lookup(ptr); _lwkt_tokref_init(ref, tok, td); ++td->td_toks_stop; - _lwkt_gettokref(ref, td); + _lwkt_gettokref(ref, td, (const void **)&ptr); return(tok); } @@ -510,7 +513,7 @@ lwkt_token_pool_init(void) int i; for (i = 0; i < LWKT_NUM_POOL_TOKENS; ++i) - lwkt_token_init(&pool_tokens[i], 1); + lwkt_token_init(&pool_tokens[i], 1, "pool"); } lwkt_token_t @@ -524,7 +527,7 @@ lwkt_token_pool_lookup(void *ptr) * acquiring the token and released after releasing the token. */ void -lwkt_token_init(lwkt_token_t tok, int mpsafe) +lwkt_token_init(lwkt_token_t tok, int mpsafe, const char *desc) { tok->t_ref = NULL; tok->t_flags = mpsafe ? 
LWKT_TOKEN_MPSAFE : 0; diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index 0bd0636..4840046 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -1258,7 +1258,7 @@ disk_init(void) objcache_malloc_free, &disk_msg_malloc_args); - lwkt_token_init(&disklist_token, 1); + lwkt_token_init(&disklist_token, 1, "disks"); /* * Initialize the reply-only port which acts as a message drain diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c index 749ac5f..5433fdf 100644 --- a/sys/kern/subr_kobj.c +++ b/sys/kern/subr_kobj.c @@ -66,7 +66,7 @@ static int kobj_next_id = 1; static void kobj_init_token(void *arg) { - lwkt_token_init(&kobj_token, 1); + lwkt_token_init(&kobj_token, 1, "kobj"); } SYSINIT(kobj, SI_BOOT1_LOCK, SI_ORDER_ANY, kobj_init_token, NULL); diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c index e98e89e..4e6588a 100644 --- a/sys/kern/subr_rman.c +++ b/sys/kern/subr_rman.c @@ -91,7 +91,7 @@ rman_init(struct rman *rm) if (once == 0) { once = 1; TAILQ_INIT(&rman_head); - lwkt_token_init(&rman_tok, 1); + lwkt_token_init(&rman_tok, 1, "rman"); } if (rm->rm_type == RMAN_UNINIT) @@ -103,7 +103,7 @@ rman_init(struct rman *rm) rm->rm_slock = kmalloc(sizeof *rm->rm_slock, M_RMAN, M_NOWAIT); if (rm->rm_slock == NULL) return ENOMEM; - lwkt_token_init(rm->rm_slock, 1); + lwkt_token_init(rm->rm_slock, 1, "rmanslock"); lwkt_gettoken(&rman_tok); TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 2fbc592..95c0475 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -381,8 +381,8 @@ pipe_create(struct pipe **cpipep) vfs_timestamp(&cpipe->pipe_ctime); cpipe->pipe_atime = cpipe->pipe_ctime; cpipe->pipe_mtime = cpipe->pipe_ctime; - lwkt_token_init(&cpipe->pipe_rlock, 1); - lwkt_token_init(&cpipe->pipe_wlock, 1); + lwkt_token_init(&cpipe->pipe_rlock, 1, "piper"); + lwkt_token_init(&cpipe->pipe_wlock, 1, "pipew"); return (0); } diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 
7e70cd9..6982ef5 100644 --- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -415,7 +415,7 @@ vnode_ctor(void *obj, void *private, int ocflags) { struct vnode *vp = obj; - lwkt_token_init(&vp->v_token, 1); + lwkt_token_init(&vp->v_token, 1, "vnode"); lockinit(&vp->v_lock, "vnode", 0, 0); ccms_dataspace_init(&vp->v_ccms); TAILQ_INIT(&vp->v_namecache); diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 40c30cd..7595a63 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -135,9 +135,9 @@ static TAILQ_HEAD(,bio_ops) bio_ops_list = TAILQ_HEAD_INITIALIZER(bio_ops_list); void vfs_mount_init(void) { - lwkt_token_init(&mountlist_token, 1); - lwkt_token_init(&mntvnode_token, 1); - lwkt_token_init(&mntid_token, 1); + lwkt_token_init(&mountlist_token, 1, "mntlist"); + lwkt_token_init(&mntvnode_token, 1, "mntvnode"); + lwkt_token_init(&mntid_token, 1, "mntid"); TAILQ_INIT(&mountscan_list); TAILQ_INIT(&mntvnodescan_list); mount_init(&dummymount); @@ -320,7 +320,7 @@ void mount_init(struct mount *mp) { lockinit(&mp->mnt_lock, "vfslock", 0, 0); - lwkt_token_init(&mp->mnt_token, 1); + lwkt_token_init(&mp->mnt_token, 1, "permnt"); TAILQ_INIT(&mp->mnt_nvnodelist); TAILQ_INIT(&mp->mnt_reservedvnlist); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 6d0efe3..635355b 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -195,7 +195,7 @@ vfs_subr_init(void) KvaSize / factor2); desiredvnodes = imax(desiredvnodes, maxproc * 8); - lwkt_token_init(&spechash_token, 1); + lwkt_token_init(&spechash_token, 1, "spechash"); } /* diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 66f3657..1f02e39 100644 --- a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -120,7 +120,7 @@ vfs_sync_init(void) syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF, &syncer_mask); syncer_maxdelay = syncer_mask + 1; - lwkt_token_init(&syncer_token, 1); + lwkt_token_init(&syncer_token, 1, "syncer"); } /* diff --git a/sys/sys/kinfo.h b/sys/sys/kinfo.h index 
e7123fe..c547c6d 100644 --- a/sys/sys/kinfo.h +++ b/sys/sys/kinfo.h @@ -74,6 +74,11 @@ struct kinfo_cputime { uint64_t cp_sys; uint64_t cp_intr; uint64_t cp_idle; + uint64_t cp_unused01; + uint64_t cp_unused02; + uint64_t cp_unused03; + uint64_t cp_stallpc; /* code stall address */ + char cp_msg[32]; /* code stall token or mplock */ }; /* diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 7846409..c31ccca 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -104,6 +104,7 @@ typedef struct lwkt_token { struct lwkt_tokref *t_ref; /* Owning ref or NULL */ intptr_t t_flags; /* MP lock required */ long t_collisions; /* Collision counter */ + const char *t_desc; /* Descriptive name */ } lwkt_token; #define LWKT_TOKEN_MPSAFE 0x0001 @@ -134,6 +135,7 @@ struct lwkt_tokref { lwkt_token_t tr_tok; /* token in question */ struct thread *tr_owner; /* me */ intptr_t tr_flags; /* copy of t_flags */ + const void *tr_stallpc; /* stalled at pc */ }; #define MAXCPUFIFO 16 /* power of 2 */ @@ -229,7 +231,6 @@ struct thread { __uint64_t td_sticks; /* Statclock hits in system mode (uS) */ __uint64_t td_iticks; /* Statclock hits processing intr (uS) */ int td_locks; /* lockmgr lock debugging */ - int td_fairq_lticks; /* fairq wakeup accumulator reset */ void *td_dsched_priv1; /* priv data for I/O schedulers */ int td_refs; /* hold position in gd_tdallq / hold free */ int td_nest_count; /* prevent splz nesting */ @@ -248,6 +249,9 @@ struct thread { struct caps_kinfo *td_caps; /* list of client and server registrations */ lwkt_tokref_t td_toks_stop; struct lwkt_tokref td_toks_array[LWKT_MAXTOKENS]; + int td_fairq_lticks; /* fairq wakeup accumulator reset */ + int td_fairq_accum; /* fairq priority accumulator */ + const void *td_mplock_stallpc; /* last mplock stall address */ #ifdef DEBUG_CRIT_SECTIONS #define CRIT_DEBUG_ARRAY_SIZE 32 #define CRIT_DEBUG_ARRAY_MASK (CRIT_DEBUG_ARRAY_SIZE - 1) @@ -255,7 +259,6 @@ struct thread { int td_crit_debug_index; int td_in_crit_report; #endif - 
int td_fairq_accum; /* fairq priority accumulator */ struct md_thread td_mach; }; @@ -397,10 +400,10 @@ extern void lwkt_passive_release(thread_t); extern void lwkt_gettoken(lwkt_token_t); extern int lwkt_trytoken(lwkt_token_t); extern void lwkt_reltoken(lwkt_token_t); -extern int lwkt_getalltokens(thread_t); +extern int lwkt_getalltokens(thread_t, const char **, const void **); extern void lwkt_relalltokens(thread_t); extern void lwkt_drain_token_requests(void); -extern void lwkt_token_init(lwkt_token_t, int); +extern void lwkt_token_init(lwkt_token_t, int, const char *); extern void lwkt_token_uninit(lwkt_token_t); extern void lwkt_token_pool_init(void); diff --git a/sys/vfs/gnu/ext2fs/ext2_ihash.c b/sys/vfs/gnu/ext2fs/ext2_ihash.c index 686aba7..2121f32 100644 --- a/sys/vfs/gnu/ext2fs/ext2_ihash.c +++ b/sys/vfs/gnu/ext2fs/ext2_ihash.c @@ -69,7 +69,7 @@ ext2_ihashinit(void) ext2_ihash <<= 1; ext2_ihashtbl = kmalloc(sizeof(void *) * ext2_ihash, M_EXT2IHASH, M_WAITOK|M_ZERO); --ext2_ihash; - lwkt_token_init(&ext2_ihash_token, 1); + lwkt_token_init(&ext2_ihash_token, 1, "ext2ihash"); } int diff --git a/sys/vfs/hpfs/hpfs_hash.c b/sys/vfs/hpfs/hpfs_hash.c index d329e26..428e1ff 100644 --- a/sys/vfs/hpfs/hpfs_hash.c +++ b/sys/vfs/hpfs/hpfs_hash.c @@ -69,7 +69,7 @@ hpfs_hphashinit(void) lockinit (&hpfs_hphash_lock, "hpfs_hphashlock", 0, 0); hpfs_hphashtbl = HASHINIT(desiredvnodes, M_HPFSHASH, M_WAITOK, &hpfs_hphash); - lwkt_token_init(&hpfs_hphash_token, 1); + lwkt_token_init(&hpfs_hphash_token, 1, "hpfsihash"); } /* diff --git a/sys/vfs/hpfs/hpfs_vfsops.c b/sys/vfs/hpfs/hpfs_vfsops.c index 5f3df47..10b37cc 100644 --- a/sys/vfs/hpfs/hpfs_vfsops.c +++ b/sys/vfs/hpfs/hpfs_vfsops.c @@ -500,7 +500,7 @@ hpfs_vget(struct mount *mp, struct vnode *dvp, ino_t ino, struct vnode **vpp) if (ino == (ino_t)hpmp->hpm_su.su_rootfno) vsetflags(vp, VROOT); - lwkt_token_init(&hp->h_interlock, 1); + lwkt_token_init(&hp->h_interlock, 1, "hpfsilock"); hp->h_flag = H_INVAL; hp->h_vp = vp; 
diff --git a/sys/vfs/isofs/cd9660/cd9660_node.c b/sys/vfs/isofs/cd9660/cd9660_node.c index cf3f5d0..66f397d 100644 --- a/sys/vfs/isofs/cd9660/cd9660_node.c +++ b/sys/vfs/isofs/cd9660/cd9660_node.c @@ -87,7 +87,7 @@ cd9660_init(struct vfsconf *vfsp) isohashtbl = kmalloc(sizeof(void *) * isohash, M_ISOFSMNT, M_WAITOK|M_ZERO); --isohash; - lwkt_token_init(&cd9660_ihash_token, 1); + lwkt_token_init(&cd9660_ihash_token, 1, "cd9660ihash"); return (0); } diff --git a/sys/vfs/msdosfs/msdosfs_denode.c b/sys/vfs/msdosfs/msdosfs_denode.c index b2a2799..5b82fa1 100644 --- a/sys/vfs/msdosfs/msdosfs_denode.c +++ b/sys/vfs/msdosfs/msdosfs_denode.c @@ -122,7 +122,7 @@ msdosfs_init(struct vfsconf *vfsp) dehashtbl = kmalloc(sizeof(void *) * dehash, M_MSDOSFSMNT, M_WAITOK|M_ZERO); --dehash; - lwkt_token_init(&dehash_token, 1); + lwkt_token_init(&dehash_token, 1, "msdosihash"); return (0); } diff --git a/sys/vfs/ntfs/ntfs_ihash.c b/sys/vfs/ntfs/ntfs_ihash.c index 7af561e..2ce78db 100644 --- a/sys/vfs/ntfs/ntfs_ihash.c +++ b/sys/vfs/ntfs/ntfs_ihash.c @@ -70,7 +70,7 @@ ntfs_nthashinit(void) lockinit(&ntfs_hashlock, "ntfs_nthashlock", 0, 0); ntfs_nthashtbl = HASHINIT(desiredvnodes, M_NTFSNTHASH, M_WAITOK, &ntfs_nthash); - lwkt_token_init(&ntfs_nthash_slock, 1); + lwkt_token_init(&ntfs_nthash_slock, 1, "ntfsihash"); } /* diff --git a/sys/vfs/udf/udf_vfsops.c b/sys/vfs/udf/udf_vfsops.c index 84b3a46..2a2909a 100644 --- a/sys/vfs/udf/udf_vfsops.c +++ b/sys/vfs/udf/udf_vfsops.c @@ -384,7 +384,7 @@ udf_mountfs(struct vnode *devvp, struct mount *mp) brelse(bp); bp = NULL; - lwkt_token_init(&udfmp->hash_token, 1); + lwkt_token_init(&udfmp->hash_token, 1, "udfihash"); udfmp->hashtbl = phashinit(UDF_HASHTBLSIZE, M_UDFMOUNT, &udfmp->hashsz); return(0); diff --git a/sys/vfs/ufs/ufs_ihash.c b/sys/vfs/ufs/ufs_ihash.c index ae543f6..6ed2d76 100644 --- a/sys/vfs/ufs/ufs_ihash.c +++ b/sys/vfs/ufs/ufs_ihash.c @@ -68,7 +68,7 @@ ufs_ihashinit(void) ihash <<= 1; ihashtbl = kmalloc(sizeof(void *) * ihash, 
M_UFSIHASH, M_WAITOK|M_ZERO); --ihash; - lwkt_token_init(&ufs_ihash_token, 1); + lwkt_token_init(&ufs_ihash_token, 1, "ufsihash"); } int diff --git a/sys/vm/vm_vmspace.c b/sys/vm/vm_vmspace.c index 80bcee4..eaa3e66 100644 --- a/sys/vm/vm_vmspace.c +++ b/sys/vm/vm_vmspace.c @@ -97,7 +97,7 @@ sys_vmspace_create(struct vmspace_create_args *uap) lwkt_gettoken(&proc_token); if (p->p_vkernel == NULL) { vkp->refs = 1; - lwkt_token_init(&vkp->token, 1); + lwkt_token_init(&vkp->token, 1, "vkernel"); RB_INIT(&vkp->root); p->p_vkernel = vkp; } else { diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c index 700b773..ae5ff93 100644 --- a/sys/vm/vm_zeroidle.c +++ b/sys/vm/vm_zeroidle.c @@ -208,7 +208,6 @@ vm_pagezero(void __unused *arg) * resched has been requested. */ while (i < PAGE_SIZE) { - lwkt_yield(); if (idlezero_nocache == 1) bzeront(&pg[i], IDLEZERO_RUN); else -- 1.7.7.2 From 4d983f7917ea342ad7c6d58fd6de4b3b443c7786 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 23 Aug 2010 21:55:55 -0700 Subject: [PATCH 02/16] systat - Enhance systat -pv * Enhance the -pv option to display the contention points on all cpus --- usr.bin/systat/Makefile | 2 +- usr.bin/systat/symbols.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++ usr.bin/systat/symbols.h | 41 ++++++++++++++ usr.bin/systat/vmmeter.c | 41 +++++++++++--- 4 files changed, 208 insertions(+), 9 deletions(-) create mode 100644 usr.bin/systat/symbols.c create mode 100644 usr.bin/systat/symbols.h diff --git a/usr.bin/systat/Makefile b/usr.bin/systat/Makefile index e8f4619..eb67d47 100644 --- a/usr.bin/systat/Makefile +++ b/usr.bin/systat/Makefile @@ -5,7 +5,7 @@ PROG= systat CFLAGS+=-DINET6 -I${.CURDIR}/../../sys SRCS= cmds.c cmdtab.c convtbl.c devs.c fetch.c ifcmds.c ifstat.c iostat.c \ keyboard.c main.c mbufs.c netcmds.c netstat.c pigs.c sensors.c swap.c \ - icmp.c mode.c ip.c tcp.c vmstat.c ip6.c icmp6.c vmmeter.c + icmp.c mode.c ip.c tcp.c vmstat.c ip6.c icmp6.c vmmeter.c symbols.c DPADD= ${LIBCURSES} 
${LIBTERMCAP} ${LIBM} ${LIBKVM} ${LIBDEVSTAT} ${LIBKINFO} LDADD= -lcurses -ltermcap -lm -lkvm -ldevstat -lkinfo BINGRP= kmem diff --git a/usr.bin/systat/symbols.c b/usr.bin/systat/symbols.c new file mode 100644 index 0000000..3f821ba --- /dev/null +++ b/usr.bin/systat/symbols.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "symbols.h" + +struct symdata { + TAILQ_ENTRY(symdata) link; + const char *symname; + char *symaddr; + char symtype; +}; + +static TAILQ_HEAD(symlist, symdata) symlist; +static struct symdata *symcache; +static char *symbegin; +static char *symend; + +void +read_symbols(const char *file) +{ + char buf[256]; + char cmd[256]; + size_t buflen = sizeof(buf); + FILE *fp; + struct symdata *sym; + char *s1; + char *s2; + char *s3; + + TAILQ_INIT(&symlist); + + if (file == NULL) { + if (sysctlbyname("kern.bootfile", buf, &buflen, NULL, 0) < 0) + file = "/boot/kernel"; + else + file = buf; + } + snprintf(cmd, sizeof(cmd), "nm -n %s", file); + if ((fp = popen(cmd, "r")) != NULL) { + while (fgets(buf, sizeof(buf), fp) != NULL) { + s1 = strtok(buf, " \t\n"); + s2 = strtok(NULL, " \t\n"); + s3 = strtok(NULL, " \t\n"); + if (s1 && s2 && s3) { + sym = malloc(sizeof(struct symdata)); + sym->symaddr = (char *)strtoul(s1, NULL, 16); + sym->symtype = s2[0]; + sym->symname = strdup(s3); + if (strcmp(s3, "kernbase") == 0) + symbegin = sym->symaddr; + if (strcmp(s3, "end") == 0) + symend = sym->symaddr; + TAILQ_INSERT_TAIL(&symlist, sym, link); + } + } + pclose(fp); + } + symcache = TAILQ_FIRST(&symlist); +} + +const char * +address_to_symbol(void *kptr, struct save_ctx 
*ctx) +{ + char *buf = ctx->save_buf; + int size = sizeof(ctx->save_buf); + + if (symcache == NULL || + (char *)kptr < symbegin || (char *)kptr >= symend + ) { + snprintf(buf, size, "%p", kptr); + return(buf); + } + while ((char *)symcache->symaddr < (char *)kptr) { + if (TAILQ_NEXT(symcache, link) == NULL) + break; + symcache = TAILQ_NEXT(symcache, link); + } + while ((char *)symcache->symaddr > (char *)kptr) { + if (symcache != TAILQ_FIRST(&symlist)) + symcache = TAILQ_PREV(symcache, symlist, link); + } + snprintf(buf, size, "%s+%d", symcache->symname, + (int)((char *)kptr - symcache->symaddr)); + return(buf); +} diff --git a/usr.bin/systat/symbols.h b/usr.bin/systat/symbols.h new file mode 100644 index 0000000..459ba83 --- /dev/null +++ b/usr.bin/systat/symbols.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +struct save_ctx { + char save_buf[512]; + const void *save_kptr; +}; + +void read_symbols(const char *); +const char *address_to_symbol(void *, struct save_ctx *); diff --git a/usr.bin/systat/vmmeter.c b/usr.bin/systat/vmmeter.c index d798549..11b5e51 100644 --- a/usr.bin/systat/vmmeter.c +++ b/usr.bin/systat/vmmeter.c @@ -2,6 +2,7 @@ #include #include #include +#include "symbols.h" #include #include @@ -15,6 +16,7 @@ #define X_START 1 #define CPU_START 1 +#define CPU_STARTX (3 + vmm_ncpus) #define CPU_LABEL_W 7 #define DRAW_ROW(n, y, w, fmt, args...) 
\ @@ -27,6 +29,8 @@ static int vmm_ncpus; static int vmm_fetched; static struct vmmeter *vmm_cur, *vmm_prev; static struct kinfo_cputime *vmm_cptime_cur, *vmm_cptime_prev; +static struct save_ctx symctx; +static int symbols_read; static void getvmm(void) @@ -107,6 +111,15 @@ do { \ #undef CPUV #undef CPUD +#define CPUC(idx, field) vmm_cptime_cur[idx].cp_##field + + n = X_START + CPU_LABEL_W; + + DRAW_ROW(n, CPU_STARTX + i, 15, "%-*s", CPUC(i, msg)); + DRAW_ROW(n, CPU_STARTX + i, 35, "%-*s", + address_to_symbol((void *)(intptr_t)CPUC(i, stallpc), + &symctx)); +#undef CPUC } } @@ -130,22 +143,34 @@ labelvmm(void) n = X_START + CPU_LABEL_W; - DRAW_ROW(n, 0, 6, "%*s", "timer"); - DRAW_ROW(n, 0, 8, "%*s", "ipi"); - DRAW_ROW(n, 0, 8, "%*s", "extint"); - DRAW_ROW(n, 0, 7, "%*s", "user%"); - DRAW_ROW(n, 0, 7, "%*s", "nice%"); - DRAW_ROW(n, 0, 7, "%*s", "sys%"); - DRAW_ROW(n, 0, 7, "%*s", "intr%"); - DRAW_ROW(n, 0, 7, "%*s", "idle%"); + DRAW_ROW(n, CPU_START - 1, 6, "%*s", "timer"); + DRAW_ROW(n, CPU_START - 1, 8, "%*s", "ipi"); + DRAW_ROW(n, CPU_START - 1, 8, "%*s", "extint"); + DRAW_ROW(n, CPU_START - 1, 7, "%*s", "user%"); + DRAW_ROW(n, CPU_START - 1, 7, "%*s", "nice%"); + DRAW_ROW(n, CPU_START - 1, 7, "%*s", "sys%"); + DRAW_ROW(n, CPU_START - 1, 7, "%*s", "intr%"); + DRAW_ROW(n, CPU_START - 1, 7, "%*s", "idle%"); for (i = 0; i < vmm_ncpus; ++i) mvprintw(CPU_START + i, X_START, "cpu%d", i); + + n = X_START + CPU_LABEL_W; + DRAW_ROW(n, CPU_STARTX - 1, 15, "%-*s", "contention"); + DRAW_ROW(n, CPU_STARTX - 1, 35, "%-*s", "function"); + + for (i = 0; i < vmm_ncpus; ++i) + mvprintw(CPU_STARTX + i, X_START, "cpu%d", i); } WINDOW * openvmm(void) { + if (symbols_read == 0) { + symbols_read = 1; + read_symbols(NULL); + } + if (kinfo_get_cpus(&vmm_ncpus)) err(1, "kinfo_get_cpus"); -- 1.7.7.2 From b580dce7b0a217d202bc430339914e44f79ce122 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 23 Aug 2010 22:19:41 -0700 Subject: [PATCH 03/16] buildworld - Adjust ps for recent 
commits * Fixes buildworld --- bin/ps/print.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/ps/print.c b/bin/ps/print.c index 149d8d0..0a409be 100644 --- a/bin/ps/print.c +++ b/bin/ps/print.c @@ -256,7 +256,7 @@ pri(const KINFO *k, const struct varent *vent) if (KI_LWP(k, pid) != -1) printf("%*d", vent->width, KI_LWP(k, prio)); else - printf("%*d", vent->width, -(KI_LWP(k, tdprio) & TDPRI_MASK)); + printf("%*d", vent->width, -(KI_LWP(k, tdprio))); } void @@ -265,7 +265,7 @@ tdpri(const KINFO *k, const struct varent *vent) char buf[32]; int val = KI_LWP(k, tdprio); - snprintf(buf, sizeof(buf), "%02d/%d", val & TDPRI_MASK, val / TDPRI_CRIT); + snprintf(buf, sizeof(buf), "%2d", val); printf("%*s", vent->width, buf); } -- 1.7.7.2 From 6f207a2c748ac4bf659dd2a79f30216ff6c156eb Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 23 Aug 2010 22:56:05 -0700 Subject: [PATCH 04/16] kernel - Adjustments to fix UP kernel build * Minor SMP wrapping issues. --- sys/kern/lwkt_thread.c | 14 +++++++++++++- 1 files changed, 13 insertions(+), 1 deletions(-) diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index e273cbb..85f488d 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -497,8 +497,8 @@ lwkt_switch(void) thread_t ntd; thread_t xtd; thread_t nlast; -#ifdef SMP int nquserok; +#ifdef SMP int mpheld; #endif int didaccumulate; @@ -649,6 +649,7 @@ lwkt_switch(void) ntd = &gd->gd_idlethread; if (gd->gd_reqflags & RQF_IDLECHECK_MASK) ntd->td_flags |= TDF_IDLE_NOHLT; +#ifdef SMP if (ntd->td_mpcount) { if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) panic("Idle thread %p was holding the BGL!", ntd); @@ -657,6 +658,7 @@ lwkt_switch(void) continue; } } +#endif cpu_time.cp_msg[0] = 0; cpu_time.cp_stallpc = 0; goto haveidle; @@ -664,6 +666,9 @@ lwkt_switch(void) /* * Hotpath schedule + * + * NOTE: For UP there is no mplock and lwkt_getalltokens() + * always succeeds. 
*/ if (ntd->td_fairq_accum >= 0 && #ifdef SMP @@ -699,6 +704,8 @@ lwkt_switch(void) #ifdef SMP nquserok = ((ntd->td_pri < TDPRI_KERN_LPSCHED) || (ntd->td_fairq_accum < 0)); +#else + nquserok = 1; #endif nlast = NULL; @@ -737,6 +744,7 @@ lwkt_switch(void) cpu_pause(); ntd = &gd->gd_idlethread; ntd->td_flags |= TDF_IDLE_NOHLT; +#ifdef SMP set_mplock_contention_mask(gd); cpu_mplock_contested(); if (ntd->td_mpcount) { @@ -748,6 +756,7 @@ lwkt_switch(void) break; /* try again from the top, almost */ } } +#endif /* * If fairq accumulations occured we do not schedule the @@ -764,6 +773,9 @@ lwkt_switch(void) /* * Try to switch to this thread. + * + * NOTE: For UP there is no mplock and lwkt_getalltokens() + * always succeeds. */ if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok) && ntd->td_fairq_accum >= 0 && -- 1.7.7.2 From c5fcbac78c1afd853c4f736423750671d4988c38 Mon Sep 17 00:00:00 2001 From: Alex Hornung Date: Tue, 24 Aug 2010 07:54:09 +0100 Subject: [PATCH 05/16] opencrypto - Only yield for cryptosoft * There's no need to yield the CPU when doing hardware-assisted crypto. Move the lwkt_yield()s into cryptosoft. 
--- sys/opencrypto/crypto.c | 2 -- sys/opencrypto/cryptosoft.c | 1 + 2 files changed, 1 insertions(+), 2 deletions(-) diff --git a/sys/opencrypto/crypto.c b/sys/opencrypto/crypto.c index cbc6bee..7e34357 100644 --- a/sys/opencrypto/crypto.c +++ b/sys/opencrypto/crypto.c @@ -830,7 +830,6 @@ crypto_dispatch(struct cryptop *crp) KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__)); if (!cap->cc_qblocked) { result = crypto_invoke(cap, crp, 0); - lwkt_yield(); if (result != ERESTART) return (result); /* @@ -1363,7 +1362,6 @@ crypto_proc(void *arg) CRYPTO_Q_UNLOCK(tdinfo); result = crypto_invoke(cap, submit, hint); - lwkt_yield(); CRYPTO_Q_LOCK(tdinfo); if (result == ERESTART) { diff --git a/sys/opencrypto/cryptosoft.c b/sys/opencrypto/cryptosoft.c index e228178..d940a5b 100644 --- a/sys/opencrypto/cryptosoft.c +++ b/sys/opencrypto/cryptosoft.c @@ -1129,6 +1129,7 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) done: crypto_done(crp); + lwkt_yield(); return 0; } -- 1.7.7.2 From 8801b30b82e74afb41651e3bf29e7a99fcccecf3 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 24 Aug 2010 16:02:09 -0700 Subject: [PATCH 06/16] kernel - Add define for MNTK_ALL_MPSAFE * Add a define to specify all available MPSAFE flags for a vfs. 
--- sys/sys/mount.h | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/sys/sys/mount.h b/sys/sys/mount.h index af2ca6e..d335f4d8 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -299,6 +299,9 @@ struct mount { #define MNTK_NOSTKMNT 0x10000000 /* no stacked mount point allowed */ #define MNTK_NOMSYNC 0x20000000 /* used by tmpfs */ +#define MNTK_ALL_MPSAFE (MNTK_MPSAFE | MNTK_RD_MPSAFE | MNTK_WR_MPSAFE | \ + MNTK_GA_MPSAFE | MNTK_IN_MPSAFE | MNTK_SG_MPSAFE) + /* * mountlist_*() defines */ -- 1.7.7.2 From b0aab9b9c78ffcdedf2e7c00e9731f069aef3710 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 24 Aug 2010 16:04:59 -0700 Subject: [PATCH 07/16] HAMMER VFS - Make all entry points MPSAFE, remove giant & critical sections * All VFS, VOP, ioops, and bio_done entry points are now mpsafe and no longer use giant. * Implement hmp->fs_token and hmp->io_token for each HAMMER mount. All operations that previously needed the MP lock now use hmp->fs_token. All operations that interact with BIO callbacks now use hmp->io_token. All critical sections now use io_token (these previously interlocked against IO callbacks). NOTE: read (for cached data) and getattr were MPSAFE before and continue to be MPSAFE. 
--- sys/vfs/hammer/hammer.h | 7 +- sys/vfs/hammer/hammer_blockmap.c | 2 + sys/vfs/hammer/hammer_flusher.c | 15 ++- sys/vfs/hammer/hammer_inode.c | 8 +- sys/vfs/hammer/hammer_io.c | 191 +++++++++++++++++++++++-------- sys/vfs/hammer/hammer_ondisk.c | 29 +++-- sys/vfs/hammer/hammer_signal.c | 5 + sys/vfs/hammer/hammer_vfsops.c | 82 ++++++++++---- sys/vfs/hammer/hammer_vnops.c | 231 +++++++++++++++++++++++++++----------- 9 files changed, 412 insertions(+), 158 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 76914c5..22fa93d 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -859,8 +859,8 @@ struct hammer_mount { struct hammer_io_list meta_list; /* dirty meta bufs */ struct hammer_io_list lose_list; /* loose buffers */ int locked_dirty_space; /* meta/volu count */ - int io_running_space; /* track I/O in progress */ - int io_running_wakeup; + int io_running_space; /* io_token */ + int io_running_wakeup; /* io_token */ int objid_cache_count; int error; /* critical I/O error */ struct krate krate; /* rate limited kprintf */ @@ -891,6 +891,9 @@ struct hammer_mount { TAILQ_HEAD(, hammer_reclaim) reclaim_list; TAILQ_HEAD(, hammer_io) iorun_list; + struct lwkt_token fs_token; /* high level */ + struct lwkt_token io_token; /* low level (IO callback) */ + struct hammer_inostats inostats[HAMMER_INOSTATS_HSIZE]; }; diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c index e75615e..b308eab 100644 --- a/sys/vfs/hammer/hammer_blockmap.c +++ b/sys/vfs/hammer/hammer_blockmap.c @@ -1223,6 +1223,8 @@ failed: /* * Check space availability + * + * MPSAFE - does not require fs_token */ int _hammer_checkspace(hammer_mount_t hmp, int slop, int64_t *resp) diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 1c8a433..45a2b4f 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -171,13 +171,13 @@ hammer_flusher_create(hammer_mount_t hmp) 
TAILQ_INIT(&hmp->flusher.ready_list); lwkt_create(hammer_flusher_master_thread, hmp, - &hmp->flusher.td, NULL, 0, -1, "hammer-M"); + &hmp->flusher.td, NULL, TDF_MPSAFE, -1, "hammer-M"); for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) { info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO); info->hmp = hmp; TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry); lwkt_create(hammer_flusher_slave_thread, info, - &info->td, NULL, 0, -1, "hammer-S%d", i); + &info->td, NULL, TDF_MPSAFE, -1, "hammer-S%d", i); } } @@ -222,6 +222,8 @@ hammer_flusher_master_thread(void *arg) hmp = arg; + lwkt_gettoken(&hmp->fs_token); + for (;;) { /* * Do at least one flush cycle. We may have to update the @@ -264,6 +266,7 @@ hammer_flusher_master_thread(void *arg) */ hmp->flusher.td = NULL; wakeup(&hmp->flusher.exiting); + lwkt_reltoken(&hmp->fs_token); lwkt_exit(); } @@ -438,6 +441,7 @@ hammer_flusher_slave_thread(void *arg) info = arg; hmp = info->hmp; + lwkt_gettoken(&hmp->fs_token); for (;;) { while (info->runstate == 0) @@ -459,6 +463,7 @@ hammer_flusher_slave_thread(void *arg) } info->td = NULL; wakeup(&info->td); + lwkt_reltoken(&hmp->fs_token); lwkt_exit(); } @@ -472,9 +477,11 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp) * loose ends - buffers without bp's aren't tracked by the kernel * and can build up, so clean them out. This can occur when an * IO completes on a buffer with no references left. + * + * The io_token is needed to protect the list. 
*/ if ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) { - crit_enter(); /* biodone() race */ + lwkt_gettoken(&hmp->io_token); while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) { KKASSERT(io->mod_list == &hmp->lose_list); TAILQ_REMOVE(&hmp->lose_list, io, mod_entry); @@ -483,7 +490,7 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp) buffer = (void *)io; hammer_rel_buffer(buffer, 0); } - crit_exit(); + lwkt_reltoken(&hmp->io_token); } } diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 4678bf7..d4784fa 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -177,6 +177,7 @@ int hammer_vop_inactive(struct vop_inactive_args *ap) { struct hammer_inode *ip = VTOI(ap->a_vp); + hammer_mount_t hmp; /* * Degenerate case @@ -199,12 +200,13 @@ hammer_vop_inactive(struct vop_inactive_args *ap) * multiple inode updates. */ if (ip->ino_data.nlinks == 0) { - get_mplock(); + hmp = ip->hmp; + lwkt_gettoken(&hmp->fs_token); hammer_inode_unloadable_check(ip, 0); if (ip->flags & HAMMER_INODE_MODMASK) hammer_flush_inode(ip, 0); + lwkt_reltoken(&hmp->fs_token); vrecycle(ap->a_vp); - rel_mplock(); } return(0); } @@ -229,6 +231,7 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap) if ((ip = vp->v_data) != NULL) { hmp = ip->hmp; + lwkt_gettoken(&hmp->fs_token); hammer_lock_ex(&ip->lock); vp->v_data = NULL; ip->vp = NULL; @@ -240,6 +243,7 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap) } hammer_unlock(&ip->lock); hammer_rel_inode(ip, 1); + lwkt_reltoken(&hmp->fs_token); } return(0); } diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index a959a83..c934468 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -77,11 +77,17 @@ hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type) /* * Helper routine to disassociate a buffer cache buffer from an I/O - * structure. The buffer is unlocked and marked appropriate for reclamation. + * structure. 
The io must be interlocked marked appropriately for + * reclamation. * * The io may have 0 or 1 references depending on who called us. The * caller is responsible for dealing with the refs. * + * The io must be in a released state with the io->bp owned and + * locked by the caller of this function. When not called from an + * io_deallocate() this cannot race an io_deallocate() since the + * kernel would be unable to get the buffer lock in that case. + * * This call can only be made when no action is required on the buffer. * * The caller must own the buffer and the IO must indicate that the @@ -102,7 +108,7 @@ hammer_io_disassociate(hammer_io_structure_t iou) * If the buffer was locked someone wanted to get rid of it. */ if (bp->b_flags & B_LOCKED) { - --hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, -1); bp->b_flags &= ~B_LOCKED; } if (iou->io.reclaim) { @@ -135,15 +141,16 @@ void hammer_io_wait(hammer_io_t io) { if (io->running) { - for (;;) { + hammer_mount_t hmp = io->hmp; + + lwkt_gettoken(&hmp->io_token); + while (io->running) { io->waiting = 1; tsleep_interlock(io, 0); - if (io->running == 0) - break; - tsleep(io, PINTERLOCKED, "hmrflw", hz); - if (io->running == 0) - break; + if (io->running) + tsleep(io, PINTERLOCKED, "hmrflw", hz); } + lwkt_reltoken(&hmp->io_token); } } @@ -162,9 +169,9 @@ hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush) /* * Degenerate case, no I/O is running */ - crit_enter(); + lwkt_gettoken(&hmp->io_token); if (TAILQ_EMPTY(&hmp->iorun_list)) { - crit_exit(); + lwkt_reltoken(&hmp->io_token); if (doflush) hammer_io_flush_sync(hmp); return; @@ -188,7 +195,7 @@ hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush) io = TAILQ_FIRST(&hmp->iorun_list); if (io && io->type == HAMMER_STRUCTURE_DUMMY) wakeup(io); - crit_exit(); + lwkt_reltoken(&hmp->io_token); if (doflush) hammer_io_flush_sync(hmp); @@ -216,11 +223,20 @@ hammer_io_clear_error(struct hammer_io *io) * This is used by 
HAMMER's reblocking code to avoid trying to * swapcache the filesystem's data when it is read or written * by the reblocking code. + * + * The caller has a ref on the buffer preventing the bp from + * being disassociated from it. */ void hammer_io_notmeta(hammer_buffer_t buffer) { - buffer->io.bp->b_flags |= B_NOTMETA; + if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) { + hammer_mount_t hmp = buffer->io.hmp; + + lwkt_gettoken(&hmp->io_token); + buffer->io.bp->b_flags |= B_NOTMETA; + lwkt_reltoken(&hmp->io_token); + } } @@ -238,6 +254,8 @@ hammer_io_notmeta(hammer_buffer_t buffer) * zones cannot be clustered due to their mixed buffer sizes. This is * not an issue since such clustering occurs in hammer_vnops at the * regular file layer, whereas this is the buffered block device layer. + * + * No I/O callbacks can occur while we hold the buffer locked. */ int hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) @@ -246,7 +264,7 @@ hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) int error; if ((bp = io->bp) == NULL) { - hammer_count_io_running_read += io->bytes; + atomic_add_int(&hammer_count_io_running_read, io->bytes); if (hammer_cluster_enable && limit > io->bytes) { error = cluster_read(devvp, io->offset + limit, io->offset, io->bytes, @@ -257,7 +275,7 @@ hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) error = bread(devvp, io->offset, io->bytes, &io->bp); } hammer_stats_disk_read += io->bytes; - hammer_count_io_running_read -= io->bytes; + atomic_add_int(&hammer_count_io_running_read, -io->bytes); /* * The code generally assumes b_ops/b_dep has been set-up, @@ -305,6 +323,8 @@ hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) bp->b_flags &= ~B_IODEBUG; bp->b_ops = &hammer_bioops; KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); + + /* io->worklist is locked by the io lock */ LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node); BUF_KERNPROC(bp); KKASSERT(io->modified == 0); @@ -327,6 +347,8 @@ 
hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit) * * This function will also mark the IO as modified but it will not * increment the modify_refs count. + * + * No I/O callbacks can occur while we hold the buffer locked. */ int hammer_io_new(struct vnode *devvp, struct hammer_io *io) @@ -338,6 +360,8 @@ hammer_io_new(struct vnode *devvp, struct hammer_io *io) bp = io->bp; bp->b_ops = &hammer_bioops; KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); + + /* io->worklist is locked by the io lock */ LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node); io->released = 0; KKASSERT(io->running == 0); @@ -358,6 +382,8 @@ hammer_io_new(struct vnode *devvp, struct hammer_io *io) /* * Advance the activity count on the underlying buffer because * HAMMER does not getblk/brelse on every access. + * + * The io->bp cannot go away while the buffer is referenced. */ void hammer_io_advance(struct hammer_io *io) @@ -382,17 +408,21 @@ int hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) { hammer_io_structure_t iou; + hammer_mount_t hmp; hammer_off_t phys_offset; struct buf *bp; int error; + hmp = volume->io.hmp; + lwkt_gettoken(&hmp->io_token); + phys_offset = volume->ondisk->vol_buf_beg + (zone2_offset & HAMMER_OFF_SHORT_MASK); - crit_enter(); if ((bp = findblk(volume->devvp, phys_offset, FINDBLK_TEST)) != NULL) bp = getblk(volume->devvp, phys_offset, bp->b_bufsize, 0, 0); else bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0); + if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) { #if 0 hammer_ref(&iou->io.lock); @@ -415,7 +445,7 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) brelse(bp); error = 0; } - crit_exit(); + lwkt_reltoken(&hmp->io_token); return(error); } @@ -593,9 +623,8 @@ hammer_io_flush(struct hammer_io *io, int reclaim) /* * Degenerate case - nothing to flush if nothing is dirty. 
*/ - if (io->modified == 0) { + if (io->modified == 0) return; - } KKASSERT(io->bp); KKASSERT(io->modify_refs <= 0); @@ -607,6 +636,8 @@ hammer_io_flush(struct hammer_io *io, int reclaim) * We are going to bawrite() this bp. Don't leave a window where * io->released is set, we actually own the bp rather then our * buffer. + * + * The io_token should not be required here as only */ bp = io->bp; if (io->released) { @@ -615,14 +646,15 @@ hammer_io_flush(struct hammer_io *io, int reclaim) /* io->released = 0; */ KKASSERT(io->released); KKASSERT(io->bp == bp); + } else { + io->released = 1; } - io->released = 1; if (reclaim) { io->reclaim = 1; if ((bp->b_flags & B_LOCKED) == 0) { bp->b_flags |= B_LOCKED; - ++hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, 1); } } @@ -651,11 +683,14 @@ hammer_io_flush(struct hammer_io *io, int reclaim) /* * Transfer ownership to the kernel and initiate I/O. + * + * NOTE: We do not hold io_token so an atomic op is required to + * update io_running_space. */ io->running = 1; - io->hmp->io_running_space += io->bytes; + atomic_add_int(&io->hmp->io_running_space, io->bytes); + atomic_add_int(&hammer_count_io_running_write, io->bytes); TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); - hammer_count_io_running_write += io->bytes; bawrite(bp); hammer_io_flush_mark(io->volume); } @@ -826,7 +861,7 @@ hammer_io_clear_modify(struct hammer_io *io, int inval) if (io->mod_list == &io->hmp->volu_list || io->mod_list == &io->hmp->meta_list) { io->hmp->locked_dirty_space -= io->bytes; - hammer_count_dirtybufspace -= io->bytes; + atomic_add_int(&hammer_count_dirtybufspace, -io->bytes); } TAILQ_REMOVE(io->mod_list, io, mod_entry); io->mod_list = NULL; @@ -868,17 +903,23 @@ restart: * Clear the IO's modify list. Even though the IO is no longer modified * it may still be on the lose_list. This routine is called just before * the governing hammer_buffer is destroyed. + * + * mod_list requires io_token protection. 
*/ void hammer_io_clear_modlist(struct hammer_io *io) { + hammer_mount_t hmp = io->hmp; + KKASSERT(io->modified == 0); if (io->mod_list) { - crit_enter(); /* biodone race against list */ - KKASSERT(io->mod_list == &io->hmp->lose_list); - TAILQ_REMOVE(io->mod_list, io, mod_entry); - io->mod_list = NULL; - crit_exit(); + lwkt_gettoken(&hmp->io_token); + if (io->mod_list) { + KKASSERT(io->mod_list == &io->hmp->lose_list); + TAILQ_REMOVE(io->mod_list, io, mod_entry); + io->mod_list = NULL; + } + lwkt_reltoken(&hmp->io_token); } } @@ -893,12 +934,12 @@ hammer_io_set_modlist(struct hammer_io *io) case HAMMER_STRUCTURE_VOLUME: io->mod_list = &hmp->volu_list; hmp->locked_dirty_space += io->bytes; - hammer_count_dirtybufspace += io->bytes; + atomic_add_int(&hammer_count_dirtybufspace, io->bytes); break; case HAMMER_STRUCTURE_META_BUFFER: io->mod_list = &hmp->meta_list; hmp->locked_dirty_space += io->bytes; - hammer_count_dirtybufspace += io->bytes; + atomic_add_int(&hammer_count_dirtybufspace, io->bytes); break; case HAMMER_STRUCTURE_UNDO_BUFFER: io->mod_list = &hmp->undo_list; @@ -921,10 +962,13 @@ hammer_io_set_modlist(struct hammer_io *io) /* * Pre-IO initiation kernel callback - cluster build only + * + * bioops callback - hold io_token */ static void hammer_io_start(struct buf *bp) { + /* nothing to do, so io_token not needed */ } /* @@ -933,6 +977,8 @@ hammer_io_start(struct buf *bp) * NOTE: HAMMER may modify a buffer after initiating I/O. The modified bit * may also be set if we were marking a cluster header open. Only remove * our dependancy if the modified bit is clear. 
+ * + * bioops callback - hold io_token */ static void hammer_io_complete(struct buf *bp) @@ -941,6 +987,8 @@ hammer_io_complete(struct buf *bp) struct hammer_mount *hmp = iou->io.hmp; struct hammer_io *ionext; + lwkt_gettoken(&hmp->io_token); + KKASSERT(iou->io.released == 1); /* @@ -981,8 +1029,8 @@ hammer_io_complete(struct buf *bp) #endif } hammer_stats_disk_write += iou->io.bytes; - hammer_count_io_running_write -= iou->io.bytes; - hmp->io_running_space -= iou->io.bytes; + atomic_add_int(&hammer_count_io_running_write, -iou->io.bytes); + atomic_add_int(&hmp->io_running_space, -iou->io.bytes); if (hmp->io_running_wakeup && hmp->io_running_space < hammer_limit_running_io / 2) { hmp->io_running_wakeup = 0; @@ -1016,11 +1064,12 @@ hammer_io_complete(struct buf *bp) * interlock. */ if (bp->b_flags & B_LOCKED) { - --hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, -1); bp->b_flags &= ~B_LOCKED; hammer_io_deallocate(bp); /* structure may be dead now */ } + lwkt_reltoken(&hmp->io_token); } /* @@ -1037,11 +1086,18 @@ hammer_io_complete(struct buf *bp) * our only recourse is to set B_LOCKED. * * WARNING: This may be called from an interrupt via hammer_io_complete() + * + * bioops callback - hold io_token */ static void hammer_io_deallocate(struct buf *bp) { hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep); + hammer_mount_t hmp; + + hmp = iou->io.hmp; + + lwkt_gettoken(&hmp->io_token); KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0); if (hammer_try_interlock_norefs(&iou->io.lock) == 0) { @@ -1050,52 +1106,64 @@ hammer_io_deallocate(struct buf *bp) * or interlocked HAMMER structure. */ bp->b_flags |= B_LOCKED; - ++hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, 1); } else if (iou->io.modified) { /* * It is not legal to disassociate a modified buffer. This * case really shouldn't ever occur. 
*/ bp->b_flags |= B_LOCKED; - ++hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, 1); hammer_put_interlock(&iou->io.lock, 0); } else { /* * Disassociate the BP. If the io has no refs left we - * have to add it to the loose list. + * have to add it to the loose list. The kernel has + * locked the buffer and therefore our io must be + * in a released state. */ hammer_io_disassociate(iou); if (iou->io.type != HAMMER_STRUCTURE_VOLUME) { KKASSERT(iou->io.bp == NULL); KKASSERT(iou->io.mod_list == NULL); - crit_enter(); /* biodone race against list */ - iou->io.mod_list = &iou->io.hmp->lose_list; + iou->io.mod_list = &hmp->lose_list; TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry); - crit_exit(); } hammer_put_interlock(&iou->io.lock, 1); } + lwkt_reltoken(&hmp->io_token); } +/* + * bioops callback - hold io_token + */ static int hammer_io_fsync(struct vnode *vp) { + /* nothing to do, so io_token not needed */ return(0); } /* * NOTE: will not be called unless we tell the kernel about the * bioops. Unused... we use the mount's VFS_SYNC instead. + * + * bioops callback - hold io_token */ static int hammer_io_sync(struct mount *mp) { + /* nothing to do, so io_token not needed */ return(0); } +/* + * bioops callback - hold io_token + */ static void hammer_io_movedeps(struct buf *bp1, struct buf *bp2) { + /* nothing to do, so io_token not needed */ } /* @@ -1111,29 +1179,38 @@ hammer_io_movedeps(struct buf *bp1, struct buf *bp2) * * checkwrite will only be called for bdwrite()n buffers. If we return * success the kernel is guaranteed to initiate the buffer write. + * + * bioops callback - hold io_token */ static int hammer_io_checkread(struct buf *bp) { + /* nothing to do, so io_token not needed */ return(0); } +/* + * bioops callback - hold io_token + */ static int hammer_io_checkwrite(struct buf *bp) { hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep); + hammer_mount_t hmp = io->hmp; /* * This shouldn't happen under normal operation. 
*/ + lwkt_gettoken(&hmp->io_token); if (io->type == HAMMER_STRUCTURE_VOLUME || io->type == HAMMER_STRUCTURE_META_BUFFER) { if (!panicstr) panic("hammer_io_checkwrite: illegal buffer"); if ((bp->b_flags & B_LOCKED) == 0) { bp->b_flags |= B_LOCKED; - ++hammer_count_io_locked; + atomic_add_int(&hammer_count_io_locked, 1); } + lwkt_reltoken(&hmp->io_token); return(1); } @@ -1158,19 +1235,25 @@ hammer_io_checkwrite(struct buf *bp) */ KKASSERT(io->running == 0); io->running = 1; - io->hmp->io_running_space += io->bytes; + atomic_add_int(&io->hmp->io_running_space, io->bytes); + atomic_add_int(&hammer_count_io_running_write, io->bytes); TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); - hammer_count_io_running_write += io->bytes; + + lwkt_reltoken(&hmp->io_token); + return(0); } /* * Return non-zero if we wish to delay the kernel's attempt to flush * this buffer to disk. + * + * bioops callback - hold io_token */ static int hammer_io_countdeps(struct buf *bp, int n) { + /* nothing to do, so io_token not needed */ return(0); } @@ -1284,6 +1367,9 @@ done: /* * On completion of the BIO this callback must check the data CRC * and chain to the previous bio. + * + * MPSAFE - since we do not modify any hammer_records we do not need + * io_token. 
*/ static void @@ -1442,19 +1528,25 @@ hammer_io_direct_write_complete(struct bio *nbio) { struct bio *obio; struct buf *bp; - hammer_record_t record = nbio->bio_caller_info1.ptr; + hammer_record_t record; + hammer_mount_t hmp; + + record = nbio->bio_caller_info1.ptr; + KKASSERT(record != NULL); + hmp = record->ip->hmp; + + lwkt_gettoken(&hmp->io_token); bp = nbio->bio_buf; obio = pop_bio(nbio); if (bp->b_flags & B_ERROR) { - hammer_critical_error(record->ip->hmp, record->ip, + hammer_critical_error(hmp, record->ip, bp->b_error, "while writing bulk data"); bp->b_flags |= B_INVAL; } biodone(obio); - KKASSERT(record != NULL); KKASSERT(record->flags & HAMMER_RECF_DIRECT_IO); if (record->flags & HAMMER_RECF_DIRECT_WAIT) { record->flags &= ~(HAMMER_RECF_DIRECT_IO | @@ -1465,6 +1557,7 @@ hammer_io_direct_write_complete(struct bio *nbio) record->flags &= ~HAMMER_RECF_DIRECT_IO; /* record can disappear once DIRECT_IO flag is cleared */ } + lwkt_reltoken(&hmp->io_token); } @@ -1481,16 +1574,18 @@ hammer_io_direct_write_complete(struct bio *nbio) void hammer_io_direct_wait(hammer_record_t record) { + hammer_mount_t hmp = record->ip->hmp; + /* * Wait for I/O to complete */ if (record->flags & HAMMER_RECF_DIRECT_IO) { - crit_enter(); + lwkt_gettoken(&hmp->io_token); while (record->flags & HAMMER_RECF_DIRECT_IO) { record->flags |= HAMMER_RECF_DIRECT_WAIT; tsleep(&record->flags, 0, "hmdiow", 0); } - crit_exit(); + lwkt_reltoken(&hmp->io_token); } /* @@ -1505,7 +1600,7 @@ hammer_io_direct_wait(hammer_record_t record) */ if (record->flags & HAMMER_RECF_DIRECT_INVAL) { KKASSERT(record->leaf.data_offset); - hammer_del_buffers(record->ip->hmp, record->leaf.data_offset, + hammer_del_buffers(hmp, record->leaf.data_offset, record->zone2_offset, record->leaf.data_len, 1); record->flags &= ~HAMMER_RECF_DIRECT_INVAL; diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 694e85f..b9d1eec 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ 
b/sys/vfs/hammer/hammer_ondisk.c @@ -554,15 +554,18 @@ again: * buffers will never be in a modified state. This should * only occur on the 0->1 transition of refs. * - * lose_list can be modified via a biodone() interrupt. + * lose_list can be modified via a biodone() interrupt + * so the io_token must be held. */ if (buffer->io.mod_list == &hmp->lose_list) { - crit_enter(); /* biodone race against list */ - TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, - mod_entry); - crit_exit(); - buffer->io.mod_list = NULL; - KKASSERT(buffer->io.modified == 0); + lwkt_gettoken(&hmp->io_token); + if (buffer->io.mod_list == &hmp->lose_list) { + TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, + mod_entry); + buffer->io.mod_list = NULL; + KKASSERT(buffer->io.modified == 0); + } + lwkt_reltoken(&hmp->io_token); } goto found; } @@ -903,6 +906,7 @@ hammer_unload_buffer(hammer_buffer_t buffer, void *data) int hammer_ref_buffer(hammer_buffer_t buffer) { + hammer_mount_t hmp; int error; int locked; @@ -911,22 +915,23 @@ hammer_ref_buffer(hammer_buffer_t buffer) * 0->1 transition. */ locked = hammer_ref_interlock(&buffer->io.lock); + hmp = buffer->io.hmp; /* * At this point a biodone() will not touch the buffer other then * incidental bits. However, lose_list can be modified via * a biodone() interrupt. * - * No longer loose + * No longer loose. lose_list requires the io_token. 
*/ - if (buffer->io.mod_list == &buffer->io.hmp->lose_list) { - crit_enter(); - if (buffer->io.mod_list == &buffer->io.hmp->lose_list) { + if (buffer->io.mod_list == &hmp->lose_list) { + lwkt_gettoken(&hmp->io_token); + if (buffer->io.mod_list == &hmp->lose_list) { TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry); buffer->io.mod_list = NULL; } - crit_exit(); + lwkt_reltoken(&hmp->io_token); } if (locked) { diff --git a/sys/vfs/hammer/hammer_signal.c b/sys/vfs/hammer/hammer_signal.c index fd48879..8804b13 100644 --- a/sys/vfs/hammer/hammer_signal.c +++ b/sys/vfs/hammer/hammer_signal.c @@ -39,6 +39,11 @@ #include "hammer.h" +/* + * Check for a user signal interrupting a long operation + * + * MPSAFE + */ int hammer_signal_check(hammer_mount_t hmp) { diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index ff591d6..c9b01fc 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -395,7 +395,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, } /* - * Interal mount data structure + * Internal mount data structure */ if (hmp == NULL) { hmp = kmalloc(sizeof(*hmp), M_HAMMER, M_WAITOK | M_ZERO); @@ -467,6 +467,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, * recovery if it has not already been run. 
*/ if (mp->mnt_flag & MNT_UPDATE) { + lwkt_gettoken(&hmp->fs_token); error = 0; if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { kprintf("HAMMER read-only -> read-write\n"); @@ -498,6 +499,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, hammer_adjust_volume_mode, NULL); } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -519,6 +521,11 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, TAILQ_INIT(&hmp->lose_list); TAILQ_INIT(&hmp->iorun_list); + lwkt_token_init(&hmp->fs_token, 1, "hammerfs"); + lwkt_token_init(&hmp->io_token, 1, "hammerio"); + + lwkt_gettoken(&hmp->fs_token); + /* * Load volumes */ @@ -586,6 +593,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, } if (error) { + /* called with fs_token held */ hammer_free_hmp(mp); return (error); } @@ -603,8 +611,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, * on return, so even if we do not specify it we no longer get * the BGL regardlless of how we are flagged. */ - mp->mnt_kern_flag |= MNTK_RD_MPSAFE | MNTK_GA_MPSAFE | - MNTK_IN_MPSAFE; + mp->mnt_kern_flag |= MNTK_ALL_MPSAFE; + /*MNTK_RD_MPSAFE | MNTK_GA_MPSAFE | MNTK_IN_MPSAFE;*/ /* * note: f_iosize is used by vnode_pager_haspage() when constructing @@ -741,45 +749,54 @@ failed: /* * Cleanup and return. */ - if (error) + if (error) { + /* called with fs_token held */ hammer_free_hmp(mp); + } else { + lwkt_reltoken(&hmp->fs_token); + } return (error); } static int hammer_vfs_unmount(struct mount *mp, int mntflags) { -#if 0 - struct hammer_mount *hmp = (void *)mp->mnt_data; -#endif + hammer_mount_t hmp = (void *)mp->mnt_data; int flags; int error; /* * Clean out the vnodes */ + lwkt_gettoken(&hmp->fs_token); flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; - if ((error = vflush(mp, 0, flags)) != 0) - return (error); + error = vflush(mp, 0, flags); /* * Clean up the internal mount structure and related entities. 
This * may issue I/O. */ - hammer_free_hmp(mp); - return(0); + if (error == 0) { + /* called with fs_token held */ + hammer_free_hmp(mp); + } else { + lwkt_reltoken(&hmp->fs_token); + } + return(error); } /* * Clean up the internal mount structure and disassociate it from the mount. * This may issue I/O. + * + * Called with fs_token held. */ static void hammer_free_hmp(struct mount *mp) { - struct hammer_mount *hmp = (void *)mp->mnt_data; + hammer_mount_t hmp = (void *)mp->mnt_data; hammer_flush_group_t flg; int count; int dummy; @@ -857,6 +874,7 @@ hammer_free_hmp(struct mount *mp) hammer_destroy_objid_cache(hmp); kmalloc_destroy(&hmp->m_misc); kmalloc_destroy(&hmp->m_inodes); + lwkt_reltoken(&hmp->fs_token); kfree(hmp, M_HAMMER); } @@ -901,6 +919,7 @@ hammer_vfs_vget(struct mount *mp, struct vnode *dvp, int error; u_int32_t localization; + lwkt_gettoken(&hmp->fs_token); hammer_simple_transaction(&trans, hmp); /* @@ -925,12 +944,12 @@ hammer_vfs_vget(struct mount *mp, struct vnode *dvp, 0, &error); if (ip == NULL) { *vpp = NULL; - hammer_done_transaction(&trans); - return(error); + } else { + error = hammer_get_vnode(ip, vpp); + hammer_rel_inode(ip, 0); } - error = hammer_get_vnode(ip, vpp); - hammer_rel_inode(ip, 0); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -943,9 +962,6 @@ hammer_vfs_vget(struct mount *mp, struct vnode *dvp, static int hammer_vfs_root(struct mount *mp, struct vnode **vpp) { -#if 0 - struct hammer_mount *hmp = (void *)mp->mnt_data; -#endif int error; error = hammer_vfs_vget(mp, NULL, 1, vpp); @@ -962,9 +978,12 @@ hammer_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) int64_t bfree; int64_t breserved; + lwkt_gettoken(&hmp->fs_token); volume = hammer_get_root_volume(hmp, &error); - if (error) + if (error) { + lwkt_reltoken(&hmp->fs_token); return(error); + } ondisk = volume->ondisk; /* @@ -981,6 +1000,7 @@ hammer_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) 
mp->mnt_stat.f_files = 0; *sbp = mp->mnt_stat; + lwkt_reltoken(&hmp->fs_token); return(0); } @@ -994,9 +1014,12 @@ hammer_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) int64_t bfree; int64_t breserved; + lwkt_gettoken(&hmp->fs_token); volume = hammer_get_root_volume(hmp, &error); - if (error) + if (error) { + lwkt_reltoken(&hmp->fs_token); return(error); + } ondisk = volume->ondisk; /* @@ -1012,6 +1035,7 @@ hammer_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) if (mp->mnt_vstat.f_files < 0) mp->mnt_vstat.f_files = 0; *sbp = mp->mnt_vstat; + lwkt_reltoken(&hmp->fs_token); return(0); } @@ -1029,16 +1053,21 @@ hammer_vfs_sync(struct mount *mp, int waitfor) struct hammer_mount *hmp = (void *)mp->mnt_data; int error; + lwkt_gettoken(&hmp->fs_token); if (panicstr == NULL) { error = hammer_sync_hmp(hmp, waitfor); } else { error = EIO; } + lwkt_reltoken(&hmp->fs_token); return (error); } /* * Convert a vnode to a file handle. + * + * Accesses read-only fields on already-referenced structures so + * no token is needed. */ static int hammer_vfs_vptofh(struct vnode *vp, struct fid *fhp) @@ -1065,6 +1094,7 @@ static int hammer_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, struct fid *fhp, struct vnode **vpp) { + hammer_mount_t hmp = (void *)mp->mnt_data; struct hammer_transaction trans; struct hammer_inode *ip; struct hammer_inode_info info; @@ -1078,7 +1108,8 @@ hammer_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, else localization = (u_int32_t)fhp->fid_ext << 16; - hammer_simple_transaction(&trans, (void *)mp->mnt_data); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); /* * Get/allocate the hammer_inode structure. 
The structure must be @@ -1094,6 +1125,7 @@ hammer_vfs_fhtovp(struct mount *mp, struct vnode *rootvp, *vpp = NULL; } hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1105,6 +1137,7 @@ hammer_vfs_checkexp(struct mount *mp, struct sockaddr *nam, struct netcred *np; int error; + lwkt_gettoken(&hmp->fs_token); np = vfs_export_lookup(mp, &hmp->export, nam); if (np) { *exflagsp = np->netc_exflags; @@ -1113,6 +1146,7 @@ hammer_vfs_checkexp(struct mount *mp, struct sockaddr *nam, } else { error = EACCES; } + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1123,6 +1157,8 @@ hammer_vfs_export(struct mount *mp, int op, const struct export_args *export) hammer_mount_t hmp = (void *)mp->mnt_data; int error; + lwkt_gettoken(&hmp->fs_token); + switch(op) { case MOUNTCTL_SET_EXPORT: error = vfs_export(mp, &hmp->export, export); @@ -1131,6 +1167,8 @@ hammer_vfs_export(struct mount *mp, int op, const struct export_args *export) error = EOPNOTSUPP; break; } + lwkt_reltoken(&hmp->fs_token); + return(error); } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 05c4084..048d312 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -210,6 +210,8 @@ hammer_vop_fsync(struct vop_fsync_args *ap) int waitfor = ap->a_waitfor; int mode; + lwkt_gettoken(&hmp->fs_token); + /* * Fsync rule relaxation (default is either full synchronous flush * or REDO semantics with synchronous flush). 
@@ -242,6 +244,7 @@ mode1: break; case 4: /* ignore the fsync() system call */ + lwkt_reltoken(&hmp->fs_token); return(0); default: /* we have to do something */ @@ -262,6 +265,7 @@ mode1: ++hammer_count_fsyncs; hammer_flusher_flush_undos(hmp, mode); ip->redo_count = 0; + lwkt_reltoken(&hmp->fs_token); return(0); } @@ -294,13 +298,14 @@ skip: hammer_wait_inode(ip); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); } + lwkt_reltoken(&hmp->fs_token); return (ip->error); } /* * hammer_vop_read { vp, uio, ioflag, cred } * - * MPALMOSTSAFE + * MPSAFE (for the cache safe does not require fs_token) */ static int @@ -308,6 +313,7 @@ hammer_vop_read(struct vop_read_args *ap) { struct hammer_transaction trans; hammer_inode_t ip; + hammer_mount_t hmp; off_t offset; struct buf *bp; struct uio *uio; @@ -316,12 +322,13 @@ hammer_vop_read(struct vop_read_args *ap) int seqcount; int ioseqcount; int blksize; - int got_mplock; int bigread; + int got_fstoken; if (ap->a_vp->v_type != VREG) return (EINVAL); ip = VTOI(ap->a_vp); + hmp = ip->hmp; error = 0; uio = ap->a_uio; @@ -335,26 +342,12 @@ hammer_vop_read(struct vop_read_args *ap) seqcount = ioseqcount; /* - * Temporary hack until more of HAMMER can be made MPSAFE. - */ -#ifdef SMP - if (curthread->td_mpcount) { - got_mplock = -1; - hammer_start_transaction(&trans, ip->hmp); - } else { - got_mplock = 0; - } -#else - hammer_start_transaction(&trans, ip->hmp); - got_mplock = -1; -#endif - - /* * If reading or writing a huge amount of data we have to break * atomicy and allow the operation to be interrupted by a signal * or it can DOS the machine. 
*/ bigread = (uio->uio_resid > 100 * 1024 * 1024); + got_fstoken = 0; /* * Access the data typically in HAMMER_BUFSIZE blocks via the @@ -388,9 +381,9 @@ hammer_vop_read(struct vop_read_args *ap) /* * MPUNSAFE */ - if (got_mplock == 0) { - got_mplock = 1; - get_mplock(); + if (got_fstoken == 0) { + lwkt_gettoken(&hmp->fs_token); + got_fstoken = 1; hammer_start_transaction(&trans, ip->hmp); } @@ -445,15 +438,14 @@ skip: * XXX only update the atime if we had to get the MP lock. * XXX hack hack hack, fixme. */ - if (got_mplock) { + if (got_fstoken) { if ((ip->flags & HAMMER_INODE_RO) == 0 && (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { ip->ino_data.atime = trans.time; hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); } hammer_done_transaction(&trans); - if (got_mplock > 0) - rel_mplock(); + lwkt_reltoken(&hmp->fs_token); } return (error); } @@ -493,6 +485,7 @@ hammer_vop_write(struct vop_write_args *ap) /* * Create a transaction to cover the operations we perform. */ + lwkt_gettoken(&hmp->fs_token); hammer_start_transaction(&trans, hmp); uio = ap->a_uio; @@ -510,11 +503,13 @@ hammer_vop_write(struct vop_write_args *ap) */ if (uio->uio_offset < 0) { hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (EFBIG); } base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */ if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) { hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (EFBIG); } @@ -811,11 +806,14 @@ hammer_vop_write(struct vop_write_args *ap) } hammer_done_transaction(&trans); hammer_knote(ap->a_vp, kflags); + lwkt_reltoken(&hmp->fs_token); return (error); } /* * hammer_vop_access { vp, mode, cred } + * + * MPSAFE - does not require fs_token */ static int @@ -837,6 +835,8 @@ hammer_vop_access(struct vop_access_args *ap) /* * hammer_vop_advlock { vp, id, op, fl, flags } + * + * MPSAFE - does not require fs_token */ static int @@ -850,7 +850,7 @@ hammer_vop_advlock(struct vop_advlock_args *ap) /* * 
hammer_vop_close { vp, fflag } * - * We can only sync-on-close for normal closes. + * We can only sync-on-close for normal closes. XXX disabled for now. */ static int @@ -890,20 +890,23 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -918,6 +921,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) hkprintf("hammer_create_inode error %d\n", error); hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -948,6 +952,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) } hammer_knote(ap->a_dvp, NOTE_WRITE); } + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -959,7 +964,7 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) * The atime field is stored in the B-Tree element and allowed to be * updated without cycling the element. 
* - * MPSAFE + * MPSAFE - does not require fs_token */ static int @@ -1070,6 +1075,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) { struct hammer_transaction trans; struct namecache *ncp; + hammer_mount_t hmp; hammer_inode_t dip; hammer_inode_t ip; hammer_tid_t asof; @@ -1097,8 +1103,10 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) nlen = ncp->nc_nlen; flags = dip->flags & HAMMER_INODE_RO; ispfs = 0; + hmp = dip->hmp; - hammer_simple_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; for (i = 0; i < nlen; ++i) { @@ -1253,6 +1261,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } done: hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1279,6 +1288,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) struct hammer_transaction trans; struct hammer_inode *dip; struct hammer_inode *ip; + hammer_mount_t hmp; int64_t parent_obj_id; u_int32_t parent_obj_localization; hammer_tid_t asof; @@ -1286,11 +1296,13 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) dip = VTOI(ap->a_dvp); asof = dip->obj_asof; + hmp = dip->hmp; /* * Whos are parent? This could be the root of a pseudo-filesystem * whos parent is in another localization domain. 
*/ + lwkt_gettoken(&hmp->fs_token); parent_obj_id = dip->ino_data.parent_obj_id; if (dip->obj_id == HAMMER_OBJID_ROOT) parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization; @@ -1299,19 +1311,20 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) if (parent_obj_id == 0) { if (dip->obj_id == HAMMER_OBJID_ROOT && - asof != dip->hmp->asof) { + asof != hmp->asof) { parent_obj_id = dip->obj_id; - asof = dip->hmp->asof; + asof = hmp->asof; *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK); ksnprintf(*ap->a_fakename, 19, "0x%016llx", (long long)dip->obj_asof); } else { *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return ENOENT; } } - hammer_simple_transaction(&trans, dip->hmp); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; ip = hammer_get_inode(&trans, dip, parent_obj_id, @@ -1324,6 +1337,7 @@ hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap) *ap->a_vpp = NULL; } hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1338,6 +1352,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) struct hammer_inode *dip; struct hammer_inode *ip; struct nchandle *nch; + hammer_mount_t hmp; int error; if (ap->a_dvp->v_mount != ap->a_vp->v_mount) @@ -1346,6 +1361,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) nch = ap->a_nch; dip = VTOI(ap->a_dvp); ip = VTOI(ap->a_vp); + hmp = dip->hmp; if (dip->obj_localization != ip->obj_localization) return(EXDEV); @@ -1354,13 +1370,14 @@ hammer_vop_nlink(struct vop_nlink_args *ap) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. 
*/ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1382,6 +1399,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) hammer_done_transaction(&trans); hammer_knote(ap->a_vp, NOTE_LINK); hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1399,20 +1417,23 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1426,6 +1447,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) hkprintf("hammer_mkdir error %d\n", error); hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } /* @@ -1455,6 +1477,7 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1472,20 +1495,23 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) struct hammer_inode *dip; struct hammer_inode *nip; struct nchandle *nch; + hammer_mount_t hmp; int error; nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a transaction to cover the operations we perform. 
*/ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1500,6 +1526,7 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) if (error) { hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1528,11 +1555,14 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } /* * hammer_vop_open { vp, mode, cred, fp } + * + * MPSAFE (does not require fs_token) */ static int @@ -1568,6 +1598,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap) struct hammer_transaction trans; struct hammer_cursor cursor; struct hammer_inode *ip; + hammer_mount_t hmp; struct uio *uio; hammer_base_elm_t base; int error; @@ -1582,6 +1613,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap) ip = VTOI(ap->a_vp); uio = ap->a_uio; saveoff = uio->uio_offset; + hmp = ip->hmp; if (ap->a_ncookies) { ncookies = uio->uio_resid / 16 + 1; @@ -1595,7 +1627,8 @@ hammer_vop_readdir(struct vop_readdir_args *ap) cookie_index = 0; } - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); /* * Handle artificial entries @@ -1710,6 +1743,7 @@ done: *ap->a_cookies = cookies; } } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -1723,12 +1757,16 @@ hammer_vop_readlink(struct vop_readlink_args *ap) struct hammer_transaction trans; struct hammer_cursor cursor; struct hammer_inode *ip; + hammer_mount_t hmp; char buf[32]; u_int32_t localization; hammer_pseudofs_inmem_t pfsm; int error; ip = VTOI(ap->a_vp); + hmp = ip->hmp; + + lwkt_gettoken(&hmp->fs_token); /* * Shortcut if the symlink data was stuffed into ino_data. 
@@ -1747,7 +1785,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap) ip->obj_asof == HAMMER_MAX_TID && ip->obj_localization == 0 && strncmp(ptr, "@@PFS", 5) == 0) { - hammer_simple_transaction(&trans, ip->hmp); + hammer_simple_transaction(&trans, hmp); bcopy(ptr + 5, buf, 5); buf[5] = 0; localization = strtoul(buf, NULL, 10) << 16; @@ -1777,17 +1815,18 @@ hammer_vop_readlink(struct vop_readlink_args *ap) bytes = strlen(buf); } if (pfsm) - hammer_rel_pseudofs(trans.hmp, pfsm); + hammer_rel_pseudofs(hmp, pfsm); hammer_done_transaction(&trans); } error = uiomove(ptr, bytes, ap->a_uio); + lwkt_reltoken(&hmp->fs_token); return(error); } /* * Long version */ - hammer_simple_transaction(&trans, ip->hmp); + hammer_simple_transaction(&trans, hmp); ++hammer_stats_file_iopsr; hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); @@ -1820,6 +1859,7 @@ hammer_vop_readlink(struct vop_readlink_args *ap) } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -1832,21 +1872,25 @@ hammer_vop_nremove(struct vop_nremove_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0); hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -1863,6 +1907,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) struct hammer_inode *fdip; struct hammer_inode *tdip; struct hammer_inode *ip; + hammer_mount_t hmp; struct hammer_cursor cursor; int64_t namekey; u_int32_t 
max_iterations; @@ -1880,6 +1925,8 @@ hammer_vop_nrename(struct vop_nrename_args *ap) ip = VTOI(fncp->nc_vp); KKASSERT(ip != NULL); + hmp = ip->hmp; + if (fdip->obj_localization != tdip->obj_localization) return(EXDEV); if (fdip->obj_localization != ip->obj_localization) @@ -1891,10 +1938,11 @@ hammer_vop_nrename(struct vop_nrename_args *ap) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); - hammer_start_transaction(&trans, fdip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -1999,6 +2047,7 @@ retry: failed: hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2011,21 +2060,25 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1); hammer_done_transaction(&trans); if (error == 0) hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2038,21 +2091,25 @@ hammer_vop_markatime(struct vop_markatime_args *ap) { struct hammer_transaction trans; struct hammer_inode *ip; + hammer_mount_t hmp; ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); - if (ip->hmp->mp->mnt_flag & MNT_NOATIME) + hmp = ip->hmp; + if (hmp->mp->mnt_flag & MNT_NOATIME) return (0); 
- hammer_start_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; ip->ino_data.atime = trans.time; hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, NOTE_ATTRIB); + lwkt_reltoken(&hmp->fs_token); return (0); } @@ -2064,8 +2121,9 @@ int hammer_vop_setattr(struct vop_setattr_args *ap) { struct hammer_transaction trans; - struct vattr *vap; struct hammer_inode *ip; + struct vattr *vap; + hammer_mount_t hmp; int modflags; int error; int truncating; @@ -2080,17 +2138,19 @@ hammer_vop_setattr(struct vop_setattr_args *ap) ip = ap->a_vp->v_data; modflags = 0; kflags = 0; + hmp = ip->hmp; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return(EROFS); if (ip->flags & HAMMER_INODE_RO) return (EROFS); if (hammer_nohistory(ip) == 0 && - (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) { return (error); } - hammer_start_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = 0; @@ -2296,6 +2356,7 @@ done: hammer_modify_inode(&trans, ip, modflags); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, kflags); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2309,8 +2370,9 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) struct hammer_transaction trans; struct hammer_inode *dip; struct hammer_inode *nip; - struct nchandle *nch; hammer_record_t record; + struct nchandle *nch; + hammer_mount_t hmp; int error; int bytes; @@ -2318,16 +2380,18 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) nch = ap->a_nch; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); - if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) + if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) return (error); /* * Create a 
transaction to cover the operations we perform. */ - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; /* @@ -2341,6 +2405,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) if (error) { hammer_done_transaction(&trans); *ap->a_vpp = NULL; + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2395,6 +2460,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) } } hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2407,20 +2473,24 @@ hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) { struct hammer_transaction trans; struct hammer_inode *dip; + hammer_mount_t hmp; int error; dip = VTOI(ap->a_dvp); + hmp = dip->hmp; if (hammer_nohistory(dip) == 0 && - (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) { + (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) { return (error); } - hammer_start_transaction(&trans, dip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_start_transaction(&trans, hmp); ++hammer_stats_file_iopsw; error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags, -1); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2433,10 +2503,15 @@ int hammer_vop_ioctl(struct vop_ioctl_args *ap) { struct hammer_inode *ip = ap->a_vp->v_data; + hammer_mount_t hmp = ip->hmp; + int error; ++hammer_stats_file_iopsr; - return(hammer_ioctl(ip, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_cred)); + lwkt_gettoken(&hmp->fs_token); + error = hammer_ioctl(ip, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_cred); + lwkt_reltoken(&hmp->fs_token); + return (error); } static @@ -2460,8 +2535,9 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) KKASSERT(mp->mnt_data != NULL); hmp = (struct hammer_mount *)mp->mnt_data; - switch(ap->a_op) { + lwkt_gettoken(&hmp->fs_token); + switch(ap->a_op) { case MOUNTCTL_SET_EXPORT: if (ap->a_ctllen != sizeof(struct 
export_args)) error = EINVAL; @@ -2482,7 +2558,8 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) usedbytes = *ap->a_res; if (usedbytes > 0 && usedbytes < ap->a_buflen) { - usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf, + usedbytes += vfs_flagstostr(hmp->hflags, extraopt, + ap->a_buf, ap->a_buflen - usedbytes, &error); } @@ -2494,6 +2571,7 @@ hammer_vop_mountctl(struct vop_mountctl_args *ap) error = vop_stdmountctl(ap); break; } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -2550,6 +2628,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) struct hammer_transaction trans; struct hammer_inode *ip; struct hammer_inode *dip; + hammer_mount_t hmp; struct hammer_cursor cursor; hammer_base_elm_t base; hammer_off_t disk_offset; @@ -2567,6 +2646,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) bio = ap->a_bio; bp = bio->bio_buf; ip = ap->a_vp->v_data; + hmp = ip->hmp; /* * The zone-2 disk offset may have been set by the cluster code via @@ -2577,7 +2657,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) nbio = push_bio(bio); if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_LARGE_DATA) { - error = hammer_io_direct_read(ip->hmp, nbio, NULL); + lwkt_gettoken(&hmp->fs_token); + error = hammer_io_direct_read(hmp, nbio, NULL); + lwkt_reltoken(&hmp->fs_token); return (error); } @@ -2585,7 +2667,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) * Well, that sucked. Do it the hard way. If all the stars are * aligned we may still be able to issue a direct-read. 
*/ - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip); /* @@ -2719,8 +2802,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap) KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_LARGE_DATA); nbio->bio_offset = disk_offset; - error = hammer_io_direct_read(trans.hmp, nbio, - cursor.leaf); + error = hammer_io_direct_read(hmp, nbio, cursor.leaf); goto done; } else if (n) { error = hammer_ip_resolve_data(&cursor); @@ -2781,6 +2863,7 @@ done: } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -2805,6 +2888,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) { struct hammer_transaction trans; struct hammer_inode *ip; + hammer_mount_t hmp; struct hammer_cursor cursor; hammer_base_elm_t base; int64_t rec_offset; @@ -2821,6 +2905,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) ++hammer_stats_file_iopsr; ip = ap->a_vp->v_data; + hmp = ip->hmp; /* * We can only BMAP regular files. We can't BMAP database files, @@ -2840,7 +2925,8 @@ hammer_vop_bmap(struct vop_bmap_args *ap) * Scan the B-Tree to acquire blockmap addresses, then translate * to raw addresses. */ - hammer_simple_transaction(&trans, ip->hmp); + lwkt_gettoken(&hmp->fs_token); + hammer_simple_transaction(&trans, hmp); #if 0 kprintf("bmap_beg %016llx ip->cache %p\n", (long long)ap->a_loffset, ip->cache[1]); @@ -2965,6 +3051,7 @@ hammer_vop_bmap(struct vop_bmap_args *ap) } hammer_done_cursor(&cursor); hammer_done_transaction(&trans); + lwkt_reltoken(&hmp->fs_token); /* * If we couldn't find any records or the records we did find were @@ -3057,6 +3144,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) return(EROFS); } + lwkt_gettoken(&hmp->fs_token); + /* * Interlock with inode destruction (no in-kernel or directory * topology visibility). 
If we queue new IO while trying to @@ -3071,6 +3160,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) { bp->b_resid = 0; biodone(ap->a_bio); + lwkt_reltoken(&hmp->fs_token); return(0); } @@ -3120,6 +3210,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap) bp->b_flags |= B_ERROR; biodone(ap->a_bio); } + lwkt_reltoken(&hmp->fs_token); return(error); } @@ -3136,6 +3227,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, struct namecache *ncp; hammer_inode_t dip; hammer_inode_t ip; + hammer_mount_t hmp; struct hammer_cursor cursor; int64_t namekey; u_int32_t max_iterations; @@ -3150,6 +3242,7 @@ hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch, */ dip = VTOI(dvp); ncp = nch->ncp; + hmp = dip->hmp; if (dip->flags & HAMMER_INODE_RO) return (EROFS); @@ -3206,7 +3299,7 @@ retry: if (error == 0) { hammer_unlock(&cursor.ip->lock); ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id, - dip->hmp->asof, + hmp->asof, cursor.data->entry.localization, 0, &error); hammer_lock_sh(&cursor.ip->lock); @@ -3304,7 +3397,6 @@ retry: ************************************************************************ * */ - static int hammer_vop_fifoclose (struct vop_close_args *ap) { @@ -3402,14 +3494,17 @@ filt_hammerread(struct knote *kn, long hint) { struct vnode *vp = (void *)kn->kn_hook; hammer_inode_t ip = VTOI(vp); + hammer_mount_t hmp = ip->hmp; off_t off; if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return(1); } + lwkt_gettoken(&hmp->fs_token); /* XXX use per-ip-token */ off = ip->ino_data.size - kn->kn_fp->f_offset; kn->kn_data = (off < INTPTR_MAX) ? 
off : INTPTR_MAX; + lwkt_reltoken(&hmp->fs_token); if (kn->kn_sfflags & NOTE_OLDAPI) return(1); return (kn->kn_data != 0); -- 1.7.7.2 From bb5add8c2c55a959b77a8677e60ca66573f7e865 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 24 Aug 2010 16:09:48 -0700 Subject: [PATCH 08/16] HAMMER VFS - Add another lwkt_user_yield() * Add a lwkt_user_yield() call to reverse b-tree iterations, which are used by the pruning code. Forward iterations already call lwkt_user_yield(). --- sys/vfs/hammer/hammer_btree.c | 12 +++++++++++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index b1e2a28..44e5382 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -433,6 +433,7 @@ hammer_btree_iterate_reverse(hammer_cursor_t cursor) { hammer_node_ondisk_t node; hammer_btree_elm_t elm; + hammer_mount_t hmp; int error = 0; int r; int s; @@ -456,11 +457,20 @@ hammer_btree_iterate_reverse(hammer_cursor_t cursor) --cursor->index; /* + * HAMMER can wind up being cpu-bound. + */ + hmp = cursor->trans->hmp; + if (++hmp->check_yield > hammer_yield_check) { + hmp->check_yield = 0; + lwkt_user_yield(); + } + + /* * Loop until an element is found or we are done. */ for (;;) { ++hammer_stats_btree_iterations; - hammer_flusher_clean_loose_ios(cursor->trans->hmp); + hammer_flusher_clean_loose_ios(hmp); /* * We iterate up the tree and then index over one element -- 1.7.7.2 From a8ca8ac67294ac211f6120be8ba4a061ec96dfec Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 24 Aug 2010 17:39:51 -0700 Subject: [PATCH 09/16] ioprio(1): Higher priorities receive more I/O time; note in manpage. 
--- usr.bin/ioprio/ioprio.1 | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/usr.bin/ioprio/ioprio.1 b/usr.bin/ioprio/ioprio.1 index a6bc73c..39fe49c 100644 --- a/usr.bin/ioprio/ioprio.1 +++ b/usr.bin/ioprio/ioprio.1 @@ -47,7 +47,7 @@ utility runs .Ar utility at a certain I/O scheduling priority, specified by .Ar priority , -or a default value of 4. +or a default value of 4. Higher priorities receive greater I/O budgets. .Sh SEE ALSO .Xr ioprio_set 2 .Sh HISTORY -- 1.7.7.2 From dcaa8a41662f2b0cf579a6e912564c9fc8275ac1 Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 24 Aug 2010 18:37:54 -0700 Subject: [PATCH 10/16] tmpfs: Convert tmpfs node allocation zone into a per-mount pool. Each tmpfs mount tracked the number and max nodes separately, leading to an inaccurate measure of the limit of the tmpfs node malloc zone. We now create a kmalloc zone for each mount, as in HAMMER (hammer_vfsops.c). --- sys/vfs/tmpfs/tmpfs.h | 4 ++++ sys/vfs/tmpfs/tmpfs_vfsops.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sys/vfs/tmpfs/tmpfs.h b/sys/vfs/tmpfs/tmpfs.h index 343e931..39cf5f0 100644 --- a/sys/vfs/tmpfs/tmpfs.h +++ b/sys/vfs/tmpfs/tmpfs.h @@ -385,6 +385,10 @@ struct tmpfs_mount { /* All node lock to protect the node list and tmp_pages_used */ struct lock allnode_lock; + /* Per-mount malloc zone for tmpfs nodes */ + struct malloc_type *tm_node_zone; + struct objcache_malloc_args tm_node_zone_malloc_args; + /* Pools used to store file system meta data. These are not shared * across several instances of tmpfs for the reasons described in * tmpfs_pool.c. 
*/ diff --git a/sys/vfs/tmpfs/tmpfs_vfsops.c b/sys/vfs/tmpfs/tmpfs_vfsops.c index f9d8cdc..df26224 100644 --- a/sys/vfs/tmpfs/tmpfs_vfsops.c +++ b/sys/vfs/tmpfs/tmpfs_vfsops.c @@ -68,7 +68,6 @@ MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures"); MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names"); MALLOC_DEFINE(M_TMPFS_DIRENT, "tmpfs dirent", "tmpfs dirent structures"); -MALLOC_DEFINE(M_TMPFS_NODE, "tmpfs node", "tmpfs node structures"); /* --------------------------------------------------------------------- */ @@ -126,8 +125,6 @@ tmpfs_node_fini(void *obj, void *args) struct objcache_malloc_args tmpfs_dirent_pool_malloc_args = { sizeof(struct tmpfs_dirent), M_TMPFS_DIRENT }; -struct objcache_malloc_args tmpfs_node_pool_malloc_args = - { sizeof(struct tmpfs_node), M_TMPFS_NODE }; static int tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) @@ -226,9 +223,14 @@ tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) tmp->tm_pages_max = pages; tmp->tm_pages_used = 0; - kmalloc_raise_limit(M_TMPFS_NODE, sizeof(struct tmpfs_node) * + kmalloc_create(&tmp->tm_node_zone, "tmpfs node"); + + kmalloc_raise_limit(tmp->tm_node_zone, sizeof(struct tmpfs_node) * tmp->tm_nodes_max); + tmp->tm_node_zone_malloc_args.objsize = sizeof(struct tmpfs_node); + tmp->tm_node_zone_malloc_args.mtype = tmp->tm_node_zone; + tmp->tm_dirent_pool = objcache_create( "tmpfs dirent cache", 0, 0, NULL, NULL, NULL, @@ -238,7 +240,7 @@ tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) 0, 0, tmpfs_node_ctor, tmpfs_node_dtor, NULL, tmpfs_node_init, tmpfs_node_fini, - &tmpfs_node_pool_malloc_args); + &tmp->tm_node_zone_malloc_args); /* Allocate the root node. 
*/ error = tmpfs_alloc_node(tmp, VDIR, root_uid, root_gid, @@ -390,6 +392,8 @@ tmpfs_unmount(struct mount *mp, int mntflags) objcache_destroy(tmp->tm_dirent_pool); objcache_destroy(tmp->tm_node_pool); + kmalloc_destroy(&tmp->tm_node_zone); + lockuninit(&tmp->allnode_lock); KKASSERT(tmp->tm_pages_used == 0); KKASSERT(tmp->tm_nodes_inuse == 0); -- 1.7.7.2 From 8e771504ede4fe826607300e9e4c0c7444652cc4 Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 24 Aug 2010 18:51:35 -0700 Subject: [PATCH 11/16] tmpfs: Convert dirent malloc zone to a per-mount zone. --- sys/vfs/tmpfs/tmpfs.h | 5 ++++- sys/vfs/tmpfs/tmpfs_vfsops.c | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sys/vfs/tmpfs/tmpfs.h b/sys/vfs/tmpfs/tmpfs.h index 39cf5f0..8528db8 100644 --- a/sys/vfs/tmpfs/tmpfs.h +++ b/sys/vfs/tmpfs/tmpfs.h @@ -385,9 +385,12 @@ struct tmpfs_mount { /* All node lock to protect the node list and tmp_pages_used */ struct lock allnode_lock; - /* Per-mount malloc zone for tmpfs nodes */ + /* Per-mount malloc zones for tmpfs nodes and dirents */ struct malloc_type *tm_node_zone; + struct malloc_type *tm_dirent_zone; + struct objcache_malloc_args tm_node_zone_malloc_args; + struct objcache_malloc_args tm_dirent_zone_malloc_args; /* Pools used to store file system meta data. 
These are not shared * across several instances of tmpfs for the reasons described in diff --git a/sys/vfs/tmpfs/tmpfs_vfsops.c b/sys/vfs/tmpfs/tmpfs_vfsops.c index df26224..9edc68e 100644 --- a/sys/vfs/tmpfs/tmpfs_vfsops.c +++ b/sys/vfs/tmpfs/tmpfs_vfsops.c @@ -67,7 +67,6 @@ MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures"); MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names"); -MALLOC_DEFINE(M_TMPFS_DIRENT, "tmpfs dirent", "tmpfs dirent structures"); /* --------------------------------------------------------------------- */ @@ -123,9 +122,6 @@ tmpfs_node_fini(void *obj, void *args) objcache_malloc_free(obj, args); } -struct objcache_malloc_args tmpfs_dirent_pool_malloc_args = - { sizeof(struct tmpfs_dirent), M_TMPFS_DIRENT }; - static int tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) { @@ -224,6 +220,7 @@ tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) tmp->tm_pages_used = 0; kmalloc_create(&tmp->tm_node_zone, "tmpfs node"); + kmalloc_create(&tmp->tm_dirent_zone, "tmpfs dirent"); kmalloc_raise_limit(tmp->tm_node_zone, sizeof(struct tmpfs_node) * tmp->tm_nodes_max); @@ -231,11 +228,14 @@ tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) tmp->tm_node_zone_malloc_args.objsize = sizeof(struct tmpfs_node); tmp->tm_node_zone_malloc_args.mtype = tmp->tm_node_zone; + tmp->tm_dirent_zone_malloc_args.objsize = sizeof(struct tmpfs_dirent); + tmp->tm_dirent_zone_malloc_args.mtype = tmp->tm_dirent_zone; + tmp->tm_dirent_pool = objcache_create( "tmpfs dirent cache", 0, 0, NULL, NULL, NULL, objcache_malloc_alloc, objcache_malloc_free, - &tmpfs_dirent_pool_malloc_args); + &tmp->tm_dirent_zone_malloc_args); tmp->tm_node_pool = objcache_create( "tmpfs node cache", 0, 0, tmpfs_node_ctor, tmpfs_node_dtor, NULL, @@ -392,8 +392,11 @@ tmpfs_unmount(struct mount *mp, int mntflags) objcache_destroy(tmp->tm_dirent_pool); objcache_destroy(tmp->tm_node_pool); + 
kmalloc_destroy(&tmp->tm_dirent_zone); kmalloc_destroy(&tmp->tm_node_zone); + tmp->tm_node_zone = tmp->tm_dirent_zone = NULL; + lockuninit(&tmp->allnode_lock); KKASSERT(tmp->tm_pages_used == 0); KKASSERT(tmp->tm_nodes_inuse == 0); -- 1.7.7.2 From 42f6f6b1b2dcc2ca10d31421d2dd6273851e012d Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 24 Aug 2010 20:03:34 -0700 Subject: [PATCH 12/16] tmpfs: Allow kmalloc from M_TMPFSNAME zone to return NULL; handle null cases. tmpfs now survives fsstress without panicking the system. --- sys/vfs/tmpfs/tmpfs_subr.c | 17 +++++++++++++---- sys/vfs/tmpfs/tmpfs_vnops.c | 7 ++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/sys/vfs/tmpfs/tmpfs_subr.c b/sys/vfs/tmpfs/tmpfs_subr.c index 564c8b7..8f286e4 100644 --- a/sys/vfs/tmpfs/tmpfs_subr.c +++ b/sys/vfs/tmpfs/tmpfs_subr.c @@ -126,6 +126,7 @@ tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type, case VCHR: rdev = makeudev(rmajor, rminor); if (rdev == NOUDEV) { + objcache_put(tmp->tm_node_pool, nnode); return(EINVAL); } nnode->tn_rdev = rdev; @@ -155,7 +156,11 @@ tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type, case VLNK: nnode->tn_size = strlen(target); nnode->tn_link = kmalloc(nnode->tn_size + 1, M_TMPFSNAME, - M_WAITOK); + M_WAITOK | M_NULLOK); + if (nnode->tn_link == NULL) { + objcache_put(tmp->tm_node_pool, nnode); + return (ENOSPC); + } bcopy(target, nnode->tn_link, nnode->tn_size); nnode->tn_link[nnode->tn_size] = '\0'; break; @@ -315,9 +320,13 @@ tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, { struct tmpfs_dirent *nde; - - nde = (struct tmpfs_dirent *)objcache_get(tmp->tm_dirent_pool, M_WAITOK); - nde->td_name = kmalloc(len + 1, M_TMPFSNAME, M_WAITOK); + nde = objcache_get(tmp->tm_dirent_pool, M_WAITOK); + nde->td_name = kmalloc(len + 1, M_TMPFSNAME, M_WAITOK | M_NULLOK); + if (nde->td_name == NULL) { + objcache_put(tmp->tm_dirent_pool, nde); + *de = NULL; + return (ENOSPC); + } nde->td_namelen = len; bcopy(name, 
nde->td_name, len); nde->td_name[len] = '\0'; diff --git a/sys/vfs/tmpfs/tmpfs_vnops.c b/sys/vfs/tmpfs/tmpfs_vnops.c index a582618..9dfae26 100644 --- a/sys/vfs/tmpfs/tmpfs_vnops.c +++ b/sys/vfs/tmpfs/tmpfs_vnops.c @@ -937,7 +937,12 @@ tmpfs_nrename(struct vop_nrename_args *v) */ if (fncp->nc_nlen != tncp->nc_nlen || bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) { - newname = kmalloc(tncp->nc_nlen + 1, M_TMPFSNAME, M_WAITOK); + newname = kmalloc(tncp->nc_nlen + 1, M_TMPFSNAME, + M_WAITOK | M_NULLOK); + if (newname == NULL) { + error = ENOSPC; + goto out_locked; + } bcopy(tncp->nc_name, newname, tncp->nc_nlen); newname[tncp->nc_nlen] = '\0'; } else { -- 1.7.7.2 From 8787825a65f14bcf19955bf9fd4fe5bcfc25865e Mon Sep 17 00:00:00 2001 From: Venkatesh Srinivas Date: Tue, 24 Aug 2010 20:31:07 -0700 Subject: [PATCH 13/16] vm: Idlezero changes * Restore yield to idlezero loop; we lwkt_yield() every 64-bytes rather than every page now. * Unmark as MPSAFE; we are not MPSAFE, we hold the MP lock. When the page queues are safe to access with the vm token alone, we can revert to a prior edition of this code, which was mpsafe. --- sys/vm/vm_zeroidle.c | 11 ++++------- 1 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c index ae5ff93..2d286e6 100644 --- a/sys/vm/vm_zeroidle.c +++ b/sys/vm/vm_zeroidle.c @@ -1,6 +1,4 @@ /* - * (MPSAFE) - * * Copyright (c) 1994 John Dyson * Copyright (c) 2001 Matt Dillon * Copyright (c) 2010 The DragonFly Project @@ -62,7 +60,7 @@ #define ZIDLE_HI(v) ((v) * 4 / 5) /* Number of bytes to zero between reschedule checks */ -#define IDLEZERO_RUN (32) +#define IDLEZERO_RUN (64) /* Maximum number of pages per second to zero */ #define NPAGES_RUN (20000) @@ -204,8 +202,7 @@ vm_pagezero(void __unused *arg) break; case STATE_ZERO_PAGE: /* - * Zero-out the page, stop immediately if a - * resched has been requested. 
+ * Zero-out the page */ while (i < PAGE_SIZE) { if (idlezero_nocache == 1) @@ -213,9 +210,9 @@ vm_pagezero(void __unused *arg) else bzero(&pg[i], IDLEZERO_RUN); i += IDLEZERO_RUN; + lwkt_yield(); } - if (i == PAGE_SIZE) - state = STATE_RELEASE_PAGE; + state = STATE_RELEASE_PAGE; break; case STATE_RELEASE_PAGE: lwbuf_free(buf); -- 1.7.7.2 From e0e739f8950d09536c5cc0825e41b322360908b2 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 24 Aug 2010 21:02:54 -0700 Subject: [PATCH 14/16] kernel - lock sim in cam_periph_alloc() * cam_periph_alloc() is called from places where the sim lock is not being held. Acquire the lock internally as needed. --- sys/bus/cam/cam_periph.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/sys/bus/cam/cam_periph.c b/sys/bus/cam/cam_periph.c index 2571643..412d91b 100644 --- a/sys/bus/cam/cam_periph.c +++ b/sys/bus/cam/cam_periph.c @@ -170,6 +170,7 @@ cam_periph_alloc(periph_ctor_t *periph_ctor, xpt_unlock_buses(); sim = xpt_path_sim(path); + CAM_SIM_LOCK(sim); path_id = xpt_path_path_id(path); target_id = xpt_path_target_id(path); lun_id = xpt_path_lun_id(path); @@ -237,6 +238,7 @@ failure: default: panic("cam_periph_alloc: Unknown init level"); } + CAM_SIM_UNLOCK(sim); return(status); } -- 1.7.7.2 From ba974ec48b8cbc8c84064444ecd45cf03a1e5b72 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 24 Aug 2010 23:54:56 -0700 Subject: [PATCH 15/16] kernel - Change PROP_RWLOCK from spinlock to mtx * The PROP_RWLOCK can be held across switches and sleeps and so cannot be a spinlock. * Fixes spinlocks-held-during-switch panic. 
--- sys/libprop/prop_object_impl.h | 21 +++++++++++++-------- 1 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sys/libprop/prop_object_impl.h b/sys/libprop/prop_object_impl.h index fd1fb3b..2f5c4c9 100644 --- a/sys/libprop/prop_object_impl.h +++ b/sys/libprop/prop_object_impl.h @@ -253,8 +253,9 @@ struct _prop_object_iterator { #include #include #include -#include -#include +#include +#include +#include #include #define _PROP_ASSERT(x) KKASSERT(x) @@ -285,17 +286,21 @@ SYSINIT(pp##_init, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, pp##_init, NULL) #define _PROP_MALLOC_DEFINE(t, s, l) \ MALLOC_DEFINE(t, s, l); +/* + * NOTE: These locks might be held through a sleep so no spinlocks + * can be used. + */ #define _PROP_MUTEX_DECL_STATIC(x) static struct lock x; #define _PROP_MUTEX_INIT(x) lockinit(&(x),"proplib",0,LK_CANRECURSE) #define _PROP_MUTEX_LOCK(x) lockmgr(&(x), LK_EXCLUSIVE) #define _PROP_MUTEX_UNLOCK(x) lockmgr(&(x), LK_RELEASE) -#define _PROP_RWLOCK_DECL(x) struct spinlock x; -#define _PROP_RWLOCK_INIT(x) spin_init(&(x)) -#define _PROP_RWLOCK_RDLOCK(x) spin_lock_wr(&(x)) -#define _PROP_RWLOCK_WRLOCK(x) spin_lock_wr(&(x)) -#define _PROP_RWLOCK_UNLOCK(x) spin_unlock_wr(&(x)) -#define _PROP_RWLOCK_DESTROY(x) spin_uninit(&(x)) +#define _PROP_RWLOCK_DECL(x) struct mtx x; +#define _PROP_RWLOCK_INIT(x) mtx_init(&(x)) +#define _PROP_RWLOCK_RDLOCK(x) mtx_lock(&(x)) +#define _PROP_RWLOCK_WRLOCK(x) mtx_lock(&(x)) +#define _PROP_RWLOCK_UNLOCK(x) mtx_unlock(&(x)) +#define _PROP_RWLOCK_DESTROY(x) mtx_uninit(&(x)) #define _PROP_ONCE_DECL(x) static int x = 0; #define _PROP_ONCE_RUN(x,f) if (atomic_cmpset_int(&(x), 0, 1)) f() -- 1.7.7.2 From 3c499555f10068eaf5493071d7906a849401072e Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 25 Aug 2010 00:02:05 -0700 Subject: [PATCH 16/16] kernel - Make sys_ioctl() MPSAFE * Tokenize the ioctl mapper and stay MPSAFE through the fo_ioctl() call. 
Beyond that it will be determined by the fops driver and for vnodes will be determined by the MNTK_*_MPSAFE flags. * HAMMER's ioctls will now be entered without holding the MP lock, aka reblock, prune. --- sys/kern/sys_generic.c | 45 +++++++++++++++++++++++++++++++-------------- 1 files changed, 31 insertions(+), 14 deletions(-) diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 0bab31d..f6672f6 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -108,6 +108,8 @@ struct poll_kevent_copyin_args { int error; }; +static struct lwkt_token mapped_ioctl_token = LWKT_TOKEN_MP_INITIALIZER; + static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timespec *ts, int *res); static int dopoll(int nfds, struct pollfd *fds, struct timespec *ts, @@ -544,16 +546,14 @@ dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) /* * Ioctl system call * - * MPALMOSTSAFE + * MPSAFE */ int sys_ioctl(struct ioctl_args *uap) { int error; - get_mplock(); error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg); - rel_mplock(); return (error); } @@ -567,6 +567,8 @@ struct ioctl_map_entry { * The true heart of all ioctl syscall handlers (native, emulation). * If map != NULL, it will be searched for a matching entry for com, * and appropriate conversions/conversion functions will be utilized. 
+ * + * MPSAFE */ int mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, @@ -601,6 +603,7 @@ mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, maskcmd = com & map->mask; + lwkt_gettoken(&mapped_ioctl_token); LIST_FOREACH(e, &map->mapping, entries) { for (iomc = e->cmd_ranges; iomc->start != 0 || iomc->maptocmd != 0 || iomc->wrapfunc != NULL || @@ -616,6 +619,7 @@ mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, iomc->wrapfunc != NULL || iomc->mapfunc != NULL) break; } + lwkt_reltoken(&mapped_ioctl_token); if (iomc == NULL || (iomc->start == 0 && iomc->maptocmd == 0 @@ -708,17 +712,17 @@ mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, switch (com) { case FIONBIO: if ((tmp = *(int *)data)) - fp->f_flag |= FNONBLOCK; + atomic_set_int(&fp->f_flag, FNONBLOCK); else - fp->f_flag &= ~FNONBLOCK; + atomic_clear_int(&fp->f_flag, FNONBLOCK); error = 0; break; case FIOASYNC: if ((tmp = *(int *)data)) - fp->f_flag |= FASYNC; + atomic_set_int(&fp->f_flag, FASYNC); else - fp->f_flag &= ~FASYNC; + atomic_clear_int(&fp->f_flag, FASYNC); error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg); break; @@ -746,6 +750,9 @@ done: return(error); } +/* + * MPSAFE + */ int mapped_ioctl_register_handler(struct ioctl_map_handler *he) { @@ -754,31 +761,41 @@ mapped_ioctl_register_handler(struct ioctl_map_handler *he) KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL && he->subsys != NULL && *he->subsys != '\0'); - ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK); + ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, + M_WAITOK | M_ZERO); ne->subsys = he->subsys; ne->cmd_ranges = he->cmd_ranges; + lwkt_gettoken(&mapped_ioctl_token); LIST_INSERT_HEAD(&he->map->mapping, ne, entries); + lwkt_reltoken(&mapped_ioctl_token); return(0); } +/* + * MPSAFE + */ int mapped_ioctl_unregister_handler(struct ioctl_map_handler *he) { struct ioctl_map_entry *ne; + int 
error = EINVAL; KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL); + lwkt_gettoken(&mapped_ioctl_token); LIST_FOREACH(ne, &he->map->mapping, entries) { - if (ne->cmd_ranges != he->cmd_ranges) - continue; - LIST_REMOVE(ne, entries); - kfree(ne, M_IOCTLMAP); - return(0); + if (ne->cmd_ranges == he->cmd_ranges) { + LIST_REMOVE(ne, entries); + kfree(ne, M_IOCTLMAP); + error = 0; + break; + } } - return(EINVAL); + lwkt_reltoken(&mapped_ioctl_token); + return(error); } static int nselcoll; /* Select collisions since boot */ -- 1.7.7.2