From d5b2d319bb8e5009b38780b6cacd8dae71b2f06d Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 10 Jan 2011 15:17:25 -0800 Subject: [PATCH] kernel - refactor cpusync and pmap_inval code, fix lockup * Simplify the cpusync API. The API now has only one initialization call, one simplified rollup call, and two primary calls handling a single function callback (instead of three callbacks). cpusync_interlock() interlocks the specified set of cpus and ensures they are running in a safe place, cpusync_deinterlock() executes the initialized function on the cpu set and does not return until all cpus have completed the operation. * Simplify the pmap_inval per-platform API. pmap_inval_interlock() and pmap_inval_deinterlock() now reflect similar functionality to the cpusync API. pmap/pte operations are now synchronized when pmap_inval_deinterlock() is called and not when pmap_inval_done() is called, reducing the range of code which can execute while the cpu set is held quiescent. pmap_inval_flush() has been removed. Critical section handling has also been rearranged slightly in the pmap_inval* and cpusync* APIs. * Fixes a cpusync lockup which occurs when the x86-64 pmap and pmap_inval code is used to hold a cpu mask quiescent across complex subsystem calls. Primarily accomplished by moving the synchronization out of pmap_inval_flush() and into pmap_inval_deinterlock(). --- sys/kern/lwkt_ipiq.c | 264 ++++++++++--------------- sys/kern/lwkt_thread.c | 13 ++ sys/platform/pc32/i386/i686_mem.c | 12 +- sys/platform/pc32/i386/pmap.c | 7 +- sys/platform/pc32/i386/pmap_inval.c | 90 +++------ sys/platform/pc32/i386/sys_machdep.c | 4 +- sys/platform/pc32/include/pmap_inval.h | 2 +- sys/platform/pc64/include/pmap_inval.h | 2 +- sys/platform/pc64/x86_64/pmap.c | 7 +- sys/platform/pc64/x86_64/pmap_inval.c | 96 +++------ sys/sys/thread.h | 21 +- sys/sys/thread2.h | 13 ++ 12 files changed, 199 insertions(+), 332 deletions(-) diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c index 59c9600dbc..b5b6beee9f 100644 --- a/sys/kern/lwkt_ipiq.c +++ b/sys/kern/lwkt_ipiq.c @@ -76,6 +76,7 @@ static __int64_t ipiq_avoided; /* interlock with target avoids cpu ipi */ static __int64_t ipiq_passive; /* passive IPI messages */ static __int64_t ipiq_cscount; /* number of cpu synchronizations */ static int ipiq_optimized = 1; /* XXX temporary sysctl */ +static int ipiq_debug; /* set to 1 for debug */ #ifdef PANIC_DEBUG static int panic_ipiq_cpu = -1; static int panic_ipiq_count = 100; @@ -95,6 +96,8 @@ SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "Number of cpu synchronizations"); SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0, ""); +SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0, + ""); #ifdef PANIC_DEBUG SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, ""); SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, ""); @@ -113,7 +116,7 @@ KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARG_SIZE); KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARG_SIZE); KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARG_SIZE); KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08x", sizeof(cpumask_t)); -KTR_INFO(KTR_IPIQ, ipiq, sync_add, 6, "cpumask=%08x", sizeof(cpumask_t)); +KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08x", sizeof(cpumask_t)); KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARG_SIZE); KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARG_SIZE); @@ -128,8 
+131,8 @@ KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARG_SIZE); static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, struct intrframe *frame); -static void lwkt_cpusync_remote1(lwkt_cpusync_t poll); -static void lwkt_cpusync_remote2(lwkt_cpusync_t poll); +static void lwkt_cpusync_remote1(lwkt_cpusync_t cs); +static void lwkt_cpusync_remote2(lwkt_cpusync_t cs); /* * Send a function execution request to another cpu. The request is queued @@ -464,6 +467,11 @@ lwkt_seq_ipiq(globaldata_t target) * There are two versions, one where no interrupt frame is available (when * called from the send code and from splz, and one where an interrupt * frame is available. + * + * When the current cpu is mastering a cpusync we do NOT internally loop + * on the cpusyncq poll. We also do not re-flag a pending ipi due to + * the cpusyncq poll because this can cause doreti/splz to loop internally. + * The cpusync master's own loop must be allowed to run to avoid a deadlock. */ void lwkt_process_ipiq(void) @@ -488,7 +496,6 @@ again: if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) { if (gd->gd_curthread->td_cscount == 0) goto again; - need_ipiq(); } } } @@ -516,7 +523,6 @@ again: if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) { if (gd->gd_curthread->td_cscount == 0) goto again; - need_ipiq(); } } } @@ -551,20 +557,36 @@ lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, * may make, it is possible for both rindex and windex to advance and * thus for rindex to advance passed our cached windex. * - * NOTE: A memory fence is required to prevent speculative loads prior + * NOTE: A load fence is required to prevent speculative loads prior * to the loading of ip_rindex. Even though stores might be - * ordered, loads are probably not. + * ordered, loads are probably not. A memory fence is required + * to prevent reordering of the loads after the ip_rindex update. */ while (wi - (ri = ip->ip_rindex) > 0) { ri &= MAXCPUFIFO_MASK; - cpu_mfence(); + cpu_lfence(); copy_func = ip->ip_func[ri]; copy_arg1 = ip->ip_arg1[ri]; copy_arg2 = ip->ip_arg2[ri]; + cpu_mfence(); ++ip->ip_rindex; KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) == ((ri + 1) & MAXCPUFIFO_MASK)); logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu); +#ifdef INVARIANTS + if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) { + kprintf("cpu %d ipifunc %p %p %d (frame %p)\n", + mycpu->gd_cpuid, + copy_func, copy_arg1, copy_arg2, +#if defined(__i386__) + (frame ? (void *)frame->if_eip : NULL)); +#elif defined(__amd64__) + (frame ? (void *)frame->if_rip : NULL)); +#else + NULL); +#endif + } +#endif copy_func(copy_arg1, copy_arg2, frame); cpu_sfence(); ip->ip_xindex = ip->ip_rindex; @@ -626,164 +648,94 @@ lwkt_synchronize_ipiqs(const char *wmesg) /* * CPU Synchronization Support * - * lwkt_cpusync_simple() + * lwkt_cpusync_interlock() - Place specified cpus in a quiescent state. + * The current cpu is placed in a hard critical + * section. * - * The function is executed synchronously before return on remote cpus. - * A lwkt_cpusync_t pointer is passed as an argument. The data can - * be accessed via arg->cs_data. - * - * XXX should I just pass the data as an argument to be consistent? + * lwkt_cpusync_deinterlock() - Execute cs_func on specified cpus, including + * current cpu if specified, then return. 
*/ - void -lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data) +lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg) { - struct lwkt_cpusync cmd; - - cmd.cs_run_func = NULL; - cmd.cs_fin1_func = func; - cmd.cs_fin2_func = NULL; - cmd.cs_data = data; - lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd); - if (mask & CPUMASK(mycpu->gd_cpuid)) - func(&cmd); - lwkt_cpusync_finish(&cmd); -} + struct lwkt_cpusync cs; -/* - * lwkt_cpusync_fastdata() - * - * The function is executed in tandem with return on remote cpus. - * The data is directly passed as an argument. Do not pass pointers to - * temporary storage as the storage might have - * gone poof by the time the target cpu executes - * the function. - * - * At the moment lwkt_cpusync is declared on the stack and we must wait - * for all remote cpus to ack in lwkt_cpusync_finish(), but as a future - * optimization we should be able to put a counter in the globaldata - * structure (if it is not otherwise being used) and just poke it and - * return without waiting. XXX - */ -void -lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data) -{ - struct lwkt_cpusync cmd; - - cmd.cs_run_func = NULL; - cmd.cs_fin1_func = NULL; - cmd.cs_fin2_func = func; - cmd.cs_data = NULL; - lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd); - if (mask & CPUMASK(mycpu->gd_cpuid)) - func(data); - lwkt_cpusync_finish(&cmd); + lwkt_cpusync_init(&cs, mask, func, arg); + lwkt_cpusync_interlock(&cs); + lwkt_cpusync_deinterlock(&cs); } -/* - * lwkt_cpusync_start() - * - * Start synchronization with a set of target cpus, return once they are - * known to be in a synchronization loop. The target cpus will execute - * poll->cs_run_func() IN TANDEM WITH THE RETURN. - * - * XXX future: add lwkt_cpusync_start_quick() and require a call to - * lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to - * potentially absorb the IPI latency doing something useful. - */ + void -lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll) +lwkt_cpusync_interlock(lwkt_cpusync_t cs) { +#ifdef SMP globaldata_t gd = mycpu; + cpumask_t mask; - poll->cs_count = 0; - poll->cs_mask = mask; -#ifdef SMP - logipiq2(sync_start, mask & gd->gd_other_cpus); - poll->cs_maxcount = lwkt_send_ipiq_mask( - mask & gd->gd_other_cpus & smp_active_mask, - (ipifunc1_t)lwkt_cpusync_remote1, poll); -#endif - if (mask & gd->gd_cpumask) { - if (poll->cs_run_func) - poll->cs_run_func(poll); - } -#ifdef SMP - if (poll->cs_maxcount) { + /* + * mask acknowledge (cs_mack): 0->mask for stage 1 + * + * mack does not include the current cpu. 
+ */ + mask = cs->cs_mask & gd->gd_other_cpus & smp_active_mask; + cs->cs_mack = 0; + crit_enter_id("cpusync"); + if (mask) { ++ipiq_cscount; ++gd->gd_curthread->td_cscount; - while (poll->cs_count != poll->cs_maxcount) { - crit_enter(); + lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs); + logipiq2(sync_start, mask); + while (cs->cs_mack != mask) { lwkt_process_ipiq(); - crit_exit(); - } - } -#endif -} - -void -lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll) -{ - globaldata_t gd = mycpu; -#ifdef SMP - int count; -#endif - - mask &= ~poll->cs_mask; - poll->cs_mask |= mask; -#ifdef SMP - logipiq2(sync_add, mask & gd->gd_other_cpus); - count = lwkt_send_ipiq_mask( - mask & gd->gd_other_cpus & smp_active_mask, - (ipifunc1_t)lwkt_cpusync_remote1, poll); -#endif - if (mask & gd->gd_cpumask) { - if (poll->cs_run_func) - poll->cs_run_func(poll); - } -#ifdef SMP - poll->cs_maxcount += count; - if (poll->cs_maxcount) { - if (poll->cs_maxcount == count) - ++gd->gd_curthread->td_cscount; - while (poll->cs_count != poll->cs_maxcount) { - crit_enter(); - lwkt_process_ipiq(); - crit_exit(); + cpu_pause(); } } +#else + cs->cs_mack = 0; #endif } /* - * Finish synchronization with a set of target cpus. The target cpus will - * execute cs_fin1_func(poll) prior to this function returning, and will - * execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN. + * Interlocked cpus have executed remote1 and are polling in remote2. + * To deinterlock we clear cs_mack and wait for the cpus to execute + * the func and set their bit in cs_mack again. * - * If cs_maxcount is non-zero then we are mastering a cpusync with one or - * more remote cpus and must account for it in our thread structure. */ void -lwkt_cpusync_finish(lwkt_cpusync_t poll) +lwkt_cpusync_deinterlock(lwkt_cpusync_t cs) { globaldata_t gd = mycpu; - - poll->cs_count = -1; - if (poll->cs_mask & gd->gd_cpumask) { - if (poll->cs_fin1_func) - poll->cs_fin1_func(poll); - if (poll->cs_fin2_func) - poll->cs_fin2_func(poll->cs_data); - } #ifdef SMP - if (poll->cs_maxcount) { - while (poll->cs_count != -(poll->cs_maxcount + 1)) { - crit_enter(); + cpumask_t mask; + + /* + * mask acknowledge (cs_mack): mack->0->mack for stage 2 + * + * Clearing cpu bits for polling cpus in cs_mack will cause them to + * execute stage 2, which executes the cs_func(cs_data) and then sets + * their bit in cs_mack again. + * + * mack does not include the current cpu. + */ + mask = cs->cs_mack; + cpu_ccfence(); + cs->cs_mack = 0; + if (cs->cs_func && (cs->cs_mask & gd->gd_cpumask)) + cs->cs_func(cs->cs_data); + if (mask) { + while (cs->cs_mack != mask) { lwkt_process_ipiq(); - crit_exit(); + cpu_pause(); } --gd->gd_curthread->td_cscount; + lwkt_process_ipiq(); + logipiq2(sync_end, mask); } + crit_exit_id("cpusync"); +#else + if (cs->cs_func && (cs->cs_mask & gd->gd_cpumask)) + cs->cs_func(cs->cs_data); #endif } @@ -797,49 +749,37 @@ lwkt_cpusync_finish(lwkt_cpusync_t poll) * the request so we spin on it. */ static void -lwkt_cpusync_remote1(lwkt_cpusync_t poll) +lwkt_cpusync_remote1(lwkt_cpusync_t cs) { - atomic_add_int(&poll->cs_count, 1); - if (poll->cs_run_func) - poll->cs_run_func(poll); - lwkt_cpusync_remote2(poll); + globaldata_t gd = mycpu; + + atomic_set_cpumask(&cs->cs_mack, gd->gd_cpumask); + lwkt_cpusync_remote2(cs); } /* * helper IPI remote messaging function. * * Poll for the originator telling us to finish. If it hasn't, requeue - * our request so we spin on it. 
When the originator requests that we - * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data) - * in tandem with the release. + * our request so we spin on it. */ static void -lwkt_cpusync_remote2(lwkt_cpusync_t poll) +lwkt_cpusync_remote2(lwkt_cpusync_t cs) { - if (poll->cs_count < 0) { - cpusync_func2_t savef; - void *saved; - - if (poll->cs_fin1_func) - poll->cs_fin1_func(poll); - if (poll->cs_fin2_func) { - savef = poll->cs_fin2_func; - saved = poll->cs_data; - cpu_ccfence(); /* required ordering for MP operation */ - atomic_add_int(&poll->cs_count, -1); - savef(saved); - } else { - atomic_add_int(&poll->cs_count, -1); - } + globaldata_t gd = mycpu; + + if ((cs->cs_mack & gd->gd_cpumask) == 0) { + if (cs->cs_func) + cs->cs_func(cs->cs_data); + atomic_set_cpumask(&cs->cs_mack, gd->gd_cpumask); } else { - globaldata_t gd = mycpu; lwkt_ipiq_t ip; int wi; ip = &gd->gd_cpusyncq; wi = ip->ip_windex & MAXCPUFIFO_MASK; ip->ip_func[wi] = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2; - ip->ip_arg1[wi] = poll; + ip->ip_arg1[wi] = cs; ip->ip_arg2[wi] = 0; cpu_sfence(); ++ip->ip_windex; diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 3939767ee8..2310f2d748 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -141,6 +141,9 @@ SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW, static int lwkt_spin_method = 1; SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW, &lwkt_spin_method, 0, "LWKT scheduler behavior when contended"); +static int lwkt_spin_fatal = 0; /* disabled */ +SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW, + &lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic"); static int preempt_enable = 1; SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW, &preempt_enable, 0, "Enable preemption"); @@ -502,6 +505,7 @@ lwkt_switch(void) int reqflags; int cseq; int oseq; + int fatal_count; /* * Switching from within a 'fast' (non thread switched) interrupt or IPI @@ -848,7 +852,14 @@ skip: * WARNING! We can't call splz_check() or anything else here as * it could cause a deadlock. 
*/ +#ifdef __amd64__ + if ((read_rflags() & PSL_I) == 0) { + cpu_enable_intr(); + panic("lwkt_switch() called with interrupts disabled"); + } +#endif cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1); + fatal_count = lwkt_spin_fatal; while ((oseq = lwkt_cseq_rindex) != cseq) { cpu_ccfence(); #if !defined(_KERNEL_VIRTUAL) @@ -860,6 +871,8 @@ skip: DELAY(1); cpu_lfence(); } + if (fatal_count && --fatal_count == 0) + panic("lwkt_switch: fatal spin wait"); } cseq = lwkt_spin_delay; /* don't trust the system operator */ cpu_ccfence(); diff --git a/sys/platform/pc32/i386/i686_mem.c b/sys/platform/pc32/i386/i686_mem.c index 329e7699a3..ee79e6a307 100644 --- a/sys/platform/pc32/i386/i686_mem.c +++ b/sys/platform/pc32/i386/i686_mem.c @@ -97,8 +97,8 @@ static int i686_mtrrconflict(int flag1, int flag2); static void i686_mrstore(struct mem_range_softc *sc); static void i686_mrstoreone(void *arg); #ifdef SMP -static void i686_mrstoreone_cpusync(struct lwkt_cpusync *cmd); -static void i686_mrAPinit_cpusync(struct lwkt_cpusync *cmd); +static void i686_mrstoreone_cpusync(void *arg); +static void i686_mrAPinit_cpusync(void *arg); #endif static struct mem_range_desc *i686_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr); @@ -296,9 +296,9 @@ i686_mrstore(struct mem_range_softc *sc) #ifdef SMP static void -i686_mrstoreone_cpusync(struct lwkt_cpusync *cmd) +i686_mrstoreone_cpusync(void *arg) { - i686_mrstoreone(cmd->cs_data); + i686_mrstoreone(arg); } #endif @@ -676,9 +676,9 @@ i686_mrinit(struct mem_range_softc *sc) #ifdef SMP static void -i686_mrAPinit_cpusync(struct lwkt_cpusync *cmd) +i686_mrAPinit_cpusync(void *arg) { - i686_mrAPinit(cmd->cs_data); + i686_mrAPinit(arg); } #endif diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c index 8a8667cf1d..39add33d14 100644 --- a/sys/platform/pc32/i386/pmap.c +++ b/sys/platform/pc32/i386/pmap.c @@ -1042,7 +1042,6 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info) * any active flushes if we block. */ if (m->flags & PG_BUSY) { - pmap_inval_flush(info); while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) ; } @@ -1130,7 +1129,6 @@ pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { - pmap_inval_flush(info); mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } @@ -2129,10 +2127,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) vm_page_t m; /* - * XXX non-optimal. Note also that there can be - * no pmap_inval_flush() calls until after we modify - * ptbase[sindex] (or otherwise we have to do another - * pmap_inval_interlock() call). + * XXX non-optimal. */ pmap_inval_interlock(&info, pmap, i386_ptob(sindex)); again: diff --git a/sys/platform/pc32/i386/pmap_inval.c b/sys/platform/pc32/i386/pmap_inval.c index 29edf8b5fa..664c485c11 100644 --- a/sys/platform/pc32/i386/pmap_inval.c +++ b/sys/platform/pc32/i386/pmap_inval.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. + * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -30,8 +30,6 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
- * - * $DragonFly: src/sys/platform/pc32/i386/pmap_inval.c,v 1.5 2005/11/04 08:57:27 dillon Exp $ */ /* @@ -64,21 +62,7 @@ #include #include -#ifdef SMP - -static void -_cpu_invltlb(void *dummy) -{ - cpu_invltlb(); -} - -static void -_cpu_invl1pg(void *data) -{ - cpu_invlpg(data); -} - -#endif +static void pmap_inval_callback(void *arg); /* * Initialize for add or flush @@ -99,8 +83,8 @@ pmap_inval_init(pmap_inval_info_t info) void pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va) { -#ifdef SMP cpumask_t oactive; +#ifdef SMP cpumask_t nactive; for (;;) { @@ -108,75 +92,45 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va) nactive = oactive | CPUMASK_LOCK; if (atomic_cmpset_cpumask(&pmap->pm_active, oactive, nactive)) break; - crit_enter(); lwkt_process_ipiq(); - crit_exit(); - } - - if ((info->pir_flags & PIRF_CPUSYNC) == 0) { - info->pir_flags |= PIRF_CPUSYNC; - info->pir_cpusync.cs_run_func = NULL; - info->pir_cpusync.cs_fin1_func = NULL; - info->pir_cpusync.cs_fin2_func = NULL; - lwkt_cpusync_start(oactive, &info->pir_cpusync); - } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) { - lwkt_cpusync_add(oactive, &info->pir_cpusync); + cpu_pause(); } #else - if (pmap->pm_active == 0) - return; + oactive = pmap->pm_active & ~CPUMASK_LOCK; #endif - if ((info->pir_flags & (PIRF_INVLTLB|PIRF_INVL1PG)) == 0) { - if (va == (vm_offset_t)-1) { - info->pir_flags |= PIRF_INVLTLB; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invltlb; -#endif - } else { - info->pir_flags |= PIRF_INVL1PG; - info->pir_cpusync.cs_data = (void *)va; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invl1pg; -#endif - } - } else { - info->pir_flags |= PIRF_INVLTLB; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invltlb; -#endif - } + KKASSERT((info->pir_flags & PIRF_CPUSYNC) == 0); + info->pir_va = va; + info->pir_flags = PIRF_CPUSYNC; + lwkt_cpusync_init(&info->pir_cpusync, oactive, pmap_inval_callback, info); + lwkt_cpusync_interlock(&info->pir_cpusync); } void pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap) { + KKASSERT(info->pir_flags & PIRF_CPUSYNC); #ifdef SMP atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK); #endif + lwkt_cpusync_deinterlock(&info->pir_cpusync); + info->pir_flags = 0; } -/* - * Synchronize changes with target cpus. 
- */ -void -pmap_inval_flush(pmap_inval_info_t info) +static void +pmap_inval_callback(void *arg) { -#ifdef SMP - if (info->pir_flags & PIRF_CPUSYNC) - lwkt_cpusync_finish(&info->pir_cpusync); -#else - if (info->pir_flags & PIRF_INVLTLB) + pmap_inval_info_t info = arg; + + if (info->pir_va == (vm_offset_t)-1) cpu_invltlb(); - else if (info->pir_flags & PIRF_INVL1PG) - cpu_invlpg(info->pir_cpusync.cs_data); -#endif - info->pir_flags = 0; + else + cpu_invlpg((void *)info->pir_va); } void pmap_inval_done(pmap_inval_info_t info) { - pmap_inval_flush(info); - crit_exit_id("flush"); + KKASSERT((info->pir_flags & PIRF_CPUSYNC) == 0); + crit_exit_id("inval"); } diff --git a/sys/platform/pc32/i386/sys_machdep.c b/sys/platform/pc32/i386/sys_machdep.c index c9d4988afe..6dc99c7ddb 100644 --- a/sys/platform/pc32/i386/sys_machdep.c +++ b/sys/platform/pc32/i386/sys_machdep.c @@ -268,9 +268,9 @@ set_user_TLS(void) #ifdef SMP static void -set_user_ldt_cpusync(struct lwkt_cpusync *cmd) +set_user_ldt_cpusync(void *arg) { - set_user_ldt(cmd->cs_data); + set_user_ldt(arg); } #endif diff --git a/sys/platform/pc32/include/pmap_inval.h b/sys/platform/pc32/include/pmap_inval.h index c4a18d4a0b..7d42de1ce5 100644 --- a/sys/platform/pc32/include/pmap_inval.h +++ b/sys/platform/pc32/include/pmap_inval.h @@ -43,6 +43,7 @@ typedef struct pmap_inval_info { int pir_flags; + vm_offset_t pir_va; struct lwkt_cpusync pir_cpusync; } pmap_inval_info; @@ -61,7 +62,6 @@ typedef pmap_inval_info *pmap_inval_info_t; void pmap_inval_init(pmap_inval_info_t); void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t); void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t); -void pmap_inval_flush(pmap_inval_info_t); void pmap_inval_done(pmap_inval_info_t); #endif diff --git a/sys/platform/pc64/include/pmap_inval.h b/sys/platform/pc64/include/pmap_inval.h index c70dbb21a0..a1685a96b5 100644 --- a/sys/platform/pc64/include/pmap_inval.h +++ b/sys/platform/pc64/include/pmap_inval.h @@ -43,6 +43,7 @@ typedef struct pmap_inval_info { int pir_flags; + vm_offset_t pir_va; struct lwkt_cpusync pir_cpusync; } pmap_inval_info; @@ -61,7 +62,6 @@ typedef pmap_inval_info *pmap_inval_info_t; void pmap_inval_init(pmap_inval_info_t); void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t); void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t); -void pmap_inval_flush(pmap_inval_info_t); void pmap_inval_done(pmap_inval_info_t); #endif diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index 2f9866cc22..89e52ffcb7 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -1198,7 +1198,6 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, * page so it cannot be freed out from under us. */ if (m->flags & PG_BUSY) { - pmap_inval_flush(info); while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) ; } @@ -1301,7 +1300,6 @@ pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, mpte = pmap->pm_ptphint; } else { #endif - pmap_inval_flush(info); mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; #if JGHINT @@ -2496,10 +2494,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) vm_page_t m; /* - * XXX non-optimal. Note also that there can be - * no pmap_inval_flush() calls until after we modify - * ptbase[sindex] (or otherwise we have to do another - * pmap_inval_add() call). + * XXX non-optimal. 
*/ pmap_inval_interlock(&info, pmap, sva); again: diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c index e03bfda3b9..b559adfbd1 100644 --- a/sys/platform/pc64/x86_64/pmap_inval.c +++ b/sys/platform/pc64/x86_64/pmap_inval.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003,2004,2008 The DragonFly Project. All rights reserved. + * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -62,24 +62,14 @@ #include #include -#ifdef SMP - -static void -_cpu_invltlb(void *dummy) -{ - cpu_invltlb(); -} - -static void -_cpu_invl1pg(void *data) -{ - cpu_invlpg(data); -} - -#endif +static void pmap_inval_callback(void *arg); /* * Initialize for add or flush + * + * The critical section is required to prevent preemption, allowing us to + * set CPUMASK_LOCK on the pmap. The critical section is also assumed + * when lwkt_process_ipiq() is called. */ void pmap_inval_init(pmap_inval_info_t info) @@ -92,13 +82,15 @@ pmap_inval_init(pmap_inval_info_t info) * Add a (pmap, va) pair to the invalidation list and protect access * as appropriate. * - * CPUMASK_LOCK is used to interlock thread switchins + * CPUMASK_LOCK is used to interlock thread switchins, otherwise another + * cpu can switch in a pmap that we are unaware of and interfere with our + * pte operation. */ void pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va) { -#ifdef SMP cpumask_t oactive; +#ifdef SMP cpumask_t nactive; for (;;) { @@ -106,75 +98,45 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va) nactive = oactive | CPUMASK_LOCK; if (atomic_cmpset_cpumask(&pmap->pm_active, oactive, nactive)) break; - crit_enter(); lwkt_process_ipiq(); - crit_exit(); - } - - if ((info->pir_flags & PIRF_CPUSYNC) == 0) { - info->pir_flags |= PIRF_CPUSYNC; - info->pir_cpusync.cs_run_func = NULL; - info->pir_cpusync.cs_fin1_func = NULL; - info->pir_cpusync.cs_fin2_func = NULL; - lwkt_cpusync_start(oactive, &info->pir_cpusync); - } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) { - lwkt_cpusync_add(oactive, &info->pir_cpusync); + cpu_pause(); } #else - if (pmap->pm_active == 0) - return; + oactive = pmap->pm_active & ~CPUMASK_LOCK; #endif - if ((info->pir_flags & (PIRF_INVLTLB|PIRF_INVL1PG)) == 0) { - if (va == (vm_offset_t)-1) { - info->pir_flags |= PIRF_INVLTLB; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invltlb; -#endif - } else { - info->pir_flags |= PIRF_INVL1PG; - info->pir_cpusync.cs_data = (void *)va; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invl1pg; -#endif - } - } else { - info->pir_flags |= PIRF_INVLTLB; -#ifdef SMP - info->pir_cpusync.cs_fin2_func = _cpu_invltlb; -#endif - } + KKASSERT((info->pir_flags & PIRF_CPUSYNC) == 0); + info->pir_va = va; + info->pir_flags = PIRF_CPUSYNC; + lwkt_cpusync_init(&info->pir_cpusync, oactive, pmap_inval_callback, info); + lwkt_cpusync_interlock(&info->pir_cpusync); } void pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap) { + KKASSERT(info->pir_flags & PIRF_CPUSYNC); #ifdef SMP - atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK); + atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK); #endif + lwkt_cpusync_deinterlock(&info->pir_cpusync); + info->pir_flags = 0; } -/* - * Synchronize changes with target cpus. 
- */ -void -pmap_inval_flush(pmap_inval_info_t info) +static void +pmap_inval_callback(void *arg) { -#ifdef SMP - if (info->pir_flags & PIRF_CPUSYNC) - lwkt_cpusync_finish(&info->pir_cpusync); -#else - if (info->pir_flags & PIRF_INVLTLB) + pmap_inval_info_t info = arg; + + if (info->pir_va == (vm_offset_t)-1) cpu_invltlb(); - else if (info->pir_flags & PIRF_INVL1PG) - cpu_invlpg(info->pir_cpusync.cs_data); -#endif - info->pir_flags = 0; + else + cpu_invlpg((void *)info->pir_va); } void pmap_inval_done(pmap_inval_info_t info) { - pmap_inval_flush(info); + KKASSERT((info->pir_flags & PIRF_CPUSYNC) == 0); crit_exit_id("inval"); } diff --git a/sys/sys/thread.h b/sys/sys/thread.h index ebbb65766b..21624871f7 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -206,17 +206,13 @@ typedef struct lwkt_ipiq { * CPU Synchronization structure. See lwkt_cpusync_start() and * lwkt_cpusync_finish() for more information. */ -typedef void (*cpusync_func_t)(lwkt_cpusync_t poll); -typedef void (*cpusync_func2_t)(void *data); +typedef void (*cpusync_func_t)(void *arg); struct lwkt_cpusync { - cpusync_func_t cs_run_func; /* run (tandem w/ acquire) */ - cpusync_func_t cs_fin1_func; /* fin1 (synchronized) */ - cpusync_func2_t cs_fin2_func; /* fin2 (tandem w/ release) */ - void *cs_data; - int cs_maxcount; - volatile int cs_count; - cpumask_t cs_mask; + cpumask_t cs_mask; /* cpus running the sync */ + cpumask_t cs_mack; /* mask acknowledge */ + cpusync_func_t cs_func; /* function to execute */ + void *cs_data; /* function data */ }; /* @@ -496,11 +492,10 @@ extern void lwkt_synchronize_ipiqs(const char *); #endif /* SMP */ +/* lwkt_cpusync_init() - inline function in sys/thread2.h */ extern void lwkt_cpusync_simple(cpumask_t, cpusync_func_t, void *); -extern void lwkt_cpusync_fastdata(cpumask_t, cpusync_func2_t, void *); -extern void lwkt_cpusync_start(cpumask_t, lwkt_cpusync_t); -extern void lwkt_cpusync_add(cpumask_t, lwkt_cpusync_t); -extern void lwkt_cpusync_finish(lwkt_cpusync_t); +extern void lwkt_cpusync_interlock(lwkt_cpusync_t); +extern void lwkt_cpusync_deinterlock(lwkt_cpusync_t); extern void crit_panic(void) __dead2; extern struct lwp *lwkt_preempted_proc(void); diff --git a/sys/sys/thread2.h b/sys/sys/thread2.h index 20bdc1d67c..d9f26baf8a 100644 --- a/sys/sys/thread2.h +++ b/sys/sys/thread2.h @@ -249,6 +249,19 @@ lwkt_passive_recover(thread_t td) td->td_release = NULL; } +/* + * cpusync support + */ +static __inline void +lwkt_cpusync_init(lwkt_cpusync_t cs, cpumask_t mask, + cpusync_func_t func, void *data) +{ + cs->cs_mask = mask; + /* cs->cs_mack = 0; handled by _interlock */ + cs->cs_func = func; + cs->cs_data = data; +} + #ifdef SMP /* -- 2.41.0