From d9eea1a5e4f207878f33647f9d5274ecca8b75c5 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 11 Jul 2003 17:42:11 +0000 Subject: [PATCH] MP Implementation 4/4: Final cleanup for this stage. Deal with a race that occurs due to not having to hold the MP lock through an lwkt_switch() where another cpu may pull off a process from the userland scheduler and schedule its thread before the original cpu has completely switched it out. Oddly enough, latencies were enough that this bug never caused a crash! Clean up the scheduling code and in particular the switch assembly code, save and restore eflags (cli/sti state) when switching heavy weight processes (this is already done for light weight threads), add some counters, and optimize fork() to (statistically) stay on the current cpu for a short while to take advantage of locality of cache reference, which greatly improves fork/exec times. Note that synchronous pipe operations between two processes already (statistically) stick to the same cpu (which is what we want). --- sys/i386/apic/mpapic.c | 17 ++-- sys/i386/i386/exception.s | 19 ++-- sys/i386/i386/genassym.c | 4 +- sys/i386/i386/machdep.c | 8 +- sys/i386/i386/mpapic.c | 17 ++-- sys/i386/i386/mpboot.s | 12 ++- sys/i386/i386/swtch.s | 146 ++++++++++++++------------- sys/i386/i386/trap.c | 12 ++- sys/i386/i386/vm_machdep.c | 12 ++- sys/kern/kern_exit.c | 8 +- sys/kern/kern_switch.c | 31 +++++- sys/kern/kern_synch.c | 6 +- sys/kern/lwkt_thread.c | 18 +++- sys/platform/pc32/apic/mpapic.c | 17 ++-- sys/platform/pc32/i386/exception.s | 19 ++-- sys/platform/pc32/i386/genassym.c | 4 +- sys/platform/pc32/i386/machdep.c | 8 +- sys/platform/pc32/i386/mpapic.c | 17 ++-- sys/platform/pc32/i386/mpboot.s | 12 ++- sys/platform/pc32/i386/swtch.s | 146 ++++++++++++++------------- sys/platform/pc32/i386/trap.c | 12 ++- sys/platform/pc32/i386/vm_machdep.c | 12 ++- sys/platform/vkernel/i386/genassym.c | 4 +- sys/sys/proc.h | 7 +- sys/sys/thread.h | 11 +- 25 files changed, 322 insertions(+), 257 deletions(-) diff --git a/sys/i386/apic/mpapic.c b/sys/i386/apic/mpapic.c index 638b7a9a21..ebbbace1b3 100644 --- a/sys/i386/apic/mpapic.c +++ b/sys/i386/apic/mpapic.c @@ -23,7 +23,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $ - * $DragonFly: src/sys/i386/apic/Attic/mpapic.c,v 1.5 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/i386/apic/Attic/mpapic.c,v 1.6 2003/07/11 17:42:08 dillon Exp $ */ #include @@ -72,18 +72,15 @@ apic_initialize(void) temp |= 0x00010400; /* masked, edge trigger, active hi */ lapic.lvt_lint1 = temp; - /* set the Task Priority Register as needed */ + /* + * Set the Task Priority Register as needed. At the moment allow + * interrupts on all cpus (the APs will remain CLId until they are + * ready to deal). We could disable all but IPIs by setting + * temp |= TPR_IPI_ONLY for cpu != 0. + */ temp = lapic.tpr; temp &= ~APIC_TPR_PRIO; /* clear priority field */ - /* - * Leave the BSP and TPR 0 during boot so it gets all the interrupts, - * set APs at TPR 0xF0 at boot so they get no ints. - */ -#if 0 - if (mycpu->gd_cpuid != 0) - temp |= TPR_IPI_ONLY; /* disable INTs on this cpu */ -#endif lapic.tpr = temp; /* enable the local APIC */ diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s index 137ae17c07..dfa5a92a01 100644 --- a/sys/i386/i386/exception.s +++ b/sys/i386/i386/exception.s @@ -31,7 +31,7 @@ * SUCH DAMAGE.
* * $FreeBSD: src/sys/i386/i386/exception.s,v 1.65.2.3 2001/08/15 01:23:49 peter Exp $ - * $DragonFly: src/sys/i386/i386/Attic/exception.s,v 1.12 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/exception.s,v 1.13 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -306,16 +306,17 @@ IDTVEC(int0x80_syscall) /* * This function is what cpu_heavy_restore jumps to after a new process - * is created. We are in a critical section in order to prevent - * cpu_heavy_restore from being interrupted (especially since it stores - * its context in a static place!). + * is created. The LWKT subsystem switches while holding a critical + * section and we maintain that abstraction here (e.g. because + * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of + * it before calling the initial function (typically fork_return()) and/or + * returning to user mode. * - * The MP lock is held on entry, but for processes fork_return (esi) + * The MP lock is held on entry, but for processes fork_return(esi) * releases it. 'doreti' always runs without the MP lock. * - * We need to be careful to hold interrupts disabled through until - * doreti iret's YYY this is due to the PCB_ storage in the heavy switcher, - * fixme! + * I'm not sure the cli is necessary but I am not taking any chances in + * regards to the init code. */ ENTRY(fork_trampoline) cli @@ -334,6 +335,7 @@ ENTRY(fork_trampoline) addl $4,%esp /* cut from syscall */ + sti call spl0 call splz @@ -346,6 +348,7 @@ ENTRY(fork_trampoline) pushl $pmsg4 call panic pmsg4: .asciz "fork_trampoline mpcount %d after calling %p" + .p2align 2 1: #endif /* diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index ca8652f0b0..1cf1d4f2d6 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/i386/i386/Attic/genassym.c,v 1.23 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/genassym.c,v 1.24 2003/07/11 17:42:08 dillon Exp $ */ #include "opt_user_ldt.h" @@ -92,7 +92,7 @@ ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); -ASSYM(TDF_EXITED, TDF_EXITED); +ASSYM(TDF_RUNNING, TDF_RUNNING); #ifdef SMP ASSYM(MP_FREE_LOCK, MP_FREE_LOCK); #endif diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 851552a21f..938534419c 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -36,7 +36,7 @@ * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ - * $DragonFly: src/sys/i386/i386/Attic/machdep.c,v 1.25 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/machdep.c,v 1.26 2003/07/11 17:42:08 dillon Exp $ */ #include "apm.h" @@ -1914,6 +1914,7 @@ init386(int first) proc0.p_addr = (void *)thread0.td_kstack; proc0.p_thread = &thread0; proc0.p_flag |= P_CP_RELEASED; /* early set. 
See also init_main.c */ + thread0.td_flags |= TDF_RUNNING; thread0.td_proc = &proc0; thread0.td_switch = cpu_heavy_switch; /* YYY eventually LWKT */ safepri = thread0.td_cpl = SWI_MASK | HWI_MASK; @@ -2052,11 +2053,6 @@ init386(int first) /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = (int)IdlePTD; /* should already be setup */ -#ifdef SMP -#if 0 - thread0.td_pcb->pcb_mpnest = 1; -#endif -#endif thread0.td_pcb->pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } diff --git a/sys/i386/i386/mpapic.c b/sys/i386/i386/mpapic.c index 89d8f89ee5..85fb851370 100644 --- a/sys/i386/i386/mpapic.c +++ b/sys/i386/i386/mpapic.c @@ -23,7 +23,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $ - * $DragonFly: src/sys/i386/i386/Attic/mpapic.c,v 1.5 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/mpapic.c,v 1.6 2003/07/11 17:42:08 dillon Exp $ */ #include @@ -72,18 +72,15 @@ apic_initialize(void) temp |= 0x00010400; /* masked, edge trigger, active hi */ lapic.lvt_lint1 = temp; - /* set the Task Priority Register as needed */ + /* + * Set the Task Priority Register as needed. At the moment allow + * interrupts on all cpus (the APs will remain CLId until they are + * ready to deal). We could disable all but IPIs by setting + * temp |= TPR_IPI_ONLY for cpu != 0. + */ temp = lapic.tpr; temp &= ~APIC_TPR_PRIO; /* clear priority field */ - /* - * Leave the BSP and TPR 0 during boot so it gets all the interrupts, - * set APs at TPR 0xF0 at boot so they get no ints. - */ -#if 0 - if (mycpu->gd_cpuid != 0) - temp |= TPR_IPI_ONLY; /* disable INTs on this cpu */ -#endif lapic.tpr = temp; /* enable the local APIC */ diff --git a/sys/i386/i386/mpboot.s b/sys/i386/i386/mpboot.s index 8c581ad13a..1b39d880c1 100644 --- a/sys/i386/i386/mpboot.s +++ b/sys/i386/i386/mpboot.s @@ -32,7 +32,7 @@ * multiprocessor systems. * * $FreeBSD: src/sys/i386/i386/mpboot.s,v 1.13.2.3 2000/09/07 01:18:26 tegge Exp $ - * $DragonFly: src/sys/i386/i386/Attic/mpboot.s,v 1.4 2003/07/06 21:23:48 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/mpboot.s,v 1.5 2003/07/11 17:42:08 dillon Exp $ */ #include /* miscellaneous asm macros */ @@ -118,12 +118,14 @@ mp_begin: /* now running relocated at KERNBASE */ /* * Execute the context restore function for the idlethread which * has conveniently been set as curthread. Remember, %eax must - * contain the target thread. Or BSP/AP synchronization occurs - * in ap_init(). We do not need to mess with the BGL for this - * because LWKT threads are self-contained on each cpu (or, at least, - * the idlethread is!). + * contain the target thread and %ebx must contain the originating + * thread (which we just set the same since we have no originating + * thread). BSP/AP synchronization occurs in ap_init(). We do + * not need to mess with the BGL for this because LWKT threads are + * self-contained on each cpu (or, at least, the idlethread is!). */ movl PCPU(curthread),%eax + movl %eax,%ebx movl TD_SP(%eax),%esp ret diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index e5494ce80a..53d2152f1a 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -35,7 +35,7 @@ * SUCH DAMAGE. 
* * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.89.2.10 2003/01/23 03:36:24 ps Exp $ - * $DragonFly: src/sys/i386/i386/Attic/swtch.s,v 1.23 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/swtch.s,v 1.24 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -81,24 +81,18 @@ tlb_flush_count: .long 0 * is normally called via the thread->td_switch function, and will * only be called when the current thread is a heavy weight process. * + * Some instructions have been reordered to reduce pipeline stalls. + * * YYY disable interrupts once giant is removed. */ ENTRY(cpu_heavy_switch) - movl PCPU(curthread),%ecx - movl TD_PROC(%ecx),%ecx - - cli - movl P_VMSPACE(%ecx), %edx - movl PCPU(cpuid), %eax - MPLOCKED btrl %eax, VM_PMAP+PM_ACTIVE(%edx) - /* * Save general regs */ - movl P_THREAD(%ecx),%edx - movl TD_PCB(%edx),%edx - movl (%esp),%eax /* Hardware registers */ - movl %eax,PCB_EIP(%edx) + movl PCPU(curthread),%ecx + movl (%esp),%eax /* (reorder optimization) */ + movl TD_PCB(%ecx),%edx /* EDX = PCB */ + movl %eax,PCB_EIP(%edx) /* return PC may be modified */ movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) @@ -106,16 +100,22 @@ ENTRY(cpu_heavy_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) + movl %ecx,%ebx /* EBX = curthread */ + movl TD_PROC(%ecx),%ecx + movl PCPU(cpuid), %eax + movl P_VMSPACE(%ecx), %ecx /* ECX = vmspace */ + MPLOCKED btrl %eax, VM_PMAP+PM_ACTIVE(%ecx) + /* * Push the LWKT switch restore function, which resumes a heavy * weight process. Note that the LWKT switcher is based on * TD_SP, while the heavy weight process switcher is based on - * PCB_ESP. TD_SP is usually one pointer pushed relative to - * PCB_ESP. + * PCB_ESP. TD_SP is usually two ints pushed relative to + * PCB_ESP. We push the flags for later restore by cpu_heavy_restore. */ - movl P_THREAD(%ecx),%eax + pushfl pushl $cpu_heavy_restore - movl %esp,TD_SP(%eax) + movl %esp,TD_SP(%ebx) /* * Save debug regs if necessary @@ -144,24 +144,26 @@ ENTRY(cpu_heavy_switch) * npxsave will NULL out PCPU(npxthread). */ #if NNPX > 0 - movl P_THREAD(%ecx),%ecx - cmpl %ecx,PCPU(npxthread) + cmpl %ebx,PCPU(npxthread) jne 1f - addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ + addl $PCB_SAVEFPU,%edx pushl %edx call npxsave /* do it in a big C function */ - addl $4,%esp + addl $4,%esp /* EAX, ECX, EDX trashed */ 1: - /* %ecx,%edx trashed */ #endif /* NNPX > 0 */ /* * Switch to the next thread, which was passed as an argument - * to cpu_heavy_switch(). Due to the switch-restore function we pushed, - * the argument is at 8(%esp). Set the current thread, load the - * stack pointer, and 'ret' into the switch-restore function. + * to cpu_heavy_switch(). Due to the eflags and switch-restore + * function we pushed, the argument is at 12(%esp). Set the current + * thread, load the stack pointer, and 'ret' into the switch-restore + * function. + * + * The switch restore function expects the new thread to be in %eax + * and the old one to be in %ebx. */ - movl 8(%esp),%eax + movl 12(%esp),%eax /* EAX = newtd, EBX = oldtd */ movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp ret @@ -187,33 +189,14 @@ ENTRY(cpu_exit_switch) je 1f movl %ecx,%cr3 1: - movl PCPU(curthread),%ecx + movl PCPU(curthread),%ebx /* - * Switch to the next thread. + * Switch to the next thread. RET into the restore function, which + * expects the new thread in EAX and the old in EBX. 
*/ - cli movl 4(%esp),%eax movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp - - /* - * We are now the next thread, set the exited flag and wakeup - * any waiters. - */ - orl $TDF_EXITED,TD_FLAGS(%ecx) -#if 0 /* YYY MP lock may not be held by new target */ - pushl %eax - pushl %ecx /* wakeup(oldthread) */ - call wakeup - addl $4,%esp - popl %eax /* note: next thread expects curthread in %eax */ -#endif - - /* - * Restore the next thread's state and resume it. Note: the - * restore function assumes that the next thread's address is - * in %eax. - */ ret /* @@ -224,13 +207,15 @@ ENTRY(cpu_exit_switch) * off the thread stack and jumped to. * * This entry is only called if the thread was previously saved - * using cpu_heavy_switch() (the heavy weight process thread switcher). + * using cpu_heavy_switch() (the heavy weight process thread switcher), + * or when a new process is initially scheduled. The first thing we + * do is clear the TDF_RUNNING bit in the old thread and set it in the + * new thread. * * YYY theoretically we do not have to restore everything here, a lot * of this junk can wait until we return to usermode. But for now * we restore everything. * - * YYY STI/CLI sequencing. * YYY the PCB crap is really crap, it makes startup a bitch because * we can't switch away. * @@ -238,8 +223,8 @@ ENTRY(cpu_exit_switch) */ ENTRY(cpu_heavy_restore) - /* interrupts are disabled */ - movl TD_PCB(%eax),%edx + popfl + movl TD_PCB(%eax),%edx /* EDX = PCB */ movl TD_PROC(%eax),%ecx #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%ecx) @@ -254,9 +239,9 @@ ENTRY(cpu_heavy_restore) * safely test/reload %cr3 until after we have set the bit in the * pmap (remember, we do not hold the MP lock in the switch code). */ - movl P_VMSPACE(%ecx), %ebx - movl PCPU(cpuid), %eax - MPLOCKED btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) + movl P_VMSPACE(%ecx), %ecx /* ECX = vmspace */ + movl PCPU(cpuid), %esi + MPLOCKED btsl %esi, VM_PMAP+PM_ACTIVE(%ecx) /* * Restore the MMU address space. If it is the same as the last @@ -264,16 +249,24 @@ ENTRY(cpu_heavy_restore) * YYY which naturally also means that the PM_ACTIVE bit had better * already have been set before we set it above, check? YYY */ - movl %cr3,%eax - movl PCB_CR3(%edx),%ebx - cmpl %eax,%ebx + movl %cr3,%esi + movl PCB_CR3(%edx),%ecx + cmpl %esi,%ecx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif - movl %ebx,%cr3 + movl %ecx,%cr3 4: + /* + * Clear TDF_RUNNING flag in old thread only after cleaning up + * %cr3. The target thread is already protected by being TDF_RUNQ + * so setting TDF_RUNNING isn't as big a deal. + */ + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) + /* * Deal with the PCB extension, restore the private tss */ @@ -385,7 +378,6 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti /* XXX */ ret CROSSJUMPTARGET(sw1a) @@ -464,6 +456,8 @@ ENTRY(savectx) * cpu_idle() LWKT only, after that cpu_lwkt_*() will be used for * switching. * + * Clear TDF_RUNNING in old thread only after we've cleaned up %cr3. + * * If we are an AP we have to call ap_init() before jumping to * cpu_idle(). ap_init() will synchronize with the BP and finish * setting up various ncpu-dependant globaldata fields. This may @@ -471,10 +465,13 @@ ENTRY(savectx) * cpus. 
*/ ENTRY(cpu_idle_restore) + /* cli */ movl IdlePTD,%ecx movl $0,%ebp pushl $0 movl %ecx,%cr3 + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) #ifdef SMP cmpl $0,PCPU(cpuid) je 1f @@ -496,16 +493,18 @@ ENTRY(cpu_idle_restore) * we can release our critical section and enable interrupts early. */ ENTRY(cpu_kthread_restore) + sti movl IdlePTD,%ecx - movl TD_PCB(%eax),%ebx + movl TD_PCB(%eax),%edx movl $0,%ebp movl %ecx,%cr3 + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) subl $TDPRI_CRIT,TD_PRI(%eax) - sti - popl %edx /* kthread exit function */ - pushl PCB_EBX(%ebx) /* argument to ESI function */ - pushl %edx /* set exit func as return address */ - movl PCB_ESI(%ebx),%eax + popl %eax /* kthread exit function */ + pushl PCB_EBX(%edx) /* argument to ESI function */ + pushl %eax /* set exit func as return address */ + movl PCB_ESI(%edx),%eax jmp *%eax /* @@ -525,12 +524,15 @@ ENTRY(cpu_lwkt_switch) pushl %esi pushl %edi pushfl - movl PCPU(curthread),%ecx + movl PCPU(curthread),%ebx pushl $cpu_lwkt_restore - cli - movl %esp,TD_SP(%ecx) + movl %esp,TD_SP(%ebx) movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp + + /* + * eax contains new thread, ebx contains old thread. + */ ret /* @@ -549,11 +551,13 @@ ENTRY(cpu_lwkt_switch) */ ENTRY(cpu_lwkt_restore) movl IdlePTD,%ecx /* YYY borrow but beware desched/cpuchg/exit */ - movl %cr3,%eax - cmpl %ecx,%eax + movl %cr3,%edx + cmpl %ecx,%edx je 1f movl %ecx,%cr3 1: + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) popfl popl %edi popl %esi diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index b6bcd4a255..11c2866d98 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/i386/i386/Attic/trap.c,v 1.21 2003/07/11 01:23:21 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/trap.c,v 1.22 2003/07/11 17:42:08 dillon Exp $ */ /* @@ -156,6 +156,12 @@ SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); +static int fast_release; +SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW, + &fast_release, 0, "Passive Release was optimal"); +static int slow_release; +SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, + &slow_release, 0, "Passive Release was nonoptimal"); /* * USER->KERNEL transition. Do not transition us out of userland from the @@ -206,9 +212,11 @@ userexit(struct proc *p) * lwkt_maybe_switch() to deal with it. 
*/ if (td->td_release) { + ++fast_release; td->td_release = NULL; KKASSERT(p->p_flag & P_CURPROC); } else { + ++slow_release; acquire_curproc(p); switch(p->p_rtprio.type) { case RTP_PRIO_IDLE: @@ -1368,7 +1376,9 @@ fork_return(p, frame) if (KTRPOINT(p->p_thread, KTR_SYSRET)) ktrsysret(p->p_tracep, SYS_fork, 0, 0); #endif + p->p_flag |= P_PASSIVE_ACQ; userexit(p); + p->p_flag &= ~P_PASSIVE_ACQ; #ifdef SMP KKASSERT(curthread->td_mpcount == 1); rel_mplock(); diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index e7ff444a3a..5c66a84a64 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -39,7 +39,7 @@ * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $ - * $DragonFly: src/sys/i386/i386/Attic/vm_machdep.c,v 1.19 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/vm_machdep.c,v 1.20 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -162,10 +162,10 @@ cpu_fork(p1, p2, flags) * common_tss.esp0 (kernel stack pointer on entry from user mode) * * pcb_esp must allocate an additional call-return pointer below - * the trap frame which will be restored by cpu_restore, and the - * thread's td_sp pointer must allocate an additonal call-return - * pointer below the pcb_esp call-return pointer to hold the LWKT - * restore function pointer. + * the trap frame which will be restored by cpu_restore from + * PCB_EIP, and the thread's td_sp pointer must allocate an + * additonal two worsd below the pcb_esp call-return pointer to + * hold the LWKT restore function pointer and eflags. * * The LWKT restore function pointer must be set to cpu_restore, * which is our standard heavy weight process switch-in function. @@ -188,6 +188,8 @@ cpu_fork(p1, p2, flags) pcb2->pcb_ebx = (int)p2; /* fork_trampoline argument */ pcb2->pcb_eip = (int)fork_trampoline; p2->p_thread->td_sp = (char *)(pcb2->pcb_esp - sizeof(void *)); + *(u_int32_t *)p2->p_thread->td_sp = PSL_USER; + p2->p_thread->td_sp -= sizeof(void *); *(void **)p2->p_thread->td_sp = (void *)cpu_heavy_restore; /* * pcb2->pcb_ldt: duplicated below, if necessary. diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 5ebc883d05..da1559a647 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -37,7 +37,7 @@ * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 * $FreeBSD: src/sys/kern/kern_exit.c,v 1.92.2.11 2003/01/13 22:51:16 dillon Exp $ - * $DragonFly: src/sys/kern/kern_exit.c,v 1.16 2003/07/10 04:47:54 dillon Exp $ + * $DragonFly: src/sys/kern/kern_exit.c,v 1.17 2003/07/11 17:42:10 dillon Exp $ */ #include "opt_compat.h" @@ -278,7 +278,7 @@ exit1(int rv) /* * Once we set SZOMB the process can get reaped. The wait1 code - * will also wait for TDF_EXITED to be set in the thread's flags, + * will also wait for TDF_RUNNING to be cleared in the thread's flags, * indicating that it has been completely switched out. */ @@ -445,11 +445,11 @@ loop: /* * The process's thread may still be in the middle * of switching away, we can't rip its stack out from - * under it until TDF_EXITED is set. + * under it until TDF_RUNNING clears! * * YYY no wakeup occurs so we depend on the timeout. 
*/ - if ((p->p_thread->td_flags & TDF_EXITED) == 0) { + if ((p->p_thread->td_flags & TDF_RUNNING) != 0) { tsleep(p->p_thread, PWAIT, "reap", 1); goto loop; } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 5a66081ce8..67496746d1 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -24,7 +24,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $ - * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.7 2003/07/11 01:23:24 dillon Exp $ + * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.8 2003/07/11 17:42:10 dillon Exp $ */ #include @@ -69,6 +69,14 @@ static u_int32_t rdyprocmask; /* ready to accept a user process */ static int runqcount; SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, ""); +static int usched_steal; +SYSCTL_INT(_debug, OID_AUTO, usched_steal, CTLFLAG_RW, + &usched_steal, 0, "Passive Release was nonoptimal"); +static int usched_optimal; +SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW, + &usched_optimal, 0, "Passive Release was nonoptimal"); + +#define USCHED_COUNTER(td) ((td->td_cpu == mycpu->gd_cpuid) ? ++usched_optimal : ++usched_steal) /* * Initialize the run queues at boot time. @@ -162,6 +170,18 @@ chooseproc(void) * signal other cpus in the system that may need to be woken up to service * the new 'user' process. * + * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target + * cpus in an attempt to keep the process on the current cpu at least for + * a little while to take advantage of locality of reference (e.g. fork/exec + * or short fork/exit). + * + * WARNING! a thread can be acquired by another cpu the moment it is put + * on the user scheduler's run queue AND we release the MP lock. Since we + * release the MP lock before switching out another cpu may begin stealing + * our current thread before we are completely switched out! The + * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the + * thread before stealing it. + * * The associated thread must NOT be scheduled. * The process must be runnable. * This must be called at splhigh(). @@ -198,6 +218,7 @@ setrunqueue(struct proc *p) if ((curprocmask & (1 << cpuid)) == 0) { curprocmask |= 1 << cpuid; p->p_flag |= P_CURPROC; + USCHED_COUNTER(p->p_thread); lwkt_acquire(p->p_thread); lwkt_schedule(p->p_thread); crit_exit(); @@ -238,7 +259,8 @@ setrunqueue(struct proc *p) * We use rdyprocmask to avoid unnecessarily waking up the scheduler * thread when it is already running. 
*/ - if ((mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0) { + if ((mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 && + (p->p_flag & P_PASSIVE_ACQ) == 0) { int count = runqcount; if (!mask) printf("PROC %d nocpu to schedule it on\n", p->p_pid); @@ -331,6 +353,7 @@ release_curproc(struct proc *p) KKASSERT(curprocmask & (1 << cpuid)); if ((np = chooseproc()) != NULL) { np->p_flag |= P_CURPROC; + USCHED_COUNTER(np->p_thread); lwkt_acquire(np->p_thread); lwkt_schedule(np->p_thread); } else { @@ -394,6 +417,7 @@ acquire_curproc(struct proc *p) KKASSERT((np->p_flag & P_CP_RELEASED) == 0); if (test_resched(p, np)) { np->p_flag |= P_CURPROC; + USCHED_COUNTER(np->p_thread); lwkt_acquire(np->p_thread); lwkt_schedule(np->p_thread); } else { @@ -435,8 +459,10 @@ uio_yield(void) lwkt_switch(); if (p) { + p->p_flag |= P_PASSIVE_ACQ; acquire_curproc(p); release_curproc(p); + p->p_flag &= ~P_PASSIVE_ACQ; } } @@ -472,6 +498,7 @@ sched_thread(void *dummy) if ((curprocmask & cpumask) == 0 && (np = chooseproc()) != NULL) { curprocmask |= cpumask; np->p_flag |= P_CURPROC; + USCHED_COUNTER(np->p_thread); lwkt_acquire(np->p_thread); lwkt_schedule(np->p_thread); } diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index b55adad83b..66a3ff5ade 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -37,7 +37,7 @@ * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $ - * $DragonFly: src/sys/kern/kern_synch.c,v 1.17 2003/07/11 01:23:24 dillon Exp $ + * $DragonFly: src/sys/kern/kern_synch.c,v 1.18 2003/07/11 17:42:10 dillon Exp $ */ #include "opt_ktrace.h" @@ -175,12 +175,16 @@ roundrobin(void *arg) timeout(roundrobin, NULL, sched_quantum); } +#ifdef SMP + void resched_cpus(u_int32_t mask) { lwkt_send_ipiq_mask(mask, roundrobin_remote, NULL); } +#endif + /* * Constants for digital decay and forget: * 90% of (p_estcpu) usage in 5 * loadav time diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index de55930c19..1cff149ac6 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -28,7 +28,7 @@ * to use a critical section to avoid problems. Foreign thread * scheduling is queued via (async) IPIs. * - * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.20 2003/07/11 01:23:24 dillon Exp $ + * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.21 2003/07/11 17:42:10 dillon Exp $ */ #include @@ -170,7 +170,7 @@ lwkt_alloc_thread(struct thread *td) if (mycpu->gd_tdfreecount > 0) { --mycpu->gd_tdfreecount; td = TAILQ_FIRST(&mycpu->gd_tdfreeq); - KASSERT(td != NULL && (td->td_flags & TDF_EXITED), + KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0, ("lwkt_alloc_thread: unexpected NULL or corrupted td")); TAILQ_REMOVE(&mycpu->gd_tdfreeq, td, td_threadq); crit_exit(); @@ -205,7 +205,7 @@ lwkt_init_thread(thread_t td, void *stack, int flags, struct globaldata *gd) td->td_flags |= flags; td->td_gd = gd; td->td_pri = TDPRI_CRIT; - td->td_cpu = gd->gd_cpuid; /* YYY don't need this if have td_gd */ + td->td_cpu = gd->gd_cpuid; /* YYY don't really need this if have td_gd */ pmap_init_thread(td); crit_enter(); TAILQ_INSERT_TAIL(&mycpu->gd_tdallq, td, td_allq); @@ -247,7 +247,7 @@ lwkt_free_thread(thread_t td) { struct globaldata *gd = mycpu; - KASSERT(td->td_flags & TDF_EXITED, + KASSERT((td->td_flags & TDF_RUNNING) == 0, ("lwkt_free_thread: did not exit! %p", td)); crit_enter(); @@ -735,6 +735,14 @@ lwkt_schedule(thread_t td) crit_exit(); } +/* + * Managed acquisition. 
This code assumes that the MP lock is held for + * the tdallq operation and that the thread has been descheduled from its + * original cpu. We also have to wait for the thread to be entirely switched + * out on its original cpu (this is usually fast enough that we never loop) + * since the LWKT system does not have to hold the MP lock while switching + * and the target may have released it before switching. + */ void lwkt_acquire(thread_t td) { @@ -743,6 +751,8 @@ lwkt_acquire(thread_t td) gd = td->td_gd; KKASSERT((td->td_flags & TDF_RUNQ) == 0); + while (td->td_flags & TDF_RUNNING) /* XXX spin */ + ; if (gd != mycpu) { ocpu = td->td_cpu; crit_enter(); diff --git a/sys/platform/pc32/apic/mpapic.c b/sys/platform/pc32/apic/mpapic.c index a48c8be109..20432a3e89 100644 --- a/sys/platform/pc32/apic/mpapic.c +++ b/sys/platform/pc32/apic/mpapic.c @@ -23,7 +23,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $ - * $DragonFly: src/sys/platform/pc32/apic/mpapic.c,v 1.5 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/apic/mpapic.c,v 1.6 2003/07/11 17:42:08 dillon Exp $ */ #include @@ -72,18 +72,15 @@ apic_initialize(void) temp |= 0x00010400; /* masked, edge trigger, active hi */ lapic.lvt_lint1 = temp; - /* set the Task Priority Register as needed */ + /* + * Set the Task Priority Register as needed. At the moment allow + * interrupts on all cpus (the APs will remain CLId until they are + * ready to deal). We could disable all but IPIs by setting + * temp |= TPR_IPI_ONLY for cpu != 0. + */ temp = lapic.tpr; temp &= ~APIC_TPR_PRIO; /* clear priority field */ - /* - * Leave the BSP and TPR 0 during boot so it gets all the interrupts, - * set APs at TPR 0xF0 at boot so they get no ints. - */ -#if 0 - if (mycpu->gd_cpuid != 0) - temp |= TPR_IPI_ONLY; /* disable INTs on this cpu */ -#endif lapic.tpr = temp; /* enable the local APIC */ diff --git a/sys/platform/pc32/i386/exception.s b/sys/platform/pc32/i386/exception.s index 1e4308874f..0fe0d47fbd 100644 --- a/sys/platform/pc32/i386/exception.s +++ b/sys/platform/pc32/i386/exception.s @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/exception.s,v 1.65.2.3 2001/08/15 01:23:49 peter Exp $ - * $DragonFly: src/sys/platform/pc32/i386/exception.s,v 1.12 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/exception.s,v 1.13 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -306,16 +306,17 @@ IDTVEC(int0x80_syscall) /* * This function is what cpu_heavy_restore jumps to after a new process - * is created. We are in a critical section in order to prevent - * cpu_heavy_restore from being interrupted (especially since it stores - * its context in a static place!). + * is created. The LWKT subsystem switches while holding a critical + * section and we maintain that abstraction here (e.g. because + * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of + * it before calling the initial function (typically fork_return()) and/or + * returning to user mode. * - * The MP lock is held on entry, but for processes fork_return (esi) + * The MP lock is held on entry, but for processes fork_return(esi) * releases it. 'doreti' always runs without the MP lock. * - * We need to be careful to hold interrupts disabled through until - * doreti iret's YYY this is due to the PCB_ storage in the heavy switcher, - * fixme! + * I'm not sure the cli is necessary but I am not taking any chances in + * regards to the init code. 
*/ ENTRY(fork_trampoline) cli @@ -334,6 +335,7 @@ ENTRY(fork_trampoline) addl $4,%esp /* cut from syscall */ + sti call spl0 call splz @@ -346,6 +348,7 @@ ENTRY(fork_trampoline) pushl $pmsg4 call panic pmsg4: .asciz "fork_trampoline mpcount %d after calling %p" + .p2align 2 1: #endif /* diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 15d0205b9c..ddc73e2b32 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/platform/pc32/i386/genassym.c,v 1.23 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/genassym.c,v 1.24 2003/07/11 17:42:08 dillon Exp $ */ #include "opt_user_ldt.h" @@ -92,7 +92,7 @@ ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); -ASSYM(TDF_EXITED, TDF_EXITED); +ASSYM(TDF_RUNNING, TDF_RUNNING); #ifdef SMP ASSYM(MP_FREE_LOCK, MP_FREE_LOCK); #endif diff --git a/sys/platform/pc32/i386/machdep.c b/sys/platform/pc32/i386/machdep.c index 57016d02bf..001c19f5c0 100644 --- a/sys/platform/pc32/i386/machdep.c +++ b/sys/platform/pc32/i386/machdep.c @@ -36,7 +36,7 @@ * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ - * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.25 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.26 2003/07/11 17:42:08 dillon Exp $ */ #include "apm.h" @@ -1914,6 +1914,7 @@ init386(int first) proc0.p_addr = (void *)thread0.td_kstack; proc0.p_thread = &thread0; proc0.p_flag |= P_CP_RELEASED; /* early set. See also init_main.c */ + thread0.td_flags |= TDF_RUNNING; thread0.td_proc = &proc0; thread0.td_switch = cpu_heavy_switch; /* YYY eventually LWKT */ safepri = thread0.td_cpl = SWI_MASK | HWI_MASK; @@ -2052,11 +2053,6 @@ init386(int first) /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = (int)IdlePTD; /* should already be setup */ -#ifdef SMP -#if 0 - thread0.td_pcb->pcb_mpnest = 1; -#endif -#endif thread0.td_pcb->pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } diff --git a/sys/platform/pc32/i386/mpapic.c b/sys/platform/pc32/i386/mpapic.c index 507f386e30..53ea4657d7 100644 --- a/sys/platform/pc32/i386/mpapic.c +++ b/sys/platform/pc32/i386/mpapic.c @@ -23,7 +23,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/mpapic.c,v 1.37.2.7 2003/01/25 02:31:47 peter Exp $ - * $DragonFly: src/sys/platform/pc32/i386/Attic/mpapic.c,v 1.5 2003/07/08 06:27:26 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/Attic/mpapic.c,v 1.6 2003/07/11 17:42:08 dillon Exp $ */ #include @@ -72,18 +72,15 @@ apic_initialize(void) temp |= 0x00010400; /* masked, edge trigger, active hi */ lapic.lvt_lint1 = temp; - /* set the Task Priority Register as needed */ + /* + * Set the Task Priority Register as needed. At the moment allow + * interrupts on all cpus (the APs will remain CLId until they are + * ready to deal). We could disable all but IPIs by setting + * temp |= TPR_IPI_ONLY for cpu != 0. + */ temp = lapic.tpr; temp &= ~APIC_TPR_PRIO; /* clear priority field */ - /* - * Leave the BSP and TPR 0 during boot so it gets all the interrupts, - * set APs at TPR 0xF0 at boot so they get no ints. 
- */ -#if 0 - if (mycpu->gd_cpuid != 0) - temp |= TPR_IPI_ONLY; /* disable INTs on this cpu */ -#endif lapic.tpr = temp; /* enable the local APIC */ diff --git a/sys/platform/pc32/i386/mpboot.s b/sys/platform/pc32/i386/mpboot.s index 9c85e26b92..9d8092b830 100644 --- a/sys/platform/pc32/i386/mpboot.s +++ b/sys/platform/pc32/i386/mpboot.s @@ -32,7 +32,7 @@ * multiprocessor systems. * * $FreeBSD: src/sys/i386/i386/mpboot.s,v 1.13.2.3 2000/09/07 01:18:26 tegge Exp $ - * $DragonFly: src/sys/platform/pc32/i386/mpboot.s,v 1.4 2003/07/06 21:23:48 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/mpboot.s,v 1.5 2003/07/11 17:42:08 dillon Exp $ */ #include /* miscellaneous asm macros */ @@ -118,12 +118,14 @@ mp_begin: /* now running relocated at KERNBASE */ /* * Execute the context restore function for the idlethread which * has conveniently been set as curthread. Remember, %eax must - * contain the target thread. Or BSP/AP synchronization occurs - * in ap_init(). We do not need to mess with the BGL for this - * because LWKT threads are self-contained on each cpu (or, at least, - * the idlethread is!). + * contain the target thread and %ebx must contain the originating + * thread (which we just set the same since we have no originating + * thread). BSP/AP synchronization occurs in ap_init(). We do + * not need to mess with the BGL for this because LWKT threads are + * self-contained on each cpu (or, at least, the idlethread is!). */ movl PCPU(curthread),%eax + movl %eax,%ebx movl TD_SP(%eax),%esp ret diff --git a/sys/platform/pc32/i386/swtch.s b/sys/platform/pc32/i386/swtch.s index 9a1166be92..193c3579d4 100644 --- a/sys/platform/pc32/i386/swtch.s +++ b/sys/platform/pc32/i386/swtch.s @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.89.2.10 2003/01/23 03:36:24 ps Exp $ - * $DragonFly: src/sys/platform/pc32/i386/swtch.s,v 1.23 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/swtch.s,v 1.24 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -81,24 +81,18 @@ tlb_flush_count: .long 0 * is normally called via the thread->td_switch function, and will * only be called when the current thread is a heavy weight process. * + * Some instructions have been reordered to reduce pipeline stalls. + * * YYY disable interrupts once giant is removed. */ ENTRY(cpu_heavy_switch) - movl PCPU(curthread),%ecx - movl TD_PROC(%ecx),%ecx - - cli - movl P_VMSPACE(%ecx), %edx - movl PCPU(cpuid), %eax - MPLOCKED btrl %eax, VM_PMAP+PM_ACTIVE(%edx) - /* * Save general regs */ - movl P_THREAD(%ecx),%edx - movl TD_PCB(%edx),%edx - movl (%esp),%eax /* Hardware registers */ - movl %eax,PCB_EIP(%edx) + movl PCPU(curthread),%ecx + movl (%esp),%eax /* (reorder optimization) */ + movl TD_PCB(%ecx),%edx /* EDX = PCB */ + movl %eax,PCB_EIP(%edx) /* return PC may be modified */ movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) @@ -106,16 +100,22 @@ ENTRY(cpu_heavy_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) + movl %ecx,%ebx /* EBX = curthread */ + movl TD_PROC(%ecx),%ecx + movl PCPU(cpuid), %eax + movl P_VMSPACE(%ecx), %ecx /* ECX = vmspace */ + MPLOCKED btrl %eax, VM_PMAP+PM_ACTIVE(%ecx) + /* * Push the LWKT switch restore function, which resumes a heavy * weight process. Note that the LWKT switcher is based on * TD_SP, while the heavy weight process switcher is based on - * PCB_ESP. TD_SP is usually one pointer pushed relative to - * PCB_ESP. + * PCB_ESP. TD_SP is usually two ints pushed relative to + * PCB_ESP. 
We push the flags for later restore by cpu_heavy_restore. */ - movl P_THREAD(%ecx),%eax + pushfl pushl $cpu_heavy_restore - movl %esp,TD_SP(%eax) + movl %esp,TD_SP(%ebx) /* * Save debug regs if necessary @@ -144,24 +144,26 @@ ENTRY(cpu_heavy_switch) * npxsave will NULL out PCPU(npxthread). */ #if NNPX > 0 - movl P_THREAD(%ecx),%ecx - cmpl %ecx,PCPU(npxthread) + cmpl %ebx,PCPU(npxthread) jne 1f - addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ + addl $PCB_SAVEFPU,%edx pushl %edx call npxsave /* do it in a big C function */ - addl $4,%esp + addl $4,%esp /* EAX, ECX, EDX trashed */ 1: - /* %ecx,%edx trashed */ #endif /* NNPX > 0 */ /* * Switch to the next thread, which was passed as an argument - * to cpu_heavy_switch(). Due to the switch-restore function we pushed, - * the argument is at 8(%esp). Set the current thread, load the - * stack pointer, and 'ret' into the switch-restore function. + * to cpu_heavy_switch(). Due to the eflags and switch-restore + * function we pushed, the argument is at 12(%esp). Set the current + * thread, load the stack pointer, and 'ret' into the switch-restore + * function. + * + * The switch restore function expects the new thread to be in %eax + * and the old one to be in %ebx. */ - movl 8(%esp),%eax + movl 12(%esp),%eax /* EAX = newtd, EBX = oldtd */ movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp ret @@ -187,33 +189,14 @@ ENTRY(cpu_exit_switch) je 1f movl %ecx,%cr3 1: - movl PCPU(curthread),%ecx + movl PCPU(curthread),%ebx /* - * Switch to the next thread. + * Switch to the next thread. RET into the restore function, which + * expects the new thread in EAX and the old in EBX. */ - cli movl 4(%esp),%eax movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp - - /* - * We are now the next thread, set the exited flag and wakeup - * any waiters. - */ - orl $TDF_EXITED,TD_FLAGS(%ecx) -#if 0 /* YYY MP lock may not be held by new target */ - pushl %eax - pushl %ecx /* wakeup(oldthread) */ - call wakeup - addl $4,%esp - popl %eax /* note: next thread expects curthread in %eax */ -#endif - - /* - * Restore the next thread's state and resume it. Note: the - * restore function assumes that the next thread's address is - * in %eax. - */ ret /* @@ -224,13 +207,15 @@ ENTRY(cpu_exit_switch) * off the thread stack and jumped to. * * This entry is only called if the thread was previously saved - * using cpu_heavy_switch() (the heavy weight process thread switcher). + * using cpu_heavy_switch() (the heavy weight process thread switcher), + * or when a new process is initially scheduled. The first thing we + * do is clear the TDF_RUNNING bit in the old thread and set it in the + * new thread. * * YYY theoretically we do not have to restore everything here, a lot * of this junk can wait until we return to usermode. But for now * we restore everything. * - * YYY STI/CLI sequencing. * YYY the PCB crap is really crap, it makes startup a bitch because * we can't switch away. * @@ -238,8 +223,8 @@ ENTRY(cpu_exit_switch) */ ENTRY(cpu_heavy_restore) - /* interrupts are disabled */ - movl TD_PCB(%eax),%edx + popfl + movl TD_PCB(%eax),%edx /* EDX = PCB */ movl TD_PROC(%eax),%ecx #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%ecx) @@ -254,9 +239,9 @@ ENTRY(cpu_heavy_restore) * safely test/reload %cr3 until after we have set the bit in the * pmap (remember, we do not hold the MP lock in the switch code). 
*/ - movl P_VMSPACE(%ecx), %ebx - movl PCPU(cpuid), %eax - MPLOCKED btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) + movl P_VMSPACE(%ecx), %ecx /* ECX = vmspace */ + movl PCPU(cpuid), %esi + MPLOCKED btsl %esi, VM_PMAP+PM_ACTIVE(%ecx) /* * Restore the MMU address space. If it is the same as the last @@ -264,16 +249,24 @@ ENTRY(cpu_heavy_restore) * YYY which naturally also means that the PM_ACTIVE bit had better * already have been set before we set it above, check? YYY */ - movl %cr3,%eax - movl PCB_CR3(%edx),%ebx - cmpl %eax,%ebx + movl %cr3,%esi + movl PCB_CR3(%edx),%ecx + cmpl %esi,%ecx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif - movl %ebx,%cr3 + movl %ecx,%cr3 4: + /* + * Clear TDF_RUNNING flag in old thread only after cleaning up + * %cr3. The target thread is already protected by being TDF_RUNQ + * so setting TDF_RUNNING isn't as big a deal. + */ + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) + /* * Deal with the PCB extension, restore the private tss */ @@ -385,7 +378,6 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti /* XXX */ ret CROSSJUMPTARGET(sw1a) @@ -464,6 +456,8 @@ ENTRY(savectx) * cpu_idle() LWKT only, after that cpu_lwkt_*() will be used for * switching. * + * Clear TDF_RUNNING in old thread only after we've cleaned up %cr3. + * * If we are an AP we have to call ap_init() before jumping to * cpu_idle(). ap_init() will synchronize with the BP and finish * setting up various ncpu-dependant globaldata fields. This may @@ -471,10 +465,13 @@ ENTRY(savectx) * cpus. */ ENTRY(cpu_idle_restore) + /* cli */ movl IdlePTD,%ecx movl $0,%ebp pushl $0 movl %ecx,%cr3 + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) #ifdef SMP cmpl $0,PCPU(cpuid) je 1f @@ -496,16 +493,18 @@ ENTRY(cpu_idle_restore) * we can release our critical section and enable interrupts early. */ ENTRY(cpu_kthread_restore) + sti movl IdlePTD,%ecx - movl TD_PCB(%eax),%ebx + movl TD_PCB(%eax),%edx movl $0,%ebp movl %ecx,%cr3 + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) subl $TDPRI_CRIT,TD_PRI(%eax) - sti - popl %edx /* kthread exit function */ - pushl PCB_EBX(%ebx) /* argument to ESI function */ - pushl %edx /* set exit func as return address */ - movl PCB_ESI(%ebx),%eax + popl %eax /* kthread exit function */ + pushl PCB_EBX(%edx) /* argument to ESI function */ + pushl %eax /* set exit func as return address */ + movl PCB_ESI(%edx),%eax jmp *%eax /* @@ -525,12 +524,15 @@ ENTRY(cpu_lwkt_switch) pushl %esi pushl %edi pushfl - movl PCPU(curthread),%ecx + movl PCPU(curthread),%ebx pushl $cpu_lwkt_restore - cli - movl %esp,TD_SP(%ecx) + movl %esp,TD_SP(%ebx) movl %eax,PCPU(curthread) movl TD_SP(%eax),%esp + + /* + * eax contains new thread, ebx contains old thread. 
+ */ ret /* @@ -549,11 +551,13 @@ ENTRY(cpu_lwkt_switch) */ ENTRY(cpu_lwkt_restore) movl IdlePTD,%ecx /* YYY borrow but beware desched/cpuchg/exit */ - movl %cr3,%eax - cmpl %ecx,%eax + movl %cr3,%edx + cmpl %ecx,%edx je 1f movl %ecx,%cr3 1: + andl $~TDF_RUNNING,TD_FLAGS(%ebx) + orl $TDF_RUNNING,TD_FLAGS(%eax) popfl popl %edi popl %esi diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index 5be0776168..51fea80877 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.21 2003/07/11 01:23:21 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.22 2003/07/11 17:42:08 dillon Exp $ */ /* @@ -156,6 +156,12 @@ SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); +static int fast_release; +SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW, + &fast_release, 0, "Passive Release was optimal"); +static int slow_release; +SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, + &slow_release, 0, "Passive Release was nonoptimal"); /* * USER->KERNEL transition. Do not transition us out of userland from the @@ -206,9 +212,11 @@ userexit(struct proc *p) * lwkt_maybe_switch() to deal with it. */ if (td->td_release) { + ++fast_release; td->td_release = NULL; KKASSERT(p->p_flag & P_CURPROC); } else { + ++slow_release; acquire_curproc(p); switch(p->p_rtprio.type) { case RTP_PRIO_IDLE: @@ -1368,7 +1376,9 @@ fork_return(p, frame) if (KTRPOINT(p->p_thread, KTR_SYSRET)) ktrsysret(p->p_tracep, SYS_fork, 0, 0); #endif + p->p_flag |= P_PASSIVE_ACQ; userexit(p); + p->p_flag &= ~P_PASSIVE_ACQ; #ifdef SMP KKASSERT(curthread->td_mpcount == 1); rel_mplock(); diff --git a/sys/platform/pc32/i386/vm_machdep.c b/sys/platform/pc32/i386/vm_machdep.c index 5349acea59..469825d295 100644 --- a/sys/platform/pc32/i386/vm_machdep.c +++ b/sys/platform/pc32/i386/vm_machdep.c @@ -39,7 +39,7 @@ * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $ - * $DragonFly: src/sys/platform/pc32/i386/vm_machdep.c,v 1.19 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/vm_machdep.c,v 1.20 2003/07/11 17:42:08 dillon Exp $ */ #include "npx.h" @@ -162,10 +162,10 @@ cpu_fork(p1, p2, flags) * common_tss.esp0 (kernel stack pointer on entry from user mode) * * pcb_esp must allocate an additional call-return pointer below - * the trap frame which will be restored by cpu_restore, and the - * thread's td_sp pointer must allocate an additonal call-return - * pointer below the pcb_esp call-return pointer to hold the LWKT - * restore function pointer. + * the trap frame which will be restored by cpu_restore from + * PCB_EIP, and the thread's td_sp pointer must allocate an + * additonal two worsd below the pcb_esp call-return pointer to + * hold the LWKT restore function pointer and eflags. * * The LWKT restore function pointer must be set to cpu_restore, * which is our standard heavy weight process switch-in function. 
@@ -188,6 +188,8 @@ cpu_fork(p1, p2, flags) pcb2->pcb_ebx = (int)p2; /* fork_trampoline argument */ pcb2->pcb_eip = (int)fork_trampoline; p2->p_thread->td_sp = (char *)(pcb2->pcb_esp - sizeof(void *)); + *(u_int32_t *)p2->p_thread->td_sp = PSL_USER; + p2->p_thread->td_sp -= sizeof(void *); *(void **)p2->p_thread->td_sp = (void *)cpu_heavy_restore; /* * pcb2->pcb_ldt: duplicated below, if necessary. diff --git a/sys/platform/vkernel/i386/genassym.c b/sys/platform/vkernel/i386/genassym.c index ffbe76866f..a86086713a 100644 --- a/sys/platform/vkernel/i386/genassym.c +++ b/sys/platform/vkernel/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/platform/vkernel/i386/genassym.c,v 1.23 2003/07/10 04:47:53 dillon Exp $ + * $DragonFly: src/sys/platform/vkernel/i386/genassym.c,v 1.24 2003/07/11 17:42:08 dillon Exp $ */ #include "opt_user_ldt.h" @@ -92,7 +92,7 @@ ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); -ASSYM(TDF_EXITED, TDF_EXITED); +ASSYM(TDF_RUNNING, TDF_RUNNING); #ifdef SMP ASSYM(MP_FREE_LOCK, MP_FREE_LOCK); #endif diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 47f8178501..94277a9b54 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -37,7 +37,7 @@ * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD: src/sys/sys/proc.h,v 1.99.2.9 2003/06/06 20:21:32 tegge Exp $ - * $DragonFly: src/sys/sys/proc.h,v 1.24 2003/07/11 01:23:24 dillon Exp $ + * $DragonFly: src/sys/sys/proc.h,v 1.25 2003/07/11 17:42:11 dillon Exp $ */ #ifndef _SYS_PROC_H_ @@ -282,6 +282,7 @@ struct proc { #define P_OLDMASK 0x2000000 /* need to restore mask before pause */ #define P_ALTSTACK 0x4000000 /* have alternate signal stack */ #define P_INEXEC 0x8000000 /* Process is in execve(). */ +#define P_PASSIVE_ACQ 0x10000000 /* Passive acquire cpu (see kern_switch) */ #ifdef _KERNEL @@ -412,8 +413,8 @@ int suser __P((struct thread *td)); int suser_proc __P((struct proc *p)); int suser_cred __P((struct ucred *cred, int flag)); void remrunqueue __P((struct proc *)); -void release_curproc __P((struct proc *)); -void acquire_curproc __P((struct proc *)); +void release_curproc __P((struct proc *curp)); +void acquire_curproc __P((struct proc *curp)); void cpu_heavy_switch __P((struct thread *)); void cpu_lwkt_switch __P((struct thread *)); void unsleep __P((struct thread *)); diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 45f6426daa..c3c5048fd7 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -4,7 +4,7 @@ * Implements the architecture independant portion of the LWKT * subsystem. * - * $DragonFly: src/sys/sys/thread.h,v 1.22 2003/07/11 01:23:24 dillon Exp $ + * $DragonFly: src/sys/sys/thread.h,v 1.23 2003/07/11 17:42:11 dillon Exp $ */ #ifndef _SYS_THREAD_H_ @@ -183,15 +183,16 @@ struct thread { }; /* - * Thread flags. Note that TDF_EXITED is set by the appropriate switchout - * code when a thread exits, after it has switched to another stack and - * cleaned up the MMU state. + * Thread flags. Note that TDF_RUNNING is cleared on the old thread after + * we switch to the new one, which is necessary because LWKTs don't need + * to hold the BGL. This flag is used by the exit code and the managed + * thread migration code. 
* * LWKT threads stay on their (per-cpu) run queue while running, not to * be confused with user processes which are removed from the user scheduling * run queue while actually running. */ -#define TDF_EXITED 0x0001 /* thread finished exiting */ +#define TDF_RUNNING 0x0001 /* thread still active */ #define TDF_RUNQ 0x0002 /* on an LWKT run queue */ #define TDF_PREEMPT_LOCK 0x0004 /* I have been preempted */ #define TDF_PREEMPT_DONE 0x0008 /* acknowledge preemption complete */ -- 2.41.0
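
The race described in the commit message is closed by the TDF_RUNNING handshake: the switch-in code (cpu_*_restore) clears TDF_RUNNING on the old thread only after the new context, including %cr3, is live, and lwkt_acquire() on another cpu spins until the bit clears before it migrates the thread. Below is a minimal C sketch of that handshake, assuming C11 atomics and a simplified thread type; the real logic is the assembly in swtch.s plus lwkt_acquire() in lwkt_thread.c.

#include <stdatomic.h>

#define TDF_RUNNING 0x0001              /* thread still active on its original cpu */

struct thread {
        atomic_uint td_flags;
        /* ... saved registers, td_sp, etc. ... */
};

/*
 * Runs on the cpu that is switching in 'newtd'.  The old thread is not
 * marked "switched out" until its context is fully saved and the new
 * address space has been loaded, mirroring the swtch.s ordering.
 */
static void
switch_in(struct thread *oldtd, struct thread *newtd)
{
        /* ... restore newtd's registers and address space here ... */
        atomic_fetch_and(&oldtd->td_flags, ~(unsigned int)TDF_RUNNING);
        atomic_fetch_or(&newtd->td_flags, TDF_RUNNING);
}

/*
 * Runs on a different cpu that pulled the process off the userland
 * scheduler; it may not take over the thread's stack until the
 * originating cpu has completely switched it out.
 */
static void
acquire_thread(struct thread *td)
{
        while (atomic_load(&td->td_flags) & TDF_RUNNING)
                ;       /* XXX spin; normally only a few cycles */
        /* now safe to reschedule the thread on this cpu */
}

The spin is expected to be very short because the window is only the tail of the originating cpu's switch path, which is why the patch uses a busy-wait rather than a sleep/wakeup.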
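
The eflags handling mentioned in the commit message shows up as a two-word frame below pcb_esp: cpu_heavy_switch pushes eflags and then the restore function, the LWKT switcher 'ret's into that function, cpu_heavy_restore pops the flags with popfl, and cpu_fork() builds the same frame by hand for a newly forked process. A sketch of that layout in C, using stand-in names (fake_thread, PSL_USER_IMAGE) rather than the kernel's real types and symbols, and assuming the 32-bit i386 word size:

#include <stdint.h>

/* Stand-in for the PSL_USER eflags image; the kernel takes the real value
   from its machine headers. */
#define PSL_USER_IMAGE  0x00000202u

struct fake_thread {
        char *td_sp;                    /* saved kernel stack pointer */
};

/*
 * Frame consumed by the LWKT switcher:
 *
 *      pcb_esp     -> call-return pointer (trap frame above)
 *      td_sp + 4   -> saved eflags       (popfl in cpu_heavy_restore)
 *      td_sp       -> restore function   ('ret' in the switcher jumps here)
 */
static void
build_switch_frame(struct fake_thread *td, char *pcb_esp,
                   void (*restore_func)(void))
{
        td->td_sp = pcb_esp - sizeof(void *);
        *(uint32_t *)td->td_sp = PSL_USER_IMAGE;

        td->td_sp -= sizeof(void *);
        *(void **)td->td_sp = (void *)restore_func;
}

This extra eflags word is also why the argument to cpu_heavy_switch moves from 8(%esp) to 12(%esp): it now sits between the caller's frame and the pushed restore function.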
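
The fork/exec locality optimization is the P_PASSIVE_ACQ flag: fork_return() (and uio_yield()) set it around userexit(), and setrunqueue() declines to wake other cpus for a passively released process, so the child statistically stays on the cpu that created it. A simplified sketch of that decision, with stub helpers standing in for the real run-queue and IPI paths (only the P_PASSIVE_ACQ value comes from the patch itself):

#define P_PASSIVE_ACQ   0x10000000u     /* passive acquire, as in sys/sys/proc.h */

struct proc {
        unsigned int p_flag;
        /* ... */
};

/* Stub helpers; the real code manipulates curprocmask/rdyprocmask and
   sends IPIs to idle per-cpu scheduler threads. */
static void enqueue_user_runq(struct proc *p) { (void)p; }
static void wakeup_idle_remote_cpus(void) { }

static void
setrunqueue_sketch(struct proc *p)
{
        enqueue_user_runq(p);
        /*
         * Only poke other cpus when this is not a passive release; a
         * passively released process stays queued where it is and is
         * usually re-acquired by the same cpu (cache locality).
         */
        if ((p->p_flag & P_PASSIVE_ACQ) == 0)
                wakeup_idle_remote_cpus();
}

static void
fork_return_sketch(struct proc *p)
{
        p->p_flag |= P_PASSIVE_ACQ;
        /* userexit() runs here and may release/re-queue the process */
        p->p_flag &= ~P_PASSIVE_ACQ;
}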