From 263541dbed4ce4d88ed87b418356a3c7cc501e3b Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 29 Apr 2004 17:25:03 +0000
Subject: [PATCH] Rewrite the optimized memcpy/bcopy/bzero support subsystem.
 Rip out the old FreeBSD code almost entirely.

* Add support for stacked ONFAULT routines, allowing copyin and copyout
  to call the general memcpy entry point instead of rolling their own.

* Split memcpy/bcopy and bzero into their own files.

* Add support for XMM (128 bit) and MMX (64 bit) media instruction copies.

* Rewrite the integer code.  Also note that most of the previous integer
  and FP special case support had already been ripped out of DragonFly long
  ago, in the sense that the assembly was no longer being referenced.  It
  doesn't make sense to have a dozen different zeroing/copying routines, so
  focus on the ones that work well with recent (last ~5 years) cpus.

* Rewrite the FP state handling code.  Instead of restoring the FP state,
  let it hang, which allows userland to make multiple syscalls and/or the
  system to make multiple bcopy()/memcpy() calls without having to
  save/restore the FP state on each call.  Userland will take a fault when
  it needs the FP again.  Note that FP optimized copies only occur for
  block sizes >= 2048 bytes, so this is not something that userland, or the
  kernel, will trip up on every time it tries to do a bcopy().

* LWKT threads need to be able to save the FP state; add the simple
  conditional and the five lines of assembly required to do that.

AMD Athlon notes: 64 bit media instructions will get us 90% of the way
there.  It is possible to squeeze out slightly more memory bandwidth from
the 128 bit XMM instructions (SSE2).  While not implemented in this commit,
there are two additional features that can be used: prefetching and
non-temporal writes.  Prefetching is a 3dNOW instruction and can squeeze
out significant additional performance if you fetch ~128 bytes ahead of the
game, but I believe it is AMD-only.  Non-temporal writes can double
UNCACHED memory bandwidth, but they have a horrible effect on L1/L2
performance and you can't mix non-temporal writes with normal writes
without completely destroying memory performance (e.g. multiple GB/s ->
less than 100 MBytes/sec).  Neither prefetching nor non-temporal writes are
implemented in this commit.
---
 sys/conf/files.i386 | 4 +-
 sys/i386/i386/bcopy.s | 417 +++++++++++
 sys/i386/i386/bzero.s | 115 +++
 sys/i386/i386/genassym.c | 3 +-
 sys/i386/i386/globals.s | 5 +-
 sys/i386/i386/support.s | 975 +++----------------------
 sys/i386/i386/swtch.s | 29 +-
 sys/i386/include/globaldata.h | 3 +-
 sys/i386/include/md_var.h | 15 +-
 sys/i386/isa/npx.c | 75 +-
 sys/platform/pc32/i386/bcopy.s | 417 +++++++++++
 sys/platform/pc32/i386/bzero.s | 115 +++
 sys/platform/pc32/i386/genassym.c | 3 +-
 sys/platform/pc32/i386/globals.s | 5 +-
 sys/platform/pc32/i386/support.s | 975 +++----------------------
 sys/platform/pc32/i386/swtch.s | 29 +-
 sys/platform/pc32/include/globaldata.h | 3 +-
 sys/platform/pc32/include/md_var.h | 15 +-
 sys/platform/pc32/isa/npx.c | 75 +-
 sys/platform/vkernel/i386/genassym.c | 3 +-
 20 files changed, 1405 insertions(+), 1876 deletions(-)
 create mode 100644 sys/i386/i386/bcopy.s
 create mode 100644 sys/i386/i386/bzero.s
 create mode 100644 sys/platform/pc32/i386/bcopy.s
 create mode 100644 sys/platform/pc32/i386/bzero.s
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 9ef8fae21d..9575785e1f 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -2,7 +2,7 @@ # files marked standard are always included.
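[Editor's illustration, not part of this patch.]  For readers unfamiliar with the two features mentioned in the Athlon notes above, here is a rough user-space sketch of prefetching plus non-temporal stores written with SSE2 intrinsics rather than the kernel's hand-written assembly.  The function name, the 128-byte stride, and the alignment assumptions are all mine:

	#include <emmintrin.h>	/* SSE2: _mm_load_si128, _mm_stream_si128, _mm_sfence */
	#include <stddef.h>

	/*
	 * Illustration only.  Copy 128 bytes per iteration with SSE2 loads
	 * and non-temporal (movntdq) stores, software-prefetching roughly
	 * one iteration ahead.  For brevity this assumes src/dst are
	 * 16-byte aligned and len is a multiple of 128.
	 */
	static void
	copy_nontemporal(void *dst, const void *src, size_t len)
	{
		const char *s = src;
		char *d = dst;

		while (len >= 128) {
			_mm_prefetch(s + 128, _MM_HINT_NTA);	/* fetch ahead of the game */
			for (int i = 0; i < 128; i += 16) {
				__m128i x = _mm_load_si128((const __m128i *)(s + i));
				_mm_stream_si128((__m128i *)(d + i), x);  /* store bypasses L1/L2 */
			}
			s += 128;
			d += 128;
			len -= 128;
		}
		_mm_sfence();	/* order the streaming stores before later normal stores */
	}

The streaming stores are exactly the trade-off described above: uncached bandwidth roughly doubles, but because they bypass the caches they cannot be mixed with normal cached writes without destroying performance.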
# # $FreeBSD: src/sys/conf/files.i386,v 1.307.2.38 2003/01/02 20:41:33 kan Exp $ -# $DragonFly: src/sys/conf/Attic/files.i386,v 1.23 2004/04/29 12:11:15 joerg Exp $ +# $DragonFly: src/sys/conf/Attic/files.i386,v 1.24 2004/04/29 17:25:03 dillon Exp $ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and @@ -221,6 +221,8 @@ i386/i386/pnpbios.c optional pnpbios i386/i386/procfs_machdep.c standard i386/i386/spinlock.s standard i386/i386/support.s standard +i386/i386/bcopy.s standard +i386/i386/bzero.s standard i386/i386/swtch.s standard i386/i386/sys_machdep.c standard i386/i386/trap.c standard diff --git a/sys/i386/i386/bcopy.s b/sys/i386/i386/bcopy.s new file mode 100644 index 0000000000..99c2c10c87 --- /dev/null +++ b/sys/i386/i386/bcopy.s @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2003 Matthew Dillon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/i386/i386/Attic/bcopy.s,v 1.1 2004/04/29 17:24:58 dillon Exp $ + */ +/* + * bcopy(source:%esi, target:%edi, count:%ecx) + * + * note: esi, edi, eax, ecx, and edx may be destroyed + */ + +#include "use_npx.h" + +#include +#include +#include +#include + +#include "assym.s" + + .text + + /* + * If memcpy/bcopy is called as part of a copyin or copyout, the + * on-fault routine is set up to do a 'ret'. We hve to restore + * %ebx and return to the copyin/copyout fault handler. + */ +generic_onfault: + popl %ebx + addl $4,%esp /* skip normal return vector */ + ret /* return to copyin/copyout fault handler */ + + /* + * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY + * + * Reasonably optimal on all modern machines. 
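[Editor's illustration, not part of this patch.]  The forward copy that follows (shared by asm_generic_memcpy and asm_generic_bcopy) moves 32 bytes per iteration with interleaved loads and stores and lets "rep movsb" mop up the sub-32-byte tail.  A plain C rendering of the same idea; the helper name and the 4-byte alignment assumption are mine:

	#include <stddef.h>
	#include <stdint.h>

	/* Illustrative C equivalent of the forward integer copy below. */
	static void
	copy_forward_32(void *dst, const void *src, size_t len)
	{
		const uint32_t *s = src;
		uint32_t *d = dst;

		while (len >= 32) {
			/* the assembly interleaves these loads and stores to
			   keep the pipeline busy; C leaves that to the compiler */
			d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
			d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
			s += 8;
			d += 8;
			len -= 32;
		}
		const char *sb = (const char *)s;
		char *db = (char *)d;
		while (len--)
			*db++ = *sb++;		/* byte tail ("rep movsb") */
	}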
+ */ + + SUPERALIGN_TEXT +ENTRY(asm_generic_memcpy) /* memcpy() entry point use optimal copy */ + pushl %ebx + pushl $generic_onfault + jmp 2f + + SUPERALIGN_TEXT +ENTRY(asm_generic_bcopy) + pushl %ebx + pushl $generic_onfault + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 2f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi + jmp 2f + + SUPERALIGN_TEXT +1: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%edx + movl %eax,(%edi) + movl 12(%esi),%eax + movl %ebx,4(%edi) + movl 16(%esi),%ebx + movl %edx,8(%edi) + movl 20(%esi),%edx + movl %eax,12(%edi) + movl 24(%esi),%eax + movl %ebx,16(%edi) + movl 28(%esi),%ebx + movl %edx,20(%edi) + movl %eax,24(%edi) + addl $32,%esi + movl %ebx,28(%edi) + addl $32,%edi +2: + subl $32,%ecx + jae 1b + addl $32,%ecx + jz 3f + cld + rep + movsb +3: + addl $4,%esp + popl %ebx + ret + + /* + * GENERIC_BCOPY() - BACKWARDS COPY + */ + SUPERALIGN_TEXT +10: + addl %ecx,%edi + jmp 12f + + SUPERALIGN_TEXT +11: + movl -4(%esi),%eax + movl -8(%esi),%ebx + movl -12(%esi),%edx + movl %eax,-4(%edi) + movl -16(%esi),%eax + movl %ebx,-8(%edi) + movl -20(%esi),%ebx + movl %edx,-12(%edi) + movl -24(%esi),%edx + movl %eax,-16(%edi) + movl -28(%esi),%eax + movl %ebx,-20(%edi) + movl -32(%esi),%ebx + movl %edx,-24(%edi) + movl %eax,-28(%edi) + subl $32,%esi + movl %ebx,-32(%edi) + subl $32,%edi +12: + subl $32,%ecx + jae 11b + addl $32,%ecx + jz 13f + decl %esi + decl %edi + std + rep + movsb + cld +13: + addl $4,%esp + popl %ebx + ret + + /* + * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY + * + * Reasonably optimal on all modern machines with MMX or SSE2. + * XXX But very messy, we need a better way to use fp in the kernel. + * + * note: esi, edi, eax, ecx, and edx may be destroyed + * + * In order for the kernel to be able to use the FPU: + * + * (1) The kernel may not already be using the fpu + * (2) If the fpu is owned by the application, we must save + * and restore its state. + * (3) Our thread begins using the FPU, we clts (clear CR0_TS) + * to prevent an FP fault, fninit, and set our thread as + * the npxthread. + * + * (4) While we are using the FP unit, an interrupt may come + * along and preempt us, causing our FP state to be saved. + * We will fault/restore upon resumption. + * + * (5) To cleanup we have to restore the original application's + * FP state, which means restoring any saved state, CR0_TS, + * and npxthread settings as appropriate. + * + * However, as an optimization we can instead copy the + * saved state to the PCB, clear npxthread, and set CR0_TS, + * which will allow additional bcopy's to use the FP unit + * at virtually no cost and cause the application to trap + * when it tries to use the FP unit again. + * + * So, why choose one over another? Well, having to save + * and restore the FP state eats a lot of cycles. Many + * kernel operations actually wind up looping on a bcopy + * (e.g. the PIPE/SFBUF case), or looping in userland without + * any intervening FP ops. Our minimum copy size check + * (2048) avoids the use of FP for the smaller copies that + * are more likely to be intermingled with user FP ops, so + * it is my belief that saving the user FP state to the PCB + * is a better solution then restoring it. + * + * NOTE: fxsave requires a 16-byte aligned address + * + * MMX+XMM (SSE2): Typical on Athlons, later P4s. 128 bit media insn. + * MMX: Typical on XPs and P3s. 64 bit media insn. 
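[Editor's illustration, not part of this patch.]  The strategy described above (only borrow the FPU for copies of 2048 bytes or more, save any live user state on entry, and on exit park that state in the PCB and set CR0_TS instead of reloading it) can be summarized in ordinary C.  This is a toy model under stated assumptions, not the kernel implementation; the real work is done by the MMX_SAVE_BLOCK/MMX_RESTORE_BLOCK assembly macros that follow, and only the names npxthread, kernel_fpu_lock, and pcb_savefpu are taken from the patch:

	#include <stdbool.h>
	#include <stddef.h>
	#include <string.h>

	/* 512-byte fxsave image; 16-byte alignment matters in the real kernel */
	struct fpu_state { unsigned char regs[512]; };

	/* stand-in for the interesting per-cpu / per-thread fields */
	static struct {
		void	*npxthread;		/* thread owning the FPU, or NULL */
		int	 kernel_fpu_lock;	/* one kernel FPU borrower per cpu */
		struct fpu_state pcb_savefpu;	/* stand-in for curthread's PCB area */
		bool	 cr0_ts;		/* pending "device not available" trap */
	} cpu;

	static bool
	kernel_fpu_begin(size_t len, struct fpu_state *tmp, bool *had_owner)
	{
		if (len < 2048)			/* too small to amortize the setup */
			return false;
		if (cpu.kernel_fpu_lock)	/* FPU already borrowed on this cpu */
			return false;
		cpu.kernel_fpu_lock = 1;
		*had_owner = (cpu.npxthread != NULL);
		if (*had_owner)
			memset(tmp, 0, sizeof(*tmp));	/* models "fxsave" of the user state */
		cpu.npxthread = &cpu;		/* the kernel owns the FPU now */
		cpu.cr0_ts = false;		/* "clts": no FP fault during the copy */
		/* the real code also does fninit here */
		return true;
	}

	static void
	kernel_fpu_end(struct fpu_state *tmp, bool had_owner)
	{
		if (had_owner) {
			/* park the saved user state in the PCB rather than reload it */
			cpu.pcb_savefpu = *tmp;
			cpu.cr0_ts = true;	/* user faults and reloads on its next FP op */
		}
		cpu.npxthread = NULL;
		cpu.kernel_fpu_lock = 0;
	}

The design choice is the one argued in the comment: reloading the user state on every copy would eat the savings, while parking it in the PCB costs one 512-byte copy and a later trap only if the application actually touches the FPU again.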
+ */ + +#define MMX_SAVE_BLOCK(missfunc) \ + cmpl $2048,%ecx ; \ + jb missfunc ; \ + btsl $1,PCPU(kernel_fpu_lock) ; \ + jc missfunc ; \ + pushl %ebp ; \ + movl %esp, %ebp ; \ + smsw %ax ; \ + movl PCPU(npxthread),%edx ; \ + testl %edx,%edx ; \ + jz 100f ; \ + clts ; \ + subl $512,%esp ; \ + andl $0xfffffff0,%esp ; \ + fxsave 0(%esp) ; \ +100: ; \ + pushl %eax ; \ + pushl %edx ; \ + movl PCPU(curthread),%edx ; \ + movl %edx,PCPU(npxthread) ; \ + clts ; \ + fninit ; \ + pushl $mmx_onfault + + +#define MMX_RESTORE_BLOCK \ + addl $4,%esp ; \ + MMX_RESTORE_BLOCK2 + +#define MMX_RESTORE_BLOCK2 \ + popl %edx ; \ + popl %eax ; \ + testl %edx,%edx ; \ + jz 100f ; \ + movl %esp,%esi ; \ + movl PCPU(curthread),%edi ; \ + movl TD_PCB(%edi),%edi ; \ + addl $PCB_SAVEFPU,%edi ; \ + movl $512>>2,%ecx ; \ + cld ; \ + rep ; \ + movsl ; \ + orb $CR0_TS,%al ; \ +100: ; \ + movl %ebp,%esp ; \ + popl %ebp ; \ + movl $0,PCPU(npxthread) ; \ + lmsw %ax ; \ + movl $0,PCPU(kernel_fpu_lock) + + /* + * xmm/mmx_onfault routine. Restore the fpu state, skip the normal + * return vector, and return to the caller's on-fault routine + * (which was pushed on the callers stack just before he calle us) + */ +mmx_onfault: + MMX_RESTORE_BLOCK2 + addl $4,%esp + ret + + /* + * MXX entry points - only support 64 bit media instructions + */ + SUPERALIGN_TEXT +ENTRY(asm_mmx_memcpy) /* memcpy() entry point use optimal copy */ + MMX_SAVE_BLOCK(asm_generic_memcpy) + jmp 5f + + SUPERALIGN_TEXT +ENTRY(asm_mmx_bcopy) + MMX_SAVE_BLOCK(asm_generic_bcopy) + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 5f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi + jmp 5f + + /* + * XMM entry points - support 128 bit media instructions + */ + SUPERALIGN_TEXT +ENTRY(asm_xmm_memcpy) /* memcpy() entry point use optimal copy */ + MMX_SAVE_BLOCK(asm_generic_memcpy) + jmp 1f + + SUPERALIGN_TEXT +ENTRY(asm_xmm_bcopy) + MMX_SAVE_BLOCK(asm_generic_bcopy) + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 1f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi +1: + movl %esi,%eax /* skip xmm if the data is not aligned */ + andl $15,%eax + jnz 5f + movl %edi,%eax + andl $15,%eax + jz 3f + jmp 5f + + SUPERALIGN_TEXT +2: + movdqa (%esi),%xmm0 + movdqa 16(%esi),%xmm1 + movdqa 32(%esi),%xmm2 + movdqa 48(%esi),%xmm3 + movdqa 64(%esi),%xmm4 + movdqa 80(%esi),%xmm5 + movdqa 96(%esi),%xmm6 + movdqa 112(%esi),%xmm7 + /*prefetchnta 128(%esi) 3dNOW */ + addl $128,%esi + + /* + * movdqa or movntdq can be used. + */ + movdqa %xmm0,(%edi) + movdqa %xmm1,16(%edi) + movdqa %xmm2,32(%edi) + movdqa %xmm3,48(%edi) + movdqa %xmm4,64(%edi) + movdqa %xmm5,80(%edi) + movdqa %xmm6,96(%edi) + movdqa %xmm7,112(%edi) + addl $128,%edi +3: + subl $128,%ecx + jae 2b + addl $128,%ecx + jz 6f + jmp 5f + SUPERALIGN_TEXT +4: + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + /*prefetchnta 128(%esi) 3dNOW */ + addl $64,%esi + movq %mm0,(%edi) + movq %mm1,8(%edi) + movq %mm2,16(%edi) + movq %mm3,24(%edi) + movq %mm4,32(%edi) + movq %mm5,40(%edi) + movq %mm6,48(%edi) + movq %mm7,56(%edi) + addl $64,%edi +5: + subl $64,%ecx + jae 4b + addl $64,%ecx + jz 6f + cld + rep + movsb +6: + MMX_RESTORE_BLOCK + ret + + /* + * GENERIC_BCOPY() - BACKWARDS COPY + * + * Don't bother using xmm optimizations, just stick with mmx. 
+ */ + SUPERALIGN_TEXT +10: + addl %ecx,%edi + jmp 12f + + SUPERALIGN_TEXT +11: + movq -64(%esi),%mm0 + movq -56(%esi),%mm1 + movq -48(%esi),%mm2 + movq -40(%esi),%mm3 + movq -32(%esi),%mm4 + movq -24(%esi),%mm5 + movq -16(%esi),%mm6 + movq -8(%esi),%mm7 + /*prefetchnta -128(%esi)*/ + subl $64,%esi + movq %mm0,-64(%edi) + movq %mm1,-56(%edi) + movq %mm2,-48(%edi) + movq %mm3,-40(%edi) + movq %mm4,-32(%edi) + movq %mm5,-24(%edi) + movq %mm6,-16(%edi) + movq %mm7,-8(%edi) + subl $64,%edi +12: + subl $64,%ecx + jae 11b + addl $64,%ecx + jz 13f + decl %esi + decl %edi + std + rep + movsb + cld +13: + MMX_RESTORE_BLOCK + ret + diff --git a/sys/i386/i386/bzero.s b/sys/i386/i386/bzero.s new file mode 100644 index 0000000000..e9d9ace766 --- /dev/null +++ b/sys/i386/i386/bzero.s @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2003 Matthew Dillon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $DragonFly: src/sys/i386/i386/Attic/bzero.s,v 1.1 2004/04/29 17:24:58 dillon Exp $ + */ +/* + * void bzero(void *buf, u_int len) (arguments passed on stack) + */ + +#include "use_npx.h" + +#include +#include +#include +#include + +#include "assym.s" + + .text + +ENTRY(generic_bzero) + pushl %edi + subl %eax,%eax + movl 4+4(%esp),%edi + movl 8+4(%esp),%ecx + jmp 2f + SUPERALIGN_TEXT +1: + movl %eax,(%edi) + movl %eax,4(%edi) + addl $8,%edi +2: + subl $8,%ecx + jae 1b + addl $8,%ecx + jz 3f + cld + rep + stosb +3: + popl %edi + ret + +ENTRY(i686_pagezero) + pushl %edi + pushl %ebx + + movl 12(%esp), %edi + movl $1024, %ecx + cld + + ALIGN_TEXT +1: + xorl %eax, %eax + repe + scasl + jnz 2f + + popl %ebx + popl %edi + ret + + ALIGN_TEXT + +2: + incl %ecx + subl $4, %edi + + movl %ecx, %edx + cmpl $16, %ecx + + jge 3f + + movl %edi, %ebx + andl $0x3f, %ebx + shrl %ebx + shrl %ebx + movl $16, %ecx + subl %ebx, %ecx + +3: + subl %ecx, %edx + rep + stosl + + movl %edx, %ecx + testl %edx, %edx + jnz 1b + + popl %ebx + popl %edi + ret + diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index d0f42edd84..2ea2e79883 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/i386/i386/Attic/genassym.c,v 1.36 2004/03/30 19:14:04 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/genassym.c,v 1.37 2004/04/29 17:24:58 dillon Exp $ */ #include @@ -202,6 +202,7 @@ ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); +ASSYM(GD_KERNEL_FPU_LOCK, offsetof(struct mdglobaldata, gd_kernel_fpu_lock)); ASSYM(GD_OTHER_CPUS, offsetof(struct mdglobaldata, gd_other_cpus)); ASSYM(GD_SS_EFLAGS, offsetof(struct mdglobaldata, gd_ss_eflags)); ASSYM(GD_CMAP1, offsetof(struct mdglobaldata, gd_CMAP1)); diff --git a/sys/i386/i386/globals.s b/sys/i386/i386/globals.s index 6fe54bab44..94e3308e7b 100644 --- a/sys/i386/i386/globals.s +++ b/sys/i386/i386/globals.s @@ -24,7 +24,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/globals.s,v 1.13.2.1 2000/05/16 06:58:06 dillon Exp $ - * $DragonFly: src/sys/i386/i386/Attic/globals.s,v 1.20 2004/02/17 19:38:53 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/globals.s,v 1.21 2004/04/29 17:24:58 dillon Exp $ */ #include @@ -69,6 +69,9 @@ .globl gd_currentldt .set gd_currentldt,globaldata + GD_CURRENTLDT + .globl gd_kernel_fpu_lock + .set gd_kernel_fpu_lock, globaldata + GD_KERNEL_FPU_LOCK + /* * The BSP version of these get setup in locore.s and pmap.c, while * the AP versions are setup in mp_machdep.c. diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index b25fd7a8a8..93002a24e1 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -31,7 +31,7 @@ * SUCH DAMAGE. 
* * $FreeBSD: src/sys/i386/i386/support.s,v 1.67.2.5 2001/08/15 01:23:50 peter Exp $ - * $DragonFly: src/sys/i386/i386/Attic/support.s,v 1.10 2004/04/03 08:21:16 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/support.s,v 1.11 2004/04/29 17:24:58 dillon Exp $ */ #include "use_npx.h" @@ -46,344 +46,25 @@ #define IDXSHIFT 10 .data + + .globl memcpy_vector +memcpy_vector: + .long asm_generic_memcpy + .globl bcopy_vector bcopy_vector: - .long generic_bcopy + .long asm_generic_bcopy + .globl bzero bzero: .long generic_bzero - .globl copyin_vector -copyin_vector: - .long generic_copyin - .globl copyout_vector -copyout_vector: - .long generic_copyout + .globl ovbcopy_vector ovbcopy_vector: - .long generic_bcopy -#if defined(I586_CPU) && NNPX > 0 -kernel_fpu_lock: - .byte 0xfe - .space 3 -#endif + .long asm_generic_bcopy .text -/* - * bcopy family - * void bzero(void *buf, u_int len) - */ - -ENTRY(generic_bzero) - pushl %edi - movl 8(%esp),%edi - movl 12(%esp),%ecx - xorl %eax,%eax - shrl $2,%ecx - cld - rep - stosl - movl 12(%esp),%ecx - andl $3,%ecx - rep - stosb - popl %edi - ret - -#if defined(I486_CPU) -ENTRY(i486_bzero) - movl 4(%esp),%edx - movl 8(%esp),%ecx - xorl %eax,%eax -/* - * do 64 byte chunks first - * - * XXX this is probably over-unrolled at least for DX2's - */ -2: - cmpl $64,%ecx - jb 3f - movl %eax,(%edx) - movl %eax,4(%edx) - movl %eax,8(%edx) - movl %eax,12(%edx) - movl %eax,16(%edx) - movl %eax,20(%edx) - movl %eax,24(%edx) - movl %eax,28(%edx) - movl %eax,32(%edx) - movl %eax,36(%edx) - movl %eax,40(%edx) - movl %eax,44(%edx) - movl %eax,48(%edx) - movl %eax,52(%edx) - movl %eax,56(%edx) - movl %eax,60(%edx) - addl $64,%edx - subl $64,%ecx - jnz 2b - ret - -/* - * do 16 byte chunks - */ - SUPERALIGN_TEXT -3: - cmpl $16,%ecx - jb 4f - movl %eax,(%edx) - movl %eax,4(%edx) - movl %eax,8(%edx) - movl %eax,12(%edx) - addl $16,%edx - subl $16,%ecx - jnz 3b - ret - -/* - * do 4 byte chunks - */ - SUPERALIGN_TEXT -4: - cmpl $4,%ecx - jb 5f - movl %eax,(%edx) - addl $4,%edx - subl $4,%ecx - jnz 4b - ret - -/* - * do 1 byte chunks - * a jump table seems to be faster than a loop or more range reductions - * - * XXX need a const section for non-text - */ - .data -jtab: - .long do0 - .long do1 - .long do2 - .long do3 - - .text - SUPERALIGN_TEXT -5: - jmp *jtab(,%ecx,4) - - SUPERALIGN_TEXT -do3: - movw %ax,(%edx) - movb %al,2(%edx) - ret - - SUPERALIGN_TEXT -do2: - movw %ax,(%edx) - ret - - SUPERALIGN_TEXT -do1: - movb %al,(%edx) - ret - - SUPERALIGN_TEXT -do0: - ret -#endif - -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_bzero) - movl 4(%esp),%edx - movl 8(%esp),%ecx - - /* - * The FPU register method is twice as fast as the integer register - * method unless the target is in the L1 cache and we pre-allocate a - * cache line for it (then the integer register method is 4-5 times - * faster). However, we never pre-allocate cache lines, since that - * would make the integer method 25% or more slower for the common - * case when the target isn't in either the L1 cache or the L2 cache. - * Thus we normally use the FPU register method unless the overhead - * would be too large. - */ - cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ - jb intreg_i586_bzero - - /* - * The FPU registers may belong to an application or to fastmove() - * or to another invocation of bcopy() or ourself in a higher level - * interrupt or trap handler. Preserving the registers is - * complicated since we avoid it if possible at all levels. 
We - * want to localize the complications even when that increases them. - * Here the extra work involves preserving CR0_TS in TS. - * `npxthread != NULL' is supposed to be the condition that all the - * FPU resources belong to an application, but npxthread and CR0_TS - * aren't set atomically enough for this condition to work in - * interrupt handlers. - * - * Case 1: FPU registers belong to the application: we must preserve - * the registers if we use them, so we only use the FPU register - * method if the target size is large enough to amortize the extra - * overhead for preserving them. CR0_TS must be preserved although - * it is very likely to end up as set. - * - * Case 2: FPU registers belong to fastmove(): fastmove() currently - * makes the registers look like they belong to an application so - * that cpu_switch() and savectx() don't have to know about it, so - * this case reduces to case 1. - * - * Case 3: FPU registers belong to the kernel: don't use the FPU - * register method. This case is unlikely, and supporting it would - * be more complicated and might take too much stack. - * - * Case 4: FPU registers don't belong to anyone: the FPU registers - * don't need to be preserved, so we always use the FPU register - * method. CR0_TS must be preserved although it is very likely to - * always end up as clear. - */ - cmpl $0,PCPU(npxthread) - je i586_bz1 - cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ - jb intreg_i586_bzero - sarb $1,kernel_fpu_lock - jc intreg_i586_bzero - smsw %ax - clts - subl $108,%esp - fnsave 0(%esp) - jmp i586_bz2 - -i586_bz1: - sarb $1,kernel_fpu_lock - jc intreg_i586_bzero - smsw %ax - clts - fninit /* XXX should avoid needing this */ -i586_bz2: - fldz - - /* - * Align to an 8 byte boundary (misalignment in the main loop would - * cost a factor of >= 2). Avoid jumps (at little cost if it is - * already aligned) by always zeroing 8 bytes and using the part up - * to the _next_ alignment position. - */ - fstl 0(%edx) - addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ - addl $8,%edx - andl $~7,%edx - subl %edx,%ecx - - /* - * Similarly align `len' to a multiple of 8. - */ - fstl -8(%edx,%ecx) - decl %ecx - andl $~7,%ecx - - /* - * This wouldn't be any faster if it were unrolled, since the loop - * control instructions are much faster than the fstl and/or done - * in parallel with it so their overhead is insignificant. - */ -fpureg_i586_bzero_loop: - fstl 0(%edx) - addl $8,%edx - subl $8,%ecx - cmpl $8,%ecx - jae fpureg_i586_bzero_loop - - cmpl $0,PCPU(npxthread) - je i586_bz3 - frstor 0(%esp) - addl $108,%esp - lmsw %ax - movb $0xfe,kernel_fpu_lock - ret - -i586_bz3: - fstp %st(0) - lmsw %ax - movb $0xfe,kernel_fpu_lock - ret - -intreg_i586_bzero: - /* - * `rep stos' seems to be the best method in practice for small - * counts. Fancy methods usually take too long to start up due - * to cache and BTB misses. 
- */ - pushl %edi - movl %edx,%edi - xorl %eax,%eax - shrl $2,%ecx - cld - rep - stosl - movl 12(%esp),%ecx - andl $3,%ecx - jne 1f - popl %edi - ret - -1: - rep - stosb - popl %edi - ret -#endif /* I586_CPU && NNPX > 0 */ - -ENTRY(i686_pagezero) - pushl %edi - pushl %ebx - - movl 12(%esp), %edi - movl $1024, %ecx - cld - - ALIGN_TEXT -1: - xorl %eax, %eax - repe - scasl - jnz 2f - - popl %ebx - popl %edi - ret - - ALIGN_TEXT - -2: - incl %ecx - subl $4, %edi - - movl %ecx, %edx - cmpl $16, %ecx - - jge 3f - - movl %edi, %ebx - andl $0x3f, %ebx - shrl %ebx - shrl %ebx - movl $16, %ecx - subl %ebx, %ecx - -3: - subl %ecx, %edx - rep - stosl - - movl %edx, %ecx - testl %edx, %edx - jnz 1b - - popl %ebx - popl %edi - ret - /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi @@ -396,258 +77,75 @@ ENTRY(fillw) popl %edi ret -ENTRY(bcopyb) +/* + * void bcopy(const void *s, void *d, size_t count) + * + * Normal bcopy() vector, an optimized bcopy may be installed in + * bcopy_vector. + */ +ENTRY(bcopy) pushl %esi pushl %edi - movl 12(%esp),%esi - movl 16(%esp),%edi - movl 20(%esp),%ecx - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? */ - jb 1f - cld /* nope, copy forwards */ - rep - movsb - popl %edi - popl %esi - ret - - ALIGN_TEXT -1: - addl %ecx,%edi /* copy backwards. */ - addl %ecx,%esi - decl %edi - decl %esi - std - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call *bcopy_vector popl %edi popl %esi - cld ret -ENTRY(bcopy) - MEXITCOUNT - jmp *bcopy_vector - -ENTRY(ovbcopy) - MEXITCOUNT - jmp *ovbcopy_vector - /* - * generic_bcopy(src, dst, cnt) - * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 + * Generic (integer-only) bcopy() vector. */ ENTRY(generic_bcopy) - pushl %ebp /* debugging */ - movl %esp,%ebp pushl %esi pushl %edi - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ecx - - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? */ - jb 1f - - shrl $2,%ecx /* copy by 32-bit words */ - cld /* nope, copy forwards */ - rep - movsl - movl 24(%esp),%ecx - andl $3,%ecx /* any bytes left? */ - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call asm_generic_bcopy popl %edi popl %esi - popl %ebp ret - ALIGN_TEXT -1: - addl %ecx,%edi /* copy backwards */ - addl %ecx,%esi - decl %edi - decl %esi - andl $3,%ecx /* any fractional bytes? */ - std - rep - movsb - movl 24(%esp),%ecx /* copy remainder by 32-bit words */ - shrl $2,%ecx - subl $3,%esi - subl $3,%edi - rep - movsl - popl %edi - popl %esi - popl %ebp - cld - ret - -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_bcopy) +ENTRY(ovbcopy) pushl %esi pushl %edi - movl 12(%esp),%esi - movl 16(%esp),%edi - movl 20(%esp),%ecx - - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? 
*/ - jb 1f - - cmpl $1024,%ecx - jb small_i586_bcopy - - sarb $1,kernel_fpu_lock - jc small_i586_bcopy - cmpl $0,PCPU(npxthread) - je i586_bc1 - smsw %dx - clts - subl $108,%esp - fnsave 0(%esp) - jmp 4f - -i586_bc1: - smsw %dx - clts - fninit /* XXX should avoid needing this */ - - ALIGN_TEXT -4: - pushl %ecx -#define DCACHE_SIZE 8192 - cmpl $(DCACHE_SIZE-512)/2,%ecx - jbe 2f - movl $(DCACHE_SIZE-512)/2,%ecx -2: - subl %ecx,0(%esp) - cmpl $256,%ecx - jb 5f /* XXX should prefetch if %ecx >= 32 */ - pushl %esi - pushl %ecx - ALIGN_TEXT -3: - movl 0(%esi),%eax - movl 32(%esi),%eax - movl 64(%esi),%eax - movl 96(%esi),%eax - movl 128(%esi),%eax - movl 160(%esi),%eax - movl 192(%esi),%eax - movl 224(%esi),%eax - addl $256,%esi - subl $256,%ecx - cmpl $256,%ecx - jae 3b - popl %ecx - popl %esi -5: - ALIGN_TEXT -large_i586_bcopy_loop: - fildq 0(%esi) - fildq 8(%esi) - fildq 16(%esi) - fildq 24(%esi) - fildq 32(%esi) - fildq 40(%esi) - fildq 48(%esi) - fildq 56(%esi) - fistpq 56(%edi) - fistpq 48(%edi) - fistpq 40(%edi) - fistpq 32(%edi) - fistpq 24(%edi) - fistpq 16(%edi) - fistpq 8(%edi) - fistpq 0(%edi) - addl $64,%esi - addl $64,%edi - subl $64,%ecx - cmpl $64,%ecx - jae large_i586_bcopy_loop - popl %eax - addl %eax,%ecx - cmpl $64,%ecx - jae 4b - - cmpl $0,PCPU(npxthread) - je i586_bc2 - frstor 0(%esp) - addl $108,%esp -i586_bc2: - lmsw %dx - movb $0xfe,kernel_fpu_lock - -/* - * This is a duplicate of the main part of generic_bcopy. See the comments - * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and - * would mess up high resolution profiling. - */ - ALIGN_TEXT -small_i586_bcopy: - shrl $2,%ecx - cld - rep - movsl - movl 20(%esp),%ecx - andl $3,%ecx - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call *ovbcopy_vector popl %edi popl %esi ret - ALIGN_TEXT -1: - addl %ecx,%edi - addl %ecx,%esi - decl %edi - decl %esi - andl $3,%ecx - std - rep - movsb - movl 20(%esp),%ecx - shrl $2,%ecx - subl $3,%esi - subl $3,%edi - rep - movsl - popl %edi - popl %esi - cld - ret -#endif /* I586_CPU && NNPX > 0 */ - /* - * Note: memcpy does not support overlapping copies + * void *memcpy(void *d, const void *s, size_t count) + * + * Note: memcpy does not have to support overlapping copies. + * + * Note: (d, s) arguments reversed from bcopy, and memcpy() returns d + * while bcopy() returns void. */ ENTRY(memcpy) - pushl %edi pushl %esi - movl 12(%esp),%edi - movl 16(%esp),%esi - movl 20(%esp),%ecx - movl %edi,%eax - shrl $2,%ecx /* copy by 32-bit words */ - cld /* nope, copy forwards */ - rep - movsl - movl 20(%esp),%ecx - andl $3,%ecx /* any bytes left? */ - rep - movsb - popl %esi + pushl %edi + movl 4+8(%esp),%edi + movl 8+8(%esp),%esi + movl 12+8(%esp),%ecx + call *memcpy_vector + movl 4+8(%esp),%eax popl %edi + popl %esi ret +/* + * A stack-based on-fault routine is used for more complex PCB_ONFAULT + * situations (such as memcpy/bcopy/bzero). In this case the on-fault + * routine must be pushed on the stack. 
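[Editor's illustration, not part of this patch.]  How the stacked on-fault scheme fits together: copyin/copyout set PCB_ONFAULT to stack_onfault and push the real fault target (copyin_fault/copyout_fault) on the stack before calling through memcpy_vector; the copy routine then pushes its own handler (generic_onfault or mmx_onfault).  On a page fault the trap handler resumes at stack_onfault, whose "ret" pops the copy routine's handler, which restores its registers and in turn returns into the handler the caller pushed.  A toy C model of that chain, with the machine stack replaced by an explicit array (everything except the handler names is hypothetical):

	#include <stdio.h>

	typedef void (*onfault_t)(void);

	static onfault_t onfault_stack[4];
	static int onfault_top;

	static void push_onfault(onfault_t fn) { onfault_stack[onfault_top++] = fn; }
	static void pop_onfault(void)          { onfault_top--; }

	static void copyout_fault(void) { printf("copyout: return EFAULT\n"); }

	static void
	generic_onfault(void)
	{
		/* restore the copy routine's registers, then fall through to
		   the handler the caller (copyout) pushed before calling us */
		pop_onfault();
		onfault_stack[onfault_top - 1]();
	}

	static void
	fault(void)	/* what the trap handler + stack_onfault's "ret" amount to */
	{
		onfault_stack[onfault_top - 1]();
	}

	int
	main(void)
	{
		push_onfault(copyout_fault);	/* done by copyout */
		push_onfault(generic_onfault);	/* done by asm_generic_memcpy */
		fault();			/* a page fault during the copy */
		return 0;
	}

This is what lets copyin/copyout call the shared memcpy entry point instead of rolling their own copy loops, as noted in the commit message.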
+ */ +stack_onfault: + ret /*****************************************************************************/ /* copyout and fubyte family */ @@ -671,19 +169,16 @@ ENTRY(memcpy) * copyout(from_kernel, to_user, len) - MP SAFE (if not I386_CPU) */ ENTRY(copyout) - MEXITCOUNT - jmp *copyout_vector - -ENTRY(generic_copyout) movl PCPU(curthread),%eax movl TD_PCB(%eax),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx + pushl $copyout_fault + movl $stack_onfault,PCB_ONFAULT(%eax) + movl 4+16(%esp),%esi + movl 8+16(%esp),%edi + movl 12+16(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout @@ -765,30 +260,24 @@ ENTRY(generic_copyout) jnz 1b /* check next page */ #endif /* I386_CPU */ - /* bcopy(%esi, %edi, %ebx) */ + /* + * Convert copyout to memcpy_vector(dest:%edi, src:%esi, conut:%ecx) + */ 3: movl %ebx,%ecx - -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -slow_copyout: -#endif - shrl $2,%ecx - cld - rep - movsl - movb %bl,%cl - andb $3,%cl - rep - movsb + call *memcpy_vector done_copyout: + /* + * non-error return + */ + addl $4,%esp + movl PCPU(curthread),%edx + xorl %eax,%eax + movl TD_PCB(%edx),%edx popl %ebx popl %edi popl %esi - xorl %eax,%eax - movl PCPU(curthread),%edx - movl TD_PCB(%edx),%edx movl %eax,PCB_ONFAULT(%edx) ret @@ -803,77 +292,20 @@ copyout_fault: movl $EFAULT,%eax ret -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_copyout) - /* - * Duplicated from generic_copyout. Could be done a bit better. - */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - testl %ebx,%ebx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. If 486 write protection - * is being used, this check is essential because we are in kernel - * mode so the h/w does not provide any protection against writing - * kernel addresses. - */ - - /* - * First, prevent address wrapping. - */ - movl %edi,%eax - addl %ebx,%eax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. - */ - cmpl $VM_MAXUSER_ADDRESS,%eax - ja copyout_fault - - /* bcopy(%esi, %edi, %ebx) */ -3: - movl %ebx,%ecx - /* - * End of duplicated code. 
- */ - - cmpl $1024,%ecx - jb slow_copyout - - pushl %ecx - call fastmove - addl $4,%esp - jmp done_copyout -#endif /* I586_CPU && NNPX > 0 */ - /* * copyin(from_user, to_kernel, len) - MP SAFE */ -ENTRY(copyin) - MEXITCOUNT - jmp *copyin_vector -ENTRY(generic_copyin) +ENTRY(copyin) movl PCPU(curthread),%eax movl TD_PCB(%eax),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ + pushl $copyin_fault + movl $stack_onfault,PCB_ONFAULT(%eax) + movl 4+12(%esp),%esi /* caddr_t from */ + movl 8+12(%esp),%edi /* caddr_t to */ + movl 12+12(%esp),%ecx /* size_t len */ /* * make sure address is valid @@ -884,264 +316,28 @@ ENTRY(generic_copyin) cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -slow_copyin: -#endif - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - cld - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb + /* + * Call memcpy(destination:%edi, source:%esi, bytes:%ecx) + */ + call *memcpy_vector -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -done_copyin: -#endif - popl %edi - popl %esi - xorl %eax,%eax + /* + * return 0 (no error) + */ + addl $4,%esp movl PCPU(curthread),%edx + xorl %eax,%eax movl TD_PCB(%edx),%edx - movl %eax,PCB_ONFAULT(%edx) - ret - - ALIGN_TEXT -copyin_fault: popl %edi popl %esi - movl PCPU(curthread),%edx - movl TD_PCB(%edx),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax + movl %eax,PCB_ONFAULT(%edx) ret -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_copyin) /* - * Duplicated from generic_copyin. Could be done a bit better. - */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ - - /* - * make sure address is valid + * return EFAULT */ - movl %esi,%edx - addl %ecx,%edx - jc copyin_fault - cmpl $VM_MAXUSER_ADDRESS,%edx - ja copyin_fault - /* - * End of duplicated code. - */ - - cmpl $1024,%ecx - jb slow_copyin - - pushl %ebx /* XXX prepare for fastmove_fault */ - pushl %ecx - call fastmove - addl $8,%esp - jmp done_copyin -#endif /* I586_CPU && NNPX > 0 */ - -#if defined(I586_CPU) && NNPX > 0 -/* fastmove(src, dst, len) - src in %esi - dst in %edi - len in %ecx XXX changed to on stack for profiling - uses %eax and %edx for tmp. storage - */ -/* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ -ENTRY(fastmove) - pushl %ebp - movl %esp,%ebp - subl $PCB_SAVE87_SIZE+3*4,%esp - - movl 8(%ebp),%ecx - cmpl $63,%ecx - jbe fastmove_tail - - testl $7,%esi /* check if src addr is multiple of 8 */ - jnz fastmove_tail - - testl $7,%edi /* check if dst addr is multiple of 8 */ - jnz fastmove_tail - -/* if (npxthread != NULL) { */ - cmpl $0,PCPU(npxthread) - je 6f -/* fnsave(&curpcb->pcb_savefpu); */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - fnsave PCB_SAVEFPU(%eax) -/* npxthread = NULL; */ - movl $0,PCPU(npxthread) -/* } */ -6: -/* now we own the FPU. */ - -/* - * The process' FP state is saved in the pcb, but if we get - * switched, the cpu_switch() will store our FP state in the - * pcb. It should be possible to avoid all the copying for - * this, e.g., by setting a flag to tell cpu_switch() to - * save the state somewhere else. 
- */ -/* tmp = curpcb->pcb_savefpu; */ - movl %ecx,-12(%ebp) - movl %esi,-8(%ebp) - movl %edi,-4(%ebp) - movl %esp,%edi - movl PCPU(curthread),%esi - movl TD_PCB(%esi),%esi - addl $PCB_SAVEFPU,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - movl -12(%ebp),%ecx - movl -8(%ebp),%esi - movl -4(%ebp),%edi -/* stop_emulating(); */ - clts -/* npxthread = curthread; */ - movl PCPU(curthread),%eax - movl %eax,PCPU(npxthread) - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $fastmove_fault,PCB_ONFAULT(%eax) -4: - movl %ecx,-12(%ebp) - cmpl $1792,%ecx - jbe 2f - movl $1792,%ecx -2: - subl %ecx,-12(%ebp) - cmpl $256,%ecx - jb 5f - movl %ecx,-8(%ebp) - movl %esi,-4(%ebp) - ALIGN_TEXT -3: - movl 0(%esi),%eax - movl 32(%esi),%eax - movl 64(%esi),%eax - movl 96(%esi),%eax - movl 128(%esi),%eax - movl 160(%esi),%eax - movl 192(%esi),%eax - movl 224(%esi),%eax - addl $256,%esi - subl $256,%ecx - cmpl $256,%ecx - jae 3b - movl -8(%ebp),%ecx - movl -4(%ebp),%esi -5: - ALIGN_TEXT -fastmove_loop: - fildq 0(%esi) - fildq 8(%esi) - fildq 16(%esi) - fildq 24(%esi) - fildq 32(%esi) - fildq 40(%esi) - fildq 48(%esi) - fildq 56(%esi) - fistpq 56(%edi) - fistpq 48(%edi) - fistpq 40(%edi) - fistpq 32(%edi) - fistpq 24(%edi) - fistpq 16(%edi) - fistpq 8(%edi) - fistpq 0(%edi) - addl $-64,%ecx - addl $64,%esi - addl $64,%edi - cmpl $63,%ecx - ja fastmove_loop - movl -12(%ebp),%eax - addl %eax,%ecx - cmpl $64,%ecx - jae 4b - -/* curpcb->pcb_savefpu = tmp; */ - movl %ecx,-12(%ebp) - movl %esi,-8(%ebp) - movl %edi,-4(%ebp) - movl PCPU(curthread),%edi - movl TD_PCB(%edi),%edi - addl $PCB_SAVEFPU,%edi - movl %esp,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - movl -12(%ebp),%ecx - movl -8(%ebp),%esi - movl -4(%ebp),%edi - -/* start_emulating(); */ - smsw %ax - orb $CR0_TS,%al - lmsw %ax -/* npxthread = NULL; */ - movl $0,PCPU(npxthread) - ALIGN_TEXT -fastmove_tail: - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $fastmove_tail_fault,PCB_ONFAULT(%eax) - - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - cld - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb - - movl %ebp,%esp - popl %ebp - ret - - ALIGN_TEXT -fastmove_fault: - movl PCPU(curthread),%edi - movl TD_PCB(%edi),%edi - addl $PCB_SAVEFPU,%edi - movl %esp,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - - smsw %ax - orb $CR0_TS,%al - lmsw %ax - movl $0,PCPU(npxthread) - -fastmove_tail_fault: - movl %ebp,%esp - popl %ebp - addl $8,%esp - popl %ebx +copyin_fault: popl %edi popl %esi movl PCPU(curthread),%edx @@ -1149,7 +345,6 @@ fastmove_tail_fault: movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret -#endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} - MP SAFE diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index 02482a3ce4..c1123154b6 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.89.2.10 2003/01/23 03:36:24 ps Exp $ - * $DragonFly: src/sys/i386/i386/Attic/swtch.s,v 1.31 2004/03/28 08:03:05 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/swtch.s,v 1.32 2004/04/29 17:24:58 dillon Exp $ */ #include "use_npx.h" @@ -136,11 +136,11 @@ ENTRY(cpu_heavy_switch) movl %eax,PCB_DR0(%edx) 1: +#if NNPX > 0 /* * Save the FP state if we have used the FP. Note that calling * npxsave will NULL out PCPU(npxthread). 
*/ -#if NNPX > 0 cmpl %ebx,PCPU(npxthread) jne 1f addl $PCB_SAVEFPU,%edx @@ -531,13 +531,34 @@ ENTRY(cpu_kthread_restore) * YYY BGL, SPL */ ENTRY(cpu_lwkt_switch) - movl 4(%esp),%eax pushl %ebp /* note: GDB hacked to locate ebp relative to td_sp */ pushl %ebx + movl PCPU(curthread),%ebx pushl %esi pushl %edi pushfl - movl PCPU(curthread),%ebx + /* warning: adjust movl into %eax below if you change the pushes */ + +#if NNPX > 0 + /* + * Save the FP state if we have used the FP. Note that calling + * npxsave will NULL out PCPU(npxthread). + * + * We have to deal with the FP state for LWKT threads in case they + * happen to get preempted or block while doing an optimized + * bzero/bcopy/memcpy. + */ + cmpl %ebx,PCPU(npxthread) + jne 1f + movl TD_PCB(%ebx),%edx /* EDX = PCB */ + addl $PCB_SAVEFPU,%edx + pushl %edx + call npxsave /* do it in a big C function */ + addl $4,%esp /* EAX, ECX, EDX trashed */ +1: +#endif /* NNPX > 0 */ + + movl 4+20(%esp),%eax /* switch to this thread */ pushl $cpu_lwkt_restore movl %esp,TD_SP(%ebx) movl %eax,PCPU(curthread) diff --git a/sys/i386/include/globaldata.h b/sys/i386/include/globaldata.h index 05360cb187..2512a96750 100644 --- a/sys/i386/include/globaldata.h +++ b/sys/i386/include/globaldata.h @@ -28,7 +28,7 @@ * should not include this file. * * $FreeBSD: src/sys/i386/include/globaldata.h,v 1.11.2.1 2000/05/16 06:58:10 dillon Exp $ - * $DragonFly: src/sys/i386/include/Attic/globaldata.h,v 1.23 2004/02/21 06:37:07 dillon Exp $ + * $DragonFly: src/sys/i386/include/Attic/globaldata.h,v 1.24 2004/04/29 17:25:00 dillon Exp $ */ #ifndef _MACHINE_GLOBALDATA_H_ @@ -67,6 +67,7 @@ struct mdglobaldata { struct segment_descriptor *gd_tss_gdt; struct thread *gd_npxthread; struct i386tss gd_common_tss; + int gd_kernel_fpu_lock; /* fast bcopy/zero cpu lock */ int gd_fpending; /* fast interrupt pending */ int gd_ipending; /* normal interrupt pending */ int gd_idelayed; /* delayed software ints */ diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h index 8e8ab43408..e5d5e89929 100644 --- a/sys/i386/include/md_var.h +++ b/sys/i386/include/md_var.h @@ -27,7 +27,7 @@ * SUCH DAMAGE. 
* * $FreeBSD: src/sys/i386/include/md_var.h,v 1.35.2.4 2003/01/22 20:14:53 jhb Exp $ - * $DragonFly: src/sys/i386/include/Attic/md_var.h,v 1.12 2003/11/03 17:11:19 dillon Exp $ + * $DragonFly: src/sys/i386/include/Attic/md_var.h,v 1.13 2004/04/29 17:25:00 dillon Exp $ */ #ifndef _MACHINE_MD_VAR_H_ @@ -39,7 +39,8 @@ extern vm_paddr_t Maxmem; extern u_int atdevbase; /* offset in virtual memory of ISA io mem */ -extern void (*bcopy_vector) (const void *from, void *to, size_t len); +extern void **bcopy_vector; +extern void **memcpy_vector; extern int busdma_swi_pending; extern int (*copyin_vector) (const void *udaddr, void *kaddr, size_t len); @@ -59,7 +60,7 @@ extern int need_pre_dma_flush; extern int need_post_dma_flush; #endif extern int nfs_diskless_valid; -extern void (*ovbcopy_vector) (const void *from, void *to, size_t len); +extern void **ovbcopy_vector; extern char sigcode[]; extern int szsigcode; @@ -98,11 +99,19 @@ int fill_fpregs (struct proc *, struct fpreg *); int fill_regs (struct proc *p, struct reg *regs); int fill_dbregs (struct proc *p, struct dbreg *dbregs); void fillw (int /*u_short*/ pat, void *base, size_t cnt); +#if 0 void i486_bzero (volatile void *buf, size_t len); void i586_bzero (volatile void *buf, size_t len); void i586_bcopy (const void *from, void *to, size_t len); int i586_copyin (const void *udaddr, void *kaddr, size_t len); int i586_copyout (const void *kaddr, void *udaddr, size_t len); +#endif +void asm_generic_memcpy(void); +void asm_mmx_memcpy(void); +void asm_xmm_memcpy(void); +void asm_generic_bcopy(void); +void asm_mmx_bcopy(void); +void asm_xmm_bcopy(void); void i686_pagezero (void *addr); void init_AMD_Elan_sc520(void); int is_physical_memory (vm_offset_t addr); diff --git a/sys/i386/isa/npx.c b/sys/i386/isa/npx.c index 95ef20f35e..070f557533 100644 --- a/sys/i386/isa/npx.c +++ b/sys/i386/isa/npx.c @@ -33,7 +33,7 @@ * * from: @(#)npx.c 7.2 (Berkeley) 5/12/91 * $FreeBSD: src/sys/i386/isa/npx.c,v 1.80.2.3 2001/10/20 19:04:38 tegge Exp $ - * $DragonFly: src/sys/i386/isa/Attic/npx.c,v 1.13 2003/08/26 21:42:19 rob Exp $ + * $DragonFly: src/sys/i386/isa/Attic/npx.c,v 1.14 2004/04/29 17:25:02 dillon Exp $ */ #include "opt_cpu.h" @@ -146,10 +146,6 @@ static int npx_probe (device_t dev); static int npx_probe1 (device_t dev); static void fpusave (union savefpu *); static void fpurstor (union savefpu *); -#ifdef I586_CPU -static long timezero (const char *funcname, - void (*func)(volatile void *buf, size_t len)); -#endif /* I586_CPU */ int hw_float; /* XXX currently just alias for npx_exists */ @@ -434,6 +430,9 @@ npx_attach(dev) device_t dev; { int flags; +#if defined(I586_CPU) || defined(I686_CPU) + int mmxopt = 1; +#endif if (resource_int_value("npx", 0, "flags", &flags) != 0) flags = 0; @@ -470,7 +469,40 @@ npx_attach(dev) } npxinit(__INITIAL_NPXCW__); -#ifdef I586_CPU +#if defined(I586_CPU) || defined(I686_CPU) + /* + * The asm_mmx_*() routines actually use XMM as well, so only + * enable them if we have SSE2 and are using FXSR (fxsave/fxrstore). 
+ */ + TUNABLE_INT_FETCH("kern.mmxopt", &mmxopt); + if ((cpu_feature & CPUID_MMX) && (cpu_feature & CPUID_SSE) && + (cpu_feature & CPUID_SSE2) && + npx_ex16 && npx_exists && mmxopt && cpu_fxsr + ) { + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY) == 0) { + bcopy_vector = (void **)asm_xmm_bcopy; + ovbcopy_vector = (void **)asm_xmm_bcopy; + memcpy_vector = (void **)asm_xmm_memcpy; + printf("Using XMM optimized bcopy/copyin/copyout\n"); + } + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BZERO) == 0) { + /* XXX */ + } + } else if ((cpu_feature & CPUID_MMX) && (cpu_feature & CPUID_SSE) && + npx_ex16 && npx_exists && mmxopt && cpu_fxsr + ) { + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY) == 0) { + bcopy_vector = (void **)asm_mmx_bcopy; + ovbcopy_vector = (void **)asm_mmx_bcopy; + memcpy_vector = (void **)asm_mmx_memcpy; + printf("Using MMX optimized bcopy/copyin/copyout\n"); + } + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BZERO) == 0) { + /* XXX */ + } + } +#endif +#if 0 if (cpu_class == CPUCLASS_586 && npx_ex16 && npx_exists && timezero("i586_bzero()", i586_bzero) < timezero("bzero()", bzero) * 4 / 5) { @@ -486,7 +518,6 @@ npx_attach(dev) } } #endif - return (0); /* XXX unused */ } @@ -943,36 +974,6 @@ fpurstor(addr) frstor(addr); } -#ifdef I586_CPU -static long -timezero(funcname, func) - const char *funcname; - void (*func) (volatile void *buf, size_t len); - -{ - void *buf; -#define BUFSIZE 1000000 - long usec; - struct timeval finish, start; - - buf = malloc(BUFSIZE, M_TEMP, M_NOWAIT); - if (buf == NULL) - return (BUFSIZE); - microtime(&start); - (*func)(buf, BUFSIZE); - microtime(&finish); - usec = 1000000 * (finish.tv_sec - start.tv_sec) + - finish.tv_usec - start.tv_usec; - if (usec <= 0) - usec = 1; - if (bootverbose) - printf("%s bandwidth = %ld bytes/sec\n", - funcname, (long)(BUFSIZE * (int64_t)1000000 / usec)); - free(buf, M_TEMP); - return (usec); -} -#endif /* I586_CPU */ - static device_method_t npx_methods[] = { /* Device interface */ DEVMETHOD(device_identify, npx_identify), diff --git a/sys/platform/pc32/i386/bcopy.s b/sys/platform/pc32/i386/bcopy.s new file mode 100644 index 0000000000..a734654e91 --- /dev/null +++ b/sys/platform/pc32/i386/bcopy.s @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2003 Matthew Dillon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/platform/pc32/i386/bcopy.s,v 1.1 2004/04/29 17:24:58 dillon Exp $ + */ +/* + * bcopy(source:%esi, target:%edi, count:%ecx) + * + * note: esi, edi, eax, ecx, and edx may be destroyed + */ + +#include "use_npx.h" + +#include +#include +#include +#include + +#include "assym.s" + + .text + + /* + * If memcpy/bcopy is called as part of a copyin or copyout, the + * on-fault routine is set up to do a 'ret'. We hve to restore + * %ebx and return to the copyin/copyout fault handler. + */ +generic_onfault: + popl %ebx + addl $4,%esp /* skip normal return vector */ + ret /* return to copyin/copyout fault handler */ + + /* + * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY + * + * Reasonably optimal on all modern machines. + */ + + SUPERALIGN_TEXT +ENTRY(asm_generic_memcpy) /* memcpy() entry point use optimal copy */ + pushl %ebx + pushl $generic_onfault + jmp 2f + + SUPERALIGN_TEXT +ENTRY(asm_generic_bcopy) + pushl %ebx + pushl $generic_onfault + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 2f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi + jmp 2f + + SUPERALIGN_TEXT +1: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%edx + movl %eax,(%edi) + movl 12(%esi),%eax + movl %ebx,4(%edi) + movl 16(%esi),%ebx + movl %edx,8(%edi) + movl 20(%esi),%edx + movl %eax,12(%edi) + movl 24(%esi),%eax + movl %ebx,16(%edi) + movl 28(%esi),%ebx + movl %edx,20(%edi) + movl %eax,24(%edi) + addl $32,%esi + movl %ebx,28(%edi) + addl $32,%edi +2: + subl $32,%ecx + jae 1b + addl $32,%ecx + jz 3f + cld + rep + movsb +3: + addl $4,%esp + popl %ebx + ret + + /* + * GENERIC_BCOPY() - BACKWARDS COPY + */ + SUPERALIGN_TEXT +10: + addl %ecx,%edi + jmp 12f + + SUPERALIGN_TEXT +11: + movl -4(%esi),%eax + movl -8(%esi),%ebx + movl -12(%esi),%edx + movl %eax,-4(%edi) + movl -16(%esi),%eax + movl %ebx,-8(%edi) + movl -20(%esi),%ebx + movl %edx,-12(%edi) + movl -24(%esi),%edx + movl %eax,-16(%edi) + movl -28(%esi),%eax + movl %ebx,-20(%edi) + movl -32(%esi),%ebx + movl %edx,-24(%edi) + movl %eax,-28(%edi) + subl $32,%esi + movl %ebx,-32(%edi) + subl $32,%edi +12: + subl $32,%ecx + jae 11b + addl $32,%ecx + jz 13f + decl %esi + decl %edi + std + rep + movsb + cld +13: + addl $4,%esp + popl %ebx + ret + + /* + * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY + * + * Reasonably optimal on all modern machines with MMX or SSE2. + * XXX But very messy, we need a better way to use fp in the kernel. + * + * note: esi, edi, eax, ecx, and edx may be destroyed + * + * In order for the kernel to be able to use the FPU: + * + * (1) The kernel may not already be using the fpu + * (2) If the fpu is owned by the application, we must save + * and restore its state. + * (3) Our thread begins using the FPU, we clts (clear CR0_TS) + * to prevent an FP fault, fninit, and set our thread as + * the npxthread. 
+ * + * (4) While we are using the FP unit, an interrupt may come + * along and preempt us, causing our FP state to be saved. + * We will fault/restore upon resumption. + * + * (5) To cleanup we have to restore the original application's + * FP state, which means restoring any saved state, CR0_TS, + * and npxthread settings as appropriate. + * + * However, as an optimization we can instead copy the + * saved state to the PCB, clear npxthread, and set CR0_TS, + * which will allow additional bcopy's to use the FP unit + * at virtually no cost and cause the application to trap + * when it tries to use the FP unit again. + * + * So, why choose one over another? Well, having to save + * and restore the FP state eats a lot of cycles. Many + * kernel operations actually wind up looping on a bcopy + * (e.g. the PIPE/SFBUF case), or looping in userland without + * any intervening FP ops. Our minimum copy size check + * (2048) avoids the use of FP for the smaller copies that + * are more likely to be intermingled with user FP ops, so + * it is my belief that saving the user FP state to the PCB + * is a better solution then restoring it. + * + * NOTE: fxsave requires a 16-byte aligned address + * + * MMX+XMM (SSE2): Typical on Athlons, later P4s. 128 bit media insn. + * MMX: Typical on XPs and P3s. 64 bit media insn. + */ + +#define MMX_SAVE_BLOCK(missfunc) \ + cmpl $2048,%ecx ; \ + jb missfunc ; \ + btsl $1,PCPU(kernel_fpu_lock) ; \ + jc missfunc ; \ + pushl %ebp ; \ + movl %esp, %ebp ; \ + smsw %ax ; \ + movl PCPU(npxthread),%edx ; \ + testl %edx,%edx ; \ + jz 100f ; \ + clts ; \ + subl $512,%esp ; \ + andl $0xfffffff0,%esp ; \ + fxsave 0(%esp) ; \ +100: ; \ + pushl %eax ; \ + pushl %edx ; \ + movl PCPU(curthread),%edx ; \ + movl %edx,PCPU(npxthread) ; \ + clts ; \ + fninit ; \ + pushl $mmx_onfault + + +#define MMX_RESTORE_BLOCK \ + addl $4,%esp ; \ + MMX_RESTORE_BLOCK2 + +#define MMX_RESTORE_BLOCK2 \ + popl %edx ; \ + popl %eax ; \ + testl %edx,%edx ; \ + jz 100f ; \ + movl %esp,%esi ; \ + movl PCPU(curthread),%edi ; \ + movl TD_PCB(%edi),%edi ; \ + addl $PCB_SAVEFPU,%edi ; \ + movl $512>>2,%ecx ; \ + cld ; \ + rep ; \ + movsl ; \ + orb $CR0_TS,%al ; \ +100: ; \ + movl %ebp,%esp ; \ + popl %ebp ; \ + movl $0,PCPU(npxthread) ; \ + lmsw %ax ; \ + movl $0,PCPU(kernel_fpu_lock) + + /* + * xmm/mmx_onfault routine. 
Restore the fpu state, skip the normal + * return vector, and return to the caller's on-fault routine + * (which was pushed on the callers stack just before he calle us) + */ +mmx_onfault: + MMX_RESTORE_BLOCK2 + addl $4,%esp + ret + + /* + * MXX entry points - only support 64 bit media instructions + */ + SUPERALIGN_TEXT +ENTRY(asm_mmx_memcpy) /* memcpy() entry point use optimal copy */ + MMX_SAVE_BLOCK(asm_generic_memcpy) + jmp 5f + + SUPERALIGN_TEXT +ENTRY(asm_mmx_bcopy) + MMX_SAVE_BLOCK(asm_generic_bcopy) + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 5f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi + jmp 5f + + /* + * XMM entry points - support 128 bit media instructions + */ + SUPERALIGN_TEXT +ENTRY(asm_xmm_memcpy) /* memcpy() entry point use optimal copy */ + MMX_SAVE_BLOCK(asm_generic_memcpy) + jmp 1f + + SUPERALIGN_TEXT +ENTRY(asm_xmm_bcopy) + MMX_SAVE_BLOCK(asm_generic_bcopy) + cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ + jb 1f + addl %ecx,%esi + cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */ + jb 10f + subl %ecx,%esi +1: + movl %esi,%eax /* skip xmm if the data is not aligned */ + andl $15,%eax + jnz 5f + movl %edi,%eax + andl $15,%eax + jz 3f + jmp 5f + + SUPERALIGN_TEXT +2: + movdqa (%esi),%xmm0 + movdqa 16(%esi),%xmm1 + movdqa 32(%esi),%xmm2 + movdqa 48(%esi),%xmm3 + movdqa 64(%esi),%xmm4 + movdqa 80(%esi),%xmm5 + movdqa 96(%esi),%xmm6 + movdqa 112(%esi),%xmm7 + /*prefetchnta 128(%esi) 3dNOW */ + addl $128,%esi + + /* + * movdqa or movntdq can be used. + */ + movdqa %xmm0,(%edi) + movdqa %xmm1,16(%edi) + movdqa %xmm2,32(%edi) + movdqa %xmm3,48(%edi) + movdqa %xmm4,64(%edi) + movdqa %xmm5,80(%edi) + movdqa %xmm6,96(%edi) + movdqa %xmm7,112(%edi) + addl $128,%edi +3: + subl $128,%ecx + jae 2b + addl $128,%ecx + jz 6f + jmp 5f + SUPERALIGN_TEXT +4: + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + /*prefetchnta 128(%esi) 3dNOW */ + addl $64,%esi + movq %mm0,(%edi) + movq %mm1,8(%edi) + movq %mm2,16(%edi) + movq %mm3,24(%edi) + movq %mm4,32(%edi) + movq %mm5,40(%edi) + movq %mm6,48(%edi) + movq %mm7,56(%edi) + addl $64,%edi +5: + subl $64,%ecx + jae 4b + addl $64,%ecx + jz 6f + cld + rep + movsb +6: + MMX_RESTORE_BLOCK + ret + + /* + * GENERIC_BCOPY() - BACKWARDS COPY + * + * Don't bother using xmm optimizations, just stick with mmx. + */ + SUPERALIGN_TEXT +10: + addl %ecx,%edi + jmp 12f + + SUPERALIGN_TEXT +11: + movq -64(%esi),%mm0 + movq -56(%esi),%mm1 + movq -48(%esi),%mm2 + movq -40(%esi),%mm3 + movq -32(%esi),%mm4 + movq -24(%esi),%mm5 + movq -16(%esi),%mm6 + movq -8(%esi),%mm7 + /*prefetchnta -128(%esi)*/ + subl $64,%esi + movq %mm0,-64(%edi) + movq %mm1,-56(%edi) + movq %mm2,-48(%edi) + movq %mm3,-40(%edi) + movq %mm4,-32(%edi) + movq %mm5,-24(%edi) + movq %mm6,-16(%edi) + movq %mm7,-8(%edi) + subl $64,%edi +12: + subl $64,%ecx + jae 11b + addl $64,%ecx + jz 13f + decl %esi + decl %edi + std + rep + movsb + cld +13: + MMX_RESTORE_BLOCK + ret + diff --git a/sys/platform/pc32/i386/bzero.s b/sys/platform/pc32/i386/bzero.s new file mode 100644 index 0000000000..4b090dee8c --- /dev/null +++ b/sys/platform/pc32/i386/bzero.s @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2003 Matthew Dillon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/platform/pc32/i386/bzero.s,v 1.1 2004/04/29 17:24:58 dillon Exp $ + */ +/* + * void bzero(void *buf, u_int len) (arguments passed on stack) + */ + +#include "use_npx.h" + +#include +#include +#include +#include + +#include "assym.s" + + .text + +ENTRY(generic_bzero) + pushl %edi + subl %eax,%eax + movl 4+4(%esp),%edi + movl 8+4(%esp),%ecx + jmp 2f + SUPERALIGN_TEXT +1: + movl %eax,(%edi) + movl %eax,4(%edi) + addl $8,%edi +2: + subl $8,%ecx + jae 1b + addl $8,%ecx + jz 3f + cld + rep + stosb +3: + popl %edi + ret + +ENTRY(i686_pagezero) + pushl %edi + pushl %ebx + + movl 12(%esp), %edi + movl $1024, %ecx + cld + + ALIGN_TEXT +1: + xorl %eax, %eax + repe + scasl + jnz 2f + + popl %ebx + popl %edi + ret + + ALIGN_TEXT + +2: + incl %ecx + subl $4, %edi + + movl %ecx, %edx + cmpl $16, %ecx + + jge 3f + + movl %edi, %ebx + andl $0x3f, %ebx + shrl %ebx + shrl %ebx + movl $16, %ecx + subl %ebx, %ecx + +3: + subl %ecx, %edx + rep + stosl + + movl %edx, %ecx + testl %edx, %edx + jnz 1b + + popl %ebx + popl %edi + ret + diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 384cc7db4b..c4904685b1 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/platform/pc32/i386/genassym.c,v 1.36 2004/03/30 19:14:04 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/genassym.c,v 1.37 2004/04/29 17:24:58 dillon Exp $ */ #include @@ -202,6 +202,7 @@ ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); +ASSYM(GD_KERNEL_FPU_LOCK, offsetof(struct mdglobaldata, gd_kernel_fpu_lock)); ASSYM(GD_OTHER_CPUS, offsetof(struct mdglobaldata, gd_other_cpus)); ASSYM(GD_SS_EFLAGS, offsetof(struct mdglobaldata, gd_ss_eflags)); ASSYM(GD_CMAP1, offsetof(struct mdglobaldata, gd_CMAP1)); diff --git a/sys/platform/pc32/i386/globals.s b/sys/platform/pc32/i386/globals.s index 159410e6cd..f619bc971a 100644 --- a/sys/platform/pc32/i386/globals.s +++ b/sys/platform/pc32/i386/globals.s @@ -24,7 +24,7 @@ * SUCH DAMAGE. 
* * $FreeBSD: src/sys/i386/i386/globals.s,v 1.13.2.1 2000/05/16 06:58:06 dillon Exp $ - * $DragonFly: src/sys/platform/pc32/i386/globals.s,v 1.20 2004/02/17 19:38:53 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/globals.s,v 1.21 2004/04/29 17:24:58 dillon Exp $ */ #include @@ -69,6 +69,9 @@ .globl gd_currentldt .set gd_currentldt,globaldata + GD_CURRENTLDT + .globl gd_kernel_fpu_lock + .set gd_kernel_fpu_lock, globaldata + GD_KERNEL_FPU_LOCK + /* * The BSP version of these get setup in locore.s and pmap.c, while * the AP versions are setup in mp_machdep.c. diff --git a/sys/platform/pc32/i386/support.s b/sys/platform/pc32/i386/support.s index 452f7d6c94..4fcf21da0b 100644 --- a/sys/platform/pc32/i386/support.s +++ b/sys/platform/pc32/i386/support.s @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/support.s,v 1.67.2.5 2001/08/15 01:23:50 peter Exp $ - * $DragonFly: src/sys/platform/pc32/i386/support.s,v 1.10 2004/04/03 08:21:16 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/support.s,v 1.11 2004/04/29 17:24:58 dillon Exp $ */ #include "use_npx.h" @@ -46,344 +46,25 @@ #define IDXSHIFT 10 .data + + .globl memcpy_vector +memcpy_vector: + .long asm_generic_memcpy + .globl bcopy_vector bcopy_vector: - .long generic_bcopy + .long asm_generic_bcopy + .globl bzero bzero: .long generic_bzero - .globl copyin_vector -copyin_vector: - .long generic_copyin - .globl copyout_vector -copyout_vector: - .long generic_copyout + .globl ovbcopy_vector ovbcopy_vector: - .long generic_bcopy -#if defined(I586_CPU) && NNPX > 0 -kernel_fpu_lock: - .byte 0xfe - .space 3 -#endif + .long asm_generic_bcopy .text -/* - * bcopy family - * void bzero(void *buf, u_int len) - */ - -ENTRY(generic_bzero) - pushl %edi - movl 8(%esp),%edi - movl 12(%esp),%ecx - xorl %eax,%eax - shrl $2,%ecx - cld - rep - stosl - movl 12(%esp),%ecx - andl $3,%ecx - rep - stosb - popl %edi - ret - -#if defined(I486_CPU) -ENTRY(i486_bzero) - movl 4(%esp),%edx - movl 8(%esp),%ecx - xorl %eax,%eax -/* - * do 64 byte chunks first - * - * XXX this is probably over-unrolled at least for DX2's - */ -2: - cmpl $64,%ecx - jb 3f - movl %eax,(%edx) - movl %eax,4(%edx) - movl %eax,8(%edx) - movl %eax,12(%edx) - movl %eax,16(%edx) - movl %eax,20(%edx) - movl %eax,24(%edx) - movl %eax,28(%edx) - movl %eax,32(%edx) - movl %eax,36(%edx) - movl %eax,40(%edx) - movl %eax,44(%edx) - movl %eax,48(%edx) - movl %eax,52(%edx) - movl %eax,56(%edx) - movl %eax,60(%edx) - addl $64,%edx - subl $64,%ecx - jnz 2b - ret - -/* - * do 16 byte chunks - */ - SUPERALIGN_TEXT -3: - cmpl $16,%ecx - jb 4f - movl %eax,(%edx) - movl %eax,4(%edx) - movl %eax,8(%edx) - movl %eax,12(%edx) - addl $16,%edx - subl $16,%ecx - jnz 3b - ret - -/* - * do 4 byte chunks - */ - SUPERALIGN_TEXT -4: - cmpl $4,%ecx - jb 5f - movl %eax,(%edx) - addl $4,%edx - subl $4,%ecx - jnz 4b - ret - -/* - * do 1 byte chunks - * a jump table seems to be faster than a loop or more range reductions - * - * XXX need a const section for non-text - */ - .data -jtab: - .long do0 - .long do1 - .long do2 - .long do3 - - .text - SUPERALIGN_TEXT -5: - jmp *jtab(,%ecx,4) - - SUPERALIGN_TEXT -do3: - movw %ax,(%edx) - movb %al,2(%edx) - ret - - SUPERALIGN_TEXT -do2: - movw %ax,(%edx) - ret - - SUPERALIGN_TEXT -do1: - movb %al,(%edx) - ret - - SUPERALIGN_TEXT -do0: - ret -#endif - -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_bzero) - movl 4(%esp),%edx - movl 8(%esp),%ecx - - /* - * The FPU register method is twice as fast as the integer register - * method unless the target is in the L1 cache 
and we pre-allocate a - * cache line for it (then the integer register method is 4-5 times - * faster). However, we never pre-allocate cache lines, since that - * would make the integer method 25% or more slower for the common - * case when the target isn't in either the L1 cache or the L2 cache. - * Thus we normally use the FPU register method unless the overhead - * would be too large. - */ - cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ - jb intreg_i586_bzero - - /* - * The FPU registers may belong to an application or to fastmove() - * or to another invocation of bcopy() or ourself in a higher level - * interrupt or trap handler. Preserving the registers is - * complicated since we avoid it if possible at all levels. We - * want to localize the complications even when that increases them. - * Here the extra work involves preserving CR0_TS in TS. - * `npxthread != NULL' is supposed to be the condition that all the - * FPU resources belong to an application, but npxthread and CR0_TS - * aren't set atomically enough for this condition to work in - * interrupt handlers. - * - * Case 1: FPU registers belong to the application: we must preserve - * the registers if we use them, so we only use the FPU register - * method if the target size is large enough to amortize the extra - * overhead for preserving them. CR0_TS must be preserved although - * it is very likely to end up as set. - * - * Case 2: FPU registers belong to fastmove(): fastmove() currently - * makes the registers look like they belong to an application so - * that cpu_switch() and savectx() don't have to know about it, so - * this case reduces to case 1. - * - * Case 3: FPU registers belong to the kernel: don't use the FPU - * register method. This case is unlikely, and supporting it would - * be more complicated and might take too much stack. - * - * Case 4: FPU registers don't belong to anyone: the FPU registers - * don't need to be preserved, so we always use the FPU register - * method. CR0_TS must be preserved although it is very likely to - * always end up as clear. - */ - cmpl $0,PCPU(npxthread) - je i586_bz1 - cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ - jb intreg_i586_bzero - sarb $1,kernel_fpu_lock - jc intreg_i586_bzero - smsw %ax - clts - subl $108,%esp - fnsave 0(%esp) - jmp i586_bz2 - -i586_bz1: - sarb $1,kernel_fpu_lock - jc intreg_i586_bzero - smsw %ax - clts - fninit /* XXX should avoid needing this */ -i586_bz2: - fldz - - /* - * Align to an 8 byte boundary (misalignment in the main loop would - * cost a factor of >= 2). Avoid jumps (at little cost if it is - * already aligned) by always zeroing 8 bytes and using the part up - * to the _next_ alignment position. - */ - fstl 0(%edx) - addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ - addl $8,%edx - andl $~7,%edx - subl %edx,%ecx - - /* - * Similarly align `len' to a multiple of 8. - */ - fstl -8(%edx,%ecx) - decl %ecx - andl $~7,%ecx - - /* - * This wouldn't be any faster if it were unrolled, since the loop - * control instructions are much faster than the fstl and/or done - * in parallel with it so their overhead is insignificant. 
- */ -fpureg_i586_bzero_loop: - fstl 0(%edx) - addl $8,%edx - subl $8,%ecx - cmpl $8,%ecx - jae fpureg_i586_bzero_loop - - cmpl $0,PCPU(npxthread) - je i586_bz3 - frstor 0(%esp) - addl $108,%esp - lmsw %ax - movb $0xfe,kernel_fpu_lock - ret - -i586_bz3: - fstp %st(0) - lmsw %ax - movb $0xfe,kernel_fpu_lock - ret - -intreg_i586_bzero: - /* - * `rep stos' seems to be the best method in practice for small - * counts. Fancy methods usually take too long to start up due - * to cache and BTB misses. - */ - pushl %edi - movl %edx,%edi - xorl %eax,%eax - shrl $2,%ecx - cld - rep - stosl - movl 12(%esp),%ecx - andl $3,%ecx - jne 1f - popl %edi - ret - -1: - rep - stosb - popl %edi - ret -#endif /* I586_CPU && NNPX > 0 */ - -ENTRY(i686_pagezero) - pushl %edi - pushl %ebx - - movl 12(%esp), %edi - movl $1024, %ecx - cld - - ALIGN_TEXT -1: - xorl %eax, %eax - repe - scasl - jnz 2f - - popl %ebx - popl %edi - ret - - ALIGN_TEXT - -2: - incl %ecx - subl $4, %edi - - movl %ecx, %edx - cmpl $16, %ecx - - jge 3f - - movl %edi, %ebx - andl $0x3f, %ebx - shrl %ebx - shrl %ebx - movl $16, %ecx - subl %ebx, %ecx - -3: - subl %ecx, %edx - rep - stosl - - movl %edx, %ecx - testl %edx, %edx - jnz 1b - - popl %ebx - popl %edi - ret - /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi @@ -396,258 +77,75 @@ ENTRY(fillw) popl %edi ret -ENTRY(bcopyb) +/* + * void bcopy(const void *s, void *d, size_t count) + * + * Normal bcopy() vector, an optimized bcopy may be installed in + * bcopy_vector. + */ +ENTRY(bcopy) pushl %esi pushl %edi - movl 12(%esp),%esi - movl 16(%esp),%edi - movl 20(%esp),%ecx - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? */ - jb 1f - cld /* nope, copy forwards */ - rep - movsb - popl %edi - popl %esi - ret - - ALIGN_TEXT -1: - addl %ecx,%edi /* copy backwards. */ - addl %ecx,%esi - decl %edi - decl %esi - std - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call *bcopy_vector popl %edi popl %esi - cld ret -ENTRY(bcopy) - MEXITCOUNT - jmp *bcopy_vector - -ENTRY(ovbcopy) - MEXITCOUNT - jmp *ovbcopy_vector - /* - * generic_bcopy(src, dst, cnt) - * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 + * Generic (integer-only) bcopy() vector. */ ENTRY(generic_bcopy) - pushl %ebp /* debugging */ - movl %esp,%ebp pushl %esi pushl %edi - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ecx - - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? */ - jb 1f - - shrl $2,%ecx /* copy by 32-bit words */ - cld /* nope, copy forwards */ - rep - movsl - movl 24(%esp),%ecx - andl $3,%ecx /* any bytes left? */ - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call asm_generic_bcopy popl %edi popl %esi - popl %ebp ret - ALIGN_TEXT -1: - addl %ecx,%edi /* copy backwards */ - addl %ecx,%esi - decl %edi - decl %esi - andl $3,%ecx /* any fractional bytes? */ - std - rep - movsb - movl 24(%esp),%ecx /* copy remainder by 32-bit words */ - shrl $2,%ecx - subl $3,%esi - subl $3,%edi - rep - movsl - popl %edi - popl %esi - popl %ebp - cld - ret - -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_bcopy) +ENTRY(ovbcopy) pushl %esi pushl %edi - movl 12(%esp),%esi - movl 16(%esp),%edi - movl 20(%esp),%ecx - - movl %edi,%eax - subl %esi,%eax - cmpl %ecx,%eax /* overlapping && src < dst? 
*/ - jb 1f - - cmpl $1024,%ecx - jb small_i586_bcopy - - sarb $1,kernel_fpu_lock - jc small_i586_bcopy - cmpl $0,PCPU(npxthread) - je i586_bc1 - smsw %dx - clts - subl $108,%esp - fnsave 0(%esp) - jmp 4f - -i586_bc1: - smsw %dx - clts - fninit /* XXX should avoid needing this */ - - ALIGN_TEXT -4: - pushl %ecx -#define DCACHE_SIZE 8192 - cmpl $(DCACHE_SIZE-512)/2,%ecx - jbe 2f - movl $(DCACHE_SIZE-512)/2,%ecx -2: - subl %ecx,0(%esp) - cmpl $256,%ecx - jb 5f /* XXX should prefetch if %ecx >= 32 */ - pushl %esi - pushl %ecx - ALIGN_TEXT -3: - movl 0(%esi),%eax - movl 32(%esi),%eax - movl 64(%esi),%eax - movl 96(%esi),%eax - movl 128(%esi),%eax - movl 160(%esi),%eax - movl 192(%esi),%eax - movl 224(%esi),%eax - addl $256,%esi - subl $256,%ecx - cmpl $256,%ecx - jae 3b - popl %ecx - popl %esi -5: - ALIGN_TEXT -large_i586_bcopy_loop: - fildq 0(%esi) - fildq 8(%esi) - fildq 16(%esi) - fildq 24(%esi) - fildq 32(%esi) - fildq 40(%esi) - fildq 48(%esi) - fildq 56(%esi) - fistpq 56(%edi) - fistpq 48(%edi) - fistpq 40(%edi) - fistpq 32(%edi) - fistpq 24(%edi) - fistpq 16(%edi) - fistpq 8(%edi) - fistpq 0(%edi) - addl $64,%esi - addl $64,%edi - subl $64,%ecx - cmpl $64,%ecx - jae large_i586_bcopy_loop - popl %eax - addl %eax,%ecx - cmpl $64,%ecx - jae 4b - - cmpl $0,PCPU(npxthread) - je i586_bc2 - frstor 0(%esp) - addl $108,%esp -i586_bc2: - lmsw %dx - movb $0xfe,kernel_fpu_lock - -/* - * This is a duplicate of the main part of generic_bcopy. See the comments - * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and - * would mess up high resolution profiling. - */ - ALIGN_TEXT -small_i586_bcopy: - shrl $2,%ecx - cld - rep - movsl - movl 20(%esp),%ecx - andl $3,%ecx - rep - movsb + movl 4+8(%esp),%esi /* caddr_t from */ + movl 8+8(%esp),%edi /* caddr_t to */ + movl 12+8(%esp),%ecx /* size_t len */ + call *ovbcopy_vector popl %edi popl %esi ret - ALIGN_TEXT -1: - addl %ecx,%edi - addl %ecx,%esi - decl %edi - decl %esi - andl $3,%ecx - std - rep - movsb - movl 20(%esp),%ecx - shrl $2,%ecx - subl $3,%esi - subl $3,%edi - rep - movsl - popl %edi - popl %esi - cld - ret -#endif /* I586_CPU && NNPX > 0 */ - /* - * Note: memcpy does not support overlapping copies + * void *memcpy(void *d, const void *s, size_t count) + * + * Note: memcpy does not have to support overlapping copies. + * + * Note: (d, s) arguments reversed from bcopy, and memcpy() returns d + * while bcopy() returns void. */ ENTRY(memcpy) - pushl %edi pushl %esi - movl 12(%esp),%edi - movl 16(%esp),%esi - movl 20(%esp),%ecx - movl %edi,%eax - shrl $2,%ecx /* copy by 32-bit words */ - cld /* nope, copy forwards */ - rep - movsl - movl 20(%esp),%ecx - andl $3,%ecx /* any bytes left? */ - rep - movsb - popl %esi + pushl %edi + movl 4+8(%esp),%edi + movl 8+8(%esp),%esi + movl 12+8(%esp),%ecx + call *memcpy_vector + movl 4+8(%esp),%eax popl %edi + popl %esi ret +/* + * A stack-based on-fault routine is used for more complex PCB_ONFAULT + * situations (such as memcpy/bcopy/bzero). In this case the on-fault + * routine must be pushed on the stack. 
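+ *
+ * Sketch of the resulting control flow for a faulting copyout(), as set
+ * up by the code below (the trap handler resumes at PCB_ONFAULT exactly
+ * as it did with the old per-routine fault labels):
+ *
+ *	copyout:      sets PCB_ONFAULT = stack_onfault, pushes
+ *	              $copyout_fault, then calls *memcpy_vector
+ *	copy routine: pushes its own cleanup vector (mmx_onfault for the
+ *	              MMX/XMM paths) before touching user memory
+ *	on a fault:   execution resumes at stack_onfault; its 'ret' pops
+ *	              the copy routine's cleanup vector and jumps to it;
+ *	              that vector undoes its own state, discards the normal
+ *	              return address, and 'ret's again into copyout_fault,
+ *	              which returns EFAULT to the original caller.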
+ */ +stack_onfault: + ret /*****************************************************************************/ /* copyout and fubyte family */ @@ -671,19 +169,16 @@ ENTRY(memcpy) * copyout(from_kernel, to_user, len) - MP SAFE (if not I386_CPU) */ ENTRY(copyout) - MEXITCOUNT - jmp *copyout_vector - -ENTRY(generic_copyout) movl PCPU(curthread),%eax movl TD_PCB(%eax),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx + pushl $copyout_fault + movl $stack_onfault,PCB_ONFAULT(%eax) + movl 4+16(%esp),%esi + movl 8+16(%esp),%edi + movl 12+16(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout @@ -765,30 +260,24 @@ ENTRY(generic_copyout) jnz 1b /* check next page */ #endif /* I386_CPU */ - /* bcopy(%esi, %edi, %ebx) */ + /* + * Convert copyout to memcpy_vector(dest:%edi, src:%esi, conut:%ecx) + */ 3: movl %ebx,%ecx - -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -slow_copyout: -#endif - shrl $2,%ecx - cld - rep - movsl - movb %bl,%cl - andb $3,%cl - rep - movsb + call *memcpy_vector done_copyout: + /* + * non-error return + */ + addl $4,%esp + movl PCPU(curthread),%edx + xorl %eax,%eax + movl TD_PCB(%edx),%edx popl %ebx popl %edi popl %esi - xorl %eax,%eax - movl PCPU(curthread),%edx - movl TD_PCB(%edx),%edx movl %eax,PCB_ONFAULT(%edx) ret @@ -803,77 +292,20 @@ copyout_fault: movl $EFAULT,%eax ret -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_copyout) - /* - * Duplicated from generic_copyout. Could be done a bit better. - */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - testl %ebx,%ebx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. If 486 write protection - * is being used, this check is essential because we are in kernel - * mode so the h/w does not provide any protection against writing - * kernel addresses. - */ - - /* - * First, prevent address wrapping. - */ - movl %edi,%eax - addl %ebx,%eax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. - */ - cmpl $VM_MAXUSER_ADDRESS,%eax - ja copyout_fault - - /* bcopy(%esi, %edi, %ebx) */ -3: - movl %ebx,%ecx - /* - * End of duplicated code. 
- */ - - cmpl $1024,%ecx - jb slow_copyout - - pushl %ecx - call fastmove - addl $4,%esp - jmp done_copyout -#endif /* I586_CPU && NNPX > 0 */ - /* * copyin(from_user, to_kernel, len) - MP SAFE */ -ENTRY(copyin) - MEXITCOUNT - jmp *copyin_vector -ENTRY(generic_copyin) +ENTRY(copyin) movl PCPU(curthread),%eax movl TD_PCB(%eax),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ + pushl $copyin_fault + movl $stack_onfault,PCB_ONFAULT(%eax) + movl 4+12(%esp),%esi /* caddr_t from */ + movl 8+12(%esp),%edi /* caddr_t to */ + movl 12+12(%esp),%ecx /* size_t len */ /* * make sure address is valid @@ -884,264 +316,28 @@ ENTRY(generic_copyin) cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -slow_copyin: -#endif - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - cld - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb + /* + * Call memcpy(destination:%edi, source:%esi, bytes:%ecx) + */ + call *memcpy_vector -#if defined(I586_CPU) && NNPX > 0 - ALIGN_TEXT -done_copyin: -#endif - popl %edi - popl %esi - xorl %eax,%eax + /* + * return 0 (no error) + */ + addl $4,%esp movl PCPU(curthread),%edx + xorl %eax,%eax movl TD_PCB(%edx),%edx - movl %eax,PCB_ONFAULT(%edx) - ret - - ALIGN_TEXT -copyin_fault: popl %edi popl %esi - movl PCPU(curthread),%edx - movl TD_PCB(%edx),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax + movl %eax,PCB_ONFAULT(%edx) ret -#if defined(I586_CPU) && NNPX > 0 -ENTRY(i586_copyin) /* - * Duplicated from generic_copyin. Could be done a bit better. - */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ - - /* - * make sure address is valid + * return EFAULT */ - movl %esi,%edx - addl %ecx,%edx - jc copyin_fault - cmpl $VM_MAXUSER_ADDRESS,%edx - ja copyin_fault - /* - * End of duplicated code. - */ - - cmpl $1024,%ecx - jb slow_copyin - - pushl %ebx /* XXX prepare for fastmove_fault */ - pushl %ecx - call fastmove - addl $8,%esp - jmp done_copyin -#endif /* I586_CPU && NNPX > 0 */ - -#if defined(I586_CPU) && NNPX > 0 -/* fastmove(src, dst, len) - src in %esi - dst in %edi - len in %ecx XXX changed to on stack for profiling - uses %eax and %edx for tmp. storage - */ -/* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ -ENTRY(fastmove) - pushl %ebp - movl %esp,%ebp - subl $PCB_SAVE87_SIZE+3*4,%esp - - movl 8(%ebp),%ecx - cmpl $63,%ecx - jbe fastmove_tail - - testl $7,%esi /* check if src addr is multiple of 8 */ - jnz fastmove_tail - - testl $7,%edi /* check if dst addr is multiple of 8 */ - jnz fastmove_tail - -/* if (npxthread != NULL) { */ - cmpl $0,PCPU(npxthread) - je 6f -/* fnsave(&curpcb->pcb_savefpu); */ - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - fnsave PCB_SAVEFPU(%eax) -/* npxthread = NULL; */ - movl $0,PCPU(npxthread) -/* } */ -6: -/* now we own the FPU. */ - -/* - * The process' FP state is saved in the pcb, but if we get - * switched, the cpu_switch() will store our FP state in the - * pcb. It should be possible to avoid all the copying for - * this, e.g., by setting a flag to tell cpu_switch() to - * save the state somewhere else. 
- */ -/* tmp = curpcb->pcb_savefpu; */ - movl %ecx,-12(%ebp) - movl %esi,-8(%ebp) - movl %edi,-4(%ebp) - movl %esp,%edi - movl PCPU(curthread),%esi - movl TD_PCB(%esi),%esi - addl $PCB_SAVEFPU,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - movl -12(%ebp),%ecx - movl -8(%ebp),%esi - movl -4(%ebp),%edi -/* stop_emulating(); */ - clts -/* npxthread = curthread; */ - movl PCPU(curthread),%eax - movl %eax,PCPU(npxthread) - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $fastmove_fault,PCB_ONFAULT(%eax) -4: - movl %ecx,-12(%ebp) - cmpl $1792,%ecx - jbe 2f - movl $1792,%ecx -2: - subl %ecx,-12(%ebp) - cmpl $256,%ecx - jb 5f - movl %ecx,-8(%ebp) - movl %esi,-4(%ebp) - ALIGN_TEXT -3: - movl 0(%esi),%eax - movl 32(%esi),%eax - movl 64(%esi),%eax - movl 96(%esi),%eax - movl 128(%esi),%eax - movl 160(%esi),%eax - movl 192(%esi),%eax - movl 224(%esi),%eax - addl $256,%esi - subl $256,%ecx - cmpl $256,%ecx - jae 3b - movl -8(%ebp),%ecx - movl -4(%ebp),%esi -5: - ALIGN_TEXT -fastmove_loop: - fildq 0(%esi) - fildq 8(%esi) - fildq 16(%esi) - fildq 24(%esi) - fildq 32(%esi) - fildq 40(%esi) - fildq 48(%esi) - fildq 56(%esi) - fistpq 56(%edi) - fistpq 48(%edi) - fistpq 40(%edi) - fistpq 32(%edi) - fistpq 24(%edi) - fistpq 16(%edi) - fistpq 8(%edi) - fistpq 0(%edi) - addl $-64,%ecx - addl $64,%esi - addl $64,%edi - cmpl $63,%ecx - ja fastmove_loop - movl -12(%ebp),%eax - addl %eax,%ecx - cmpl $64,%ecx - jae 4b - -/* curpcb->pcb_savefpu = tmp; */ - movl %ecx,-12(%ebp) - movl %esi,-8(%ebp) - movl %edi,-4(%ebp) - movl PCPU(curthread),%edi - movl TD_PCB(%edi),%edi - addl $PCB_SAVEFPU,%edi - movl %esp,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - movl -12(%ebp),%ecx - movl -8(%ebp),%esi - movl -4(%ebp),%edi - -/* start_emulating(); */ - smsw %ax - orb $CR0_TS,%al - lmsw %ax -/* npxthread = NULL; */ - movl $0,PCPU(npxthread) - ALIGN_TEXT -fastmove_tail: - movl PCPU(curthread),%eax - movl TD_PCB(%eax),%eax - movl $fastmove_tail_fault,PCB_ONFAULT(%eax) - - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - cld - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb - - movl %ebp,%esp - popl %ebp - ret - - ALIGN_TEXT -fastmove_fault: - movl PCPU(curthread),%edi - movl TD_PCB(%edi),%edi - addl $PCB_SAVEFPU,%edi - movl %esp,%esi - cld - movl $PCB_SAVE87_SIZE>>2,%ecx - rep - movsl - - smsw %ax - orb $CR0_TS,%al - lmsw %ax - movl $0,PCPU(npxthread) - -fastmove_tail_fault: - movl %ebp,%esp - popl %ebp - addl $8,%esp - popl %ebx +copyin_fault: popl %edi popl %esi movl PCPU(curthread),%edx @@ -1149,7 +345,6 @@ fastmove_tail_fault: movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret -#endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} - MP SAFE diff --git a/sys/platform/pc32/i386/swtch.s b/sys/platform/pc32/i386/swtch.s index 4266ca888f..8aaafba7d7 100644 --- a/sys/platform/pc32/i386/swtch.s +++ b/sys/platform/pc32/i386/swtch.s @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.89.2.10 2003/01/23 03:36:24 ps Exp $ - * $DragonFly: src/sys/platform/pc32/i386/swtch.s,v 1.31 2004/03/28 08:03:05 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/swtch.s,v 1.32 2004/04/29 17:24:58 dillon Exp $ */ #include "use_npx.h" @@ -136,11 +136,11 @@ ENTRY(cpu_heavy_switch) movl %eax,PCB_DR0(%edx) 1: +#if NNPX > 0 /* * Save the FP state if we have used the FP. Note that calling * npxsave will NULL out PCPU(npxthread). 
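+	 * The same save is now also performed in cpu_lwkt_switch below,
+	 * since an LWKT thread can own the FP unit while it is in the
+	 * middle of an optimized bzero/bcopy/memcpy.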
*/ -#if NNPX > 0 cmpl %ebx,PCPU(npxthread) jne 1f addl $PCB_SAVEFPU,%edx @@ -531,13 +531,34 @@ ENTRY(cpu_kthread_restore) * YYY BGL, SPL */ ENTRY(cpu_lwkt_switch) - movl 4(%esp),%eax pushl %ebp /* note: GDB hacked to locate ebp relative to td_sp */ pushl %ebx + movl PCPU(curthread),%ebx pushl %esi pushl %edi pushfl - movl PCPU(curthread),%ebx + /* warning: adjust movl into %eax below if you change the pushes */ + +#if NNPX > 0 + /* + * Save the FP state if we have used the FP. Note that calling + * npxsave will NULL out PCPU(npxthread). + * + * We have to deal with the FP state for LWKT threads in case they + * happen to get preempted or block while doing an optimized + * bzero/bcopy/memcpy. + */ + cmpl %ebx,PCPU(npxthread) + jne 1f + movl TD_PCB(%ebx),%edx /* EDX = PCB */ + addl $PCB_SAVEFPU,%edx + pushl %edx + call npxsave /* do it in a big C function */ + addl $4,%esp /* EAX, ECX, EDX trashed */ +1: +#endif /* NNPX > 0 */ + + movl 4+20(%esp),%eax /* switch to this thread */ pushl $cpu_lwkt_restore movl %esp,TD_SP(%ebx) movl %eax,PCPU(curthread) diff --git a/sys/platform/pc32/include/globaldata.h b/sys/platform/pc32/include/globaldata.h index 744014d338..3f7779226b 100644 --- a/sys/platform/pc32/include/globaldata.h +++ b/sys/platform/pc32/include/globaldata.h @@ -28,7 +28,7 @@ * should not include this file. * * $FreeBSD: src/sys/i386/include/globaldata.h,v 1.11.2.1 2000/05/16 06:58:10 dillon Exp $ - * $DragonFly: src/sys/platform/pc32/include/globaldata.h,v 1.23 2004/02/21 06:37:07 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/include/globaldata.h,v 1.24 2004/04/29 17:25:00 dillon Exp $ */ #ifndef _MACHINE_GLOBALDATA_H_ @@ -67,6 +67,7 @@ struct mdglobaldata { struct segment_descriptor *gd_tss_gdt; struct thread *gd_npxthread; struct i386tss gd_common_tss; + int gd_kernel_fpu_lock; /* fast bcopy/zero cpu lock */ int gd_fpending; /* fast interrupt pending */ int gd_ipending; /* normal interrupt pending */ int gd_idelayed; /* delayed software ints */ diff --git a/sys/platform/pc32/include/md_var.h b/sys/platform/pc32/include/md_var.h index c5de38122c..5d705327e1 100644 --- a/sys/platform/pc32/include/md_var.h +++ b/sys/platform/pc32/include/md_var.h @@ -27,7 +27,7 @@ * SUCH DAMAGE. 
* * $FreeBSD: src/sys/i386/include/md_var.h,v 1.35.2.4 2003/01/22 20:14:53 jhb Exp $ - * $DragonFly: src/sys/platform/pc32/include/md_var.h,v 1.12 2003/11/03 17:11:19 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/include/md_var.h,v 1.13 2004/04/29 17:25:00 dillon Exp $ */ #ifndef _MACHINE_MD_VAR_H_ @@ -39,7 +39,8 @@ extern vm_paddr_t Maxmem; extern u_int atdevbase; /* offset in virtual memory of ISA io mem */ -extern void (*bcopy_vector) (const void *from, void *to, size_t len); +extern void **bcopy_vector; +extern void **memcpy_vector; extern int busdma_swi_pending; extern int (*copyin_vector) (const void *udaddr, void *kaddr, size_t len); @@ -59,7 +60,7 @@ extern int need_pre_dma_flush; extern int need_post_dma_flush; #endif extern int nfs_diskless_valid; -extern void (*ovbcopy_vector) (const void *from, void *to, size_t len); +extern void **ovbcopy_vector; extern char sigcode[]; extern int szsigcode; @@ -98,11 +99,19 @@ int fill_fpregs (struct proc *, struct fpreg *); int fill_regs (struct proc *p, struct reg *regs); int fill_dbregs (struct proc *p, struct dbreg *dbregs); void fillw (int /*u_short*/ pat, void *base, size_t cnt); +#if 0 void i486_bzero (volatile void *buf, size_t len); void i586_bzero (volatile void *buf, size_t len); void i586_bcopy (const void *from, void *to, size_t len); int i586_copyin (const void *udaddr, void *kaddr, size_t len); int i586_copyout (const void *kaddr, void *udaddr, size_t len); +#endif +void asm_generic_memcpy(void); +void asm_mmx_memcpy(void); +void asm_xmm_memcpy(void); +void asm_generic_bcopy(void); +void asm_mmx_bcopy(void); +void asm_xmm_bcopy(void); void i686_pagezero (void *addr); void init_AMD_Elan_sc520(void); int is_physical_memory (vm_offset_t addr); diff --git a/sys/platform/pc32/isa/npx.c b/sys/platform/pc32/isa/npx.c index ba108c4fa8..c83fe222b6 100644 --- a/sys/platform/pc32/isa/npx.c +++ b/sys/platform/pc32/isa/npx.c @@ -33,7 +33,7 @@ * * from: @(#)npx.c 7.2 (Berkeley) 5/12/91 * $FreeBSD: src/sys/i386/isa/npx.c,v 1.80.2.3 2001/10/20 19:04:38 tegge Exp $ - * $DragonFly: src/sys/platform/pc32/isa/npx.c,v 1.13 2003/08/26 21:42:19 rob Exp $ + * $DragonFly: src/sys/platform/pc32/isa/npx.c,v 1.14 2004/04/29 17:25:02 dillon Exp $ */ #include "opt_cpu.h" @@ -146,10 +146,6 @@ static int npx_probe (device_t dev); static int npx_probe1 (device_t dev); static void fpusave (union savefpu *); static void fpurstor (union savefpu *); -#ifdef I586_CPU -static long timezero (const char *funcname, - void (*func)(volatile void *buf, size_t len)); -#endif /* I586_CPU */ int hw_float; /* XXX currently just alias for npx_exists */ @@ -434,6 +430,9 @@ npx_attach(dev) device_t dev; { int flags; +#if defined(I586_CPU) || defined(I686_CPU) + int mmxopt = 1; +#endif if (resource_int_value("npx", 0, "flags", &flags) != 0) flags = 0; @@ -470,7 +469,40 @@ npx_attach(dev) } npxinit(__INITIAL_NPXCW__); -#ifdef I586_CPU +#if defined(I586_CPU) || defined(I686_CPU) + /* + * The asm_mmx_*() routines actually use XMM as well, so only + * enable them if we have SSE2 and are using FXSR (fxsave/fxrstore). 
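+ * Either optimization can be disabled by setting the kern.mmxopt
+ * tunable to 0, or per-routine with the NPX_DISABLE_I586_OPTIMIZED_BCOPY
+ * and NPX_DISABLE_I586_OPTIMIZED_BZERO flags checked below; in that case
+ * the generic integer vectors installed in support.s remain in effect.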
+ */ + TUNABLE_INT_FETCH("kern.mmxopt", &mmxopt); + if ((cpu_feature & CPUID_MMX) && (cpu_feature & CPUID_SSE) && + (cpu_feature & CPUID_SSE2) && + npx_ex16 && npx_exists && mmxopt && cpu_fxsr + ) { + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY) == 0) { + bcopy_vector = (void **)asm_xmm_bcopy; + ovbcopy_vector = (void **)asm_xmm_bcopy; + memcpy_vector = (void **)asm_xmm_memcpy; + printf("Using XMM optimized bcopy/copyin/copyout\n"); + } + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BZERO) == 0) { + /* XXX */ + } + } else if ((cpu_feature & CPUID_MMX) && (cpu_feature & CPUID_SSE) && + npx_ex16 && npx_exists && mmxopt && cpu_fxsr + ) { + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY) == 0) { + bcopy_vector = (void **)asm_mmx_bcopy; + ovbcopy_vector = (void **)asm_mmx_bcopy; + memcpy_vector = (void **)asm_mmx_memcpy; + printf("Using MMX optimized bcopy/copyin/copyout\n"); + } + if ((flags & NPX_DISABLE_I586_OPTIMIZED_BZERO) == 0) { + /* XXX */ + } + } +#endif +#if 0 if (cpu_class == CPUCLASS_586 && npx_ex16 && npx_exists && timezero("i586_bzero()", i586_bzero) < timezero("bzero()", bzero) * 4 / 5) { @@ -486,7 +518,6 @@ npx_attach(dev) } } #endif - return (0); /* XXX unused */ } @@ -943,36 +974,6 @@ fpurstor(addr) frstor(addr); } -#ifdef I586_CPU -static long -timezero(funcname, func) - const char *funcname; - void (*func) (volatile void *buf, size_t len); - -{ - void *buf; -#define BUFSIZE 1000000 - long usec; - struct timeval finish, start; - - buf = malloc(BUFSIZE, M_TEMP, M_NOWAIT); - if (buf == NULL) - return (BUFSIZE); - microtime(&start); - (*func)(buf, BUFSIZE); - microtime(&finish); - usec = 1000000 * (finish.tv_sec - start.tv_sec) + - finish.tv_usec - start.tv_usec; - if (usec <= 0) - usec = 1; - if (bootverbose) - printf("%s bandwidth = %ld bytes/sec\n", - funcname, (long)(BUFSIZE * (int64_t)1000000 / usec)); - free(buf, M_TEMP); - return (usec); -} -#endif /* I586_CPU */ - static device_method_t npx_methods[] = { /* Device interface */ DEVMETHOD(device_identify, npx_identify), diff --git a/sys/platform/vkernel/i386/genassym.c b/sys/platform/vkernel/i386/genassym.c index 50ad652e12..7331a26378 100644 --- a/sys/platform/vkernel/i386/genassym.c +++ b/sys/platform/vkernel/i386/genassym.c @@ -35,7 +35,7 @@ * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD: src/sys/i386/i386/genassym.c,v 1.86.2.3 2002/03/03 05:42:49 nyan Exp $ - * $DragonFly: src/sys/platform/vkernel/i386/genassym.c,v 1.36 2004/03/30 19:14:04 dillon Exp $ + * $DragonFly: src/sys/platform/vkernel/i386/genassym.c,v 1.37 2004/04/29 17:24:58 dillon Exp $ */ #include @@ -202,6 +202,7 @@ ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); +ASSYM(GD_KERNEL_FPU_LOCK, offsetof(struct mdglobaldata, gd_kernel_fpu_lock)); ASSYM(GD_OTHER_CPUS, offsetof(struct mdglobaldata, gd_other_cpus)); ASSYM(GD_SS_EFLAGS, offsetof(struct mdglobaldata, gd_ss_eflags)); ASSYM(GD_CMAP1, offsetof(struct mdglobaldata, gd_CMAP1)); -- 2.41.0