From: Matthew Dillon Date: Sun, 6 May 2018 04:52:37 +0000 (-0700) Subject: kernel - Refactor bcmp, bcopy, bzero, memset X-Git-Tag: v5.5.0~664 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/5d48b3120a651eee088cced1b4cffd8a264722c6 kernel - Refactor bcmp, bcopy, bzero, memset * For now continue to use stosq/stosb, movsq/movsb, cmpsq/cmpsb sequences which are well optimized on AMD and Intel. Do not just use the '*b' string op. While this is optimized on Intel it is not optimized on AMD. * Note that two string ops in a row result in a serious pessimization. To fix this, for now, conditionalize the movsb, stosb, or cmpsb op so it is only executed when the remaining count is non-zero. That is, assume nominal 8-byte alignment. * Refactor pagezero() to use a movq/addq/jne sequence. This is significantly faster than movsq on AMD and only just very slightly slower than movsq on Intel. * Also use the above adjusted kernel code in libc for these functions, with minor modifications. Since we are copying the code wholesale, replace the copyright for the related files in libc. * Refactor libc's memset() to replicate the data byte across all 64 bits of the fill word and then use code similar to bzero(). Reported-by: mjg_ (info on pessimizations) --- diff --git a/lib/libc/x86_64/string/bcmp.S b/lib/libc/x86_64/string/bcmp.S index d9bf5e969c..0077025ba5 100644 --- a/lib/libc/x86_64/string/bcmp.S +++ b/lib/libc/x86_64/string/bcmp.S @@ -1,26 +1,51 @@ -/* - * $NetBSD: bcmp.S,v 1.1 2001/06/19 00:25:04 fvdl Exp $ - * $FreeBSD: src/lib/libc/amd64/string/bcmp.S,v 1.3 2008/11/02 01:10:54 peter Exp $ +/*- + * Copyright (c) 1993 The Regents of the University of California. + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 2008 The DragonFly Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/ - #include ENTRY(bcmp) - cld /* set compare direction forward */ - - movq %rdx,%rcx /* compare by words */ - shrq $3,%rcx + movq %rdx,%rcx + shrq $3,%rcx + cld /* compare forwards */ repe cmpsq - jne L1 + jne 1f - movq %rdx,%rcx /* compare remainder by bytes */ - andq $7,%rcx + movq %rdx,%rcx + andq $7,%rcx + je 1f repe cmpsb -L1: - setne %al - movsbl %al,%eax +1: + setne %al + movsbl %al,%eax ret END(bcmp) diff --git a/lib/libc/x86_64/string/bcopy.S b/lib/libc/x86_64/string/bcopy.S index bd78cbb5b7..f6eca4fb23 100644 --- a/lib/libc/x86_64/string/bcopy.S +++ b/lib/libc/x86_64/string/bcopy.S @@ -1,9 +1,9 @@ /*- - * Copyright (c) 1990 The Regents of the University of California. + * Copyright (c) 1993 The Regents of the University of California. + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 2008 The DragonFly Project. * All rights reserved. * - * This code is derived from locore.s. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -27,18 +27,12 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $NetBSD: bcopy.S,v 1.2 2003/08/07 16:42:36 agc Exp $ - * $FreeBSD: src/lib/libc/amd64/string/bcopy.S,v 1.3 2008/11/02 01:10:54 peter Exp $ */ - #include - /* - * (ov)bcopy (src,dst,cnt) - * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 - */ - +/* + * bcopy(src:%rdi, dst:%rsi, cnt:%rdx) + */ #ifdef MEMCOPY ENTRY(memcpy) #else @@ -53,37 +47,44 @@ ENTRY(bcopy) #else xchgq %rdi,%rsi #endif - movq %rdx,%rcx - movq %rdi,%r8 - subq %rsi,%r8 - cmpq %rcx,%r8 /* overlapping? */ - jb 1f - cld /* nope, copy forwards. */ - shrq $3,%rcx /* copy by words */ - rep - movsq - movq %rdx,%rcx - andq $7,%rcx /* any bytes left? */ - rep - movsb - ret -1: - addq %rcx,%rdi /* copy backwards. 
*/ - addq %rcx,%rsi - std - andq $7,%rcx /* any fractional bytes? */ - decq %rdi - decq %rsi - rep - movsb - movq %rdx,%rcx /* copy remainder by words */ - shrq $3,%rcx - subq $7,%rsi - subq $7,%rdi - rep - movsq - cld - ret + movq %rdx,%rcx + + movq %rdi,%r11 + subq %rsi,%r11 + cmpq %rcx,%r11 /* overlapping && src < dst? */ + jb 2f + + cld /* nope, copy forwards */ + shrq $3,%rcx /* copy by 64-bit words */ + rep + movsq + movq %rdx,%rcx + andq $7,%rcx /* any bytes left? */ + jnz 1f + ret +1: rep + movsb + ret + + .align 4 +2: + addq %rcx,%rdi /* copy backwards */ + addq %rcx,%rsi + std + decq %rdi + decq %rsi + andq $7,%rcx /* any fractional bytes? */ + jz 3f + rep + movsb +3: movq %rdx,%rcx /* copy by 64-bit words */ + shrq $3,%rcx + subq $7,%rsi + subq $7,%rdi + rep + movsq + cld + ret #ifdef MEMCOPY END(memcpy) #else diff --git a/lib/libc/x86_64/string/bzero.S b/lib/libc/x86_64/string/bzero.S index 4e7eb10b7a..e4b03abe28 100644 --- a/lib/libc/x86_64/string/bzero.S +++ b/lib/libc/x86_64/string/bzero.S @@ -1,43 +1,48 @@ -/* - * Written by J.T. Conklin . - * Public domain. - * Adapted for NetBSD/x86_64 by Frank van der Linden +/*- + * Copyright (c) 1993 The Regents of the University of California. + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 2008 The DragonFly Project. + * All rights reserved. * - * $NetBSD: bzero.S,v 1.2 2003/07/26 19:24:38 salo Exp $ - * $FreeBSD: src/lib/libc/amd64/string/bzero.S,v 1.3 2008/11/02 01:10:54 peter Exp $ + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ - #include ENTRY(bzero) - cld /* set fill direction forward */ - xorq %rax,%rax /* set fill data to 0 */ - - /* - * if the string is too short, it's really not worth the overhead - * of aligning to word boundries, etc. So we jump to a plain - * unaligned set. - */ - cmpq $16,%rsi - jb L1 - - movq %rdi,%rcx /* compute misalignment */ - negq %rcx - andq $7,%rcx - subq %rcx,%rsi - rep /* zero until word aligned */ - stosb - - movq %rsi,%rcx /* zero by words */ - shrq $3,%rcx - andq $7,%rsi + movq %rsi,%rcx + xorl %eax,%eax + shrq $3,%rcx + cld rep stosq - -L1: movq %rsi,%rcx /* zero remainder by bytes */ - rep + movq %rsi,%rcx + andq $7,%rcx + jnz 1f + ret +1: rep stosb - ret END(bzero) diff --git a/lib/libc/x86_64/string/memset.S b/lib/libc/x86_64/string/memset.S index 13a7b310a1..7c4dd38ab1 100644 --- a/lib/libc/x86_64/string/memset.S +++ b/lib/libc/x86_64/string/memset.S @@ -1,60 +1,73 @@ /* - * Written by J.T. Conklin . - * Public domain. 
- * Adapted for NetBSD/x86_64 by Frank van der Linden + * Copyright (c) 2018 The DragonFly Project. All rights reserved. * - * $NetBSD: memset.S,v 1.3 2004/02/26 20:50:06 drochner Exp $ - * $FreeBSD: src/lib/libc/amd64/string/memset.S,v 1.2 2008/11/02 01:10:54 peter Exp $ + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/ - #include +/* + * %rdi:ptr %rsi:data %rdx:bytes + */ ENTRY(memset) - movq %rsi,%rax - andq $0xff,%rax - movq %rdx,%rcx - movq %rdi,%r11 - - cld /* set fill direction forward */ - - /* - * if the string is too short, it's really not worth the overhead - * of aligning to word boundries, etc. So we jump to a plain - * unaligned set. - */ - cmpq $0x0f,%rcx - jle L1 - - movb %al,%ah /* copy char to all bytes in word */ - movl %eax,%edx - sall $16,%eax - orl %edx,%eax + movq %rdi,%r8 /* Save base pointer for return */ + movq %rdx,%rcx /* Setup count in %rcx */ + movq %rsi,%rax /* Setup data in %rax for byte store */ + cld + cmpq $15,%rdx /* trivial case */ + jle 2f + testq %rax,%rax /* trivial case (0) */ + jz 1f - movl %eax,%edx - salq $32,%rax - orq %rdx,%rax - - movq %rdi,%rdx /* compute misalignment */ - negq %rdx - andq $7,%rdx - movq %rcx,%r8 - subq %rdx,%r8 - - movq %rdx,%rcx /* set until word aligned */ - rep - stosb - - movq %r8,%rcx - shrq $3,%rcx /* set by words */ + andq $0xFF,%rax /* shift 0:7 to all bytes in %rax */ + movq %rax,%rsi + salq $8,%rax + orq %rsi,%rax + orq %rax,%rsi + salq $16,%rsi + orq %rsi,%rax + orq %rax,%rsi + salq $32,%rsi + orq %rsi,%rax +1: + shrq $3,%rcx rep stosq - - movq %r8,%rcx /* set remainder by bytes */ - andq $7,%rcx -L1: rep + movq %rdx,%rcx + andq $7,%rcx + jnz 2f + movq %r8,%rax + ret +2: rep stosb - movq %r11,%rax - + movq %r8,%rax ret END(memset) diff --git a/sys/platform/pc64/x86_64/support.s b/sys/platform/pc64/x86_64/support.s index cbb0bac88d..3c88f8bd31 100644 --- a/sys/platform/pc64/x86_64/support.s +++ b/sys/platform/pc64/x86_64/support.s @@ -58,7 +58,9 @@ ENTRY(bzero) stosq movq %rsi,%rcx andq $7,%rcx - rep + jnz 1f + ret +1: rep stosb ret END(bzero) @@ -72,6 +74,8 @@ END(bzero) * Do not use non-termportal instructions here as we do not know the caller's * intent. 
*/ +#if 0 + ENTRY(pagezero) movq $PAGE_SIZE>>3,%rcx xorl %eax,%eax @@ -81,6 +85,19 @@ ENTRY(pagezero) ret END(pagezero) +#endif + +ENTRY(pagezero) + addq $4096,%rdi + movq $-4096,%rax + ALIGN_TEXT +1: + movq $0,(%rdi,%rax,1) + addq $8,%rax + jne 1b + ret +END(pagezero) + /* * bcmp(ptr:%rdi, ptr:%rsi, bytes:%rdx) */ @@ -94,6 +111,7 @@ ENTRY(bcmp) movq %rdx,%rcx andq $7,%rcx + je 1f repe cmpsb 1: @@ -114,29 +132,32 @@ ENTRY(bcopy) movq %rdi,%rax subq %rsi,%rax cmpq %rcx,%rax /* overlapping && src < dst? */ - jb 1f + jb 2f - shrq $3,%rcx /* copy by 64-bit words */ cld /* nope, copy forwards */ + shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ - rep + jnz 1f + ret +1: rep movsb ret ALIGN_TEXT -1: +2: addq %rcx,%rdi /* copy backwards */ addq %rcx,%rsi + std decq %rdi decq %rsi andq $7,%rcx /* any fractional bytes? */ - std + jz 3f rep movsb - movq %rdx,%rcx /* copy by 32-bit words */ +3: movq %rdx,%rcx /* copy by 64-bit words */ shrq $3,%rcx subq $7,%rsi subq $7,%rdi @@ -172,7 +193,10 @@ ENTRY(memcpy) movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ - rep + jnz 1f + movq %r8,%rax + ret +1: rep movsb movq %r8,%rax ret