Add mem1 and mem2 .... memory copying and zeroing test suites, making it
fairly easy for anyone to test various copying and zeroing memory algorithms
and compare them against other algorithms.

author		Matthew Dillon <dillon@dragonflybsd.org>
		Thu, 29 Apr 2004 16:14:53 +0000 (16:14 +0000)
committer	Matthew Dillon <dillon@dragonflybsd.org>
		Thu, 29 Apr 2004 16:14:53 +0000 (16:14 +0000)

test/sysperf/Makefile
test/sysperf/memcpy.S [new file with mode: 0644]
test/sysperf/memcpy.c [new file with mode: 0644]
test/sysperf/memzero.S [new file with mode: 0644]
test/sysperf/memzero.c [new file with mode: 0644]
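
The built tests take a byte count with an optional k/m/g suffix, e.g. /tmp/mem1 256k; the count must be positive and a multiple of 128. Each candidate routine is an extern with a common signature, registered through a single test_using() call, so plugging in a new algorithm touches only a couple of lines. A hypothetical docopy8 (the name is illustrative; it is not part of this commit) would be wired up in memcpy.c as:

    /* declaration alongside the existing docopy1..docopy7 externs */
    extern void docopy8(const void *s, void *d, size_t bytes);

    /* registration next to the existing calls in main() */
    test_using("docopy8", buf, bytes, docopy8);

with the routine itself added to memcpy.S.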

diff --git a/test/sysperf/Makefile b/test/sysperf/Makefile
index 08754c4..14578fc 100644
@@ -1,5 +1,5 @@
 #
-# $DragonFly: src/test/sysperf/Makefile,v 1.7 2004/04/10 21:36:52 dillon Exp $
+# $DragonFly: src/test/sysperf/Makefile,v 1.8 2004/04/29 16:14:53 dillon Exp $
 #
 
 TARGETS=/tmp/sc1 /tmp/sc2 /tmp/sc3 /tmp/sc4 \
@@ -10,7 +10,8 @@ TARGETS=/tmp/sc1 /tmp/sc2 /tmp/sc3 /tmp/sc4 \
        /tmp/sw1 /tmp/sw2 /tmp/sw3 \
        /tmp/mbw1 \
        /tmp/upc1 \
-       /tmp/exec1 /tmp/exec2
+       /tmp/exec1 /tmp/exec2 \
+       /tmp/mem1 /tmp/mem2
 
 CFLAGS= -O2 -g -I/usr/src/sys
 
@@ -79,5 +80,11 @@ all: $(TARGETS)
 /tmp/exec2: exec1.c blib.c
        $(CC) $(CFLAGS) exec1.c blib.c -o /tmp/exec2
 
+/tmp/mem1: memcpy.c memcpy.S blib.c
+       $(CC) $(CFLAGS) memcpy.c memcpy.S blib.c -o /tmp/mem1
+
+/tmp/mem2: memzero.c memzero.S blib.c
+       $(CC) $(CFLAGS) memzero.c memzero.S blib.c -o /tmp/mem2
+
 clean:
        rm -f $(TARGETS)
diff --git a/test/sysperf/memcpy.S b/test/sysperf/memcpy.S
new file mode 100644
index 0000000..4acc521
--- /dev/null
+++ b/test/sysperf/memcpy.S
@@ -0,0 +1,290 @@
+
+               /*
+                * memcpy.S
+                *
+                * AMD64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
+                *
+                *      NT stands for 'non-temporal', which basically means
+                *      'bypass L1 cache on write'.  Write bandwidth is
+                *      effectively reduced to the L2 cache bandwidth but
+                *      the L1 cache will not be wiped out by the copy.
+                *
+                *      DO NOT MIX 'nt' and standard writes!  Your performance
+                *      will go poof.
+                *
+                * PREFETCH[NTA,T0,T1,T2]
+                *
+                *      These instructions prefetch a cache line (typically
+                *      128 bytes).  'NT' means 'non-temporal', which bypasses
+                *      the L1 cache if the data is not already in the L1 
+                *      cache.  HOWEVER, using PREFETCHNTA can put a slow memory
+                *      op in the cpu's memory request queue if a L1 or L2
+                *      miss occurs, and it can stall an L1-cache-hit access
+                *      for a small but noticeable period of time, so it is
+                *      a good idea not to put a memory op just after a 
+                *      prefetchnta instruction.
+                *
+                *      You can get better L2 bandwidth using prefetchnta but
+                *      it will not be much more than prefetcht0, and
+                *      'prefetcht0' will give you better cache-miss
+                *      bandwidth.
+                *
+                *      The prefetch has to be done far enough ahead to do
+                *      some good, but it only has a significant effect when
+                *      it is able to move data from L2 to L1.  Prefetching
+                *      from main memory does not have a significant effect
+                *      during a copy or zeroing operation because main
+                *      memory bandwidth is already saturated.
+                *
+                * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
+                */
+               .text
+               .globl  docopy1
+               .globl  docopy2
+               .globl  docopy3
+               .globl  docopy4
+               .globl  docopy5
+               .globl  docopy6
+               .globl  docopy7
+               .globl  fpcleanup
+
+               .p2align 4,0x90
+docopy1:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+               shrl    $2,%ecx
+               cld
+               rep
+               movsl
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy2:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+               addl    %ecx,%esi
+               addl    %ecx,%edi
+               subl    $4,%esi         /* last dword, not one past the end */
+               subl    $4,%edi
+               shrl    $2,%ecx
+               std
+               rep
+               movsl
+               cld                     /* restore direction flag for the caller */
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy3:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+
+               .p2align 4,0x90
+1:
+               movl    (%esi),%eax
+               movl    4(%esi),%ebx
+               movl    8(%esi),%edx
+               movl    %eax,(%edi)
+               movl    12(%esi),%eax
+               movl    %ebx,4(%edi)
+               movl    16(%esi),%ebx
+               movl    %edx,8(%edi)
+               movl    20(%esi),%edx
+               movl    %eax,12(%edi)
+               movl    24(%esi),%eax
+               movl    %ebx,16(%edi)
+               movl    28(%esi),%ebx
+               movl    %edx,20(%edi)
+               prefetcht0 96(%esi)
+               subl    $32,%ecx
+               movl    %eax,24(%edi)
+               addl    $32,%esi
+               movl    %ebx,28(%edi)
+               addl    $32,%edi
+
+               testl   %ecx,%ecx
+               jnz     1b
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy4:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+
+               .p2align 4,0x90
+1:
+               movl    (%esi),%eax
+               movl    4(%esi),%ebx
+               addl    $8,%esi
+               prefetcht0 64(%esi)
+               subl    $8,%ecx
+               movl    %eax,(%edi)
+               movl    %ebx,4(%edi)
+               addl    $8,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy5:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+
+               .p2align 4,0x90
+1:
+               movq    (%esi),%mm0
+               movq    8(%esi),%mm1
+               movq    16(%esi),%mm2
+               movq    24(%esi),%mm3
+               movq    32(%esi),%mm4
+               movq    40(%esi),%mm5
+               movq    48(%esi),%mm6
+               movq    56(%esi),%mm7
+               prefetchnta 128(%esi)
+               subl    $64,%ecx
+               addl    $64,%esi
+               movq    %mm0,(%edi)
+               movq    %mm1,8(%edi)
+               movq    %mm2,16(%edi)
+               movq    %mm3,24(%edi)
+               movq    %mm4,32(%edi)
+               movq    %mm5,40(%edi)
+               movq    %mm6,48(%edi)
+               movq    %mm7,56(%edi)
+               addl    $64,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy6:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+               movl    $16,%eax
+
+               .p2align 4,0x90
+1:
+               prefetcht0 96(%esi)
+               subl    %eax,%ecx
+               movq    (%esi),%mm0
+               movq    8(%esi),%mm1
+               addl    %eax,%esi
+               movntq  %mm0,(%edi)
+               movntq  %mm1,8(%edi)
+               addl    %eax,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+               sfence                  /* drain the non-temporal stores */
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+docopy7:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%esi
+               movl    8+16(%esp),%edi
+               movl    12+16(%esp),%ecx
+               movl    $128,%eax
+
+               .p2align 4,0x90
+1:
+               movdqa  (%esi),%xmm0
+               movdqa  16(%esi),%xmm1
+               movdqa  32(%esi),%xmm2
+               movdqa  48(%esi),%xmm3
+               movdqa  64(%esi),%xmm4
+               movdqa  80(%esi),%xmm5
+               movdqa  96(%esi),%xmm6
+               movdqa  112(%esi),%xmm7
+               subl    %eax,%ecx
+               addl    %eax,%esi
+               movntdq  %xmm0,(%edi)
+               movntdq  %xmm1,16(%edi)
+               movntdq  %xmm2,32(%edi)
+               movntdq  %xmm3,48(%edi)
+               movntdq  %xmm4,64(%edi)
+               movntdq  %xmm5,80(%edi)
+               movntdq  %xmm6,96(%edi)
+               movntdq  %xmm7,112(%edi)
+               addl    %eax,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+               sfence                  /* drain the non-temporal stores */
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+fpcleanup:
+               fninit
+               ret
+
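
The MOVNTQ/MOVNTDQ and PREFETCHNTA discussion in the comment block above maps directly onto compiler intrinsics. As a rough C sketch of the same non-temporal pattern (illustrative only, not part of this commit; it assumes SSE2, 16-byte-aligned pointers, and a length that is a multiple of 16):

    #include <emmintrin.h>      /* SSE2: _mm_load_si128, _mm_stream_si128 */
    #include <stddef.h>

    /*
     * Copy with non-temporal stores so the destination does not wipe
     * out the L1 cache, prefetching the source a cache line ahead.
     */
    static void
    copy_nt(void *dst, const void *src, size_t bytes)
    {
        const __m128i *s = src;
        __m128i *d = dst;
        size_t i;

        for (i = 0; i < bytes / 16; ++i) {
            _mm_prefetch((const char *)(s + i) + 128, _MM_HINT_NTA);
            _mm_stream_si128(d + i, _mm_load_si128(s + i));
        }
        _mm_sfence();   /* drain the weakly-ordered NT stores */
    }

This mirrors the structure of docopy7 above; note that mixing NT stores with ordinary stores to the same lines defeats the point, as the comment block warns.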
diff --git a/test/sysperf/memcpy.c b/test/sysperf/memcpy.c
new file mode 100644
index 0000000..997bc9a
--- /dev/null
+++ b/test/sysperf/memcpy.c
@@ -0,0 +1,99 @@
+/*
+ * memcpy.c
+ *
+ * $DragonFly: src/test/sysperf/memcpy.c,v 1.1 2004/04/29 16:14:53 dillon Exp $
+ */
+
+#include "blib.h"
+
+int glob[16384];
+
+void test_using(const char *ctl, char *buf, int bytes, void (*copyf)(const void *s1, void *d, size_t bytes));
+
+extern void docopy1(const void *s, void *d, size_t bytes);
+extern void docopy2(const void *s, void *d, size_t bytes);
+extern void docopy3(const void *s, void *d, size_t bytes);
+extern void docopy4(const void *s, void *d, size_t bytes);
+extern void docopy5(const void *s, void *d, size_t bytes);
+extern void docopy6(const void *s, void *d, size_t bytes);
+extern void docopy7(const void *s, void *d, size_t bytes);
+extern void fpcleanup(void);
+
+int
+main(int ac, char **av)
+{
+    int bytes;
+    char *ptr;
+    char *buf;
+
+    if (ac == 1) {
+       fprintf(stderr, "%s bytes\n", av[0]);
+       exit(1);
+    }
+
+    bytes = strtol(av[1], &ptr, 0);
+    switch(*ptr) {
+    case 'k':
+    case 'K':
+       bytes *= 1024;
+       break;
+    case 'm':
+    case 'M':
+       bytes *= 1024 * 1024;
+       break;
+    case 'g':
+    case 'G':
+       bytes *= 1024 * 1024 * 1024;
+       break;
+    case 0:
+       break;
+    default:
+       fprintf(stderr, "suffix '%s' not understood\n", ptr);
+       exit(1);
+    }
+    if (bytes <= 0 || (bytes & 127)) {
+       fprintf(stderr, "# of bytes must be a multiple of 128\n");
+       exit(1);
+    }
+    buf = mmap(NULL, bytes * 2, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);
+    if (buf == MAP_FAILED) {
+       perror("mmap/buffer");
+       exit(1);
+    }
+    bzero(buf, bytes * 2);
+
+    test_using("bcopy", buf, bytes, bcopy);
+    test_using("docopy1", buf, bytes, docopy1);
+    test_using("docopy2", buf, bytes, docopy2);
+    test_using("docopy3", buf, bytes, docopy3);
+    test_using("docopy4", buf, bytes, docopy4);
+    test_using("docopy5", buf, bytes, docopy5);
+    test_using("docopy6", buf, bytes, docopy6);
+    test_using("docopy7", buf, bytes, docopy7);
+    return(0);
+}
+
+void
+test_using(const char *ctl, char *buf, int bytes, void (*copyf)(const void *s1, void *d, size_t bytes))
+{
+    int i;
+    int loops;
+    long long us;
+
+    start_timing();
+    for (i = 0; (i & 31) || stop_timing(0, NULL) == 0; ++i) {
+       copyf(buf, buf + bytes, bytes);
+    }
+
+    loops = i * 2;
+    start_timing();
+    for (i = loops - 1; i >= 0; --i) {
+       copyf(buf, buf + bytes, bytes);
+    }
+    fpcleanup();
+    stop_timing(loops, ctl);
+    us = get_timing();
+    printf("%s %d %5.2f MBytes/sec\n", ctl, bytes, 
+       (double)loops * (double)bytes / (double)us);
+}
+
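
blib.c is not part of this diff, so the timing interface is visible only through its call sites: start_timing() arms a timer, stop_timing(loops, name) returns zero until enough wall time has elapsed (the first loop above spins until it flips, to size the measured run), and get_timing() reports elapsed microseconds. A minimal sketch consistent with those call sites (an assumption about the real helpers, which may also print per-test results) could be:

    #include <sys/time.h>
    #include <stddef.h>

    static struct timeval tv_start;

    void
    start_timing(void)
    {
        gettimeofday(&tv_start, NULL);
    }

    static long long
    elapsed_us(void)
    {
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return ((long long)(tv.tv_sec - tv_start.tv_sec) * 1000000LL +
                (tv.tv_usec - tv_start.tv_usec));
    }

    /* non-zero once enough time has accumulated for a stable measurement */
    int
    stop_timing(int loops, const char *name)
    {
        return (elapsed_us() > 1000000LL);
    }

    long long
    get_timing(void)
    {
        return (elapsed_us());
    }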
diff --git a/test/sysperf/memzero.S b/test/sysperf/memzero.S
new file mode 100644
index 0000000..e70923d
--- /dev/null
+++ b/test/sysperf/memzero.S
@@ -0,0 +1,218 @@
+
+               /*
+                * memzero.S
+                *
+                * $DragonFly: src/test/sysperf/memzero.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
+                */
+               .text
+               .globl  dozero1
+               .globl  dozero2
+               .globl  dozero3
+               .globl  dozero4
+               .globl  dozero5
+               .globl  dozero6
+               .globl  dozero7
+               .globl  fpcleanup
+
+               .p2align 4,0x90
+dozero1:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               shrl    $2,%ecx
+               subl    %eax,%eax
+               cld
+               rep
+               stosl
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero2:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               addl    %ecx,%edi
+               subl    $4,%edi         /* last dword, not one past the end */
+               shrl    $2,%ecx
+               subl    %eax,%eax
+               std
+               rep
+               stosl
+               cld                     /* restore direction flag for the caller */
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero3:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               movl    $8,%edx
+               subl    %eax,%eax
+               .p2align 4,0x90
+1:
+               subl    %edx,%ecx
+               movl    %eax,(%edi)
+               movl    %eax,4(%edi)
+               addl    %edx,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero4:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               subl    %eax,%eax
+               .p2align 4,0x90
+1:
+               subl    $16,%ecx
+               movnti  %eax,0(%edi)
+               movnti  %eax,4(%edi)
+               movnti  %eax,8(%edi)
+               movnti  %eax,12(%edi)
+               addl    $16,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+               sfence                  /* drain the non-temporal stores */
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero5:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+
+               subl    $108,%esp
+               fnsave  0(%esp)
+               fninit
+               pxor    %mm0,%mm0       /* fldz would only zero one register; */
+               pxor    %mm1,%mm1       /* clear all eight explicitly */
+               pxor    %mm2,%mm2
+               pxor    %mm3,%mm3
+               pxor    %mm4,%mm4
+               pxor    %mm5,%mm5
+               pxor    %mm6,%mm6
+               pxor    %mm7,%mm7
+               .p2align 4,0x90
+1:
+               subl    $64,%ecx
+               movq    %mm0,(%edi)
+               movq    %mm1,8(%edi)
+               movq    %mm2,16(%edi)
+               movq    %mm3,24(%edi)
+               movq    %mm4,32(%edi)
+               movq    %mm5,40(%edi)
+               movq    %mm6,48(%edi)
+               movq    %mm7,56(%edi)
+               addl    $64,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+
+               frstor  0(%esp)
+               addl    $108,%esp
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero6:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               movl    $16,%eax
+               pxor    %mm0,%mm0       /* zero the source registers */
+               pxor    %mm1,%mm1
+               .p2align 4,0x90
+1:
+               subl    %eax,%ecx
+               movq    %mm0,(%edi)
+               movq    %mm1,8(%edi)
+               addl    %eax,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+dozero7:
+               pushl   %esi
+               pushl   %edi
+               pushl   %ecx
+               pushl   %ebx
+
+               movl    4+16(%esp),%edi
+               movl    8+16(%esp),%ecx
+               movl    $32,%eax
+               pxor    %xmm0,%xmm0     /* zero the source registers */
+               pxor    %xmm1,%xmm1
+               .p2align 4,0x90
+1:
+               subl    %eax,%ecx
+               movntdq %xmm0,(%edi)
+               movntdq %xmm1,16(%edi)
+               addl    %eax,%edi
+               testl   %ecx,%ecx
+               jnz     1b
+               sfence
+
+               popl    %ebx
+               popl    %ecx
+               popl    %edi
+               popl    %esi
+               ret
+
+               .p2align 4,0x90
+fpcleanup:
+               fninit
+               ret
+
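
dozero4's MOVNTI loop also has a direct intrinsic counterpart. As a purely illustrative sketch (not part of this commit), assuming SSE2 and a length that is a multiple of 4:

    #include <emmintrin.h>      /* SSE2: _mm_stream_si32 */
    #include <stddef.h>

    /* Zero a buffer with 32-bit non-temporal stores (MOVNTI). */
    static void
    zero_nt(void *dst, size_t bytes)
    {
        int *d = dst;
        size_t i;

        for (i = 0; i < bytes / 4; ++i)
            _mm_stream_si32(d + i, 0);
        _mm_sfence();   /* drain the write-combining buffers */
    }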
diff --git a/test/sysperf/memzero.c b/test/sysperf/memzero.c
new file mode 100644
index 0000000..7731402
--- /dev/null
+++ b/test/sysperf/memzero.c
@@ -0,0 +1,99 @@
+/*
+ * memzero.c
+ *
+ * $DragonFly: src/test/sysperf/memzero.c,v 1.1 2004/04/29 16:14:53 dillon Exp $
+ */
+
+#include "blib.h"
+
+int glob[16384];
+
+void test_using(const char *ctl, char *buf, int bytes, void (*zerof)(void *d, size_t bytes));
+
+extern void dozero1(void *d, size_t bytes);
+extern void dozero2(void *d, size_t bytes);
+extern void dozero3(void *d, size_t bytes);
+extern void dozero4(void *d, size_t bytes);
+extern void dozero5(void *d, size_t bytes);
+extern void dozero6(void *d, size_t bytes);
+extern void dozero7(void *d, size_t bytes);
+extern void fpcleanup(void);
+
+int
+main(int ac, char **av)
+{
+    int bytes;
+    char *ptr;
+    char *buf;
+
+    if (ac == 1) {
+       fprintf(stderr, "%s bytes\n", av[0]);
+       exit(1);
+    }
+
+    bytes = strtol(av[1], &ptr, 0);
+    switch(*ptr) {
+    case 'k':
+    case 'K':
+       bytes *= 1024;
+       break;
+    case 'm':
+    case 'M':
+       bytes *= 1024 * 1024;
+       break;
+    case 'g':
+    case 'G':
+       bytes *= 1024 * 1024 * 1024;
+       break;
+    case 0:
+       break;
+    default:
+       fprintf(stderr, "suffix '%s' not understood\n", ptr);
+       exit(1);
+    }
+    if (bytes <= 0 || (bytes & 127)) {
+       fprintf(stderr, "# of bytes must be a multiple of 128\n");
+       exit(1);
+    }
+
+    buf = mmap(NULL, bytes * 2, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);
+    if (buf == MAP_FAILED) {
+       perror("mmap/buffer");
+       exit(1);
+    }
+    bzero(buf, bytes * 2);
+
+    test_using("bzero", buf, bytes, (void *)bzero);
+    test_using("dozero1", buf, bytes, dozero1);
+    test_using("dozero2", buf, bytes, dozero2);
+    test_using("dozero3", buf, bytes, dozero3);
+    test_using("dozero4", buf, bytes, dozero4);
+    test_using("dozero5", buf, bytes, dozero5);
+    test_using("dozero6", buf, bytes, dozero6);
+    test_using("dozero7", buf, bytes, dozero7);
+    return(0);
+}
+
+void
+test_using(const char *ctl, char *buf, int bytes, void (*zerof)(void *d, size_t bytes))
+{
+    int i;
+    int loops;
+    long long us;
+
+    start_timing();
+    for (i = 0; (i & 31) || stop_timing(0, NULL) == 0; ++i) {
+       zerof(buf, bytes);
+    }
+
+    loops = i * 2;
+    start_timing();
+    for (i = loops - 1; i >= 0; --i) {
+       zerof(buf, bytes);
+    }
+    fpcleanup();
+    stop_timing(loops, ctl);
+    us = get_timing();
+    printf("%s %d %5.2f MBytes/sec\n", ctl, bytes, 
+       (double)loops * (double)bytes / (double)us);
+}