Add mem1 and mem2 .... memory copying and zeroing test suites, making it
[dragonfly.git] / test / sysperf / memcpy.S
1
2                 /*
3                  * memcpy.S
4                  *
5                  * AMD64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
6                  *
7                  *      NT stands for 'non-temporal', which basically means
8                  *      'bypass L1 cache on write'.  Write bandwidth is
9                  *      effectively reduced to the L2 cache bandwidth but
10                  *      the L1 cache will not be wiped out by the copy.
11                  *
12                  *      DO NOT MIX 'nt' and standard writes!  Your performance
13                  *      will go poof.
14                  *
15                  * PREFETCH[NTA,T0,T1,T2]
16                  *
17                  *      These instructions prefetch a cache line (typically
18                  *      128 bytes).  'NT' means 'non-temporal', which bypasses
19                  *      the L1 cache if the data is not already in the L1 
20                  *      cache.  HOWEVER, using PREFETCHNT can put a slow memory
21                  *      op in the cpu's memory request queue if a L1 or L2
22                  *      miss occurs, and it can stall an L1-cache-hit access
23                  *      for a small but noticeable period of time, so it is
24                  *      a good idea not to put a memory op just after a 
25                  *      prefetchnta instruction.
26                  *
27                  *      You can get better L2 bandwidth using prefetchnt but
28                  *      it will not be much more than prefetcht0 and 
29                  *      'prefetcht0' will give you better cache-miss
30                  *      bandwidth.
31                  *
32                  *      The prefetch has to be done far enough ahead to do
33                  *      some good, but it only has a significant effect when
34                  *      it is able to move data from L2 to L1.  Prefetching
35                  *      from main memory does not have a significant effect
36                  *      during a copy or zeroing operation because main
37                  *      memory bandwidth is already saturated.
38                  *
39                  * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
40                  */
41                 .text
42                 .globl  docopy1
43                 .globl  docopy2
44                 .globl  docopy3
45                 .globl  docopy4
46                 .globl  docopy5
47                 .globl  docopy6
48                 .globl  docopy7
49                 .globl  fpcleanup
50
	.p2align 4,0x90
/*
 * docopy1(src, dst, bytes) - baseline forward copy.
 *
 * IA32 cdecl: arguments are on the stack; after the four pushes below
 * (16 bytes) plus the return address, they sit at 4/8/12 + 16(%esp).
 *
 * Uses the classic "cld; rep movsl" string-copy idiom as the reference
 * point the other docopyN variants are benchmarked against.
 *
 * NOTE(review): bytes is assumed to be a multiple of 4 -- the low two
 * bits are silently discarded by the shrl.  %ebx is saved/restored even
 * though this variant never uses it (kept symmetric with docopy3/4).
 */
docopy1:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src */
	movl	8+16(%esp),%edi		/* %edi = dst */
	movl	12+16(%esp),%ecx	/* %ecx = byte count */
	shrl	$2,%ecx			/* bytes -> dwords */
	cld				/* forward direction */
	rep
	movsl
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
70
71                 .p2align 4,0x90
72 docopy2:
73                 pushl   %esi
74                 pushl   %edi
75                 pushl   %ecx
76                 pushl   %ebx
77
78                 movl    4+16(%esp),%esi
79                 movl    8+16(%esp),%edi
80                 movl    12+16(%esp),%ecx
81                 addl    %ecx,%esi
82                 addl    %ecx,%edi
83                 shrl    $2,%ecx
84                 std
85                 rep
86                 movsl
87                 popl    %ebx
88                 popl    %ecx
89                 popl    %edi
90                 popl    %esi
91                 ret
92
	.p2align 4,0x90
/*
 * docopy3(src, dst, bytes) - 32 bytes/iteration integer copy with
 * software-pipelined loads and stores plus a prefetcht0 hint.
 *
 * Loads and stores are deliberately interleaved (each store uses a
 * value loaded 2-3 instructions earlier) to hide load latency; this
 * exact scheduling is the thing being benchmarked -- do not reorder.
 *
 * NOTE(review): assumes bytes is a nonzero multiple of 32; otherwise
 * %ecx never reaches exactly zero and the loop runs away.
 * Clobbers %eax/%edx (caller-saved); %ebx is saved on the stack.
 */
docopy3:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src */
	movl	8+16(%esp),%edi		/* %edi = dst */
	movl	12+16(%esp),%ecx	/* %ecx = bytes remaining */

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%edx
	movl	%eax,(%edi)
	movl	12(%esi),%eax
	movl	%ebx,4(%edi)
	movl	16(%esi),%ebx
	movl	%edx,8(%edi)
	movl	20(%esi),%edx
	movl	%eax,12(%edi)
	movl	24(%esi),%eax
	movl	%ebx,16(%edi)
	movl	28(%esi),%ebx
	movl	%edx,20(%edi)
	prefetcht0 96(%esi)		/* pull the next lines toward L1 */
	subl	$32,%ecx
	movl	%eax,24(%edi)
	addl	$32,%esi
	movl	%ebx,28(%edi)
	addl	$32,%edi

	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
135
	.p2align 4,0x90
/*
 * docopy4(src, dst, bytes) - minimal 8 bytes/iteration integer copy
 * with prefetcht0, the small-unroll counterpart to docopy3.
 *
 * NOTE(review): assumes bytes is a nonzero multiple of 8; otherwise
 * the subl never lands exactly on zero and the loop runs away.
 * %ebx is saved because the loop uses it as the second scratch reg.
 */
docopy4:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src */
	movl	8+16(%esp),%edi		/* %edi = dst */
	movl	12+16(%esp),%ecx	/* %ecx = bytes remaining */

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	addl	$8,%esi
	prefetcht0 64(%esi)		/* hint the line ~64B ahead */
	subl	$8,%ecx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	addl	$8,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
165
	.p2align 4,0x90
/*
 * docopy5(src, dst, bytes) - MMX copy, 64 bytes/iteration through
 * %mm0-%mm7 with a prefetchnta hint (see the header comment on NT
 * prefetch trade-offs).
 *
 * All eight loads are issued before any store, then all eight stores,
 * so reads and writes each hit memory in long same-direction bursts.
 *
 * NOTE(review): assumes bytes is a nonzero multiple of 64.
 * Uses MMX without emms -- the harness must call fpcleanup() (below)
 * before any FP code runs again.  %ebx is saved but unused here.
 */
docopy5:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src */
	movl	8+16(%esp),%edi		/* %edi = dst */
	movl	12+16(%esp),%ecx	/* %ecx = bytes remaining */

	.p2align 4,0x90
1:
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	movq	16(%esi),%mm2
	movq	24(%esi),%mm3
	movq	32(%esi),%mm4
	movq	40(%esi),%mm5
	movq	48(%esi),%mm6
	movq	56(%esi),%mm7
	prefetchnta 128(%esi)		/* non-temporal: don't pollute L1 */
	subl	$64,%ecx
	addl	$64,%esi
	movq	%mm0,(%edi)
	movq	%mm1,8(%edi)
	movq	%mm2,16(%edi)
	movq	%mm3,24(%edi)
	movq	%mm4,32(%edi)
	movq	%mm5,40(%edi)
	movq	%mm6,48(%edi)
	movq	%mm7,56(%edi)
	addl	$64,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
207
	.p2align 4,0x90
/*
 * docopy6(src, dst, bytes) - MMX copy with NON-TEMPORAL stores
 * (movntq), 16 bytes/iteration.  Writes bypass the caches entirely,
 * trading peak write bandwidth for an unpolluted L1 (see header).
 *
 * %eax holds the constant stride 16 so the loop body uses reg-reg
 * adds/subs instead of immediates.
 *
 * NOTE(review): assumes bytes is a nonzero multiple of 16.
 * Uses MMX without emms -- the harness must call fpcleanup() after.
 * Per the header warning, do not mix these nt stores with normal
 * stores to the same region.  %ebx is saved but unused here.
 */
docopy6:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src */
	movl	8+16(%esp),%edi		/* %edi = dst */
	movl	12+16(%esp),%ecx	/* %ecx = bytes remaining */
	movl	$16,%eax		/* stride per iteration */

	.p2align 4,0x90
1:
	prefetcht0 96(%esi)
	subl	%eax,%ecx
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	addl	%eax,%esi
	movntq	%mm0,(%edi)		/* cache-bypassing store */
	movntq	%mm1,8(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
238
	.p2align 4,0x90
/*
 * docopy7(src, dst, bytes) - SSE2 copy, 128 bytes/iteration: aligned
 * loads (movdqa) into %xmm0-%xmm7, non-temporal stores (movntdq) out.
 *
 * NOTE(review): movdqa faults on unaligned addresses, so both src and
 * dst must be 16-byte aligned; bytes must be a nonzero multiple of 128.
 * No prefetch here -- the 8-wide load burst itself is the read pattern
 * under test.  %ebx is saved but unused; %eax holds the stride 128.
 */
docopy7:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi		/* %esi = src (16-byte aligned) */
	movl	8+16(%esp),%edi		/* %edi = dst (16-byte aligned) */
	movl	12+16(%esp),%ecx	/* %ecx = bytes remaining */
	movl	$128,%eax		/* stride per iteration */

	.p2align 4,0x90
1:
	movdqa	(%esi),%xmm0
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	subl	%eax,%ecx
	addl	%eax,%esi
	movntdq	 %xmm0,(%edi)		/* cache-bypassing stores */
	movntdq	 %xmm1,16(%edi)
	movntdq	 %xmm2,32(%edi)
	movntdq	 %xmm3,48(%edi)
	movntdq	 %xmm4,64(%edi)
	movntdq	 %xmm5,80(%edi)
	movntdq	 %xmm6,96(%edi)
	movntdq	 %xmm7,112(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
280
	.p2align 4,0x90
/*
 * fpcleanup() - reinitialize the x87/MMX state after the MMX copy
 * variants (docopy5/docopy6), which deliberately omit emms from the
 * timed loop.  fninit resets the FPU without checking for pending
 * exceptions, which is fine for a benchmark harness.
 */
fpcleanup:
	fninit
	ret
285