rename amd64 architecture to x86_64
[dragonfly.git] / test / sysperf / memcpy.S
CommitLineData
ab4bc20f
MD
1
2 /*
3 * memcpy.S
4 *
c1543a89 5 * x86_64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
ab4bc20f
MD
6 *
7 * NT stands for 'non-temporal', which basically means
8 * 'bypass L1 cache on write'. Write bandwidth is
9 * effectively reduced to the L2 cache bandwidth but
10 * the L1 cache will not be wiped out by the copy.
11 *
12 * DO NOT MIX 'nt' and standard writes! Your performance
13 * will go poof.
14 *
15 * PREFETCH[NTA,T0,T1,T2]
16 *
17 * These instructions prefetch a cache line (typically
18 * 128 bytes). 'NT' means 'non-temporal', which bypasses
19 * the L1 cache if the data is not already in the L1
20 * cache. HOWEVER, using PREFETCHNTA can put a slow memory
21 * op in the cpu's memory request queue if a L1 or L2
22 * miss occurs, and it can stall an L1-cache-hit access
23 * for a small but noticeable period of time, so it is
24 * a good idea not to put a memory op just after a
25 * prefetchnta instruction.
26 *
27 * You can get better L2 bandwidth using prefetchnta but
28 * it will not be much more than prefetcht0 and
29 * 'prefetcht0' will give you better cache-miss
30 * bandwidth.
31 *
32 * The prefetch has to be done far enough ahead to do
33 * some good, but it only has a significant effect when
34 * it is able to move data from L2 to L1. Prefetching
35 * from main memory does not have a significant effect
36 * during a copy or zeroing operation because main
37 * memory bandwidth is already saturated.
38 *
39 * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
40 */
41 .text
42 .globl docopy1
43 .globl docopy2
44 .globl docopy3
45 .globl docopy4
46 .globl docopy5
47 .globl docopy6
48 .globl docopy7
49 .globl fpcleanup
50
/*
 * docopy1(src, dst, bytes)
 *
 * Forward copy using the string instruction 'rep movsl'.
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address       -> %esi
 *	 8(%esp)  dst    destination address  -> %edi
 *	12(%esp)  bytes  byte count           -> %ecx
 *
 * bytes / 4 dwords are moved; the (bytes & 3) trailing bytes are not
 * copied.  %esi, %edi, %ecx and %ebx are saved and restored.  The
 * direction flag is cleared and stays clear on return.
 */
	.p2align 4,0x90
docopy1:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	/* four saved registers (16 bytes) now sit below the return address */
	cld				/* ascending addresses */
	movl	28(%esp),%ecx		/* bytes */
	movl	24(%esp),%edi		/* dst */
	movl	20(%esp),%esi		/* src */
	shrl	$2,%ecx			/* bytes -> dword count */
	rep movsl

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
70
/*
 * docopy2(src, dst, bytes)
 *
 * Backward (descending address) copy using 'std; rep movsl'.
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address
 *	 8(%esp)  dst    destination address
 *	12(%esp)  bytes  byte count
 *
 * bytes / 4 dwords are moved, starting with the dword that ends at
 * src + bytes and working down.  When bytes is not a multiple of 4 the
 * leading (bytes & 3) bytes of the buffer are not copied.
 *
 * %esi, %edi, %ecx and %ebx are saved and restored.
 *
 * Two fixes relative to the earlier version of this routine:
 *  - With DF set, movsl transfers the dword AT (%esi)/(%edi) and then
 *    subtracts 4, so the pointers must start at base + bytes - 4.
 *    Starting at base + bytes read and wrote one dword past the end
 *    of each buffer and never wrote the first dword.
 *  - The direction flag is cleared again before returning; the i386
 *    calling convention expects DF = 0 on function return, and code
 *    in the caller that uses string instructions relies on it.
 */
	.p2align 4,0x90
docopy2:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	leal	-4(%esi,%ecx),%esi	/* -> last dword of source */
	leal	-4(%edi,%ecx),%edi	/* -> last dword of destination */
	shrl	$2,%ecx			/* bytes -> dword count */
	std				/* descending addresses */
	rep movsl
	cld				/* restore DF = 0 for the caller */

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
92
/*
 * docopy3(src, dst, bytes)
 *
 * Copy with plain 32-bit integer loads and stores, 32 bytes per loop
 * iteration, rotating through %eax/%ebx/%edx so loads and stores are
 * interleaved.  A prefetcht0 is issued 96 bytes ahead of the chunk
 * being copied (prefetches do not fault, so running the hint past the
 * end of the source is harmless).
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address
 *	 8(%esp)  dst    destination address
 *	12(%esp)  bytes  byte count
 *
 * The byte count is rounded down to a multiple of 32 and the loop is
 * skipped when nothing is left.  The loop only terminates when the
 * remaining count reaches exactly zero, so without this a count of 0
 * or a count that is not a multiple of 32 would run far past both
 * buffers.  Trailing (bytes & 31) bytes are not copied.
 *
 * Clobbers %eax and %edx.  %esi, %edi, %ecx and %ebx are saved and
 * restored.
 */
	.p2align 4,0x90
docopy3:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	andl	$-32,%ecx		/* whole 32-byte chunks only */
	jz	2f			/* nothing to copy */

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%edx
	movl	%eax,(%edi)
	movl	12(%esi),%eax
	movl	%ebx,4(%edi)
	movl	16(%esi),%ebx
	movl	%edx,8(%edi)
	movl	20(%esi),%edx
	movl	%eax,12(%edi)
	movl	24(%esi),%eax
	movl	%ebx,16(%edi)
	movl	28(%esi),%ebx
	movl	%edx,20(%edi)
	prefetcht0 96(%esi)
	subl	$32,%ecx
	movl	%eax,24(%edi)
	addl	$32,%esi
	movl	%ebx,28(%edi)
	addl	$32,%edi

	testl	%ecx,%ecx		/* the addl's above overwrote subl's flags */
	jnz	1b
2:
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
135
/*
 * docopy4(src, dst, bytes)
 *
 * Copy with 32-bit integer loads and stores, 8 bytes per loop
 * iteration.  A prefetcht0 is issued after the source pointer has been
 * advanced, i.e. 72 bytes ahead of the chunk being copied (prefetches
 * do not fault, so running the hint past the end of the source is
 * harmless).
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address
 *	 8(%esp)  dst    destination address
 *	12(%esp)  bytes  byte count
 *
 * The byte count is rounded down to a multiple of 8 and the loop is
 * skipped when nothing is left.  The loop only terminates when the
 * remaining count reaches exactly zero, so without this a count of 0
 * or a count that is not a multiple of 8 would run far past both
 * buffers.  Trailing (bytes & 7) bytes are not copied.
 *
 * Clobbers %eax.  %esi, %edi, %ecx and %ebx are saved and restored.
 */
	.p2align 4,0x90
docopy4:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	andl	$-8,%ecx		/* whole 8-byte chunks only */
	jz	2f			/* nothing to copy */

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	addl	$8,%esi
	prefetcht0 64(%esi)
	subl	$8,%ecx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	addl	$8,%edi
	testl	%ecx,%ecx		/* addl above overwrote subl's flags */
	jnz	1b
2:
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
165
/*
 * docopy5(src, dst, bytes)
 *
 * Copy through the eight MMX registers with ordinary 'movq' loads and
 * stores, 64 bytes per loop iteration.  A prefetchnta is issued 128
 * bytes ahead of the chunk being copied (prefetches do not fault, so
 * running the hint past the end of the source is harmless).
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address
 *	 8(%esp)  dst    destination address
 *	12(%esp)  bytes  byte count
 *
 * The byte count is rounded down to a multiple of 64 and the loop is
 * skipped when nothing is left.  The loop only terminates when the
 * remaining count reaches exactly zero, so without this a count of 0
 * or a count that is not a multiple of 64 would run far past both
 * buffers.  Trailing (bytes & 63) bytes are not copied.
 *
 * Clobbers %mm0-%mm7 and leaves the FPU in MMX state: no emms is
 * executed here, so the caller must reset the FPU before running x87
 * code (NOTE(review): presumably by calling fpcleanup - confirm
 * against the caller).  %esi, %edi, %ecx and %ebx are saved and
 * restored.
 */
	.p2align 4,0x90
docopy5:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	andl	$-64,%ecx		/* whole 64-byte chunks only */
	jz	2f			/* nothing to copy */

	.p2align 4,0x90
1:
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	movq	16(%esi),%mm2
	movq	24(%esi),%mm3
	movq	32(%esi),%mm4
	movq	40(%esi),%mm5
	movq	48(%esi),%mm6
	movq	56(%esi),%mm7
	prefetchnta 128(%esi)
	subl	$64,%ecx
	addl	$64,%esi
	movq	%mm0,(%edi)
	movq	%mm1,8(%edi)
	movq	%mm2,16(%edi)
	movq	%mm3,24(%edi)
	movq	%mm4,32(%edi)
	movq	%mm5,40(%edi)
	movq	%mm6,48(%edi)
	movq	%mm7,56(%edi)
	addl	$64,%edi
	testl	%ecx,%ecx		/* the addl's above overwrote subl's flags */
	jnz	1b
2:
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
207
/*
 * docopy6(src, dst, bytes)
 *
 * Copy with MMX 'movq' loads and non-temporal 'movntq' stores, 16
 * bytes per loop iteration.  %eax holds the chunk size (16) so the
 * pointer/count updates use register operands.  A prefetcht0 is issued
 * 96 bytes ahead of the chunk being copied (prefetches do not fault,
 * so running the hint past the end of the source is harmless).
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address
 *	 8(%esp)  dst    destination address
 *	12(%esp)  bytes  byte count
 *
 * The byte count is rounded down to a multiple of 16 and the loop is
 * skipped when nothing is left.  The loop only terminates when the
 * remaining count reaches exactly zero, so without this a count of 0
 * or a count that is not a multiple of 16 would run far past both
 * buffers.  Trailing (bytes & 15) bytes are not copied.
 *
 * Clobbers %eax, %mm0 and %mm1 and leaves the FPU in MMX state: no
 * emms is executed here, so the caller must reset the FPU before
 * running x87 code (NOTE(review): presumably by calling fpcleanup -
 * confirm against the caller).  No sfence is issued after the
 * non-temporal stores; a caller that needs them ordered against later
 * stores as seen by other processors must fence itself.
 * %esi, %edi, %ecx and %ebx are saved and restored.
 */
	.p2align 4,0x90
docopy6:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	andl	$-16,%ecx		/* whole 16-byte chunks only */
	jz	2f			/* nothing to copy */
	movl	$16,%eax		/* chunk size */

	.p2align 4,0x90
1:
	prefetcht0 96(%esi)
	subl	%eax,%ecx
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	addl	%eax,%esi
	movntq	%mm0,(%edi)
	movntq	%mm1,8(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx		/* the addl's above overwrote subl's flags */
	jnz	1b
2:
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
238
/*
 * docopy7(src, dst, bytes)
 *
 * Copy through %xmm0-%xmm7 with aligned SSE2 'movdqa' loads and
 * non-temporal 'movntdq' stores, 128 bytes per loop iteration.  %eax
 * holds the chunk size (128) so the pointer/count updates use register
 * operands.  No prefetch is issued.
 *
 * Stack arguments on entry (before the register saves below):
 *	 4(%esp)  src    source address, must be 16-byte aligned
 *	 8(%esp)  dst    destination address, must be 16-byte aligned
 *	12(%esp)  bytes  byte count
 *
 * movdqa and movntdq both fault on a memory operand that is not
 * 16-byte aligned, hence the alignment requirement on both buffers.
 *
 * The byte count is rounded down to a multiple of 128 and the loop is
 * skipped when nothing is left.  The loop only terminates when the
 * remaining count reaches exactly zero, so without this a count of 0
 * or a count that is not a multiple of 128 would run far past both
 * buffers.  Trailing (bytes & 127) bytes are not copied.
 *
 * Clobbers %eax and %xmm0-%xmm7.  No sfence is issued after the
 * non-temporal stores; a caller that needs them ordered against later
 * stores as seen by other processors must fence itself.
 * %esi, %edi, %ecx and %ebx are saved and restored.
 */
	.p2align 4,0x90
docopy7:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	andl	$-128,%ecx		/* whole 128-byte chunks only */
	jz	2f			/* nothing to copy */
	movl	$128,%eax		/* chunk size */

	.p2align 4,0x90
1:
	movdqa	(%esi),%xmm0
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	subl	%eax,%ecx
	addl	%eax,%esi
	movntdq	%xmm0,(%edi)
	movntdq	%xmm1,16(%edi)
	movntdq	%xmm2,32(%edi)
	movntdq	%xmm3,48(%edi)
	movntdq	%xmm4,64(%edi)
	movntdq	%xmm5,80(%edi)
	movntdq	%xmm6,96(%edi)
	movntdq	%xmm7,112(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx		/* the addl's above overwrote subl's flags */
	jnz	1b
2:
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
280
/*
 * fpcleanup()
 *
 * Takes no arguments and touches no general-purpose registers.
 * Executes 'fninit', which puts the x87 FPU back into its power-on
 * default state (control, status and tag words reset, all registers
 * marked empty) without first checking for pending FP exceptions.
 * Because the tag word is emptied, this also ends any MMX state left
 * behind by code that used the %mm registers.
 *
 * NOTE(review): presumably meant to be called after the MMX based
 * copy routines in this file - confirm against the caller.
 */
	.p2align 4,0x90
fpcleanup:
	fninit				/* reset x87/MMX state */
	ret
285