/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/pc32/i386/bcopy.s,v 1.9 2007/01/09 23:34:02 dillon Exp $
 */
/*
 * bcopy(source:%esi, target:%edi, count:%ecx)
 *
 *	note: esi, edi, eax, ecx, and edx may be destroyed
 */

#include "use_npx.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

	.text

	/*
	 * bcopyb() is a 'dumb' byte-granular bcopy.  It is only used by
	 * devices which need to bcopy device-mapped memory which cannot
	 * otherwise handle 16 or 32 bit ops.
	 */
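	/*
	 * The overlap test below relies on unsigned arithmetic.  With
	 * %eax = dst - src, a single unsigned compare against the count
	 * catches the only case that needs a backwards copy
	 * (src < dst < src + count).  Roughly, in C (dst/src/count are
	 * the bcopyb arguments):
	 *
	 *	if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)count)
	 *		copy backwards;
	 *	else
	 *		copy forwards;
	 */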
	ALIGN_TEXT
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	jb	1f
	cld				/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi		/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

	/*
	 * bcopyi(s, d, len)	(NON OVERLAPPING)
	 *
	 * This is a dumb 32-bit-granular bcopy
	 */
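	/*
	 * Note: the low two bits of len are simply dropped by the
	 * shrl $2 below, so any trailing 1-3 bytes are not copied;
	 * callers presumably pass a dword-multiple length.
	 */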
	ALIGN_TEXT
ENTRY(bcopyi)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	cld
	rep
	movsl
	popl	%edi
	popl	%esi
	ret

	/*
	 * If memcpy/bcopy is called as part of a copyin or copyout, the
	 * on-fault routine is set up to do a 'ret'.  We have to restore
	 * %ebx and return to the copyin/copyout fault handler.
	 */
generic_onfault:
	popl	%ebx
	addl	$4,%esp			/* skip normal return vector */
	ret				/* return to copyin/copyout fault handler */

	/*
	 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
	 *
	 * Reasonably optimal on all modern machines.
	 */

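	/*
	 * The direction check in asm_generic_bcopy below is roughly the
	 * following C (illustration only):
	 *
	 *	if (dst < src || dst >= src + count)
	 *		copy forwards (32 bytes per loop, tail via movsb);
	 *	else
	 *		copy backwards (regions overlap and src < dst);
	 */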
	SUPERALIGN_TEXT
ENTRY(asm_generic_memcpy)	/* memcpy() entry point use optimal copy */
	pushl	%ebx
	pushl	$generic_onfault
	jmp	2f

	SUPERALIGN_TEXT
ENTRY(asm_generic_bcopy)
	pushl	%ebx
	pushl	$generic_onfault
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	2f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
	jmp	2f

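	/*
	 * Forward copy: 32 bytes (eight dwords) per iteration with
	 * interleaved loads and stores; the remaining 0-31 bytes are
	 * copied with rep movsb once the loop falls through.
	 */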
	SUPERALIGN_TEXT
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%edx
	movl	%eax,(%edi)
	movl	12(%esi),%eax
	movl	%ebx,4(%edi)
	movl	16(%esi),%ebx
	movl	%edx,8(%edi)
	movl	20(%esi),%edx
	movl	%eax,12(%edi)
	movl	24(%esi),%eax
	movl	%ebx,16(%edi)
	movl	28(%esi),%ebx
	movl	%edx,20(%edi)
	movl	%eax,24(%edi)
	addl	$32,%esi
	movl	%ebx,28(%edi)
	addl	$32,%edi
2:
	subl	$32,%ecx
	jae	1b
	addl	$32,%ecx
	jz	3f
	cld
	rep
	movsb
3:
	addl	$4,%esp
	popl	%ebx
	ret

	/*
	 * GENERIC_BCOPY() - BACKWARDS COPY
	 */
	SUPERALIGN_TEXT
10:
	addl	%ecx,%edi
	jmp	12f

	SUPERALIGN_TEXT
11:
	movl	-4(%esi),%eax
	movl	-8(%esi),%ebx
	movl	-12(%esi),%edx
	movl	%eax,-4(%edi)
	movl	-16(%esi),%eax
	movl	%ebx,-8(%edi)
	movl	-20(%esi),%ebx
	movl	%edx,-12(%edi)
	movl	-24(%esi),%edx
	movl	%eax,-16(%edi)
	movl	-28(%esi),%eax
	movl	%ebx,-20(%edi)
	movl	-32(%esi),%ebx
	movl	%edx,-24(%edi)
	movl	%eax,-28(%edi)
	subl	$32,%esi
	movl	%ebx,-32(%edi)
	subl	$32,%edi
12:
	subl	$32,%ecx
	jae	11b
	addl	$32,%ecx
	jz	13f
	decl	%esi
	decl	%edi
	std
	rep
	movsb
	cld
13:
	addl	$4,%esp
	popl	%ebx
	ret

	/*
	 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
	 *
	 * note: esi, edi, eax, ecx, and edx are allowed to be destroyed.
	 *
	 * In order for the kernel to be able to use the FPU:
	 *
	 *	(1) The kernel may not already be using the fpu.
	 *
	 *	(2) If the fpu is owned by the application, we must save
	 *	    its state.  If the fpu is not owned by the application
	 *	    the application's saved fp state may already exist
	 *	    in TD_SAVEFPU.
	 *
	 *	(3) We cannot allow the kernel to overwrite the application's
	 *	    FPU state with our own, so we make sure the application's
	 *	    FPU state has been saved and then point TD_SAVEFPU at a
	 *	    temporary fpu save area in the globaldata structure.
	 *
	 * RACES/ALGORITHM:
	 *
	 *	If gd_npxthread is not NULL we must save the application's
	 *	current FP state to the current save area and then NULL
	 *	out gd_npxthread to interlock against new interruptions
	 *	changing the FP state further.
	 *
	 *	If gd_npxthread is NULL the FP unit is in a known 'safe'
	 *	state and may be used once the new save area is installed.
	 *
	 *	race(1): If an interrupt occurs just prior to calling fxsave
	 *	all that happens is that fxsave gets a npxdna trap, restores
	 *	the app's environment, and immediately traps, restores,
	 *	and saves it again.
	 *
	 *	race(2): No interrupt can safely occur after we NULL-out
	 *	npxthread until we fnclex, because the kernel assumes that
	 *	the FP unit is in a safe state when npxthread is NULL.  It's
	 *	more convenient to use a cli sequence here (it is not
	 *	considered to be in the critical path), but a critical
	 *	section would also work.
	 *
	 *	NOTE ON FNINIT vs FNCLEX - Making the FP unit safe here is
	 *	the goal.  It should be sufficient to just call FNCLEX rather
	 *	than having to FNINIT the entire unit.
	 *
	 *	race(3): The FP unit is in a known state (because npxthread
	 *	was either previously NULL or we saved and init'd and made
	 *	it NULL).  This is true even if we are preempted and the
	 *	preempting thread uses the FP unit, because it will be
	 *	fninit'd again on return.  ANY STATE WE SAVE TO THE FPU MAY
	 *	BE DESTROYED BY PREEMPTION WHILE NPXTHREAD IS NULL!  However,
	 *	an interrupt occurring in between clts and the setting of
	 *	gd_npxthread may set the TS bit again and cause the next
	 *	npxdna() to panic when it sees a non-NULL gd_npxthread.
	 *
	 *	We can safely set TD_SAVEFPU to point to a new uninitialized
	 *	save area and then set GD_NPXTHREAD to non-NULL.  If an
	 *	interrupt occurs after we set GD_NPXTHREAD, all that happens
	 *	is that the safe FP state gets saved and restored.  We do not
	 *	need to fnclex again.
	 *
	 *	We can safely clts after setting up the new save-area, before
	 *	installing gd_npxthread, even if we get preempted just after
	 *	calling clts.  This is because the FP unit will be in a safe
	 *	state while gd_npxthread is NULL.  Setting gd_npxthread will
	 *	simply lock-in that safe-state.  Calling clts saves
	 *	unnecessary trap overhead since we are about to use the FP
	 *	unit anyway and don't need to 'restore' any state prior to
	 *	that first use.
	 *
	 * MMX+XMM (SSE2): Typical on Athlons, later P4s.  128 bit media insn.
	 * MMX: Typical on XPs and P3s.  64 bit media insn.
	 */

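	/*
	 * Rough C-level outline of MMX_SAVE_BLOCK below (illustration
	 * only; gd/td/try_lock are shorthand for the globaldata/thread
	 * fields and atomic bit-set used below, and the quick-yield
	 * check in the crit_exit path is folded into crit_exit()):
	 *
	 *	if (count < 2048 || !try_lock(&gd->gd_fpu_lock))
	 *		goto missfunc;			// integer copy instead
	 *	crit_enter();
	 *	old_savefpu = td->td_savefpu;		// kept in %ebx
	 *	if (gd->gd_npxthread != NULL) {
	 *		fxsave(old_savefpu);		// race(1)
	 *		gd->gd_npxthread = NULL;	// interlock interrupts
	 *		clts();
	 *		fnclex();			// race(2)
	 *	}
	 *	td->td_savefpu = &gd->gd_savefpu;	// temporary save area
	 *	clts();
	 *	gd->gd_npxthread = td;			// race(3)
	 *	crit_exit();
	 *	reload the count and install mmx_onfault as the on-fault
	 *	vector on the stack;
	 */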
#define MMX_SAVE_BLOCK(missfunc)					\
	cmpl	$2048,%ecx ;						\
	jb	missfunc ;						\
	movl	MYCPU,%eax ;			/* EAX = MYCPU */	\
	btsl	$1,GD_FPU_LOCK(%eax) ;					\
	jc	missfunc ;						\
	pushl	%ebx ;							\
	pushl	%ecx ;							\
	movl	GD_CURTHREAD(%eax),%edx ;	/* EDX = CURTHREAD */	\
	movl	TD_SAVEFPU(%edx),%ebx ;		/* save app save area */\
	addl	$TDPRI_CRIT,TD_PRI(%edx) ;				\
	cmpl	$0,GD_NPXTHREAD(%eax) ;					\
	je	100f ;							\
	fxsave	0(%ebx) ;			/* race(1) */		\
	movl	$0,GD_NPXTHREAD(%eax) ;		/* interlock intr */	\
	clts ;								\
	fnclex ;				/* race(2) */		\
100: ;									\
	leal	GD_SAVEFPU(%eax),%ecx ;					\
	movl	%ecx,TD_SAVEFPU(%edx) ;					\
	clts ;								\
	movl	%edx,GD_NPXTHREAD(%eax) ;	/* race(3) */		\
	subl	$TDPRI_CRIT,TD_PRI(%edx) ;	/* crit_exit() */	\
	cmpl	$0,GD_REQFLAGS(%eax) ;					\
	je	101f ;							\
	cmpl	$TDPRI_CRIT,TD_PRI(%edx) ;				\
	jge	101f ;							\
	call	lwkt_yield_quick ;					\
	/* note: eax,ecx,edx destroyed */				\
101: ;									\
	movl	(%esp),%ecx ;						\
	movl	$mmx_onfault,(%esp) ;					\

	/*
	 * When restoring the application's FP state we must first clear
	 * npxthread to prevent further saves, then restore the pointer
	 * to the app's save area.  We do not have to (and should not)
	 * restore the app's FP state now.  Note that we do not have to
	 * call fnclex because our use of the FP guarantees that it is in
	 * a 'safe' state (at least for kernel use).
	 *
	 * NOTE: it is not usually safe to mess with CR0 outside of a
	 * critical section, because TS may get set by a preemptive
	 * interrupt.  However, we *can* race a load/set-ts/store against
	 * an interrupt doing the same thing.
	 *
	 * WARNING! A Virtual kernel depends on CR0_TS remaining set after
	 * we use the FP unit if it asked it to be set.
	 */

#define MMX_RESTORE_BLOCK						\
	addl	$4,%esp ;						\
	MMX_RESTORE_BLOCK2

#define MMX_RESTORE_BLOCK2						\
	movl	MYCPU,%ecx ;						\
	movl	GD_CURTHREAD(%ecx),%edx ;				\
	movl	$0,GD_NPXTHREAD(%ecx) ;					\
	movl	%ebx,TD_SAVEFPU(%edx) ;					\
	smsw	%ax ;							\
	popl	%ebx ;							\
	orb	$CR0_TS,%al ;						\
	lmsw	%ax ;							\
	movl	$0,GD_FPU_LOCK(%ecx)

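	/*
	 * Rough C-level outline of MMX_RESTORE_BLOCK2 above (illustration
	 * only; %ebx still holds the app save-area pointer stashed by
	 * MMX_SAVE_BLOCK):
	 *
	 *	gd->gd_npxthread = NULL;	// stop further saves
	 *	td->td_savefpu = old_savefpu;	// restore app save area ptr
	 *	cr0 |= CR0_TS;			// smsw/lmsw: trap on next FP use
	 *	gd->gd_fpu_lock = 0;
	 */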
	/*
	 * xmm/mmx_onfault routine.  Restore the fpu state, skip the normal
	 * return vector, and return to the caller's on-fault routine
	 * (which was pushed on the caller's stack just before it called us).
	 */
	ALIGN_TEXT
mmx_onfault:
	MMX_RESTORE_BLOCK2
	addl	$4,%esp
	ret

	/*
	 * MMX entry points - only support 64 bit media instructions
	 */
	SUPERALIGN_TEXT
ENTRY(asm_mmx_memcpy)		/* memcpy() entry point use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)
	jmp	5f

	SUPERALIGN_TEXT
ENTRY(asm_mmx_bcopy)
	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	5f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
	jmp	5f

	/*
	 * XMM entry points - support 128 bit media instructions
	 */
	SUPERALIGN_TEXT
ENTRY(asm_xmm_memcpy)		/* memcpy() entry point use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)
	jmp	1f

	SUPERALIGN_TEXT
ENTRY(asm_xmm_bcopy)
	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	1f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
1:
	movl	%esi,%eax		/* skip xmm if the data is not aligned */
	andl	$15,%eax
	jnz	5f
	movl	%edi,%eax
	andl	$15,%eax
	jz	3f
	jmp	5f

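	/*
	 * 128-byte XMM loop.  movdqa requires 16-byte alignment, which is
	 * why both source and destination were checked above; unaligned
	 * copies fall through to the 64-byte MMX loop at label 4/5 below.
	 */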
	SUPERALIGN_TEXT
2:
	movdqa	(%esi),%xmm0
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	/*prefetchnta 128(%esi) 3dNOW */
	addl	$128,%esi

	/*
	 * movdqa or movntdq can be used.
	 */
	movdqa	%xmm0,(%edi)
	movdqa	%xmm1,16(%edi)
	movdqa	%xmm2,32(%edi)
	movdqa	%xmm3,48(%edi)
	movdqa	%xmm4,64(%edi)
	movdqa	%xmm5,80(%edi)
	movdqa	%xmm6,96(%edi)
	movdqa	%xmm7,112(%edi)
	addl	$128,%edi
3:
	subl	$128,%ecx
	jae	2b
	addl	$128,%ecx
	jz	6f
	jmp	5f
	SUPERALIGN_TEXT
4:
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	movq	16(%esi),%mm2
	movq	24(%esi),%mm3
	movq	32(%esi),%mm4
	movq	40(%esi),%mm5
	movq	48(%esi),%mm6
	movq	56(%esi),%mm7
	/*prefetchnta 128(%esi) 3dNOW */
	addl	$64,%esi
	movq	%mm0,(%edi)
	movq	%mm1,8(%edi)
	movq	%mm2,16(%edi)
	movq	%mm3,24(%edi)
	movq	%mm4,32(%edi)
	movq	%mm5,40(%edi)
	movq	%mm6,48(%edi)
	movq	%mm7,56(%edi)
	addl	$64,%edi
5:
	subl	$64,%ecx
	jae	4b
	addl	$64,%ecx
	jz	6f
	cld
	rep
	movsb
6:
	MMX_RESTORE_BLOCK
	ret

	/*
	 * MMX BCOPY() - BACKWARDS COPY
	 *
	 * Don't bother using xmm optimizations, just stick with mmx.
	 */
	SUPERALIGN_TEXT
10:
	addl	%ecx,%edi
	jmp	12f

	SUPERALIGN_TEXT
11:
	movq	-64(%esi),%mm0
	movq	-56(%esi),%mm1
	movq	-48(%esi),%mm2
	movq	-40(%esi),%mm3
	movq	-32(%esi),%mm4
	movq	-24(%esi),%mm5
	movq	-16(%esi),%mm6
	movq	-8(%esi),%mm7
	/*prefetchnta -128(%esi)*/
	subl	$64,%esi
	movq	%mm0,-64(%edi)
	movq	%mm1,-56(%edi)
	movq	%mm2,-48(%edi)
	movq	%mm3,-40(%edi)
	movq	%mm4,-32(%edi)
	movq	%mm5,-24(%edi)
	movq	%mm6,-16(%edi)
	movq	%mm7,-8(%edi)
	subl	$64,%edi
12:
	subl	$64,%ecx
	jae	11b
	addl	$64,%ecx
	jz	13f
	decl	%esi
	decl	%edi
	std
	rep
	movsb
	cld
13:
	MMX_RESTORE_BLOCK
	ret