kernel - Make pcb_onfault more robust.
sys/platform/pc32/i386/bcopy.s
/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bcopy(source:%esi, target:%edi, count:%ecx)
 *
 *	note: esi, edi, eax, ecx, and edx may be destroyed
 */

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

	.text

	/*
	 * bcopyb() is a 'dumb' byte-granular bcopy.  It is only used by
	 * devices which need to bcopy device-mapped memory which cannot
	 * otherwise handle 16 or 32 bit ops.
	 */
	ALIGN_TEXT
ENTRY(bcopyb)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	jb	1f
	cld				/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi		/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

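	/*
	 * Illustrative C equivalent of bcopyb() above (a sketch for the
	 * reader only, not assembled or compiled into the kernel).  It
	 * mirrors the same overlap test: copy backwards only when the
	 * destination lands inside the source range.
	 *
	 *	void
	 *	bcopyb(const void *src, void *dst, size_t len)
	 *	{
	 *		const char *s = src;
	 *		char *d = dst;
	 *
	 *		if ((size_t)(d - s) < len) {
	 *			s += len;	// overlapping, copy backwards
	 *			d += len;
	 *			while (len--)
	 *				*--d = *--s;
	 *		} else {
	 *			while (len--)	// copy forwards
	 *				*d++ = *s++;
	 *		}
	 *	}
	 */
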
	/*
	 * bcopyi(s, d, len)	(NON OVERLAPPING)
	 *
	 * This is a dumb 32-bit-granular bcopy
	 */
	ALIGN_TEXT
ENTRY(bcopyi)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	shrl	$2,%ecx
	cld
	rep
	movsl
	popl	%edi
	popl	%esi
	ret

	/*
	 * If memcpy/bcopy is called as part of a copyin or copyout, the
	 * on-fault routine is set up to do a 'ret'.  We have to restore
	 * %ebx and return to the copyin/copyout fault handler.
	 */
generic_onfault:
	popl	%ebx
	addl	$4,%esp			/* skip normal return vector */
	ret				/* return to copyin/copyout fault handler */

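	/*
	 * Illustration (inferred from the code in this file, not generated
	 * output): stack layout inside asm_generic_bcopy/asm_generic_memcpy
	 * once the two warning-marked pushes below have been made:
	 *
	 *	 0(%esp)	$generic_onfault (the 'ret'-style on-fault
	 *			vector pops this and jumps here on a fault)
	 *	 4(%esp)	saved %ebx
	 *	 8(%esp)	normal return address into the caller
	 *	12(%esp)	word the copyin/copyout caller arranged below
	 *			its return address (its fault handler)
	 *
	 * On success the copy code drops the on-fault word, pops %ebx and
	 * returns through 8(%esp); on a fault generic_onfault additionally
	 * skips 8(%esp) and returns through 12(%esp).
	 */
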
	/*
	 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
	 *
	 *	Reasonably optimal on all modern machines.
	 */

	SUPERALIGN_TEXT
ENTRY(asm_generic_memcpy)	/* memcpy() entry point use optimal copy */
	pushl	%ebx			/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
	pushl	$generic_onfault	/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
	jmp	2f

	SUPERALIGN_TEXT
ENTRY(asm_generic_bcopy)
	pushl	%ebx			/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
	pushl	$generic_onfault	/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	2f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
	jmp	2f

	SUPERALIGN_TEXT
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%edx
	movl	%eax,(%edi)
	movl	12(%esi),%eax
	movl	%ebx,4(%edi)
	movl	16(%esi),%ebx
	movl	%edx,8(%edi)
	movl	20(%esi),%edx
	movl	%eax,12(%edi)
	movl	24(%esi),%eax
	movl	%ebx,16(%edi)
	movl	28(%esi),%ebx
	movl	%edx,20(%edi)
	movl	%eax,24(%edi)
	addl	$32,%esi
	movl	%ebx,28(%edi)
	addl	$32,%edi
2:
	subl	$32,%ecx
	jae	1b
	addl	$32,%ecx
	jz	3f
	cld
	rep
	movsb
3:
	addl	$4,%esp
	popl	%ebx
	ret

	/*
	 * GENERIC_BCOPY() - BACKWARDS COPY
	 */
	SUPERALIGN_TEXT
10:
	addl	%ecx,%edi
	jmp	12f

	SUPERALIGN_TEXT
11:
	movl	-4(%esi),%eax
	movl	-8(%esi),%ebx
	movl	-12(%esi),%edx
	movl	%eax,-4(%edi)
	movl	-16(%esi),%eax
	movl	%ebx,-8(%edi)
	movl	-20(%esi),%ebx
	movl	%edx,-12(%edi)
	movl	-24(%esi),%edx
	movl	%eax,-16(%edi)
	movl	-28(%esi),%eax
	movl	%ebx,-20(%edi)
	movl	-32(%esi),%ebx
	movl	%edx,-24(%edi)
	movl	%eax,-28(%edi)
	subl	$32,%esi
	movl	%ebx,-32(%edi)
	subl	$32,%edi
12:
	subl	$32,%ecx
	jae	11b
	addl	$32,%ecx
	jz	13f
	decl	%esi
	decl	%edi
	std
	rep
	movsb
	cld
13:
	addl	$4,%esp
	popl	%ebx
	ret

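	/*
	 * Rough C outline of the forwards path above (illustrative only,
	 * not compiled): 32 bytes are moved per loop iteration using eight
	 * 32-bit loads and stores, and the remaining 0-31 bytes are handled
	 * with rep movsb.  The backwards path is the mirror image.
	 *
	 *	static void
	 *	generic_copy_fwd(const char *s, char *d, size_t len)
	 *	{
	 *		while (len >= 32) {		// unrolled dword loop
	 *			for (int i = 0; i < 8; ++i)
	 *				((unsigned int *)d)[i] =
	 *				    ((const unsigned int *)s)[i];
	 *			s += 32;
	 *			d += 32;
	 *			len -= 32;
	 *		}
	 *		while (len--)			// byte tail (rep movsb)
	 *			*d++ = *s++;
	 *	}
	 */
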
	/*
	 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
	 *
	 *	note: esi, edi, eax, ecx, and edx are allowed to be destroyed.
	 *
	 *	In order for the kernel to be able to use the FPU:
	 *
	 *	(1) The kernel may not already be using the fpu.
	 *
	 *	(2) If the fpu is owned by the application, we must save
	 *	    its state.  If the fpu is not owned by the application
	 *	    the application's saved fp state may already exist
	 *	    in TD_SAVEFPU.
	 *
	 *	(3) We cannot allow the kernel to overwrite the application's
	 *	    FPU state with our own, so we make sure the application's
	 *	    FPU state has been saved and then point TD_SAVEFPU at a
	 *	    temporary fpu save area in the globaldata structure.
	 *
	 *	RACES/ALGORITHM:
	 *
	 *	If gd_npxthread is not NULL we must save the application's
	 *	current FP state to the current save area and then NULL
	 *	out gd_npxthread to interlock against new interruptions
	 *	changing the FP state further.
	 *
	 *	If gd_npxthread is NULL the FP unit is in a known 'safe'
	 *	state and may be used once the new save area is installed.
	 *
	 *	race(1): If an interrupt occurs just prior to calling fxsave
	 *	all that happens is that fxsave gets a npxdna trap, restores
	 *	the app's environment, and immediately traps, restores,
	 *	and saves it again.
	 *
	 *	race(2): No interrupt can safely occur after we NULL-out
	 *	npxthread until we fnclex, because the kernel assumes that
	 *	the FP unit is in a safe state when npxthread is NULL.  It's
	 *	more convenient to use a cli sequence here (it is not
	 *	considered to be in the critical path), but a critical
	 *	section would also work.
	 *
	 *	NOTE ON FNINIT vs FNCLEX - Making the FP unit safe here is
	 *	the goal.  It should be sufficient to just call FNCLEX rather
	 *	than having to FNINIT the entire unit.
	 *
	 *	race(3): The FP unit is in a known state (because npxthread
	 *	was either previously NULL or we saved and init'd and made
	 *	it NULL).  This is true even if we are preempted and the
	 *	preempting thread uses the FP unit, because it will be
	 *	fninit'd again on return.  ANY STATE WE SAVE TO THE FPU MAY
	 *	BE DESTROYED BY PREEMPTION WHILE NPXTHREAD IS NULL!  However,
	 *	an interrupt occurring in between clts and the setting of
	 *	gd_npxthread may set the TS bit again and cause the next
	 *	npxdna() to panic when it sees a non-NULL gd_npxthread.
	 *
	 *	We can safely set TD_SAVEFPU to point to a new uninitialized
	 *	save area and then set GD_NPXTHREAD to non-NULL.  If an
	 *	interrupt occurs after we set GD_NPXTHREAD, all that happens
	 *	is that the safe FP state gets saved and restored.  We do not
	 *	need to fnclex again.
	 *
	 *	We can safely clts after setting up the new save-area, before
	 *	installing gd_npxthread, even if we get preempted just after
	 *	calling clts.  This is because the FP unit will be in a safe
	 *	state while gd_npxthread is NULL.  Setting gd_npxthread will
	 *	simply lock-in that safe-state.  Calling clts saves
	 *	unnecessary trap overhead since we are about to use the FP
	 *	unit anyway and don't need to 'restore' any state prior to
	 *	that first use.
	 *
	 *	MMX+XMM (SSE2): Typical on Athlons, later P4s.  128 bit media insn.
	 *	MMX: Typical on XPs and P3s.  64 bit media insn.
	 *
	 *	WARNING! copyin/copyout expects a push %ebx/onfault frame ONLY,
	 *	we can't mess with the stack any more than that.
	 */

#define MMX_SAVE_BLOCK(missfunc)					\
	cmpl	$2048,%ecx ;						\
	jb	missfunc ;						\
	movl	MYCPU,%eax ;		/* EAX = MYCPU */		\
	btsl	$1,GD_FPU_LOCK(%eax) ;					\
	jc	missfunc ;						\
	pushl	%ebx ;							\
	pushl	%ecx ;							\
	movl	GD_CURTHREAD(%eax),%edx ; /* EDX = CURTHREAD */		\
	movl	TD_SAVEFPU(%edx),%ebx ;	/* save app save area */	\
	incl	TD_CRITCOUNT(%edx) ;					\
	cmpl	$0,GD_NPXTHREAD(%eax) ;					\
	je	100f ;							\
	fxsave	0(%ebx) ;		/* race(1) */			\
	movl	$0,GD_NPXTHREAD(%eax) ;	/* interlock intr */		\
	clts ;								\
	fnclex ;			/* race(2) */			\
100: ;									\
	leal	GD_SAVEFPU(%eax),%ecx ;					\
	movl	%ecx,TD_SAVEFPU(%edx) ;					\
	orl	$TDF_KERNELFP,TD_FLAGS(%edx) ;				\
	clts ;								\
	movl	%edx,GD_NPXTHREAD(%eax) ; /* race(3) */			\
	decl	TD_CRITCOUNT(%edx) ;	/* crit_exit() */		\
	cmpl	$0,GD_REQFLAGS(%eax) ;					\
	je	101f ;							\
	testl	$-1,TD_CRITCOUNT(%edx) ;				\
	jne	101f ;							\
	call	splz_check ;						\
	/* note: eax,ecx,edx destroyed */				\
101: ;									\
	movl	(%esp),%ecx ;						\
	movl	$mmx_onfault,(%esp)

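	/*
	 * Pseudocode summary of MMX_SAVE_BLOCK above (C-style, field names
	 * follow the assym.s offsets used in the macro; illustrative only,
	 * not compiled):
	 *
	 *	if (count < 2048 || test_and_set(gd->gd_fpu_lock))
	 *		return via missfunc;		// not worth it / busy
	 *	push %ebx, push %ecx;			// copyin/out frame
	 *	td = gd->gd_curthread;
	 *	saved_area = td->td_savefpu;		// kept in %ebx
	 *	++td->td_critcount;
	 *	if (gd->gd_npxthread != NULL) {
	 *		fxsave(saved_area);		// race(1)
	 *		gd->gd_npxthread = NULL;	// interlock intr
	 *		clts(); fnclex();		// race(2)
	 *	}
	 *	td->td_savefpu = &gd->gd_savefpu;	// temporary area
	 *	td->td_flags |= TDF_KERNELFP;
	 *	clts();
	 *	gd->gd_npxthread = td;			// race(3)
	 *	--td->td_critcount;
	 *	if (gd->gd_reqflags && td->td_critcount == 0)
	 *		splz_check();
	 *	reload count; replace it on the stack with $mmx_onfault;
	 */
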
	/*
	 * When restoring the application's FP state we must first clear
	 * npxthread to prevent further saves, then restore the pointer
	 * to the app's save area.  We do not have to (and should not)
	 * restore the app's FP state now.  Note that we do not have to
	 * call fnclex because our use of the FP guarantees that it is in
	 * a 'safe' state (at least for kernel use).
	 *
	 * NOTE: it is not usually safe to mess with CR0 outside of a
	 * critical section, because TS may get set by a preemptive
	 * interrupt.  However, we *can* race a load/set-ts/store against
	 * an interrupt doing the same thing.
	 *
	 * WARNING! A virtual kernel depends on CR0_TS remaining set after
	 * we use the FP unit, if it asked for it to be set.
	 */

#define MMX_RESTORE_BLOCK			\
	addl	$4,%esp ;			\
	MMX_RESTORE_BLOCK2

#define MMX_RESTORE_BLOCK2			\
	movl	MYCPU,%ecx ;			\
	movl	GD_CURTHREAD(%ecx),%edx ;	\
	movl	$0,GD_NPXTHREAD(%ecx) ;		\
	andl	$~TDF_KERNELFP,TD_FLAGS(%edx) ;	\
	movl	%ebx,TD_SAVEFPU(%edx) ;		\
	smsw	%ax ;				\
	popl	%ebx ;				\
	orb	$CR0_TS,%al ;			\
	lmsw	%ax ;				\
	movl	$0,GD_FPU_LOCK(%ecx)

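	/*
	 * Pseudocode summary of MMX_RESTORE_BLOCK2 above (illustrative
	 * only; %ebx still holds the application's save area pointer):
	 *
	 *	gd = mycpu; td = gd->gd_curthread;
	 *	gd->gd_npxthread = NULL;	// stop further saves
	 *	td->td_flags &= ~TDF_KERNELFP;
	 *	td->td_savefpu = saved_area;	// restore app pointer
	 *	set CR0_TS (smsw/lmsw);		// next FP use traps and
	 *					// lazily restores state
	 *	gd->gd_fpu_lock = 0;
	 *
	 * MMX_RESTORE_BLOCK additionally pops the on-fault vector first.
	 */
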
	/*
	 * xmm/mmx_onfault routine.  Restore the fpu state, skip the normal
	 * return vector, and return to the caller's on-fault routine
	 * (which was pushed on the caller's stack just before it called us).
	 */
	ALIGN_TEXT
mmx_onfault:
	MMX_RESTORE_BLOCK2
	addl	$4,%esp
	ret

	/*
	 * MMX entry points - only support 64 bit media instructions
	 */
	SUPERALIGN_TEXT
ENTRY(asm_mmx_memcpy)		/* memcpy() entry point use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)
	jmp	5f

	SUPERALIGN_TEXT
ENTRY(asm_mmx_bcopy)
	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	5f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
	jmp	5f

	/*
	 * XMM entry points - support 128 bit media instructions
	 */
	SUPERALIGN_TEXT
ENTRY(asm_xmm_memcpy)		/* memcpy() entry point use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)
	jmp	1f

	SUPERALIGN_TEXT
ENTRY(asm_xmm_bcopy)
	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	jb	1f
	addl	%ecx,%esi
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	jb	10f
	subl	%ecx,%esi
1:
	movl	%esi,%eax		/* skip xmm if the data is not aligned */
	andl	$15,%eax
	jnz	5f
	movl	%edi,%eax
	andl	$15,%eax
	jz	3f
	jmp	5f

	SUPERALIGN_TEXT
2:
	movdqa	(%esi),%xmm0
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	/*prefetchnta 128(%esi) 3dNOW */
	addl	$128,%esi

	/*
	 * movdqa or movntdq can be used.  movntdq performs weakly-ordered
	 * non-temporal stores that bypass the cache, which reduces cache
	 * pollution on large copies but requires an sfence to order the
	 * stores against later writes.
	 */
	movdqa	%xmm0,(%edi)
	movdqa	%xmm1,16(%edi)
	movdqa	%xmm2,32(%edi)
	movdqa	%xmm3,48(%edi)
	movdqa	%xmm4,64(%edi)
	movdqa	%xmm5,80(%edi)
	movdqa	%xmm6,96(%edi)
	movdqa	%xmm7,112(%edi)
	addl	$128,%edi
3:
	subl	$128,%ecx
	jae	2b
	addl	$128,%ecx
	jz	6f
	jmp	5f
	SUPERALIGN_TEXT
4:
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	movq	16(%esi),%mm2
	movq	24(%esi),%mm3
	movq	32(%esi),%mm4
	movq	40(%esi),%mm5
	movq	48(%esi),%mm6
	movq	56(%esi),%mm7
	/*prefetchnta 128(%esi) 3dNOW */
	addl	$64,%esi
	movq	%mm0,(%edi)
	movq	%mm1,8(%edi)
	movq	%mm2,16(%edi)
	movq	%mm3,24(%edi)
	movq	%mm4,32(%edi)
	movq	%mm5,40(%edi)
	movq	%mm6,48(%edi)
	movq	%mm7,56(%edi)
	addl	$64,%edi
5:
	subl	$64,%ecx
	jae	4b
	addl	$64,%ecx
	jz	6f
	cld
	rep
	movsb
6:
	MMX_RESTORE_BLOCK
	ret

	/*
	 * MMX/XMM BCOPY() - BACKWARDS COPY
	 *
	 * Don't bother using xmm optimizations, just stick with mmx.
	 */
	SUPERALIGN_TEXT
10:
	addl	%ecx,%edi
	jmp	12f

	SUPERALIGN_TEXT
11:
	movq	-64(%esi),%mm0
	movq	-56(%esi),%mm1
	movq	-48(%esi),%mm2
	movq	-40(%esi),%mm3
	movq	-32(%esi),%mm4
	movq	-24(%esi),%mm5
	movq	-16(%esi),%mm6
	movq	-8(%esi),%mm7
	/*prefetchnta -128(%esi)*/
	subl	$64,%esi
	movq	%mm0,-64(%edi)
	movq	%mm1,-56(%edi)
	movq	%mm2,-48(%edi)
	movq	%mm3,-40(%edi)
	movq	%mm4,-32(%edi)
	movq	%mm5,-24(%edi)
	movq	%mm6,-16(%edi)
	movq	%mm7,-8(%edi)
	subl	$64,%edi
12:
	subl	$64,%ecx
	jae	11b
	addl	$64,%ecx
	jz	13f
	decl	%esi
	decl	%edi
	std
	rep
	movsb
	cld
13:
	MMX_RESTORE_BLOCK
	ret