2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * bcopy(source:%esi, target:%edi, count:%ecx)
37 * note: esi, edi, eax, ecx, and edx may be destroyed
40 #include <machine/asmacros.h>
41 #include <machine/cputypes.h>
42 #include <machine/pmap.h>
43 #include <machine/specialreg.h>
50 * bcopyb() is a 'dumb' byte-granular bcopy. It is only used by
51 * devices which need to bcopy device-mapped memory which cannot
52 * otherwise handle 16 or 32 bit ops.
63 cmpl %ecx,%eax /* overlapping && src < dst? */
65 cld /* nope, copy forwards */
74 addl %ecx,%edi /* copy backwards. */
87 * bcopyi(s, d, len) (NON OVERLAPPING)
89 * This is a dumb 32-bit-granular bcopy
107 * If memcpy/bcopy is called as part of a copyin or copyout, the
108 * on-fault routine is set up to do a 'ret'. We have to restore
109 * %ebx and return to the copyin/copyout fault handler.
113 addl $4,%esp /* skip normal return vector */
114 ret /* return to copyin/copyout fault handler */
117 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
119 * Reasonably optimal on all modern machines.
123 ENTRY(asm_generic_memcpy) /* memcpy() entry point use optimal copy */
124 pushl %ebx /* WARNING COPYIN/OUT EXPECTS THIS FRAME */
125 pushl $generic_onfault/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
129 ENTRY(asm_generic_bcopy)
130 pushl %ebx /* WARNING COPYIN/OUT EXPECTS THIS FRAME */
131 pushl $generic_onfault/* WARNING COPYIN/OUT EXPECTS THIS FRAME */
132 cmpl %esi,%edi /* if (edi < esi) fwd copy ok */
135 cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */
174 * GENERIC_BCOPY() - BACKWARDS COPY
218 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
220 * note: esi, edi, eax, ecx, and edx are allowed to be destroyed.
222 * In order for the kernel to be able to use the FPU:
224 * (1) The kernel may not already be using the fpu.
226 * (2) If the fpu is owned by the application, we must save
227 * its state. If the fpu is not owned by the application
228 * the application's saved fp state may already exist
231 * (3) We cannot allow the kernel to overwrite the application's
232 * FPU state with our own, so we make sure the application's
233 * FPU state has been saved and then point TD_SAVEFPU at a
234 * temporary fpu save area in the globaldata structure.
238 * If gd_npxthread is not NULL we must save the application's
239 * current FP state to the current save area and then NULL
240 * out gd_npxthread to interlock against new interruptions
241 * changing the FP state further.
243 * If gd_npxthread is NULL the FP unit is in a known 'safe'
244 * state and may be used once the new save area is installed.
246 * race(1): If an interrupt occurs just prior to calling fxsave
247 * all that happens is that fxsave gets a npxdna trap, restores
248 * the app's environment, and immediately traps, restores,
249 * and saves it again.
251 * race(2): No interrupt can safely occur after we NULL-out
252 * npxthread until we fnclex, because the kernel assumes that
253 * the FP unit is in a safe state when npxthread is NULL. It's
254 * more convenient to use a cli sequence here (it is not
255 * considered to be in the critical path), but a critical
256 * section would also work.
258 * NOTE ON FNINIT vs FNCLEX - Making the FP unit safe here is
259 * the goal. It should be sufficient to just call FNCLEX rather
260 * than having to FNINIT the entire unit.
262 * race(3): The FP unit is in a known state (because npxthread
263 * was either previously NULL or we saved and init'd and made
264 * it NULL). This is true even if we are preempted and the
265 * preempting thread uses the FP unit, because it will be
266 * fninit's again on return. ANY STATE WE SAVE TO THE FPU MAY
267 * BE DESTROYED BY PREEMPTION WHILE NPXTHREAD IS NULL! However,
268 * an interrupt occurring in between clts and the setting of
269 * gd_npxthread may set the TS bit again and cause the next
270 * npxdna() to panic when it sees a non-NULL gd_npxthread.
272 * We can safely set TD_SAVEFPU to point to a new uninitialized
273 * save area and then set GD_NPXTHREAD to non-NULL. If an
274 * interrupt occurs after we set GD_NPXTHREAD, all that happens
275 * is that the safe FP state gets saved and restored. We do not
276 * need to clex again.
278 * We can safely clts after setting up the new save-area, before
279 * installing gd_npxthread, even if we get preempted just after
280 * calling clts. This is because the FP unit will be in a safe
281 * state while gd_npxthread is NULL. Setting gd_npxthread will
282 * simply lock-in that safe-state. Calling clts saves
283 * unnecessary trap overhead since we are about to use the FP
284 * unit anyway and don't need to 'restore' any state prior to
287 * MMX+XMM (SSE2): Typical on Athlons, later P4s. 128 bit media insn.
288 * MMX: Typical on XPs and P3s. 64 bit media insn.
290 * WARNING! copyin/copyout expects a push %ebx/onfault frame ONLY,
291 * we can't mess with the stack any more than that.
294 #define MMX_SAVE_BLOCK(missfunc) \
297 movl MYCPU,%eax ; /* EAX = MYCPU */ \
298 btsl $1,GD_FPU_LOCK(%eax) ; \
302 movl GD_CURTHREAD(%eax),%edx ; /* EDX = CURTHREAD */ \
303 movl TD_SAVEFPU(%edx),%ebx ; /* save app save area */\
304 incl TD_CRITCOUNT(%edx) ; \
305 cmpl $0,GD_NPXTHREAD(%eax) ; \
307 fxsave 0(%ebx) ; /* race(1) */ \
308 movl $0,GD_NPXTHREAD(%eax) ; /* interlock intr */ \
310 fnclex ; /* race(2) */ \
312 leal GD_SAVEFPU(%eax),%ecx ; \
313 movl %ecx,TD_SAVEFPU(%edx) ; \
314 orl $TDF_KERNELFP,TD_FLAGS(%edx) ; \
316 movl %edx,GD_NPXTHREAD(%eax) ; /* race(3) */ \
317 decl TD_CRITCOUNT(%edx) ; /* crit_exit() */ \
318 cmpl $0,GD_REQFLAGS(%eax) ; \
320 testl $-1,TD_CRITCOUNT(%edx) ; \
323 /* note: eax,ecx,edx destroyed */ \
326 movl $mmx_onfault,(%esp) ; \
329 * When restoring the application's FP state we must first clear
330 * npxthread to prevent further saves, then restore the pointer
331 * to the app's save area. We do not have to (and should not)
332 * restore the app's FP state now. Note that we do not have to
333 * call fnclex because our use of the FP guarantees that it is in
334 * a 'safe' state (at least for kernel use).
336 * NOTE: it is not usually safe to mess with CR0 outside of a
337 * critical section, because TS may get set by a preemptive
338 * interrupt. However, we *can* race a load/set-ts/store against
339 * an interrupt doing the same thing.
341 * WARNING! A Virtual kernel depends on CR0_TS remaining set after
342 * we use the FP unit if it asked it to be set.
345 #define MMX_RESTORE_BLOCK \
349 #define MMX_RESTORE_BLOCK2 \
351 movl GD_CURTHREAD(%ecx),%edx ; \
352 movl $0,GD_NPXTHREAD(%ecx) ; \
353 andl $~TDF_KERNELFP,TD_FLAGS(%edx) ; \
354 movl %ebx,TD_SAVEFPU(%edx) ; \
359 movl $0,GD_FPU_LOCK(%ecx)
362 * xmm/mmx_onfault routine. Restore the fpu state, skip the normal
363 * return vector, and return to the caller's on-fault routine
364 * (which was pushed on the callers stack just before he called us)
373 * MMX entry points - only support 64 bit media instructions
376 ENTRY(asm_mmx_memcpy) /* memcpy() entry point use optimal copy */
377 MMX_SAVE_BLOCK(asm_generic_memcpy)
382 MMX_SAVE_BLOCK(asm_generic_bcopy)
383 cmpl %esi,%edi /* if (edi < esi) fwd copy ok */
386 cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */
392 * XMM entry points - support 128 bit media instructions
395 ENTRY(asm_xmm_memcpy) /* memcpy() entry point use optimal copy */
396 MMX_SAVE_BLOCK(asm_generic_memcpy)
401 MMX_SAVE_BLOCK(asm_generic_bcopy)
402 cmpl %esi,%edi /* if (edi < esi) fwd copy ok */
405 cmpl %esi,%edi /* if (edi < esi + count) do bkwrds copy */
409 movl %esi,%eax /* skip xmm if the data is not aligned */
420 movdqa 16(%esi),%xmm1
421 movdqa 32(%esi),%xmm2
422 movdqa 48(%esi),%xmm3
423 movdqa 64(%esi),%xmm4
424 movdqa 80(%esi),%xmm5
425 movdqa 96(%esi),%xmm6
426 movdqa 112(%esi),%xmm7
427 /*prefetchnta 128(%esi) 3dNOW */
431 * movdqa or movntdq can be used.
434 movdqa %xmm1,16(%edi)
435 movdqa %xmm2,32(%edi)
436 movdqa %xmm3,48(%edi)
437 movdqa %xmm4,64(%edi)
438 movdqa %xmm5,80(%edi)
439 movdqa %xmm6,96(%edi)
440 movdqa %xmm7,112(%edi)
458 /*prefetchnta 128(%esi) 3dNOW */
482 * GENERIC_BCOPY() - BACKWARDS COPY
484 * Don't bother using xmm optimizations, just stick with mmx.
501 /*prefetchnta -128(%esi)*/