/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/i386/i386/Attic/bcopy.s,v 1.5 2004/05/05 19:26:38 dillon Exp $
 */
/*
 * bcopy(source:%esi, target:%edi, count:%ecx)
 *
 * note: esi, edi, eax, ecx, and edx may be destroyed
 */
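/*
 * For reference, a C-level sketch of the contract these routines
 * implement (illustrative only; the assembly entry points take their
 * arguments in %esi/%edi/%ecx as noted above, not on the stack):
 *
 *	void bcopy(const void *src, void *dst, size_t count);
 *	void *memcpy(void *dst, const void *src, size_t count);
 *
 * bcopy() must handle overlapping regions; memcpy() need not.
 */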
#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>
/*
 * bcopyb() is a 'dumb' byte-granular bcopy.  It is only used by
 * devices that need to bcopy device-mapped memory which cannot
 * otherwise handle 16 or 32 bit ops.
 */
	cmpl	%ecx,%eax		/* overlapping && src < dst? */
	cld				/* nope, copy forwards */
	addl	%ecx,%edi		/* copy backwards. */
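	/*
	 * C-level sketch of the byte-granular, overlap-aware copy done
	 * here (illustrative only, not part of the original file):
	 *
	 *	if (dst > src && dst < src + count) {
	 *		src += count;		// overlap: copy backwards
	 *		dst += count;
	 *		while (count--)
	 *			*--dst = *--src;
	 *	} else {
	 *		while (count--)		// copy forwards
	 *			*dst++ = *src++;
	 *	}
	 */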
/*
 * If memcpy/bcopy is called as part of a copyin or copyout, the
 * on-fault routine is set up to do a 'ret'.  We have to restore
 * %ebx and return to the copyin/copyout fault handler.
 */
	addl	$4,%esp			/* skip normal return vector */
	ret				/* return to copyin/copyout fault handler */
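/*
 * Sketch of the stack at this point (an inference from the comments
 * above, not taken from the original file):
 *
 *	(%esp)	normal return vector (discarded by the addl)
 *	4(%esp)	caller's on-fault handler, set up by copyin/copyout
 *
 * Discarding the normal return vector leaves the fault handler's
 * address on top of the stack, so the 'ret' transfers control to it.
 */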
/*
 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * Reasonably optimal on all modern machines.
 */
ENTRY(asm_generic_memcpy)	/* memcpy() entry point, use optimal copy */
	pushl	$generic_onfault

ENTRY(asm_generic_bcopy)
	pushl	$generic_onfault
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
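	/*
	 * The two compares implement this decision, sketched in C
	 * (illustrative only):
	 *
	 *	if (dst < src || dst >= src + count)
	 *		copy_forwards();	// no dangerous overlap
	 *	else
	 *		copy_backwards();	// dst lies within the source
	 */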
/*
 * GENERIC_BCOPY() - BACKWARDS COPY
 */
/*
 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * note: esi, edi, eax, ecx, and edx are allowed to be destroyed.
 *
 * In order for the kernel to be able to use the FPU:
 *
 *	(1) The kernel may not already be using the fpu.
 *
 *	(2) If the fpu is owned by the application, we must save
 *	    its state.  If the fpu is not owned by the application
 *	    the application's saved fp state may already exist
 *	    in its save area.
 *
 *	(3) We cannot allow the kernel to overwrite the application's
 *	    FPU state with our own, so we make sure the application's
 *	    FPU state has been saved and then point TD_SAVEFPU at a
 *	    temporary fpu save area in the globaldata structure.
 *
 * If gd_npxthread is not NULL we must save the application's
 * current FP state to the current save area and then NULL
 * out gd_npxthread to interlock against new interruptions
 * changing the FP state further.
 *
 * If gd_npxthread is NULL the FP unit is in a known 'safe'
 * state and may be used once the new save area is installed.
 *
 * race(1): If an interrupt occurs just prior to calling fxsave,
 * all that happens is that the fxsave takes an npxdna trap, which
 * restores the app's environment; the restarted fxsave then simply
 * saves it again.
 *
 * race(2): No interrupt can safely occur after we NULL-out
 * npxthread until we fninit, because the kernel assumes that
 * the FP unit is in a safe state when npxthread is NULL.  It's
 * more convenient to use a cli sequence here (it is not
 * considered to be in the critical path), but a critical
 * section would also work.
 *
 * race(3): The FP unit is in a known state (because npxthread
 * was either previously NULL or we saved and init'd and made
 * it NULL).  This is true even if we are preempted and the
 * preempting thread uses the FP unit, because it will be
 * fninit'd again on return.  ANY STATE WE SAVE TO THE FPU MAY
 * BE DESTROYED BY PREEMPTION WHILE NPXTHREAD IS NULL!  However,
 * an interrupt occurring between clts and the setting of
 * gd_npxthread may set the TS bit again and cause the next
 * npxdna() to panic when it sees a non-NULL gd_npxthread.
 *
 * We can safely set TD_SAVEFPU to point to a new uninitialized
 * save area and then set GD_NPXTHREAD to non-NULL.  If an
 * interrupt occurs after we set GD_NPXTHREAD, all that happens
 * is that the safe FP state gets saved and restored.  We do not
 * need to fninit again.
 *
 * We can safely clts after setting up the new save-area, before
 * installing gd_npxthread, even if we get preempted just after
 * calling clts.  This is because the FP unit will be in a safe
 * state while gd_npxthread is NULL.  Setting gd_npxthread will
 * simply lock-in that safe-state.  Calling clts saves
 * unnecessary trap overhead since we are about to use the FP
 * unit anyway and don't need to 'restore' any state prior to
 * using it.
 *
 * MMX+XMM (SSE2): Typical on Athlons, later P4s.  128 bit media insn.
 * MMX: Typical on XPs and P3s.  64 bit media insn.
 */
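/*
 * The protocol above, as C-like pseudocode (a sketch; the field and
 * helper names mirror the comments and the MMX_SAVE_BLOCK macro below,
 * not necessarily real kernel declarations):
 *
 *	crit_enter();				// TD_PRI += TDPRI_CRIT
 *	if (gd->gd_npxthread != NULL) {
 *		fxsave(td->td_savefpu);		// race(1): save app state
 *		gd->gd_npxthread = NULL;	// interlock interruptions
 *		fninit();			// race(2): known safe state
 *	}
 *	td->td_savefpu = &gd->gd_savefpu;	// temporary save area
 *	clts();					// safe while npxthread is NULL
 *	gd->gd_npxthread = td;			// race(3): lock in safe state
 *	crit_exit();
 */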
#define MMX_SAVE_BLOCK(missfunc)					\
	movl	MYCPU,%eax ;		/* EAX = MYCPU */		\
	btsl	$1,GD_FPU_LOCK(%eax) ;	/* take fpu lock bit */		\
	movl	GD_CURTHREAD(%eax),%edx ; /* EDX = CURTHREAD */		\
	movl	TD_SAVEFPU(%edx),%ebx ;	/* save app save area */	\
	addl	$TDPRI_CRIT,TD_PRI(%edx) ; /* crit_enter() */		\
	cmpl	$0,GD_NPXTHREAD(%eax) ;	/* fpu owned by a thread? */	\
	fxsave	0(%ebx) ;		/* race(1) */			\
	movl	$0,GD_NPXTHREAD(%eax) ;	/* interlock intr */		\
	fninit ;			/* race(2) */			\
	leal	GD_SAVEFPU(%eax),%ecx ;					\
	movl	%ecx,TD_SAVEFPU(%edx) ;	/* install temp save area */	\
	movl	%edx,GD_NPXTHREAD(%eax) ; /* race(3) */			\
	subl	$TDPRI_CRIT,TD_PRI(%edx) ; /* crit_exit() */		\
	cmpl	$0,GD_REQFLAGS(%eax) ;	/* pending requests? */		\
	cmpl	$TDPRI_CRIT,TD_PRI(%edx) ; /* still in crit section? */	\
	call	lwkt_yield_quick ;					\
	/* note: eax,ecx,edx destroyed */				\
	movl	$mmx_onfault,(%esp)
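/*
 * Usage note (an inference from the parameter name and the entry points
 * below, not stated in the original): MMX_SAVE_BLOCK(missfunc) is
 * expected to branch to 'missfunc', the plain integer copy routine,
 * when the FPU cannot be acquired (e.g. the fpu lock bit was already
 * set), so callers always get a working copy even without the FPU.
 */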
/*
 * When restoring the application's FP state we must first clear
 * npxthread to prevent further saves, then restore the pointer
 * to the app's save area.  We do not have to (and should not)
 * restore the app's FP state now.  Note that we do not have to
 * call fninit because our use of the FP guarantees that it is in
 * a 'safe' state (at least for kernel use).
 *
 * NOTE: it is not usually safe to mess with CR0 outside of a
 * critical section, because TS may get set by a preemptive
 * interrupt.  However, we *can* race a load/set-ts/store against
 * an interrupt doing the same thing.
 */
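/*
 * The restore path as C-like pseudocode (a sketch mirroring
 * MMX_RESTORE_BLOCK2 below; the CR0_TS step is an assumption based on
 * the NOTE above, not taken from the original file):
 *
 *	gd->gd_npxthread = NULL;	// prevent further saves
 *	td->td_savefpu = app_area;	// %ebx saved by MMX_SAVE_BLOCK
 *	set_ts();			// assumed: lazy-restore app state
 *	gd->gd_fpu_lock = 0;		// release the fpu lock
 */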
#define MMX_RESTORE_BLOCK

#define MMX_RESTORE_BLOCK2						\
	movl	GD_CURTHREAD(%ecx),%edx ;				\
	movl	$0,GD_NPXTHREAD(%ecx) ;	/* clear npxthread */		\
	movl	%ebx,TD_SAVEFPU(%edx) ;	/* restore app save area */	\
	movl	$0,GD_FPU_LOCK(%ecx)	/* release fpu lock */
/*
 * xmm/mmx_onfault routine.  Restore the fpu state, skip the normal
 * return vector, and return to the caller's on-fault routine
 * (which was pushed on the caller's stack just before it called us).
 */
/*
 * MMX entry points - only support 64 bit media instructions
 */
ENTRY(asm_mmx_memcpy)		/* memcpy() entry point, use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)

	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
/*
 * XMM entry points - support 128 bit media instructions
 */
ENTRY(asm_xmm_memcpy)		/* memcpy() entry point, use optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)

	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
	movl	%esi,%eax		/* skip xmm if the data is not aligned */
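	/*
	 * movdqa requires 16-byte aligned operands and faults otherwise,
	 * so unaligned data falls back to the 64-bit mmx loop.  In C
	 * terms (a sketch):
	 *
	 *	if (((uintptr_t)src & 15) != 0)
	 *		use_mmx_loop();		// hypothetical fallback
	 */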
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	/*prefetchnta 128(%esi) 3dNOW */
	/*
	 * movdqa or movntdq can be used.
	 */
	movdqa	%xmm1,16(%edi)
	movdqa	%xmm2,32(%edi)
	movdqa	%xmm3,48(%edi)
	movdqa	%xmm4,64(%edi)
	movdqa	%xmm5,80(%edi)
	movdqa	%xmm6,96(%edi)
	movdqa	%xmm7,112(%edi)
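	/*
	 * Note on the choice above: movdqa stores through the cache,
	 * while movntdq is a non-temporal store that bypasses it (and
	 * requires an sfence for ordering).  movntdq can win for copies
	 * much larger than the cache; the cached form is used here.
	 */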
	/*prefetchnta 128(%esi) 3dNOW */
/*
 * MMX BCOPY() - BACKWARDS COPY
 *
 * Don't bother using xmm optimizations, just stick with mmx.
 */
	/*prefetchnta -128(%esi)*/