 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/pc32/i386/bcopy.s,v 1.3 2004/04/30 02:59:14 dillon Exp $
 * bcopy(source:%esi, target:%edi, count:%ecx)
 *
 * note: esi, edi, eax, ecx, and edx may be destroyed
 */

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>
 * If memcpy/bcopy is called as part of a copyin or copyout, the
 * on-fault routine is set up to do a 'ret'.  We have to restore
 * %ebx and return to the copyin/copyout fault handler.
 */
	addl	$4,%esp			/* skip normal return vector */
	ret				/* return to copyin/copyout fault handler */
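
/*
 * Editor's sketch (not part of the original source): conceptually, the
 * on-fault vector acts like a recovery point registered by copyin or
 * copyout.  A faulting access abandons the normal return path and
 * unwinds to the caller's fault handler.  A user-space analogy in C,
 * with hypothetical names:
 *
 *	#include <setjmp.h>
 *	#include <stddef.h>
 *	#include <string.h>
 *
 *	static jmp_buf onfault_vector;	// stands in for the pushed on-fault routine
 *
 *	static int
 *	copyin_like(void *dst, const void *src, size_t len)
 *	{
 *		if (setjmp(onfault_vector) != 0)
 *			return (-1);	// arrived here via the fault path
 *		memcpy(dst, src, len);	// a real fault would longjmp(onfault_vector, 1)
 *		return (0);
 *	}
 */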
 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * Reasonably optimal on all modern machines.
 */

ENTRY(asm_generic_memcpy)		/* memcpy() entry point, uses optimal copy */
	pushl	$generic_onfault

ENTRY(asm_generic_bcopy)
	pushl	$generic_onfault
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
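
/*
 * Editor's sketch (not part of the original source): the two compares
 * above implement the usual overlap test.  A forward copy is safe when
 * the destination starts below the source, or at or beyond the end of
 * the source region; otherwise the copy must run backwards.  Equivalent
 * C, with hypothetical names:
 *
 *	#include <stddef.h>
 *
 *	static void
 *	overlap_safe_copy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (d < s || d >= s + len) {
 *			while (len--)		// forward copy is safe
 *				*d++ = *s++;
 *		} else {
 *			d += len;		// regions overlap, copy backwards
 *			s += len;
 *			while (len--)
 *				*--d = *--s;
 *		}
 *	}
 */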
 * GENERIC_BCOPY() - BACKWARDS COPY
 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * Reasonably optimal on all modern machines with MMX or SSE2.
 * XXX But very messy, we need a better way to use fp in the kernel.
 *
 * note: esi, edi, eax, ecx, and edx may be destroyed
 *
 * In order for the kernel to be able to use the FPU:
 *
 *	(1) The kernel may not already be using the fpu.
 *
 *	(2) If the fpu is owned by the application, we must save
 *	    its state.  If the fpu is not owned by the application,
 *	    the application's saved fp state may already exist
 *
 *	(3) We cannot allow the kernel to overwrite the application's
 *	    FPU state with our own, so we allocate space on the
 *	    stack and create a new TD_SAVEFPU, saving the old
 *
 *	(4) While we are using the FP unit, an interrupt may come
 *	    along and preempt us, causing our FP state to be saved.
 *	    We will fault/restore upon resumption.  Our FP state
 *	    will be saved on the stack.
 *
 *	(5) To clean up we throw away our FP state, zero out
 *	    npxthread to indicate that the application's FP state
 *	    is stored in TD_SAVEFPU, and we then restore the original
 *
 *	We do not attempt to restore the application's FP state.
 *	We set the TS bit to guarantee that the application will
 *	fault when it next tries to access the FP (to restore its
 *
 * NOTE: fxsave requires a 16-byte aligned address
 *
 * NOTE: RACES (which are ok):
 *
 *	+ interrupt saves fp state after we check npxthread but
 *	  before we call fxsave
 *	+ interrupt saves application fp state after we change
 *	  TD_SAVEFPU.  Data will be ignored.
 *	+ interrupt occurs in critical section.  interrupt will be
 *	  delayed until we return or block (unless we check for
 *	  pending interrupts but I'm not going to bother for now).
 *
 * MMX+XMM (SSE2): Typical on Athlons, later P4s. 128 bit media insn.
 * MMX: Typical on XPs and P3s. 64 bit media insn.
 */
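
/*
 * Editor's sketch (not part of the original source): the save/restore
 * protocol implemented by MMX_SAVE_BLOCK()/MMX_RESTORE_BLOCK() below,
 * written as C-style pseudocode.  npxthread and TD_SAVEFPU appear in
 * the surrounding code; everything else (struct savefpu, __aligned,
 * crit_enter/crit_exit, fxsave, clts, load_cr0, rcr0) is a stand-in
 * used for illustration only.
 *
 *	struct savefpu stack_area __aligned(16);	// fxsave needs 16-byte alignment
 *	struct savefpu *old_savefpu;
 *
 *	crit_enter();				// addl $TDPRI_CRIT,TD_PRI(%edx)
 *	if (npxthread == curthread)		// application state is live in the FPU
 *		fxsave(curthread->td_savefpu);	//   save it to its normal save area
 *	old_savefpu = curthread->td_savefpu;
 *	curthread->td_savefpu = &stack_area;	// our FP state, if saved, goes here
 *	npxthread = curthread;			// the kernel now owns the FPU
 *	clts();					// allow FP instructions without a DNA trap
 *	crit_exit();				// subl $TDPRI_CRIT,TD_PRI(%edx)
 *
 *	// ... perform the copy using %mm0-%mm7 / %xmm0-%xmm7 ...
 *
 *	npxthread = NULL;			// nothing of ours needs saving any more
 *	curthread->td_savefpu = old_savefpu;	// application state is back in TD_SAVEFPU
 *	load_cr0(rcr0() | CR0_TS);		// app faults and restores on next FP use
 */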
#define MMX_SAVE_BLOCK(missfunc)					\
	btsl	$1,PCPU(kernel_fpu_lock) ;				\
	movl	PCPU(curthread),%edx ;					\
	movl	TD_SAVEFPU(%edx),%ebx ;					\
	andl	$0xfffffff0,%esp ;					\
	addl	$TDPRI_CRIT,TD_PRI(%edx) ;				\
	cmpl	%edx,PCPU(npxthread) ;					\
	movl	%esp,TD_SAVEFPU(%edx) ;					\
	movl	%edx,PCPU(npxthread) ;					\
	subl	$TDPRI_CRIT,TD_PRI(%edx) ;				\
 * NOTE: RACES (which are ok):
 *
 *	+ interrupt occurs after we store NULL to npxthread.  No
 *	  state will be saved (because npxthread is NULL).  Thread
 *	  switches never restore npxthread, only a DNA trap does that.
 *	+ we can safely restore TD_SAVEFPU after NULLing npxthread.
 *	+ we can safely set TS any time after NULLing npxthread.
 */
#define MMX_RESTORE_BLOCK						\
#define MMX_RESTORE_BLOCK2						\
	movl	PCPU(curthread),%edx ;					\
	movl	$0,PCPU(npxthread) ;					\
	movl	%ebx,TD_SAVEFPU(%edx) ;					\
	movl	$0,PCPU(kernel_fpu_lock)
 * xmm/mmx_onfault routine.  Restore the fpu state, skip the normal
 * return vector, and return to the caller's on-fault routine
 * (which was pushed on the caller's stack just before it called us).
 */
 * MMX entry points - only support 64 bit media instructions
 */
ENTRY(asm_mmx_memcpy)			/* memcpy() entry point, uses optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)

	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */
 * XMM entry points - support 128 bit media instructions
 */
ENTRY(asm_xmm_memcpy)			/* memcpy() entry point, uses optimal copy */
	MMX_SAVE_BLOCK(asm_generic_memcpy)

	MMX_SAVE_BLOCK(asm_generic_bcopy)
	cmpl	%esi,%edi		/* if (edi < esi) fwd copy ok */
	cmpl	%esi,%edi		/* if (edi < esi + count) do bkwrds copy */

	movl	%esi,%eax		/* skip xmm if the data is not aligned */

	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	/*prefetchnta 128(%esi) 3dNOW */
	/*
	 * movdqa or movntdq can be used.
	 */
	movdqa	%xmm1,16(%edi)
	movdqa	%xmm2,32(%edi)
	movdqa	%xmm3,48(%edi)
	movdqa	%xmm4,64(%edi)
	movdqa	%xmm5,80(%edi)
	movdqa	%xmm6,96(%edi)
	movdqa	%xmm7,112(%edi)

	/*prefetchnta 128(%esi) 3dNOW */
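
/*
 * Editor's sketch (not part of the original source): the aligned
 * forwards loop above moves 128 bytes per iteration through
 * %xmm0-%xmm7.  An equivalent C fragment using SSE2 intrinsics
 * (hypothetical helper; both pointers must be 16-byte aligned and the
 * length a multiple of 128; _mm_stream_si128 would be the movntdq
 * variant of the stores):
 *
 *	#include <emmintrin.h>
 *	#include <stddef.h>
 *
 *	static void
 *	xmm_block_copy(void *dst, const void *src, size_t len)
 *	{
 *		__m128i *d = dst;
 *		const __m128i *s = src;
 *
 *		while (len >= 128) {
 *			__m128i r0 = _mm_load_si128(s + 0);	// movdqa loads
 *			__m128i r1 = _mm_load_si128(s + 1);
 *			__m128i r2 = _mm_load_si128(s + 2);
 *			__m128i r3 = _mm_load_si128(s + 3);
 *			__m128i r4 = _mm_load_si128(s + 4);
 *			__m128i r5 = _mm_load_si128(s + 5);
 *			__m128i r6 = _mm_load_si128(s + 6);
 *			__m128i r7 = _mm_load_si128(s + 7);
 *			_mm_store_si128(d + 0, r0);		// movdqa stores
 *			_mm_store_si128(d + 1, r1);
 *			_mm_store_si128(d + 2, r2);
 *			_mm_store_si128(d + 3, r3);
 *			_mm_store_si128(d + 4, r4);
 *			_mm_store_si128(d + 5, r5);
 *			_mm_store_si128(d + 6, r6);
 *			_mm_store_si128(d + 7, r7);
 *			s += 8;
 *			d += 8;
 *			len -= 128;
 *		}
 *	}
 */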
 * MMX BCOPY() - BACKWARDS COPY
 *
 * Don't bother using xmm optimizations, just stick with mmx.
 */

	/*prefetchnta -128(%esi)*/