2 * Copyright (c) 1989, 1990 William F. Jolitz.
3 * Copyright (c) 1990 The Regents of the University of California.
4 * Copyright (c) 2007 The FreeBSD Foundation
5 * Copyright (c) 2008 The DragonFly Project.
6 * Copyright (c) 2008 Jordan Gordeev.
9 * Portions of this software were developed by A. Joseph Koshy under
10 * sponsorship from the FreeBSD Foundation and Google, Inc.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 #include "opt_atpic.h"
41 #include <machine/asmacros.h>
42 #include <machine/psl.h>
43 #include <machine/trap.h>
44 #include <machine/segments.h>
50 .globl lwkt_switch_return
52 /*****************************************************************************/
54 /*****************************************************************************/
56 * Trap and fault vector routines.
58 * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes
59 * state on the stack but also disables interrupts. This is important for
60 * us for the use of the swapgs instruction. We cannot be interrupted
61 * until the GS.base value is correct. For most traps, we automatically
62 * then enable interrupts if the interrupted context had them enabled.
64 * The cpu will push a certain amount of state onto the kernel stack for
65 * the current process. See x86_64/include/frame.h.
66 * This includes the current RFLAGS (status register, which includes
67 * the interrupt disable state prior to the trap), the code segment register,
68 * and the return instruction pointer are pushed by the cpu. The cpu
69 * will also push an 'error' code for certain traps. We push a dummy
70 * error code for those traps where the cpu doesn't in order to maintain
71 * a consistent frame. We also push a contrived 'trap number'.
73 * The cpu does not push the general registers, we must do that, and we
74 * must restore them prior to calling 'iret'. The cpu adjusts the %cs and
75 * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
76 * must load them with appropriate values for supervisor mode operation.
83 * Interrupts must be disabled for all traps, otherwise horrible %gs
87 /* Regular traps; The cpu does not supply tf_err for these. */
90 movq $0,TF_XFLAGS(%rsp) ; \
91 movq $(a),TF_TRAPNO(%rsp) ; \
92 movq $0,TF_ADDR(%rsp) ; \
93 movq $0,TF_ERR(%rsp) ; \
96 /* This group of traps have tf_err already pushed by the cpu */
99 movq $(a),TF_TRAPNO(%rsp) ; \
100 movq $0,TF_ADDR(%rsp) ; \
101 movq $0,TF_XFLAGS(%rsp) ; \
105 * Due to a historical artifact, it is possible for a #DB exception
106 * to occur in certain bad places that would normally be protected by
107 * the interrupt gate's interrupt disablement.
109 * Due to this possibly occurring in the system call entry code, we also
110 * run #DB on an ist2 stack to force the cpu to load a new %rsp, otherwise
111 * it might push the cpu exception frame onto the user stack. To make things
112 * easier we just point ist2 at our trampoline area.
115 #ifdef DIRECT_DISALLOW_SS_CPUBUG
117 * Directly disallow #DB faults which can occur at critical points
118 * in the code due to a historical artifact of how the cpu operates.
119 * %gs state might not match RPL. Test the %rip and iretq immediately
120 * (valid %gs and %cr3 state not needed). If we don't need kernel
121 * reporting we can enable this and its a bit safer from unintended
124 * If this is not enabled the kernel still catches the problem. It
125 * will report the problem and continue properly.
129 cmpq $Xfast_syscall,0(%rsp)
134 * Ok, regardless of the RPL mask in the trap frame, we took
135 * the trap on a separate stack via ist2. This means we
136 * must copy it appropriately.
138 * If coming from userland we can skip directly to the normal
139 * TRAP code because it will handle the fact that we are on an
140 * alternative stack (dbgstack set by ist2), even though it isn't
141 * the trampoline stack). The frame will be moved to the correct
144 testb $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)
145 jnz 210f /* jnz from userland */
148 * From kernel - %gs and %cr3 may be inconsistent. Save original
149 * values and load consistent values, restore after return.
151 * The trap handler is NOT allowed to block for this case.
154 movq %rax, TR_RAX(%rsp)
155 movq %rcx, TR_RCX(%rsp)
156 movq %rdx, TR_RDX(%rsp)
159 movq %cr3,%rax /* save CR3 */
160 movq %rax, TR_PCB_CR3_SAVED(%rsp)
161 movl $MSR_GSBASE,%ecx /* save %gs */
165 movq %rax, TR_PCB_GS_SAVED(%rsp)
166 movq TR_PCB_GS_KERNEL(%rsp),%rdx /* retrieve kernel %gs */
170 movq PCPU(trampoline)+TR_PCB_CR3,%rax
173 movq TR_RDX(%rsp), %rdx
174 movq TR_RCX(%rsp), %rcx
175 movq TR_RAX(%rsp), %rax
179 * We are coming from the kernel.
181 * We are on the IST2 stack and, in fact, we have to *STAY* on this
182 * stack so no longer try to shift our frame to the kernel %rsp
183 * in the trap frame, since this %rsp might actually be a user %rsp
184 * in the mov mem,%ss + syscall DBG trap case.
186 * Run the normal trap. Because TF_CS is at a kernel RPL, the
187 * normal code will skip the usual swapgs and KMMU (trampoline)
188 * code. We've handled the rest.
190 * NOTE: at this point the trampframe is above the normal stack
191 * frame. The trap code will be ignorant of the special
192 * TR_* registers above the cpu hardware frame portion,
193 * and the TR_* registers below it will be overwritten.
196 movq $0,TF_XFLAGS(%rsp)
197 movq $T_TRCTRAP,TF_TRAPNO(%rsp)
198 movq $0,TF_ADDR(%rsp)
201 FAKE_MCOUNT(TF_RIP(%rsp))
208 * Pop the frame (since we're coming from kernel mode, this will
209 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
210 * iretq. Not optimal but more readable and this is not a
216 movq %rax, TR_RAX(%rsp)
217 movq %rcx, TR_RCX(%rsp)
218 movq %rdx, TR_RDX(%rsp)
220 movl $MSR_GSBASE,%ecx /* restore %gs */
221 movq TR_PCB_GS_SAVED(%rsp),%rdx
226 movq TR_PCB_CR3_SAVED(%rsp),%rax /* restore %cr3 */
229 movq TR_RAX(%rsp),%rax
230 movq TR_RCX(%rsp),%rcx
231 movq TR_RDX(%rsp),%rdx
235 * Direct iretq. No point jumping to doreti because the
236 * exception code that deals with iretq faults can't handle
237 * non-deterministic %gs/%cr3 state.
239 #ifdef DIRECT_DISALLOW_SS_CPUBUG
245 * From userland (normal trap path)
284 * alltraps entry point. Use swapgs if this is the first time in the
285 * kernel from userland. Reenable interrupts if they were enabled
288 * WARNING! %gs not available until after our swapgs code
292 .type alltraps,@function
297 movq %rdi,TF_RDI(%rsp)
298 alltraps_pushregs_no_rdi:
299 movq %rsi,TF_RSI(%rsp)
300 movq %rdx,TF_RDX(%rsp)
301 movq %rcx,TF_RCX(%rsp)
304 movq %rax,TF_RAX(%rsp)
305 movq %rbx,TF_RBX(%rsp)
306 movq %rbp,TF_RBP(%rsp)
307 movq %r10,TF_R10(%rsp)
308 movq %r11,TF_R11(%rsp)
309 movq %r12,TF_R12(%rsp)
310 movq %r13,TF_R13(%rsp)
311 movq %r14,TF_R14(%rsp)
312 movq %r15,TF_R15(%rsp)
315 FAKE_MCOUNT(TF_RIP(%rsp))
317 .type calltrap,@function
323 jmp doreti /* Handle any pending ASTs */
327 movq $T_DOUBLEFLT,TF_TRAPNO(%rsp)
328 movq $0,TF_ADDR(%rsp)
329 movq $0,TF_XFLAGS(%rsp)
333 call dblfault_handler
338 * We need to save the contents of %cr2 before PUSH_FRAME* messes
342 PUSH_FRAME_TFERR_SAVECR2
343 movq $T_PAGEFLT,TF_TRAPNO(%rsp)
344 movq $0,TF_XFLAGS(%rsp)
348 * We have to special-case this one. If we get a trap in doreti() at
349 * the iretq stage, we'll reenter as a kernel exception with the
350 * wrong gs and isolation state. We have to act as though we came
355 leaq doreti_iret(%rip),%r10
356 cmpq %r10,TF_RIP-TF_ERR+8(%rsp) /* +8 due to pushq */
358 testb $SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp) /* +8 due to pushq */
362 * Special fault during iretq
369 movq $T_PROTFLT,TF_TRAPNO(%rsp)
370 movq $0,TF_ADDR(%rsp)
371 movq $0,TF_XFLAGS(%rsp)
377 movq $T_PROTFLT,TF_TRAPNO(%rsp)
378 movq $0,TF_ADDR(%rsp)
379 movq $0,TF_XFLAGS(%rsp)
383 * Fast syscall entry point. We enter here with just our new %cs/%ss set,
384 * and the new privilege level. We are still running on the old user stack
385 * pointer. We have to juggle a few things around to find our stack etc.
386 * swapgs gives us access to our PCPU space only.
388 * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
391 swapgs /* get kernel %gs */
392 movq %rsp,PCPU(trampoline)+TR_CR2 /* save user %rsp */
393 movq PCPU(common_tss)+TSS_RSP0,%rsp
396 * NOTE: KMMUENTER_SYSCALL does not actually use the stack but
397 * adjust the stack pointer for correctness in case we
400 subq $TR_PCB_RSP,%rsp
402 movq PCPU(trampoline)+TR_PCB_RSP,%rsp
404 /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
406 /* defer TF_RSP till we have a spare register */
407 movq %r11,TF_RFLAGS(%rsp)
408 movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
409 movq PCPU(trampoline)+TR_CR2,%r11 /* %r11 already saved */
410 movq %r11,TF_RSP(%rsp) /* user stack pointer */
411 orl $RQF_QUICKRET,PCPU(reqflags)
412 movq $KUDSEL,TF_SS(%rsp)
413 movq $KUCSEL,TF_CS(%rsp)
415 movq $T_FAST_SYSCALL,TF_TRAPNO(%rsp) /* for the vkernel */
416 movq $0,TF_XFLAGS(%rsp) /* note: used in signal frame */
417 movq %rdi,TF_RDI(%rsp) /* arg 1 */
418 movq %rsi,TF_RSI(%rsp) /* arg 2 */
419 movq %rdx,TF_RDX(%rsp) /* arg 3 */
420 movq %r10,TF_RCX(%rsp) /* arg 4 */
421 movq %r8,TF_R8(%rsp) /* arg 5 */
422 movq %r9,TF_R9(%rsp) /* arg 6 */
423 movq %rax,TF_RAX(%rsp) /* syscall number */
424 movq %rbx,TF_RBX(%rsp) /* C preserved */
425 movq %rbp,TF_RBP(%rsp) /* C preserved */
426 movq %r12,TF_R12(%rsp) /* C preserved */
427 movq %r13,TF_R13(%rsp) /* C preserved */
428 movq %r14,TF_R14(%rsp) /* C preserved */
429 movq %r15,TF_R15(%rsp) /* C preserved */
431 xorq %rax,%rax /* SECURITY CLEAR REGS */
448 FAKE_MCOUNT(TF_RIP(%rsp))
453 * Fast return from system call
456 testl $RQF_IPIQ|RQF_TIMER|RQF_INTPEND|RQF_AST_MASK,PCPU(reqflags)
458 testl $RQF_QUICKRET,PCPU(reqflags)
462 movq TF_RBX(%rsp),%rbx /* SECURITY RESTORE */
463 movq TF_RCX(%rsp),%rcx
464 movq TF_RBP(%rsp),%rbp
467 xorq %r10,%r10 /* (security - clear scratch) */
469 movq TF_R12(%rsp),%r12
470 movq TF_R13(%rsp),%r13
471 movq TF_R14(%rsp),%r14
472 movq TF_R15(%rsp),%r15
474 movq TF_RDI(%rsp),%rdi /* NORMAL RESTORE */
475 movq TF_RSI(%rsp),%rsi
476 movq TF_RDX(%rsp),%rdx
477 movq TF_RAX(%rsp),%rax
478 movq TF_RFLAGS(%rsp),%r11
479 movq TF_RIP(%rsp),%rcx
480 movq TF_RSP(%rsp),%rsp
486 * Normal slow / full iret
493 * Here for CYA insurance, in case a "syscall" instruction gets
494 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
495 * to *something* if EFER_SCE is enabled.
497 IDTVEC(fast_syscall32)
501 * NMI handling is special.
503 * First, an NMI is taken on its own pcpu stack. RFLAGS.IF, %gs, and %cr3
504 * will be inconsistent when interrupting supervisor mode.
506 * Second, the processor treats NMIs specially, blocking further NMIs
507 * until an 'iretq' instruction is executed. We therefore need to
508 * execute the NMI handler with interrupts disabled to prevent a
509 * nested interrupt from executing an 'iretq' instruction and
510 * inadvertently taking the processor out of NMI mode.
514 * We don't need to special-case entry from userland, %gs will
515 * be consistent with expectations.
517 testb $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp) ; /* from userland? */ \
521 * From kernel - %gs and %cr3 may be inconsistent. Save original
522 * values and load consistent values, restore on return.
524 * The trap handler is NOT allowed to block for this case.
527 movq %rax, TR_RAX(%rsp)
528 movq %rcx, TR_RCX(%rsp)
529 movq %rdx, TR_RDX(%rsp)
532 movq %cr3,%rax /* save CR3 */
533 movq %rax, TR_PCB_CR3_SAVED(%rsp)
534 movl $MSR_GSBASE,%ecx /* save %gs */
538 movq %rax, TR_PCB_GS_SAVED(%rsp)
539 movq TR_PCB_GS_KERNEL(%rsp),%rdx /* retrieve kernel %gs */
544 movq TR_PCB_CR3(%rsp),%rax /* retrieve kernel %cr3 */
546 movq PCPU(trampoline)+TR_PCB_CR3,%rax
549 movq TR_RDX(%rsp), %rdx
550 movq TR_RCX(%rsp), %rcx
551 movq TR_RAX(%rsp), %rax
555 * Ok, run the normal trap. Because TF_CS is at a kernel RPL,
556 * the normal code will skip the usual swapgs and KMMU (trampoline)
557 * code. We've handled the rest.
559 * NOTE: at this point the trampframe is above the normal stack
560 * frame. The trap code will be ignorant of the special
561 * TR_* registers above the cpu hardware frame portion,
562 * and the TR_* registers below it will be overwritten.
565 movq $0,TF_XFLAGS(%rsp)
566 movq $T_NMI,TF_TRAPNO(%rsp)
567 movq $0,TF_ADDR(%rsp)
570 FAKE_MCOUNT(TF_RIP(%rsp))
577 * Pop the frame (since we're coming from kernel mode, this will
578 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
579 * iretq. Not optimal but more readable and this is not a
585 movq %rax, TR_RAX(%rsp)
586 movq %rcx, TR_RCX(%rsp)
587 movq %rdx, TR_RDX(%rsp)
589 movl $MSR_GSBASE,%ecx /* restore %gs */
590 movq TR_PCB_GS_SAVED(%rsp),%rdx
595 movq TR_PCB_CR3_SAVED(%rsp),%rax /* restore %cr3 */
598 movq TR_RAX(%rsp),%rax
599 movq TR_RCX(%rsp),%rcx
600 movq TR_RDX(%rsp),%rdx
604 * Direct iretq. No point jumping to doreti because the
605 * exception code that deals with iretq faults can't handle
606 * non-deterministic %gs/%cr3 state.
611 * From userland (normal trap path)
615 movq $0,TF_XFLAGS(%rsp)
616 movq $T_NMI,TF_TRAPNO(%rsp)
617 movq $0,TF_ADDR(%rsp)
620 FAKE_MCOUNT(TF_RIP(%rsp))
626 POP_FRAME(jmp doreti_iret)
629 * This function is what cpu_heavy_restore jumps to after a new process
630 * is created. The LWKT subsystem switches while holding a critical
631 * section and we maintain that abstraction here (e.g. because
632 * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of
633 * it before calling the initial function (typically fork_return()) and/or
634 * returning to user mode.
636 * The MP lock is not held at any point but the critcount is bumped
637 * on entry to prevent interruption of the trampoline at a bad point.
639 * This is effectively what td->td_switch() returns to. It 'returns' the
640 * old thread in %rax and since this is not returning to a td->td_switch()
641 * call from lwkt_switch() we must handle the cleanup for the old thread
642 * by calling lwkt_switch_return().
644 * fork_trampoline(%rax:otd, %rbx:func, %r12:arg)
646 ENTRY(fork_trampoline)
648 call lwkt_switch_return
649 movq PCPU(curthread),%rax
650 decl TD_CRITCOUNT(%rax)
653 * cpu_set_fork_handler intercepts this function call to
654 * have this call a non-return function to stay in kernel mode.
656 * initproc has its own fork handler, start_init(), which DOES
659 * %rbx - chaining function (typically fork_return)
660 * %r12 -> %rdi (argument)
661 * frame-> %rsi (trap frame)
663 * void (func:rbx)(arg:rdi, trapframe:rsi)
665 movq %rsp, %rsi /* pass trapframe by reference */
666 movq %r12, %rdi /* arg1 */
667 call *%rbx /* function */
669 /* cut from syscall */
675 * Return via doreti to handle ASTs.
677 * trapframe is at the top of the stack.
683 * To efficiently implement classification of trap and interrupt handlers
684 * for profiling, there must be only trap handlers between the labels btrap
685 * and bintr, and only interrupt handlers between the labels bintr and
686 * eintr. This is implemented (partly) by including files that contain
687 * some of the handlers. Before including the files, set up a normal asm
688 * environment so that the included files don't need to know that they are
700 #include <x86_64/x86_64/apic_vector.S>
709 #include <x86_64/isa/atpic_vector.S>