Remove upc_{control,register} syscalls and everything that has to do with it.
[dragonfly.git] / sys / platform / vkernel64 / x86_64 / trap.c
CommitLineData
da673940
JG
1/*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
38 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
da673940
JG
39 */
40
41/*
a76ca9b9 42 * x86_64 Trap and System call handling
da673940
JG
43 */
44
45#include "use_isa.h"
46
47#include "opt_ddb.h"
48#include "opt_ktrace.h"
49
50#include <sys/param.h>
51#include <sys/systm.h>
52#include <sys/proc.h>
53#include <sys/pioctl.h>
54#include <sys/kernel.h>
55#include <sys/resourcevar.h>
56#include <sys/signalvar.h>
57#include <sys/signal2.h>
58#include <sys/syscall.h>
59#include <sys/sysctl.h>
60#include <sys/sysent.h>
61#include <sys/uio.h>
62#include <sys/vmmeter.h>
63#include <sys/malloc.h>
64#ifdef KTRACE
65#include <sys/ktrace.h>
66#endif
67#include <sys/ktr.h>
da673940
JG
68#include <sys/vkernel.h>
69#include <sys/sysproto.h>
70#include <sys/sysunion.h>
71#include <sys/vmspace.h>
72
73#include <vm/vm.h>
74#include <vm/vm_param.h>
75#include <sys/lock.h>
76#include <vm/pmap.h>
77#include <vm/vm_kern.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_extern.h>
81
82#include <machine/cpu.h>
83#include <machine/md_var.h>
84#include <machine/pcb.h>
85#include <machine/smp.h>
86#include <machine/tss.h>
87#include <machine/globaldata.h>
88
da673940 89#include <ddb/ddb.h>
0e6594a8 90
da673940
JG
91#include <sys/msgport2.h>
92#include <sys/thread2.h>
0e6594a8 93#include <sys/mplock2.h>
da673940 94
da673940
JG
/*
 * MAKEMPSAFE(have_mplock)
 *
 * Acquire the MP lock via get_mplock() unless the caller's tracking
 * variable says it is already held, then record that it is held.
 *
 * The argument must be a modifiable integer lvalue; it is evaluated more
 * than once.  The body is wrapped in do { } while (0) so the macro expands
 * to exactly one statement and can be used safely as the unbraced body of
 * an if/else.
 */
#define MAKEMPSAFE(have_mplock)				\
	do {						\
		if ((have_mplock) == 0) {		\
			get_mplock();			\
			(have_mplock) = 1;		\
		}					\
	} while (0)
100
da673940
JG
101int (*pmath_emulate) (struct trapframe *);
102
103extern int trapwrite (unsigned addr);
104
105static int trap_pfault (struct trapframe *, int, vm_offset_t);
106static void trap_fatal (struct trapframe *, int, vm_offset_t);
107void dblfault_handler (void);
108
109#if 0
110extern inthand_t IDTVEC(syscall);
111#endif
112
/*
 * Human readable names for trap numbers, indexed by trap type.
 *
 * MAX_TRAP_MSG is the highest valid index; callers must range check the
 * trap type against it before indexing.  Unused slots hold an empty
 * string (never NULL) so they can always be passed to a "%s" format.
 * The table is never modified, so both the pointers and the strings are
 * declared const.
 */
#define MAX_TRAP_MSG		30
static const char * const trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
	"SIMD floating-point exception",	/* 29 T_XMMFLT */
	"reserved (unknown) fault",		/* 30 T_RESERVED */
};
147
148#ifdef DDB
149static int ddb_on_nmi = 1;
150SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
151 &ddb_on_nmi, 0, "Go to DDB on NMI");
152#endif
153static int panic_on_nmi = 1;
154SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
155 &panic_on_nmi, 0, "Panic on NMI");
156static int fast_release;
157SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW,
158 &fast_release, 0, "Passive Release was optimal");
159static int slow_release;
160SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
161 &slow_release, 0, "Passive Release was nonoptimal");
da673940
JG
162
163MALLOC_DEFINE(M_SYSMSG, "sysmsg", "sysmsg structure");
164extern int max_sysmsg;
165
166/*
bab69519
JM
167 * Passively intercepts the thread switch function to increase
168 * the thread priority from a user priority to a kernel priority, reducing
da673940 169 * syscall and trap overhead for the case where no switch occurs.
73e24181
MD
170 *
171 * Synchronizes td_ucred with p_ucred. This is used by system calls,
172 * signal handling, faults, AST traps, and anything else that enters the
173 * kernel from userland and provides the kernel with a stable read-only
174 * copy of the process ucred.
da673940 175 */
da673940 176static __inline void
73e24181 177userenter(struct thread *curtd, struct proc *curp)
da673940 178{
73e24181
MD
179 struct ucred *ocred;
180 struct ucred *ncred;
181
da673940 182 curtd->td_release = lwkt_passive_release;
73e24181
MD
183
184 if (curtd->td_ucred != curp->p_ucred) {
185 ncred = crhold(curp->p_ucred);
186 ocred = curtd->td_ucred;
187 curtd->td_ucred = ncred;
188 if (ocred)
189 crfree(ocred);
190 }
da673940
JG
191}
192
193/*
7adb15b6 194 * Handle signals, profiling, and other AST's and/or tasks that
da673940
JG
195 * must be completed before we can return to or try to return to userland.
196 *
197 * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
198 * arithmetic on the delta calculation so the absolute tick values are
199 * truncated to an integer.
200 */
201static void
202userret(struct lwp *lp, struct trapframe *frame, int sticks)
203{
204 struct proc *p = lp->lwp_proc;
205 int sig;
206
207 /*
208 * Charge system time if profiling. Note: times are in microseconds.
209 * This may do a copyout and block, so do it first even though it
210 * means some system time will be charged as user time.
211 */
4643740a 212 if (p->p_flags & P_PROFIL) {
da673940
JG
213 addupc_task(p, frame->tf_rip,
214 (u_int)((int)lp->lwp_thread->td_sticks - sticks));
215 }
216
217recheck:
218 /*
62ae46c9
MD
219 * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
220 * LWP_MP_VNLRU, etc).
da673940 221 */
62ae46c9
MD
222 if (lp->lwp_mpflags & LWP_MP_URETMASK)
223 lwpuserret(lp);
da673940
JG
224
225 /*
226 * Block here if we are in a stopped state.
227 */
228 if (p->p_stat == SSTOP) {
4643740a 229 lwkt_gettoken(&p->p_token);
da673940 230 tstop();
4643740a 231 lwkt_reltoken(&p->p_token);
da673940
JG
232 goto recheck;
233 }
234
235 /*
898e34b3
MD
236 * Post any pending upcalls. If running a virtual kernel be sure
237 * to restore the virtual kernel's vmspace before posting the upcall.
da673940 238 */
7adb15b6 239 if (p->p_flags & (P_SIGVTALRM | P_SIGPROF)) {
fc509460 240 lwkt_gettoken(&p->p_token);
4643740a
MD
241 if (p->p_flags & P_SIGVTALRM) {
242 p->p_flags &= ~P_SIGVTALRM;
898e34b3
MD
243 ksignal(p, SIGVTALRM);
244 }
4643740a
MD
245 if (p->p_flags & P_SIGPROF) {
246 p->p_flags &= ~P_SIGPROF;
898e34b3
MD
247 ksignal(p, SIGPROF);
248 }
fc509460 249 lwkt_reltoken(&p->p_token);
da673940
JG
250 goto recheck;
251 }
252
253 /*
254 * Post any pending signals
2883d2d8
MD
255 *
256 * WARNING! postsig() can exit and not return.
da673940
JG
257 */
258 if ((sig = CURSIG_TRACE(lp)) != 0) {
fc509460 259 lwkt_gettoken(&p->p_token);
da673940 260 postsig(sig);
fc509460 261 lwkt_reltoken(&p->p_token);
da673940
JG
262 goto recheck;
263 }
264
265 /*
266 * block here if we are swapped out, but still process signals
267 * (such as SIGKILL). proc0 (the swapin scheduler) is already
268 * aware of our situation, we do not have to wake it up.
269 */
4643740a 270 if (p->p_flags & P_SWAPPEDOUT) {
616516c8 271 lwkt_gettoken(&p->p_token);
da673940 272 get_mplock();
4643740a 273 p->p_flags |= P_SWAPWAIT;
da673940 274 swapin_request();
4643740a 275 if (p->p_flags & P_SWAPWAIT)
da673940 276 tsleep(p, PCATCH, "SWOUT", 0);
4643740a 277 p->p_flags &= ~P_SWAPWAIT;
da673940 278 rel_mplock();
616516c8 279 lwkt_reltoken(&p->p_token);
da673940
JG
280 goto recheck;
281 }
282
283 /*
6562e2d8
MD
284 * In a multi-threaded program it is possible for a thread to change
285 * signal state during a system call which temporarily changes the
286 * signal mask. In this case postsig() might not be run and we
287 * have to restore the mask ourselves.
da673940 288 */
6562e2d8
MD
289 if (lp->lwp_flags & LWP_OLDMASK) {
290 lp->lwp_flags &= ~LWP_OLDMASK;
291 lp->lwp_sigmask = lp->lwp_oldsigmask;
292 goto recheck;
293 }
da673940
JG
294}
295
296/*
297 * Cleanup from userenter and any passive release that might have occurred.
298 * We must reclaim the current-process designation before we can return
299 * to usermode. We also handle both LWKT and USER reschedule requests.
300 */
301static __inline void
302userexit(struct lwp *lp)
303{
304 struct thread *td = lp->lwp_thread;
305 /* globaldata_t gd = td->td_gd; */
306
307 /*
308 * Handle stop requests at kernel priority. Any requests queued
309 * after this loop will generate another AST.
310 */
311 while (lp->lwp_proc->p_stat == SSTOP) {
4643740a 312 lwkt_gettoken(&lp->lwp_proc->p_token);
da673940 313 tstop();
4643740a 314 lwkt_reltoken(&lp->lwp_proc->p_token);
da673940
JG
315 }
316
317 /*
318 * Reduce our priority in preparation for a return to userland. If
319 * our passive release function was still in place, our priority was
320 * never raised and does not need to be reduced.
321 */
322 lwkt_passive_recover(td);
323
324 /*
325 * Become the current user scheduled process if we aren't already,
326 * and deal with reschedule requests and other factors.
327 */
328 lp->lwp_proc->p_usched->acquire_curproc(lp);
329 /* WARNING: we may have migrated cpu's */
330 /* gd = td->td_gd; */
331}
332
333#if !defined(KTR_KERNENTRY)
334#define KTR_KERNENTRY KTR_ALL
335#endif
336KTR_INFO_MASTER(kernentry);
b19d1a6b
SW
337KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
338 "TRAP(pid %hd, tid %hd, trapno %ld, eva %lu)",
339 pid_t pid, lwpid_t tid, register_t trapno, vm_offset_t eva);
340KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %hd, tid %hd)",
341 pid_t pid, lwpid_t tid);
342KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %hd, tid %hd, nr %ld)",
343 pid_t pid, lwpid_t tid, register_t trapno);
344KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %hd, tid %hd, err %d)",
345 pid_t pid, lwpid_t tid, int err);
346KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %hd, tid %hd)",
347 pid_t pid, lwpid_t tid);
da673940
JG
348
349/*
350 * Exception, fault, and trap interface to the kernel.
351 * This common code is called from assembly language IDT gate entry
352 * routines that prepare a suitable stack frame, and restore this
353 * frame after the exception has been processed.
354 *
355 * This function is also called from doreti in an interlock to handle ASTs.
356 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap
357 *
358 * NOTE! We have to retrieve the fault address prior to obtaining the
359 * MP lock because get_mplock() may switch out. YYY cr2 really ought
360 * to be retrieved by the assembly code, not here.
361 *
362 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
363 * if an attempt is made to switch from a fast interrupt or IPI. This is
364 * necessary to properly take fatal kernel traps on SMP machines if
365 * get_mplock() has to block.
366 */
367
368void
369user_trap(struct trapframe *frame)
370{
371 struct globaldata *gd = mycpu;
372 struct thread *td = gd->gd_curthread;
373 struct lwp *lp = td->td_lwp;
374 struct proc *p;
375 int sticks = 0;
376 int i = 0, ucode = 0, type, code;
da673940 377 int have_mplock = 0;
da673940 378#ifdef INVARIANTS
f9235b6d 379 int crit_count = td->td_critcount;
3933a3ab 380 lwkt_tokref_t curstop = td->td_toks_stop;
da673940
JG
381#endif
382 vm_offset_t eva;
383
384 p = td->td_proc;
385
386 if (frame->tf_trapno == T_PAGEFLT)
387 eva = frame->tf_addr;
388 else
389 eva = 0;
390#if 0
391 kprintf("USER_TRAP AT %08lx xflags %ld trapno %ld eva %08lx\n",
392 frame->tf_rip, frame->tf_xflags, frame->tf_trapno, eva);
393#endif
394
395 /*
396 * Everything coming from user mode runs through user_trap,
397 * including system calls.
398 */
571cdd83 399 if (frame->tf_trapno == T_FAST_SYSCALL) {
da673940
JG
400 syscall2(frame);
401 return;
402 }
403
404 KTR_LOG(kernentry_trap, lp->lwp_proc->p_pid, lp->lwp_tid,
405 frame->tf_trapno, eva);
406
407#ifdef DDB
408 if (db_active) {
409 eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
410 ++gd->gd_trap_nesting_level;
411 MAKEMPSAFE(have_mplock);
412 trap_fatal(frame, TRUE, eva);
413 --gd->gd_trap_nesting_level;
414 goto out2;
415 }
416#endif
417
da673940
JG
418 type = frame->tf_trapno;
419 code = frame->tf_err;
420
73e24181 421 userenter(td, p);
da673940
JG
422
423 sticks = (int)td->td_sticks;
424 lp->lwp_md.md_regs = frame;
425
426 switch (type) {
427 case T_PRIVINFLT: /* privileged instruction fault */
da673940 428 i = SIGILL;
bab69519 429 ucode = ILL_PRVOPC;
da673940
JG
430 break;
431
432 case T_BPTFLT: /* bpt instruction fault */
433 case T_TRCTRAP: /* trace trap */
434 frame->tf_rflags &= ~PSL_T;
435 i = SIGTRAP;
bab69519 436 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
da673940
JG
437 break;
438
439 case T_ARITHTRAP: /* arithmetic trap */
440 ucode = code;
441 i = SIGFPE;
442 break;
443
444 case T_ASTFLT: /* Allow process switch */
445 mycpu->gd_cnt.v_soft++;
446 if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
2a418930
MD
447 atomic_clear_int(&mycpu->gd_reqflags, RQF_AST_OWEUPC);
448 addupc_task(p, p->p_prof.pr_addr, p->p_prof.pr_ticks);
da673940
JG
449 }
450 goto out;
451
452 /*
453 * The following two traps can happen in
454 * vm86 mode, and, if so, we want to handle
455 * them specially.
456 */
457 case T_PROTFLT: /* general protection fault */
458 case T_STKFLT: /* stack fault */
459#if 0
460 if (frame->tf_eflags & PSL_VM) {
461 i = vm86_emulate((struct vm86frame *)frame);
462 if (i == 0)
463 goto out;
464 break;
465 }
466#endif
467 /* FALL THROUGH */
468
469 case T_SEGNPFLT: /* segment not present fault */
470 case T_TSSFLT: /* invalid TSS fault */
471 case T_DOUBLEFLT: /* double fault */
472 default:
da673940 473 i = SIGBUS;
bab69519 474 ucode = code + BUS_SEGM_FAULT ;
da673940
JG
475 break;
476
477 case T_PAGEFLT: /* page fault */
478 MAKEMPSAFE(have_mplock);
479 i = trap_pfault(frame, TRUE, eva);
fbff57d1 480 if (i == -1 || i == 0)
da673940
JG
481 goto out;
482
bab69519
JM
483
484 if (i == SIGSEGV)
485 ucode = SEGV_MAPERR;
486 else {
487 i = SIGSEGV;
488 ucode = SEGV_ACCERR;
489 }
da673940
JG
490 break;
491
492 case T_DIVIDE: /* integer divide fault */
493 ucode = FPE_INTDIV;
494 i = SIGFPE;
495 break;
496
497#if NISA > 0
498 case T_NMI:
499 MAKEMPSAFE(have_mplock);
500 /* machine/parity/power fail/"kitchen sink" faults */
501 if (isa_nmi(code) == 0) {
502#ifdef DDB
503 /*
504 * NMI can be hooked up to a pushbutton
505 * for debugging.
506 */
507 if (ddb_on_nmi) {
508 kprintf ("NMI ... going to debugger\n");
bab69519 509 kdb_trap(type, 0, frame);
da673940
JG
510 }
511#endif /* DDB */
512 goto out2;
513 } else if (panic_on_nmi)
514 panic("NMI indicates hardware failure");
515 break;
516#endif /* NISA > 0 */
517
518 case T_OFLOW: /* integer overflow fault */
519 ucode = FPE_INTOVF;
520 i = SIGFPE;
521 break;
522
523 case T_BOUND: /* bounds check fault */
524 ucode = FPE_FLTSUB;
525 i = SIGFPE;
526 break;
527
528 case T_DNA:
529 /*
530 * Virtual kernel intercept - pass the DNA exception
531 * to the (emulated) virtual kernel if it asked to handle
532 * it. This occurs when the virtual kernel is holding
533 * onto the FP context for a different emulated
534 * process then the one currently running.
535 *
536 * We must still call npxdna() since we may have
537 * saved FP state that the (emulated) virtual kernel
538 * needs to hand over to a different emulated process.
539 */
540 if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
541 (td->td_pcb->pcb_flags & FP_VIRTFP)
542 ) {
543 npxdna(frame);
544 break;
545 }
bab69519 546
da673940
JG
547 /*
548 * The kernel may have switched out the FP unit's
549 * state, causing the user process to take a fault
550 * when it tries to use the FP unit. Restore the
551 * state here
552 */
553 if (npxdna(frame))
554 goto out;
555 if (!pmath_emulate) {
556 i = SIGFPE;
557 ucode = FPE_FPU_NP_TRAP;
558 break;
559 }
560 i = (*pmath_emulate)(frame);
561 if (i == 0) {
562 if (!(frame->tf_rflags & PSL_T))
563 goto out2;
564 frame->tf_rflags &= ~PSL_T;
565 i = SIGTRAP;
566 }
567 /* else ucode = emulator_only_knows() XXX */
568 break;
569
570 case T_FPOPFLT: /* FPU operand fetch fault */
571 ucode = T_FPOPFLT;
572 i = SIGILL;
573 break;
574
575 case T_XMMFLT: /* SIMD floating-point exception */
576 ucode = 0; /* XXX */
577 i = SIGFPE;
578 break;
579 }
580
581 /*
582 * Virtual kernel intercept - if the fault is directly related to a
583 * VM context managed by a virtual kernel then let the virtual kernel
584 * handle it.
585 */
586 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
587 vkernel_trap(lp, frame);
588 goto out;
589 }
590
591 /*
592 * Translate fault for emulators (e.g. Linux)
593 */
594 if (*p->p_sysent->sv_transtrap)
595 i = (*p->p_sysent->sv_transtrap)(i, type);
596
597 MAKEMPSAFE(have_mplock);
598 trapsignal(lp, i, ucode);
599
600#ifdef DEBUG
601 if (type <= MAX_TRAP_MSG) {
602 uprintf("fatal process exception: %s",
603 trap_msg[type]);
604 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
605 uprintf(", fault VA = 0x%lx", (u_long)eva);
606 uprintf("\n");
607 }
608#endif
609
610out:
da673940
JG
611 userret(lp, frame, sticks);
612 userexit(lp);
613out2: ;
da673940
JG
614 if (have_mplock)
615 rel_mplock();
da673940
JG
616 KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
617#ifdef INVARIANTS
f9235b6d 618 KASSERT(crit_count == td->td_critcount,
3933a3ab 619 ("trap: critical section count mismatch! %d/%d",
f9235b6d 620 crit_count, td->td_pri));
3933a3ab
MD
621 KASSERT(curstop == td->td_toks_stop,
622 ("trap: extra tokens held after trap! %ld/%ld",
623 curstop - &td->td_toks_base,
624 td->td_toks_stop - &td->td_toks_base));
da673940
JG
625#endif
626}
627
628void
629kern_trap(struct trapframe *frame)
630{
631 struct globaldata *gd = mycpu;
632 struct thread *td = gd->gd_curthread;
633 struct lwp *lp;
634 struct proc *p;
635 int i = 0, ucode = 0, type, code;
da673940 636 int have_mplock = 0;
da673940 637#ifdef INVARIANTS
f9235b6d 638 int crit_count = td->td_critcount;
3933a3ab 639 lwkt_tokref_t curstop = td->td_toks_stop;
da673940
JG
640#endif
641 vm_offset_t eva;
642
643 lp = td->td_lwp;
644 p = td->td_proc;
645
646 if (frame->tf_trapno == T_PAGEFLT)
647 eva = frame->tf_addr;
648 else
649 eva = 0;
650
651#ifdef DDB
652 if (db_active) {
653 ++gd->gd_trap_nesting_level;
654 MAKEMPSAFE(have_mplock);
655 trap_fatal(frame, FALSE, eva);
656 --gd->gd_trap_nesting_level;
657 goto out2;
658 }
659#endif
660
da673940
JG
661 type = frame->tf_trapno;
662 code = frame->tf_err;
663
664#if 0
665kernel_trap:
666#endif
667 /* kernel trap */
668
669 switch (type) {
670 case T_PAGEFLT: /* page fault */
671 MAKEMPSAFE(have_mplock);
672 trap_pfault(frame, FALSE, eva);
673 goto out2;
674
675 case T_DNA:
676 /*
677 * The kernel may be using npx for copying or other
678 * purposes.
679 */
680 panic("kernel NPX should not happen");
681 if (npxdna(frame))
682 goto out2;
683 break;
684
685 case T_PROTFLT: /* general protection fault */
686 case T_SEGNPFLT: /* segment not present fault */
687 /*
688 * Invalid segment selectors and out of bounds
689 * %eip's and %esp's can be set up in user mode.
690 * This causes a fault in kernel mode when the
691 * kernel tries to return to user mode. We want
692 * to get this fault so that we can fix the
693 * problem here and not have to check all the
694 * selectors and pointers when the user changes
695 * them.
696 */
697 if (mycpu->gd_intr_nesting_level == 0) {
698 if (td->td_pcb->pcb_onfault) {
699 frame->tf_rip =
700 (register_t)td->td_pcb->pcb_onfault;
701 goto out2;
702 }
703 }
704 break;
705
706 case T_TSSFLT:
707 /*
708 * PSL_NT can be set in user mode and isn't cleared
709 * automatically when the kernel is entered. This
710 * causes a TSS fault when the kernel attempts to
711 * `iret' because the TSS link is uninitialized. We
712 * want to get this fault so that we can fix the
713 * problem here and not every time the kernel is
714 * entered.
715 */
716 if (frame->tf_rflags & PSL_NT) {
717 frame->tf_rflags &= ~PSL_NT;
718 goto out2;
719 }
720 break;
721
722 case T_TRCTRAP: /* trace trap */
723#if 0
724 if (frame->tf_eip == (int)IDTVEC(syscall)) {
725 /*
726 * We've just entered system mode via the
727 * syscall lcall. Continue single stepping
728 * silently until the syscall handler has
729 * saved the flags.
730 */
731 goto out2;
732 }
733 if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
734 /*
735 * The syscall handler has now saved the
736 * flags. Stop single stepping it.
737 */
738 frame->tf_eflags &= ~PSL_T;
739 goto out2;
740 }
741#endif
742#if 0
743 /*
744 * Ignore debug register trace traps due to
745 * accesses in the user's address space, which
746 * can happen under several conditions such as
747 * if a user sets a watchpoint on a buffer and
748 * then passes that buffer to a system call.
749 * We still want to get TRCTRAPS for addresses
750 * in kernel space because that is useful when
751 * debugging the kernel.
752 */
753 if (user_dbreg_trap()) {
754 /*
755 * Reset breakpoint bits because the
756 * processor doesn't
757 */
758 load_dr6(rdr6() & 0xfffffff0);
759 goto out2;
760 }
761#endif
762 /*
763 * Fall through (TRCTRAP kernel mode, kernel address)
764 */
765 case T_BPTFLT:
766 /*
767 * If DDB is enabled, let it handle the debugger trap.
768 * Otherwise, debugger traps "can't happen".
769 */
770#ifdef DDB
771 MAKEMPSAFE(have_mplock);
772 if (kdb_trap (type, 0, frame))
773 goto out2;
774#endif
775 break;
776 case T_DIVIDE:
777 MAKEMPSAFE(have_mplock);
778 trap_fatal(frame, FALSE, eva);
779 goto out2;
780 case T_NMI:
781 MAKEMPSAFE(have_mplock);
782 trap_fatal(frame, FALSE, eva);
783 goto out2;
b9d01986
MD
784 case T_SYSCALL80:
785 case T_FAST_SYSCALL:
da673940
JG
786 /*
787 * Ignore this trap generated from a spurious SIGTRAP.
788 *
789 * single stepping in / syscalls leads to spurious / SIGTRAP
790 * so ignore
791 *
792 * Haiku (c) 2007 Simon 'corecode' Schubert
793 */
794 goto out2;
795 }
796
797 /*
798 * Translate fault for emulators (e.g. Linux)
799 */
800 if (*p->p_sysent->sv_transtrap)
801 i = (*p->p_sysent->sv_transtrap)(i, type);
802
803 MAKEMPSAFE(have_mplock);
804 trapsignal(lp, i, ucode);
805
806#ifdef DEBUG
807 if (type <= MAX_TRAP_MSG) {
808 uprintf("fatal process exception: %s",
809 trap_msg[type]);
810 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
811 uprintf(", fault VA = 0x%lx", (u_long)eva);
812 uprintf("\n");
813 }
814#endif
815
816out2:
817 ;
da673940
JG
818 if (have_mplock)
819 rel_mplock();
da673940 820#ifdef INVARIANTS
f9235b6d 821 KASSERT(crit_count == td->td_critcount,
3933a3ab 822 ("trap: critical section count mismatch! %d/%d",
f9235b6d 823 crit_count, td->td_pri));
3933a3ab
MD
824 KASSERT(curstop == td->td_toks_stop,
825 ("trap: extra tokens held after trap! %ld/%ld",
826 curstop - &td->td_toks_base,
827 td->td_toks_stop - &td->td_toks_base));
da673940
JG
828#endif
829}
830
831int
832trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
833{
834 vm_offset_t va;
835 struct vmspace *vm = NULL;
836 vm_map_t map = 0;
837 int rv = 0;
838 vm_prot_t ftype;
839 thread_t td = curthread;
840 struct lwp *lp = td->td_lwp;
f8a5cc8f 841 int fault_flags;
da673940
JG
842
843 va = trunc_page(eva);
844 if (usermode == FALSE) {
845 /*
846 * This is a fault on kernel virtual memory.
847 */
848 map = &kernel_map;
849 } else {
850 /*
851 * This is a fault on non-kernel virtual memory.
852 * vm is initialized above to NULL. If curproc is NULL
853 * or curproc->p_vmspace is NULL the fault is fatal.
854 */
855 if (lp != NULL)
856 vm = lp->lwp_vmspace;
857
858 if (vm == NULL)
859 goto nogo;
860
861 map = &vm->vm_map;
862 }
863
864 if (frame->tf_err & PGEX_W)
865 ftype = VM_PROT_READ | VM_PROT_WRITE;
866 else
867 ftype = VM_PROT_READ;
868
869 if (map != &kernel_map) {
870 /*
871 * Keep swapout from messing with us during this
872 * critical time.
873 */
874 PHOLD(lp->lwp_proc);
875
876 /*
877 * Grow the stack if necessary
878 */
879 /* grow_stack returns false only if va falls into
880 * a growable stack region and the stack growth
881 * fails. It returns true if va was not within
882 * a growable stack region, or if the stack
883 * growth succeeded.
884 */
885 if (!grow_stack (lp->lwp_proc, va)) {
886 rv = KERN_FAILURE;
887 PRELE(lp->lwp_proc);
888 goto nogo;
889 }
890
f8a5cc8f
VS
891 fault_flags = 0;
892 if (usermode)
893 fault_flags |= VM_FAULT_BURST;
894 if (ftype & VM_PROT_WRITE)
895 fault_flags |= VM_FAULT_DIRTY;
896 else
897 fault_flags |= VM_FAULT_NORMAL;
f8a5cc8f 898 rv = vm_fault(map, va, ftype, fault_flags);
bab69519 899
da673940
JG
900 PRELE(lp->lwp_proc);
901 } else {
902 /*
903 * Don't have to worry about process locking or stacks in the kernel.
904 */
905 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
906 }
907
908 if (rv == KERN_SUCCESS)
909 return (0);
910nogo:
911 if (!usermode) {
912 if (td->td_gd->gd_intr_nesting_level == 0 &&
913 td->td_pcb->pcb_onfault) {
914 frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
915 return (0);
916 }
917 trap_fatal(frame, usermode, eva);
918 return (-1);
919 }
920
921 /*
a76ca9b9 922 * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
da673940
JG
923 * kludge is needed to pass the fault address to signal handlers.
924 */
925 struct proc *p = td->td_proc;
926 kprintf("seg-fault accessing address %p rip=%p pid=%d p_comm=%s\n",
927 (void *)va, (void *)frame->tf_rip, p->p_pid, p->p_comm);
928 /* Debugger("seg-fault"); */
929
930 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
931}
932
933static void
934trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva)
935{
936 int code, type, ss;
937 long rsp;
938
939 code = frame->tf_xflags;
940 type = frame->tf_trapno;
941
942 if (type <= MAX_TRAP_MSG) {
943 kprintf("\n\nFatal trap %d: %s while in %s mode\n",
944 type, trap_msg[type],
945 (usermode ? "user" : "kernel"));
946 }
da673940 947 /* two separate prints in case of a trap on an unmapped page */
da673940 948 kprintf("cpuid = %d\n", mycpu->gd_cpuid);
da673940 949 if (type == T_PAGEFLT) {
0e6594a8 950 kprintf("fault virtual address = %p\n", (void *)eva);
da673940
JG
951 kprintf("fault code = %s %s, %s\n",
952 usermode ? "user" : "supervisor",
953 code & PGEX_W ? "write" : "read",
954 code & PGEX_P ? "protection violation" : "page not present");
955 }
956 kprintf("instruction pointer = 0x%lx:0x%lx\n",
957 frame->tf_cs & 0xffff, frame->tf_rip);
958 if (usermode) {
959 ss = frame->tf_ss & 0xffff;
960 rsp = frame->tf_rsp;
961 } else {
962 ss = GSEL(GDATA_SEL, SEL_KPL);
963 rsp = (long)&frame->tf_rsp;
964 }
965 kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp);
966 kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp);
967 kprintf("processor eflags = ");
968 if (frame->tf_rflags & PSL_T)
969 kprintf("trace trap, ");
970 if (frame->tf_rflags & PSL_I)
971 kprintf("interrupt enabled, ");
972 if (frame->tf_rflags & PSL_NT)
973 kprintf("nested task, ");
974 if (frame->tf_rflags & PSL_RF)
975 kprintf("resume, ");
976#if 0
977 if (frame->tf_eflags & PSL_VM)
978 kprintf("vm86, ");
979#endif
0e6594a8 980 kprintf("IOPL = %jd\n", (intmax_t)((frame->tf_rflags & PSL_IOPL) >> 12));
da673940
JG
981 kprintf("current process = ");
982 if (curproc) {
983 kprintf("%lu (%s)\n",
984 (u_long)curproc->p_pid, curproc->p_comm ?
985 curproc->p_comm : "");
986 } else {
987 kprintf("Idle\n");
988 }
989 kprintf("current thread = pri %d ", curthread->td_pri);
f9235b6d 990 if (curthread->td_critcount)
da673940
JG
991 kprintf("(CRIT)");
992 kprintf("\n");
da673940
JG
993/**
994 * XXX FIXME:
995 * we probably SHOULD have stopped the other CPUs before now!
996 * another CPU COULD have been touching cpl at this moment...
997 */
998 kprintf(" <- SMP: XXX");
da673940
JG
999 kprintf("\n");
1000
1001#ifdef KDB
1002 if (kdb_trap(&psl))
1003 return;
1004#endif
1005#ifdef DDB
1006 if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
1007 return;
1008#endif
1009 kprintf("trap number = %d\n", type);
1010 if (type <= MAX_TRAP_MSG)
1011 panic("%s", trap_msg[type]);
1012 else
1013 panic("unknown/reserved trap");
1014}
1015
1016/*
1017 * Double fault handler. Called when a fault occurs while writing
1018 * a frame for a trap/exception onto the stack. This usually occurs
1019 * when the stack overflows (such is the case with infinite recursion,
1020 * for example).
1021 *
1022 * XXX Note that the current PTD gets replaced by IdlePTD when the
1023 * task switch occurs. This means that the stack that was active at
1024 * the time of the double fault is not available at <kstack> unless
1025 * the machine was idle when the double fault occurred. The downside
1026 * of this is that "trace <ebp>" in ddb won't work.
1027 */
1028void
1029dblfault_handler(void)
1030{
0e6594a8 1031#if JG
da673940 1032 struct mdglobaldata *gd = mdcpu;
0e6594a8 1033#endif
da673940
JG
1034
1035 kprintf("\nFatal double fault:\n");
1036#if JG
1037 kprintf("rip = 0x%lx\n", gd->gd_common_tss.tss_rip);
1038 kprintf("rsp = 0x%lx\n", gd->gd_common_tss.tss_rsp);
1039 kprintf("rbp = 0x%lx\n", gd->gd_common_tss.tss_rbp);
1040#endif
da673940 1041 /* two separate prints in case of a trap on an unmapped page */
da673940 1042 kprintf("cpuid = %d\n", mycpu->gd_cpuid);
da673940
JG
1043 panic("double fault");
1044}
1045
1046/*
1047 * Compensate for 386 brain damage (missing URKR).
1048 * This is a little simpler than the pagefault handler in trap() because
1049 * the page tables have already been faulted in and high addresses
1050 * are thrown out early for other reasons.
1051 */
1052int
1053trapwrite(unsigned addr)
1054{
1055 struct lwp *lp;
1056 vm_offset_t va;
1057 struct vmspace *vm;
1058 int rv;
1059
1060 va = trunc_page((vm_offset_t)addr);
1061 /*
1062 * XXX - MAX is END. Changed > to >= for temp. fix.
1063 */
1064 if (va >= VM_MAX_USER_ADDRESS)
1065 return (1);
1066
1067 lp = curthread->td_lwp;
1068 vm = lp->lwp_vmspace;
1069
1070 PHOLD(lp->lwp_proc);
1071
1072 if (!grow_stack (lp->lwp_proc, va)) {
1073 PRELE(lp->lwp_proc);
1074 return (1);
1075 }
1076
1077 /*
1078 * fault the data page
1079 */
1080 rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
1081
1082 PRELE(lp->lwp_proc);
1083
1084 if (rv != KERN_SUCCESS)
1085 return 1;
1086
1087 return (0);
1088}
1089
1090/*
1091 * syscall2 - MP aware system call request C handler
1092 *
1093 * A system call is essentially treated as a trap except that the
1094 * MP lock is not held on entry or return. We are responsible for
1095 * obtaining the MP lock if necessary and for handling ASTs
1096 * (e.g. a task switch) prior to return.
1097 *
1098 * In general, only simple access and manipulation of curproc and
1099 * the current stack is allowed without having to hold MP lock.
1100 *
1101 * MPSAFE - note that large sections of this routine are run without
1102 * the MP lock.
1103 */
1104void
1105syscall2(struct trapframe *frame)
1106{
1107 struct thread *td = curthread;
1108 struct proc *p = td->td_proc;
1109 struct lwp *lp = td->td_lwp;
1110 caddr_t params;
1111 struct sysent *callp;
1112 register_t orig_tf_rflags;
1113 int sticks;
1114 int error;
1115 int narg;
1116#ifdef INVARIANTS
f9235b6d 1117 int crit_count = td->td_critcount;
aba00258 1118 lwkt_tokref_t curstop = td->td_toks_stop;
da673940 1119#endif
da673940 1120 int have_mplock = 0;
da673940
JG
1121 register_t *argp;
1122 u_int code;
1123 int reg, regcnt;
1124 union sysunion args;
1125 register_t *argsdst;
1126
1127 mycpu->gd_cnt.v_syscall++;
1128
1129 KTR_LOG(kernentry_syscall, lp->lwp_proc->p_pid, lp->lwp_tid,
b19d1a6b 1130 frame->tf_rax);
da673940 1131
73e24181 1132 userenter(td, p); /* lazy raise our priority */
da673940
JG
1133
1134 reg = 0;
1135 regcnt = 6;
1136 /*
1137 * Misc
1138 */
1139 sticks = (int)td->td_sticks;
1140 orig_tf_rflags = frame->tf_rflags;
1141
1142 /*
1143 * Virtual kernel intercept - if a VM context managed by a virtual
1144 * kernel issues a system call the virtual kernel handles it, not us.
1145 * Restore the virtual kernel context and return from its system
1146 * call. The current frame is copied out to the virtual kernel.
1147 */
1148 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
0e6594a8 1149 vkernel_trap(lp, frame);
da673940
JG
1150 error = EJUSTRETURN;
1151 goto out;
1152 }
1153
1154 /*
1155 * Get the system call parameters and account for time
1156 */
1157 lp->lwp_md.md_regs = frame;
1158 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1159 code = frame->tf_rax;
1160
1161 if (p->p_sysent->sv_prepsyscall) {
1162 (*p->p_sysent->sv_prepsyscall)(
1163 frame, (int *)(&args.nosys.sysmsg + 1),
1164 &code, &params);
1165 } else {
1166 if (code == SYS_syscall || code == SYS___syscall) {
1167 code = frame->tf_rdi;
1168 reg++;
1169 regcnt--;
1170 }
1171 }
1172
1173 if (p->p_sysent->sv_mask)
1174 code &= p->p_sysent->sv_mask;
1175
1176 if (code >= p->p_sysent->sv_size)
1177 callp = &p->p_sysent->sv_table[0];
1178 else
1179 callp = &p->p_sysent->sv_table[code];
1180
1181 narg = callp->sy_narg & SYF_ARGMASK;
1182
1183 /*
a76ca9b9 1184 * On x86_64 we get up to six arguments in registers. The rest are
da673940
JG
1185 * on the stack. The first six members of 'struct trapframe' happen
1186 * to be the registers used to pass arguments, in exactly the right
1187 * order.
1188 */
1189 argp = &frame->tf_rdi;
1190 argp += reg;
1191 argsdst = (register_t *)(&args.nosys.sysmsg + 1);
1192 /*
1193 * JG can we overflow the space pointed to by 'argsdst'
1194 * either with 'bcopy' or with 'copyin'?
1195 */
1196 bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1197 /*
1198 * copyin is MP aware, but the tracing code is not
1199 */
1200 if (narg > regcnt) {
1201 KASSERT(params != NULL, ("copyin args with no params!"));
1202 error = copyin(params, &argsdst[regcnt],
1203 (narg - regcnt) * sizeof(register_t));
1204 if (error) {
1205#ifdef KTRACE
1206 if (KTRPOINT(td, KTR_SYSCALL)) {
1207 MAKEMPSAFE(have_mplock);
1208
1209 ktrsyscall(lp, code, narg,
1210 (void *)(&args.nosys.sysmsg + 1));
1211 }
1212#endif
1213 goto bad;
1214 }
1215 }
1216
1217#ifdef KTRACE
1218 if (KTRPOINT(td, KTR_SYSCALL)) {
1219 MAKEMPSAFE(have_mplock);
1220 ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1));
1221 }
1222#endif
1223
1224 /*
1225 * Default return value is 0 (will be copied to %rax). Double-value
1226 * returns use %rax and %rdx. %rdx is left unchanged for system
1227 * calls which return only one result.
1228 */
1229 args.sysmsg_fds[0] = 0;
1230 args.sysmsg_fds[1] = frame->tf_rdx;
1231
1232 /*
1233 * The syscall might manipulate the trap frame. If it does it
1234 * will probably return EJUSTRETURN.
1235 */
1236 args.sysmsg_frame = frame;
1237
1238 STOPEVENT(p, S_SCE, narg); /* MP aware */
1239
da673940 1240 /*
0e6594a8
SW
1241 * NOTE: All system calls run MPSAFE now. The system call itself
1242 * is responsible for getting the MP lock.
da673940 1243 */
da673940
JG
1244 error = (*callp->sy_call)(&args);
1245
1246#if 0
1247 kprintf("system call %d returned %d\n", code, error);
1248#endif
1249
1250out:
1251 /*
1252 * MP SAFE (we may or may not have the MP lock at this point)
1253 */
1254 switch (error) {
1255 case 0:
1256 /*
1257 * Reinitialize proc pointer `p' as it may be different
1258 * if this is a child returning from fork syscall.
1259 */
1260 p = curproc;
1261 lp = curthread->td_lwp;
1262 frame->tf_rax = args.sysmsg_fds[0];
1263 frame->tf_rdx = args.sysmsg_fds[1];
1264 frame->tf_rflags &= ~PSL_C;
1265 break;
1266 case ERESTART:
1267 /*
1268 * Reconstruct pc, we know that 'syscall' is 2 bytes.
1269 * We have to do a full context restore so that %r10
1270 * (which was holding the value of %rcx) is restored for
1271 * the next iteration.
1272 */
1273 frame->tf_rip -= frame->tf_err;
1274 frame->tf_r10 = frame->tf_rcx;
1275 break;
1276 case EJUSTRETURN:
1277 break;
1278 case EASYNC:
1279 panic("Unexpected EASYNC return value (for now)");
1280 default:
1281bad:
1282 if (p->p_sysent->sv_errsize) {
1283 if (error >= p->p_sysent->sv_errsize)
1284 error = -1; /* XXX */
1285 else
1286 error = p->p_sysent->sv_errtbl[error];
1287 }
1288 frame->tf_rax = error;
1289 frame->tf_rflags |= PSL_C;
1290 break;
1291 }
1292
1293 /*
1294 * Traced syscall. trapsignal() is not MP aware.
1295 */
1296 if (orig_tf_rflags & PSL_T) {
1297 MAKEMPSAFE(have_mplock);
1298 frame->tf_rflags &= ~PSL_T;
1299 trapsignal(lp, SIGTRAP, 0);
1300 }
1301
1302 /*
1303 * Handle reschedule and other end-of-syscall issues
1304 */
1305 userret(lp, frame, sticks);
1306
1307#ifdef KTRACE
1308 if (KTRPOINT(td, KTR_SYSRET)) {
1309 MAKEMPSAFE(have_mplock);
1310 ktrsysret(lp, code, error, args.sysmsg_result);
1311 }
1312#endif
1313
1314 /*
1315 * This works because errno is findable through the
1316 * register set. If we ever support an emulation where this
1317 * is not the case, this code will need to be revisited.
1318 */
1319 STOPEVENT(p, S_SCX, code);
1320
1321 userexit(lp);
da673940
JG
1322 /*
1323 * Release the MP lock if we had to get it
1324 */
da673940
JG
1325 if (have_mplock)
1326 rel_mplock();
da673940
JG
1327 KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error);
1328#ifdef INVARIANTS
3933a3ab 1329 KASSERT(&td->td_toks_base == td->td_toks_stop,
da673940 1330 ("syscall: critical section count mismatch! %d/%d",
f9235b6d 1331 crit_count, td->td_pri));
3933a3ab
MD
1332 KASSERT(curstop == td->td_toks_stop,
1333 ("syscall: extra tokens held after trap! %ld",
1334 td->td_toks_stop - &td->td_toks_base));
da673940
JG
1335#endif
1336}
1337
2b0bd8aa
MD
1338/*
1339 * NOTE: mplock not held at any point
1340 */
da673940
JG
1341void
1342fork_return(struct lwp *lp, struct trapframe *frame)
1343{
1344 frame->tf_rax = 0; /* Child returns zero */
1345 frame->tf_rflags &= ~PSL_C; /* success */
1346 frame->tf_rdx = 1;
1347
1348 generic_lwp_return(lp, frame);
1349 KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
1350}
1351
1352/*
1353 * Simplified back end of syscall(), used when returning from fork()
2b0bd8aa
MD
1354 * directly into user mode.
1355 *
1356 * This code will return back into the fork trampoline code which then
1357 * runs doreti.
1358 *
1359 * NOTE: The mplock is not held at any point.
da673940
JG
1360 */
1361void
1362generic_lwp_return(struct lwp *lp, struct trapframe *frame)
1363{
1364 struct proc *p = lp->lwp_proc;
1365
1366 /*
1367 * Newly forked processes are given a kernel priority. We have to
1368 * adjust the priority to a normal user priority and fake entry
1369 * into the kernel (call userenter()) to install a passive release
1370 * function just in case userret() decides to stop the process. This
1371 * can occur when ^Z races a fork. If we do not install the passive
1372 * release function the current process designation will not be
1373 * released when the thread goes to sleep.
1374 */
1375 lwkt_setpri_self(TDPRI_USER_NORM);
73e24181 1376 userenter(lp->lwp_thread, p);
da673940
JG
1377 userret(lp, frame, 0);
1378#ifdef KTRACE
1379 if (KTRPOINT(lp->lwp_thread, KTR_SYSRET))
1380 ktrsysret(lp, SYS_fork, 0, 0);
1381#endif
4643740a 1382 lp->lwp_flags |= LWP_PASSIVE_ACQ;
da673940 1383 userexit(lp);
4643740a 1384 lp->lwp_flags &= ~LWP_PASSIVE_ACQ;
da673940
JG
1385}
1386
1387/*
1388 * doreti has turned into this. The frame is directly on the stack. We
1389 * pull everything else we need (fpu and tls context) from the current
1390 * thread.
1391 *
1392 * Note on fpu interactions: In a virtual kernel, the fpu context for
1393 * an emulated user mode process is not shared with the virtual kernel's
1394 * fpu context, so we only have to 'stack' fpu contexts within the virtual
1395 * kernel itself, and not even then since the signal() contexts that we care
1396 * about save and restore the FPU state (I think anyhow).
1397 *
1398 * vmspace_ctl() returns an error only if it had problems instaling the
1399 * context we supplied or problems copying data to/from our VM space.
1400 */
1401void
1402go_user(struct intrframe *frame)
1403{
1404 struct trapframe *tf = (void *)&frame->if_rdi;
1405 int r;
1406
1407 /*
1408 * Interrupts may be disabled on entry, make sure all signals
1409 * can be received before beginning our loop.
1410 */
1411 sigsetmask(0);
1412
1413 /*
1414 * Switch to the current simulated user process, then call
1415 * user_trap() when we break out of it (usually due to a signal).
1416 */
1417 for (;;) {
1418 /*
1419 * Tell the real kernel whether it is ok to use the FP
1420 * unit or not.
1421 */
1422 if (mdcpu->gd_npxthread == curthread) {
1423 tf->tf_xflags &= ~PGEX_FPFAULT;
1424 } else {
1425 tf->tf_xflags |= PGEX_FPFAULT;
1426 }
1427
1428 /*
1429 * Run emulated user process context. This call interlocks
1430 * with new mailbox signals.
1431 *
1432 * Set PGEX_U unconditionally, indicating a user frame (the
1433 * bit is normally set only by T_PAGEFLT).
1434 */
1435 r = vmspace_ctl(&curproc->p_vmspace->vm_pmap, VMSPACE_CTL_RUN,
1436 tf, &curthread->td_savevext);
1437 frame->if_xflags |= PGEX_U;
1438#if 0
1439 kprintf("GO USER %d trap %ld EVA %08lx RIP %08lx RSP %08lx XFLAGS %02lx/%02lx\n",
1440 r, tf->tf_trapno, tf->tf_addr, tf->tf_rip, tf->tf_rsp,
1441 tf->tf_xflags, frame->if_xflags);
1442#endif
1443 if (r < 0) {
1444 if (errno != EINTR)
0e6594a8 1445 panic("vmspace_ctl failed error %d", errno);
da673940
JG
1446 } else {
1447 if (tf->tf_trapno) {
1448 user_trap(tf);
1449 }
1450 }
1451 if (mycpu->gd_reqflags & RQF_AST_MASK) {
1452 tf->tf_trapno = T_ASTFLT;
1453 user_trap(tf);
1454 }
1455 tf->tf_trapno = 0;
1456 }
1457}
1458
1459/*
1460 * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1461 * fault (which is then passed back to the virtual kernel) if an attempt is
1462 * made to use the FP unit.
1463 *
1464 * XXX this is a fairly big hack.
1465 */
1466void
1467set_vkernel_fp(struct trapframe *frame)
1468{
1469 struct thread *td = curthread;
1470
1471 if (frame->tf_xflags & PGEX_FPFAULT) {
1472 td->td_pcb->pcb_flags |= FP_VIRTFP;
1473 if (mdcpu->gd_npxthread == td)
1474 npxexit();
1475 } else {
1476 td->td_pcb->pcb_flags &= ~FP_VIRTFP;
1477 }
1478}
0e6594a8
SW
1479
1480/*
1481 * Called from vkernel_trap() to fixup the vkernel's syscall
1482 * frame for vmspace_ctl() return.
1483 */
1484void
1485cpu_vkernel_trap(struct trapframe *frame, int error)
1486{
1487 frame->tf_rax = error;
1488 if (error)
1489 frame->tf_rflags |= PSL_C;
1490 else
1491 frame->tf_rflags &= ~PSL_C;
1492}