gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (C) 1994, David Greenman
	3	* Copyright (c) 1990, 1993
	4	* The Regents of the University of California. All rights reserved.
	5	*
	6	* This code is derived from software contributed to Berkeley by
	7	* the University of Utah, and William Jolitz.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	* 1. Redistributions of source code must retain the above copyright
	13	* notice, this list of conditions and the following disclaimer.
	14	* 2. Redistributions in binary form must reproduce the above copyright
	15	* notice, this list of conditions and the following disclaimer in the
	16	* documentation and/or other materials provided with the distribution.
	17	* 3. All advertising materials mentioning features or use of this software
	18	* must display the following acknowledgement:
	19	* This product includes software developed by the University of
	20	* California, Berkeley and its contributors.
	21	* 4. Neither the name of the University nor the names of its contributors
	22	* may be used to endorse or promote products derived from this software
	23	* without specific prior written permission.
	24	*
	25	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	26	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	27	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	28	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	29	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	30	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	31	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	32	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	33	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	34	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	35	* SUCH DAMAGE.
	36	*
	37	* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
	38	* $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
	39	*/
	40
	41	/*
	42	* 386 Trap and System call handling
	43	*/
	44
	45	#include "use_isa.h"
	46	#include "use_npx.h"
	47
	48	#include "opt_cpu.h"
	49	#include "opt_ddb.h"
	50	#include "opt_ktrace.h"
	51	#include "opt_clock.h"
	52	#include "opt_trap.h"
	53
	54	#include <sys/param.h>
	55	#include <sys/systm.h>
	56	#include <sys/proc.h>
	57	#include <sys/pioctl.h>
	58	#include <sys/kernel.h>
	59	#include <sys/kerneldump.h>
	60	#include <sys/resourcevar.h>
	61	#include <sys/signalvar.h>
	62	#include <sys/signal2.h>
	63	#include <sys/syscall.h>
	64	#include <sys/sysctl.h>
	65	#include <sys/sysent.h>
	66	#include <sys/uio.h>
	67	#include <sys/vmmeter.h>
	68	#include <sys/malloc.h>
	69	#ifdef KTRACE
	70	#include <sys/ktrace.h>
	71	#endif
	72	#include <sys/ktr.h>
	73	#include <sys/upcall.h>
	74	#include <sys/vkernel.h>
	75	#include <sys/sysproto.h>
	76	#include <sys/sysunion.h>
	77
	78	#include <vm/vm.h>
	79	#include <vm/vm_param.h>
	80	#include <sys/lock.h>
	81	#include <vm/pmap.h>
	82	#include <vm/vm_kern.h>
	83	#include <vm/vm_map.h>
	84	#include <vm/vm_page.h>
	85	#include <vm/vm_extern.h>
	86
	87	#include <machine/cpu.h>
	88	#include <machine/md_var.h>
	89	#include <machine/pcb.h>
	90	#include <machine/smp.h>
	91	#include <machine/tss.h>
	92	#include <machine/specialreg.h>
	93	#include <machine/globaldata.h>
	94	#include <machine/intr_machdep.h>
	95
	96	#include <machine_base/isa/isa_intr.h>
	97	#include <machine_base/apic/lapic.h>
	98
	99	#ifdef POWERFAIL_NMI
	100	#include <sys/syslog.h>
	101	#include <machine/clock.h>
	102	#endif
	103
	104	#include <machine/vm86.h>
	105
	106	#include <ddb/ddb.h>
	107
	108	#include <sys/msgport2.h>
	109	#include <sys/thread2.h>
	110	#include <sys/mplock2.h>
	111
	112	#define MAKEMPSAFE(have_mplock) \
	113	if (have_mplock == 0) { \
	114	get_mplock(); \
	115	have_mplock = 1; \
	116	}
	117
	118	int (pmath_emulate) (struct trapframe );
	119
	120	extern void trap (struct trapframe *frame);
	121	extern void syscall2 (struct trapframe *frame);
	122
	123	static int trap_pfault (struct trapframe *, int, vm_offset_t);
	124	static void trap_fatal (struct trapframe *, vm_offset_t);
	125	void dblfault_handler (void);
	126
	127	extern inthand_t IDTVEC(syscall);
	128
	129	#define MAX_TRAP_MSG 28
	130	static char *trap_msg[] = {
	131	"", /* 0 unused */
	132	"privileged instruction fault", /* 1 T_PRIVINFLT */
	133	"", /* 2 unused */
	134	"breakpoint instruction fault", /* 3 T_BPTFLT */
	135	"", /* 4 unused */
	136	"", /* 5 unused */
	137	"arithmetic trap", /* 6 T_ARITHTRAP */
	138	"system forced exception", /* 7 T_ASTFLT */
	139	"", /* 8 unused */
	140	"general protection fault", /* 9 T_PROTFLT */
	141	"trace trap", /* 10 T_TRCTRAP */
	142	"", /* 11 unused */
	143	"page fault", /* 12 T_PAGEFLT */
	144	"", /* 13 unused */
	145	"alignment fault", /* 14 T_ALIGNFLT */
	146	"", /* 15 unused */
	147	"", /* 16 unused */
	148	"", /* 17 unused */
	149	"integer divide fault", /* 18 T_DIVIDE */
	150	"non-maskable interrupt trap", /* 19 T_NMI */
	151	"overflow trap", /* 20 T_OFLOW */
	152	"FPU bounds check fault", /* 21 T_BOUND */
	153	"FPU device not available", /* 22 T_DNA */
	154	"double fault", /* 23 T_DOUBLEFLT */
	155	"FPU operand fetch fault", /* 24 T_FPOPFLT */
	156	"invalid TSS fault", /* 25 T_TSSFLT */
	157	"segment not present fault", /* 26 T_SEGNPFLT */
	158	"stack fault", /* 27 T_STKFLT */
	159	"machine check trap", /* 28 T_MCHK */
	160	};
	161
	162	#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	163	extern int has_f00f_bug;
	164	#endif
	165
	166	#ifdef DDB
	167	static int ddb_on_nmi = 1;
	168	SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	169	&ddb_on_nmi, 0, "Go to DDB on NMI");
	170	#endif
	171	static int panic_on_nmi = 1;
	172	SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	173	&panic_on_nmi, 0, "Panic on NMI");
	174	static int fast_release;
	175	SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW,
	176	&fast_release, 0, "Passive Release was optimal");
	177	static int slow_release;
	178	SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
	179	&slow_release, 0, "Passive Release was nonoptimal");
	180
	181	MALLOC_DEFINE(M_SYSMSG, "sysmsg", "sysmsg structure");
	182	extern int max_sysmsg;
	183
	184	/*
	185	* Passively intercepts the thread switch function to increase the thread
	186	* priority from a user priority to a kernel priority, reducing
	187	* syscall and trap overhead for the case where no switch occurs.
	188	*
	189	* Synchronizes td_ucred with p_ucred. This is used by system calls,
	190	* signal handling, faults, AST traps, and anything else that enters the
	191	* kernel from userland and provides the kernel with a stable read-only
	192	* copy of the process ucred.
	193	*/
	194	static __inline void
	195	userenter(struct thread curtd, struct proc curp)
	196	{
	197	struct ucred *ocred;
	198	struct ucred *ncred;
	199
	200	curtd->td_release = lwkt_passive_release;
	201
	202	if (curtd->td_ucred != curp->p_ucred) {
	203	ncred = crhold(curp->p_ucred);
	204	ocred = curtd->td_ucred;
	205	curtd->td_ucred = ncred;
	206	if (ocred)
	207	crfree(ocred);
	208	}
	209
	210	}
	211
	212	/*
	213	* Handle signals, upcalls, profiling, and other AST's and/or tasks that
	214	* must be completed before we can return to or try to return to userland.
	215	*
	216	* Note that td_sticks is a 64 bit quantity, but there's no point doing 64
	217	* arithmatic on the delta calculation so the absolute tick values are
	218	* truncated to an integer.
	219	*/
	220	static void
	221	userret(struct lwp lp, struct trapframe frame, int sticks)
	222	{
	223	struct proc *p = lp->lwp_proc;
	224	void (*hook)(void);
	225	int sig;
	226
	227	if (p->p_userret != NULL) {
	228	hook = p->p_userret;
	229	p->p_userret = NULL;
	230	(*hook)();
	231	}
	232
	233	/*
	234	* Charge system time if profiling. Note: times are in microseconds.
	235	* This may do a copyout and block, so do it first even though it
	236	* means some system time will be charged as user time.
	237	*/
	238	if (p->p_flags & P_PROFIL) {
	239	addupc_task(p, frame->tf_eip,
	240	(u_int)((int)lp->lwp_thread->td_sticks - sticks));
	241	}
	242
	243	recheck:
	244	/*
	245	* If the jungle wants us dead, so be it.
	246	*/
	247	if (lp->lwp_mpflags & LWP_MP_WEXIT) {
	248	lwkt_gettoken(&p->p_token);
	249	lwp_exit(0);
	250	lwkt_reltoken(&p->p_token); /* NOT REACHED */
	251	}
	252
	253	/*
	254	* Block here if we are in a stopped state.
	255	*/
	256	if (p->p_stat == SSTOP \|\| dump_stop_usertds) {
	257	lwkt_gettoken(&p->p_token);
	258	tstop();
	259	lwkt_reltoken(&p->p_token);
	260	goto recheck;
	261	}
	262
	263	/*
	264	* Post any pending upcalls. If running a virtual kernel be sure
	265	* to restore the virtual kernel's vmspace before posting the upcall.
	266	*/
	267	if (p->p_flags & (P_SIGVTALRM \| P_SIGPROF \| P_UPCALLPEND)) {
	268	lwkt_gettoken(&p->p_token);
	269	if (p->p_flags & P_SIGVTALRM) {
	270	p->p_flags &= ~P_SIGVTALRM;
	271	ksignal(p, SIGVTALRM);
	272	}
	273	if (p->p_flags & P_SIGPROF) {
	274	p->p_flags &= ~P_SIGPROF;
	275	ksignal(p, SIGPROF);
	276	}
	277	if (p->p_flags & P_UPCALLPEND) {
	278	p->p_flags &= ~P_UPCALLPEND;
	279	postupcall(lp);
	280	}
	281	lwkt_reltoken(&p->p_token);
	282	goto recheck;
	283	}
	284
	285	/*
	286	* Post any pending signals. If running a virtual kernel be sure
	287	* to restore the virtual kernel's vmspace before posting the signal.
	288	*
	289	* WARNING! postsig() can exit and not return.
	290	*/
	291	if ((sig = CURSIG_TRACE(lp)) != 0) {
	292	lwkt_gettoken(&p->p_token);
	293	postsig(sig);
	294	lwkt_reltoken(&p->p_token);
	295	goto recheck;
	296	}
	297
	298	/*
	299	* block here if we are swapped out, but still process signals
	300	* (such as SIGKILL). proc0 (the swapin scheduler) is already
	301	* aware of our situation, we do not have to wake it up.
	302	*/
	303	if (p->p_flags & P_SWAPPEDOUT) {
	304	lwkt_gettoken(&p->p_token);
	305	get_mplock();
	306	p->p_flags \|= P_SWAPWAIT;
	307	swapin_request();
	308	if (p->p_flags & P_SWAPWAIT)
	309	tsleep(p, PCATCH, "SWOUT", 0);
	310	p->p_flags &= ~P_SWAPWAIT;
	311	rel_mplock();
	312	lwkt_reltoken(&p->p_token);
	313	goto recheck;
	314	}
	315
	316	/*
	317	* In a multi-threaded program it is possible for a thread to change
	318	* signal state during a system call which temporarily changes the
	319	* signal mask. In this case postsig() might not be run and we
	320	* have to restore the mask ourselves.
	321	*/
	322	if (lp->lwp_flags & LWP_OLDMASK) {
	323	lp->lwp_flags &= ~LWP_OLDMASK;
	324	lp->lwp_sigmask = lp->lwp_oldsigmask;
	325	goto recheck;
	326	}
	327	}
	328
	329	/*
	330	* Cleanup from userenter and any passive release that might have occured.
	331	* We must reclaim the current-process designation before we can return
	332	* to usermode. We also handle both LWKT and USER reschedule requests.
	333	*/
	334	static __inline void
	335	userexit(struct lwp *lp)
	336	{
	337	struct thread *td = lp->lwp_thread;
	338	/* globaldata_t gd = td->td_gd; */
	339
	340	/*
	341	* Handle stop requests at kernel priority. Any requests queued
	342	* after this loop will generate another AST.
	343	*/
	344	while (lp->lwp_proc->p_stat == SSTOP) {
	345	lwkt_gettoken(&lp->lwp_proc->p_token);
	346	tstop();
	347	lwkt_reltoken(&lp->lwp_proc->p_token);
	348	}
	349
	350	/*
	351	* Become the current user scheduled process if we aren't already,
	352	* and deal with reschedule requests and other factors.
	353	*/
	354	lp->lwp_proc->p_usched->acquire_curproc(lp);
	355	/* WARNING: we may have migrated cpu's */
	356	/* gd = td->td_gd; */
	357
	358	/*
	359	* Reduce our priority in preparation for a return to userland. If
	360	* our passive release function was still in place, our priority was
	361	* never raised and does not need to be reduced.
	362	*/
	363	lwkt_passive_recover(td);
	364	}
	365
	366	#if !defined(KTR_KERNENTRY)
	367	#define KTR_KERNENTRY KTR_ALL
	368	#endif
	369	KTR_INFO_MASTER(kernentry);
	370	KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
	371	"TRAP(pid %d, tid %d, trapno %d, eva %lu)",
	372	pid_t pid, lwpid_t tid, register_t trapno, vm_offset_t eva);
	373	KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %d, tid %d)",
	374	pid_t pid, lwpid_t tid);
	375	KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %d, tid %d, nr %d)",
	376	pid_t pid, lwpid_t tid, register_t trapno);
	377	KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %d, tid %d, err %d)",
	378	pid_t pid, lwpid_t tid, int err);
	379	KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %d, tid %d)",
	380	pid_t pid, lwpid_t tid);
	381
	382	/*
	383	* Exception, fault, and trap interface to the kernel.
	384	* This common code is called from assembly language IDT gate entry
	385	* routines that prepare a suitable stack frame, and restore this
	386	* frame after the exception has been processed.
	387	*
	388	* This function is also called from doreti in an interlock to handle ASTs.
	389	* For example: hardwareint->INTROUTINE->(set ast)->doreti->trap
	390	*
	391	* NOTE! We have to retrieve the fault address prior to obtaining the
	392	* MP lock because get_mplock() may switch out. YYY cr2 really ought
	393	* to be retrieved by the assembly code, not here.
	394	*
	395	* XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
	396	* if an attempt is made to switch from a fast interrupt or IPI. This is
	397	* necessary to properly take fatal kernel traps on SMP machines if
	398	* get_mplock() has to block.
	399	*/
	400
	401	void
	402	trap(struct trapframe *frame)
	403	{
	404	struct globaldata *gd = mycpu;
	405	struct thread *td = gd->gd_curthread;
	406	struct lwp *lp = td->td_lwp;
	407	struct proc *p;
	408	int sticks = 0;
	409	int i = 0, ucode = 0, type, code;
	410	int have_mplock = 0;
	411	#ifdef INVARIANTS
	412	int crit_count = td->td_critcount;
	413	lwkt_tokref_t curstop = td->td_toks_stop;
	414	#endif
	415	vm_offset_t eva;
	416
	417	p = td->td_proc;
	418	#ifdef DDB
	419	/*
	420	* We need to allow T_DNA faults when the debugger is active since
	421	* some dumping paths do large bcopy() which use the floating
	422	* point registers for faster copying.
	423	*/
	424	if (db_active && frame->tf_trapno != T_DNA) {
	425	eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
	426	++gd->gd_trap_nesting_level;
	427	MAKEMPSAFE(have_mplock);
	428	trap_fatal(frame, eva);
	429	--gd->gd_trap_nesting_level;
	430	goto out2;
	431	}
	432	#endif
	433
	434	eva = 0;
	435	++gd->gd_trap_nesting_level;
	436	if (frame->tf_trapno == T_PAGEFLT) {
	437	/*
	438	* For some Cyrix CPUs, %cr2 is clobbered by interrupts.
	439	* This problem is worked around by using an interrupt
	440	* gate for the pagefault handler. We are finally ready
	441	* to read %cr2 and then must reenable interrupts.
	442	*
	443	* XXX this should be in the switch statement, but the
	444	* NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the
	445	* flow of control too much for this to be obviously
	446	* correct.
	447	*/
	448	eva = rcr2();
	449	cpu_enable_intr();
	450	}
	451
	452	--gd->gd_trap_nesting_level;
	453
	454	if (!(frame->tf_eflags & PSL_I)) {
	455	/*
	456	* Buggy application or kernel code has disabled interrupts
	457	* and then trapped. Enabling interrupts now is wrong, but
	458	* it is better than running with interrupts disabled until
	459	* they are accidentally enabled later.
	460	*/
	461	type = frame->tf_trapno;
	462	if (ISPL(frame->tf_cs)==SEL_UPL \|\| (frame->tf_eflags & PSL_VM)) {
	463	MAKEMPSAFE(have_mplock);
	464	kprintf(
	465	"pid %ld (%s): trap %d with interrupts disabled\n",
	466	(long)curproc->p_pid, curproc->p_comm, type);
	467	} else if (type != T_BPTFLT && type != T_TRCTRAP) {
	468	/*
	469	* XXX not quite right, since this may be for a
	470	* multiple fault in user mode.
	471	*/
	472	MAKEMPSAFE(have_mplock);
	473	kprintf("kernel trap %d with interrupts disabled\n",
	474	type);
	475	}
	476	cpu_enable_intr();
	477	}
	478
	479	#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	480	restart:
	481	#endif
	482	type = frame->tf_trapno;
	483	code = frame->tf_err;
	484
	485	if (in_vm86call) {
	486	if (frame->tf_eflags & PSL_VM &&
	487	(type == T_PROTFLT \|\| type == T_STKFLT)) {
	488	KKASSERT(get_mplock_count(curthread) > 0);
	489	i = vm86_emulate((struct vm86frame *)frame);
	490	KKASSERT(get_mplock_count(curthread) > 0);
	491	if (i != 0) {
	492	/*
	493	* returns to original process
	494	*/
	495	vm86_trap((struct vm86frame *)frame,
	496	have_mplock);
	497	KKASSERT(0); /* NOT REACHED */
	498	}
	499	goto out2;
	500	}
	501	switch (type) {
	502	/*
	503	* these traps want either a process context, or
	504	* assume a normal userspace trap.
	505	*/
	506	case T_PROTFLT:
	507	case T_SEGNPFLT:
	508	trap_fatal(frame, eva);
	509	goto out2;
	510	case T_TRCTRAP:
	511	type = T_BPTFLT; /* kernel breakpoint */
	512	/* FALL THROUGH */
	513	}
	514	goto kernel_trap; /* normal kernel trap handling */
	515	}
	516
	517	if ((ISPL(frame->tf_cs) == SEL_UPL) \|\| (frame->tf_eflags & PSL_VM)) {
	518	/* user trap */
	519
	520	KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
	521	frame->tf_trapno, eva);
	522
	523	userenter(td, p);
	524
	525	sticks = (int)td->td_sticks;
	526	lp->lwp_md.md_regs = frame;
	527
	528	switch (type) {
	529	case T_PRIVINFLT: /* privileged instruction fault */
	530	i = SIGILL;
	531	ucode = ILL_PRVOPC;
	532	break;
	533
	534	case T_BPTFLT: /* bpt instruction fault */
	535	case T_TRCTRAP: /* trace trap */
	536	frame->tf_eflags &= ~PSL_T;
	537	i = SIGTRAP;
	538	ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
	539	break;
	540
	541	case T_ARITHTRAP: /* arithmetic trap */
	542	ucode = code;
	543	i = SIGFPE;
	544	break;
	545
	546	case T_ASTFLT: /* Allow process switch */
	547	mycpu->gd_cnt.v_soft++;
	548	if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
	549	atomic_clear_int(&mycpu->gd_reqflags,
	550	RQF_AST_OWEUPC);
	551	addupc_task(p, p->p_prof.pr_addr,
	552	p->p_prof.pr_ticks);
	553	}
	554	goto out;
	555
	556	/*
	557	* The following two traps can happen in
	558	* vm86 mode, and, if so, we want to handle
	559	* them specially.
	560	*/
	561	case T_PROTFLT: /* general protection fault */
	562	case T_STKFLT: /* stack fault */
	563	if (frame->tf_eflags & PSL_VM) {
	564	i = vm86_emulate((struct vm86frame *)frame);
	565	if (i == 0)
	566	goto out;
	567	break;
	568	}
	569	i = SIGBUS;
	570	ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
	571	break;
	572	case T_SEGNPFLT: /* segment not present fault */
	573	i = SIGBUS;
	574	ucode = BUS_ADRERR;
	575	break;
	576	case T_TSSFLT: /* invalid TSS fault */
	577	case T_DOUBLEFLT: /* double fault */
	578	default:
	579	i = SIGBUS;
	580	ucode = BUS_OBJERR;
	581	break;
	582
	583	case T_PAGEFLT: /* page fault */
	584	i = trap_pfault(frame, TRUE, eva);
	585	if (i == -1)
	586	goto out;
	587	#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	588	if (i == -2)
	589	goto restart;
	590	#endif
	591	if (i == 0)
	592	goto out;
	593
	594	if (i == SIGSEGV)
	595	ucode = SEGV_MAPERR;
	596	else {
	597	i = SIGSEGV;
	598	ucode = SEGV_ACCERR;
	599	}
	600	break;
	601
	602	case T_DIVIDE: /* integer divide fault */
	603	ucode = FPE_INTDIV;
	604	i = SIGFPE;
	605	break;
	606
	607	#if NISA > 0
	608	case T_NMI:
	609	MAKEMPSAFE(have_mplock);
	610	#ifdef POWERFAIL_NMI
	611	goto handle_powerfail;
	612	#else /* !POWERFAIL_NMI */
	613	/* machine/parity/power fail/"kitchen sink" faults */
	614	if (isa_nmi(code) == 0) {
	615	#ifdef DDB
	616	/*
	617	* NMI can be hooked up to a pushbutton
	618	* for debugging.
	619	*/
	620	if (ddb_on_nmi) {
	621	kprintf ("NMI ... going to debugger\n");
	622	kdb_trap (type, 0, frame);
	623	}
	624	#endif /* DDB */
	625	goto out2;
	626	} else if (panic_on_nmi)
	627	panic("NMI indicates hardware failure");
	628	break;
	629	#endif /* POWERFAIL_NMI */
	630	#endif /* NISA > 0 */
	631
	632	case T_OFLOW: /* integer overflow fault */
	633	ucode = FPE_INTOVF;
	634	i = SIGFPE;
	635	break;
	636
	637	case T_BOUND: /* bounds check fault */
	638	ucode = FPE_FLTSUB;
	639	i = SIGFPE;
	640	break;
	641
	642	case T_DNA:
	643	/*
	644	* Virtual kernel intercept - pass the DNA exception
	645	* to the virtual kernel if it asked to handle it.
	646	* This occurs when the virtual kernel is holding
	647	* onto the FP context for a different emulated
	648	* process then the one currently running.
	649	*
	650	* We must still call npxdna() since we may have
	651	* saved FP state that the virtual kernel needs
	652	* to hand over to a different emulated process.
	653	*/
	654	if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
	655	(td->td_pcb->pcb_flags & FP_VIRTFP)
	656	) {
	657	npxdna();
	658	break;
	659	}
	660
	661	#if NNPX > 0
	662	/*
	663	* The kernel may have switched out the FP unit's
	664	* state, causing the user process to take a fault
	665	* when it tries to use the FP unit. Restore the
	666	* state here
	667	*/
	668	if (npxdna())
	669	goto out;
	670	#endif
	671	if (!pmath_emulate) {
	672	i = SIGFPE;
	673	ucode = FPE_FPU_NP_TRAP;
	674	break;
	675	}
	676	i = (*pmath_emulate)(frame);
	677	if (i == 0) {
	678	if (!(frame->tf_eflags & PSL_T))
	679	goto out2;
	680	frame->tf_eflags &= ~PSL_T;
	681	i = SIGTRAP;
	682	}
	683	/* else ucode = emulator_only_knows() XXX */
	684	break;
	685
	686	case T_FPOPFLT: /* FPU operand fetch fault */
	687	ucode = ILL_COPROC;
	688	i = SIGILL;
	689	break;
	690
	691	case T_XMMFLT: /* SIMD floating-point exception */
	692	ucode = 0; /* XXX */
	693	i = SIGFPE;
	694	break;
	695	}
	696	} else {
	697	kernel_trap:
	698	/* kernel trap */
	699
	700	switch (type) {
	701	case T_PAGEFLT: /* page fault */
	702	trap_pfault(frame, FALSE, eva);
	703	goto out2;
	704
	705	case T_DNA:
	706	#if NNPX > 0
	707	/*
	708	* The kernel may be using npx for copying or other
	709	* purposes.
	710	*/
	711	if (npxdna())
	712	goto out2;
	713	#endif
	714	break;
	715
	716	case T_PROTFLT: /* general protection fault */
	717	case T_SEGNPFLT: /* segment not present fault */
	718	/*
	719	* Invalid segment selectors and out of bounds
	720	* %eip's and %esp's can be set up in user mode.
	721	* This causes a fault in kernel mode when the
	722	* kernel tries to return to user mode. We want
	723	* to get this fault so that we can fix the
	724	* problem here and not have to check all the
	725	* selectors and pointers when the user changes
	726	* them.
	727	*/
	728	#define MAYBE_DORETI_FAULT(where, whereto) \
	729	do { \
	730	if (frame->tf_eip == (int)where) { \
	731	frame->tf_eip = (int)whereto; \
	732	goto out2; \
	733	} \
	734	} while (0)
	735	if (mycpu->gd_intr_nesting_level == 0) {
	736	/*
	737	* Invalid %fs's and %gs's can be created using
	738	* procfs or PT_SETREGS or by invalidating the
	739	* underlying LDT entry. This causes a fault
	740	* in kernel mode when the kernel attempts to
	741	* switch contexts. Lose the bad context
	742	* (XXX) so that we can continue, and generate
	743	* a signal.
	744	*/
	745	MAYBE_DORETI_FAULT(doreti_iret,
	746	doreti_iret_fault);
	747	MAYBE_DORETI_FAULT(doreti_popl_ds,
	748	doreti_popl_ds_fault);
	749	MAYBE_DORETI_FAULT(doreti_popl_es,
	750	doreti_popl_es_fault);
	751	MAYBE_DORETI_FAULT(doreti_popl_fs,
	752	doreti_popl_fs_fault);
	753	MAYBE_DORETI_FAULT(doreti_popl_gs,
	754	doreti_popl_gs_fault);
	755
	756	/*
	757	* NOTE: cpu doesn't push esp on kernel trap
	758	*/
	759	if (td->td_pcb->pcb_onfault &&
	760	td->td_pcb->pcb_onfault_sp ==
	761	(int)&frame->tf_esp) {
	762	frame->tf_eip =
	763	(register_t)td->td_pcb->pcb_onfault;
	764	goto out2;
	765	}
	766	}
	767	break;
	768
	769	case T_TSSFLT:
	770	/*
	771	* PSL_NT can be set in user mode and isn't cleared
	772	* automatically when the kernel is entered. This
	773	* causes a TSS fault when the kernel attempts to
	774	* `iret' because the TSS link is uninitialized. We
	775	* want to get this fault so that we can fix the
	776	* problem here and not every time the kernel is
	777	* entered.
	778	*/
	779	if (frame->tf_eflags & PSL_NT) {
	780	frame->tf_eflags &= ~PSL_NT;
	781	goto out2;
	782	}
	783	break;
	784
	785	case T_TRCTRAP: /* trace trap */
	786	if (frame->tf_eip == (int)IDTVEC(syscall)) {
	787	/*
	788	* We've just entered system mode via the
	789	* syscall lcall. Continue single stepping
	790	* silently until the syscall handler has
	791	* saved the flags.
	792	*/
	793	goto out2;
	794	}
	795	if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
	796	/*
	797	* The syscall handler has now saved the
	798	* flags. Stop single stepping it.
	799	*/
	800	frame->tf_eflags &= ~PSL_T;
	801	goto out2;
	802	}
	803	/*
	804	* Ignore debug register trace traps due to
	805	* accesses in the user's address space, which
	806	* can happen under several conditions such as
	807	* if a user sets a watchpoint on a buffer and
	808	* then passes that buffer to a system call.
	809	* We still want to get TRCTRAPS for addresses
	810	* in kernel space because that is useful when
	811	* debugging the kernel.
	812	*/
	813	if (user_dbreg_trap()) {
	814	/*
	815	* Reset breakpoint bits because the
	816	* processor doesn't
	817	*/
	818	load_dr6(rdr6() & 0xfffffff0);
	819	goto out2;
	820	}
	821	/*
	822	* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
	823	*/
	824	case T_BPTFLT:
	825	/*
	826	* If DDB is enabled, let it handle the debugger trap.
	827	* Otherwise, debugger traps "can't happen".
	828	*/
	829	ucode = TRAP_BRKPT;
	830	#ifdef DDB
	831	MAKEMPSAFE(have_mplock);
	832	if (kdb_trap (type, 0, frame))
	833	goto out2;
	834	#endif
	835	break;
	836
	837	#if NISA > 0
	838	case T_NMI:
	839	MAKEMPSAFE(have_mplock);
	840	#ifdef POWERFAIL_NMI
	841	#ifndef TIMER_FREQ
	842	# define TIMER_FREQ 1193182
	843	#endif
	844	handle_powerfail:
	845	{
	846	static unsigned lastalert = 0;
	847
	848	if(time_second - lastalert > 10)
	849	{
	850	log(LOG_WARNING, "NMI: power fail\n");
	851	sysbeep(TIMER_FREQ/880, hz);
	852	lastalert = time_second;
	853	}
	854	/* YYY mp count */
	855	goto out2;
	856	}
	857	#else /* !POWERFAIL_NMI */
	858	/* machine/parity/power fail/"kitchen sink" faults */
	859	if (isa_nmi(code) == 0) {
	860	#ifdef DDB
	861	/*
	862	* NMI can be hooked up to a pushbutton
	863	* for debugging.
	864	*/
	865	if (ddb_on_nmi) {
	866	kprintf ("NMI ... going to debugger\n");
	867	kdb_trap (type, 0, frame);
	868	}
	869	#endif /* DDB */
	870	goto out2;
	871	} else if (panic_on_nmi == 0)
	872	goto out2;
	873	/* FALL THROUGH */
	874	#endif /* POWERFAIL_NMI */
	875	#endif /* NISA > 0 */
	876	}
	877
	878	MAKEMPSAFE(have_mplock);
	879	trap_fatal(frame, eva);
	880	goto out2;
	881	}
	882
	883	/*
	884	* Virtual kernel intercept - if the fault is directly related to a
	885	* VM context managed by a virtual kernel then let the virtual kernel
	886	* handle it.
	887	*/
	888	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
	889	vkernel_trap(lp, frame);
	890	goto out;
	891	}
	892
	893	/* Translate fault for emulators (e.g. Linux) */
	894	if (*p->p_sysent->sv_transtrap)
	895	i = (*p->p_sysent->sv_transtrap)(i, type);
	896
	897	MAKEMPSAFE(have_mplock);
	898	trapsignal(lp, i, ucode);
	899
	900	#ifdef DEBUG
	901	if (type <= MAX_TRAP_MSG) {
	902	uprintf("fatal process exception: %s",
	903	trap_msg[type]);
	904	if ((type == T_PAGEFLT) \|\| (type == T_PROTFLT))
	905	uprintf(", fault VA = 0x%lx", (u_long)eva);
	906	uprintf("\n");
	907	}
	908	#endif
	909
	910	out:
	911	userret(lp, frame, sticks);
	912	userexit(lp);
	913	out2: ;
	914	if (have_mplock)
	915	rel_mplock();
	916	if (p != NULL && lp != NULL)
	917	KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
	918	#ifdef INVARIANTS
	919	KASSERT(crit_count == td->td_critcount,
	920	("trap: critical section count mismatch! %d/%d",
	921	crit_count, td->td_pri));
	922	KASSERT(curstop == td->td_toks_stop,
	923	("trap: extra tokens held after trap! %zd/%zd",
	924	curstop - &td->td_toks_base,
	925	td->td_toks_stop - &td->td_toks_base));
	926	#endif
	927	}
	928
	929	int
	930	trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
	931	{
	932	vm_offset_t va;
	933	struct vmspace *vm = NULL;
	934	vm_map_t map = 0;
	935	int rv = 0;
	936	int fault_flags;
	937	vm_prot_t ftype;
	938	thread_t td = curthread;
	939	struct lwp *lp = td->td_lwp;
	940
	941	va = trunc_page(eva);
	942	if (va >= KERNBASE) {
	943	/*
	944	* Don't allow user-mode faults in kernel address space.
	945	* An exception: if the faulting address is the invalid
	946	* instruction entry in the IDT, then the Intel Pentium
	947	* F00F bug workaround was triggered, and we need to
	948	* treat it is as an illegal instruction, and not a page
	949	* fault.
	950	*/
	951	#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	952	if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
	953	frame->tf_trapno = T_PRIVINFLT;
	954	return -2;
	955	}
	956	#endif
	957	if (usermode)
	958	goto nogo;
	959
	960	map = &kernel_map;
	961	} else {
	962	/*
	963	* This is a fault on non-kernel virtual memory.
	964	* vm is initialized above to NULL. If curproc is NULL
	965	* or curproc->p_vmspace is NULL the fault is fatal.
	966	*/
	967	if (lp != NULL)
	968	vm = lp->lwp_vmspace;
	969
	970	if (vm == NULL)
	971	goto nogo;
	972
	973	map = &vm->vm_map;
	974	}
	975
	976	if (frame->tf_err & PGEX_W)
	977	ftype = VM_PROT_WRITE;
	978	else
	979	ftype = VM_PROT_READ;
	980
	981	if (map != &kernel_map) {
	982	/*
	983	* Keep swapout from messing with us during this
	984	* critical time.
	985	*/
	986	PHOLD(lp->lwp_proc);
	987
	988	/*
	989	* Issue fault
	990	*/
	991	fault_flags = 0;
	992	if (usermode)
	993	fault_flags \|= VM_FAULT_BURST;
	994	if (ftype & VM_PROT_WRITE)
	995	fault_flags \|= VM_FAULT_DIRTY;
	996	else
	997	fault_flags \|= VM_FAULT_NORMAL;
	998	rv = vm_fault(map, va, ftype, fault_flags);
	999	PRELE(lp->lwp_proc);
	1000	} else {
	1001	/*
	1002	* Don't have to worry about process locking or stacks in the
	1003	* kernel.
	1004	*/
	1005	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	1006	}
	1007	if (rv == KERN_SUCCESS)
	1008	return (0);
	1009	nogo:
	1010	if (!usermode) {
	1011	/*
	1012	* NOTE: cpu doesn't push esp on kernel trap
	1013	*/
	1014	if (td->td_gd->gd_intr_nesting_level == 0 &&
	1015	td->td_pcb->pcb_onfault &&
	1016	td->td_pcb->pcb_onfault_sp == (int)&frame->tf_esp) {
	1017	frame->tf_eip = (register_t)td->td_pcb->pcb_onfault;
	1018	return (0);
	1019	}
	1020	if (td->td_gd->gd_intr_nesting_level == 0 &&
	1021	td->td_pcb->pcb_onfault) {
	1022	kprintf("ESP mismatch %p %08x\n",
	1023	&frame->tf_esp, td->td_pcb->pcb_onfault_sp);
	1024	}
	1025	trap_fatal(frame, eva);
	1026	return (-1);
	1027	}
	1028
	1029	/* kludge to pass faulting virtual address to sendsig */
	1030	frame->tf_xflags = frame->tf_err;
	1031	frame->tf_err = eva;
	1032
	1033	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
	1034	}
	1035
	1036	static void
	1037	trap_fatal(struct trapframe *frame, vm_offset_t eva)
	1038	{
	1039	int code, type, ss, esp;
	1040	struct soft_segment_descriptor softseg;
	1041
	1042	code = frame->tf_err;
	1043	type = frame->tf_trapno;
	1044	sdtossd(&gdt[mycpu->gd_cpuid * NGDT + IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
	1045
	1046	if (type <= MAX_TRAP_MSG)
	1047	kprintf("\n\nFatal trap %d: %s while in %s mode\n",
	1048	type, trap_msg[type],
	1049	frame->tf_eflags & PSL_VM ? "vm86" :
	1050	ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
	1051	/* three separate prints in case of a trap on an unmapped page */
	1052	kprintf("cpuid = %d; ", mycpu->gd_cpuid);
	1053	kprintf("lapic.id = %08x\n", lapic->id);
	1054	if (type == T_PAGEFLT) {
	1055	kprintf("fault virtual address = %p\n", (void *)eva);
	1056	kprintf("fault code = %s %s, %s\n",
	1057	code & PGEX_U ? "user" : "supervisor",
	1058	code & PGEX_W ? "write" : "read",
	1059	code & PGEX_P ? "protection violation" : "page not present");
	1060	}
	1061	kprintf("instruction pointer = 0x%x:0x%x\n",
	1062	frame->tf_cs & 0xffff, frame->tf_eip);
	1063	if ((ISPL(frame->tf_cs) == SEL_UPL) \|\| (frame->tf_eflags & PSL_VM)) {
	1064	ss = frame->tf_ss & 0xffff;
	1065	esp = frame->tf_esp;
	1066	} else {
	1067	ss = GSEL(GDATA_SEL, SEL_KPL);
	1068	esp = (int)&frame->tf_esp;
	1069	}
	1070	kprintf("stack pointer = 0x%x:0x%x\n", ss, esp);
	1071	kprintf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
	1072	kprintf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
	1073	softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	1074	kprintf(" = DPL %d, pres %d, def32 %d, gran %d\n",
	1075	softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	1076	softseg.ssd_gran);
	1077	kprintf("processor eflags = ");
	1078	if (frame->tf_eflags & PSL_T)
	1079	kprintf("trace trap, ");
	1080	if (frame->tf_eflags & PSL_I)
	1081	kprintf("interrupt enabled, ");
	1082	if (frame->tf_eflags & PSL_NT)
	1083	kprintf("nested task, ");
	1084	if (frame->tf_eflags & PSL_RF)
	1085	kprintf("resume, ");
	1086	if (frame->tf_eflags & PSL_VM)
	1087	kprintf("vm86, ");
	1088	kprintf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	1089	kprintf("current process = ");
	1090	if (curproc) {
	1091	kprintf("%lu (%s)\n",
	1092	(u_long)curproc->p_pid, curproc->p_comm ?
	1093	curproc->p_comm : "");
	1094	} else {
	1095	kprintf("Idle\n");
	1096	}
	1097	kprintf("current thread = pri %d ", curthread->td_pri);
	1098	if (curthread->td_critcount)
	1099	kprintf("(CRIT)");
	1100	kprintf("\n");
	1101	/**
	1102	* XXX FIXME:
	1103	* we probably SHOULD have stopped the other CPUs before now!
	1104	* another CPU COULD have been touching cpl at this moment...
	1105	*/
	1106	kprintf(" <- SMP: XXX");
	1107	kprintf("\n");
	1108
	1109	#ifdef KDB
	1110	if (kdb_trap(&psl))
	1111	return;
	1112	#endif
	1113	#ifdef DDB
	1114	if ((debugger_on_panic \|\| db_active) && kdb_trap(type, code, frame))
	1115	return;
	1116	#endif
	1117	kprintf("trap number = %d\n", type);
	1118	if (type <= MAX_TRAP_MSG)
	1119	panic("%s", trap_msg[type]);
	1120	else
	1121	panic("unknown/reserved trap");
	1122	}
	1123
	1124	/*
	1125	* Double fault handler. Called when a fault occurs while writing
	1126	* a frame for a trap/exception onto the stack. This usually occurs
	1127	* when the stack overflows (such is the case with infinite recursion,
	1128	* for example).
	1129	*
	1130	* XXX Note that the current PTD gets replaced by IdlePTD when the
	1131	* task switch occurs. This means that the stack that was active at
	1132	* the time of the double fault is not available at <kstack> unless
	1133	* the machine was idle when the double fault occurred. The downside
	1134	* of this is that "trace <ebp>" in ddb won't work.
	1135	*/
	1136	static __inline
	1137	int
	1138	in_kstack_guard(register_t rptr)
	1139	{
	1140	thread_t td = curthread;
	1141
	1142	if ((char *)rptr >= td->td_kstack &&
	1143	(char *)rptr < td->td_kstack + PAGE_SIZE) {
	1144	return 1;
	1145	}
	1146	return 0;
	1147	}
	1148
	1149	void
	1150	dblfault_handler(void)
	1151	{
	1152	struct mdglobaldata *gd = mdcpu;
	1153
	1154	if (in_kstack_guard(gd->gd_common_tss.tss_esp) \|\|
	1155	in_kstack_guard(gd->gd_common_tss.tss_ebp)) {
	1156	kprintf("DOUBLE FAULT - KERNEL STACK GUARD HIT!\n");
	1157	} else {
	1158	kprintf("DOUBLE FAULT:\n");
	1159	}
	1160	kprintf("eip = 0x%x\n", gd->gd_common_tss.tss_eip);
	1161	kprintf("esp = 0x%x\n", gd->gd_common_tss.tss_esp);
	1162	kprintf("ebp = 0x%x\n", gd->gd_common_tss.tss_ebp);
	1163	/* three separate prints in case of a trap on an unmapped page */
	1164	kprintf("cpuid = %d; ", gd->mi.gd_cpuid);
	1165	kprintf("lapic.id = %08x\n", lapic->id);
	1166	panic("double fault");
	1167	}
	1168
	1169	/*
	1170	* syscall2 - MP aware system call request C handler
	1171	*
	1172	* A system call is essentially treated as a trap. The MP lock is not
	1173	* held on entry or return. We are responsible for handling ASTs
	1174	* (e.g. a task switch) prior to return.
	1175	*
	1176	* MPSAFE
	1177	*/
	1178	void
	1179	syscall2(struct trapframe *frame)
	1180	{
	1181	struct thread *td = curthread;
	1182	struct proc *p = td->td_proc;
	1183	struct lwp *lp = td->td_lwp;
	1184	caddr_t params;
	1185	struct sysent *callp;
	1186	register_t orig_tf_eflags;
	1187	int sticks;
	1188	int error;
	1189	int narg;
	1190	#ifdef INVARIANTS
	1191	int crit_count = td->td_critcount;
	1192	#endif
	1193	int have_mplock = 0;
	1194	u_int code;
	1195	union sysunion args;
	1196
	1197	#ifdef DIAGNOSTIC
	1198	if (ISPL(frame->tf_cs) != SEL_UPL) {
	1199	get_mplock();
	1200	panic("syscall");
	1201	/* NOT REACHED */
	1202	}
	1203	#endif
	1204
	1205	KTR_LOG(kernentry_syscall, p->p_pid, lp->lwp_tid,
	1206	frame->tf_eax);
	1207
	1208	userenter(td, p); /* lazy raise our priority */
	1209
	1210	/*
	1211	* Misc
	1212	*/
	1213	sticks = (int)td->td_sticks;
	1214	orig_tf_eflags = frame->tf_eflags;
	1215
	1216	/*
	1217	* Virtual kernel intercept - if a VM context managed by a virtual
	1218	* kernel issues a system call the virtual kernel handles it, not us.
	1219	* Restore the virtual kernel context and return from its system
	1220	* call. The current frame is copied out to the virtual kernel.
	1221	*/
	1222	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
	1223	vkernel_trap(lp, frame);
	1224	error = EJUSTRETURN;
	1225	callp = NULL;
	1226	goto out;
	1227	}
	1228
	1229	/*
	1230	* Get the system call parameters and account for time
	1231	*/
	1232	lp->lwp_md.md_regs = frame;
	1233	params = (caddr_t)frame->tf_esp + sizeof(int);
	1234	code = frame->tf_eax;
	1235
	1236	if (p->p_sysent->sv_prepsyscall) {
	1237	(*p->p_sysent->sv_prepsyscall)(
	1238	frame, (int *)(&args.nosys.sysmsg + 1),
	1239	&code, &params);
	1240	} else {
	1241	/*
	1242	* Need to check if this is a 32 bit or 64 bit syscall.
	1243	* fuword is MP aware.
	1244	*/
	1245	if (code == SYS_syscall) {
	1246	/*
	1247	* Code is first argument, followed by actual args.
	1248	*/
	1249	code = fuword(params);
	1250	params += sizeof(int);
	1251	} else if (code == SYS___syscall) {
	1252	/*
	1253	* Like syscall, but code is a quad, so as to maintain
	1254	* quad alignment for the rest of the arguments.
	1255	*/
	1256	code = fuword(params);
	1257	params += sizeof(quad_t);
	1258	}
	1259	}
	1260
	1261	code &= p->p_sysent->sv_mask;
	1262
	1263	if (code >= p->p_sysent->sv_size)
	1264	callp = &p->p_sysent->sv_table[0];
	1265	else
	1266	callp = &p->p_sysent->sv_table[code];
	1267
	1268	narg = callp->sy_narg & SYF_ARGMASK;
	1269
	1270	#if 0
	1271	if (p->p_sysent->sv_name[0] == 'L')
	1272	kprintf("Linux syscall, code = %d\n", code);
	1273	#endif
	1274
	1275	/*
	1276	* copyin is MP aware, but the tracing code is not
	1277	*/
	1278	if (narg && params) {
	1279	error = copyin(params, (caddr_t)(&args.nosys.sysmsg + 1),
	1280	narg * sizeof(register_t));
	1281	if (error) {
	1282	#ifdef KTRACE
	1283	if (KTRPOINT(td, KTR_SYSCALL)) {
	1284	MAKEMPSAFE(have_mplock);
	1285
	1286	ktrsyscall(lp, code, narg,
	1287	(void *)(&args.nosys.sysmsg + 1));
	1288	}
	1289	#endif
	1290	goto bad;
	1291	}
	1292	}
	1293
	1294	#ifdef KTRACE
	1295	if (KTRPOINT(td, KTR_SYSCALL)) {
	1296	MAKEMPSAFE(have_mplock);
	1297	ktrsyscall(lp, code, narg, (void *)(&args.nosys.sysmsg + 1));
	1298	}
	1299	#endif
	1300
	1301	/*
	1302	* For traditional syscall code edx is left untouched when 32 bit
	1303	* results are returned. Since edx is loaded from fds[1] when the
	1304	* system call returns we pre-set it here.
	1305	*/
	1306	args.sysmsg_fds[0] = 0;
	1307	args.sysmsg_fds[1] = frame->tf_edx;
	1308
	1309	/*
	1310	* The syscall might manipulate the trap frame. If it does it
	1311	* will probably return EJUSTRETURN.
	1312	*/
	1313	args.sysmsg_frame = frame;
	1314
	1315	STOPEVENT(p, S_SCE, narg); /* MP aware */
	1316
	1317	/*
	1318	* NOTE: All system calls run MPSAFE now. The system call itself
	1319	* is responsible for getting the MP lock.
	1320	*/
	1321	error = (*callp->sy_call)(&args);
	1322
	1323	out:
	1324	/*
	1325	* MP SAFE (we may or may not have the MP lock at this point)
	1326	*/
	1327	switch (error) {
	1328	case 0:
	1329	/*
	1330	* Reinitialize proc pointer `p' as it may be different
	1331	* if this is a child returning from fork syscall.
	1332	*/
	1333	p = curproc;
	1334	lp = curthread->td_lwp;
	1335	frame->tf_eax = args.sysmsg_fds[0];
	1336	frame->tf_edx = args.sysmsg_fds[1];
	1337	frame->tf_eflags &= ~PSL_C;
	1338	break;
	1339	case ERESTART:
	1340	/*
	1341	* Reconstruct pc, assuming lcall $X,y is 7 bytes,
	1342	* int 0x80 is 2 bytes. We saved this in tf_err.
	1343	*/
	1344	frame->tf_eip -= frame->tf_err;
	1345	break;
	1346	case EJUSTRETURN:
	1347	break;
	1348	case EASYNC:
	1349	panic("Unexpected EASYNC return value (for now)");
	1350	default:
	1351	bad:
	1352	if (p->p_sysent->sv_errsize) {
	1353	if (error >= p->p_sysent->sv_errsize)
	1354	error = -1; /* XXX */
	1355	else
	1356	error = p->p_sysent->sv_errtbl[error];
	1357	}
	1358	frame->tf_eax = error;
	1359	frame->tf_eflags \|= PSL_C;
	1360	break;
	1361	}
	1362
	1363	/*
	1364	* Traced syscall. trapsignal() is not MP aware.
	1365	*/
	1366	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
	1367	MAKEMPSAFE(have_mplock);
	1368	frame->tf_eflags &= ~PSL_T;
	1369	trapsignal(lp, SIGTRAP, TRAP_TRACE);
	1370	}
	1371
	1372	/*
	1373	* Handle reschedule and other end-of-syscall issues
	1374	*/
	1375	userret(lp, frame, sticks);
	1376
	1377	#ifdef KTRACE
	1378	if (KTRPOINT(td, KTR_SYSRET)) {
	1379	MAKEMPSAFE(have_mplock);
	1380	ktrsysret(lp, code, error, args.sysmsg_result);
	1381	}
	1382	#endif
	1383
	1384	/*
	1385	* This works because errno is findable through the
	1386	* register set. If we ever support an emulation where this
	1387	* is not the case, this code will need to be revisited.
	1388	*/
	1389	STOPEVENT(p, S_SCX, code);
	1390
	1391	userexit(lp);
	1392	/*
	1393	* Release the MP lock if we had to get it
	1394	*/
	1395	if (have_mplock)
	1396	rel_mplock();
	1397	KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error);
	1398	#ifdef INVARIANTS
	1399	KASSERT(crit_count == td->td_critcount,
	1400	("syscall: critical section count mismatch! %d/%d",
	1401	crit_count, td->td_pri));
	1402	KASSERT(&td->td_toks_base == td->td_toks_stop,
	1403	("syscall: extra tokens held after trap! %zd",
	1404	td->td_toks_stop - &td->td_toks_base));
	1405	#endif
	1406	}
	1407
	1408	/*
	1409	* NOTE: MP lock not held at any point.
	1410	*/
	1411	void
	1412	fork_return(struct lwp lp, struct trapframe frame)
	1413	{
	1414	frame->tf_eax = 0; /* Child returns zero */
	1415	frame->tf_eflags &= ~PSL_C; /* success */
	1416	frame->tf_edx = 1;
	1417
	1418	generic_lwp_return(lp, frame);
	1419	KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
	1420	}
	1421
	1422	/*
	1423	* Simplified back end of syscall(), used when returning from fork()
	1424	* directly into user mode.
	1425	*
	1426	* This code will return back into the fork trampoline code which then
	1427	* runs doreti.
	1428	*
	1429	* NOTE: The mplock is not held at any point.
	1430	*/
	1431	void
	1432	generic_lwp_return(struct lwp lp, struct trapframe frame)
	1433	{
	1434	struct proc *p = lp->lwp_proc;
	1435
	1436	/*
	1437	* Newly forked processes are given a kernel priority. We have to
	1438	* adjust the priority to a normal user priority and fake entry
	1439	* into the kernel (call userenter()) to install a passive release
	1440	* function just in case userret() decides to stop the process. This
	1441	* can occur when ^Z races a fork. If we do not install the passive
	1442	* release function the current process designation will not be
	1443	* released when the thread goes to sleep.
	1444	*/
	1445	lwkt_setpri_self(TDPRI_USER_NORM);
	1446	userenter(lp->lwp_thread, p);
	1447	userret(lp, frame, 0);
	1448	#ifdef KTRACE
	1449	if (KTRPOINT(lp->lwp_thread, KTR_SYSRET))
	1450	ktrsysret(lp, SYS_fork, 0, 0);
	1451	#endif
	1452	lp->lwp_flags \|= LWP_PASSIVE_ACQ;
	1453	userexit(lp);
	1454	lp->lwp_flags &= ~LWP_PASSIVE_ACQ;
	1455	}
	1456
	1457	/*
	1458	* If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
	1459	* fault (which is then passed back to the virtual kernel) if an attempt is
	1460	* made to use the FP unit.
	1461	*
	1462	* XXX this is a fairly big hack.
	1463	*/
	1464	void
	1465	set_vkernel_fp(struct trapframe *frame)
	1466	{
	1467	struct thread *td = curthread;
	1468
	1469	if (frame->tf_xflags & PGEX_FPFAULT) {
	1470	td->td_pcb->pcb_flags \|= FP_VIRTFP;
	1471	if (mdcpu->gd_npxthread == td)
	1472	npxexit();
	1473	} else {
	1474	td->td_pcb->pcb_flags &= ~FP_VIRTFP;
	1475	}
	1476	}
	1477
	1478	/*
	1479	* Called from vkernel_trap() to fixup the vkernel's syscall
	1480	* frame for vmspace_ctl() return.
	1481	*/
	1482	void
	1483	cpu_vkernel_trap(struct trapframe *frame, int error)
	1484	{
	1485	frame->tf_eax = error;
	1486	if (error)
	1487	frame->tf_eflags \|= PSL_C;
	1488	else
	1489	frame->tf_eflags &= ~PSL_C;
	1490	}