/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
 * $DragonFly: src/sys/i386/i386/Attic/trap.c,v 1.50 2004/04/20 01:52:17 dillon Exp $
 */

/*
 * 386 Trap and System call handling
 */

#include "use_isa.h"
#include "use_npx.h"

#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_clock.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#include <sys/malloc.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/upcall.h>
#include <sys/sysproto.h>
#include <sys/sysunion.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>
#include <machine/globaldata.h>

#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>
#include <sys/msgport2.h>
#include <sys/thread2.h>

int (*pmath_emulate) (struct trapframe *);

extern void trap (struct trapframe frame);
extern int trapwrite (unsigned addr);
extern void syscall2 (struct trapframe frame);
extern void sendsys2 (struct trapframe frame);

static int trap_pfault (struct trapframe *, int, vm_offset_t);
static void trap_fatal (struct trapframe *, vm_offset_t);
void dblfault_handler (void);

extern inthand_t IDTVEC(syscall);

#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

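/*
 * trap_msg[] is indexed directly by the T_* trap number; consumers such
 * as trap_fatal() below must bounds-check against MAX_TRAP_MSG before
 * indexing, e.g.:
 *
 *	if (type <= MAX_TRAP_MSG)
 *		panic("%s", trap_msg[type]);
 */
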
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");
static int fast_release;
SYSCTL_INT(_machdep, OID_AUTO, fast_release, CTLFLAG_RW,
	&fast_release, 0, "Passive Release was optimal");
static int slow_release;
SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
	&slow_release, 0, "Passive Release was nonoptimal");

MALLOC_DEFINE(M_SYSMSG, "sysmsg", "sysmsg structure");

/*
 * USER->KERNEL transition.  Do not transition us out of userland from the
 * point of view of the userland scheduler unless we actually have to
 * switch.  Switching typically occurs when a process blocks in the kernel.
 *
 * passive_release is called from within a critical section and the BGL will
 * still be held.  This function is NOT called for preemptions, only for
 * switchouts.  Note that other elements of the system (uio_yield()) assume
 * that the user cruft will be released when lwkt_switch() is called.
 */
static void
passive_release(struct thread *td)
{
	struct proc *p = td->td_proc;

	td->td_release = NULL;
	release_curproc(p);
}

/*
 * userenter() passively intercepts the thread switch function to increase
 * the thread priority from a user priority to a kernel priority, reducing
 * syscall and trap overhead for the case where no switch occurs.
 */

static __inline void
userenter(struct thread *curtd)
{
	curtd->td_release = passive_release;
}

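/*
 * In outline, the hook installed above is consumed when the thread next
 * switches away; the LWKT switch code does roughly the following (a
 * sketch of the mechanism, not the verbatim lwkt_switch() code):
 *
 *	if (td->td_release)
 *		td->td_release(td);
 *
 * so the userland scheduler designation is only given up if the thread
 * actually blocks or switches between userenter() and userexit().
 */
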
/*
 * Reacquire our current process designation.  This will not return until
 * we have it.  Our LWKT priority will be adjusted for our return to
 * userland.  acquire_curproc() also handles cleaning up P_CP_RELEASED.
 *
 * This is always the last step before returning to user mode.
 */
static __inline void
userexit(struct proc *p)
{
	struct thread *td = p->p_thread;

	td->td_release = NULL;
	if (p->p_flag & P_CP_RELEASED)
		++slow_release;
	else
		++fast_release;
	acquire_curproc(p);
}

/*
 * userret() handles signals, upcalls, and deals with system profiling
 * charges.  Note that td_sticks is a 64 bit quantity, but there's no
 * point doing 64 bit arithmetic on the delta calculation so the absolute
 * tick values are truncated to an integer.
 */
static void
userret(struct proc *p, struct trapframe *frame, int sticks)
{
	int sig;

	/*
	 * Post any pending upcalls
	 */
	if (p->p_flag & P_UPCALLPEND) {
		p->p_flag &= ~P_UPCALLPEND;
		postupcall(p);
	}

	/*
	 * Post any pending signals
	 */
	while ((sig = CURSIG(p)) != 0) {
		postsig(sig);
	}

	/*
	 * If a reschedule has been requested then we release the current
	 * process in order to shift the current process designation to
	 * another user process and/or to switch to a higher priority
	 * kernel thread at userexit() time.
	 */
	if (any_resched_wanted()) {
		p->p_thread->td_release = NULL;
		release_curproc(p);
	}

	/*
	 * Charge system time if profiling.  Note: times are in microseconds.
	 */
	if (p->p_flag & P_PROFIL) {
		addupc_task(p, frame->tf_eip,
		    (u_int)((int)p->p_thread->td_sticks - sticks));
	}

	/*
	 * Post any pending signals XXX
	 */
	while ((sig = CURSIG(p)) != 0)
		postsig(sig);
}

#ifdef DEVICE_POLLING
extern u_int32_t poll_in_trap;
extern int ether_poll (int count);
#endif /* DEVICE_POLLING */

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * This function is also called from doreti in an interlock to handle ASTs.
 * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 *
 * NOTE!  We have to retrieve the fault address prior to obtaining the
 * MP lock because get_mplock() may switch out.  YYY cr2 really ought
 * to be retrieved by the assembly code, not here.
 */
void
trap(frame)
	struct trapframe frame;
{
	struct thread *td = curthread;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;

	p = td->td_proc;
#ifdef DDB
	if (db_active) {
		eva = (frame.tf_trapno == T_PAGEFLT ? rcr2() : 0);
		get_mplock();
		trap_fatal(&frame, eva);
		goto out2;
	}
#endif

	eva = 0;
	if (frame.tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
		 * This problem is worked around by using an interrupt
		 * gate for the pagefault handler.  We are finally ready
		 * to read %cr2 and then must reenable interrupts.
		 *
		 * XXX this should be in the switch statement, but the
		 * NO_F00F_HACK and VM86 goto and ifdefs obfuscate the
		 * flow of control too much for this to be obviously
		 * correct.
		 */
		eva = rcr2();
		get_mplock();
		cpu_enable_intr();
	} else {
		get_mplock();
	}
	/*
	 * MP lock is held at this point
	 */

	if (!(frame.tf_eflags & PSL_I)) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs)==SEL_UPL || (frame.tf_eflags & PSL_VM)) {
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		} else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
		}
		cpu_enable_intr();
	}

#ifdef DEVICE_POLLING
	if (poll_in_trap)
		ether_poll(poll_in_trap);
#endif /* DEVICE_POLLING */

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif
	type = frame.tf_trapno;
	code = frame.tf_err;

	if (in_vm86call) {
		if (frame.tf_eflags & PSL_VM &&
		    (type == T_PROTFLT || type == T_STKFLT)) {
#ifdef SMP
			KKASSERT(td->td_mpcount > 0);
#endif
			i = vm86_emulate((struct vm86frame *)&frame);
#ifdef SMP
			KKASSERT(td->td_mpcount > 0);
#endif
			if (i != 0) {
				/*
				 * returns to original process
				 */
				vm86_trap((struct vm86frame *)&frame);
				KKASSERT(0);
			}
			goto out2;
		}
		switch (type) {
			/*
			 * these traps want either a process context, or
			 * assume a normal userspace trap.
			 */
		case T_PROTFLT:
		case T_SEGNPFLT:
			trap_fatal(&frame, eva);
			goto out2;
		case T_TRCTRAP:
			type = T_BPTFLT;	/* kernel breakpoint */
			/* FALL THROUGH */
		}
		goto kernel_trap;	/* normal kernel trap handling */
	}

	if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
		/* user trap */

		userenter(td);

		sticks = (int)td->td_sticks;
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			mycpu->gd_cnt.v_soft++;
			if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
				atomic_clear_int_nonlocked(&mycpu->gd_reqflags,
					    RQF_AST_OWEUPC);
				addupc_task(p, p->p_stats->p_prof.pr_addr,
					    p->p_stats->p_prof.pr_ticks);
			}
			goto out;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i == 0)
					goto out;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(&frame, TRUE, eva);
			if (i == -1)
				goto out;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2)
				goto restart;
#endif
			if (i == 0)
				goto out;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
			goto handle_powerfail;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#if NNPX > 0
			/* if a transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(&frame);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out2;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0; /* XXX */
			i = SIGFPE;
			break;
		}
	} else {
kernel_trap:
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			(void) trap_pfault(&frame, FALSE, eva);
			goto out2;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out2;
#endif
			break;

		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
#define	MAYBE_DORETI_FAULT(where, whereto)				\
	do {								\
		if (frame.tf_eip == (int)where) {			\
			frame.tf_eip = (int)whereto;			\
			goto out2;					\
		}							\
	} while (0)
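/*
 * For example, MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault)
 * expands to a check of whether the faulting %eip is at the doreti_popl_ds
 * label; if so the fault is redirected to the matching recovery label and
 * the trap simply resumes there.
 */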
			/*
			 * Since we don't save %gs across an interrupt
			 * frame this check must occur outside the intr
			 * nesting level check.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				td->td_pcb->pcb_gs = 0;
				psignal(p, SIGBUS);
				goto out2;
			}
			if (mycpu->gd_intr_nesting_level == 0) {
				/*
				 * Invalid %fs's and %gs's can be created using
				 * procfs or PT_SETREGS or by invalidating the
				 * underlying LDT entry.  This causes a fault
				 * in kernel mode when the kernel attempts to
				 * switch contexts.  Lose the bad context
				 * (XXX) so that we can continue, and generate
				 * a signal.
				 */
				MAYBE_DORETI_FAULT(doreti_iret,
						   doreti_iret_fault);
				MAYBE_DORETI_FAULT(doreti_popl_ds,
						   doreti_popl_ds_fault);
				MAYBE_DORETI_FAULT(doreti_popl_es,
						   doreti_popl_es_fault);
				MAYBE_DORETI_FAULT(doreti_popl_fs,
						   doreti_popl_fs_fault);
				if (td->td_pcb->pcb_onfault) {
					frame.tf_eip =
					    (register_t)td->td_pcb->pcb_onfault;
					goto out2;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out2;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out2;
			}
			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out2;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out2;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			if (kdb_trap (type, 0, &frame))
				goto out2;
#endif
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
# define TIMER_FREQ 1193182
#endif
	handle_powerfail:
		{
			static unsigned lastalert = 0;

			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			/* YYY mp count */
			goto out2;
		}
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi == 0)
				goto out2;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		trap_fatal(&frame, eva);
		goto out2;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out:
#ifdef SMP
	if (ISPL(frame.tf_cs) == SEL_UPL)
		KASSERT(td->td_mpcount == 1, ("badmpcount trap from %p", (void *)frame.tf_eip));
#endif
	userret(p, &frame, sticks);
	userexit(p);
out2:
#ifdef SMP
	KKASSERT(td->td_mpcount > 0);
#endif
	rel_mplock();
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel.  The rest of the kernel needs to be made "safe"
 * before this can be used.  I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	thread_t td = curthread;
	struct proc *p = td->td_proc;	/* may be NULL */

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (td->td_gd->gd_intr_nesting_level != 0 ||
		      td->td_pcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (td->td_gd->gd_intr_nesting_level == 0 &&
		    td->td_pcb->pcb_onfault) {
			frame->tf_eip = (register_t)td->td_pcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	thread_t td = curthread;
	struct proc *p = td->td_proc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
			frame->tf_trapno = T_PRIVINFLT;
			return -2;
		}
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		--p->p_lock;
	} else {
		/*
		 * Don't have to worry about process locking or stacks
		 * in the kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (td->td_gd->gd_intr_nesting_level == 0 &&
		    td->td_pcb->pcb_onfault) {
			frame->tf_eip = (register_t)td->td_pcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[mycpu->gd_cpuid * NGDT + IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", mycpu->gd_cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer		= 0x%x:0x%x\n", ss, esp);
	printf("frame pointer		= 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}
	printf("current thread		= pri %d ", curthread->td_pri);
	if (curthread->td_pri >= TDPRI_CRIT)
		printf("(CRIT)");
	printf("\n");
	printf("interrupt mask		= ");
	if ((curthread->td_cpl & net_imask) == net_imask)
		printf("net ");
	if ((curthread->td_cpl & tty_imask) == tty_imask)
		printf("tty ");
	if ((curthread->td_cpl & bio_imask) == bio_imask)
		printf("bio ");
	if ((curthread->td_cpl & cam_imask) == cam_imask)
		printf("cam ");
	if (curthread->td_cpl == 0)
		printf("none");
#ifdef SMP
/**
 *  XXX FIXME:
 *	we probably SHOULD have stopped the other CPUs before now!
 *	another CPU COULD have been touching cpl at this moment...
 */
	printf(" <- SMP: XXX");
#endif
	printf("\n");

#ifdef KDB
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic("%s", trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	struct mdglobaldata *gd = mdcpu;

	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", gd->gd_common_tss.tss_eip);
	printf("esp = 0x%x\n", gd->gd_common_tss.tss_esp);
	printf("ebp = 0x%x\n", gd->gd_common_tss.tss_ebp);
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", mycpu->gd_cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	++p->p_lock;

	if (!grow_stack (p, va)) {
		--p->p_lock;
		return (1);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	--p->p_lock;

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * syscall2 -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g. a task switch) prior to return.
 *
 * In general, only simple access and manipulation of curproc and
 * the current stack is allowed without having to hold MP lock.
 */
void
syscall2(struct trapframe frame)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	caddr_t params;
	int i;
	struct sysent *callp;
	register_t orig_tf_eflags;
	int sticks;
	int error;
	int narg;
	u_int code;
	union sysunion args;

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		get_mplock();
		panic("syscall");
		/* NOT REACHED */
	}
#endif

#ifdef SMP
	KASSERT(td->td_mpcount == 0, ("badmpcount syscall from %p", (void *)frame.tf_eip));
	get_mplock();
#endif
	userenter(td);		/* lazy raise our priority */

	sticks = (int)td->td_sticks;

	p->p_md.md_regs = &frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;
	orig_tf_eflags = frame.tf_eflags;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		(*p->p_sysent->sv_prepsyscall)(&frame, (int *)(&args.nosys.usrmsg + 1), &code, &params);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}
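
	/*
	 * At this point, for the usual direct-vector case, the user stack
	 * as seen through the trapframe looks like:
	 *
	 *	tf_esp ->	[ return address into the syscall stub ]
	 *	params ->	[ arg0 ][ arg1 ] ...
	 *
	 * which is why params was initialized to tf_esp + sizeof(int)
	 * above (the SYS_syscall/SYS___syscall indirections additionally
	 * skip the embedded code word, as just handled).
	 */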

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not
	 */
	if (params && (i = narg * sizeof(register_t)) &&
	    (error = copyin(params, (caddr_t)(&args.nosys.usrmsg + 1), (u_int)i))) {
#ifdef KTRACE
		if (KTRPOINT(td, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, (void *)(&args.nosys.usrmsg + 1));
#endif
		goto bad;
	}

#if 0
	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what if
	 * we are ktracing
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		get_mplock();
		have_mplock = 1;
	}
#endif

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSCALL)) {
		ktrsyscall(p->p_tracep, code, narg, (void *)(&args.nosys.usrmsg + 1));
	}
#endif

	/*
	 * For traditional syscall code edx is left untouched when 32 bit
	 * results are returned.  Since edx is loaded from fds[1] when the
	 * system call returns we pre-set it here.
	 */
	lwkt_initmsg(&args.lmsg, &td->td_msgport, 0,
			lwkt_cmd_op(code), lwkt_cmd_op_none);
	args.sysmsg_copyout = NULL;
	args.sysmsg_fds[0] = 0;
	args.sysmsg_fds[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(&args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = args.sysmsg_fds[0];
		frame.tf_edx = args.sysmsg_fds[1];
		frame.tf_eflags &= ~PSL_C;
		break;
	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;
	case EJUSTRETURN:
		break;
	case EASYNC:
		panic("Unexpected EASYNC return value (for now)");
	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET)) {
		ktrsysret(p->p_tracep, code, error, args.sysmsg_result);
	}
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

	userexit(p);
#ifdef SMP
	/*
	 * Release the MP lock if we had to get it
	 */
	KASSERT(td->td_mpcount == 1, ("badmpcount syscall from %p", (void *)frame.tf_eip));
	rel_mplock();
#endif
}

/*
 * sendsys2 -	MP aware system message request C handler
 */
void
sendsys2(struct trapframe frame)
{
	struct globaldata *gd;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	register_t orig_tf_eflags;
	struct sysent *callp;
	union sysunion *sysun;
	lwkt_msg_t umsg;
	int sticks;
	int error;
	int narg;
	u_int code = 0;
	int msgsize;
	int result;

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		get_mplock();
		panic("syscall");
		/* NOT REACHED */
	}
#endif

#ifdef SMP
	KASSERT(td->td_mpcount == 0, ("badmpcount syscall from %p", (void *)frame.tf_eip));
	get_mplock();
#endif
	/*
	 * access non-atomic field from critical section.  p_sticks is
	 * updated by the clock interrupt.  Also use this opportunity
	 * to lazy-raise our LWKT priority.
	 */
	userenter(td);
	sticks = td->td_sticks;

	p->p_md.md_regs = &frame;
	orig_tf_eflags = frame.tf_eflags;
	result = 0;

	/*
	 * Handle the waitport/waitmsg/checkport/checkmsg case
	 *
	 * YYY MOVE THIS TO INT 0x82!  We don't really need to combine it
	 * with sendsys().
	 */
	if ((msgsize = frame.tf_edx) <= 0) {
		if (frame.tf_ecx) {
			printf("waitmsg/checkmsg not yet supported: %08x\n",
				frame.tf_ecx);
			error = ENOTSUP;
			goto bad2;
		}
		if (frame.tf_eax) {
			printf("waitport/checkport only the default port is supported at the moment\n");
			error = ENOTSUP;
			goto bad2;
		}
		switch(msgsize) {
		case 0:
			/*
			 * Wait on port for message
			 */
			sysun = lwkt_getport(&td->td_msgport);
			/* XXX block */
			break;
		case -1:
			/*
			 * Test port for message
			 */
			sysun = lwkt_getport(&td->td_msgport);
			break;
		default:
			error = ENOSYS;
			goto bad2;
		}
		if (sysun) {
			gd = td->td_gd;
			umsg = sysun->lmsg.opaque.ms_umsg;
			frame.tf_eax = (register_t)umsg;
			if (sysun->sysmsg_copyout)
				sysun->sysmsg_copyout(sysun);
			atomic_add_int_nonlocked(&td->td_msgport.mp_refs, -1);
			sysun->nosys.usrmsg.umsg.u.ms_fds[0] = sysun->lmsg.u.ms_fds[0];
			sysun->nosys.usrmsg.umsg.u.ms_fds[1] = sysun->lmsg.u.ms_fds[1];
			sysun->nosys.usrmsg.umsg.ms_error = sysun->lmsg.ms_error;
			error = sysun->lmsg.ms_error;
			result = sysun->lmsg.u.ms_fds[0]; /* for ktrace */
			if (error != 0 || code != SYS_execve) {
				error = copyout(
					    &sysun->nosys.usrmsg.umsg.ms_copyout_start,
					    &umsg->ms_copyout_start,
					    ms_copyout_size);
			}
			crit_enter_quick(td);
			sysun->lmsg.opaque.ms_sysunnext = gd->gd_freesysun;
			gd->gd_freesysun = sysun;
			crit_exit_quick(td);
		} else {
			frame.tf_eax = 0;
		}
		frame.tf_edx = 0;
		code = 0;
		error = 0;
		goto good;
	}

	/*
	 * Extract the system call message.  If msgsize is zero we are
	 * blocking on a message and/or message port.  If msgsize is -1
	 * we are testing a message for completion or a message port for
	 * activity.
	 *
	 * The userland system call message size includes the size of the
	 * userland lwkt_msg plus arguments.  We load it into the userland
	 * portion of our sysunion structure then we initialize the kerneland
	 * portion and go.
	 */

	/*
	 * Bad message size
	 */
	if (msgsize < sizeof(struct lwkt_msg) ||
	    msgsize > sizeof(union sysunion) - sizeof(struct sysmsg)
	) {
		error = ENOSYS;
		goto bad2;
	}
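
	/*
	 * For a system call with N register-sized arguments the expected
	 * size is therefore:
	 *
	 *	msgsize = sizeof(struct lwkt_msg) + N * sizeof(register_t)
	 *
	 * and narg is recovered from msgsize the same way further below.
	 */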

	/*
	 * Obtain a sysun from our per-cpu cache or allocate a new one.  Use
	 * the opaque field to store the original (user) message pointer.
	 * A critical section is necessary to interlock against interrupts
	 * returning system messages to the thread cache.
	 */
	gd = td->td_gd;
	crit_enter_quick(td);
	if ((sysun = gd->gd_freesysun) != NULL) {
		gd->gd_freesysun = sysun->lmsg.opaque.ms_sysunnext;
		crit_exit_quick(td);
	} else {
		crit_exit_quick(td);
		sysun = malloc(sizeof(union sysunion), M_SYSMSG, M_WAITOK);
	}
	atomic_add_int_nonlocked(&td->td_msgport.mp_refs, 1);

	/*
	 * Copy the user request into the kernel copy of the user request.
	 */
	umsg = (void *)frame.tf_ecx;
	error = copyin(umsg, &sysun->nosys.usrmsg, msgsize);
	if (error)
		goto bad1;
	if ((sysun->nosys.usrmsg.umsg.ms_flags & MSGF_ASYNC) &&
	    (error = suser(td)) != 0
	) {
		goto bad1;
	}

	/*
	 * Initialize the kernel message from the copied-in data and
	 * pull in appropriate flags from the userland message.
	 */
	lwkt_initmsg(&sysun->lmsg, &td->td_msgport, 0,
			sysun->nosys.usrmsg.umsg.ms_cmd,
			lwkt_cmd_op_none);
	sysun->sysmsg_copyout = NULL;
	sysun->lmsg.opaque.ms_umsg = umsg;
	sysun->lmsg.ms_flags |= sysun->nosys.usrmsg.umsg.ms_flags & MSGF_ASYNC;

	/*
	 * Extract the system call number, lookup the system call, and
	 * set the default return value.
	 */
	code = (u_int)sysun->lmsg.ms_cmd.cm_op;
	if (code >= p->p_sysent->sv_size) {
		error = ENOSYS;
		goto bad1;
	}

	callp = &p->p_sysent->sv_table[code];

	narg = (msgsize - sizeof(struct lwkt_msg)) / sizeof(register_t);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSCALL)) {
		ktrsyscall(p->p_tracep, code, narg, (void *)(&sysun->nosys.usrmsg + 1));
	}
#endif
	sysun->lmsg.u.ms_fds[0] = 0;
	sysun->lmsg.u.ms_fds[1] = 0;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	/*
	 * Make the system call.  An error code is always returned, results
	 * are copied back via ms_result32 or ms_result64.  YYY temporary
	 * stage copy p_retval[] into ms_result32/64
	 *
	 * NOTE!  XXX if this is a child returning from a fork curproc
	 * might be different.  YYY huh? a child returning from a fork
	 * should never 'return' from this call, it should go right to the
	 * fork_trampoline function.
	 */
	error = (*callp->sy_call)(sysun);
	gd = td->td_gd;	/* RELOAD, might have switched cpus */

bad1:
	/*
	 * If a synchronous return copy p_retval to ms_result64 and return
	 * the sysmsg to the free pool.
	 *
	 * YYY Don't writeback message if execve() YYY
	 */
	if (error != EASYNC) {
		atomic_add_int_nonlocked(&td->td_msgport.mp_refs, -1);
		sysun->nosys.usrmsg.umsg.u.ms_fds[0] = sysun->lmsg.u.ms_fds[0];
		sysun->nosys.usrmsg.umsg.u.ms_fds[1] = sysun->lmsg.u.ms_fds[1];
		result = sysun->nosys.usrmsg.umsg.u.ms_fds[0]; /* for ktrace */
		if (error != 0 || code != SYS_execve) {
			int error2;
			error2 = copyout(&sysun->nosys.usrmsg.umsg.ms_copyout_start,
					&umsg->ms_copyout_start,
					ms_copyout_size);
			if (error == 0)
				error = error2;
		}
		crit_enter_quick(td);
		sysun->lmsg.opaque.ms_sysunnext = gd->gd_freesysun;
		gd->gd_freesysun = sysun;
		crit_exit_quick(td);
	}
bad2:
	frame.tf_eax = error;
good:

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET)) {
		ktrsysret(p->p_tracep, code, error, result);
	}
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

	userexit(p);
#ifdef SMP
	/*
	 * Release the MP lock if we had to get it
	 */
	KASSERT(td->td_mpcount == 1, ("badmpcount syscall from %p", (void *)frame.tf_eip));
	rel_mplock();
#endif
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  MP lock is held on entry and should be
 * released on return.  This code will return back into the fork
 * trampoline code which then runs doreti.
 */
void
fork_return(p, frame)
	struct proc *p;
	struct trapframe frame;
{
	frame.tf_eax = 0;		/* Child returns zero */
	frame.tf_eflags &= ~PSL_C;	/* success */
	frame.tf_edx = 1;
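	/*
	 * (The eax/edx pair follows the historic BSD i386 fork() return
	 * convention: the userland fork stub examines edx to distinguish
	 * the child (edx == 1) from the parent (edx == 0).)
	 */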

	userret(p, &frame, 0);
#ifdef KTRACE
	if (KTRPOINT(p->p_thread, KTR_SYSRET))
		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
#endif
	p->p_flag |= P_PASSIVE_ACQ;
	userexit(p);
	p->p_flag &= ~P_PASSIVE_ACQ;
#ifdef SMP
	KKASSERT(p->p_thread->td_mpcount == 1);
	rel_mplock();
#endif
}