From 93ad6da25762a0c741b5f653bdd58afe310c4899 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 11 Oct 2012 12:12:25 -0700 Subject: [PATCH] kernel - Make pcb_onfault more robust. * Record the expected kernel stack pointer along with the pcb_onfault action. Adjust the trap code to only take the action if the frame's stack pointer matches the recorded expected stack pointer. Otherwise this might be a recursive trap and we definitely do NOT want to execute the on-fault stuff in that situation. * Prior to these changes recursive traps during uiomove()s could result in a kernel stack so corrupt that finding the actual cause of the panic becomes impossible. This is believed to be making life difficult for us trying to track down a particular i386 panic. * On x86-64 we had to increase the size of the pcb structure. kgdb on kernel cores and live kernels will be effected (needs recompile). --- sys/kern/lwkt_ipiq.c | 1 + sys/platform/pc32/i386/bcopy.s | 13 ++++++++----- sys/platform/pc32/i386/genassym.c | 1 + sys/platform/pc32/i386/support.s | 17 ++++++++++++++++- sys/platform/pc32/i386/trap.c | 19 +++++++++++++++++-- sys/platform/pc32/i386/vm_machdep.c | 5 +++-- sys/platform/pc32/include/pcb.h | 2 +- sys/platform/pc64/include/pcb.h | 10 ++++++---- sys/platform/pc64/x86_64/genassym.c | 1 + sys/platform/pc64/x86_64/support.s | 15 ++++++++++++++- sys/platform/pc64/x86_64/trap.c | 26 +++++++++++++++++++++----- sys/platform/pc64/x86_64/vm_machdep.c | 3 ++- 12 files changed, 91 insertions(+), 22 deletions(-) diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c index e8b326ead2..767c3d3817 100644 --- a/sys/kern/lwkt_ipiq.c +++ b/sys/kern/lwkt_ipiq.c @@ -888,6 +888,7 @@ lwkt_cpusync_remote2(lwkt_cpusync_t cs) if (cs->cs_func) cs->cs_func(cs->cs_data); atomic_set_cpumask(&cs->cs_mack, gd->gd_cpumask); + /* cs can be ripped out at this point */ } else { lwkt_ipiq_t ip; int wi; diff --git a/sys/platform/pc32/i386/bcopy.s b/sys/platform/pc32/i386/bcopy.s index 37c792cab3..b164ddb348 100644 --- a/sys/platform/pc32/i386/bcopy.s +++ b/sys/platform/pc32/i386/bcopy.s @@ -116,19 +116,19 @@ generic_onfault: /* * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY * - * Reasonably optimal on all modern machines. + * Reasonably optimal on all modern machines. */ SUPERALIGN_TEXT ENTRY(asm_generic_memcpy) /* memcpy() entry point use optimal copy */ - pushl %ebx - pushl $generic_onfault + pushl %ebx /* WARNING COPYIN/OUT EXPECTS THIS FRAME */ + pushl $generic_onfault/* WARNING COPYIN/OUT EXPECTS THIS FRAME */ jmp 2f SUPERALIGN_TEXT ENTRY(asm_generic_bcopy) - pushl %ebx - pushl $generic_onfault + pushl %ebx /* WARNING COPYIN/OUT EXPECTS THIS FRAME */ + pushl $generic_onfault/* WARNING COPYIN/OUT EXPECTS THIS FRAME */ cmpl %esi,%edi /* if (edi < esi) fwd copy ok */ jb 2f addl %ecx,%esi @@ -286,6 +286,9 @@ ENTRY(asm_generic_bcopy) * * MMX+XMM (SSE2): Typical on Athlons, later P4s. 128 bit media insn. * MMX: Typical on XPs and P3s. 64 bit media insn. + * + * WARNING! copyin/copyout expects a push %ebx/onfault frame ONLY, + * we can't mess with the stack any more than that. */ #define MMX_SAVE_BLOCK(missfunc) \ diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 84e9f151f4..53d1b26170 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -152,6 +152,7 @@ ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +ASSYM(PCB_ONFAULT_SP, offsetof(struct pcb, pcb_onfault_sp)); ASSYM(PCB_SIZE, sizeof(struct pcb)); diff --git a/sys/platform/pc32/i386/support.s b/sys/platform/pc32/i386/support.s index 1eed1c8863..cbb2043763 100644 --- a/sys/platform/pc32/i386/support.s +++ b/sys/platform/pc32/i386/support.s @@ -169,6 +169,9 @@ ENTRY(copyout) pushl %ebx pushl $copyout_fault2 movl $stack_onfault,PCB_ONFAULT(%eax) + movl %esp,PCB_ONFAULT_SP(%eax) + subl $12,PCB_ONFAULT_SP(%eax) /* call,ebx,stackedfault */ + /* for *memcpy_vector */ movl 4+16(%esp),%esi movl 8+16(%esp),%edi movl 12+16(%esp),%ebx @@ -241,6 +244,9 @@ ENTRY(copyin) pushl %edi pushl $copyin_fault2 movl $stack_onfault,PCB_ONFAULT(%eax) + movl %esp,PCB_ONFAULT_SP(%eax) + subl $12,PCB_ONFAULT_SP(%eax) /* call,ebx,stackedfault */ + /* for *memcpy_vector */ movl 4+12(%esp),%esi /* caddr_t from */ movl 8+12(%esp),%edi /* caddr_t to */ movl 12+12(%esp),%ecx /* size_t len */ @@ -294,6 +300,7 @@ ENTRY(casuword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx /* dst */ movl 8(%esp),%eax /* old */ movl 12(%esp),%ecx /* new */ @@ -314,7 +321,6 @@ ENTRY(casuword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) movl $0,PCB_ONFAULT(%ecx) ret END(casuword) @@ -328,6 +334,7 @@ ENTRY(fuword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAX_USER_ADDRESS-4,%edx /* verify address is valid */ @@ -344,6 +351,7 @@ ENTRY(fusword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-2,%edx @@ -360,6 +368,7 @@ ENTRY(fubyte) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-1,%edx @@ -387,6 +396,7 @@ ENTRY(suword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-4,%edx /* verify address validity */ @@ -407,6 +417,7 @@ ENTRY(suword32) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-4,%edx /* verify address validity */ @@ -427,6 +438,7 @@ ENTRY(susword) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-2,%edx /* verify address validity */ @@ -447,6 +459,7 @@ ENTRY(subyte) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $fusufault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%edx cmpl $VM_MAX_USER_ADDRESS-1,%edx /* verify address validity */ @@ -474,6 +487,7 @@ ENTRY(copyinstr) movl PCPU(curthread),%ecx movl TD_PCB(%ecx),%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ @@ -760,6 +774,7 @@ ENTRY(rdmsr_safe) movl PCPU(curthread),%ecx movl TD_PCB(%ecx), %ecx movl $msr_onfault,PCB_ONFAULT(%ecx) + movl %esp,PCB_ONFAULT_SP(%ecx) movl 4(%esp),%ecx rdmsr diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index 2ecf1d11f9..5e429d81e0 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -770,7 +770,13 @@ kernel_trap: doreti_popl_fs_fault); MAYBE_DORETI_FAULT(doreti_popl_gs, doreti_popl_gs_fault); - if (td->td_pcb->pcb_onfault) { + + /* + * NOTE: cpu doesn't push esp on kernel trap + */ + if (td->td_pcb->pcb_onfault && + td->td_pcb->pcb_onfault_sp == + (int)&frame->tf_esp) { frame->tf_eip = (register_t)td->td_pcb->pcb_onfault; goto out2; @@ -1022,11 +1028,20 @@ trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva) return (0); nogo: if (!usermode) { + /* + * NOTE: cpu doesn't push esp on kernel trap + */ if (td->td_gd->gd_intr_nesting_level == 0 && - td->td_pcb->pcb_onfault) { + td->td_pcb->pcb_onfault && + td->td_pcb->pcb_onfault_sp == (int)&frame->tf_esp) { frame->tf_eip = (register_t)td->td_pcb->pcb_onfault; return (0); } + if (td->td_gd->gd_intr_nesting_level == 0 && + td->td_pcb->pcb_onfault) { + kprintf("ESP mismatch %p %08x\n", + &frame->tf_esp, td->td_pcb->pcb_onfault_sp); + } trap_fatal(frame, eva); return (-1); } diff --git a/sys/platform/pc32/i386/vm_machdep.c b/sys/platform/pc32/i386/vm_machdep.c index 98b358310c..4e34100990 100644 --- a/sys/platform/pc32/i386/vm_machdep.c +++ b/sys/platform/pc32/i386/vm_machdep.c @@ -171,8 +171,9 @@ cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags) /* * pcb2->pcb_ldt: duplicated below, if necessary. * pcb2->pcb_savefpu: cloned above. - * pcb2->pcb_flags: cloned above (always 0 here?). - * pcb2->pcb_onfault: cloned above (always NULL here?). + * pcb2->pcb_flags: cloned above (always 0 here). + * pcb2->pcb_onfault: cloned above (always NULL here). + * pcb2->pcb_onfault_sp:cloned above (don't care) */ /* diff --git a/sys/platform/pc32/include/pcb.h b/sys/platform/pc32/include/pcb.h index c0aabf1d07..09cac9072d 100644 --- a/sys/platform/pc32/include/pcb.h +++ b/sys/platform/pc32/include/pcb.h @@ -69,7 +69,7 @@ struct pcb { #define PCB_DBREGS 0x02 /* process using debug registers */ #define FP_VIRTFP 0x04 /* virtual kernel wants exception */ caddr_t pcb_onfault; /* copyin/out fault recovery */ - int pcb_unused; + int pcb_onfault_sp; /* validate onfault state */ struct pcb_ext *pcb_ext; /* optional pcb extension */ u_long __pcb_spare[3]; /* adjust to avoid core dump size changes */ }; diff --git a/sys/platform/pc64/include/pcb.h b/sys/platform/pc64/include/pcb.h index 71c135a33d..0369ea6cc4 100644 --- a/sys/platform/pc64/include/pcb.h +++ b/sys/platform/pc64/include/pcb.h @@ -78,14 +78,16 @@ struct pcb { struct pcb_ldt *pcb_ldt; union savefpu pcb_save; + caddr_t pcb_onfault; /* copyin/out fault recovery */ + register_t pcb_onfault_sp; /* validate onfault */ + register_t pcb_unused[4]; /* kgdb compatibility expansion */ + struct pcb_ext *pcb_ext; /* optional pcb extension */ +}; + #define PCB_DBREGS 0x02 /* process using debug registers */ #define PCB_FPUINITDONE 0x08 /* fpu state is initialized */ #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define FP_VIRTFP 0x04 /* virtual kernel wants exception */ - caddr_t pcb_onfault; /* copyin/out fault recovery */ - int pcb_unused; - struct pcb_ext *pcb_ext; /* optional pcb extension */ -}; #ifdef _KERNEL void savectx(struct pcb *); diff --git a/sys/platform/pc64/x86_64/genassym.c b/sys/platform/pc64/x86_64/genassym.c index 7324901ef8..13272f28d9 100644 --- a/sys/platform/pc64/x86_64/genassym.c +++ b/sys/platform/pc64/x86_64/genassym.c @@ -135,6 +135,7 @@ ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +ASSYM(PCB_ONFAULT_SP, offsetof(struct pcb, pcb_onfault_sp)); ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase)); ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); diff --git a/sys/platform/pc64/x86_64/support.s b/sys/platform/pc64/x86_64/support.s index f155036912..b15e81d2f8 100644 --- a/sys/platform/pc64/x86_64/support.s +++ b/sys/platform/pc64/x86_64/support.s @@ -218,6 +218,7 @@ ENTRY(copyout) movq PCPU(curthread),%rax movq TD_PCB(%rax), %rax movq $copyout_fault,PCB_ONFAULT(%rax) + movq %rsp,PCB_ONFAULT_SP(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyout @@ -280,6 +281,7 @@ ENTRY(copyin) movq PCPU(curthread),%rax movq TD_PCB(%rax), %rax movq $copyin_fault,PCB_ONFAULT(%rax) + movq %rsp,PCB_ONFAULT_SP(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyin @@ -328,6 +330,7 @@ ENTRY(casuword32) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ @@ -358,6 +361,7 @@ ENTRY(casuword) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ @@ -377,7 +381,6 @@ ENTRY(casuword) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx - movq $fusufault,PCB_ONFAULT(%rcx) movq $0,PCB_ONFAULT(%rcx) ret @@ -392,6 +395,7 @@ ENTRY(fuword) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ @@ -405,6 +409,7 @@ ENTRY(fuword32) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ @@ -430,6 +435,7 @@ ENTRY(fuword16) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-2,%rax cmpq %rax,%rdi @@ -443,6 +449,7 @@ ENTRY(fubyte) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-1,%rax cmpq %rax,%rdi @@ -474,6 +481,7 @@ ENTRY(suword) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ @@ -493,6 +501,7 @@ ENTRY(suword32) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ @@ -509,6 +518,7 @@ ENTRY(suword16) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ @@ -525,6 +535,7 @@ ENTRY(subyte) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ @@ -554,6 +565,7 @@ ENTRY(copyinstr) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $cpystrflt,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) movq $VM_MAX_USER_ADDRESS,%rax @@ -711,6 +723,7 @@ ENTRY(rdmsr_safe) movq PCPU(curthread),%r8 movq TD_PCB(%r8), %r8 movq $msr_onfault,PCB_ONFAULT(%r8) + movq %rsp,PCB_ONFAULT_SP(%r8) movl %edi,%ecx rdmsr /* Read MSR pointed by %ecx. Returns hi byte in edx, lo in %eax */ diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index 786b4807b0..0e373cfd9b 100644 --- a/sys/platform/pc64/x86_64/trap.c +++ b/sys/platform/pc64/x86_64/trap.c @@ -653,7 +653,13 @@ trap(struct trapframe *frame) * them. */ if (mycpu->gd_intr_nesting_level == 0) { - if (td->td_pcb->pcb_onfault) { + /* + * NOTE: in 64-bit mode traps push rsp/ss + * even if no ring change occurs. + */ + if (td->td_pcb->pcb_onfault && + td->td_pcb->pcb_onfault_sp == + frame->tf_rsp) { frame->tf_rip = (register_t) td->td_pcb->pcb_onfault; goto out2; @@ -857,7 +863,8 @@ trap_pfault(struct trapframe *frame, int usermode) * Debugging, try to catch kernel faults on the user address space when not inside * on onfault (e.g. copyin/copyout) routine. */ - if (usermode == 0 && (td->td_pcb == NULL || td->td_pcb->pcb_onfault == NULL)) { + if (usermode == 0 && (td->td_pcb == NULL || + td->td_pcb->pcb_onfault == NULL)) { #ifdef DDB if (freeze_on_seg_fault) { kprintf("trap_pfault: user address fault from kernel mode " @@ -915,8 +922,13 @@ trap_pfault(struct trapframe *frame, int usermode) return (0); nogo: if (!usermode) { - if (td->td_gd->gd_intr_nesting_level == 0 && - td->td_pcb->pcb_onfault) { + /* + * NOTE: in 64-bit mode traps push rsp/ss + * even if no ring change occurs. + */ + if (td->td_pcb->pcb_onfault && + td->td_pcb->pcb_onfault_sp == frame->tf_rsp && + td->td_gd->gd_intr_nesting_level == 0) { frame->tf_rip = (register_t)td->td_pcb->pcb_onfault; return (0); } @@ -992,8 +1004,12 @@ trap_fatal(struct trapframe *frame, vm_offset_t eva) ss = frame->tf_ss & 0xffff; rsp = frame->tf_rsp; } else { + /* + * NOTE: in 64-bit mode traps push rsp/ss even if no ring + * change occurs. + */ ss = GSEL(GDATA_SEL, SEL_KPL); - rsp = (long)&frame->tf_rsp; + rsp = frame->tf_rsp; } kprintf("stack pointer = 0x%x:0x%lx\n", ss, rsp); kprintf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); diff --git a/sys/platform/pc64/x86_64/vm_machdep.c b/sys/platform/pc64/x86_64/vm_machdep.c index c41d295ff2..5fb3e64cfb 100644 --- a/sys/platform/pc64/x86_64/vm_machdep.c +++ b/sys/platform/pc64/x86_64/vm_machdep.c @@ -158,7 +158,8 @@ cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags) * pcb2->pcb_ldt: duplicated below, if necessary. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above (always 0 here?). - * pcb2->pcb_onfault: cloned above (always NULL here?). + * pcb2->pcb_onfault: cloned above (always NULL here). + * pcb2->pcb_onfault_sp:cloned above (dont care) */ /* -- 2.41.0