From: Matthew Dillon Date: Thu, 26 Jan 2017 19:35:47 +0000 (-0800) Subject: kernel - Refactor suword, fuword, etc. change vmm_guest_sync_addr() X-Git-Tag: v4.8.0rc~151 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/5947157eb0effa08048af245e271b7441b862ce3 kernel - Refactor suword, fuword, etc. change vmm_guest_sync_addr() * Rename the entire family of functions to reduce confusion. * Change how vmm_guest_sync_addr() works. Instead of loading one value into a target location we exchange the two target locations, with the first address using an atomic op. This will allow the vkernel to drop privs and query pte state atomically. --- diff --git a/sys/bus/cam/scsi/scsi_target.c b/sys/bus/cam/scsi/scsi_target.c index 3fa428989c..3ebb48d061 100644 --- a/sys/bus/cam/scsi/scsi_target.c +++ b/sys/bus/cam/scsi/scsi_target.c @@ -562,12 +562,12 @@ targwrite(struct dev_write_args *ap) ("write - uiomove failed (%d)\n", error)); break; } - priority = fuword(&user_ccb->ccb_h.pinfo.priority); + priority = fuword32(&user_ccb->ccb_h.pinfo.priority); if (priority == -1) { error = EINVAL; break; } - func_code = fuword(&user_ccb->ccb_h.func_code); + func_code = fuword32(&user_ccb->ccb_h.func_code); switch (func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: @@ -661,8 +661,8 @@ targstart(struct cam_periph *periph, union ccb *start_ccb) xpt_print(periph->path, "targsendccb failed, err %d\n", error); xpt_release_ccb(start_ccb); - suword(&descr->user_ccb->ccb_h.status, - CAM_REQ_CMP_ERR); + suword32(&descr->user_ccb->ccb_h.status, + CAM_REQ_CMP_ERR); crit_enter(); TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe); crit_exit(); @@ -691,10 +691,10 @@ targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr, * preserved, the rest we get from the user ccb. (See xpt_merge_ccb) */ xpt_setup_ccb(k_ccbh, softc->path, descr->priority); - k_ccbh->retry_count = fuword(&u_ccbh->retry_count); + k_ccbh->retry_count = fuword32(&u_ccbh->retry_count); k_ccbh->func_code = descr->func_code; - k_ccbh->flags = fuword(&u_ccbh->flags); - k_ccbh->timeout = fuword(&u_ccbh->timeout); + k_ccbh->flags = fuword32(&u_ccbh->flags); + k_ccbh->timeout = fuword32(&u_ccbh->timeout); ccb_len = targccblen(k_ccbh->func_code) - sizeof(struct ccb_hdr); error = copyin(u_ccbh + 1, k_ccbh + 1, ccb_len); if (error != 0) { @@ -912,7 +912,7 @@ targread(struct dev_read_args *ap) CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread aborted descr %p (%p)\n", user_descr, user_ccb)); - suword(&user_ccb->ccb_h.status, CAM_REQ_ABORTED); + suword32(&user_ccb->ccb_h.status, CAM_REQ_ABORTED); cam_periph_unlock(softc->periph); error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); cam_periph_lock(softc->periph); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index aeee27d0a0..4f6412dbc8 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -907,7 +907,7 @@ __elfN(dragonfly_fixup)(register_t **stack_base, struct image_params *imgp) imgp->auxargs = NULL; base--; - suword(base, (long)imgp->args->argc); + suword64(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index fe0e824279..b79901e301 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -644,9 +644,12 @@ start_init(void *dummy, struct trapframe *frame) * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); - (void)suword((caddr_t)--uap, (long)0); /* terminator */ - (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); - (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); + + /* terminator */ + suword64((uint64_t *)(caddr_t)--uap, (uint64_t)0); + + suword64((uint64_t *)(caddr_t)--uap, (uint64_t)(intptr_t)arg1); + suword64((uint64_t *)(caddr_t)--uap, (uint64_t)(intptr_t)arg0); /* * Point at the arguments. diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index ca81edddbe..7e68a2dc7e 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -358,7 +358,7 @@ interpret: if (p->p_sysent->sv_fixup && imgp->resident == 0) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else - suword(--stack_base, imgp->args->argc); + suword64(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot @@ -901,7 +901,8 @@ exec_copyin_args(struct image_args *args, char *fname, if (argv == NULL) error = EFAULT; if (error == 0) { - while ((argp = (caddr_t)(intptr_t)fuword(argv++)) != NULL) { + while ((argp = (caddr_t)(intptr_t) + fuword64((uintptr_t *)argv++)) != NULL) { if (argp == (caddr_t)-1) { error = EFAULT; break; @@ -936,7 +937,8 @@ exec_copyin_args(struct image_args *args, char *fname, * extract environment strings. envv may be NULL. */ if (envv && error == 0) { - while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { + while ((envp = (caddr_t)(intptr_t) + fuword64((uintptr_t *)envv++))) { if (envp == (caddr_t) -1) { error = EFAULT; break; @@ -1068,37 +1070,37 @@ exec_copyout_strings(struct image_params *imgp) /* * Fill in "ps_strings" struct for ps, w, etc. */ - suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); - suword32(&arginfo->ps_nargvstr, argc); + suword64((void *)&arginfo->ps_argvstr, (uint64_t)(intptr_t)vectp); + suword32((void *)&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { - suword(vectp++, (long)(intptr_t)destp); + suword64((void *)vectp++, (uintptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ - suword(vectp++, 0); + suword64((void *)vectp++, 0); - suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); - suword32(&arginfo->ps_nenvstr, envc); + suword64((void *)&arginfo->ps_envstr, (uintptr_t)vectp); + suword32((void *)&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { - suword(vectp++, (long)(intptr_t)destp); + suword64((void *)vectp++, (uintptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ - suword(vectp, 0); + suword64((void *)vectp, 0); return (stack_base); } diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c index 8eb3650c32..923eaaada5 100644 --- a/sys/kern/subr_prof.c +++ b/sys/kern/subr_prof.c @@ -382,13 +382,7 @@ sys_profil(struct profil_args *uap) /* * Collect user-level profiling statistics; called on a profiling tick, * when a process is running in user-mode. This routine may be called - * from an interrupt context. We try to update the user profiling buffers - * cheaply with fuswintr() and suswintr(). If that fails, we revert to - * an AST that will vector us to trap() with a context in which copyin - * and copyout will work. Trap will then call addupc_task(). - * - * XXX fuswintr() and suswintr() never worked (always returnde -1), remove - * them. It's just a bad idea to try to do this from a hard interrupt. + * from an interrupt context. * * Note that we may (rarely) not get around to the AST soon enough, and * lose profile ticks when the next tick overwrites this one, but in this diff --git a/sys/kern/sys_vmm.c b/sys/kern/sys_vmm.c index bdc6b3bf84..f87fd053d5 100644 --- a/sys/kern/sys_vmm.c +++ b/sys/kern/sys_vmm.c @@ -132,6 +132,13 @@ vmm_exit_vmm(void *dummy __unused) { } +/* + * Swap the 64 bit value between *dstaddr and *srcaddr in a pmap-safe manner. + * + * v = *srcaddr + * v = swap(dstaddr, v) + * *dstaddr = v + */ int sys_vmm_guest_sync_addr(struct vmm_guest_sync_addr_args *uap) { @@ -139,7 +146,6 @@ sys_vmm_guest_sync_addr(struct vmm_guest_sync_addr_args *uap) cpulock_t olock; cpulock_t nlock; cpumask_t mask; - long val; struct proc *p = curproc; if (p->p_vmm == NULL) @@ -195,9 +201,10 @@ sys_vmm_guest_sync_addr(struct vmm_guest_sync_addr_args *uap) /* * Make the requested modification, wakeup any waiters. */ - if (uap->srcaddr) { - copyin(uap->srcaddr, &val, sizeof(long)); - copyout(&val, uap->dstaddr, sizeof(long)); + if (uap->dstaddr) { + long v = fuword64(uap->srcaddr); + v = swapu64(uap->dstaddr, v); + suword64(uap->srcaddr, v); } /* diff --git a/sys/net/sppp/if_spppsubr.c b/sys/net/sppp/if_spppsubr.c index 37f454ac1b..213d8f42ce 100644 --- a/sys/net/sppp/if_spppsubr.c +++ b/sys/net/sppp/if_spppsubr.c @@ -4982,7 +4982,7 @@ sppp_params(struct sppp *sp, u_long cmd, void *data) * Check the cmd word first before attempting to fetch all the * data. */ - if ((subcmd = fuword(ifr->ifr_data)) == -1) { + if ((subcmd = fuword64(ifr->ifr_data)) == -1) { rv = EFAULT; goto quit; } diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h index 97da388f48..d7ed96574d 100644 --- a/sys/platform/pc64/include/pmap.h +++ b/sys/platform/pc64/include/pmap.h @@ -296,11 +296,12 @@ struct pmap { int (*copyinstr)(const void *, void *, size_t, size_t *); int (*copyin)(const void *, void *, size_t); int (*copyout)(const void *, void *, size_t); - int (*fubyte)(const void *); - int (*subyte)(void *, int); - long (*fuword)(const void *); - int (*suword)(void *, long); - int (*suword32)(void *, int); + int (*fubyte)(const uint8_t *); /* returns int for -1 err */ + int (*subyte)(uint8_t *, uint8_t); + int32_t (*fuword32)(const uint32_t *); + int64_t (*fuword64)(const uint64_t *); + int (*suword64)(uint64_t *, uint64_t); + int (*suword32)(uint32_t *, int); }; #define PMAP_FLAG_SIMPLE 0x00000001 diff --git a/sys/platform/pc64/vmm/ept.c b/sys/platform/pc64/vmm/ept.c index 214b67bfd7..7e1cd4eef0 100644 --- a/sys/platform/pc64/vmm/ept.c +++ b/sys/platform/pc64/vmm/ept.c @@ -283,9 +283,9 @@ ept_copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *res) static int -ept_fubyte(const void *base) +ept_fubyte(const uint8_t *base) { - unsigned char c = 0; + uint8_t c = 0; if (ept_copyin(base, &c, 1) == 0) return((int)c); @@ -293,7 +293,7 @@ ept_fubyte(const void *base) } static int -ept_subyte(void *base, int byte) +ept_subyte(uint8_t *base, uint8_t byte) { unsigned char c = byte; @@ -302,10 +302,20 @@ ept_subyte(void *base, int byte) return(-1); } -static long -ept_fuword(const void *base) +static int32_t +ept_fuword32(const uint32_t *base) { - long v; + uint32_t v; + + if (ept_copyin(base, &v, sizeof(v)) == 0) + return(v); + return(-1); +} + +static int64_t +ept_fuword64(const uint64_t *base) +{ + uint64_t v; if (ept_copyin(base, &v, sizeof(v)) == 0) return(v); @@ -313,7 +323,7 @@ ept_fuword(const void *base) } static int -ept_suword(void *base, long word) +ept_suword64(uint64_t *base, uint64_t word) { if (ept_copyout(&word, base, sizeof(word)) == 0) return(0); @@ -321,7 +331,7 @@ ept_suword(void *base, long word) } static int -ept_suword32(void *base, int word) +ept_suword32(uint32_t *base, int word) { if (ept_copyout(&word, base, sizeof(word)) == 0) return(0); @@ -344,7 +354,8 @@ vmx_ept_pmap_pinit(pmap_t pmap) pmap->copyout = ept_copyout; pmap->fubyte = ept_fubyte; pmap->subyte = ept_subyte; - pmap->fuword = ept_fuword; - pmap->suword = ept_suword; + pmap->fuword32 = ept_fuword32; + pmap->fuword64 = ept_fuword64; + pmap->suword64 = ept_suword64; pmap->suword32 = ept_suword32; } diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index 327eeb2d12..b51e55d279 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -257,11 +257,12 @@ extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len, size_t *lencopied); extern int std_copyin (const void *udaddr, void *kaddr, size_t len); extern int std_copyout (const void *kaddr, void *udaddr, size_t len); -extern int std_fubyte (const void *base); -extern int std_subyte (void *base, int byte); -extern long std_fuword (const void *base); -extern int std_suword (void *base, long word); -extern int std_suword32 (void *base, int word); +extern int std_fubyte (const uint8_t *base); +extern int std_subyte (uint8_t *base, uint8_t byte); +extern int32_t std_fuword32 (const uint32_t *base); +extern int64_t std_fuword64 (const uint64_t *base); +extern int std_suword64 (uint64_t *base, uint64_t word); +extern int std_suword32 (uint32_t *base, int word); static void pv_hold(pv_entry_t pv); static int _pv_hold_try(pv_entry_t pv @@ -1758,8 +1759,9 @@ pmap_pinit_defaults(struct pmap *pmap) pmap->copyout = std_copyout; pmap->fubyte = std_fubyte; pmap->subyte = std_subyte; - pmap->fuword = std_fuword; - pmap->suword = std_suword; + pmap->fuword32 = std_fuword32; + pmap->fuword64 = std_fuword64; + pmap->suword64 = std_suword64; pmap->suword32 = std_suword32; } /* diff --git a/sys/platform/pc64/x86_64/support.s b/sys/platform/pc64/x86_64/support.s index 109688b6ad..9877e62d79 100644 --- a/sys/platform/pc64/x86_64/support.s +++ b/sys/platform/pc64/x86_64/support.s @@ -308,10 +308,10 @@ copyin_fault: ret /* - * casuword32. Compare and set user integer. Returns -1 or the current value. - * dst = %rdi, old = %rsi, new = %rdx + * casu32 - Compare and set user integer. Returns -1 or the current value. + * dst = %rdi, old = %rsi, new = %rdx */ -ENTRY(casuword32) +ENTRY(casu32) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) @@ -337,10 +337,37 @@ ENTRY(casuword32) ret /* - * casuword. Compare and set user word. Returns -1 or the current value. - * dst = %rdi, old = %rsi, new = %rdx + * swapu32 - Swap int in user space. ptr = %rdi, val = %rsi */ -ENTRY(casuword) +ENTRY(swapu32) + movq PCPU(curthread),%rcx + movq TD_PCB(%rcx), %rcx + movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) + + movq $VM_MAX_USER_ADDRESS-8,%rax + cmpq %rax,%rdi /* verify address is valid */ + ja fusufault + + movq %rsi,%rax /* old */ + xchgl %eax,(%rdi) + + /* + * The old value is in %rax. If the store succeeded it will be the + * value we expected (old) from before the store, otherwise it will + * be the current value. + */ + + movq PCPU(curthread),%rcx + movq TD_PCB(%rcx), %rcx + movq $0,PCB_ONFAULT(%rcx) + ret + +/* + * casu64 - Compare and set user word. Returns -1 or the current value. + * dst = %rdi, old = %rsi, new = %rdx + */ +ENTRY(casu64) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) @@ -365,14 +392,40 @@ ENTRY(casuword) movq $0,PCB_ONFAULT(%rcx) ret +/* + * swapu64 - Swap long in user space. ptr = %rdi, val = %rsi + */ +ENTRY(swapu64) + movq PCPU(curthread),%rcx + movq TD_PCB(%rcx), %rcx + movq $fusufault,PCB_ONFAULT(%rcx) + movq %rsp,PCB_ONFAULT_SP(%rcx) + + movq $VM_MAX_USER_ADDRESS-8,%rax + cmpq %rax,%rdi /* verify address is valid */ + ja fusufault + + movq %rsi,%rax /* old */ + xchgq %rax,(%rdi) + + /* + * The old value is in %rax. If the store succeeded it will be the + * value we expected (old) from before the store, otherwise it will + * be the current value. + */ + + movq PCPU(curthread),%rcx + movq TD_PCB(%rcx), %rcx + movq $0,PCB_ONFAULT(%rcx) + ret + /* * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit * byte from user memory. All these functions are MPSAFE. * addr = %rdi */ -ALTENTRY(fuword64) -ENTRY(std_fuword) +ENTRY(std_fuword64) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) @@ -386,7 +439,7 @@ ENTRY(std_fuword) movq $0,PCB_ONFAULT(%rcx) ret -ENTRY(fuword32) +ENTRY(std_fuword32) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) @@ -400,18 +453,6 @@ ENTRY(fuword32) movq $0,PCB_ONFAULT(%rcx) ret -/* - * fuswintr() and suswintr() are specialized variants of fuword16() and - * suword16(), respectively. They are called from the profiling code, - * potentially at interrupt time. If they fail, that's okay; good things - * will happen later. They always fail for now, until the trap code is - * able to deal with this. - */ -ALTENTRY(suswintr) -ENTRY(fuswintr) - movq $-1,%rax - ret - ENTRY(fuword16) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx @@ -457,8 +498,7 @@ fusufault: * * Write a long */ -ALTENTRY(suword64) -ENTRY(std_suword) +ENTRY(std_suword64) movq PCPU(curthread),%rcx movq TD_PCB(%rcx), %rcx movq $fusufault,PCB_ONFAULT(%rcx) diff --git a/sys/platform/pc64/x86_64/uwrapper.c b/sys/platform/pc64/x86_64/uwrapper.c index f7196b0c1e..dec7d3532c 100644 --- a/sys/platform/pc64/x86_64/uwrapper.c +++ b/sys/platform/pc64/x86_64/uwrapper.c @@ -25,31 +25,37 @@ copyout(const void *kaddr, void *udaddr, size_t len) } int -fubyte(const void *base) +fubyte(const uint8_t *base) { return curthread->td_proc->p_vmspace->vm_pmap.fubyte(base); } int -subyte (void *base, int byte) +subyte(uint8_t *base, uint8_t byte) { return curthread->td_proc->p_vmspace->vm_pmap.subyte(base, byte); } -long -fuword(const void *base) +int32_t +fuword32(const uint32_t *base) { - return curthread->td_proc->p_vmspace->vm_pmap.fuword(base); + return curthread->td_proc->p_vmspace->vm_pmap.fuword32(base); +} + +int64_t +fuword64(const uint64_t *base) +{ + return curthread->td_proc->p_vmspace->vm_pmap.fuword64(base); } int -suword(void *base, long word) +suword64(uint64_t *base, uint64_t word) { - return curthread->td_proc->p_vmspace->vm_pmap.suword(base, word); + return curthread->td_proc->p_vmspace->vm_pmap.suword64(base, word); } int -suword32(void *base, int word) +suword32(uint32_t *base, int word) { return curthread->td_proc->p_vmspace->vm_pmap.suword32(base, word); } diff --git a/sys/sys/imgact_elf.h b/sys/sys/imgact_elf.h index 0b618f6f20..0b65363a6b 100644 --- a/sys/sys/imgact_elf.h +++ b/sys/sys/imgact_elf.h @@ -36,7 +36,7 @@ #ifdef _KERNEL -#define AUXARGS_ENTRY(pos, id, val) {suword(pos++, id); suword(pos++, val);} +#define AUXARGS_ENTRY(pos, id, val) {suword64(pos++, id); suword64(pos++, val);} struct lwp; struct file; diff --git a/sys/sys/systm.h b/sys/sys/systm.h index eaba480c6e..fbf7be103b 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -265,14 +265,16 @@ int copyout(const void *kaddr, void *udaddr, size_t len) int copyout_nofault(const void *kaddr, void *udaddr, size_t len) __nonnull(1) __nonnull(2); -int fubyte (const void *base); -int subyte (void *base, int byte); -long fuword (const void *base); -int suword (void *base, long word); -int suword32 (void *base, int word); -int fusword (void *base); -int susword (void *base, int word); -u_long casuword(volatile u_long *p, u_long oldval, u_long newval); +int fubyte (const uint8_t *base); +int subyte (uint8_t *base, uint8_t byte); +int32_t fuword32 (const uint32_t *base); +int64_t fuword64 (const uint64_t *base); +int suword32 (uint32_t *base, int word); +int suword64 (uint64_t *base, uint64_t word); +uint32_t casu32(volatile uint32_t *p, uint32_t oldval, uint32_t newval); +uint64_t casu64(volatile uint64_t *p, uint64_t oldval, uint64_t newval); +uint32_t swapu32(volatile uint32_t *p, uint32_t val); +uint64_t swapu64(volatile uint64_t *p, uint64_t val); void DELAY(int usec); void DRIVERSLEEP(int usec); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index e15065b6e1..ef0be6ca6b 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -875,7 +875,7 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex) * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * - * The page must be busied or soft-busied. + * The page must be busied. * The caller can hold the object to avoid blocking, else we might block. * No other requirements. */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 9f7be94da7..958a5524de 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -328,9 +328,9 @@ RetryFault: * are initialized but not otherwise referenced or locked. * * NOTE! vm_map_lookup will try to upgrade the fault_type to - * VM_FAULT_WRITE if the map entry is a virtual page table and also - * writable, so we can set the 'A'accessed bit in the virtual page - * table entry. + * VM_FAULT_WRITE if the map entry is a virtual page table + * and also writable, so we can set the 'A'accessed bit in + * the virtual page table entry. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, @@ -719,7 +719,7 @@ vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp) * updated. * * The returned page will be properly dirtied if VM_PROT_WRITE was specified, - * and marked PG_REFERENCED as well. + * and marked PG_REFERENCED. The page is not busied on return, only held. * * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an * error will be returned. @@ -779,8 +779,9 @@ RetryFault: * are initialized but not otherwise referenced or locked. * * NOTE! vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE - * if the map entry is a virtual page table and also writable, - * so we can set the 'A'accessed bit in the virtual page table entry. + * if the map entry is a virtual page table and also writable, + * so we can set the 'A'accessed bit in the virtual page table + * entry. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, @@ -1237,29 +1238,44 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, lwb = lwbuf_alloc(fs->m, &lwb_cache); ptep = ((vpte_t *)lwbuf_kva(lwb) + ((*pindex >> vshift) & VPTE_PAGE_MASK)); - vpte = *ptep; + vm_page_activate(fs->m); /* - * Page table write-back. If the vpte is valid for the - * requested operation, do a write-back to the page table. + * Page table write-back - entire operation including + * validation of the pte must be atomic to avoid races + * against the vkernel changing the pte. + * + * If the vpte is valid for the* requested operation, do + * a write-back to the page table. * * XXX VPTE_M is not set properly for page directory pages. * It doesn't get set in the page directory if the page table * is modified during a read access. */ - vm_page_activate(fs->m); - if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) && - (vpte & VPTE_RW)) { - if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) { - atomic_set_long(ptep, VPTE_M | VPTE_A); - vm_page_dirty(fs->m); + for (;;) { + vpte_t nvpte; + + vpte = *ptep; + cpu_ccfence(); + nvpte = vpte; + + if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) && + (vpte & VPTE_RW)) { + nvpte |= VPTE_M | VPTE_A; } - } - if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) { - if ((vpte & VPTE_A) == 0) { - atomic_set_long(ptep, VPTE_A); - vm_page_dirty(fs->m); + if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) { + nvpte |= VPTE_A; } + if (vpte == nvpte) + break; + if (atomic_cmpset_long(ptep, vpte, nvpte) == 0) + continue; + /* + * Success + */ + if ((vpte ^ nvpte) & (VPTE_M | VPTE_A)) + vm_page_dirty(fs->m); + break; } lwbuf_free(lwb); vm_page_flag_set(fs->m, PG_REFERENCED); @@ -1267,10 +1283,10 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, fs->m = NULL; cleanup_successful_fault(fs); } + /* * Combine remaining address bits with the vpte. */ - /* JG how many bits from each? */ *pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) + (*pindex & ((1L << vshift) - 1)); return (KERN_SUCCESS); @@ -1320,9 +1336,9 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex, vm_object_pip_add(fs->first_object, 1); /* - * If a read fault occurs we try to make the page writable if - * possible. There are three cases where we cannot make the - * page mapping writable: + * If a read fault occurs we try to upgrade the page protection + * and make it also writable if possible. There are three cases + * where we cannot make the page mapping writable: * * (1) The mapping is read-only or the VM object is read-only, * fs->prot above will simply not have VM_PROT_WRITE set.