From 862481e51f6ea1ac44befc9b9f79a0745daced2e Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 26 Mar 2013 16:35:56 -0700
Subject: [PATCH] kernel - Fix bug in uiomove() error handling, fix deadlock
 in read_shortcut

* Fix a recently introduced bug in uiomove() related to error handling
  when iovcnt is greater than 1.

* Fix a deadlock in the vm.read_shortcut code, rarely triggerable except
  by vkernels.

  The deadlock is due to a busied VM page being held across a uiomove().
  To fix this, a new function uiomove_nofault() was added which causes
  EFAULT to be returned for any non-trivial VM fault (any fault related
  to nested objects or vnode objects).

  The vop_helper_read_shortcut() procedure uses this new uiomove function
  and breaks out when that case occurs, returning and falling through to
  the filesystem's normal buffer-cache-based read() code.

* As an added bonus, we no longer have to unlock the VM object across the
  new uiomove_nofault() call in the read shortcut code, since it is no
  longer possible to deadlock against the VM object.

Reported-by: tuxillo
---
 sys/kern/kern_subr.c  | 39 +++++++++++++++++++++++++++++++--------
 sys/kern/vfs_helper.c | 20 ++++++++++++++------
 sys/sys/thread.h      |  1 +
 sys/sys/uio.h         |  1 +
 sys/vm/vm_fault.c     | 40 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index a4c95027c3..91bc604a43 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -114,10 +114,7 @@ uiomove(caddr_t cp, size_t n, struct uio *uio)
 				error = copyout(cp, iov->iov_base, cnt);
 			else
 				error = copyin(iov->iov_base, cp, cnt);
-			if (error)
-				break;
 			break;
-
 		case UIO_SYSSPACE:
 			if (uio->uio_rw == UIO_READ)
 				bcopy(cp, iov->iov_base, cnt);
@@ -127,6 +124,9 @@ uiomove(caddr_t cp, size_t n, struct uio *uio)
 		case UIO_NOCOPY:
 			break;
 		}
+
+		if (error)
+			break;
 		iov->iov_base = (char *)iov->iov_base + cnt;
 		iov->iov_len -= cnt;
 		uio->uio_resid -= cnt;
@@ -137,6 +137,7 @@ uiomove(caddr_t cp, size_t n, struct uio *uio)
 	crit_enter();
 	td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
 	crit_exit();
+
 	return (error);
 }
 
@@ -173,6 +174,27 @@ uiomovebp(struct buf *bp, caddr_t cp, size_t n, struct uio *uio)
 	return (uiomove(cp, n, uio));
 }
 
+/*
+ * uiomove() but fail for non-trivial VM faults, even if the VM fault is
+ * valid.  Returns EFAULT if a VM fault occurred via the copyin/copyout
+ * onfault code.
+ *
+ * This allows callers to hold e.g. a busy VM page, or a busy VM object,
+ * or a locked vnode through the call and then fall-back to safer code
+ * if we fail.
+ */
+int
+uiomove_nofault(caddr_t cp, size_t n, struct uio *uio)
+{
+	thread_t td = curthread;
+	int error;
+
+	atomic_set_int(&td->td_flags, TDF_NOFAULT);
+	error = uiomove(cp, n, uio);
+	atomic_clear_int(&td->td_flags, TDF_NOFAULT);
+	return error;
+}
+
 /*
  * Like uiomove() but copies zero-fill.  Only allowed for UIO_READ,
  * for obvious reasons.
@@ -202,8 +224,6 @@ uiomovez(size_t n, struct uio *uio)
 		switch (uio->uio_segflg) {
 		case UIO_USERSPACE:
 			error = copyout(ZeroPage, iov->iov_base, cnt);
-			if (error)
-				break;
 			break;
 		case UIO_SYSSPACE:
 			bzero(iov->iov_base, cnt);
@@ -211,6 +231,9 @@ uiomovez(size_t n, struct uio *uio)
 		case UIO_NOCOPY:
 			break;
 		}
+
+		if (error)
+			break;
 		iov->iov_base = (char *)iov->iov_base + cnt;
 		iov->iov_len -= cnt;
 		uio->uio_resid -= cnt;
@@ -256,22 +279,21 @@ again:
 		uio->uio_iov++;
 		goto again;
 	}
 
-	switch (uio->uio_segflg) {
+	switch (uio->uio_segflg) {
 	case UIO_USERSPACE:
 		if (subyte(iov->iov_base, c) < 0)
 			return (EFAULT);
 		break;
-
 	case UIO_SYSSPACE:
 		iov_base = iov->iov_base;
 		*iov_base = c;
 		iov->iov_base = iov_base;
 		break;
-
 	case UIO_NOCOPY:
 		break;
 	}
+
 	iov->iov_base = (char *)iov->iov_base + 1;
 	iov->iov_len--;
 	uio->uio_resid--;
@@ -533,6 +555,7 @@ uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio)
 		m = ma[offset >> PAGE_SHIFT];
 		lwb = lwbuf_alloc(m, &lwb_cache);
 		cp = (char *)lwbuf_kva(lwb) + page_offset;
+
 		switch (uio->uio_segflg) {
 		case UIO_USERSPACE:
 			/*
diff --git a/sys/kern/vfs_helper.c b/sys/kern/vfs_helper.c
index 0d351ebcd1..ba7f4b33af 100644
--- a/sys/kern/vfs_helper.c
+++ b/sys/kern/vfs_helper.c
@@ -373,13 +373,14 @@ vop_helper_read_shortcut(struct vop_read_args *ap)
 		lwb = lwbuf_alloc(m, &lwb_cache);
 
 		/*
-		 * Can't hold object across uiomove, a VM fault could
-		 * wind up live locking on the same object (one shared,
-		 * on exclusive).
+		 * Use a no-fault uiomove() to avoid deadlocking against
+		 * our VM object (which could livelock on the same object
+		 * due to shared-vs-exclusive), or deadlocking against
+		 * our busied page.  Returns EFAULT on any fault which
+		 * winds up diving a vnode.
 		 */
-		vm_object_drop(obj);
-		error = uiomove((char *)lwbuf_kva(lwb) + offset, n, uio);
-		vm_object_hold_shared(obj);
+		error = uiomove_nofault((char *)lwbuf_kva(lwb) + offset,
+					n, uio);
 
 		vm_page_flag_set(m, PG_REFERENCED);
 		lwbuf_free(lwb);
@@ -387,6 +388,13 @@ vop_helper_read_shortcut(struct vop_read_args *ap)
 	}
 	vm_object_drop(obj);
 
+	/*
+	 * Ignore EFAULT since we used uiomove_nofault(), causes caller
+	 * to fall-back to normal code for this case.
+	 */
+	if (error == EFAULT)
+		error = 0;
+
 	return (error);
 }
 
diff --git a/sys/sys/thread.h b/sys/sys/thread.h
index 003d962763..0eb29e0578 100644
--- a/sys/sys/thread.h
+++ b/sys/sys/thread.h
@@ -371,6 +371,7 @@ struct thread {
 #define TDF_DELAYED_WAKEUP	0x02000000
 #define TDF_UNUSED1		0x04000000	/* unused */
 #define TDF_USERMODE		0x08000000	/* in or entering user mode */
+#define TDF_NOFAULT		0x10000000	/* force onfault on fault */
 
 #define TDF_MP_STOPREQ		0x00000001	/* suspend_kproc */
 #define TDF_MP_WAKEREQ		0x00000002	/* resume_kproc */
diff --git a/sys/sys/uio.h b/sys/sys/uio.h
index c466ce1b80..4eace1a835 100644
--- a/sys/sys/uio.h
+++ b/sys/sys/uio.h
@@ -95,6 +95,7 @@ struct vm_object;
 struct vm_page;
 
 int	uiomove (caddr_t, size_t, struct uio *);
+int	uiomove_nofault (caddr_t, size_t, struct uio *);
 int	uiomovebp (struct buf *, caddr_t, size_t, struct uio *);
 int	uiomovez (size_t, struct uio *);
 int	uiomove_frombuf (void *buf, size_t buflen, struct uio *uio);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 3ee82c282d..d8f12d3690 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -353,6 +353,11 @@ RetryFault:
 	 * Misc checks.  Save the map generation number to detect races.
 	 */
 	fs.map_generation = fs.map->timestamp;
+	fs.lookup_still_valid = TRUE;
+	fs.first_m = NULL;
+	fs.object = fs.first_object;	/* so unlock_and_deallocate works */
+	fs.shared = 0;
+	fs.vp = NULL;
 
 	if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
 		if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
@@ -376,6 +381,18 @@ RetryFault:
 			(void *)vaddr, fs.entry);
 	}
 
+	/*
+	 * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
+	 * is set.
+	 */
+	if ((curthread->td_flags & TDF_NOFAULT) &&
+	    (fs.first_object->type == OBJT_VNODE ||
+	     fs.first_object->backing_object)) {
+		result = KERN_FAILURE;
+		unlock_things(&fs);
+		goto done2;
+	}
+
 	/*
 	 * Attempt to shortcut the fault if the lookup returns a
 	 * terminal object and the page is present.  This allows us
@@ -451,10 +468,12 @@ RetryFault:
 	if (fs.vp == NULL)
 		fs.vp = vnode_pager_lock(fs.first_object);
 
+#if 0
 	fs.lookup_still_valid = TRUE;
 	fs.first_m = NULL;
 	fs.object = fs.first_object;	/* so unlock_and_deallocate works */
 	fs.shared = 0;
+#endif
 
 	/*
 	 * If the entry is wired we cannot change the page protection.
@@ -581,6 +600,7 @@ quick:
 done:
 	if (fs.first_object)
 		vm_object_drop(fs.first_object);
+done2:
 	lwkt_reltoken(&map->token);
 	if (lp)
 		lp->lwp_flags &= ~LWP_PAGING;
@@ -665,6 +685,11 @@ RetryFault:
 	 * Misc checks.  Save the map generation number to detect races.
 	 */
 	fs.map_generation = fs.map->timestamp;
+	fs.lookup_still_valid = TRUE;
+	fs.first_m = NULL;
+	fs.object = fs.first_object;	/* so unlock_and_deallocate works */
+	fs.shared = 0;
+	fs.vp = NULL;
 
 	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
 		panic("vm_fault: fault on nofault entry, addr: %lx",
@@ -680,6 +705,18 @@ RetryFault:
 			(void *)vaddr, fs.entry);
 	}
 
+	/*
+	 * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
+	 * is set.
+	 */
+	if ((curthread->td_flags & TDF_NOFAULT) &&
+	    (fs.first_object->type == OBJT_VNODE ||
+	     fs.first_object->backing_object)) {
+		result = KERN_FAILURE;
+		unlock_things(&fs);
+		goto done2;
+	}
+
 	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
@@ -697,10 +734,12 @@ RetryFault:
 	vm_object_hold(fs.first_object);
 	fs.vp = vnode_pager_lock(fs.first_object);
 
+#if 0
 	fs.lookup_still_valid = TRUE;
 	fs.first_m = NULL;
 	fs.object = fs.first_object;	/* so unlock_and_deallocate works */
 	fs.shared = 0;
+#endif
 
 	/*
 	 * If the entry is wired we cannot change the page protection.
@@ -821,6 +860,7 @@ RetryFault:
 done:
 	if (fs.first_object)
 		vm_object_drop(fs.first_object);
+done2:
 	lwkt_reltoken(&map->token);
 	return(fs.m);
 }
-- 
2.41.0
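
For illustration, the fall-through pattern the commit message describes looks
roughly like the sketch below.  This sketch is not part of the patch:
myfs_read() and myfs_cached_read() are hypothetical placeholder names for a
filesystem's read VOP and its buffer-cache read path; only
vop_helper_read_shortcut() and its EFAULT-to-0 conversion come from the
patched sources.

/*
 * Hypothetical filesystem read VOP (sketch only).
 */
static int
myfs_read(struct vop_read_args *ap)
{
	int error;

	/*
	 * Try the VM-object read shortcut first.  With this patch the
	 * helper copies via uiomove_nofault() while the page stays busy;
	 * a non-trivial fault aborts the copy with EFAULT, which the
	 * helper converts to 0 so we simply fall through below.
	 */
	error = vop_helper_read_shortcut(ap);
	if (error)
		return (error);
	if (ap->a_uio->uio_resid == 0)
		return (0);	/* shortcut satisfied the entire read */

	/*
	 * Fall back to the normal buffer-cache-based read for whatever
	 * remains (placeholder for the filesystem's own bread()/uiomove()
	 * loop).
	 */
	error = myfs_cached_read(ap);
	return (error);
}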