From 64b5a8a550c3c782ab04d04d63723691ac054ffc Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 11 Nov 2019 17:06:55 -0800
Subject: [PATCH] kernel - sigblockall()/sigunblockall() support (per thread
 shared page)

* Implement /dev/lpmap, a per-thread RW shared page between userland
  and the kernel.  Each thread in the process will receive a unique
  shared page for communication with the kernel when memory-mapping
  /dev/lpmap and can access various variables via this map.

* The current thread's TID is retained for both fork() and vfork().
  Previously it was only retained for vfork().  This avoids userland
  code confusion for any bits and pieces that are indexed based on
  the TID.

* Implement support for a per-thread block-all-signals feature that
  does not require any system calls (see next commit to libc).  The
  functions will be called sigblockall() and sigunblockall().

  The lpmap->blockallsigs variable prevents normal signals from being
  dispatched.  They will still be queued to the LWP as per normal.
  The behavior is not quite that of a signal mask when dealing with
  global signals.

  The low 31 bits represent a recursion counter, allowing recursive
  use of the functions.  The high bit (bit 31) is set by the kernel
  if a signal was prevented from being dispatched.  When userland
  decrements the counter back to 0 (in the low 31 bits), it can check
  and clear bit 31; if bit 31 was found to be set, userland can then
  make a dummy 'real' system call to cause the pending signals to be
  delivered.

  Synchronous TRAPs (e.g. kernel-generated SIGFPE, SIGSEGV, etc) are
  not affected by this feature and will still be dispatched
  synchronously.

* PThreads is expected to unmap the mapped page upon thread exit.
  The kernel will force-unmap the page upon thread exit if pthreads
  does not.

  XXX needs work - currently if the page has not been faulted in, the
  kernel has no visibility into the mapping and will not unmap it,
  but neither will it get confused if the address is accessed.  To be
  fixed soon.  Programs that use LWP primitives directly instead of
  pthreads may not realize that libc has mapped the page, so the
  kernel cannot rely on userland to unmap it.

* The TID is reset to 1 on a successful exec*()

* On [v]fork(), if lpmap exists for the current thread, the kernel
  will copy the lpmap->blockallsigs value to the lpmap for the new
  thread in the new process.  This way sigblock*() state is retained
  across the [v]fork().

  This feature not only reduces code confusion in userland, it also
  allows [v]fork() to be implemented by the userland program in a way
  that ensures no signal races in either the parent or the new child
  process until it is ready for them.

* The implementation leverages our vm_map_backing extents by having
  the per-thread memory mappings indexed within the lwp.  This allows
  the lwp to remove the mappings when it exits (since not doing so
  would result in a wild pmap entry and kernel memory disclosure).

* The implementation currently delays instantiation of the mapped
  page(s) and some side structures until the first fault.

  XXX this will have to be changed.
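* For reference only, a minimal userland sketch of what the libc
  sigblockall()/sigunblockall() wrappers could look like on top of
  lpmap->blockallsigs.  The real functions land in the next libc
  commit; the 'lpmap' pointer, the use of <machine/atomic.h> in
  userland, and the choice of dummy system call are illustrative
  assumptions, not the committed implementation:

	/*
	 * Sketch only - assumes 'lpmap' points at the calling thread's
	 * mapped /dev/lpmap page (struct sys_lpmap, <sys/upmap.h>).
	 */
	#include <sys/types.h>
	#include <sys/upmap.h>
	#include <machine/atomic.h>
	#include <unistd.h>

	static __thread struct sys_lpmap *lpmap; /* mapped at thread start */

	int
	sigblockall(void)
	{
		/* recursively block: bump the low-31-bit counter */
		atomic_fetchadd_int(&lpmap->blockallsigs, 1);
		return (0);
	}

	int
	sigunblockall(void)
	{
		uint32_t bas;

		/* unwind one level; fetchadd returns the old value */
		bas = atomic_fetchadd_int(&lpmap->blockallsigs, -1) - 1;

		/*
		 * On the 1->0 transition, if the kernel deferred a signal
		 * (bit 31 set), clear the flag and issue any real system
		 * call so the pending signals get delivered.
		 */
		if ((bas & 0x7FFFFFFFU) == 0 && (bas & 0x80000000U) != 0) {
			atomic_clear_int(&lpmap->blockallsigs, 0x80000000U);
			(void)getuid();	/* any real syscall will do */
		}
		return (0);
	}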
--- sys/kern/imgact_aout.c | 7 +- sys/kern/imgact_elf.c | 4 +- sys/kern/init_main.c | 1 + sys/kern/kern_exec.c | 27 ++++- sys/kern/kern_exit.c | 6 + sys/kern/kern_fork.c | 42 +++++-- sys/kern/kern_memio.c | 202 ++++++++++++++++++++----------- sys/kern/kern_proc.c | 80 +++++++++++- sys/kern/kern_sig.c | 15 ++- sys/kern/kern_slaballoc.c | 3 +- sys/platform/pc64/x86_64/efirt.c | 3 +- sys/sys/device.h | 4 +- sys/sys/proc.h | 9 +- sys/sys/signal.h | 4 + sys/sys/signal2.h | 53 +++++++- sys/sys/signalvar.h | 7 ++ sys/sys/upmap.h | 50 ++++++++ sys/vm/vm_fault.c | 12 +- sys/vm/vm_kern.c | 18 ++- sys/vm/vm_map.c | 131 +++++++++++++------- sys/vm/vm_map.h | 30 ++++- sys/vm/vm_mmap.c | 5 +- 22 files changed, 558 insertions(+), 155 deletions(-) diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 8b83f0fe7e..292795eda3 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -181,7 +181,7 @@ exec_aout_imgact(struct image_params *imgp) text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, &count, object, NULL, - file_offset, + file_offset, NULL, virtual_offset, text_end, VM_MAPTYPE_NORMAL, VM_SUBSYS_IMGACT, @@ -200,7 +200,7 @@ exec_aout_imgact(struct image_params *imgp) if (a_out->a_data) { vm_object_reference_locked(object); error = vm_map_insert(map, &count, object, NULL, - file_offset + a_out->a_text, + file_offset + a_out->a_text, NULL, text_end, data_end, VM_MAPTYPE_NORMAL, VM_SUBSYS_IMGACT, @@ -218,7 +218,8 @@ exec_aout_imgact(struct image_params *imgp) if (bss_size) { error = vm_map_insert(map, &count, NULL, NULL, - 0, data_end, data_end + bss_size, + 0, NULL, + data_end, data_end + bss_size, VM_MAPTYPE_NORMAL, VM_SUBSYS_IMGACT, VM_PROT_ALL, VM_PROT_ALL, diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 8d2e0cd51c..146bc50d1d 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -305,7 +305,7 @@ __elfN(load_section)(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, &count, object, NULL, - file_addr, /* file offset */ + file_addr, NULL, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ VM_MAPTYPE_NORMAL, @@ -347,7 +347,7 @@ __elfN(load_section)(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, &count, NULL, NULL, - 0, + 0, NULL, map_addr, map_addr + map_len, VM_MAPTYPE_NORMAL, diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 9a30b80886..08f9937deb 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -174,6 +174,7 @@ mi_proc0init(struct globaldata *gd, struct user *proc0paddr) proc0.p_usched = usched_init(); CPUMASK_ASSALLONES(lwp0.lwp_cpumask); lwkt_token_init(&lwp0.lwp_token, "lwp_token"); + TAILQ_INIT(&lwp0.lwp_lpmap_backing_list); spin_init(&lwp0.lwp_spin, "iproc_lwp0"); varsymset_init(&proc0.p_varsymset, NULL); thread0.td_flags |= TDF_RUNNING; diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 69dbb217c5..b71d6a9f99 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -391,6 +391,15 @@ interpret: } } + /* + * Clean up shared pages, the new program will allocate fresh + * copies as needed. This is also for security purposes and + * to ensure (for example) that things like sys_lpmap->blockallsigs + * state is properly reset on exec. + */ + lwp_userunmap(lp); + proc_userunmap(p); + /* * For security and other reasons virtual kernels cannot be * inherited by an exec. 
This also allows a virtual kernel @@ -774,6 +783,7 @@ exec_new_vmspace(struct image_params *imgp, struct vmspace *vmcopy) { struct vmspace *vmspace = imgp->proc->p_vmspace; vm_offset_t stack_addr = USRSTACK - maxssiz; + struct lwp *lp; struct proc *p; vm_map_t map; int error; @@ -788,7 +798,8 @@ exec_new_vmspace(struct image_params *imgp, struct vmspace *vmcopy) * want since another thread is patiently waiting for us to exit * in that case. */ - p = curproc; + lp = curthread->td_lwp; + p = lp->lwp_proc; imgp->vmspace_destroyed = 1; if (curthread->td_proc->p_nthreads > 1) { @@ -841,6 +852,20 @@ exec_new_vmspace(struct image_params *imgp, struct vmspace *vmcopy) map = &vmspace->vm_map; } + /* + * Really make sure lwp-specific and process-specific mappings + * are gone. + * + * Once we've done that, and because we are the only LWP left, with + * no TID-dependent mappings, we can reset the TID to 1 (the RB tree + * will remain consistent since it has only one entry). This way + * the exec'd program gets a nice deterministic tid of 1. + */ + lwp_userunmap(lp); + proc_userunmap(p); + lp->lwp_tid = 1; + p->p_lasttid = 1; + /* * Allocate a new stack, generally make the stack non-executable * but allow the program to adjust that (the program may desire to diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index ed37b2dcd3..005be9018e 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -667,6 +667,12 @@ lwp_exit(int masterexit, void *waddr) */ p->p_usched->release_curproc(lp); + /* + * Destroy the per-thread shared page and remove from any pmaps + * it resides in. + */ + lwp_userunmap(lp); + /* * lwp_exit() may be called without setting LWP_MP_WEXIT, so * make sure it is set here. diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index d66070c9f2..61903796ae 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -772,6 +772,7 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags, crit_exit(); CPUMASK_ANDMASK(lp->lwp_cpumask, usched_mastermask); lwkt_token_init(&lp->lwp_token, "lwp_token"); + TAILQ_INIT(&lp->lwp_lpmap_backing_list); spin_init(&lp->lwp_spin, "lwptoken"); /* @@ -800,21 +801,15 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags, kqueue_init(&lp->lwp_kqueue, destproc->p_fd); /* - * Assign a TID to the lp. Loop until the insert succeeds (returns - * NULL). + * Use the same TID for the first thread in the new process after + * a fork or vfork. This is needed to keep pthreads and /dev/lpmap + * sane. In particular a consequence of implementing the per-thread + * /dev/lpmap map code makes this mandatory. * - * If we are in a vfork assign the same TID as the lwp that did the - * vfork(). This way if the user program messes around with - * pthread calls inside the vfork(), it will operate like an - * extension of the (blocked) parent. Also note that since the - * address space is being shared, insofar as pthreads is concerned, - * the code running in the vfork() is part of the original process. + * NOTE: exec*() will reset the TID to 1 to keep things sane in that + * department too. 
*/ - if (flags & RFPPWAIT) { - lp->lwp_tid = origlp->lwp_tid - 1; - } else { - lp->lwp_tid = destproc->p_lasttid; - } + lp->lwp_tid = origlp->lwp_tid - 1; /* * Leave 2 bits open so the pthreads library can optimize locks @@ -837,6 +832,27 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags, pmap_maybethreaded(&destproc->p_vmspace->vm_pmap); destproc->p_flags |= P_MAYBETHREADED; + /* + * If the original lp had a lpmap and a non-zero blockallsigs + * count, give the lp for the forked process the same count. + * + * This makes the user code and expectations less confusing + * in terms of unwinding locks and also allows userland to start + * the forked process with signals blocked via the blockallsigs() + * mechanism if desired. + * + * XXX future - also inherit the lwp-specific process title ? + */ + if (origlp->lwp_lpmap && + (origlp->lwp_lpmap->blockallsigs & 0x7FFFFFFF)) { + lwp_usermap(lp, 0); + if (lp->lwp_lpmap) { + lp->lwp_lpmap->blockallsigs = + origlp->lwp_lpmap->blockallsigs; + } + } + + return (lp); } diff --git a/sys/kern/kern_memio.c b/sys/kern/kern_memio.c index 4d6a70fe7d..df494cfe68 100644 --- a/sys/kern/kern_memio.c +++ b/sys/kern/kern_memio.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -60,9 +61,11 @@ #include #include +#include #include #include +#include #include @@ -75,7 +78,7 @@ static d_ioctl_t mmioctl; static d_mmap_t memmmap; #endif static d_kqfilter_t mmkqfilter; -static int memuksmap(cdev_t dev, vm_page_t fake); +static int memuksmap(vm_map_backing_t ba, int op, cdev_t dev, vm_page_t fake); #define CDEV_MAJOR 2 static struct dev_ops mem_ops = { @@ -379,83 +382,94 @@ mmwrite(struct dev_write_args *ap) * instead of going through read/write * \*******************************************************/ -static int user_kernel_mapping(int num, vm_ooffset_t offset, - vm_ooffset_t *resultp); - -#if 0 +static int user_kernel_mapping(vm_map_backing_t ba, int num, + vm_ooffset_t offset, vm_ooffset_t *resultp); static int -memmmap(struct dev_mmap_args *ap) +memuksmap(vm_map_backing_t ba, int op, cdev_t dev, vm_page_t fake) { - cdev_t dev = ap->a_head.a_dev; vm_ooffset_t result; int error; + struct proc *p; + struct lwp *lp; - switch (minor(dev)) { - case 0: - /* - * minor device 0 is physical memory - */ - ap->a_result = atop(ap->a_offset); - error = 0; - break; - case 1: - /* - * minor device 1 is kernel memory - */ - ap->a_result = atop(vtophys(ap->a_offset)); - error = 0; - break; - case 5: - case 6: - /* - * minor device 5 is /dev/upmap (see sys/upmap.h) - * minor device 6 is /dev/kpmap (see sys/upmap.h) - */ - result = 0; - error = user_kernel_mapping(minor(dev), ap->a_offset, &result); - ap->a_result = atop(result); - break; - default: - error = EINVAL; - break; - } - return error; -} - -#endif - -static int -memuksmap(cdev_t dev, vm_page_t fake) -{ - vm_ooffset_t result; - int error; + error = 0; - switch (minor(dev)) { - case 0: + switch(op) { + case UKSMAPOP_ADD: /* - * minor device 0 is physical memory + * /dev/lpmap only (minor 7) + * + * Don't do anything until the page is faulted in. Clear + * our flags on this possibly replicated ba. vm_map_entry + * replication can occur before the new process/lwp is + * created, so there's nothing to link into. 
*/ - fake->phys_addr = ptoa(fake->pindex); - error = 0; + if (minor(dev) != 7) + break; + atomic_clear_int(&ba->flags, VM_MAP_LWP_LINKED); break; - case 1: + case UKSMAPOP_REM: /* - * minor device 1 is kernel memory + * /dev/lpmap only (minor 7) + * + * The mapping is only on the lwp list after it has been + * faulted in. */ - fake->phys_addr = vtophys(ptoa(fake->pindex)); - error = 0; + if (minor(dev) != 7) + break; + if ((ba->flags & VM_MAP_LWP_LINKED) == 0) + break; + + p = curproc; + lwkt_gettoken_shared(&p->p_token); + lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, + (int)(intptr_t)ba->aux_info); + if (lp) { + LWPHOLD(lp); + lwkt_reltoken(&p->p_token); + spin_lock(&lp->lwp_spin); + TAILQ_REMOVE(&lp->lwp_lpmap_backing_list, ba, entry); + atomic_clear_int(&ba->flags, VM_MAP_LWP_LINKED); + spin_unlock(&lp->lwp_spin); + LWPRELE(lp); + } else { + lwkt_reltoken(&p->p_token); + } break; - case 5: - case 6: - /* - * minor device 5 is /dev/upmap (see sys/upmap.h) - * minor device 6 is /dev/kpmap (see sys/upmap.h) - */ - result = 0; - error = user_kernel_mapping(minor(dev), - ptoa(fake->pindex), &result); - fake->phys_addr = result; + case UKSMAPOP_FAULT: + switch (minor(dev)) { + case 0: + /* + * minor device 0 is physical memory + */ + fake->phys_addr = ptoa(fake->pindex); + break; + case 1: + /* + * minor device 1 is kernel memory + */ + fake->phys_addr = vtophys(ptoa(fake->pindex)); + break; + case 5: + case 6: + case 7: + /* + * minor device 5 is /dev/upmap (see sys/upmap.h) + * minor device 6 is /dev/kpmap (see sys/upmap.h) + * minor device 7 is /dev/lpmap (see sys/upmap.h) + */ + result = 0; + error = user_kernel_mapping(ba, + minor(dev), + ptoa(fake->pindex), + &result); + fake->phys_addr = result; + break; + default: + error = EINVAL; + break; + } break; default: error = EINVAL; @@ -700,16 +714,21 @@ iszerodev(cdev_t dev) } /* - * /dev/upmap and /dev/kpmap. + * /dev/lpmap, /dev/upmap, /dev/kpmap. */ static int -user_kernel_mapping(int num, vm_ooffset_t offset, vm_ooffset_t *resultp) +user_kernel_mapping(vm_map_backing_t ba, int num, vm_ooffset_t offset, + vm_ooffset_t *resultp) { struct proc *p; + struct lwp *lp; int error; int invfork; - if ((p = curproc) == NULL) + p = curthread->td_proc; + if (p == NULL) + return (EINVAL); + if (offset < 0) return (EINVAL); /* @@ -763,6 +782,52 @@ user_kernel_mapping(int num, vm_ooffset_t offset, vm_ooffset_t *resultp) error = 0; } break; + case 7: + /* + * /dev/lpmap - maps RW per-thread shared user-kernel area. + * + * Link the vm_map_backing into the lwp so we can delete + * the mapping when the lwp exits. Otherwise we would end + * up with a lingering pmap page and the associated kernel + * memory disclosure. + * + * We do the linking on first-fault since the process and/or + * lwp might not exist at the time the map is created (i.e. + * in the case of fork()). 
+ */ + lwkt_gettoken_shared(&p->p_token); + lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, + (int)(intptr_t)ba->aux_info); + if (lp == NULL) { + lwkt_reltoken(&p->p_token); + break; + } + LWPHOLD(lp); + lwkt_reltoken(&p->p_token); + + /* + * Extract address + */ + if (lp->lwp_lpmap == NULL) + lwp_usermap(lp, invfork); + + if ((ba->flags & VM_MAP_LWP_LINKED) == 0) { + spin_lock(&lp->lwp_spin); + TAILQ_INSERT_TAIL(&lp->lwp_lpmap_backing_list, + ba, entry); + atomic_set_int(&ba->flags, VM_MAP_LWP_LINKED); + spin_unlock(&lp->lwp_spin); + } + + if (lp->lwp_lpmap && + offset < roundup2(sizeof(*lp->lwp_lpmap), PAGE_SIZE)) { + /* only good for current process */ + *resultp = pmap_kextract((vm_offset_t)lp->lwp_lpmap + + offset); + error = 0; + } + LWPRELE(lp); + break; default: break; } @@ -784,6 +849,7 @@ mem_drvinit(void *unused) make_dev(&mem_ops, 4, UID_ROOT, GID_WHEEL, 0644, "urandom"); make_dev(&mem_ops, 5, UID_ROOT, GID_WHEEL, 0666, "upmap"); make_dev(&mem_ops, 6, UID_ROOT, GID_WHEEL, 0444, "kpmap"); + make_dev(&mem_ops, 7, UID_ROOT, GID_WHEEL, 0666, "lpmap"); zerodev = make_dev(&mem_ops, 12, UID_ROOT, GID_WHEEL, 0666, "zero"); make_dev(&mem_ops_noq, 14, UID_ROOT, GID_WHEEL, 0600, "io"); } diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index f5fc9da8c4..5c9880f459 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -86,6 +86,7 @@ MALLOC_DEFINE(M_SESSION, "session", "session header"); MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_LWP, "lwp", "lwp structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); +MALLOC_DEFINE(M_UPMAP, "upmap", "upmap/kpmap/lpmap structures"); int ps_showallprocs = 1; static int ps_showallthreads = 1; @@ -1215,7 +1216,7 @@ proc_usermap(struct proc *p, int invfork) struct sys_upmap *upmap; lwkt_gettoken(&p->p_token); - upmap = kmalloc(roundup2(sizeof(*upmap), PAGE_SIZE), M_PROC, + upmap = kmalloc(roundup2(sizeof(*upmap), PAGE_SIZE), M_UPMAP, M_WAITOK | M_ZERO); if (p->p_upmap == NULL) { upmap->header[0].type = UKPTYPE_VERSION; @@ -1237,7 +1238,7 @@ proc_usermap(struct proc *p, int invfork) upmap->invfork = invfork; p->p_upmap = upmap; } else { - kfree(upmap, M_PROC); + kfree(upmap, M_UPMAP); } lwkt_reltoken(&p->p_token); } @@ -1250,11 +1251,84 @@ proc_userunmap(struct proc *p) lwkt_gettoken(&p->p_token); if ((upmap = p->p_upmap) != NULL) { p->p_upmap = NULL; - kfree(upmap, M_PROC); + kfree(upmap, M_UPMAP); } lwkt_reltoken(&p->p_token); } +/* + * Called when the per-thread user/kernel shared page needs to be + * allocated. The function refuses to allocate the page if the + * thread is exiting to avoid races against lwp_userunmap(). + */ +void +lwp_usermap(struct lwp *lp, int invfork) +{ + struct sys_lpmap *lpmap; + + lwkt_gettoken(&lp->lwp_token); + + lpmap = kmalloc(roundup2(sizeof(*lpmap), PAGE_SIZE), M_UPMAP, + M_WAITOK | M_ZERO); + if (lp->lwp_lpmap == NULL && (lp->lwp_mpflags & LWP_MP_WEXIT) == 0) { + lpmap->header[0].type = UKPTYPE_VERSION; + lpmap->header[0].offset = offsetof(struct sys_lpmap, version); + lpmap->header[1].type = LPTYPE_BLOCKALLSIGS; + lpmap->header[1].offset = offsetof(struct sys_lpmap, + blockallsigs); + lpmap->header[2].type = LPTYPE_THREAD_TITLE; + lpmap->header[2].offset = offsetof(struct sys_lpmap, + thread_title); + + lpmap->version = LPMAP_VERSION; + lp->lwp_lpmap = lpmap; + } else { + kfree(lpmap, M_UPMAP); + } + lwkt_reltoken(&lp->lwp_token); +} + +/* + * Called when a LWP (but not necessarily the whole process) exits. 
+ * Called when a process execs (after all other threads have been killed). + * + * lwp-specific mappings must be removed. If userland didn't do it, then + * we have to. Otherwise we could end-up disclosing kernel memory due to + * the ad-hoc pmap mapping. + */ +void +lwp_userunmap(struct lwp *lp) +{ + struct sys_lpmap *lpmap; + struct vm_map *map; + struct vm_map_backing *ba; + struct vm_map_backing copy; + + lwkt_gettoken(&lp->lwp_token); + map = &lp->lwp_proc->p_vmspace->vm_map; + lpmap = lp->lwp_lpmap; + lp->lwp_lpmap = NULL; + + spin_lock(&lp->lwp_spin); + while ((ba = TAILQ_FIRST(&lp->lwp_lpmap_backing_list)) != NULL) { + TAILQ_REMOVE(&lp->lwp_lpmap_backing_list, ba, entry); + atomic_clear_int(&ba->flags, VM_MAP_LWP_LINKED); + copy = *ba; + spin_unlock(&lp->lwp_spin); + + lwkt_gettoken(&map->token); + vm_map_remove(map, copy.start, copy.end); + lwkt_reltoken(&map->token); + + spin_lock(&lp->lwp_spin); + } + spin_unlock(&lp->lwp_spin); + + if (lpmap) + kfree(lpmap, M_UPMAP); + lwkt_reltoken(&lp->lwp_token); +} + /* * Scan all processes on the allproc list. The process is automatically * held for the callback. A return value of -1 terminates the loop. diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 43f4881543..c202d0a9a3 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -204,12 +204,15 @@ static int sigproptbl[NSIG] = { }; +__read_mostly sigset_t sigcantmask_mask; + static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); + return (0); } @@ -414,9 +417,15 @@ siginit(struct proc *p) { int i; - for (i = 1; i <= NSIG; i++) + for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(p->p_sigignore, i); + } + + /* + * Also initialize signal-related global state. + */ + SIGSETOR_CANTMASK(sigcantmask_mask); } /* @@ -921,6 +930,8 @@ pgsignal(struct pgrp *pgrp, int sig, int checkctty) * * These signals may ONLY be delivered to the specified lwp and may never * be delivered to the process generically. + * + * lpmap->blockallsigs is ignored. 
*/ void trapsignal(struct lwp *lp, int sig, u_long code) @@ -1984,6 +1995,8 @@ issignal(struct lwp *lp, int maytrace, int *ptokp) SIGSETNAND(mask, lp->lwp_sigmask); if (p->p_flags & P_PPWAIT) SIG_STOPSIGMASK(mask); + SIG_CONDBLOCKALLSIGS(mask, lp); + if (SIGISEMPTY(mask)) /* no signal to send */ return (0); diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index 092b1cc4ba..963b7307fd 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -1558,7 +1558,8 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) vm_object_reference_locked(&kernel_object); vm_map_insert(&kernel_map, &count, &kernel_object, NULL, - addr, addr, addr + size, + addr, NULL, + addr, addr + size, VM_MAPTYPE_NORMAL, VM_SUBSYS_KMALLOC, VM_PROT_ALL, VM_PROT_ALL, 0); diff --git a/sys/platform/pc64/x86_64/efirt.c b/sys/platform/pc64/x86_64/efirt.c index e93c737034..0aceb168e3 100644 --- a/sys/platform/pc64/x86_64/efirt.c +++ b/sys/platform/pc64/x86_64/efirt.c @@ -220,7 +220,8 @@ efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz) count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&efi_vmspace->vm_map); result = vm_map_insert(&efi_vmspace->vm_map, &count, efi_obj, NULL, - 0, 0, VM_MAX_USER_ADDRESS, + 0, NULL, + 0, VM_MAX_USER_ADDRESS, VM_MAPTYPE_NORMAL, VM_SUBSYS_EFI, VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, diff --git a/sys/sys/device.h b/sys/sys/device.h index dbe48e5071..f70674ac48 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -50,6 +50,7 @@ struct cdev; struct ucred; struct devfs_bitmap; struct vm_page; +struct vm_map_backing; struct vnode; /* @@ -250,7 +251,8 @@ struct dev_ops { d_kqfilter_t *d_kqfilter; d_clone_t *d_clone; /* clone from base dev_ops */ d_revoke_t *d_revoke; - int (*d_uksmap)(struct cdev *dev, struct vm_page *fake); + int (*d_uksmap)(struct vm_map_backing *ba, int op, + struct cdev *dev, struct vm_page *fake); #define dev_ops_last_field d_uksmap }; #endif /* _KERNEL */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 69e550df72..26b1a94368 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -75,6 +75,7 @@ struct session; struct lwp; struct uidcount; struct procglob; +struct vm_map_backing; LIST_HEAD(proclist, proc); LIST_HEAD(pgrplist, pgrp); @@ -226,12 +227,12 @@ struct lwp { struct mdproc lwp_md; /* Any machine-dependent fields. 
*/ struct thread *lwp_thread; /* backpointer to proc's thread */ - void *lwp_unused01; /* for future fields */ + struct sys_lpmap *lwp_lpmap; /* user RW mappable per-thread page */ struct kqueue lwp_kqueue; /* for select/poll */ uint64_t lwp_kqueue_serial; /* for select/poll */ struct lwkt_token lwp_token; /* per-lwp token for signal/state */ struct spinlock lwp_spin; /* spinlock for signal handling */ - void *lwp_reserveds1; /* reserved for lwp_saveusp */ + TAILQ_HEAD(, vm_map_backing) lwp_lpmap_backing_list; void *lwp_reserveds2; /* reserved for lwp_saveupc */ }; @@ -345,7 +346,7 @@ struct proc { void *p_vmm; cpulock_t p_vmm_cpulock; /* count cpus in and kickout lock */ cpumask_t p_vmm_cpumask; /* cpus entering or in vmm */ - struct sys_upmap *p_upmap; /* user RO mappable per-process page */ + struct sys_upmap *p_upmap; /* user RW mappable per-process page */ forkid_t p_forkid; /* unique forkid */ struct sysreaper *p_reaper; /* reaper control */ void *p_reserveds[3]; /* reserved for future */ @@ -598,6 +599,8 @@ void prelezomb (struct proc *); void pstall (struct proc *, const char *, int); void lwpuserret(struct lwp *); void lwpkthreaddeferred(void); +void lwp_usermap(struct lwp *lp, int invfork); +void lwp_userunmap(struct lwp *lp); void proc_usermap(struct proc *p, int invfork); void proc_userunmap(struct proc *p); void reaper_hold(struct sysreaper *reap); diff --git a/sys/sys/signal.h b/sys/sys/signal.h index 2c04ef1cc8..fa1f0f1951 100644 --- a/sys/sys/signal.h +++ b/sys/sys/signal.h @@ -411,6 +411,10 @@ struct sigvec { */ __BEGIN_DECLS __sighandler_t *signal(int, __sighandler_t *); +#if __BSD_VISIBLE +int sigblockall(void); +int sigunblockall(void); +#endif __END_DECLS #endif /* !_SYS_SIGNAL_H_ */ diff --git a/sys/sys/signal2.h b/sys/sys/signal2.h index 949194cb37..c6ae8587b9 100644 --- a/sys/sys/signal2.h +++ b/sys/sys/signal2.h @@ -74,6 +74,56 @@ lwp_delsig(struct lwp *lp, int sig, int fromproc) #define CURSIG_LCK_TRACE(lp, ptok) __cursig(lp, 1, 1, ptok) #define CURSIG_NOBLOCK(lp) __cursig(lp, 0, 0, NULL) +/* + * This inline checks lpmap->blockallsigs, a user r/w accessible + * memory-mapped variable that allows a user thread to instantly + * mask and unmask all maskable signals without having to issue a + * system call. + * + * On the unmask count reaching 0, userland can check and clear + * bit 31 to determine if any signals arrived, then issue a dummy + * system call to ensure delivery. + */ +static __inline +void +__sig_condblockallsigs(sigset_t *mask, struct lwp *lp) +{ + struct sys_lpmap *lpmap; + uint32_t bas; + sigset_t tmp; + int trapsig; + + if ((lpmap = lp->lwp_lpmap) == NULL) + return; + + bas = lpmap->blockallsigs; + while (bas & 0x7FFFFFFFU) { + tmp = *mask; /* check maskable signals */ + SIG_CANTMASK(tmp); + if (SIGISEMPTY(tmp)) /* no unmaskable signals */ + return; + + /* + * Upon successful update to lpmap->blockallsigs remove + * all maskable signals, leaving only unmaskable signals. + * + * If lwp_sig is non-zero it represents a syncronous 'trap' + * signal which, being a synchronous trap, must be allowed. + */ + if (atomic_fcmpset_int(&lpmap->blockallsigs, &bas, + bas | 0x80000000U)) { + trapsig = lp->lwp_sig; + if (trapsig && SIGISMEMBER(*mask, trapsig)) { + SIGSETAND(*mask, sigcantmask_mask); + SIGADDSET(*mask, trapsig); + } else { + SIGSETAND(*mask, sigcantmask_mask); + } + break; + } + } +} + /* * Determine signal that should be delivered to process p, the current * process, 0 if none. 
If there is a pending stop signal with default @@ -97,6 +147,7 @@ __cursig(struct lwp *lp, int mayblock, int maytrace, int *ptok) tmpset = lwp_sigpend(lp); SIGSETNAND(tmpset, lp->lwp_sigmask); + SIG_CONDBLOCKALLSIGS(tmpset, lp); /* Nothing interesting happening? */ if (SIGISEMPTY(tmpset)) { @@ -105,7 +156,7 @@ __cursig(struct lwp *lp, int mayblock, int maytrace, int *ptok) * a) we may block and * b) somebody is tracing us. */ - if (!(mayblock && (p->p_flags & P_TRACED))) + if (mayblock == 0 || (p->p_flags & P_TRACED) == 0) return (0); } diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index f591753be4..50ded18d7e 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -144,6 +144,12 @@ struct sigacts { (set1).__bits[__i] &= ~(set2).__bits[__i]; \ } while (0) +#define SIGSETOR_CANTMASK(set) \ + SIGADDSET(set, SIGKILL), SIGADDSET(set, SIGSTOP) + +#define SIG_CONDBLOCKALLSIGS(set, lp) \ + __sig_condblockallsigs(&(set), lp) + #define SIG_CANTMASK(set) \ SIGDELSET(set, SIGKILL), SIGDELSET(set, SIGSTOP) @@ -196,6 +202,7 @@ struct proc; struct sigio; extern int sugid_coredump; /* Sysctl variable kern.sugid_coredump */ +extern sigset_t sigcantmask_mask; /* * Machine-independent functions: diff --git a/sys/sys/upmap.h b/sys/sys/upmap.h index da1697e26c..b9852bb319 100644 --- a/sys/sys/upmap.h +++ b/sys/sys/upmap.h @@ -43,9 +43,12 @@ #endif #define UPMAP_MAXPROCTITLE 1024 +#define LPMAP_MAXTHREADTITLE 1024 +#define LPMAP_MAPSIZE 65536 #define UPMAP_MAPSIZE 65536 #define KPMAP_MAPSIZE 65536 +#define LPMAP_VERSION 1 #define UPMAP_VERSION 1 #define KPMAP_VERSION 1 @@ -73,14 +76,31 @@ typedef struct ukpheader { UKPLEN_16 : UKPLEN_32) #define UKPLEN_DECODE(type) (1 << ((type >> 8) & 0x0F)) +/* + * Global types - may exist in all three mapping types + */ #define UKPTYPE_VERSION (0x0001 | UKPLEN_4) /* always first */ +/* + * /dev/lpmap - per-thread + */ +#define LPTYPE_RESERVEDINT0 (0x4010 | UKPLEN_4) +#define LPTYPE_RESERVEDINT1 (0x4011 | UKPLEN_4) +#define LPTYPE_BLOCKALLSIGS (0x4012 | UKPLEN_4) +#define LPTYPE_THREAD_TITLE (0x4013 | UKPLEN_1024) + +/* + * /dev/upmap - per-process + */ #define UPTYPE_RUNTICKS (0x0010 | UKPLEN_4) #define UPTYPE_FORKID (0x0011 | UKPLEN_8) #define UPTYPE_PID (0x0012 | UKPLEN_4) #define UPTYPE_PROC_TITLE (0x0013 | UKPLEN_1024) #define UPTYPE_INVFORK (0x0014 | UKPLEN_4) +/* + * /dev/kpmap - kernel-wide + */ #define KPTYPE_UPTICKS (0x8000 | UKPLEN_4) #define KPTYPE_TS_UPTIME (0x8001 | UKPLEN_TS) #define KPTYPE_TS_REALTIME (0x8002 | UKPLEN_TS) @@ -90,6 +110,36 @@ typedef struct ukpheader { #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) +/* + * (writable) user per-thread map via /dev/lpmap. + * + * ABSOLUTE LOCATIONS CAN CHANGE, ITERATE HEADERS FOR THE TYPE YOU DESIRE + * UNTIL YOU HIT TYPE 0, THEN CACHE THE RESULTING POINTER. + * + * If you insist, at least check that the version matches LPMAP_VERSION. + * + * -- + * + * The current thread can block all blockable signals by (atomically) + * incrementing blockallsigs. If the kernel receives a signal while + * the low 31 bits of blockallsigs are non-zero, the received signal + * will be made pending but not acted upon and bit 31 of blockallsigs + * will be set. The signal mask is not affected. + * + * Upon decrementing blockallsigs to 0 (low 31 bits to 0), again atomically, + * userland should then check to see if bit 31 is set, clear it, and then + * issue any real system call to force the kernel to re-check pending signals + * and act upon them. 
+ */ +struct sys_lpmap { + ukpheader_t header[64]; + uint32_t version; + uint32_t reserved01; + uint32_t reserved02; + uint32_t blockallsigs; + char thread_title[LPMAP_MAXTHREADTITLE]; +}; + /* * (writable) user per-process map via /dev/upmap. * diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index c498d8eb16..c4ef40b905 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -545,7 +545,8 @@ RetryFault: fakem.busy_count = PBUSY_LOCKED; fakem.valid = VM_PAGE_BITS_ALL; fakem.pat_mode = VM_MEMATTR_DEFAULT; - if (fs.entry->ba.uksmap(fs.entry->aux.dev, &fakem)) { + if (fs.entry->ba.uksmap(&fs.entry->ba, UKSMAPOP_FAULT, + fs.entry->aux.dev, &fakem)) { result = KERN_FAILURE; unlock_things(&fs); goto done2; @@ -1171,7 +1172,8 @@ RetryFault: fakem.busy_count = PBUSY_LOCKED; fakem.valid = VM_PAGE_BITS_ALL; fakem.pat_mode = VM_MEMATTR_DEFAULT; - if (fs.entry->ba.uksmap(fs.entry->aux.dev, &fakem)) { + if (fs.entry->ba.uksmap(&fs.entry->ba, UKSMAPOP_FAULT, + fs.entry->aux.dev, &fakem)) { *errorp = KERN_FAILURE; fs.m = NULL; unlock_things(&fs); @@ -1850,7 +1852,8 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex, goto readrest; } } - fs->first_ba->flags &= ~VM_MAP_BACK_EXCL_HEUR; + atomic_clear_int(&fs->first_ba->flags, + VM_MAP_BACK_EXCL_HEUR); break; /* break to PAGE HAS BEEN FOUND */ } @@ -1879,7 +1882,8 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex, /* * Allocating, must be exclusive. */ - fs->first_ba->flags |= VM_MAP_BACK_EXCL_HEUR; + atomic_set_int(&fs->first_ba->flags, + VM_MAP_BACK_EXCL_HEUR); if (fs->ba == fs->first_ba && fs->first_shared) { fs->first_shared = 0; vm_object_pip_wakeup(fs->first_ba->object); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index c44bd41765..35b6c9c1f7 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -248,7 +248,8 @@ kmem_alloc3(vm_map_t map, vm_size_t size, vm_subsys_t id, int kmflags) vm_object_reference_locked(&kernel_object); vm_map_insert(map, &count, &kernel_object, NULL, - addr, addr, addr + size, + addr, NULL, + addr, addr + size, VM_MAPTYPE_NORMAL, id, VM_PROT_ALL, VM_PROT_ALL, cow); vm_object_drop(&kernel_object); @@ -389,7 +390,8 @@ kmem_alloc_wait(vm_map_t map, vm_size_t size, vm_subsys_t id) } vm_map_insert(map, &count, NULL, NULL, - (vm_offset_t) 0, addr, addr + size, + (vm_offset_t)0, NULL, + addr, addr + size, VM_MAPTYPE_NORMAL, id, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); @@ -429,7 +431,8 @@ kmem_alloc_attr(vm_map_t map, vm_size_t size, vm_subsys_t id, vm_object_reference_locked(&kernel_object); vm_map_insert(map, &count, &kernel_object, NULL, - offset, addr, addr + size, + offset, NULL, + addr, addr + size, VM_MAPTYPE_NORMAL, id, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); @@ -501,7 +504,8 @@ kmem_init(void) if (addr < virtual2_start) { vm_map_insert(m, &count, NULL, NULL, - (vm_offset_t) 0, addr, virtual2_start, + (vm_offset_t) 0, NULL, + addr, virtual2_start, VM_MAPTYPE_NORMAL, VM_SUBSYS_RESERVED, VM_PROT_ALL, VM_PROT_ALL, 0); } @@ -510,7 +514,8 @@ kmem_init(void) if (addr < virtual_start) { vm_map_insert(m, &count, NULL, NULL, - (vm_offset_t) 0, addr, virtual_start, + (vm_offset_t) 0, NULL, + addr, virtual_start, VM_MAPTYPE_NORMAL, VM_SUBSYS_RESERVED, VM_PROT_ALL, VM_PROT_ALL, 0); } @@ -518,7 +523,8 @@ kmem_init(void) if (addr < KvaEnd) { vm_map_insert(m, &count, NULL, NULL, - (vm_offset_t) 0, addr, KvaEnd, + (vm_offset_t) 0, NULL, + addr, KvaEnd, VM_MAPTYPE_NORMAL, VM_SUBSYS_RESERVED, VM_PROT_ALL, VM_PROT_ALL, 0); } diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 
22be1506f8..a082a710f7 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -164,15 +164,15 @@ static void vmspace_drop_notoken(struct vmspace *vm); static void vm_map_entry_shadow(vm_map_entry_t entry); static vm_map_entry_t vm_map_entry_create(int *); static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *); -static void vm_map_entry_dispose_ba (vm_map_backing_t ba); +static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba); static void vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags); static void vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start); static void vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end); -static void vm_map_backing_attach (vm_map_backing_t ba); -static void vm_map_backing_detach (vm_map_backing_t ba); +static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba); +static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba); static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *); static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *); static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *); @@ -799,7 +799,7 @@ vm_map_entry_shadow(vm_map_entry_t entry) * * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS */ - vm_map_backing_detach(&entry->ba); + vm_map_backing_detach(entry, &entry->ba); *ba = entry->ba; /* previous ba */ entry->ba.object = result; /* new ba (at head of entry) */ entry->ba.backing_ba = ba; @@ -809,8 +809,8 @@ vm_map_entry_shadow(vm_map_entry_t entry) /* cpu localization twist */ result->pg_color = vm_quickcolor(); - vm_map_backing_attach(&entry->ba); - vm_map_backing_attach(ba); + vm_map_backing_attach(entry, &entry->ba); + vm_map_backing_attach(entry, ba); /* * Adjust the return storage. 
Drop the ref on source before @@ -860,7 +860,7 @@ vm_map_entry_allocate_object(vm_map_entry_t entry) entry->ba.offset); } entry->ba.object = obj; - vm_map_backing_attach(&entry->ba); + vm_map_backing_attach(entry, &entry->ba); } /* @@ -1043,26 +1043,44 @@ vm_map_entry_create(int *countp) } /* - * + * Attach and detach backing store elements */ static void -vm_map_backing_attach(vm_map_backing_t ba) +vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba) { - vm_object_t obj = ba->object; + vm_object_t obj; - lockmgr(&obj->backing_lk, LK_EXCLUSIVE); - TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry); - lockmgr(&obj->backing_lk, LK_RELEASE); + switch(entry->maptype) { + case VM_MAPTYPE_VPAGETABLE: + case VM_MAPTYPE_NORMAL: + obj = ba->object; + lockmgr(&obj->backing_lk, LK_EXCLUSIVE); + TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry); + lockmgr(&obj->backing_lk, LK_RELEASE); + break; + case VM_MAPTYPE_UKSMAP: + ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL); + break; + } } static void -vm_map_backing_detach(vm_map_backing_t ba) +vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba) { - vm_object_t obj = ba->object; + vm_object_t obj; - lockmgr(&obj->backing_lk, LK_EXCLUSIVE); - TAILQ_REMOVE(&obj->backing_list, ba, entry); - lockmgr(&obj->backing_lk, LK_RELEASE); + switch(entry->maptype) { + case VM_MAPTYPE_VPAGETABLE: + case VM_MAPTYPE_NORMAL: + obj = ba->object; + lockmgr(&obj->backing_lk, LK_EXCLUSIVE); + TAILQ_REMOVE(&obj->backing_list, ba, entry); + lockmgr(&obj->backing_lk, LK_RELEASE); + break; + case VM_MAPTYPE_UKSMAP: + ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL); + break; + } } /* @@ -1072,15 +1090,17 @@ vm_map_backing_detach(vm_map_backing_t ba) * We decrement the (possibly shared) element and kfree() on the * 1->0 transition. We only iterate to the next backing_ba when * the previous one went through a 1->0 transition. + * + * These can only be normal vm_object based backings. */ static void -vm_map_entry_dispose_ba(vm_map_backing_t ba) +vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba) { vm_map_backing_t next; while (ba) { - if (ba->object) { - vm_map_backing_detach(ba); + if (ba->map_object) { + vm_map_backing_detach(entry, ba); vm_object_deallocate(ba->object); } next = ba->backing_ba; @@ -1105,19 +1125,20 @@ vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp) switch(entry->maptype) { case VM_MAPTYPE_NORMAL: case VM_MAPTYPE_VPAGETABLE: - if (entry->ba.object) { - vm_map_backing_detach(&entry->ba); + if (entry->ba.map_object) { + vm_map_backing_detach(entry, &entry->ba); vm_object_deallocate(entry->ba.object); } break; case VM_MAPTYPE_SUBMAP: + break; case VM_MAPTYPE_UKSMAP: - /* XXX TODO */ + vm_map_backing_detach(entry, &entry->ba); break; default: break; } - vm_map_entry_dispose_ba(entry->ba.backing_ba); + vm_map_entry_dispose_ba(entry, entry->ba.backing_ba); /* * Cleanup for safety. @@ -1220,8 +1241,10 @@ vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) * making call to account for the new entry. XXX API is a bit messy. 
*/ int -vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux, - vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, +vm_map_insert(vm_map_t map, int *countp, + void *map_object, void *map_aux, + vm_ooffset_t offset, void *aux_info, + vm_offset_t start, vm_offset_t end, vm_maptype_t maptype, vm_subsys_t id, vm_prot_t prot, vm_prot_t max, int cow) { @@ -1361,6 +1384,7 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux, new_entry->ba.backing_ba = NULL; new_entry->ba.backing_count = 0; new_entry->ba.offset = offset; + new_entry->ba.aux_info = aux_info; new_entry->ba.flags = 0; new_entry->ba.pmap = map->pmap; @@ -1577,13 +1601,22 @@ vm_map_find(vm_map_t map, void *map_object, void *map_aux, { vm_offset_t start; vm_object_t object; + void *aux_info; int result; int count; - if (maptype == VM_MAPTYPE_UKSMAP) + /* + * UKSMAPs set aux_info to the tid of the calling thread. This is + * only used by /dev/lpmap (per-thread user/kernel shared page). + */ + aux_info = NULL; + if (maptype == VM_MAPTYPE_UKSMAP) { object = NULL; - else + if (curthread->td_lwp) + aux_info = (void *)(intptr_t)curthread->td_lwp->lwp_tid; + } else { object = map_object; + } start = *addr; @@ -1601,8 +1634,10 @@ vm_map_find(vm_map_t map, void *map_object, void *map_aux, } start = *addr; } - result = vm_map_insert(map, &count, map_object, map_aux, - offset, start, start + length, + result = vm_map_insert(map, &count, + map_object, map_aux, + offset, aux_info, + start, start + length, maptype, id, prot, max, cow); if (object) vm_object_drop(object); @@ -3419,14 +3454,20 @@ vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags) (flags & MAP_BACK_BASEOBJREFD) == 0) { vm_object_reference_quick(object); } - vm_map_backing_attach(ba); + vm_map_backing_attach(entry, ba); if ((flags & MAP_BACK_CLIPPED) == 0 && object->ref_count > 1) { vm_object_clear_flag(object, OBJ_ONEMAPPING); } + } else if (entry->maptype == VM_MAPTYPE_UKSMAP) { + vm_map_backing_attach(entry, ba); } if (ba->backing_ba == NULL) break; + + /* + * NOTE: The aux_info field is retained. + */ nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT); *nba = *ba->backing_ba; nba->offset += (ba->start - nba->start); /* += (new - old) */ @@ -3519,11 +3560,15 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, * * The fault-copy code doesn't work with virtual page * tables. + * + * NOTE: obj is not actually an object for all MAPTYPEs, + * just test against NULL. */ - if ((obj = dst_entry->ba.object) != NULL) { - vm_map_backing_detach(&dst_entry->ba); - dst_entry->ba.object = NULL; - vm_map_entry_dispose_ba(dst_entry->ba.backing_ba); + if (dst_entry->ba.map_object != NULL) { + vm_map_backing_detach(dst_entry, &dst_entry->ba); + dst_entry->ba.map_object = NULL; + vm_map_entry_dispose_ba(dst_entry, + dst_entry->ba.backing_ba); dst_entry->ba.backing_ba = NULL; dst_entry->ba.backing_count = 0; } @@ -3687,7 +3732,7 @@ vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map, ba = old_entry->ba.backing_ba; old_entry->ba.backing_ba = NULL; old_entry->ba.backing_count = 0; - vm_map_entry_dispose_ba(ba); + vm_map_entry_dispose_ba(old_entry, ba); } } object = NULL; /* object variable is now invalid */ @@ -3888,8 +3933,10 @@ vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize, * eliminate these as input parameters, and just * pass these values here in the insert call. 
*/ - rv = vm_map_insert(map, &count, NULL, NULL, - 0, *addrbos + max_ssize - init_ssize, + rv = vm_map_insert(map, &count, + NULL, NULL, + 0, NULL, + *addrbos + max_ssize - init_ssize, *addrbos + max_ssize, VM_MAPTYPE_NORMAL, VM_SUBSYS_STACK, prot, max, cow); @@ -4074,8 +4121,10 @@ Retry: addr = end; } - rv = vm_map_insert(map, &count, NULL, NULL, - 0, addr, stack_entry->ba.start, + rv = vm_map_insert(map, &count, + NULL, NULL, + 0, NULL, + addr, stack_entry->ba.start, VM_MAPTYPE_NORMAL, VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0); diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index abbcfda02b..cb8034276f 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -156,6 +156,10 @@ typedef enum { VM_SUBSYS_LIMIT /* end of list */ } vm_subsys_t; +#define UKSMAPOP_ADD 1 +#define UKSMAPOP_REM 2 +#define UKSMAPOP_FAULT 3 + /* * vm_map backing structure for specifying multiple backings. This * structure is NOT shared across pmaps but may be shared within a pmap. @@ -168,7 +172,12 @@ struct vm_map_backing { struct pmap *pmap; /* for vm_object extents */ struct vm_map_backing *backing_ba; /* backing store */ - TAILQ_ENTRY(vm_map_backing) entry; /* for vm_object extents */ + + /* + * Keep track of extents, typically via a vm_object but for uksmaps + * this can also be based off of a process or lwp. + */ + TAILQ_ENTRY(vm_map_backing) entry; /* * A vm_map_entry may reference an object, a submap, a uksmap, or a @@ -177,11 +186,19 @@ struct vm_map_backing { union { struct vm_object *object; /* vm_object */ struct vm_map *sub_map; /* belongs to another map */ - int (*uksmap)(struct cdev *dev, vm_page_t fake); + int (*uksmap)(struct vm_map_backing *entry, + int op, + struct cdev *dev, + vm_page_t fake); void *map_object; /* generic */ }; + void *aux_info; - vm_ooffset_t offset; /* absolute offset in obj */ + /* + * The offset field typically represents the absolute offset in the + * object, but can have other meanings for uksmaps. + */ + vm_ooffset_t offset; uint32_t flags; uint32_t backing_count; /* #entries backing us */ }; @@ -189,6 +206,7 @@ struct vm_map_backing { typedef struct vm_map_backing *vm_map_backing_t; #define VM_MAP_BACK_EXCL_HEUR 0x00000001U +#define VM_MAP_LWP_LINKED 0x00000002U /* * Address map entries consist of start and end addresses, a VM object @@ -599,8 +617,10 @@ int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_size_t, vm_offset_t vm_map_hint(struct proc *, vm_offset_t, vm_prot_t); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t, pmap_t); -int vm_map_insert (vm_map_t, int *, void *, void *, - vm_ooffset_t, vm_offset_t, vm_offset_t, +int vm_map_insert (vm_map_t, int *, + void *, void *, + vm_ooffset_t, void *, + vm_offset_t, vm_offset_t, vm_maptype_t, vm_subsys_t id, vm_prot_t, vm_prot_t, int); int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 0ea06908ca..1c09b29958 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1186,7 +1186,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_offset_t eaddr; vm_size_t esize; vm_size_t align; - int (*uksmap)(cdev_t dev, vm_page_t fake); + int (*uksmap)(vm_map_backing_t ba, int op, cdev_t dev, vm_page_t fake); struct vnode *vp; struct thread *td = curthread; struct proc *p; @@ -1325,6 +1325,9 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, * sharing permanently allocated kernel memory or * process-context-specific (per-process) data. 
* + * The object offset for uksmap represents the + * lwp_tid that did the mapping. + * * Force them to be shared. */ uksmap = vp->v_rdev->si_ops->d_uksmap; -- 2.41.0
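Usage note (not part of the patch): a minimal sketch of how a userland
thread might map its /dev/lpmap page and locate the blockallsigs word
by walking the ukpheader array, following the "iterate headers until
type 0" protocol documented in sys/sys/upmap.h above.  The helper name
and the mapping length are illustrative assumptions:

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <sys/upmap.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Returns a pointer to this thread's blockallsigs word, or NULL. */
	static uint32_t *
	lpmap_find_blockallsigs(void)
	{
		ukpheader_t *head;
		uint8_t *base;
		int fd;

		fd = open("/dev/lpmap", O_RDWR);
		if (fd < 0)
			return (NULL);
		base = mmap(NULL, LPMAP_MAPSIZE, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		close(fd);		/* the mapping survives the close */
		if (base == MAP_FAILED)
			return (NULL);

		/*
		 * Absolute field offsets are not guaranteed; walk the
		 * headers until type 0 and cache the resulting pointer.
		 */
		for (head = (ukpheader_t *)base; head->type; ++head) {
			if (head->type == LPTYPE_BLOCKALLSIGS)
				return ((uint32_t *)(base + head->offset));
		}
		munmap(base, LPMAP_MAPSIZE);
		return (NULL);
	}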