From 0adbcbd6bc12ddb6dccdf11bc0d5004c1831a619 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 16 Oct 2014 12:35:05 -0700
Subject: [PATCH] kernel - Add /dev/upmap and /dev/kpmap and sys/upmap.h

* Add two memory-mappable devices for accessing a per-process and a
  global kernel shared memory space.  These can be mapped to acquire
  certain information from the kernel more efficiently than the
  equivalent system calls.

  Userland programs using this feature should NOT map the sys_upmap and
  sys_kpmap structures directly (which is why they are in #ifdef _KERNEL
  sections in sys/upmap.h).  Instead, mmap the devices using
  UPMAP_MAPSIZE and KPMAP_MAPSIZE and parse the ukpheader[] array at the
  front of each area to locate the desired fields, then simply cache a
  pointer to each desired field.  The width of a field is encoded in the
  UPTYPE/KPTYPE elements and can be asserted if desired; user programs
  are not expected to handle integers of multiple sizes for the same
  field type.

* Add /dev/upmap.  A program can open and mmap() this device R+W and
  use it to access:

  header[...]  - See sys/upmap.h.  An array of headers, terminated by a
                 type=0 header, indicating where the various fields are
                 in the mapping.  Userland should use this instead of
                 mapping the struct sys_upmap structure directly.

  version      - The sys_upmap version, typically 1.

  runticks     - Scheduler run ticks (aggregate, all threads).  This may
                 be used by userland interpreters to determine when to
                 soft-switch.

  forkid       - A unique non-zero 64-bit fork identifier.  This is NOT
                 a pid.  It may be used by userland libraries to detect
                 a fork by comparing against a stored value.

  pid          - The current process pid.  This may be used to acquire
                 the pid without making further system calls.

  proc_title   - This starts out as an empty buffer and may be used to
                 set the process title.  To revert to the original
                 process title, set proc_title[0] to 0.

  NOTE! Userland may write to the entire buffer, but it is recommended
        that it only write to fields intended to be writable.

  NOTE! When a program forks, an area already mmap()d remains mmap()d
        but will point to the new process's area rather than the old
        one, so libraries do not need to do anything special at fork.

  NOTE! Access to this structure is cpu localized.

* Add /dev/kpmap.  A program can open and mmap() this device RO and use
  it to access:

  header[...]  - See sys/upmap.h.  An array of headers, terminated by a
                 type=0 header, indicating where the various fields are
                 in the mapping.  Userland should use this instead of
                 mapping the struct sys_kpmap structure directly.

  version      - The sys_kpmap version, typically 1.

  upticks      - System uptime tick counter (32-bit integer).
                 Monotonic, uncompensated.

  ts_uptime    - System uptime in struct timespec format at tick
                 resolution.  Monotonic, uncompensated.

  ts_realtime  - System realtime in struct timespec format at tick
                 resolution.  This is compensated, so reverse-indexing
                 is possible.

  tsc_freq     - If the system supports a TSC of some sort, the TSC
                 frequency is recorded here, else 0.

  tick_freq    - The tick resolution of ts_uptime and ts_realtime, and
                 the approximate tick resolution for the scheduler.
                 Typically 100.

  NOTE! Userland may only read from this buffer.

  NOTE! Access to this structure is NOT cpu localized.  A memory fence
        and double-check should be used when accessing non-atomic
        structures which might change, such as ts_uptime and
        ts_realtime.  XXX needs work.
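* For illustration, a minimal userland sketch of the header-walk
  protocol described above (a sketch against the userland-visible parts
  of sys/upmap.h, not code from this patch; error handling abbreviated):

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <sys/upmap.h>
    #include <fcntl.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Scan the ukpheader[] array until the type=0 terminator */
    static void *
    ukp_find(void *base, uint16_t type)
    {
            ukpheader_t *hdr;

            for (hdr = (ukpheader_t *)base; hdr->type; ++hdr) {
                    if (hdr->type == type)
                            return ((char *)base + hdr->offset);
            }
            return (NULL);
    }

    int
    main(void)
    {
            int32_t *version;
            int64_t *tsc_freq;
            void *kp;
            int fd;

            fd = open("/dev/kpmap", O_RDONLY);
            kp = mmap(NULL, KPMAP_MAPSIZE, PROT_READ, MAP_SHARED, fd, 0);
            close(fd);              /* the mapping survives the close */

            version = ukp_find(kp, UKPTYPE_VERSION);
            if (version == NULL || *version != KPMAP_VERSION)
                    return (1);     /* layout newer than we understand */
            tsc_freq = ukp_find(kp, KPTYPE_TSC_FREQ);
            if (tsc_freq)
                    printf("tsc_freq = %jd\n", (intmax_t)*tsc_freq);
            return (0);
    }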
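* Fork detection via forkid might then look like this (hypothetical
  helper; it assumes upmap_forkid was located once with
  ukp_find(up, UPTYPE_FORKID), and relies on the NOTE above that after
  fork() the same mapped address shows the child's area):

    #include <stdint.h>

    static uint64_t saved_forkid;          /* forkid_t is uint64_t */
    static const uint64_t *upmap_forkid;   /* points into /dev/upmap */

    static int
    did_fork_since_last_check(void)
    {
            if (*upmap_forkid != saved_forkid) {
                    saved_forkid = *upmap_forkid;
                    return (1);
            }
            return (0);
    }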
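* Setting and reverting the process title through the writable
  /dev/upmap mapping (again a sketch; ukp_find() is the helper from the
  first sketch and error handling is abbreviated):

    #include <sys/mman.h>
    #include <sys/upmap.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static char *upmap_title;      /* located once, stays valid at fork */

    static void
    set_proc_title(const char *title)
    {
            if (upmap_title == NULL) {
                    void *up;
                    int fd;

                    fd = open("/dev/upmap", O_RDWR);
                    up = mmap(NULL, UPMAP_MAPSIZE, PROT_READ | PROT_WRITE,
                              MAP_SHARED, fd, 0);
                    close(fd);
                    upmap_title = ukp_find(up, UPTYPE_PROC_TITLE);
            }
            if (title)
                    strlcpy(upmap_title, title, UPMAP_MAXPROCTITLE);
            else
                    upmap_title[0] = 0;    /* revert to original title */
    }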
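* The fence-and-double-check for the non-atomic kpmap fields might look
  like the following.  The retry protocol is NOT defined by this patch;
  the sketch assumes upticks advances whenever ts_uptime/ts_realtime are
  rewritten, and uses C11 fences (both pointers located via ukp_find()
  with KPTYPE_UPTICKS and KPTYPE_TS_REALTIME):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <time.h>

    static struct timespec
    kpmap_read_realtime(const volatile int32_t *upticks,
                        const volatile struct timespec *ts_realtime)
    {
            struct timespec ts;
            int32_t t0;

            do {
                    t0 = *upticks;
                    atomic_thread_fence(memory_order_acquire);
                    ts.tv_sec = ts_realtime->tv_sec;
                    ts.tv_nsec = ts_realtime->tv_nsec;
                    atomic_thread_fence(memory_order_acquire);
            } while (t0 != *upticks);   /* re-read if a tick intervened */

            return (ts);
    }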
--- sys/dev/drm/i915/i915_gem.c | 15 +- sys/emulation/linux/i386/imgact_linux.c | 6 +- sys/emulation/linux/linux_misc.c | 14 +- sys/kern/imgact_aout.c | 8 +- sys/kern/imgact_elf.c | 8 +- sys/kern/imgact_gzip.c | 7 +- sys/kern/init_main.c | 31 +- sys/kern/kern_clock.c | 16 ++ sys/kern/kern_exit.c | 3 + sys/kern/kern_fork.c | 2 + sys/kern/kern_memio.c | 132 +++++++-- sys/kern/kern_proc.c | 75 ++++- sys/kern/kern_slaballoc.c | 11 +- sys/kern/link_elf_obj.c | 5 +- sys/kern/sys_pipe.c | 9 +- sys/kern/sys_process.c | 18 +- sys/kern/sysv_shm.c | 9 +- sys/kern/vfs_bio.c | 4 +- sys/sys/device.h | 4 +- sys/sys/globaldata.h | 5 +- sys/sys/proc.h | 10 +- sys/sys/upmap.h | 133 +++++++++ sys/vfs/procfs/procfs_status.c | 25 +- sys/vm/vm.h | 7 + sys/vm/vm_fault.c | 95 ++++++- sys/vm/vm_kern.c | 67 ++--- sys/vm/vm_map.c | 357 ++++++++++++++++-------- sys/vm/vm_map.h | 24 +- sys/vm/vm_mmap.c | 50 +++- sys/vm/vm_unix.c | 8 +- 30 files changed, 876 insertions(+), 282 deletions(-) create mode 100644 sys/sys/upmap.h diff --git a/sys/dev/drm/i915/i915_gem.c b/sys/dev/drm/i915/i915_gem.c index a7364ba059..3195a25f9f 100644 --- a/sys/dev/drm/i915/i915_gem.c +++ b/sys/dev/drm/i915/i915_gem.c @@ -701,13 +701,14 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, vm_object_hold(obj->vm_obj); vm_object_reference_locked(obj->vm_obj); vm_object_drop(obj->vm_obj); - rv = vm_map_find(map, obj->vm_obj, args->offset, &addr, args->size, - PAGE_SIZE, /* align */ - TRUE, /* fitit */ - VM_MAPTYPE_NORMAL, /* maptype */ - VM_PROT_READ | VM_PROT_WRITE, /* prot */ - VM_PROT_READ | VM_PROT_WRITE, /* max */ - MAP_SHARED /* cow */); + rv = vm_map_find(map, obj->vm_obj, NULL, + args->offset, &addr, args->size, + PAGE_SIZE, /* align */ + TRUE, /* fitit */ + VM_MAPTYPE_NORMAL, /* maptype */ + VM_PROT_READ | VM_PROT_WRITE, /* prot */ + VM_PROT_READ | VM_PROT_WRITE, /* max */ + MAP_SHARED /* cow */); if (rv != KERN_SUCCESS) { vm_object_deallocate(obj->vm_obj); error = -vm_mmap_to_errno(rv); diff --git a/sys/emulation/linux/i386/imgact_linux.c b/sys/emulation/linux/i386/imgact_linux.c index 7369ed6f0f..cd272bc8d3 100644 --- a/sys/emulation/linux/i386/imgact_linux.c +++ b/sys/emulation/linux/i386/imgact_linux.c @@ -127,7 +127,8 @@ exec_linux_imgact(struct image_params *imgp) * Map text+data+bss read/write/execute */ vmaddr = virtual_offset; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, + error = vm_map_find(&vmspace->vm_map, NULL, NULL, + 0, &vmaddr, a_out->a_text + a_out->a_data + bss_size, PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, @@ -199,7 +200,8 @@ exec_linux_imgact(struct image_params *imgp) */ if (bss_size != 0) { vmaddr = virtual_offset + a_out->a_text + a_out->a_data; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, + error = vm_map_find(&vmspace->vm_map, NULL, NULL, + 0, &vmaddr, bss_size, PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, diff --git a/sys/emulation/linux/linux_misc.c b/sys/emulation/linux/linux_misc.c index 851bdecbc6..3d1575029e 100644 --- a/sys/emulation/linux/linux_misc.c +++ b/sys/emulation/linux/linux_misc.c @@ -415,12 +415,11 @@ sys_linux_uselib(struct linux_uselib_args *args) vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ - error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, - &vmaddr, a_out->a_text + a_out->a_data, + error = vm_map_find(&p->p_vmspace->vm_map, NULL, NULL, + 0, &vmaddr, a_out->a_text + a_out->a_data, PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto 
cleanup; @@ -472,12 +471,11 @@ sys_linux_uselib(struct linux_uselib_args *args) a_out->a_data; /* allocate some 'anon' space */ - error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, - &vmaddr, bss_size, + error = vm_map_find(&p->p_vmspace->vm_map, NULL, NULL, + 0, &vmaddr, bss_size, PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 89ca8fffe3..d27009f8b9 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -184,7 +184,7 @@ exec_aout_imgact(struct image_params *imgp) vm_object_reference_locked(object); text_end = virtual_offset + a_out->a_text; - error = vm_map_insert(map, &count, object, + error = vm_map_insert(map, &count, object, NULL, file_offset, virtual_offset, text_end, VM_MAPTYPE_NORMAL, @@ -201,7 +201,7 @@ exec_aout_imgact(struct image_params *imgp) data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference_locked(object); - error = vm_map_insert(map, &count, object, + error = vm_map_insert(map, &count, object, NULL, file_offset + a_out->a_text, text_end, data_end, VM_MAPTYPE_NORMAL, @@ -217,8 +217,8 @@ exec_aout_imgact(struct image_params *imgp) vm_object_drop(object); if (bss_size) { - error = vm_map_insert(map, &count, NULL, 0, - data_end, data_end + bss_size, + error = vm_map_insert(map, &count, NULL, NULL, + 0, data_end, data_end + bss_size, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 33b1c0a5f9..b0e0460ffb 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -304,7 +304,7 @@ __elfN(load_section)(struct proc *p, struct vmspace *vmspace, struct vnode *vp, count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, &count, - object, + object, NULL, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ @@ -346,8 +346,10 @@ __elfN(load_section)(struct proc *p, struct vmspace *vmspace, struct vnode *vp, count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, &count, - NULL, 0, - map_addr, map_addr + map_len, + NULL, NULL, + 0, + map_addr, + map_addr + map_len, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c index 17625ab9ed..5096495e3f 100644 --- a/sys/kern/imgact_gzip.c +++ b/sys/kern/imgact_gzip.c @@ -247,11 +247,10 @@ do_aout_hdr(struct imgact_gzip * gz) vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; error = vm_map_find(&vmspace->vm_map, - NULL, 0, - &vmaddr, gz->bss_size, PAGE_SIZE, + NULL, NULL, + 0, &vmaddr, gz->bss_size, PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index b1c20a331c..f09bfdfa7c 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -90,6 +90,7 @@ struct proc *initproc; struct proc proc0; struct lwp lwp0; struct thread thread0; +struct sys_kpmap *kpmap; int cmask = CMASK; u_int cpu_mi_feature; @@ -583,11 +584,11 @@ start_init(void *dummy, struct trapframe *frame) * Need just enough stack to hold the faked-up "execve()" arguments. 
*/ addr = trunc_page(USRSTACK - PAGE_SIZE); - error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, - PAGE_SIZE, PAGE_SIZE, + error = vm_map_find(&p->p_vmspace->vm_map, NULL, NULL, + 0, &addr, PAGE_SIZE, + PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (error) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; @@ -717,6 +718,28 @@ kick_init(const void *udata __unused) } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) +static void +kpmap_init(const void *udata __unused) +{ + kpmap = kmalloc(roundup2(sizeof(*kpmap), PAGE_SIZE), + M_TEMP, M_ZERO | M_WAITOK); + + kpmap->header[0].type = UKPTYPE_VERSION; + kpmap->header[0].offset = offsetof(struct sys_kpmap, version); + kpmap->header[1].type = KPTYPE_UPTICKS; + kpmap->header[1].offset = offsetof(struct sys_kpmap, upticks); + kpmap->header[2].type = KPTYPE_TS_UPTIME; + kpmap->header[2].offset = offsetof(struct sys_kpmap, ts_uptime); + kpmap->header[3].type = KPTYPE_TS_REALTIME; + kpmap->header[3].offset = offsetof(struct sys_kpmap, ts_realtime); + kpmap->header[4].type = KPTYPE_TSC_FREQ; + kpmap->header[4].offset = offsetof(struct sys_kpmap, tsc_freq); + kpmap->header[5].type = KPTYPE_TICK_FREQ; + kpmap->header[5].offset = offsetof(struct sys_kpmap, tick_freq); + kpmap->version = KPMAP_VERSION; +} +SYSINIT(kpmapinit, SI_BOOT1_POST, SI_ORDER_FIRST, kpmap_init, NULL) + /* * Machine independant globaldata initialization * diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 0ec5edc574..81ee172325 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -253,6 +254,10 @@ initclocks(void *dummy) /*psratio = profhz / stathz;*/ initclocks_pcpu(); clocks_running = 1; + if (kpmap) { + kpmap->tsc_freq = (uint64_t)tsc_frequency; + kpmap->tick_freq = hz; + } } /* @@ -551,6 +556,14 @@ hardclock(systimer_t info, int in_ipi __unused, struct intrframe *frame) */ cpu_sfence(); basetime_index = ni; + + /* + * Update kpmap on each tick + */ + if (kpmap) { + getnanouptime(&kpmap->ts_uptime); + getnanotime(&kpmap->ts_realtime); + } } /* @@ -576,6 +589,9 @@ hardclock(systimer_t info, int in_ipi __unused, struct intrframe *frame) */ if ((p = curproc) != NULL && lwkt_trytoken(&p->p_token)) { crit_enter_hard(); + if (p->p_upmap) + ++p->p_upmap->runticks; + if (frame && CLKF_USERMODE(frame) && timevalisset(&p->p_timer[ITIMER_VIRTUAL].it_value) && itimerdecr(&p->p_timer[ITIMER_VIRTUAL], ustick) == 0) { diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 85290fbfd3..9a201808d5 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -764,6 +764,7 @@ lwp_wait(struct lwp *lp) * will be cleared temporarily if a thread gets preempted. */ while ((td->td_flags & (TDF_RUNNING | + TDF_RUNQ | TDF_PREEMPT_LOCK | TDF_EXITING)) != TDF_EXITING) { tsleep(lp, 0, "lwpwait3", 1); @@ -788,6 +789,7 @@ lwp_dispose(struct lwp *lp) KKASSERT(lwkt_preempted_proc() != lp); KKASSERT(td->td_refs == 0); KKASSERT((td->td_flags & (TDF_RUNNING | + TDF_RUNQ | TDF_PREEMPT_LOCK | TDF_EXITING)) == TDF_EXITING); @@ -984,6 +986,7 @@ loop: * the zombie list. 
*/ proc_remove_zombie(p); + proc_userunmap(p); lwkt_reltoken(&p->p_token); leavepgrp(p); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index b5fbc4bb85..9c4d98c9eb 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -379,6 +379,8 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) * once the process is on the allproc list to avoid things such * as competing modifications to p_flags. */ + mycpu->gd_forkid += ncpus; + p2->p_forkid = mycpu->gd_forkid + mycpu->gd_cpuid; p2->p_lasttid = -1; /* first tid will be 0 */ p2->p_stat = SIDL; diff --git a/sys/kern/kern_memio.c b/sys/kern/kern_memio.c index 3f3d46c841..8f391e71c6 100644 --- a/sys/kern/kern_memio.c +++ b/sys/kern/kern_memio.c @@ -71,8 +71,11 @@ static d_close_t mmclose; static d_read_t mmread; static d_write_t mmwrite; static d_ioctl_t mmioctl; +#if 0 static d_mmap_t memmmap; +#endif static d_kqfilter_t mmkqfilter; +static int memuksmap(cdev_t dev, vm_page_t fake); #define CDEV_MAJOR 2 static struct dev_ops mem_ops = { @@ -83,7 +86,10 @@ static struct dev_ops mem_ops = { .d_write = mmwrite, .d_ioctl = mmioctl, .d_kqfilter = mmkqfilter, +#if 0 .d_mmap = memmmap, +#endif + .d_uksmap = memuksmap }; static int rand_bolt; @@ -283,6 +289,8 @@ mmrw(cdev_t dev, struct uio *uio, int flags) c = min(c, poolsize); error = uiomove(buf, (int)c, uio); continue; + /* case 5: read/write not supported, mmap only */ + /* case 6: read/write not supported, mmap only */ case 12: /* * minor device 12 (/dev/zero) is source of nulls @@ -326,45 +334,94 @@ mmwrite(struct dev_write_args *ap) return(mmrw(ap->a_head.a_dev, ap->a_uio, ap->a_ioflag)); } - - - - /*******************************************************\ * allow user processes to MMAP some memory sections * * instead of going through read/write * \*******************************************************/ +static int user_kernel_mapping(int num, vm_ooffset_t offset, + vm_ooffset_t *resultp); + +#if 0 + static int memmmap(struct dev_mmap_args *ap) { cdev_t dev = ap->a_head.a_dev; + vm_ooffset_t result; + int error; switch (minor(dev)) { case 0: /* * minor device 0 is physical memory */ -#if defined(__i386__) - ap->a_result = i386_btop(ap->a_offset); -#elif defined(__x86_64__) - ap->a_result = x86_64_btop(ap->a_offset); -#endif - return 0; + ap->a_result = atop(ap->a_offset); + error = 0; + break; case 1: /* * minor device 1 is kernel memory */ -#if defined(__i386__) - ap->a_result = i386_btop(vtophys(ap->a_offset)); -#elif defined(__x86_64__) - ap->a_result = x86_64_btop(vtophys(ap->a_offset)); + ap->a_result = atop(vtophys(ap->a_offset)); + error = 0; + break; + case 5: + case 6: + /* + * minor device 5 is /dev/upmap (see sys/upmap.h) + * minor device 6 is /dev/kpmap (see sys/upmap.h) + */ + result = 0; + error = user_kernel_mapping(minor(dev), ap->a_offset, &result); + ap->a_result = atop(result); + break; + default: + error = EINVAL; + break; + } + return error; +} + #endif - return 0; +static int +memuksmap(cdev_t dev, vm_page_t fake) +{ + vm_ooffset_t result; + int error; + + switch (minor(dev)) { + case 0: + /* + * minor device 0 is physical memory + */ + fake->phys_addr = ptoa(fake->pindex); + error = 0; + break; + case 1: + /* + * minor device 1 is kernel memory + */ + fake->phys_addr = vtophys(ptoa(fake->pindex)); + error = 0; + break; + case 5: + case 6: + /* + * minor device 5 is /dev/upmap (see sys/upmap.h) + * minor device 6 is /dev/kpmap (see sys/upmap.h) + */ + result = 0; + error = user_kernel_mapping(minor(dev), + ptoa(fake->pindex), &result); + fake->phys_addr = 
result; + break; default: - return EINVAL; + error = EINVAL; + break; } + return error; } static int @@ -601,6 +658,47 @@ iszerodev(cdev_t dev) return (zerodev == dev); } +/* + * /dev/upmap and /dev/kpmap. + */ +static int +user_kernel_mapping(int num, vm_ooffset_t offset, vm_ooffset_t *resultp) +{ + struct proc *p = curproc; + int error; + + if (p == NULL) + return (EINVAL); + error = EINVAL; + + switch(num) { + case 5: + /* + * /dev/upmap - maps RW per-process shared user-kernel area. + */ + if (p->p_upmap == NULL) + proc_usermap(p); + if (p->p_upmap && offset == 0) { + /* only good for current process */ + *resultp = pmap_kextract((vm_offset_t)p->p_upmap); + error = 0; + } + break; + case 6: + /* + * /dev/kpmap - maps RO shared kernel global page + */ + if (kpmap && offset == 0) { + *resultp = pmap_kextract((vm_offset_t)kpmap); + error = 0; + } + break; + default: + break; + } + return error; +} + static void mem_drvinit(void *unused) { @@ -614,6 +712,8 @@ mem_drvinit(void *unused) make_dev(&mem_ops, 2, UID_ROOT, GID_WHEEL, 0666, "null"); make_dev(&mem_ops, 3, UID_ROOT, GID_WHEEL, 0644, "random"); make_dev(&mem_ops, 4, UID_ROOT, GID_WHEEL, 0644, "urandom"); + make_dev(&mem_ops, 5, UID_ROOT, GID_WHEEL, 0666, "upmap"); + make_dev(&mem_ops, 6, UID_ROOT, GID_WHEEL, 0444, "kpmap"); zerodev = make_dev(&mem_ops, 12, UID_ROOT, GID_WHEEL, 0666, "zero"); make_dev(&mem_ops, 14, UID_ROOT, GID_WHEEL, 0600, "io"); } diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index ea9457ca85..950d2fa49c 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -1154,6 +1154,49 @@ lwpkthreaddeferred(void) } } +void +proc_usermap(struct proc *p) +{ + struct sys_upmap *upmap; + + lwkt_gettoken(&p->p_token); + upmap = kmalloc(roundup2(sizeof(*upmap), PAGE_SIZE), M_PROC, + M_WAITOK | M_ZERO); + if (p->p_upmap == NULL) { + upmap->header[0].type = UKPTYPE_VERSION; + upmap->header[0].offset = offsetof(struct sys_upmap, version); + upmap->header[1].type = UPTYPE_RUNTICKS; + upmap->header[1].offset = offsetof(struct sys_upmap, runticks); + upmap->header[2].type = UPTYPE_FORKID; + upmap->header[2].offset = offsetof(struct sys_upmap, forkid); + upmap->header[3].type = UPTYPE_PID; + upmap->header[3].offset = offsetof(struct sys_upmap, pid); + upmap->header[4].type = UPTYPE_PROC_TITLE; + upmap->header[4].offset = offsetof(struct sys_upmap,proc_title); + + upmap->version = UPMAP_VERSION; + upmap->pid = p->p_pid; + upmap->forkid = p->p_forkid; + p->p_upmap = upmap; + } else { + kfree(upmap, M_PROC); + } + lwkt_reltoken(&p->p_token); +} + +void +proc_userunmap(struct proc *p) +{ + struct sys_upmap *upmap; + + lwkt_gettoken(&p->p_token); + if ((upmap = p->p_upmap) != NULL) { + p->p_upmap = NULL; + kfree(upmap, M_PROC); + } + lwkt_reltoken(&p->p_token); +} + /* * Scan all processes on the allproc list. The process is automatically * held for the callback. A return value of -1 terminates the loop. @@ -1600,11 +1643,33 @@ sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) error = EPERM; goto done; } - if (req->oldptr && (pa = p->p_args) != NULL) { - refcount_acquire(&pa->ar_ref); - error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); - if (refcount_release(&pa->ar_ref)) - kfree(pa, M_PARGS); + if (req->oldptr) { + if (p->p_upmap != NULL && p->p_upmap->proc_title[0]) { + /* + * Args set via writable user process mmap. + * We must calculate the string length manually + * because the user data can change at any time. 
+ */ + size_t n; + char *base; + + base = p->p_upmap->proc_title; + for (n = 0; n < UPMAP_MAXPROCTITLE - 1; ++n) { + if (base[n] == 0) + break; + } + error = SYSCTL_OUT(req, base, n); + if (error == 0) + error = SYSCTL_OUT(req, "", 1); + } else if ((pa = p->p_args) != NULL) { + /* + * Args set by setproctitle() sysctl. + */ + refcount_acquire(&pa->ar_ref); + error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); + if (refcount_release(&pa->ar_ref)) + kfree(pa, M_PARGS); + } } if (req->newptr == NULL) goto done; diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index f40db032dc..28e014ce15 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -1482,11 +1482,12 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) */ vm_object_hold(&kernel_object); vm_object_reference_locked(&kernel_object); - vm_map_insert(&kernel_map, &count, - &kernel_object, addr, addr, addr + size, - VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + vm_map_insert(&kernel_map, &count, + &kernel_object, NULL, + addr, addr, addr + size, + VM_MAPTYPE_NORMAL, + VM_PROT_ALL, VM_PROT_ALL, + 0); vm_object_drop(&kernel_object); vm_map_set_wired_quick(&kernel_map, addr, size, &count); vm_map_unlock(&kernel_map); diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c index e9bdcd1f3e..f37ce4239b 100644 --- a/sys/kern/link_elf_obj.c +++ b/sys/kern/link_elf_obj.c @@ -656,8 +656,9 @@ link_elf_obj_load_file(const char *filename, linker_file_t * result) vm_object_drop(ef->object); #else mapbase = KERNBASE; - error = vm_map_find(&kernel_map, ef->object, 0, &mapbase, - round_page(mapsize), PAGE_SIZE, + error = vm_map_find(&kernel_map, ef->object, NULL, + 0, &mapbase, round_page(mapsize), + PAGE_SIZE, TRUE, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, FALSE); vm_object_drop(ef->object); diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 33804519ce..2ea4fa6e7a 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -321,12 +321,11 @@ pipespace(struct pipe *cpipe, int size) object = vm_object_allocate(OBJT_DEFAULT, npages); buffer = (caddr_t)vm_map_min(&kernel_map); - error = vm_map_find(&kernel_map, object, 0, - (vm_offset_t *)&buffer, - size, PAGE_SIZE, + error = vm_map_find(&kernel_map, object, NULL, + 0, (vm_offset_t *)&buffer, size, + PAGE_SIZE, 1, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { vm_object_deallocate(object); diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 098499dc2b..26d04cd21c 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -85,12 +85,11 @@ pread (struct proc *procp, unsigned int addr, unsigned int *retval) { vm_map_lookup_done (tmap, out_entry, 0); /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (&kernel_map, object, IDX_TO_OFF(pindex), - &kva, - PAGE_SIZE, PAGE_SIZE, + rv = vm_map_find (&kernel_map, object, NULL, + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, + PAGE_SIZE, 0, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference XXX (object); @@ -172,12 +171,11 @@ pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { return EFAULT; /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (&kernel_map, object, IDX_TO_OFF(pindex), - &kva, - PAGE_SIZE, PAGE_SIZE, + rv = vm_map_find (&kernel_map, object, NULL, + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, + PAGE_SIZE, 0, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); 
+ VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference XXX (object); diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 8edd773614..dc971adbd3 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -329,13 +329,12 @@ again: vm_object_chain_wait(shm_handle->shm_object, 0); vm_object_reference_locked(shm_handle->shm_object); rv = vm_map_find(&p->p_vmspace->vm_map, - shm_handle->shm_object, 0, - &attach_va, - size, align, + shm_handle->shm_object, NULL, + 0, &attach_va, size, + align, ((flags & MAP_FIXED) ? 0 : 1), VM_MAPTYPE_NORMAL, - prot, prot, - 0); + prot, prot, 0); vm_object_drop(shm_handle->shm_object); if (rv != KERN_SUCCESS) { vm_object_deallocate(shm_handle->shm_object); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 45ef3dbe00..5588f2dc39 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -2334,8 +2334,8 @@ restart: } if (addr) { vm_map_insert(&buffer_map, &count, - NULL, 0, - addr, addr + maxsize, + NULL, NULL, + 0, addr, addr + maxsize, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); diff --git a/sys/sys/device.h b/sys/sys/device.h index 0219f2cc2b..b9aca6c00e 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -48,6 +48,7 @@ struct cdev; struct ucred; struct devfs_bitmap; +struct vm_page; /* * This structure is at the base of every device args structure @@ -247,7 +248,8 @@ struct dev_ops { d_kqfilter_t *d_kqfilter; d_clone_t *d_clone; /* clone from base dev_ops */ d_revoke_t *d_revoke; -#define dev_ops_last_field d_revoke + int (*d_uksmap)(struct cdev *dev, struct vm_page *fake); +#define dev_ops_last_field d_uksmap }; /* diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index 0042d38cd7..cbe6be3871 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -172,7 +172,10 @@ struct globaldata { struct lwkt_tokref gd_handoff; /* hand-off tokref */ void *gd_delayed_wakeup[2]; void *gd_sample_pc; /* sample program ctr/tr */ - void *gd_preserved[5]; /* future fields */ + void *gd_reserved_pcpu_mmap; /* future */ + uint64_t gd_forkid; /* per-cpu unique inc ncpus */ + uint64_t gd_reserved64[4]; + void *gd_preserved[4]; /* future fields */ /* extended by */ }; diff --git a/sys/sys/proc.h b/sys/sys/proc.h index cb2dbbd5b8..d569ca12d3 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -34,11 +34,7 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * @(#)proc.h 8.15 (Berkeley) 5/19/95 - * $FreeBSD: src/sys/sys/proc.h,v 1.99.2.9 2003/06/06 20:21:32 tegge Exp $ */ - #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ @@ -55,6 +51,7 @@ #include /* For struct rtprio. */ #include #include +#include #ifndef _KERNEL #include /* For structs itimerval, timeval. 
*/ #endif @@ -344,6 +341,9 @@ struct proc { void *p_vmm; cpulock_t p_vmm_cpulock; /* count cpus in and kickout lock */ cpumask_t p_vmm_cpumask; /* cpus entering or in vmm */ + struct sys_upmap *p_upmap; /* user RO mappable per-process page */ + forkid_t p_forkid; /* unique forkid */ + void *p_reserveds[4]; /* reserved for future */ }; #define lwp_wchan lwp_thread->td_wchan @@ -575,6 +575,8 @@ void prelezomb (struct proc *); void pstall (struct proc *, const char *, int); void lwpuserret(struct lwp *); void lwpkthreaddeferred(void); +void proc_usermap(struct proc *p); +void proc_userunmap(struct proc *p); u_int32_t procrunnable (void); diff --git a/sys/sys/upmap.h b/sys/sys/upmap.h new file mode 100644 index 0000000000..3af5c5ca7c --- /dev/null +++ b/sys/sys/upmap.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_UPMAP_H_ +#define _SYS_UPMAP_H_ + +#ifndef _SYS_TYPES_H_ +#include +#endif +#ifndef _SYS_TIME_H_ +#include +#endif + +#define UPMAP_MAXPROCTITLE 1024 +#define UPMAP_MAPSIZE 65536 +#define KPMAP_MAPSIZE 65536 + +#define UPMAP_VERSION 1 +#define KPMAP_VERSION 1 + +typedef uint64_t forkid_t; + +typedef struct ukpheader { + uint16_t type; /* element type */ + uint16_t offset; /* offset from map base, max 65535 */ +} ukpheader_t; + +#define UKPLEN_MASK 0x0F00 +#define UKPLEN_1 0x0000 +#define UKPLEN_2 0x0100 +#define UKPLEN_4 0x0200 +#define UKPLEN_8 0x0300 +#define UKPLEN_16 0x0400 +#define UKPLEN_32 0x0500 +#define UKPLEN_64 0x0600 +#define UKPLEN_128 0x0700 +#define UKPLEN_256 0x0800 +#define UKPLEN_512 0x0900 +#define UKPLEN_1024 0x0A00 + +#define UKPLEN_TS ((sizeof(struct timespec) == 8) ? 
\ + UKPLEN_8 : UKPLEN_16) + +#define UKPTYPE_VERSION (0x0001 | UKPLEN_4) /* always first */ + +#define UPTYPE_RUNTICKS (0x0010 | UKPLEN_4) +#define UPTYPE_FORKID (0x0011 | UKPLEN_8) +#define UPTYPE_PID (0x0012 | UKPLEN_4) +#define UPTYPE_PROC_TITLE (0x0013 | UKPLEN_1024) + +#define KPTYPE_UPTICKS (0x8000 | UKPLEN_4) +#define KPTYPE_TS_UPTIME (0x8001 | UKPLEN_TS) +#define KPTYPE_TS_REALTIME (0x8002 | UKPLEN_TS) +#define KPTYPE_TSC_FREQ (0x8003 | UKPLEN_8) +#define KPTYPE_TICK_FREQ (0x8004 | UKPLEN_8) + +#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) + +/* + * (writable) user per-process map via /dev/upmap. + * + * ABSOLUTE LOCATIONS CAN CHANGE, ITERATE HEADERS FOR THE TYPE YOU DESIRE + * UNTIL YOU HIT TYPE 0, THEN CACHE THE RESULTING POINTER. + * + * If you insist, at least check that the version matches UPMAP_VERSION. + */ +struct sys_upmap { + ukpheader_t header[64]; + uint32_t version; + uint32_t runticks; /* running scheduler ticks */ + forkid_t forkid; /* unique 2^64 (fork detect) NOT MONO */ + uint32_t unused01; /* cpu migrations (kpmap detect) */ + pid_t pid; /* process id */ + uint32_t reserved[16]; + char proc_title[UPMAP_MAXPROCTITLE]; +}; + +/* + * (read-only) kernel per-cpu map via /dev/kpmap. + * + * ABSOLUTE LOCATIONS CAN CHANGE, ITERATE HEADERS FOR THE TYPE YOU DESIRE + * UNTIL YOU HIT TYPE 0, THEN CACHE THE RESULTING POINTER. + * + * If you insist, at least check that the version matches KPMAP_VERSION. + */ +struct sys_kpmap { + ukpheader_t header[64]; + int32_t version; + int32_t upticks; + struct timespec ts_uptime; /* mono uptime @ticks (uncompensated) */ + struct timespec ts_realtime; /* realtime @ticks resolution */ + int64_t tsc_freq; /* (if supported by cpu) */ + int32_t tick_freq; /* scheduler tick frequency */ +}; + +#endif + +#ifdef _KERNEL +extern struct sys_kpmap *kpmap; +#endif + +#endif diff --git a/sys/vfs/procfs/procfs_status.c b/sys/vfs/procfs/procfs_status.c index b479a878e1..8b9ec6e799 100644 --- a/sys/vfs/procfs/procfs_status.c +++ b/sys/vfs/procfs/procfs_status.c @@ -201,12 +201,27 @@ procfs_docmdline(struct proc *curp, struct lwp *lp, struct pfsnode *pfs, * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ - - if (p->p_args && + if (p->p_upmap != NULL && p->p_upmap->proc_title[0] && (ps_argsopen || (CHECKIO(curp, p) && - (p->p_flags & P_INEXEC) == 0 && - !p_trespass(curp->p_ucred, p->p_ucred))) - ) { + (p->p_flags & P_INEXEC) == 0 && + !p_trespass(curp->p_ucred, p->p_ucred)) + )) { + /* + * Args set via writable user process mmap. + * We must calculate the string length manually + * because the user data can change at any time. + */ + bp = p->p_upmap->proc_title; + for (buflen = 0; buflen < UPMAP_MAXPROCTITLE - 1; ++buflen) { + if (bp[buflen] == 0) + break; + } + buf = NULL; + } else if (p->p_args && + (ps_argsopen || (CHECKIO(curp, p) && + (p->p_flags & P_INEXEC) == 0 && + !p_trespass(curp->p_ucred, p->p_ucred)) + )) { bp = p->p_args->ar_args; buflen = p->p_args->ar_length; buf = NULL; diff --git a/sys/vm/vm.h b/sys/vm/vm.h index f04b3e8642..a619775103 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -92,10 +92,17 @@ typedef u_char vm_prot_t; /* protection codes */ typedef u_char vm_maptype_t; /* type of vm_map_entry */ +/* + * NOTE: UKSMAPs are unmanaged. The underlying kernel memory must not be + * freed until all related mappings are gone. There is no object. + * The device can map different things for the same UKS mapping even + * when inherited via fork(). 
+ */ #define VM_MAPTYPE_UNSPECIFIED 0 #define VM_MAPTYPE_NORMAL 1 #define VM_MAPTYPE_VPAGETABLE 2 #define VM_MAPTYPE_SUBMAP 3 +#define VM_MAPTYPE_UKSMAP 4 /* user-kernel shared memory */ union vm_map_object; typedef union vm_map_object vm_map_object_t; diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 064afcd397..79b06a3da8 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,5 +1,37 @@ /* - * (MPSAFE) + * Copyright (c) 2003-2014 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * --- * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. @@ -36,8 +68,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 - * + * --- * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. @@ -63,9 +94,6 @@ * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. - * - * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $ - * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $ */ /* @@ -379,6 +407,7 @@ RetryFault: fs.lookup_still_valid = TRUE; fs.first_m = NULL; fs.object = fs.first_object; /* so unlock_and_deallocate works */ + fs.prot = fs.first_prot; /* default (used by uksmap) */ if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) { if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { @@ -393,6 +422,30 @@ RetryFault: } } + /* + * A user-kernel shared map has no VM object and bypasses + * everything. We execute the uksmap function with a temporary + * fictitious vm_page. The address is directly mapped with no + * management. 
+ */ + if (fs.entry->maptype == VM_MAPTYPE_UKSMAP) { + struct vm_page fakem; + + bzero(&fakem, sizeof(fakem)); + fakem.pindex = first_pindex; + fakem.flags = PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED; + fakem.valid = VM_PAGE_BITS_ALL; + fakem.pat_mode = VM_MEMATTR_DEFAULT; + if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) { + result = KERN_FAILURE; + unlock_things(&fs); + goto done2; + } + pmap_enter(fs.map->pmap, vaddr, &fakem, fs.prot | inherit_prot, + fs.wired, fs.entry); + goto done_success; + } + /* * A system map entry may return a NULL object. No object means * no pager means an unrecoverable kernel fault. @@ -528,9 +581,6 @@ RetryFault: vm_page_flag_set(fs.m, PG_REFERENCED); pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot | inherit_prot, fs.wired, fs.entry); - mycpu->gd_cnt.v_vm_faults++; - if (curthread->td_lwp) - ++curthread->td_lwp->lwp_ru.ru_minflt; /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */ KKASSERT(fs.m->flags & PG_BUSY); @@ -574,6 +624,11 @@ RetryFault: } } +done_success: + mycpu->gd_cnt.v_vm_faults++; + if (curthread->td_lwp) + ++curthread->td_lwp->lwp_ru.ru_minflt; + /* * Unlock everything, and return */ @@ -1984,9 +2039,21 @@ vm_fault_wire(vm_map_t map, vm_map_entry_t entry, pmap = vm_map_pmap(map); start = entry->start; end = entry->end; - fictitious = entry->object.vm_object && - ((entry->object.vm_object->type == OBJT_DEVICE) || - (entry->object.vm_object->type == OBJT_MGTDEVICE)); + switch(entry->maptype) { + case VM_MAPTYPE_NORMAL: + case VM_MAPTYPE_VPAGETABLE: + fictitious = entry->object.vm_object && + ((entry->object.vm_object->type == OBJT_DEVICE) || + (entry->object.vm_object->type == OBJT_MGTDEVICE)); + break; + case VM_MAPTYPE_UKSMAP: + fictitious = TRUE; + break; + default: + fictitious = FALSE; + break; + } + if (entry->eflags & MAP_ENTRY_KSTACK) start += PAGE_SIZE; map->timestamp++; @@ -2390,7 +2457,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, * We do not currently prefault mappings that use virtual page * tables. We do not prefault foreign pmaps. */ - if (entry->maptype == VM_MAPTYPE_VPAGETABLE) + if (entry->maptype != VM_MAPTYPE_NORMAL) return; lp = curthread->td_lwp; if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) @@ -2691,7 +2758,7 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra, * We do not currently prefault mappings that use virtual page * tables. We do not prefault foreign pmaps. 
*/ - if (entry->maptype == VM_MAPTYPE_VPAGETABLE) + if (entry->maptype != VM_MAPTYPE_NORMAL) return; lp = curthread->td_lwp; if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 92d4d0ac33..94a8dbbbe4 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -102,11 +102,11 @@ kmem_alloc_pageable(vm_map_t map, vm_size_t size) size = round_page(size); addr = vm_map_min(map); - result = vm_map_find(map, NULL, (vm_offset_t) 0, - &addr, size, PAGE_SIZE, + result = vm_map_find(map, NULL, NULL, + (vm_offset_t) 0, &addr, size, + PAGE_SIZE, TRUE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS) return (0); return (addr); @@ -125,11 +125,11 @@ kmem_alloc_nofault(vm_map_t map, vm_size_t size, vm_size_t align) size = round_page(size); addr = vm_map_min(map); - result = vm_map_find(map, NULL, (vm_offset_t) 0, - &addr, size, align, + result = vm_map_find(map, NULL, NULL, + (vm_offset_t) 0, &addr, size, + align, TRUE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - MAP_NOFAULT); + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS) return (0); return (addr); @@ -184,10 +184,10 @@ kmem_alloc3(vm_map_t map, vm_size_t size, int kmflags) vm_object_hold(&kernel_object); vm_object_reference_locked(&kernel_object); vm_map_insert(map, &count, - &kernel_object, addr, addr, addr + size, + &kernel_object, NULL, + addr, addr, addr + size, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - cow); + VM_PROT_ALL, VM_PROT_ALL, cow); vm_object_drop(&kernel_object); vm_map_unlock(map); @@ -273,11 +273,11 @@ kmem_suballoc(vm_map_t parent, vm_map_t result, size = round_page(size); *min = (vm_offset_t) vm_map_min(parent); - ret = vm_map_find(parent, NULL, (vm_offset_t) 0, - min, size, PAGE_SIZE, + ret = vm_map_find(parent, NULL, NULL, + (vm_offset_t) 0, min, size, + PAGE_SIZE, TRUE, VM_MAPTYPE_UNSPECIFIED, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (ret != KERN_SUCCESS) { kprintf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); @@ -325,8 +325,8 @@ kmem_alloc_wait(vm_map_t map, vm_size_t size) tsleep(map, 0, "kmaw", 0); } vm_map_insert(map, &count, - NULL, (vm_offset_t) 0, - addr, addr + size, + NULL, NULL, + (vm_offset_t) 0, addr, addr + size, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); @@ -356,7 +356,7 @@ kmem_alloc_attr(vm_map_t map, vm_size_t size, int flags, vm_paddr_t low, count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, - flags, &addr)) { + flags, &addr)) { vm_map_unlock(map); vm_map_entry_release(count); return (0); @@ -364,8 +364,11 @@ kmem_alloc_attr(vm_map_t map, vm_size_t size, int flags, vm_paddr_t low, offset = addr - vm_map_min(&kernel_map); vm_object_hold(&kernel_object); vm_object_reference_locked(&kernel_object); - vm_map_insert(map, &count, &kernel_object, offset, addr, addr + size, - VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_insert(map, &count, + &kernel_object, NULL, + offset, addr, addr + size, + VM_MAPTYPE_NORMAL, + VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); vm_map_entry_release(count); vm_object_drop(&kernel_object); @@ -431,28 +434,28 @@ kmem_init(void) addr = KvaStart; if (virtual2_start) { if (addr < virtual2_start) { - vm_map_insert(m, &count, NULL, (vm_offset_t) 0, - addr, virtual2_start, + vm_map_insert(m, &count, + NULL, NULL, + (vm_offset_t) 0, addr, virtual2_start, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, 
VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); } addr = virtual2_end; } if (addr < virtual_start) { - vm_map_insert(m, &count, NULL, (vm_offset_t) 0, - addr, virtual_start, + vm_map_insert(m, &count, + NULL, NULL, + (vm_offset_t) 0, addr, virtual_start, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); } addr = virtual_end; if (addr < KvaEnd) { - vm_map_insert(m, &count, NULL, (vm_offset_t) 0, - addr, KvaEnd, + vm_map_insert(m, &count, + NULL, NULL, + (vm_offset_t) 0, addr, KvaEnd, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); } /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 851a7b9b37..e4f6eef83d 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -965,18 +965,22 @@ vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) * making call to account for the new entry. */ int -vm_map_insert(vm_map_t map, int *countp, - vm_object_t object, vm_ooffset_t offset, - vm_offset_t start, vm_offset_t end, +vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux, + vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_maptype_t maptype, - vm_prot_t prot, vm_prot_t max, - int cow) + vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; int must_drop = 0; + vm_object_t object; + + if (maptype == VM_MAPTYPE_UKSMAP) + object = NULL; + else + object = map_object; ASSERT_VM_MAP_LOCKED(map); if (object) @@ -1048,6 +1052,7 @@ vm_map_insert(vm_map_t map, int *countp, (prev_entry->end == start) && (prev_entry->wired_count == 0) && prev_entry->maptype == maptype && + maptype == VM_MAPTYPE_NORMAL && ((prev_entry->object.vm_object == NULL) || vm_object_coalesce(prev_entry->object.vm_object, OFF_TO_IDX(prev_entry->offset), @@ -1101,9 +1106,10 @@ vm_map_insert(vm_map_t map, int *countp, new_entry->maptype = maptype; new_entry->eflags = protoeflags; - new_entry->object.vm_object = object; + new_entry->object.map_object = map_object; + new_entry->aux.master_pde = 0; /* in case size is different */ + new_entry->aux.map_aux = map_aux; new_entry->offset = offset; - new_entry->aux.master_pde = 0; new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; @@ -1145,7 +1151,8 @@ vm_map_insert(vm_map_t map, int *countp, * don't try. */ if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) && - maptype != VM_MAPTYPE_VPAGETABLE) { + maptype != VM_MAPTYPE_VPAGETABLE && + maptype != VM_MAPTYPE_UKSMAP) { int dorelock = 0; if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) { dorelock = 1; @@ -1306,17 +1313,24 @@ vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, * No requirements. This function will lock the map temporarily. 
*/ int -vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, - vm_offset_t *addr, vm_size_t length, vm_size_t align, +vm_map_find(vm_map_t map, void *map_object, void *map_aux, + vm_ooffset_t offset, vm_offset_t *addr, + vm_size_t length, vm_size_t align, boolean_t fitit, vm_maptype_t maptype, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; + vm_object_t object; int result; int count; + if (maptype == VM_MAPTYPE_UKSMAP) + object = NULL; + else + object = map_object; + start = *addr; count = vm_map_entry_reserve(MAP_RESERVE_COUNT); @@ -1333,11 +1347,9 @@ vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, } start = *addr; } - result = vm_map_insert(map, &count, object, offset, - start, start + length, - maptype, - prot, max, - cow); + result = vm_map_insert(map, &count, map_object, map_aux, + offset, start, start + length, + maptype, prot, max, cow); if (object) vm_object_drop(object); vm_map_unlock(map); @@ -1370,6 +1382,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp) if (entry->maptype == VM_MAPTYPE_SUBMAP) return; + if (entry->maptype == VM_MAPTYPE_UKSMAP) + return; prev = entry->prev; if (prev != &map->header) { @@ -2205,7 +2219,8 @@ vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, * management structures and the faulting in of the * page. */ - if (entry->maptype != VM_MAPTYPE_SUBMAP) { + if (entry->maptype == VM_MAPTYPE_NORMAL || + entry->maptype == VM_MAPTYPE_VPAGETABLE) { int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; if (copyflag && ((entry->protection & @@ -2402,7 +2417,8 @@ vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags) * do not have to do this for entries that point to sub * maps because we won't hold the lock on the sub map. */ - if (entry->maptype != VM_MAPTYPE_SUBMAP) { + if (entry->maptype == VM_MAPTYPE_NORMAL || + entry->maptype == VM_MAPTYPE_VPAGETABLE) { int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; if (copyflag && ((entry->protection & @@ -2612,7 +2628,10 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? 
end : current->end) - start; - if (current->maptype == VM_MAPTYPE_SUBMAP) { + + switch(current->maptype) { + case VM_MAPTYPE_SUBMAP: + { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; @@ -2626,8 +2645,15 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); - } else { + break; + } + case VM_MAPTYPE_NORMAL: + case VM_MAPTYPE_VPAGETABLE: object = current->object.vm_object; + break; + default: + object = NULL; + break; } if (object) @@ -2759,8 +2785,12 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp) switch(entry->maptype) { case VM_MAPTYPE_NORMAL: case VM_MAPTYPE_VPAGETABLE: + case VM_MAPTYPE_SUBMAP: vm_object_deallocate(entry->object.vm_object); break; + case VM_MAPTYPE_UKSMAP: + /* XXX TODO */ + break; default: break; } @@ -2847,7 +2877,17 @@ again: offidxstart = OFF_TO_IDX(entry->offset); count = OFF_TO_IDX(e - s); - object = entry->object.vm_object; + + switch(entry->maptype) { + case VM_MAPTYPE_NORMAL: + case VM_MAPTYPE_VPAGETABLE: + case VM_MAPTYPE_SUBMAP: + object = entry->object.vm_object; + break; + default: + object = NULL; + break; + } /* * Unwire before removing addresses from the pmap; otherwise, @@ -3260,9 +3300,11 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, { vm_object_t src_object; - if (dst_entry->maptype == VM_MAPTYPE_SUBMAP) + if (dst_entry->maptype == VM_MAPTYPE_SUBMAP || + dst_entry->maptype == VM_MAPTYPE_UKSMAP) return; - if (src_entry->maptype == VM_MAPTYPE_SUBMAP) + if (src_entry->maptype == VM_MAPTYPE_SUBMAP || + src_entry->maptype == VM_MAPTYPE_UKSMAP) return; if (src_entry->wired_count == 0) { @@ -3330,6 +3372,11 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, * The source map must not be locked. * No requirements. */ +static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map, + vm_map_entry_t old_entry, int *countp); +static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map, + vm_map_entry_t old_entry, int *countp); + struct vmspace * vmspace_fork(struct vmspace *vm1) { @@ -3337,8 +3384,6 @@ vmspace_fork(struct vmspace *vm1) vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; - vm_map_entry_t new_entry; - vm_object_t object; int count; lwkt_gettoken(&vm1->vm_map.token); @@ -3364,98 +3409,18 @@ vmspace_fork(struct vmspace *vm1) old_entry = old_map->header.next; while (old_entry != &old_map->header) { - if (old_entry->maptype == VM_MAPTYPE_SUBMAP) + switch(old_entry->maptype) { + case VM_MAPTYPE_SUBMAP: panic("vm_map_fork: encountered a submap"); - - switch (old_entry->inheritance) { - case VM_INHERIT_NONE: break; - case VM_INHERIT_SHARE: - /* - * Clone the entry, creating the shared object if - * necessary. - */ - if (old_entry->object.vm_object == NULL) - vm_map_entry_allocate_object(old_entry); - - if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { - /* - * Shadow a map_entry which needs a copy, - * replacing its object with a new object - * that points to the old one. Ask the - * shadow code to automatically add an - * additional ref. We can't do it afterwords - * because we might race a collapse. The call - * to vm_map_entry_shadow() will also clear - * OBJ_ONEMAPPING. - */ - vm_map_entry_shadow(old_entry, 1); - } else if (old_entry->object.vm_object) { - /* - * We will make a shared copy of the object, - * and must clear OBJ_ONEMAPPING. - * - * Optimize vnode objects. 
OBJ_ONEMAPPING - * is non-applicable but clear it anyway, - * and its terminal so we don'th ave to deal - * with chains. Reduces SMP conflicts. - * - * XXX assert that object.vm_object != NULL - * since we allocate it above. - */ - object = old_entry->object.vm_object; - if (object->type == OBJT_VNODE) { - vm_object_reference_quick(object); - vm_object_clear_flag(object, - OBJ_ONEMAPPING); - } else { - vm_object_hold(object); - vm_object_chain_wait(object, 0); - vm_object_reference_locked(object); - vm_object_clear_flag(object, - OBJ_ONEMAPPING); - vm_object_drop(object); - } - } - - /* - * Clone the entry. We've already bumped the ref on - * any vm_object. - */ - new_entry = vm_map_entry_create(new_map, &count); - *new_entry = *old_entry; - new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; - new_entry->wired_count = 0; - - /* - * Insert the entry into the new map -- we know we're - * inserting at the end of the new map. - */ - - vm_map_entry_link(new_map, new_map->header.prev, - new_entry); - - /* - * Update the physical map - */ - pmap_copy(new_map->pmap, old_map->pmap, - new_entry->start, - (old_entry->end - old_entry->start), - old_entry->start); + case VM_MAPTYPE_UKSMAP: + vmspace_fork_uksmap_entry(old_map, new_map, + old_entry, &count); break; - case VM_INHERIT_COPY: - /* - * Clone the entry and link into the map. - */ - new_entry = vm_map_entry_create(new_map, &count); - *new_entry = *old_entry; - new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; - new_entry->wired_count = 0; - new_entry->object.vm_object = NULL; - vm_map_entry_link(new_map, new_map->header.prev, - new_entry); - vm_map_copy_entry(old_map, new_map, old_entry, - new_entry); + case VM_MAPTYPE_NORMAL: + case VM_MAPTYPE_VPAGETABLE: + vmspace_fork_normal_entry(old_map, new_map, + old_entry, &count); break; } old_entry = old_entry->next; @@ -3472,6 +3437,126 @@ vmspace_fork(struct vmspace *vm1) return (vm2); } +static +void +vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map, + vm_map_entry_t old_entry, int *countp) +{ + vm_map_entry_t new_entry; + vm_object_t object; + + switch (old_entry->inheritance) { + case VM_INHERIT_NONE: + break; + case VM_INHERIT_SHARE: + /* + * Clone the entry, creating the shared object if + * necessary. + */ + if (old_entry->object.vm_object == NULL) + vm_map_entry_allocate_object(old_entry); + + if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { + /* + * Shadow a map_entry which needs a copy, + * replacing its object with a new object + * that points to the old one. Ask the + * shadow code to automatically add an + * additional ref. We can't do it afterwards + * because we might race a collapse. The call + * to vm_map_entry_shadow() will also clear + * OBJ_ONEMAPPING. + */ + vm_map_entry_shadow(old_entry, 1); + } else if (old_entry->object.vm_object) { + /* + * We will make a shared copy of the object, + * and must clear OBJ_ONEMAPPING. + * + * Optimize vnode objects. OBJ_ONEMAPPING + * is non-applicable but clear it anyway, + * and it's terminal so we don't have to deal + * with chains. Reduces SMP conflicts. + * + * XXX assert that object.vm_object != NULL + * since we allocate it above. + */ + object = old_entry->object.vm_object; + if (object->type == OBJT_VNODE) { + vm_object_reference_quick(object); + vm_object_clear_flag(object, + OBJ_ONEMAPPING); + } else { + vm_object_hold(object); + vm_object_chain_wait(object, 0); + vm_object_reference_locked(object); + vm_object_clear_flag(object, + OBJ_ONEMAPPING); + vm_object_drop(object); + } + } + + /* + * Clone the entry. 
We've already bumped the ref on + * any vm_object. + */ + new_entry = vm_map_entry_create(new_map, countp); + *new_entry = *old_entry; + new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; + new_entry->wired_count = 0; + + /* + * Insert the entry into the new map -- we know we're + * inserting at the end of the new map. + */ + + vm_map_entry_link(new_map, new_map->header.prev, + new_entry); + + /* + * Update the physical map + */ + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + break; + case VM_INHERIT_COPY: + /* + * Clone the entry and link into the map. + */ + new_entry = vm_map_entry_create(new_map, countp); + *new_entry = *old_entry; + new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; + new_entry->wired_count = 0; + new_entry->object.vm_object = NULL; + vm_map_entry_link(new_map, new_map->header.prev, + new_entry); + vm_map_copy_entry(old_map, new_map, old_entry, + new_entry); + break; + } +} + +/* + * When forking user-kernel shared maps, the map might change in the + * child so do not try to copy the underlying pmap entries. + */ +static +void +vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map, + vm_map_entry_t old_entry, int *countp) +{ + vm_map_entry_t new_entry; + + new_entry = vm_map_entry_create(new_map, countp); + *new_entry = *old_entry; + new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; + new_entry->wired_count = 0; + vm_map_entry_link(new_map, new_map->header.prev, + new_entry); +} + /* * Create an auto-grow stack entry * @@ -3555,12 +3640,11 @@ vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, * eliminate these as input parameters, and just * pass these values here in the insert call. */ - rv = vm_map_insert(map, &count, - NULL, 0, addrbos + max_ssize - init_ssize, + rv = vm_map_insert(map, &count, NULL, NULL, + 0, addrbos + max_ssize - init_ssize, addrbos + max_ssize, VM_MAPTYPE_NORMAL, - prot, max, - cow); + prot, max, cow); /* Now set the avail_ssize amount */ if (rv == KERN_SUCCESS) { @@ -3710,11 +3794,10 @@ Retry: addr = end; } - rv = vm_map_insert(map, &count, - NULL, 0, addr, stack_entry->start, + rv = vm_map_insert(map, &count, NULL, NULL, + 0, addr, stack_entry->start, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { @@ -3992,6 +4075,15 @@ RetryLookup: fault_type |= VM_PROT_WRITE; } + /* + * Only NORMAL and VPAGETABLE maps are object-based. UKSMAPs are not. + */ + if (entry->maptype != VM_MAPTYPE_NORMAL && + entry->maptype != VM_MAPTYPE_VPAGETABLE) { + *object = NULL; + goto skip; + } + /* * If the entry was copy-on-write, we either ... */ @@ -4047,9 +4139,10 @@ RetryLookup: * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ + *object = entry->object.vm_object; +skip: *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); - *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. On @@ -4129,7 +4222,8 @@ DB_SHOW_COMMAND(map, vm_map_print) if (entry->wired_count != 0) db_printf(", wired"); } - if (entry->maptype == VM_MAPTYPE_SUBMAP) { + switch(entry->maptype) { + case VM_MAPTYPE_SUBMAP: /* XXX no %qd in kernel. Truncate entry->offset. 
*/ db_printf(", share=%p, offset=0x%lx\n", (void *)entry->object.sub_map, @@ -4144,7 +4238,9 @@ DB_SHOW_COMMAND(map, vm_map_print) full, 0, NULL); db_indent -= 2; } - } else { + break; + case VM_MAPTYPE_NORMAL: + case VM_MAPTYPE_VPAGETABLE: /* XXX no %qd in kernel. Truncate entry->offset. */ db_printf(", object=%p, offset=0x%lx", (void *)entry->object.vm_object, @@ -4165,6 +4261,19 @@ DB_SHOW_COMMAND(map, vm_map_print) nlines += 4; db_indent -= 2; } + break; + case VM_MAPTYPE_UKSMAP: + db_printf(", uksmap=%p, offset=0x%lx", + (void *)entry->object.uksmap, + (long)entry->offset); + if (entry->eflags & MAP_ENTRY_COW) + db_printf(", copy (%s)", + (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); + db_printf("\n"); + nlines++; + break; + default: + break; } } db_indent -= 2; diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index b85c3ae629..3dae2982d0 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -118,18 +118,21 @@ typedef u_int vm_flags_t; typedef u_int vm_eflags_t; /* - * Objects which live in maps may be either VM objects, or - * another map (called a "sharing map") which denotes read-write - * sharing with other maps. + * A vm_map_entry may reference an object, a submap, a uksmap, or a + * direct user-kernel shared map. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *sub_map; /* belongs to another map */ + int (*uksmap)(cdev_t dev, vm_page_t fake); + void *map_object; /* generic */ }; union vm_map_aux { vm_offset_t avail_ssize; /* amt can grow if this is a stack */ vpte_t master_pde; /* virtual page table root */ + struct cdev *dev; + void *map_aux; }; /* @@ -534,21 +537,20 @@ void vm_map_entry_release(int); void vm_map_entry_krelease(int); vm_map_t vm_map_create (vm_map_t, struct pmap *, vm_offset_t, vm_offset_t); int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t, int *); -int vm_map_find (vm_map_t, vm_object_t, vm_ooffset_t, - vm_offset_t *, vm_size_t, vm_size_t, +int vm_map_find (vm_map_t, void *, void *, + vm_ooffset_t, vm_offset_t *, vm_size_t, + vm_size_t, boolean_t, vm_maptype_t, - vm_prot_t, vm_prot_t, - int); + vm_prot_t, vm_prot_t, int); int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_size_t, int, vm_offset_t *); vm_offset_t vm_map_hint(struct proc *, vm_offset_t, vm_prot_t); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t, pmap_t); -int vm_map_insert (vm_map_t, int *, vm_object_t, vm_ooffset_t, - vm_offset_t, vm_offset_t, +int vm_map_insert (vm_map_t, int *, void *, void *, + vm_ooffset_t, vm_offset_t, vm_offset_t, vm_maptype_t, - vm_prot_t, vm_prot_t, - int); + vm_prot_t, vm_prot_t, int); int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); void vm_map_lookup_done (vm_map_t, vm_map_entry_t, int); diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index ff56bf1628..29d0ed7369 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1188,6 +1188,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_offset_t eaddr; vm_size_t esize; vm_size_t align; + int (*uksmap)(cdev_t dev, vm_page_t fake); struct vnode *vp; struct thread *td = curthread; struct proc *p; @@ -1276,6 +1277,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_map_remove(map, *addr, *addr + size); } + uksmap = NULL; + /* * Lookup/allocate object. 
*/ @@ -1306,7 +1309,32 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vp = NULL; } else { vp = (struct vnode *)handle; + + /* + * Non-anonymous mappings of VCHR (aka not /dev/zero) + * cannot specify MAP_STACK or MAP_VPAGETABLE. + */ if (vp->v_type == VCHR) { + if (flags & (MAP_STACK | MAP_VPAGETABLE)) { + lwkt_reltoken(&map->token); + return(EINVAL); + } + } + + if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) { + /* + * Device mappings without a VM object, typically + * sharing permanently allocated kernel memory or + * process-context-specific (per-process) data. + * + * Force them to be shared. + */ + uksmap = vp->v_rdev->si_ops->d_uksmap; + object = NULL; + docow = MAP_PREFAULT_PARTIAL; + flags &= ~(MAP_PRIVATE|MAP_COPY); + flags |= MAP_SHARED; + } else if (vp->v_type == VCHR) { /* * Device mappings (device size unknown?). * Force them to be shared. @@ -1332,7 +1360,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, } else { /* * Regular file mapping (typically). The attribute - * check is for the link count test only. Mmapble + * check is for the link count test only. mmapable * vnodes must already have a VM object assigned. */ struct vattr vat; @@ -1383,6 +1411,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, /* * This may place the area in its own page directory if (size) is * large enough, otherwise it typically returns its argument. + * + * (object can be NULL) */ if (fitit) { *addr = pmap_addr_hint(object, *addr, size); @@ -1394,15 +1424,25 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, * Mappings that use virtual page tables will default to storing * the page table at offset 0. */ - if (flags & MAP_STACK) { + if (uksmap) { + rv = vm_map_find(map, uksmap, vp->v_rdev, + foff, addr, size, + align, + fitit, VM_MAPTYPE_UKSMAP, + prot, maxprot, docow); + } else if (flags & MAP_STACK) { rv = vm_map_stack(map, *addr, size, flags, prot, maxprot, docow); } else if (flags & MAP_VPAGETABLE) { - rv = vm_map_find(map, object, foff, addr, size, align, + rv = vm_map_find(map, object, NULL, + foff, addr, size, + align, fitit, VM_MAPTYPE_VPAGETABLE, prot, maxprot, docow); } else { - rv = vm_map_find(map, object, foff, addr, size, align, + rv = vm_map_find(map, object, NULL, + foff, addr, size, + align, fitit, VM_MAPTYPE_NORMAL, prot, maxprot, docow); } @@ -1412,6 +1452,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. + * + * (NOTE: object can be NULL) */ vm_object_deallocate(object); goto out; diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c index df7e934d35..a24e707d32 100644 --- a/sys/vm/vm_unix.c +++ b/sys/vm/vm_unix.c @@ -112,11 +112,11 @@ sys_obreak(struct obreak_args *uap) error = ENOMEM; goto done; } - rv = vm_map_find(&vm->vm_map, NULL, 0, &old, - diff, PAGE_SIZE, + rv = vm_map_find(&vm->vm_map, NULL, NULL, + 0, &old, diff, + PAGE_SIZE, FALSE, VM_MAPTYPE_NORMAL, - VM_PROT_ALL, VM_PROT_ALL, - 0); + VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { error = ENOMEM; goto done; -- 2.41.0