From 5c5185ae1c44d1e9a2eb9d6d55dd551a57aa9ef1 Mon Sep 17 00:00:00 2001
From: "Samuel J. Greear"
Date: Tue, 9 Mar 2010 02:24:55 -0700
Subject: [PATCH] kernel - Introduce lightweight buffers

* Summary: The lightweight buffer (lwbuf) subsystem is effectively a
  reimplementation of the sfbuf (sendfile buffer) subsystem. It is
  designed to be lighter weight than sfbuf where possible; on x86_64 we
  use the DMAP and the implementation is -very- simple. It is also
  designed to be more SMP friendly.

* Replace all consumption of sfbuf with lwbuf

* Refactor sfbuf to act as an external refcount mechanism for sendfile(2);
  this will probably go away eventually as well.
---
 share/man/man9/Makefile                   |   1 +
 share/man/man9/lwbuf.9                    | 112 ++++++++++++
 sys/boot/common/help.common               |   5 -
 sys/boot/common/loader.8                  |   6 -
 sys/boot/forth/loader-bootp.conf          |   1 -
 sys/boot/forth/loader.conf                |   1 -
 sys/cpu/i386/include/lwbuf.h              |  85 +++++++++
 sys/cpu/i386/misc/lwbuf.c                 | 203 +++++++++++++++++++++
 sys/cpu/x86_64/include/lwbuf.h            |  80 +++++++++
 sys/cpu/x86_64/misc/lwbuf.c               |  97 ++++++++++
 sys/kern/imgact_elf.c                     |  21 +--
 sys/kern/kern_exec.c                      |  17 +-
 sys/kern/kern_sfbuf.c                     | 209 ++++++----------------
 sys/kern/kern_subr.c                      |  15 +-
 sys/kern/kern_umtx.c                      |  13 +-
 sys/kern/kern_xio.c                       |  35 ++--
 sys/kern/subr_param.c                     |  11 +-
 sys/kern/uipc_syscalls.c                  |  60 ++-----
 sys/platform/pc32/conf/files              |   1 +
 sys/platform/pc32/i386/genassym.c         |   1 +
 sys/platform/pc32/i386/globals.s          |   3 +
 sys/platform/pc32/include/globaldata.h    |   4 +
 sys/platform/pc64/conf/files              |   1 +
 sys/platform/vkernel/conf/files           |   1 +
 sys/platform/vkernel/i386/genassym.c      |   1 +
 sys/platform/vkernel/i386/global.s        |   3 +
 sys/platform/vkernel/include/globaldata.h |   4 +
 sys/platform/vkernel/platform/copyio.c    |  18 +-
 sys/sys/exec.h                            |   6 +-
 sys/sys/imgact.h                          |   2 +-
 sys/sys/sfbuf.h                           |  40 ++---
 sys/vfs/tmpfs/tmpfs_vnops.c               |   1 -
 sys/vm/vm_fault.c                         |  11 +-
 sys/vm/vnode_pager.c                      |  11 +-
 34 files changed, 751 insertions(+), 329 deletions(-)
 create mode 100644 share/man/man9/lwbuf.9
 create mode 100644 sys/cpu/i386/include/lwbuf.h
 create mode 100644 sys/cpu/i386/misc/lwbuf.c
 create mode 100644 sys/cpu/x86_64/include/lwbuf.h
 create mode 100644 sys/cpu/x86_64/misc/lwbuf.c

diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index 9ef34ea147..26a48955ec 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -100,6 +100,7 @@ MAN=	accept_filter.9 \
 	ksignal.9 \
 	ktr.9 \
 	lock.9 \
+	lwbuf.9 \
 	make_autoclone_dev.9 \
 	make_dev.9 \
 	mbuf.9 \
diff --git a/share/man/man9/lwbuf.9 b/share/man/man9/lwbuf.9
new file mode 100644
index 0000000000..5503a46e37
--- /dev/null
+++ b/share/man/man9/lwbuf.9
@@ -0,0 +1,112 @@
+.\" Copyright (c) 2010 by The DragonFly Project and Samuel J. Greear.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The DragonFly Project
+.\" by Samuel J. Greear
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in
+.\"    the documentation and/or other materials provided with the
+.\"    distribution.
+.\" 3. Neither the name of The DragonFly Project nor the names of its
+.\"    contributors may be used to endorse or promote products derived
+.\"    from this software without specific, prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+.\" FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+.\" COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+.\" INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+.\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd March 17, 2010
+.Dt LWBUF 9
+.Os
+.Sh NAME
+.Nm lwbuf_alloc ,
+.Nm lwbuf_free ,
+.Nm lwbuf_page ,
+.Nm lwbuf_kva ,
+.Nm lwbuf_set_global
+.Nd "lightweight buffers"
+.Sh SYNOPSIS
+.In cpu/lwbuf.h
+.Ft "struct lwbuf *"
+.Fn lwbuf_alloc "vm_page_t m"
+.Ft void
+.Fn lwbuf_free "struct lwbuf *lwb"
+.Ft vm_page_t
+.Fn lwbuf_page "struct lwbuf *lwb"
+.Ft vm_offset_t
+.Fn lwbuf_kva "struct lwbuf *lwb"
+.Ft void
+.Fn lwbuf_set_global "struct lwbuf *lwb"
+.Sh DESCRIPTION
+The
+.Nm lwbuf
+kernel functions are used to maintain a lightweight reference to and
+access an arbitrary
+.Vt vm_page_t .
+.Pp
+.Fn lwbuf_alloc
+returns a pointer to a lightweight buffer representing
+.Fa m .
+.Pp
+.Fn lwbuf_free
+frees all resources associated with the lightweight buffer
+.Fa lwb .
+.Pp
+.Fn lwbuf_page
+and
+.Fn lwbuf_kva
+return the associated
+.Vt vm_page_t
+or
+.Vt vm_offset_t
+of the lightweight buffer
+.Fa lwb .
+.Pp
+.Fn lwbuf_set_global
+ensures that a
+.Vt vm_offset_t
+previously obtained through
+.Fn lwbuf_kva
+will be valid on all processors without subsequent calls to
+.Fn lwbuf_kva .
+It should not be used.
+.Sh IMPLEMENTATION NOTES
+The implementation of
+.Nm lwbuf
+is CPU-dependent.
+On i386, pages taken from per-processor pools of kernel virtual address
+space (KVA) are used to map arbitrary
+.Vt vm_page_t
+objects.
+On x86_64 no such tricks are needed, because the kernel maintains a
+direct map (DMAP) of all physical memory in KVA.
+.Pp
+Lightweight buffers are thread and cross-processor safe, with a number
+of limitations.
+Allocated buffers are not internally cached or reference counted.
+Any consumer of lightweight buffers may elect to share allocated buffers
+or allow them to be used in other threads or on other processors, but
+care must be taken.
+Buffers must be externally reference counted or in some other manner
+freed only after last use.
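+.Sh EXAMPLES
+A minimal sketch of typical use, mapping a held page
+.Fa m
+and copying its contents out to a user address
+.Fa uaddr
+(a hypothetical destination; error handling omitted):
+.Bd -literal -offset indent
+struct lwbuf *lwb;
+int error;
+
+lwb = lwbuf_alloc(m);			/* map the held page */
+error = copyout((caddr_t)lwbuf_kva(lwb),/* use the mapping */
+    (caddr_t)uaddr, PAGE_SIZE);
+lwbuf_free(lwb);			/* release the mapping */
+.Ed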
+.Sh HISTORY
+A
+.Nm lwbuf
+implementation first appeared in
+.Dx 2.5 .
+.Sh AUTHORS
+The
+.Nm lwbuf
+implementation and this manual page were written by
+.An Samuel J. Greear .
diff --git a/sys/boot/common/help.common b/sys/boot/common/help.common
index b87de33f5d..3a8ce900ae 100644
--- a/sys/boot/common/help.common
+++ b/sys/boot/common/help.common
@@ -240,11 +240,6 @@
 	cannot be set below the default determined when the kernel
 	was compiled.
 
-	set kern.ipc.nsfbufs=<value>	NSFBUFS
-
-	Set the number of sendfile buffers to be allocated.  This
-	overrides the value determined when the kernel was compiled.
-
 	set kern.vm.kmem.size=<value>
 
 	Sets the size of kernel memory (bytes).
This overrides diff --git a/sys/boot/common/loader.8 b/sys/boot/common/loader.8 index 3ef914a5b5..ead2046b3b 100644 --- a/sys/boot/common/loader.8 +++ b/sys/boot/common/loader.8 @@ -488,12 +488,6 @@ The value cannot be set below the default determined when the kernel was compiled. Modifies .Va NMBCLUSTERS . -.It Va kern.ipc.nsfbufs -Set the number of -.Xr sendfile 2 -buffers to be allocated. -Overrides -.Dv NSFBUFS . .It Va kern.mmxopt Toggles the mmx optimizations for the bcopy/copyin/copyout routines .It Va kern.vm.kmem.size diff --git a/sys/boot/forth/loader-bootp.conf b/sys/boot/forth/loader-bootp.conf index 42cdeb6c3c..35b470c1ae 100644 --- a/sys/boot/forth/loader-bootp.conf +++ b/sys/boot/forth/loader-bootp.conf @@ -97,7 +97,6 @@ module_path="/boot;/boot/modules;/;/modules" # Set the module search path #kern.ipc.maxsockets="" # Set the maximum number of sockets avaliable #kern.ipc.nmbclusters="" # Set the number of mbuf clusters #kern.ipc.nmbufs="" # Set the maximum number of mbufs -#kern.ipc.nsfbufs="" # Set the number of sendfile(2) bufs #kern.vm.kmem.size="" # Sets the size of kernel memory (bytes) #net.inet.tcp.tcbhashsize="" # Set the value of TCBHASHSIZE #vfs.root.mountfrom="" # Specify root partition in a way the diff --git a/sys/boot/forth/loader.conf b/sys/boot/forth/loader.conf index ace3780a0c..3e8a2fd82f 100644 --- a/sys/boot/forth/loader.conf +++ b/sys/boot/forth/loader.conf @@ -99,7 +99,6 @@ module_path="/boot;/boot/modules;/;/modules" # Set the module search path #kern.ipc.maxsockets="" # Set the maximum number of sockets avaliable #kern.ipc.nmbclusters="" # Set the number of mbuf clusters #kern.ipc.nmbufs="" # Set the maximum number of mbufs -#kern.ipc.nsfbufs="" # Set the number of sendfile(2) bufs #kern.vm.kmem.size="" # Sets the size of kernel memory (bytes) #net.inet.tcp.tcbhashsize="" # Set the value of TCBHASHSIZE #vfs.root.mountfrom="" # Specify root partition in a way the diff --git a/sys/cpu/i386/include/lwbuf.h b/sys/cpu/i386/include/lwbuf.h new file mode 100644 index 0000000000..022cdb9077 --- /dev/null +++ b/sys/cpu/i386/include/lwbuf.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 by The DragonFly Project and Samuel J. Greear. + * All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Samuel J. Greear + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _CPU_LWBUF_H_
+#define _CPU_LWBUF_H_
+
+#ifndef _SYS_TYPES_H_
+#include <sys/types.h>
+#endif
+#ifndef _SYS_QUEUE_H_
+#include <sys/queue.h>
+#endif
+#ifndef _SYS_GLOBALDATA_H_
+#include <sys/globaldata.h>
+#endif
+#ifndef _VM_PMAP_H_
+#include <vm/pmap.h>
+#endif
+#ifndef _MACHINE_ATOMIC_H_
+#include <machine/atomic.h>
+#endif
+
+#if !defined(_KERNEL) && !defined(_KERNEL_STRUCTURES)
+#error "This file should not be included by userland programs."
+#endif
+
+struct lwbuf {
+	vm_page_t	m;		/* currently mapped page */
+	vm_offset_t	kva;		/* va of mapping */
+	cpumask_t	cpumask;	/* cpu mapping synchronization */
+};
+
+struct lwbuf_free_kvp {
+	vm_offset_t	kva;
+	SLIST_ENTRY(lwbuf_free_kvp) next;
+};
+SLIST_HEAD(lwbuf_free_kvp_list, lwbuf_free_kvp);
+
+static __inline vm_page_t
+lwbuf_page(struct lwbuf *lwb)
+{
+	return (lwb->m);
+}
+
+#if defined(_KERNEL)
+
+struct lwbuf *lwbuf_alloc(vm_page_t);
+void lwbuf_free(struct lwbuf *);
+vm_offset_t lwbuf_kva(struct lwbuf *lwb);
+void lwbuf_set_global(struct lwbuf *);
+
+#endif
+
+#endif /* !_CPU_LWBUF_H_ */
diff --git a/sys/cpu/i386/misc/lwbuf.c b/sys/cpu/i386/misc/lwbuf.c
new file mode 100644
index 0000000000..a584b322d1
--- /dev/null
+++ b/sys/cpu/i386/misc/lwbuf.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2010 by The DragonFly Project and Samuel J. Greear.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Samuel J. Greear
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void lwbuf_init(void *); +SYSINIT(sock_lwb, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lwbuf_init, NULL); + +/* Number of pages of KVA to allocate at boot per cpu (1MB) */ +#define LWBUF_BOOT_PAGES 256 +/* Number to allocate incrementally (128KB) */ +#define LWBUF_ALLOC_PAGES 32 + +static struct objcache *lwbuf_cache; + +MALLOC_DEFINE(M_LWBUF, "lwbuf", "Lightweight buffers"); +struct objcache_malloc_args lwbuf_malloc_args = { sizeof(struct lwbuf), M_LWBUF }; + + +static boolean_t +lwbuf_cache_ctor(void *obj, void *pdata, int ocflags) +{ + struct lwbuf *lwb = (struct lwbuf *)obj; + + lwb->m = NULL; + lwb->kva = 0; + lwb->cpumask = 0; + + return (TRUE); +} + +static boolean_t +lwbuf_initpages(struct lwbuf_free_kvp_list *fkvpl, int pages) +{ + struct lwbuf_free_kvp *free_kvp; + vm_offset_t k; + int i; + + k = kmem_alloc_nofault(&kernel_map, PAGE_SIZE * pages, PAGE_SIZE); + if (k == 0) + return (FALSE); + + for (i = 0; i < pages; ++i) { + free_kvp = (struct lwbuf_free_kvp *) + kmalloc(sizeof(*free_kvp), M_LWBUF, M_WAITOK | M_ZERO); + + free_kvp->kva = k + (i * PAGE_SIZE); + SLIST_INSERT_HEAD(fkvpl, free_kvp, next); + } + + return (TRUE); +} + +static void +lwbuf_init(void *arg) +{ + int i; + + lwbuf_cache = objcache_create("lwbuf", 0, 0, + lwbuf_cache_ctor, NULL, NULL, + objcache_malloc_alloc, objcache_malloc_free, + &lwbuf_malloc_args); + + /* Should probably be in cpu_gdinit */ + for (i = 0; i < SMP_MAXCPU; ++i) { + SLIST_INIT(&mdcpu->gd_lwbuf_fpages); + lwbuf_initpages(&mdcpu->gd_lwbuf_fpages, LWBUF_BOOT_PAGES); + } +} + +struct lwbuf * +lwbuf_alloc(vm_page_t m) +{ + struct mdglobaldata *gd = mdcpu; + struct lwbuf_free_kvp *free_kvp; + struct lwbuf *lwb; + + if ((lwb = objcache_get(lwbuf_cache, M_WAITOK)) == NULL) + return (NULL); + + lwb->m = m; + +check_slist: + if (!SLIST_EMPTY(&gd->gd_lwbuf_fpages)) { + free_kvp = SLIST_FIRST(&gd->gd_lwbuf_fpages); + SLIST_REMOVE_HEAD(&gd->gd_lwbuf_fpages, next); + + lwb->kva = free_kvp->kva; + + kfree(free_kvp, M_LWBUF); + } else { + if (lwbuf_initpages(&gd->gd_lwbuf_fpages, + LWBUF_ALLOC_PAGES) == FALSE) + tsleep(&gd->gd_lwbuf_fpages, 0, "lwbuf", hz); + + goto check_slist; + } + + pmap_kenter_quick(lwb->kva, lwb->m->phys_addr); + lwb->cpumask |= gd->mi.gd_cpumask; + + return (lwb); +} + +void +lwbuf_free(struct lwbuf *lwb) +{ + struct lwbuf_free_kvp *free_kvp; + + free_kvp = (struct lwbuf_free_kvp *) + kmalloc(sizeof(*free_kvp), M_LWBUF, M_WAITOK); + free_kvp->kva = lwb->kva; + SLIST_INSERT_HEAD(&mdcpu->gd_lwbuf_fpages, free_kvp, next); + wakeup_one(&mdcpu->gd_lwbuf_fpages); + + lwb->m = NULL; + lwb->kva = 0; + lwb->cpumask = 0; + + objcache_put(lwbuf_cache, lwb); +} + +void +lwbuf_set_global(struct lwbuf *lwb) +{ + pmap_kenter_sync(lwb->kva); + lwb->cpumask = (cpumask_t)-1; +} + +static vm_offset_t +_lwbuf_kva(struct lwbuf *lwb, struct mdglobaldata *gd) +{ + cpumask_t old, new; + + pmap_kenter_sync_quick(lwb->kva); + + do { + old = lwb->cpumask; + new = old | gd->mi.gd_cpumask; + } while (atomic_cmpset_int(&lwb->cpumask, old, new) == 0); + + return (lwb->kva); +} + +__inline vm_offset_t +lwbuf_kva(struct lwbuf *lwb) +{ + struct mdglobaldata *gd = mdcpu; + + if (lwb->cpumask & gd->mi.gd_cpumask) + return (lwb->kva); + + return (_lwbuf_kva(lwb, gd)); +} diff --git a/sys/cpu/x86_64/include/lwbuf.h b/sys/cpu/x86_64/include/lwbuf.h new file mode 100644 index 
0000000000..fbbecd09fd
--- /dev/null
+++ b/sys/cpu/x86_64/include/lwbuf.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2010 by The DragonFly Project and Samuel J. Greear.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Samuel J. Greear
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _CPU_LWBUF_H_
+#define _CPU_LWBUF_H_
+
+#ifndef _SYS_TYPES_H_
+#include <sys/types.h>
+#endif
+#ifndef _SYS_GLOBALDATA_H_
+#include <sys/globaldata.h>
+#endif
+#ifndef _VM_PMAP_H_
+#include <vm/pmap.h>
+#endif
+#ifndef _MACHINE_ATOMIC_H_
+#include <machine/atomic.h>
+#endif
+
+#if !defined(_KERNEL) && !defined(_KERNEL_STRUCTURES)
+#error "This file should not be included by userland programs."
+#endif
+
+struct lwbuf {
+	vm_page_t	m;	/* currently mapped page */
+	vm_offset_t	kva;	/* va of mapping */
+};
+
+static __inline vm_page_t
+lwbuf_page(struct lwbuf *lwb)
+{
+	return (lwb->m);
+}
+
+static __inline vm_offset_t
+lwbuf_kva(struct lwbuf *lwb)
+{
+	return (lwb->kva);
+}
+
+#define lwbuf_set_global(lwb)
+
+#if defined(_KERNEL)
+
+struct lwbuf *lwbuf_alloc(vm_page_t);
+void lwbuf_free(struct lwbuf *);
+
+#endif
+
+#endif /* !_CPU_LWBUF_H_ */
diff --git a/sys/cpu/x86_64/misc/lwbuf.c b/sys/cpu/x86_64/misc/lwbuf.c
new file mode 100644
index 0000000000..e4326f4e6d
--- /dev/null
+++ b/sys/cpu/x86_64/misc/lwbuf.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2010 by The DragonFly Project and Samuel J. Greear.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Samuel J. Greear
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3.
Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void lwbuf_init(void *); +SYSINIT(sock_lwb, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lwbuf_init, NULL); + +static struct objcache *lwbuf_cache; + +MALLOC_DEFINE(M_LWBUF, "lwbuf", "Lightweight buffers"); +struct objcache_malloc_args lwbuf_malloc_args = { sizeof(struct lwbuf), M_LWBUF }; + + +static boolean_t +lwbuf_cache_ctor(void *obj, void *pdata, int ocflags) +{ + struct lwbuf *lwb = (struct lwbuf *)obj; + + lwb->m = NULL; + lwb->kva = 0; + + return (TRUE); +} + +static void +lwbuf_init(void *arg) +{ + lwbuf_cache = objcache_create("lwbuf", 0, 0, + lwbuf_cache_ctor, NULL, NULL, + objcache_malloc_alloc, objcache_malloc_free, + &lwbuf_malloc_args); +} + +struct lwbuf * +lwbuf_alloc(vm_page_t m) +{ + struct lwbuf *lwb; + + if ((lwb = objcache_get(lwbuf_cache, M_WAITOK)) == NULL) + return (NULL); + + lwb->m = m; + lwb->kva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(lwb->m)); + + return (lwb); +} + +void +lwbuf_free(struct lwbuf *lwb) +{ + lwb->m = NULL; + + objcache_put(lwbuf_cache, lwb); +} diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 77a3bf4021..c51f0346d0 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -51,9 +51,10 @@ #include #include #include -#include #include +#include + #include #include #include @@ -255,7 +256,7 @@ elf_check_abi_note(struct image_params *imgp, const Elf_Phdr *ph) { Elf_Brandinfo *match = NULL; const Elf_Note *tmp_note; - struct sf_buf *sfb; + struct lwbuf *lwb; const char *page; char *data = NULL; Elf_Off off; @@ -272,7 +273,7 @@ elf_check_abi_note(struct image_params *imgp, const Elf_Phdr *ph) if (len < sizeof(Elf_Note) || len > PAGE_SIZE) return NULL; /* ENOEXEC? 
*/ - if (exec_map_page(imgp, off >> PAGE_SHIFT, &sfb, &page)) + if (exec_map_page(imgp, off >> PAGE_SHIFT, &lwb, &page)) return NULL; /* @@ -283,8 +284,8 @@ elf_check_abi_note(struct image_params *imgp, const Elf_Phdr *ph) bcopy(page + firstoff, data, firstlen); - exec_unmap_page(sfb); - if (exec_map_page(imgp, (off >> PAGE_SHIFT) + 1, &sfb, &page)) { + exec_unmap_page(lwb); + if (exec_map_page(imgp, (off >> PAGE_SHIFT) + 1, &lwb, &page)) { kfree(data, M_TEMP); return NULL; } @@ -326,7 +327,7 @@ next: if (data != NULL) kfree(data, M_TEMP); - exec_unmap_page(sfb); + exec_unmap_page(lwb); return (match); } @@ -435,15 +436,15 @@ elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, if (copy_len != 0) { vm_page_t m; - struct sf_buf *sf; + struct lwbuf *lwb; m = vm_fault_object_page(object, trunc_page(offset + filsz), VM_PROT_READ, 0, &error); if (m) { - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - error = copyout((caddr_t)sf_buf_kva(sf), + lwb = lwbuf_alloc(m); + error = copyout((caddr_t)lwbuf_kva(lwb), (caddr_t)map_addr, copy_len); - sf_buf_free(sf); + lwbuf_free(lwb); vm_page_unhold(m); } if (error) { diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 81aed747ae..b3d847c08e 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -56,6 +55,8 @@ #include #include +#include + #include #include #include @@ -583,7 +584,7 @@ sys_execve(struct execve_args *uap) int exec_map_page(struct image_params *imgp, vm_pindex_t pageno, - struct sf_buf **psfb, const char **pdata) + struct lwbuf **plwb, const char **pdata) { int rv; vm_page_t ma; @@ -632,8 +633,8 @@ exec_map_page(struct image_params *imgp, vm_pindex_t pageno, vm_page_wakeup(m); /* unbusy the page */ crit_exit(); - *psfb = sf_buf_alloc(m, SFB_CPUPRIVATE); - *pdata = (void *)sf_buf_kva(*psfb); + *plwb = lwbuf_alloc(m); + *pdata = (void *)lwbuf_kva(*plwb); return (0); } @@ -655,14 +656,14 @@ exec_map_first_page(struct image_params *imgp) } void -exec_unmap_page(struct sf_buf *sfb) +exec_unmap_page(struct lwbuf *lwb) { vm_page_t m; crit_enter(); - if (sfb != NULL) { - m = sf_buf_page(sfb); - sf_buf_free(sfb); + if (lwb != NULL) { + m = lwbuf_page(lwb); + lwbuf_free(lwb); vm_page_unhold(m); } crit_exit(); diff --git a/sys/kern/kern_sfbuf.c b/sys/kern/kern_sfbuf.c index 8e439d5e85..b09c5c4747 100644 --- a/sys/kern/kern_sfbuf.c +++ b/sys/kern/kern_sfbuf.c @@ -30,212 +30,103 @@ #include #include #include -#include #include -#include -#include -#include +#include + +#include + #include #include #include #include #include -#include static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, sf_buf_init, NULL) LIST_HEAD(sf_buf_list, sf_buf); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, - "Maximum number of sf_bufs available to the system"); +static struct objcache *sf_buf_cache; -/* - * A hash table of active sendfile(2) buffers - */ -static struct sf_buf_list *sf_buf_hashtable; -static u_long sf_buf_hashmask; +MALLOC_DEFINE(M_SFBUF, "sfbuf", "Sendfile buffer structures"); +struct objcache_malloc_args sf_buf_malloc_args = { sizeof(struct sf_buf), M_SFBUF }; -static TAILQ_HEAD(, sf_buf) sf_buf_freelist; -static u_int sf_buf_alloc_want; -static vm_offset_t sf_base; -static struct sf_buf *sf_bufs; - -static int sfbuf_quick = 1; -SYSCTL_INT(_debug, OID_AUTO, sfbuf_quick, CTLFLAG_RW, &sfbuf_quick, 0, ""); -static int nsffree; -SYSCTL_INT(_kern_ipc, OID_AUTO, nsffree, CTLFLAG_RD, &nsffree, 0, - 
"Number of free sf_bufs available to the system"); - -static __inline -int -sf_buf_hash(vm_page_t m) +static boolean_t +sf_buf_cache_ctor(void *obj, void *pdata, int ocflags) { - int hv; + struct sf_buf *sf = (struct sf_buf *)obj; - hv = ((int)(intptr_t)m / sizeof(vm_page_t)) + ((int)(intptr_t)m >> 12); - return(hv & sf_buf_hashmask); + sf->lwbuf = NULL; + sf->refs = 0; + + return (TRUE); } /* - * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) + * Init objcache of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) */ static void sf_buf_init(void *arg) { - int i; - - sf_buf_hashtable = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask); - TAILQ_INIT(&sf_buf_freelist); - sf_base = kmem_alloc_nofault(&kernel_map, nsfbufs * PAGE_SIZE, - PAGE_SIZE); - sf_bufs = kmalloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, - M_WAITOK | M_ZERO); - for (i = 0; i < nsfbufs; i++) { - sf_bufs[i].kva = sf_base + i * PAGE_SIZE; - sf_bufs[i].flags |= SFBA_ONFREEQ; - TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry); - ++nsffree; - } + sf_buf_cache = objcache_create("sf_buf", 0, 0, + sf_buf_cache_ctor, NULL, NULL, + objcache_malloc_alloc, objcache_malloc_free, + &sf_buf_malloc_args); } /* - * Get an sf_buf from the freelist. Will block if none are available. + * Acquire an sf_buf reference for a vm_page. */ struct sf_buf * -sf_buf_alloc(struct vm_page *m, int flags) +sf_buf_alloc(struct vm_page *m) { - struct sf_buf_list *hash_chain; struct sf_buf *sf; - globaldata_t gd; - int error; - int pflags; - - gd = mycpu; - crit_enter(); - hash_chain = &sf_buf_hashtable[sf_buf_hash(m)]; - LIST_FOREACH(sf, hash_chain, list_entry) { - if (sf->m == m) { - /* - * cache hit - * - * We must invalidate the TLB entry based on whether - * it need only be valid on the local cpu (SFB_CPUPRIVATE), - * or on all cpus. This is conditionalized and in - * most cases no system-wide invalidation should be - * needed. - * - * Note: we do not remove the entry from the freelist - * on the 0->1 transition. - */ - ++sf->refcnt; - if ((flags & SFB_CPUPRIVATE) && sfbuf_quick) { - if ((sf->cpumask & gd->gd_cpumask) == 0) { - pmap_kenter_sync_quick(sf->kva); - sf->cpumask |= gd->gd_cpumask; - } - } else { - if (sf->cpumask != (cpumask_t)-1) { - pmap_kenter_sync(sf->kva); - sf->cpumask = (cpumask_t)-1; - } - } - goto done; /* found existing mapping */ - } + + if ((sf = objcache_get(sf_buf_cache, M_WAITOK)) == NULL) + goto done; + + if ((sf->lwbuf = lwbuf_alloc(m)) == NULL) { + objcache_put(sf_buf_cache, sf); + sf = NULL; + goto done; } /* - * Didn't find old mapping. Get a buffer off the freelist. We - * may have to remove and skip buffers with non-zero ref counts - * that were lazily allocated. + * Force invalidation of the TLB entry on all CPU's */ - for (;;) { - if ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) { - pflags = (flags & SFB_CATCH) ? PCATCH : 0; - ++sf_buf_alloc_want; - error = tsleep(&sf_buf_freelist, pflags, "sfbufa", 0); - --sf_buf_alloc_want; - if (error) - goto done; - } else { - /* - * We may have to do delayed removals for referenced - * sf_buf's here in addition to locating a sf_buf - * to reuse. The sf_bufs must be removed. - * - * We are finished when we find an sf_buf with a - * refcnt of 0. We theoretically do not have to - * remove it from the freelist but it's a good idea - * to do so to preserve LRU operation for the - * (1) never before seen before case and (2) - * accidently recycled due to prior cached uses not - * removing the buffer case. 
- */ - KKASSERT(sf->flags & SFBA_ONFREEQ); - TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry); - --nsffree; - sf->flags &= ~SFBA_ONFREEQ; - if (sf->refcnt == 0) - break; - } - } - if (sf->m != NULL) /* remove previous mapping from hash table */ - LIST_REMOVE(sf, list_entry); - LIST_INSERT_HEAD(hash_chain, sf, list_entry); - sf->refcnt = 1; - sf->m = m; - if ((flags & SFB_CPUPRIVATE) && sfbuf_quick) { - pmap_kenter_quick(sf->kva, sf->m->phys_addr); - sf->cpumask = gd->gd_cpumask; - } else { - pmap_kenter(sf->kva, sf->m->phys_addr); - sf->cpumask = (cpumask_t)-1; - } + lwbuf_set_global(sf->lwbuf); + + sf->refs = 1; + done: - crit_exit(); return (sf); } -#if 0 - -/* - * Add a reference to a buffer (currently unused) - */ void -sf_buf_ref(struct sf_buf *sf) +sf_buf_ref(void *arg) { - if (sf->refcnt == 0) - panic("sf_buf_ref: referencing a free sf_buf"); - crit_enter(); - sf->refcnt++; - crit_exit(); -} + struct sf_buf *sf = arg; -#endif + atomic_add_int(&sf->refs, 1); +} /* - * Lose a reference to an sf_buf. When none left, detach mapped page - * and release resources back to the system. Note that the sfbuf's - * removal from the freelist is delayed, so it may in fact already be - * on the free list. This is the optimal (and most likely) scenario. + * Detach mapped page and release resources back to the system. * * Must be called at splimp. */ -void -sf_buf_free(struct sf_buf *sf) +int +sf_buf_free(void *arg) { - if (sf->refcnt == 0) - panic("sf_buf_free: freeing free sf_buf"); - crit_enter(); - sf->refcnt--; - if (sf->refcnt == 0 && (sf->flags & SFBA_ONFREEQ) == 0) { - TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry); - ++nsffree; - sf->flags |= SFBA_ONFREEQ; - if (sf_buf_alloc_want > 0) - wakeup_one(&sf_buf_freelist); + struct sf_buf *sf = arg; + + KKASSERT(sf->refs > 0); + if (atomic_fetchadd_int(&sf->refs, -1) == 1) { + lwbuf_free(sf->lwbuf); + objcache_put(sf_buf_cache, sf); + return (0); } - crit_exit(); -} + return (1); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index a8a2b8be51..4abceec919 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -52,10 +52,11 @@ #include #include #include -#include #include #include +#include + #include #include #include @@ -446,13 +447,13 @@ iovec_copyin(struct iovec *uiov, struct iovec **kiov, struct iovec *siov, */ /* - * Implement uiomove(9) from physical memory using sf_bufs to reduce + * Implement uiomove(9) from physical memory using lwbuf's to reduce * the creation and destruction of ephemeral mappings. 
*/ int uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio) { - struct sf_buf *sf; + struct lwbuf *lwb; struct thread *td = curthread; struct iovec *iov; void *cp; @@ -485,8 +486,8 @@ uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio) page_offset = offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - page_offset); m = ma[offset >> PAGE_SHIFT]; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - cp = (char *)sf_buf_kva(sf) + page_offset; + lwb = lwbuf_alloc(m); + cp = (char *)lwbuf_kva(lwb) + page_offset; switch (uio->uio_segflg) { case UIO_USERSPACE: /* @@ -498,7 +499,7 @@ uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio) else error = copyin(iov->iov_base, cp, cnt); if (error) { - sf_buf_free(sf); + lwbuf_free(lwb); goto out; } break; @@ -511,7 +512,7 @@ uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio) case UIO_NOCOPY: break; } - sf_buf_free(sf); + lwbuf_free(lwb); iov->iov_base = (char *)iov->iov_base + cnt; iov->iov_len -= cnt; uio->uio_resid -= cnt; diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index 132c3e353c..1088f7fda1 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -47,9 +47,10 @@ #include #include #include -#include #include +#include + #include #include #include @@ -101,7 +102,7 @@ int sys_umtx_sleep(struct umtx_sleep_args *uap) { int error = EBUSY; - struct sf_buf *sf; + struct lwbuf *lwb; struct vm_page_action action; vm_page_t m; void *waddr; @@ -124,14 +125,14 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) error = EFAULT; goto done; } - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); + lwb = lwbuf_alloc(m); offset = (vm_offset_t)uap->ptr & PAGE_MASK; /* * The critical section is required to interlock the tsleep against * a wakeup from another cpu. The lfence forces synchronization. 
*/ - if (*(int *)(sf_buf_kva(sf) + offset) == uap->value) { + if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { if ((timeout = uap->timeout) != 0) { timeout = (timeout / 1000000) * hz + ((timeout % 1000000) * hz + 999999) / 1000000; @@ -139,7 +140,7 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset); crit_enter(); tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX); - if (*(int *)(sf_buf_kva(sf) + offset) == uap->value) { + if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { vm_page_init_action(&action, umtx_sleep_page_action_cow, waddr); vm_page_register_action(m, &action, VMEVENT_COW); error = tsleep(waddr, PCATCH | PINTERLOCKED | PDOMAIN_UMTX, @@ -156,7 +157,7 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) error = EBUSY; } - sf_buf_free(sf); + lwbuf_free(lwb); /*vm_page_dirty(m); we don't actually dirty the page */ vm_page_unhold(m); done: diff --git a/sys/kern/kern_xio.c b/sys/kern/kern_xio.c index 374e9da085..0961f3112d 100644 --- a/sys/kern/kern_xio.c +++ b/sys/kern/kern_xio.c @@ -54,7 +54,8 @@ #include #include #include -#include + +#include #include #include @@ -301,7 +302,7 @@ xio_copy_xtou(xio_t xio, int uoffset, void *uptr, int bytes) int error; int offset; vm_page_t m; - struct sf_buf *sf; + struct lwbuf *lwb; if (uoffset + bytes > xio->xio_bytes) return(EFAULT); @@ -316,9 +317,9 @@ xio_copy_xtou(xio_t xio, int uoffset, void *uptr, int bytes) ++i ) { m = xio->xio_pages[i]; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - error = copyout((char *)sf_buf_kva(sf) + offset, uptr, n); - sf_buf_free(sf); + lwb = lwbuf_alloc(m); + error = copyout((char *)lwbuf_kva(lwb) + offset, uptr, n); + lwbuf_free(lwb); if (error) break; bytes -= n; @@ -349,7 +350,7 @@ xio_copy_xtok(xio_t xio, int uoffset, void *kptr, int bytes) int error; int offset; vm_page_t m; - struct sf_buf *sf; + struct lwbuf *lwb; if (bytes + uoffset > xio->xio_bytes) return(EFAULT); @@ -364,9 +365,9 @@ xio_copy_xtok(xio_t xio, int uoffset, void *kptr, int bytes) ++i ) { m = xio->xio_pages[i]; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - bcopy((char *)sf_buf_kva(sf) + offset, kptr, n); - sf_buf_free(sf); + lwb = lwbuf_alloc(m); + bcopy((char *)lwbuf_kva(lwb) + offset, kptr, n); + lwbuf_free(lwb); bytes -= n; kptr = (char *)kptr + n; if (bytes == 0) @@ -395,7 +396,7 @@ xio_copy_utox(xio_t xio, int uoffset, const void *uptr, int bytes) int error; int offset; vm_page_t m; - struct sf_buf *sf; + struct lwbuf *lwb; if (uoffset + bytes > xio->xio_bytes) return(EFAULT); @@ -410,9 +411,9 @@ xio_copy_utox(xio_t xio, int uoffset, const void *uptr, int bytes) ++i ) { m = xio->xio_pages[i]; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - error = copyin(uptr, (char *)sf_buf_kva(sf) + offset, n); - sf_buf_free(sf); + lwb = lwbuf_alloc(m); + error = copyin(uptr, (char *)lwbuf_kva(lwb) + offset, n); + lwbuf_free(lwb); if (error) break; bytes -= n; @@ -443,7 +444,7 @@ xio_copy_ktox(xio_t xio, int uoffset, const void *kptr, int bytes) int error; int offset; vm_page_t m; - struct sf_buf *sf; + struct lwbuf *lwb; if (uoffset + bytes > xio->xio_bytes) return(EFAULT); @@ -458,9 +459,9 @@ xio_copy_ktox(xio_t xio, int uoffset, const void *kptr, int bytes) ++i ) { m = xio->xio_pages[i]; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - bcopy(kptr, (char *)sf_buf_kva(sf) + offset, n); - sf_buf_free(sf); + lwb = lwbuf_alloc(m); + bcopy(kptr, (char *)lwbuf_kva(lwb) + offset, n); + lwbuf_free(lwb); bytes -= n; kptr = (const char *)kptr + n; if (bytes == 0) diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 
f3af67edf8..239a86e384 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -63,9 +63,6 @@ #ifndef MAXFILES #define MAXFILES (maxproc * 16) #endif -#ifndef NSFBUFS -#define NSFBUFS (512 + maxusers * 16) -#endif #ifndef MAXPOSIXLOCKSPERUID #define MAXPOSIXLOCKSPERUID (maxusers * 64) /* Should be a safe value */ #endif @@ -95,9 +92,6 @@ u_quad_t dflssiz; /* initial stack size limit */ u_quad_t maxssiz; /* max stack size */ u_quad_t sgrowsiz; /* amount to grow stack */ -/* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers) */ -int nsfbufs; - /* * These have to be allocated somewhere; allocating * them here forces loader errors if this file is omitted @@ -182,11 +176,8 @@ init_param2(int physpages) TUNABLE_INT_FETCH("kern.maxposixlocksperuid", &maxposixlocksperuid); /* - * Cannot be changed after boot. Unless overriden, NSFBUFS is based - * on maxusers and NBUF is typically 0 (auto-sized later). + * Unless overriden, NBUF is typically 0 (auto-sized later). */ - nsfbufs = NSFBUFS; - TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); nbuf = NBUF; TUNABLE_INT_FETCH("kern.nbuf", &nbuf); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 2c3774cb95..1e4735ce03 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -86,13 +86,6 @@ #include #endif /* SCTP */ -struct sfbuf_mref { - struct sf_buf *sf; - int mref_count; -}; - -static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures"); - /* * System call interface to the socket abstraction. */ @@ -1402,40 +1395,25 @@ getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) * * XXX vm_page_*() routines are not MPSAFE yet, the MP lock is required. */ -static void -sf_buf_mref(void *arg) -{ - struct sfbuf_mref *sfm = arg; - - /* - * We must already hold a ref so there is no race to 0, just - * atomically increment the count. - */ - atomic_add_int(&sfm->mref_count, 1); -} - static void sf_buf_mfree(void *arg) { - struct sfbuf_mref *sfm = arg; + struct sf_buf *sf = arg; vm_page_t m; - KKASSERT(sfm->mref_count > 0); - if (atomic_fetchadd_int(&sfm->mref_count, -1) == 1) { - /* - * XXX vm_page_*() and SFBUF routines not MPSAFE yet. - */ - get_mplock(); - crit_enter(); - m = sf_buf_page(sfm->sf); - sf_buf_free(sfm->sf); + /* + * XXX vm_page_*() and SFBUF routines not MPSAFE yet. + */ + get_mplock(); + crit_enter(); + m = sf_buf_page(sf); + if (sf_buf_free(sf) == 0) { vm_page_unwire(m, 0); if (m->wire_count == 0 && m->object == NULL) vm_page_try_to_free(m); - crit_exit(); - rel_mplock(); - kfree(sfm, M_SENDFILE); } + crit_exit(); + rel_mplock(); } /* @@ -1573,7 +1551,6 @@ kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, struct file *fp; struct mbuf *m; struct sf_buf *sf; - struct sfbuf_mref *sfm; struct vm_page *pg; off_t off, xfsize; off_t hbytes = 0; @@ -1724,7 +1701,7 @@ retry_lookup: * Get a sendfile buf. We usually wait as long as necessary, * but this wait can be interrupted. */ - if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { + if ((sf = sf_buf_alloc(pg)) == NULL) { crit_enter(); vm_page_unwire(pg, 0); vm_page_try_to_free(pg); @@ -1745,19 +1722,12 @@ retry_lookup: goto done; } - /* - * sfm is a temporary hack, use a per-cpu cache for this. 
- */ - sfm = kmalloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK); - sfm->sf = sf; - sfm->mref_count = 1; - m->m_ext.ext_free = sf_buf_mfree; - m->m_ext.ext_ref = sf_buf_mref; - m->m_ext.ext_arg = sfm; - m->m_ext.ext_buf = (void *)sf->kva; + m->m_ext.ext_ref = sf_buf_ref; + m->m_ext.ext_arg = sf; + m->m_ext.ext_buf = (void *)sf_buf_kva(sf); m->m_ext.ext_size = PAGE_SIZE; - m->m_data = (char *) sf->kva + pgoff; + m->m_data = (char *)sf_buf_kva(sf) + pgoff; m->m_flags |= M_EXT; m->m_pkthdr.len = m->m_len = xfsize; KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0); diff --git a/sys/platform/pc32/conf/files b/sys/platform/pc32/conf/files index 17e0734fef..c3e6e6567d 100644 --- a/sys/platform/pc32/conf/files +++ b/sys/platform/pc32/conf/files @@ -140,6 +140,7 @@ platform/pc32/apm/apm.c optional apm cpu/i386/misc/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${WERROR} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" cpu/i386/misc/elf_machdep.c standard +cpu/i386/misc/lwbuf.c standard cpu/i386/misc/in_cksum2.s optional inet cpu/i386/misc/ktr.c optional ktr cpu/i386/misc/db_disasm.c optional ddb diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 65e5c4daad..5a9b591685 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -210,6 +210,7 @@ ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending)); ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); +ASSYM(GD_LWBUF_FPAGES, offsetof(struct mdglobaldata, gd_lwbuf_fpages)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); ASSYM(GD_FPU_LOCK, offsetof(struct mdglobaldata, gd_fpu_lock)); ASSYM(GD_SAVEFPU, offsetof(struct mdglobaldata, gd_savefpu)); diff --git a/sys/platform/pc32/i386/globals.s b/sys/platform/pc32/i386/globals.s index 2aa2467c50..9ea88d389c 100644 --- a/sys/platform/pc32/i386/globals.s +++ b/sys/platform/pc32/i386/globals.s @@ -58,6 +58,9 @@ .set gd_reqflags,globaldata + GD_REQFLAGS .set gd_common_tss,globaldata + GD_COMMON_TSS + .globl gd_lwbuf_fpages + .set gd_lwbuf_fpages,globaldata + GD_LWBUF_FPAGES + .globl gd_common_tssd, gd_tss_gdt .set gd_common_tssd,globaldata + GD_COMMON_TSSD .set gd_tss_gdt,globaldata + GD_TSS_GDT diff --git a/sys/platform/pc32/include/globaldata.h b/sys/platform/pc32/include/globaldata.h index 3fcf66772d..495b809362 100644 --- a/sys/platform/pc32/include/globaldata.h +++ b/sys/platform/pc32/include/globaldata.h @@ -51,6 +51,9 @@ #ifndef _MACHINE_NPX_H_ #include #endif +#ifndef _CPU_LWBUF_H_ +#include +#endif /* * Note on interrupt control. 
Pending interrupts not yet dispatched are @@ -73,6 +76,7 @@ struct mdglobaldata { struct segment_descriptor *gd_tss_gdt; struct thread *gd_npxthread; struct i386tss gd_common_tss; + struct lwbuf_free_kvp_list gd_lwbuf_fpages; /* lwbuf: free kva */ union savefpu gd_savefpu; /* fast bcopy/zero temp fpu save area */ int gd_fpu_lock; /* fast bcopy/zero cpu lock */ int gd_fpending; /* fast interrupt pending */ diff --git a/sys/platform/pc64/conf/files b/sys/platform/pc64/conf/files index 4c39be3d97..49e7150f5a 100644 --- a/sys/platform/pc64/conf/files +++ b/sys/platform/pc64/conf/files @@ -95,6 +95,7 @@ platform/pc64/x86_64/mpboot.S optional smp # DDB XXX cpu/x86_64/misc/x86_64-gdbstub.c optional ddb +cpu/x86_64/misc/lwbuf.c standard platform/pc64/x86_64/elf_machdep.c standard platform/pc64/x86_64/in_cksum2.s optional inet platform/pc64/x86_64/ktr.c optional ktr diff --git a/sys/platform/vkernel/conf/files b/sys/platform/vkernel/conf/files index b639ce6c4a..f4bd19408a 100644 --- a/sys/platform/vkernel/conf/files +++ b/sys/platform/vkernel/conf/files @@ -38,6 +38,7 @@ platform/vkernel/i386/mp.c optional smp \ # # DDB XXX cpu/i386/misc/elf_machdep.c standard +cpu/i386/misc/lwbuf.c standard cpu/i386/misc/in_cksum2.s optional inet cpu/i386/misc/ktr.c optional ktr cpu/i386/misc/db_disasm.c optional ddb diff --git a/sys/platform/vkernel/i386/genassym.c b/sys/platform/vkernel/i386/genassym.c index cbabc033d6..22c18ee6d5 100644 --- a/sys/platform/vkernel/i386/genassym.c +++ b/sys/platform/vkernel/i386/genassym.c @@ -200,6 +200,7 @@ ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending)); ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); +ASSYM(GD_LWBUF_FPAGES, offsetof(struct mdglobaldata, gd_lwbuf_fpages)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); ASSYM(GD_FPU_LOCK, offsetof(struct mdglobaldata, gd_fpu_lock)); ASSYM(GD_SAVEFPU, offsetof(struct mdglobaldata, gd_savefpu)); diff --git a/sys/platform/vkernel/i386/global.s b/sys/platform/vkernel/i386/global.s index c81783fa79..9772e925f8 100644 --- a/sys/platform/vkernel/i386/global.s +++ b/sys/platform/vkernel/i386/global.s @@ -55,6 +55,9 @@ .set gd_reqflags,globaldata + GD_REQFLAGS .set gd_common_tss,globaldata + GD_COMMON_TSS + .globl gd_lwbuf_fpages + .set gd_lwbuf_fpages,globaldata + GD_LWBUF_FPAGES + .globl gd_common_tssd, gd_tss_gdt .set gd_common_tssd,globaldata + GD_COMMON_TSSD .set gd_tss_gdt,globaldata + GD_TSS_GDT diff --git a/sys/platform/vkernel/include/globaldata.h b/sys/platform/vkernel/include/globaldata.h index 336eac90c7..0f9a771e08 100644 --- a/sys/platform/vkernel/include/globaldata.h +++ b/sys/platform/vkernel/include/globaldata.h @@ -54,6 +54,9 @@ #ifndef _MACHINE_NPX_H_ #include #endif +#ifndef _CPU_LWBUF_H_ +#include +#endif /* * Note on interrupt control. 
Pending interrupts not yet dispatched are @@ -76,6 +79,7 @@ struct mdglobaldata { struct segment_descriptor *gd_tss_gdt; struct thread *gd_npxthread; struct i386tss gd_common_tss; + struct lwbuf_free_kvp_list gd_lwbuf_fpages; /* lwbuf: free kva */ union savefpu gd_savefpu; /* fast bcopy/zero temp fpu save area */ int gd_fpu_lock; /* fast bcopy/zero cpu lock */ int gd_fpending; /* fast interrupt pending */ diff --git a/sys/platform/vkernel/platform/copyio.c b/sys/platform/vkernel/platform/copyio.c index 9231995cbf..aff3e6e09e 100644 --- a/sys/platform/vkernel/platform/copyio.c +++ b/sys/platform/vkernel/platform/copyio.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include @@ -128,7 +128,7 @@ int copyin(const void *udaddr, void *kaddr, size_t len) { struct vmspace *vm = curproc->p_vmspace; - struct sf_buf *sf; + struct lwbuf *lwb; vm_page_t m; int error; size_t n; @@ -144,14 +144,14 @@ copyin(const void *udaddr, void *kaddr, size_t len) n = PAGE_SIZE - ((vm_offset_t)udaddr & PAGE_MASK); if (n > len) n = len; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - bcopy((char *)sf_buf_kva(sf)+((vm_offset_t)udaddr & PAGE_MASK), + lwb = lwbuf_alloc(m); + bcopy((char *)lwbuf_kva(lwb)+((vm_offset_t)udaddr & PAGE_MASK), kaddr, n); len -= n; udaddr = (const char *)udaddr + n; kaddr = (char *)kaddr + n; vm_page_unhold(m); - sf_buf_free(sf); + lwbuf_free(lwb); } rel_mplock(); return (error); @@ -166,7 +166,7 @@ int copyout(const void *kaddr, void *udaddr, size_t len) { struct vmspace *vm = curproc->p_vmspace; - struct sf_buf *sf; + struct lwbuf *lwb; vm_page_t m; int error; size_t n; @@ -182,15 +182,15 @@ copyout(const void *kaddr, void *udaddr, size_t len) n = PAGE_SIZE - ((vm_offset_t)udaddr & PAGE_MASK); if (n > len) n = len; - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - bcopy(kaddr, (char *)sf_buf_kva(sf) + + lwb = lwbuf_alloc(m); + bcopy(kaddr, (char *)lwbuf_kva(lwb) + ((vm_offset_t)udaddr & PAGE_MASK), n); len -= n; udaddr = (char *)udaddr + n; kaddr = (const char *)kaddr + n; vm_page_dirty(m); vm_page_unhold(m); - sf_buf_free(sf); + lwbuf_free(lwb); } rel_mplock(); return (error); diff --git a/sys/sys/exec.h b/sys/sys/exec.h index 788c6c27fa..cd13bbccd0 100644 --- a/sys/sys/exec.h +++ b/sys/sys/exec.h @@ -76,10 +76,10 @@ struct execsw { #ifdef _KERNEL #include -struct sf_buf; -int exec_map_page(struct image_params *, vm_pindex_t, struct sf_buf **, +struct lwbuf; +int exec_map_page(struct image_params *, vm_pindex_t, struct lwbuf **, const char **); -void exec_unmap_page(struct sf_buf *); +void exec_unmap_page(struct lwbuf *); int exec_map_first_page (struct image_params *); void exec_unmap_first_page (struct image_params *); diff --git a/sys/sys/imgact.h b/sys/sys/imgact.h index d68bce0c20..2d1eb855a4 100644 --- a/sys/sys/imgact.h +++ b/sys/sys/imgact.h @@ -62,7 +62,7 @@ struct image_params { char interpreted; /* flag - this executable is interpreted */ char interpreter_name[MAXSHELLCMDLEN]; /* name of the interpreter */ void *auxargs; /* ELF Auxinfo structure pointer */ - struct sf_buf *firstpage; /* first page that we mapped */ + struct lwbuf *firstpage; /* first page that we mapped */ unsigned long ps_strings; /* PS_STRINGS for BSD/OS binaries */ }; diff --git a/sys/sys/sfbuf.h b/sys/sys/sfbuf.h index b13f3b1bb0..43f378f110 100644 --- a/sys/sys/sfbuf.h +++ b/sys/sys/sfbuf.h @@ -35,6 +35,9 @@ #ifndef _SYS_QUEUE_H_ #include #endif +#ifndef _CPU_LWBUF_H_ +#include +#endif #if !defined(_KERNEL) && !defined(_KERNEL_STRUCTURES) @@ -42,42 +45,19 @@ #endif struct sf_buf { - 
LIST_ENTRY(sf_buf) list_entry; /* hash chain of active buffers */ - TAILQ_ENTRY(sf_buf) free_entry; /* list of free buffers */ - struct vm_page *m; /* currently mapped page */ - vm_offset_t kva; /* va of mapping */ - int refcnt; /* usage of this mapping */ - int flags; /* global SFBA flags */ - cpumask_t cpumask; /* cpu mapping synchronization */ + struct lwbuf *lwbuf; /* lightweight buffer */ + int refs; }; -/* - * sf_buf_alloc() flags (not all are stored in sf->flags) - */ -#define SFB_CPUPRIVATE 0x0001 /* sync mapping to current cpu only */ -#define SFBA_ONFREEQ 0x0002 /* on the free queue (lazy move) */ -#define SFB_CATCH 0x0004 /* allow interruption */ - -static __inline vm_offset_t -sf_buf_kva(struct sf_buf *sf) -{ - return(sf->kva); -} - -static __inline struct vm_page * -sf_buf_page(struct sf_buf *sf) -{ - return(sf->m); -} +#define sf_buf_kva(sf) (lwbuf_kva((sf)->lwbuf)) +#define sf_buf_page(sf) (lwbuf_page((sf)->lwbuf)) #if defined(_KERNEL) -extern int nsfbufs; - -struct sf_buf *sf_buf_alloc(struct vm_page *, int flags); -void sf_buf_free(struct sf_buf *); -void sf_buf_ref(struct sf_buf *); +struct sf_buf *sf_buf_alloc(struct vm_page *); +void sf_buf_ref(void *); +int sf_buf_free(void *); #endif diff --git a/sys/vfs/tmpfs/tmpfs_vnops.c b/sys/vfs/tmpfs/tmpfs_vnops.c index 8c928665c1..936f27db0c 100644 --- a/sys/vfs/tmpfs/tmpfs_vnops.c +++ b/sys/vfs/tmpfs/tmpfs_vnops.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index b55b0da686..c163a2f1ba 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -82,10 +82,11 @@ #include #include #include -#include #include #include +#include + #include #include #include @@ -792,7 +793,7 @@ int vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, vpte_t vpte, int fault_type) { - struct sf_buf *sf; + struct lwbuf *lwb; int vshift = 32 - PAGE_SHIFT; /* page index bits remaining */ int result = KERN_SUCCESS; vpte_t *ptep; @@ -835,8 +836,8 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, * entry in the page table page. */ vshift -= VPTE_PAGE_BITS; - sf = sf_buf_alloc(fs->m, SFB_CPUPRIVATE); - ptep = ((vpte_t *)sf_buf_kva(sf) + + lwb = lwbuf_alloc(fs->m); + ptep = ((vpte_t *)lwbuf_kva(lwb) + ((*pindex >> vshift) & VPTE_PAGE_MASK)); vpte = *ptep; @@ -862,7 +863,7 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, vm_page_dirty(fs->m); } } - sf_buf_free(sf); + lwbuf_free(lwb); vm_page_flag_set(fs->m, PG_REFERENCED); vm_page_activate(fs->m); vm_page_wakeup(fs->m); diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 315d429998..70fcbb91ef 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -61,7 +61,8 @@ #include #include #include -#include + +#include #include #include @@ -382,7 +383,7 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) if (m && m->valid) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; - struct sf_buf *sf; + struct lwbuf *lwb; /* * Clear out partial-page garbage in case @@ -391,10 +392,10 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) * This is byte aligned. */ vm_page_busy(m); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - kva = sf_buf_kva(sf); + lwb = lwbuf_alloc(m); + kva = lwbuf_kva(lwb); bzero((caddr_t)kva + base, size); - sf_buf_free(sf); + lwbuf_free(lwb); /* * XXX work around SMP data integrity race -- 2.41.0
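
For reference, the consumer-side conversion applied at each former sfbuf
call site above follows one pattern; a minimal sketch (the names m, kptr,
offset and n stand in for whatever the call site provides, and the page is
assumed to be held or wired by the caller, as at all converted sites):

	struct lwbuf *lwb;

	lwb = lwbuf_alloc(m);			/* was: sf = sf_buf_alloc(m, SFB_CPUPRIVATE); */
	bcopy((char *)lwbuf_kva(lwb) + offset,	/* was: sf_buf_kva(sf) */
	    kptr, n);
	lwbuf_free(lwb);			/* was: sf_buf_free(sf); */

Because lwbufs are not cached or refcounted, the free must follow the last
use of the mapping; sendfile(2) keeps its longer-lived references through
the refactored sf_buf wrapper instead.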