From bb6811be56a9f299d28f69b6d6bf7233e1e7359a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 16 May 2010 10:46:35 -0700 Subject: [PATCH] kernel - zero pages during idle Bring in base work by Venkatesh Srinivas: * vm/vm_zeropage.c and bzeront() code. Plus make the following adjustments: * Move the bzeront code from platform/ to cpu/ and add an x86_64 version of bzeront(). * Expose the page zeroing rate via sysctl and adjust the default upwards. * Expose the movnti feature via sysctl (defaults to off). * Change poll interval from 1 second to 1/10 second. * Correct a bug where the MP lock was not being initially released. The thread is created with the MP lock held. This was causing the hysteresis check to fail. * Correct a bug where vm_page_zero_count was being double-incremented due to changes in how the zeroed page is freed. * Clean up the code syntax a bit. Submitted-by: Venkatesh Srinivas --- sys/conf/files | 1 + sys/cpu/i386/misc/bzeront.s | 58 ++++++++ sys/cpu/x86_64/misc/bzeront.s | 55 ++++++++ sys/platform/pc32/conf/files | 1 + sys/platform/pc64/conf/files | 1 + sys/platform/vkernel/conf/files | 1 + sys/platform/vkernel64/conf/files | 2 + sys/sys/systm.h | 1 + sys/sys/thread.h | 1 + sys/vm/vm_page.c | 38 +++++- sys/vm/vm_page.h | 1 + sys/vm/vm_zeroidle.c | 216 ++++++++++++++++++++++++++++++ 12 files changed, 375 insertions(+), 1 deletion(-) create mode 100644 sys/cpu/i386/misc/bzeront.s create mode 100644 sys/cpu/x86_64/misc/bzeront.s create mode 100644 sys/vm/vm_zeroidle.c diff --git a/sys/conf/files b/sys/conf/files index b685f6b3a7..2a89e68b6f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1485,6 +1485,7 @@ vm/vm_swap.c standard vm/vm_unix.c standard vm/vm_vmspace.c standard vm/vnode_pager.c standard +vm/vm_zeroidle.c standard vm/vm_zone.c standard # # USB support diff --git a/sys/cpu/i386/misc/bzeront.s b/sys/cpu/i386/misc/bzeront.s new file mode 100644 index 0000000000..bae1ec7b1b --- /dev/null +++ b/sys/cpu/i386/misc/bzeront.s @@ -0,0
+1,58 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "assym.s" + + .text + +/* + * void bzeront(void *buf, size_t n); + * bzero() using non-temporal (movnti) stores to bypass cache pollution + * + * Only use on dword-aligned buffers with a non-zero, dword-multiple length + */ +ENTRY(bzeront) + movl 8(%esp), %ecx /* %ecx = n, the byte count */ + movl 4(%esp), %eax /* %eax = buf */ + xorl %edx, %edx /* %edx = 0, the value stored */ + shrl $2, %ecx /* bytes -> dwords */ +1: + movnti %edx, (%eax) /* non-temporal 4-byte store */ + add $4, %eax + loop 1b /* decrements %ecx first, so a zero count is not handled */ + ret diff --git a/sys/cpu/x86_64/misc/bzeront.s b/sys/cpu/x86_64/misc/bzeront.s new file mode 100644 index 0000000000..4acfefb762 --- /dev/null +++ b/sys/cpu/x86_64/misc/bzeront.s @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010 The DragonFly Project. All rights reserved. + * All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include "assym.s" + + .text + +/* + * void bzeront(void *buf, size_t n); (%rdi, %rsi) + * + * bzero() using non-temporal stores to bypass cache pollution. The buffer + * must be 8-byte aligned and the size a non-zero multiple of 8 bytes. + */ +ENTRY(bzeront) + xorq %rax,%rax /* %rax = 0, the value stored */ + shrq $3,%rsi /* bytes -> qwords */ +1: + movnti %rax,(%rdi) /* non-temporal 8-byte store */ + addq $8,%rdi + subq $1,%rsi /* %rsi counts qwords, so step by one per store */ + jne 1b + ret diff --git a/sys/platform/pc32/conf/files b/sys/platform/pc32/conf/files index b3ea73dbac..aed5ab70b1 100644 --- a/sys/platform/pc32/conf/files +++ b/sys/platform/pc32/conf/files @@ -145,6 +145,7 @@ cpu/i386/misc/in_cksum2.s optional inet cpu/i386/misc/ktr.c optional ktr cpu/i386/misc/db_disasm.c optional ddb cpu/i386/misc/i386-gdbstub.c optional ddb +cpu/i386/misc/bzeront.s standard platform/pc32/i386/autoconf.c standard platform/pc32/i386/bios.c standard platform/pc32/i386/bioscall.s standard diff --git a/sys/platform/pc64/conf/files b/sys/platform/pc64/conf/files index a57dd69bf7..2b082ce0e0 100644 --- a/sys/platform/pc64/conf/files +++ b/sys/platform/pc64/conf/files @@ -100,6 +100,7 @@ cpu/x86_64/misc/elf_machdep.c standard cpu/x86_64/misc/in_cksum2.s optional inet cpu/x86_64/misc/ktr.c optional ktr cpu/x86_64/misc/db_disasm.c optional ddb +cpu/x86_64/misc/bzeront.s standard # # DOS mbr and gpt kern/subr_diskmbr.c standard diff --git a/sys/platform/vkernel/conf/files b/sys/platform/vkernel/conf/files index f4bd19408a..4ed0c718a3 100644 --- a/sys/platform/vkernel/conf/files +++
b/sys/platform/vkernel/conf/files @@ -43,6 +43,7 @@ cpu/i386/misc/in_cksum2.s optional inet cpu/i386/misc/ktr.c optional ktr cpu/i386/misc/db_disasm.c optional ddb cpu/i386/misc/i386-gdbstub.c optional ddb +cpu/i386/misc/bzeront.s standard # # DOS mbr and gpt kern/subr_diskmbr.c standard diff --git a/sys/platform/vkernel64/conf/files b/sys/platform/vkernel64/conf/files index adfe6877b6..4782990ac3 100644 --- a/sys/platform/vkernel64/conf/files +++ b/sys/platform/vkernel64/conf/files @@ -24,6 +24,8 @@ cpu/x86_64/misc/in_cksum2.s optional inet cpu/x86_64/misc/ktr.c optional ktr cpu/x86_64/misc/db_disasm.c optional ddb cpu/x86_64/misc/x86_64-gdbstub.c optional ddb +cpu/x86_64/misc/bzeront.s standard + # # DOS mbr and gpt kern/subr_diskmbr.c standard diff --git a/sys/sys/systm.h b/sys/sys/systm.h index bf4b166355..cd8abd1523 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -224,6 +224,7 @@ void bcopyi (const void *from, void *to, size_t len); void bcopy (volatile const void *from, volatile void *to, size_t len); void ovbcopy (const void *from, void *to, size_t len); void bzero (volatile void *buf, size_t len); +void bzeront (volatile void *buf, size_t len); void *memcpy (void *to, const void *from, size_t len); int copystr (const void *kfaddr, void *kdaddr, size_t len, diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 42c9f99546..cacc775566 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -296,6 +296,7 @@ struct thread { * rollup flag will be set in mycpu->gd_reqflags. 
*/ #define TDPRI_IDLE_THREAD 0 /* the idle thread */ +#define TDPRI_IDLE_WORK 1 /* idle work (page zero, etc) */ #define TDPRI_USER_SCHEDULER 2 /* user scheduler helper */ #define TDPRI_USER_IDLE 4 /* user scheduler idle */ #define TDPRI_USER_NORM 6 /* user scheduler normal */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index c25de4e1cc..b3f54e2c05 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -87,11 +87,13 @@ #include #include #include -#include #include #include +#include +#include + static void vm_page_queue_init(void); static void vm_page_free_wakeup(void); static vm_page_t vm_page_select_cache(vm_object_t, vm_pindex_t); @@ -391,6 +393,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) /* * Insert it into the object. */ + ASSERT_MP_LOCK_HELD(curthread); vm_page_rb_tree_RB_INSERT(&object->rb_memq, m); object->generation++; @@ -444,6 +447,7 @@ vm_page_remove(vm_page_t m) /* * Remove the page from the object and update the object. */ + ASSERT_MP_LOCK_HELD(curthread); vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m); object->resident_page_count--; object->generation++; @@ -475,6 +479,7 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex) /* * Search the hash table for this object/offset pair */ + ASSERT_MP_LOCK_HELD(curthread); crit_enter(); m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex); crit_exit(); @@ -1058,6 +1063,36 @@ vm_page_free_toq(vm_page_t m) crit_exit(); } +/* + * vm_page_free_fromq_fast() + * + * Remove a non-zero page from one of the free queues; the page is removed for + * zeroing, so do not issue a wakeup. 
+ * + * MPUNSAFE + */ +vm_page_t +vm_page_free_fromq_fast(void) +{ + static int qi; /* queue index to probe next; persists across calls */ + vm_page_t m; + int i; + + crit_enter(); + for (i = 0; i < PQ_L2_SIZE; ++i) { /* at most PQ_L2_SIZE probes per call */ + m = vm_page_list_find(PQ_FREE, qi, FALSE); + qi = (qi + PQ_PRIME2) & PQ_L2_MASK; /* step the index whether or not a page was found */ + if (m && (m->flags & PG_ZERO) == 0) { /* accept only pages not already flagged PG_ZERO */ + vm_page_unqueue_nowakeup(m); + vm_page_busy(m); + break; + } + m = NULL; /* nothing usable at this index; NULL is returned if every probe fails */ + } + crit_exit(); + return (m); +} + /* * vm_page_unmanage() * @@ -1743,6 +1778,7 @@ vm_page_event_internal(vm_page_t m, vm_page_event_t event) } } + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 3e73f24d18..e6bce8c4f6 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -514,6 +514,7 @@ int vm_page_bits (int, int); vm_page_t vm_page_list_find(int basequeue, int index, boolean_t prefer_zero); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); +vm_page_t vm_page_free_fromq_fast(void); vm_offset_t vm_contig_pg_kmap(int, u_long, vm_map_t, int); void vm_contig_pg_free(int, u_long); void vm_page_event_internal(vm_page_t, vm_page_event_t); diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c new file mode 100644 index 0000000000..276cc3aab2 --- /dev/null +++ b/sys/vm/vm_zeroidle.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 1994 John Dyson + * Copyright (c) 2001 Matt Dillon + * Copyright (c) 2010 The DragonFly Project + * + * All Rights Reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 + * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ + * from FreeBSD: .../i386/vm_machdep.c,v 1.165 2001/07/04 23:27:04 dillon + * + * $Id: vm_zeroidle.c,v 1.3 2010/05/12 04:50:45 sv5679 Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Implement the pre-zeroed page mechanism. 
+ */ +#define ZIDLE_LO(v) ((v) * 2 / 3) /* lower hysteresis threshold: 2/3 of v */ +#define ZIDLE_HI(v) ((v) * 4 / 5) /* upper hysteresis threshold: 4/5 of v */ + +/* Number of bytes to zero between reschedule checks */ +#define IDLEZERO_RUN (32) + +/* Maximum number of pages per second to zero */ +#define NPAGES_RUN (20000) + + +static int idlezero_enable = 0; +TUNABLE_INT("vm.idlezero_enable", &idlezero_enable); +SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, + "Allow the kernel to use idle CPU cycles to zero pages"); +static int idlezero_rate = NPAGES_RUN; +SYSCTL_INT(_vm, OID_AUTO, idlezero_rate, CTLFLAG_RW, &idlezero_rate, 0, + "Maximum pages per second to zero"); +static int idlezero_nocache = 0; +SYSCTL_INT(_vm, OID_AUTO, idlezero_nocache, CTLFLAG_RW, &idlezero_nocache, 0, + "Zero pages with non-temporal stores (bzeront) instead of bzero"); + +static int idlezero_count = 0; +SYSCTL_INT(_vm, OID_AUTO, idlezero_count, CTLFLAG_RD, &idlezero_count, 0, + "The number of physical pages prezeroed at idle time"); + +enum zeroidle_state { + STATE_IDLE, + STATE_GET_PAGE, + STATE_ZERO_PAGE, + STATE_RELEASE_PAGE +}; + +static int zero_state; /* read by vm_page_zero_check() and used as the tsleep() channel; not assigned in the code visible here */ + +/* + * Attempt to maintain between ZIDLE_LO and ZIDLE_HI of our free pages in a + * PG_ZERO'd state. Add some hysteresis to (attempt to) avoid + * generally zeroing a page when the system is near steady-state. + * Otherwise we might get 'flutter' during disk I/O / IPC or + * fast sleeps. We also do not want to be continuously zeroing + * pages because doing so may flush our L1 and L2 caches too much. + */ +static int +vm_page_zero_check(void) +{ + if (idlezero_enable == 0) + return (0); + if (zero_state && vm_page_zero_count >= ZIDLE_LO(vmstats.v_free_count)) + return (0); + if (vm_page_zero_count >= ZIDLE_HI(vmstats.v_free_count)) + return (0); + return (1); +} + +static void +vm_pagezero(void __unused *arg) +{ + vm_page_t m = NULL; + struct lwbuf *buf = NULL; + enum zeroidle_state state = STATE_IDLE; + char *pg = NULL; + int npages = 0; + int i = 0; + + /* + * Adjust thread parameters before entering our loop.
The thread + * is started with the MP lock held and with normal kernel thread + * priority. + * + * Also put us on the last cpu for now. + */ + rel_mplock(); + lwkt_setpri_self(TDPRI_IDLE_WORK); + lwkt_setcpu_self(globaldata_find(ncpus - 1)); + + /* + * Loop forever + */ + for (;;) { + switch(state) { + case STATE_IDLE: + /* + * Wait for work, polling ten times a second. + */ + tsleep(&zero_state, 0, "pgzero", hz / 10); + if (vm_page_zero_check()) + npages = idlezero_rate / 10; /* page budget for this interval */ + if (npages > 0) /* a negative idlezero_rate must not start an unbounded run */ + state = STATE_GET_PAGE; /* handled on the next loop pass */ + break; + case STATE_GET_PAGE: + /* + * Acquire page to zero + */ + if (try_mplock() == 0) { + state = STATE_IDLE; + } else if (--npages == 0) { /* budget for this interval is used up */ + state = STATE_IDLE; + rel_mplock(); + } else { + m = vm_page_free_fromq_fast(); + if (m == NULL) { + state = STATE_IDLE; + } else { + state = STATE_ZERO_PAGE; + buf = lwbuf_alloc(m); + pg = (char *)lwbuf_kva(buf); + i = 0; + } + rel_mplock(); + } + break; + case STATE_ZERO_PAGE: + /* + * Zero-out the page, stop immediately if a + * resched has been requested. + */ + while (i < PAGE_SIZE) { + if (lwkt_check_resched(curthread)) + break; + if (idlezero_nocache == 1) + bzeront(&pg[i], IDLEZERO_RUN); + else + bzero(&pg[i], IDLEZERO_RUN); + i += IDLEZERO_RUN; + } + if (i == PAGE_SIZE) /* otherwise resume at offset i on a later pass */ + state = STATE_RELEASE_PAGE; + break; + case STATE_RELEASE_PAGE: + if (try_mplock()) { /* on failure stay in this state and retry */ + lwbuf_free(buf); + vm_page_flag_set(m, PG_ZERO); + vm_page_free_toq(m); + state = STATE_GET_PAGE; + ++idlezero_count; + rel_mplock(); + } + break; + } + lwkt_switch(); + } +} + +static void +pagezero_start(void __unused *arg) +{ + int error; + struct thread *td; + + error = kthread_create(vm_pagezero, NULL, &td, "pagezero"); + if (error) + panic("pagezero_start: error %d\n", error); +} + +SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL); -- 2.41.0