From 3038a8caa1068d2a58b982b5033ccd015d6c8a07 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 10 Nov 2011 16:32:10 -0800
Subject: [PATCH] kernel - Performance improvements during heavy memory/IO use

* Remove the vm.vm_load logic, it was making things worse more than it
  was fixing them.

* Fix a bug in the pageout algorithm that was causing the PQ_ACTIVE
  queue to drain excessively, messing up the LRU/activity algorithm.

* Rip out hammer_limit_running_io and instead just call
  waitrunningbufspace().

* Change the waitrunningbufspace() logic to add a bit of hysteresis
  and to fairly block everyone doing write I/O, otherwise some threads
  may be blocked while others are allowed to proceed as the buf_daemon
  is trying to flush things out.
---
 sys/kern/kern_clock.c           |  5 ---
 sys/kern/vfs_bio.c              | 33 ++++++------------
 sys/sys/thread.h                |  2 +-
 sys/vfs/hammer/hammer.h         |  3 +-
 sys/vfs/hammer/hammer_flusher.c |  4 +--
 sys/vfs/hammer/hammer_io.c      | 10 +-----
 sys/vfs/hammer/hammer_vfsops.c  | 17 ----------
 sys/vm/vm_extern.h              |  1 -
 sys/vm/vm_fault.c               | 59 ---------------------------------
 sys/vm/vm_pageout.c             | 53 ++++++++++-------------------
 sys/vm/vm_pageout.h             |  3 --
 11 files changed, 33 insertions(+), 157 deletions(-)

diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 63e995128a..dab7b6d8a4 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -533,11 +533,6 @@ hardclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
 	 */
 	cpu_sfence();
 	basetime_index = ni;
-
-	/*
-	 * Figure out how badly the system is starved for memory
-	 */
-	vm_fault_ratecheck();
 }
 
 /*
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 2f6552e159..20598f3118 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -267,7 +267,7 @@ runningbufwakeup(struct buf *bp)
 	/*
 	 * see waitrunningbufspace() for limit test.
 	 */
-	limit = hirunningspace * 4 / 6;
+	limit = hirunningspace * 3 / 6;
 	if (runningbufreq && runningbufspace <= limit) {
 		runningbufreq = 0;
 		spin_unlock(&bufcspin);
@@ -305,38 +305,27 @@ bufcountwakeup(void)
 /*
  * waitrunningbufspace()
  *
- *	Wait for the amount of running I/O to drop to hirunningspace * 4 / 6.
- *	This is the point where write bursting stops so we don't want to wait
- *	for the running amount to drop below it (at least if we still want bioq
- *	to burst writes).
+ *	If runningbufspace exceeds 4/6 hirunningspace we block until
+ *	runningbufspace drops to 3/6 hirunningspace.  We also block if another
+ *	thread blocked here in order to be fair, even if runningbufspace
+ *	is now lower than the limit.
 *
 *	The caller may be using this function to block in a tight loop, we
- *	must block while runningbufspace is greater then or equal to
- *	hirunningspace * 4 / 6.
- *
- *	And even with that it may not be enough, due to the presence of
- *	B_LOCKED dirty buffers, so also wait for at least one running buffer
- *	to complete.
+ *	must block while runningbufspace is greater than at least
+ *	hirunningspace * 3 / 6.
 */
 void
 waitrunningbufspace(void)
 {
 	int limit = hirunningspace * 4 / 6;
-	int dummy;
 
-	spin_lock(&bufcspin);
-	if (runningbufspace > limit) {
-		while (runningbufspace > limit) {
-			++runningbufreq;
+	if (runningbufspace > limit || runningbufreq) {
+		spin_lock(&bufcspin);
+		while (runningbufspace > limit || runningbufreq) {
+			runningbufreq = 1;
 			ssleep(&runningbufreq, &bufcspin, 0, "wdrn1", 0);
 		}
 		spin_unlock(&bufcspin);
-	} else if (runningbufspace > limit / 2) {
-		++runningbufreq;
-		spin_unlock(&bufcspin);
-		tsleep(&dummy, 0, "wdrn2", 1);
-	} else {
-		spin_unlock(&bufcspin);
-	}
 }
diff --git a/sys/sys/thread.h b/sys/sys/thread.h
index 7d22ac4cdf..d8555a6bf7 100644
--- a/sys/sys/thread.h
+++ b/sys/sys/thread.h
@@ -275,7 +275,7 @@ struct thread {
 	int		td_fairq_load;		/* fairq */
 	int		td_fairq_count;		/* fairq */
 	struct globaldata *td_migrate_gd;	/* target gd for thread migration */
-	const void	*td_mplock_stallpc;	/* last mplock stall address */
+	const void	*unused01;
 #ifdef DEBUG_CRIT_SECTIONS
 #define CRIT_DEBUG_ARRAY_SIZE	32
 #define CRIT_DEBUG_ARRAY_MASK	(CRIT_DEBUG_ARRAY_SIZE - 1)
diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index ed43d8a271..1bad1b12da 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -913,7 +913,7 @@ struct hammer_mount {
 	struct hammer_mod_rb_tree lose_root;	/* loose buffers */
 	int	locked_dirty_space;		/* meta/volu count */
 	int	io_running_space;		/* io_token */
-	int	io_running_wakeup;		/* io_token */
+	int	unused01;
 	int	objid_cache_count;
 	int	dedup_cache_count;
 	int	error;				/* critical I/O error */
@@ -1045,7 +1045,6 @@ extern int hammer_count_io_running_read;
 extern int hammer_count_io_running_write;
 extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
-extern int hammer_limit_running_io;
 extern int hammer_limit_recs;
 extern int hammer_limit_inode_recs;
 extern int hammer_limit_reclaims;
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 21c3101a55..9298b77f51 100644
--- a/sys/vfs/hammer/hammer_flusher.c
+++ b/sys/vfs/hammer/hammer_flusher.c
@@ -207,13 +207,13 @@ hammer_flusher_create(hammer_mount_t hmp)
 	TAILQ_INIT(&hmp->flusher.ready_list);
 
 	lwkt_create(hammer_flusher_master_thread, hmp,
-		    &hmp->flusher.td, NULL, TDF_SYSTHREAD, -1, "hammer-M");
+		    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
 	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
 		info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
 		info->hmp = hmp;
 		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
 		lwkt_create(hammer_flusher_slave_thread, info,
-			    &info->td, NULL, TDF_SYSTHREAD, -1, "hammer-S%d", i);
+			    &info->td, NULL, 0, -1, "hammer-S%d", i);
 	}
 }
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index de02111852..be5f0c3a37 100644
--- a/sys/vfs/hammer/hammer_io.c
+++ b/sys/vfs/hammer/hammer_io.c
@@ -1124,11 +1124,6 @@ hammer_io_complete(struct buf *bp)
 		hammer_stats_disk_write += iou->io.bytes;
 		atomic_add_int(&hammer_count_io_running_write, -iou->io.bytes);
 		atomic_add_int(&hmp->io_running_space, -iou->io.bytes);
-		if (hmp->io_running_wakeup &&
-		    hmp->io_running_space < hammer_limit_running_io / 2) {
-			hmp->io_running_wakeup = 0;
-			wakeup(&hmp->io_running_wakeup);
-		}
 		KKASSERT(hmp->io_running_space >= 0);
 		iou->io.running = 0;
@@ -1999,8 +1994,5 @@ hammer_io_flush_sync(hammer_mount_t hmp)
 void
 hammer_io_limit_backlog(hammer_mount_t hmp)
 {
-	while (hmp->io_running_space > hammer_limit_running_io) {
-		hmp->io_running_wakeup = 1;
-		tsleep(&hmp->io_running_wakeup, 0, "hmiolm", hz / 10);
-	}
+	waitrunningbufspace();
 }
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 4dfb470b22..ccd0ff37b9 100644
--- a/sys/vfs/hammer/hammer_vfsops.c
+++ b/sys/vfs/hammer/hammer_vfsops.c
@@ -101,7 +101,6 @@ int hammer_count_io_running_read;
 int hammer_count_io_running_write;
 int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;		/* per-mount */
-int hammer_limit_running_io;		/* per-mount */
 int hammer_limit_recs;			/* as a whole XXX */
 int hammer_limit_inode_recs = 2048;	/* per inode */
 int hammer_limit_reclaims;
@@ -166,8 +165,6 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, tdmux_ticks, CTLFLAG_RW,
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
 	   &hammer_limit_dirtybufspace, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_running_io, CTLFLAG_RW,
-	   &hammer_limit_running_io, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
 	   &hammer_limit_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
@@ -358,20 +355,6 @@ hammer_vfs_init(struct vfsconf *conf)
 		hammer_limit_dirtybufspace = 100;
 	}
 
-	/*
-	 * Set reasonable limits to maintain an I/O pipeline.  This is
-	 * used by the flush code which explicitly initiates I/O, and
-	 * is per-mount.
-	 *
-	 * The system-driven buffer cache uses vfs.lorunningspace and
-	 * vfs.hirunningspace globally.
-	 */
-	if (hammer_limit_running_io == 0)
-		hammer_limit_running_io = hammer_limit_dirtybufspace;
-
-	if (hammer_limit_running_io > 10 * 1024 * 1024)
-		hammer_limit_running_io = 10 * 1024 * 1024;
-
 	/*
 	 * The hammer_inode structure detaches from the vnode on reclaim.
 	 * This limits the number of inodes in this state to prevent a
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index a9b9b4332f..1664ea7b4b 100644
--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@@ -97,7 +97,6 @@ void vm_fault_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
 void vm_fault_unwire (vm_map_t, vm_map_entry_t);
 int vm_fault_wire (vm_map_t, vm_map_entry_t, boolean_t);
 void vm_fork (struct proc *, struct proc *, int);
-void vm_fault_ratecheck(void);
 int vm_test_nominal (void);
 void vm_wait_nominal (void);
 void vm_init_limits(struct proc *);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d591effd9a..dc618a2443 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -115,7 +115,6 @@ struct faultstate {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	int lookup_still_valid;
-	int didlimit;
 	int hardfault;
 	int fault_flags;
 	int map_generation;
@@ -131,7 +130,6 @@ static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
 #if 0
 static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
 #endif
-static int vm_fault_ratelimit(struct vmspace *);
 static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
 static void vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry,
 			int prot);
@@ -256,7 +254,6 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
 
 	mycpu->gd_cnt.v_vm_faults++;
 
-	fs.didlimit = 0;
 	fs.hardfault = 0;
 	fs.fault_flags = fault_flags;
 	growstack = 1;
@@ -528,7 +525,6 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 
 	mycpu->gd_cnt.v_vm_faults++;
 
-	fs.didlimit = 0;
 	fs.hardfault = 0;
 	fs.fault_flags = fault_flags;
 	KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
@@ -737,7 +733,6 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
 	entry.maptype = VM_MAPTYPE_NORMAL;
 	entry.protection = entry.max_protection = fault_type;
 
-	fs.didlimit = 0;
 	fs.hardfault = 0;
 	fs.fault_flags = fault_flags;
 	fs.map = NULL;
@@ -1137,26 +1132,6 @@ vm_fault_object(struct faultstate *fs,
 			return (KERN_PROTECTION_FAILURE);
 		}
 
-		/*
-		 * Ratelimit.
-		 */
-		if (fs->didlimit == 0 && curproc != NULL) {
-			int limticks;
-
-			limticks = vm_fault_ratelimit(curproc->p_vmspace);
-			if (limticks) {
-				vm_object_pip_wakeup(fs->first_object);
-				vm_object_chain_release_all(
-					fs->first_object, fs->object);
-				if (fs->object != fs->first_object)
-					vm_object_drop(fs->object);
-				unlock_and_deallocate(fs);
-				tsleep(curproc, 0, "vmrate", limticks);
-				fs->didlimit = 1;
-				return (KERN_TRY_AGAIN);
-			}
-		}
-
 		/*
 		 * Allocate a new page for this object/offset pair.
 		 *
@@ -1797,40 +1772,6 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
 	lwkt_reltoken(&map->token);
 }
 
-/*
- * Reduce the rate at which memory is allocated to a process based
- * on the perceived load on the VM system.  As the load increases
- * the allocation burst rate goes down and the delay increases.
- *
- * Rate limiting does not apply when faulting active or inactive
- * pages.  When faulting 'cache' pages, rate limiting only applies
- * if the system currently has a severe page deficit.
- *
- * XXX vm_pagesupply should be increased when a page is freed.
- *
- * We sleep up to 1/10 of a second.
- */
-static int
-vm_fault_ratelimit(struct vmspace *vmspace)
-{
-	if (vm_load_enable == 0)
-		return(0);
-	if (vmspace->vm_pagesupply > 0) {
-		--vmspace->vm_pagesupply;	/* SMP race ok */
-		return(0);
-	}
-#ifdef INVARIANTS
-	if (vm_load_debug) {
-		kprintf("load %-4d give %d pgs, wait %d, pid %-5d (%s)\n",
-			vm_load,
-			(1000 - vm_load ) / 10, vm_load * hz / 10000,
-			curproc->p_pid, curproc->p_comm);
-	}
-#endif
-	vmspace->vm_pagesupply = (1000 - vm_load) / 10;
-	return(vm_load * hz / 10000);
-}
-
 /*
  * Copy all of the pages from a wired-down map entry to another.
 *
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 6178152aa0..328fd2ebd3 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -183,18 +183,6 @@ static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD,
 	&pageout_lock_miss, 0, "vget() lock misses during pageout");
 
-int vm_load;
-SYSCTL_INT(_vm, OID_AUTO, vm_load,
-	CTLFLAG_RD, &vm_load, 0, "load on the VM system");
-int vm_load_enable = 1;
-SYSCTL_INT(_vm, OID_AUTO, vm_load_enable,
-	CTLFLAG_RW, &vm_load_enable, 0, "enable vm_load rate limiting");
-#ifdef INVARIANTS
-int vm_load_debug;
-SYSCTL_INT(_vm, OID_AUTO, vm_load_debug,
-	CTLFLAG_RW, &vm_load_debug, 0, "debug vm_load");
-#endif
-
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -208,24 +196,6 @@ static void vm_req_vmdaemon (void);
 #endif
 static void vm_pageout_page_stats(int q);
 
-/*
- * Update vm_load to slow down faulting processes.
- *
- * SMP races ok.
- * No requirements.
- */
-void
-vm_fault_ratecheck(void)
-{
-	if (vm_pages_needed) {
-		if (vm_load < 1000)
-			++vm_load;
-	} else {
-		if (vm_load > 0)
-			--vm_load;
-	}
-}
-
 /*
  * vm_pageout_clean:
 *
@@ -1914,12 +1884,23 @@ vm_pageout_thread(void)
 		active_shortage = vmstats.v_inactive_target -
 				  vmstats.v_inactive_count;
 
-		tmp = inactive_shortage;
-		if (tmp < vmstats.v_inactive_target / 10)
-			tmp = vmstats.v_inactive_target / 10;
-		inactive_shortage -= delta1;
-		if (inactive_shortage <= 0 && active_shortage > tmp * 2)
-			active_shortage = tmp * 2;
+		/*
+		 * If we were unable to free sufficient inactive pages to
+		 * satisfy the free/cache queue requirements then simply
+		 * reaching the inactive target may not be good enough.
+		 * Try to deactivate pages in excess of the target based
+		 * on the shortfall.
+		 *
+		 * However, to prevent thrashing the VM system, do not
+		 * deactivate more than an additional 1/10 of the inactive
+		 * target's worth of active pages.
+		 */
+		if (delta1 < inactive_shortage) {
+			tmp = (inactive_shortage - delta1) * 2;
+			if (tmp > vmstats.v_inactive_target / 10)
+				tmp = vmstats.v_inactive_target / 10;
+			active_shortage += tmp;
+		}
 
 		delta2 = 0;
 		for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index cc662ef140..1b878b99ac 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -84,9 +84,6 @@ extern int vm_page_max_wired;
 extern int vm_pages_needed;	/* should be some "event" structure */
 extern int vm_pageout_pages_needed;
 extern int vm_pageout_deficit;
-extern int vm_load;
-extern int vm_load_enable;
-extern int vm_load_debug;
 
 #define VM_PAGEOUT_ASYNC 0
 #define VM_PAGEOUT_SYNC 1
-- 
2.41.0
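
The reworked waitrunningbufspace()/runningbufwakeup() pair above is a
high/low watermark with hysteresis plus a fairness flag: writers begin
blocking once in-flight write I/O exceeds 4/6 of hirunningspace,
completions release them only after it drains to 3/6, and once any
writer is waiting, all writers wait behind it so the buf_daemon can
catch up.  What follows is a minimal userspace sketch of that pattern,
not the kernel code itself; the names (running_space, hi_space,
want_wakeup) are illustrative assumptions, and a pthread mutex and
condition variable stand in for the bufcspin spinlock and
ssleep()/wakeup().

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv   = PTHREAD_COND_INITIALIZER;

static long running_space;		/* write I/O in flight (bytes) */
static const long hi_space = 6000;	/* plays the role of hirunningspace */
static int  want_wakeup;		/* plays the role of runningbufreq */

/*
 * Block while in-flight I/O exceeds 4/6 of the limit, or while any
 * earlier thread is already blocked here.  Setting the flag inside the
 * loop is what makes later arrivals queue fairly behind the first
 * waiter instead of sneaking past it.
 */
static void
wait_running_space(void)
{
	long enter_limit = hi_space * 4 / 6;

	pthread_mutex_lock(&lock);
	while (running_space > enter_limit || want_wakeup) {
		want_wakeup = 1;
		pthread_cond_wait(&cv, &lock);
	}
	pthread_mutex_unlock(&lock);
}

/*
 * Completion side: waiters are released only once usage has fallen to
 * 3/6 of the limit.  The gap between the 4/6 entry threshold and the
 * 3/6 release threshold is the hysteresis; it keeps writers from
 * oscillating across a single threshold.
 */
static void
running_space_done(long bytes)
{
	pthread_mutex_lock(&lock);
	running_space -= bytes;
	if (want_wakeup && running_space <= hi_space * 3 / 6) {
		want_wakeup = 0;
		pthread_cond_broadcast(&cv);
	}
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	running_space = 5000;		/* above the 4000 entry threshold */
	running_space_done(1500);	/* 3500: still above 3000, no release */
	running_space_done(1000);	/* 2500: at or below 3000, release */
	wait_running_space();		/* returns immediately */
	printf("running_space = %ld\n", running_space);
	return 0;
}

With vfs.hirunningspace set to 6 MB, for example, writers stall once
4 MB of write I/O is in flight and resume only after completions drain
it to 3 MB; that gap is the "bit of hysteresis" the commit message
mentions, and the want_wakeup flag is what makes the blocking fair.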
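
The vm_pageout.c hunk is the PQ_ACTIVE fix from the second bullet.  The
old code derived a deactivation amount with a floor of 1/10 of the
inactive target even when the inactive scan had already freed
everything it wanted, so the active queue could drain over time and
destroy LRU ordering.  Below, the new arithmetic is pulled out into a
pure function for illustration; extra_active_shortage is a hypothetical
name, delta1 stands for the pages the inactive scan actually freed, and
inactive_target mirrors vmstats.v_inactive_target.  This is a sketch of
the arithmetic, not kernel code.

#include <stdio.h>

/*
 * Extra active-queue deactivation: twice the inactive scan's shortfall,
 * clamped to 1/10 of the inactive target so PQ_ACTIVE cannot be
 * drained wholesale.  Returns 0 when the scan met its goal.
 */
static int
extra_active_shortage(int inactive_shortage, int delta1, int inactive_target)
{
	int tmp = 0;

	if (delta1 < inactive_shortage) {
		tmp = (inactive_shortage - delta1) * 2;
		if (tmp > inactive_target / 10)
			tmp = inactive_target / 10;
	}
	return tmp;
}

int
main(void)
{
	/* Freed 100 of 1000 wanted: shortfall doubled is 1800, clamped to 500. */
	printf("%d\n", extra_active_shortage(1000, 100, 5000));
	/* Met the goal: no extra deactivation, PQ_ACTIVE is left alone. */
	printf("%d\n", extra_active_shortage(1000, 1000, 5000));
	return 0;
}

The behavioral change is that when the scan meets its goal no extra
active pages are deactivated at all, whereas the old code's floor could
keep deactivating active pages even with no remaining shortfall; under
a real shortfall the extra work is still capped at a tenth of the
inactive target.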