From 51c99c6153dfaddf8f0a27f9f536caaa28c681a4 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 10 Nov 2011 19:31:24 -0800
Subject: [PATCH] kernel - Fix numerous performance problems with the pageout
 daemon

* The VM page queues were not being fully utilized, causing the pageout
  daemon to calculate incorrect average page counts for
  deactivation/freeing.  This caused the pageout daemon to dig into the
  active queue even when it did not need to.

* The pageout daemon was incorrectly calculating the maxscan value for
  each queue.  It was using the aggregate count (across all 256 queues)
  instead of the per-queue count, resulting in long stalls when memory
  is low.

* Clean up the PQ_L2* knobs, constants, and other cruft, reducing them
  to the essentials for our goals.

Reported-by: vsrinivas, thesjg, luxh, etc
---
 sys/conf/options             |   1 -
 sys/config/LINT              |   3 -
 sys/config/LINT64            |   3 -
 sys/emulation/linux/Makefile |   2 +-
 sys/vfs/nfs/Makefile         |   2 +-
 sys/vfs/nwfs/Makefile        |   2 +-
 sys/vfs/smbfs/Makefile       |   2 +-
 sys/vm/vm_page.h             |  53 +++++------------
 sys/vm/vm_pageout.c          | 107 ++++++++++++++++++++---------
 sys/vm/vm_swap.c             |   2 +-
 sys/vm/vm_swapcache.c        |  10 ++--
 11 files changed, 83 insertions(+), 104 deletions(-)

diff --git a/sys/conf/options b/sys/conf/options
index da7b1544e9..ec26b96f06 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -488,7 +488,6 @@ DEBUG_PCTRACK	opt_pctrack.h
 
 # These are VM related options
 NO_SWAPPING	opt_vm.h
-PQ_CACHESIZE	opt_vmpage.h
 
 # Standard SMP options
 SMP	opt_global.h
diff --git a/sys/config/LINT b/sys/config/LINT
index 879ba94f84..955387c774 100644
--- a/sys/config/LINT
+++ b/sys/config/LINT
@@ -90,9 +90,6 @@ options 	DFLDSIZ="(256*1024*1024)"
 #
 options 	BLKDEV_IOSIZE=8192
 
-# Options for the VM subsystem.
-options 	PQ_CACHESIZE=512	# color for 512k/16k cache
-
 # This allows you to actually store this configuration file into
 # the kernel binary itself, where it may be later read by saying:
 #    strings -n 3 /kernel | sed -n 's/^___//p' > MYKERNEL
diff --git a/sys/config/LINT64 b/sys/config/LINT64
index 0e59d11cfe..d9594dc1d5 100644
--- a/sys/config/LINT64
+++ b/sys/config/LINT64
@@ -90,9 +90,6 @@ options 	DFLDSIZ="(256*1024*1024)"
 #
 options 	BLKDEV_IOSIZE=8192
 
-# Options for the VM subsystem.
-options 	PQ_CACHESIZE=512	# color for 512k/16k cache
-
 # This allows you to actually store this configuration file into
 # the kernel binary itself, where it may be later read by saying:
 #    strings -n 3 /kernel | sed -n 's/^___//p' > MYKERNEL
diff --git a/sys/emulation/linux/Makefile b/sys/emulation/linux/Makefile
index 759037cc4f..5668acba2e 100644
--- a/sys/emulation/linux/Makefile
+++ b/sys/emulation/linux/Makefile
@@ -10,7 +10,7 @@ SRCS=	linux_dummy.c linux_emuldata.c linux_epoll.c \
 	linux_machdep.c linux_mib.c linux_misc.c linux_time.c linux_signal.c \
 	linux_socket.c \
 	linux_stats.c linux_sysctl.c linux_sysent.c linux_sysvec.c \
-	linux_util.c opt_compat.h opt_global.h opt_vmpage.h
+	linux_util.c opt_compat.h opt_global.h
 SRCS+=	bus_if.h device_if.h
 SRCS+=	opt_nfs.h assym.s
 OBJS=	linux_support.o linux_locore.o
diff --git a/sys/vfs/nfs/Makefile b/sys/vfs/nfs/Makefile
index f3e8e4921c..59d5eecf71 100644
--- a/sys/vfs/nfs/Makefile
+++ b/sys/vfs/nfs/Makefile
@@ -5,7 +5,7 @@ KMOD=	nfs
 SRCS=	nfs_bio.c nfs_node.c nfs_kerb.c nfs_serv.c nfs_socket.c \
 	nfs_srvcache.c nfs_subs.c nfs_syscalls.c nfs_vfsops.c nfs_iod.c \
 	nfsm_subs.c nfs_vnops.c \
-	opt_inet.h opt_nfs.h opt_vmpage.h opt_bootp.h opt_nfsroot.h
+	opt_inet.h opt_nfs.h opt_bootp.h opt_nfsroot.h
 
 # 0/1 - requires INET to be configured in kernel
 #
diff --git a/sys/vfs/nwfs/Makefile b/sys/vfs/nwfs/Makefile
index cbfd80e1d7..4c0fa1c79d 100644
--- a/sys/vfs/nwfs/Makefile
+++ b/sys/vfs/nwfs/Makefile
@@ -4,7 +4,7 @@ KMOD=	nwfs
 
 SRCS=	nwfs_node.c nwfs_ioctl.c nwfs_io.c nwfs_vfsops.c nwfs_vnops.c \
-	nwfs_subr.c opt_ncp.h opt_nwfs.h opt_vmpage.h
+	nwfs_subr.c opt_ncp.h opt_nwfs.h
 
 .if defined(VNPRINT)
 CFLAGS+= -DVNPRINT
diff --git a/sys/vfs/smbfs/Makefile b/sys/vfs/smbfs/Makefile
index 238db0ebde..2378301dd3 100644
--- a/sys/vfs/smbfs/Makefile
+++ b/sys/vfs/smbfs/Makefile
@@ -9,7 +9,7 @@ KMOD=	smbfs
 
 SRCS=	opt_inet.h opt_ipx.h \
-	opt_netsmb.h opt_vmpage.h \
+	opt_netsmb.h \
 	iconv_converter_if.h \
 	md4c.c \
 	smb_conn.c smb_dev.c smb_trantcp.c smb_smb.c smb_subr.c smb_rq.c \
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index c7e9c7321d..ebafa88c4f 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -71,10 +71,6 @@
 #ifndef _VM_VM_PAGE_H_
 #define _VM_VM_PAGE_H_
 
-#if !defined(KLD_MODULE) && defined(_KERNEL)
-#include "opt_vmpage.h"
-#endif
-
 #ifndef _SYS_TYPES_H_
 #include <sys/types.h>
 #endif
@@ -201,69 +197,46 @@ typedef struct vm_page *vm_page_t;
 #endif
 
 /*
- * Page coloring parameters.  We default to a middle of the road optimization.
- * Larger selections would not really hurt us but if a machine does not have
- * a lot of memory it could cause vm_page_alloc() to eat more cpu cycles
- * looking for free pages.
+ * Page coloring parameters.  We use generous parameters designed to
+ * statistically spread pages over available cpu cache space.  This has
+ * become less important over time as cache associativity has grown
+ * on modern cpus, but we still use the core algorithm to help reduce
+ * lock contention between cpus.
 *
- * Page coloring cannot be disabled.  Modules do not have access to most PQ
- * constants because they can change between builds.
+ * Page coloring cannot be disabled.
 */
-#if defined(_KERNEL) && !defined(KLD_MODULE)
-
-#if !defined(PQ_CACHESIZE)
-#define PQ_CACHESIZE 256	/* max is 1024 (MB) */
-#endif
 
-#if PQ_CACHESIZE >= 1024
 #define PQ_PRIME1 31	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 23	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 256	/* A number of colors opt for 1M cache */
 
-#elif PQ_CACHESIZE >= 512
+#if 0
 #define PQ_PRIME1 31	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 23	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 128	/* A number of colors opt for 512K cache */
 
-#elif PQ_CACHESIZE >= 256
 #define PQ_PRIME1 13	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 7	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 64	/* A number of colors opt for 256K cache */
 
-#elif PQ_CACHESIZE >= 128
 #define PQ_PRIME1 9	/* Produces a good PQ_L2_SIZE/3 + PQ_PRIME1 */
 #define PQ_PRIME2 5	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 32	/* A number of colors opt for 128k cache */
 
-#else
 #define PQ_PRIME1 5	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 3	/* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 16	/* A reasonable number of colors (opt for 64K cache) */
-
 #endif
 
 #define PQ_L2_MASK	(PQ_L2_SIZE - 1)
 
-#endif /* KERNEL && !KLD_MODULE */
-
-/*
- *
- * The queue array is always based on PQ_MAXL2_SIZE regardless of the actual
- * cache size chosen in order to present a uniform interface for modules.
- */
-#define PQ_MAXL2_SIZE	256	/* fixed maximum (in pages) / module compat */
-
-#if PQ_L2_SIZE > PQ_MAXL2_SIZE
-#error "Illegal PQ_L2_SIZE"
-#endif
-
 #define PQ_NONE		0
-#define PQ_FREE		(1 + 0*PQ_MAXL2_SIZE)
-#define PQ_INACTIVE	(1 + 1*PQ_MAXL2_SIZE)
-#define PQ_ACTIVE	(1 + 2*PQ_MAXL2_SIZE)
-#define PQ_CACHE	(1 + 3*PQ_MAXL2_SIZE)
-#define PQ_HOLD		(1 + 4*PQ_MAXL2_SIZE)
-#define PQ_COUNT	(1 + 5*PQ_MAXL2_SIZE)
+#define PQ_FREE		(1 + 0*PQ_L2_SIZE)
+#define PQ_INACTIVE	(1 + 1*PQ_L2_SIZE)
+#define PQ_ACTIVE	(1 + 2*PQ_L2_SIZE)
+#define PQ_CACHE	(1 + 3*PQ_L2_SIZE)
+#define PQ_HOLD		(1 + 4*PQ_L2_SIZE)
+#define PQ_COUNT	(1 + 5*PQ_L2_SIZE)
 
 /*
  * Scan support
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 328fd2ebd3..ba1922d4d0 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -119,7 +119,6 @@ static struct kproc_desc vm_kp = {
 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
 #endif
 
-
 int vm_pages_needed=0;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit=0;	/* Estimated number of pages deficit */
 int vm_pageout_pages_needed=0;	/* flag saying that the pageout daemon needs pages */
@@ -196,6 +195,15 @@ static void vm_req_vmdaemon (void);
 #endif
 static void vm_pageout_page_stats(int q);
 
+static __inline int
+PQAVERAGE(int n)
+{
+	if (n >= 0)
+		return((n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1);
+	else
+		return((n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1);
+}
+
 /*
  * vm_pageout_clean:
  *
@@ -718,7 +726,7 @@ struct vm_pageout_scan_info {
 static int vm_pageout_scan_callback(struct proc *p, void *data);
 
 static int
-vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
+vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
			 int *vnodes_skippedp)
 {
 	vm_page_t m;
@@ -771,11 +779,11 @@ vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
 	vm_page_queues_spin_lock(PQ_INACTIVE + q);
 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl,
 			  &marker, pageq);
-	maxscan = vmstats.v_inactive_count;
+	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
 	vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 
 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
-	       maxscan-- > 0 && inactive_shortage - delta > 0)
+	       maxscan-- > 0 && avail_shortage - delta > 0)
 	{
 		vm_page_and_queue_spin_lock(m);
 		if (m != TAILQ_NEXT(&marker, pageq)) {
@@ -1129,20 +1137,19 @@ vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
 	vm_page_queues_spin_lock(PQ_INACTIVE + q);
 	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
 	vm_page_queues_spin_unlock(PQ_INACTIVE + q);
-
 	return (delta);
 }
 
 static int
 vm_pageout_scan_active(int pass, int q,
-		       int inactive_shortage, int active_shortage,
+		       int avail_shortage, int inactive_shortage,
 		       int *recycle_countp)
 {
 	struct vm_page marker;
 	vm_page_t m;
 	int actcount;
 	int delta = 0;
-	int pcount;
+	int maxscan;
 
 	/*
 	 * We want to move pages from the active queue to the inactive
@@ -1173,17 +1180,17 @@ vm_pageout_scan_active(int pass, int q,
 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
 			  &marker, pageq);
+	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
-	pcount = vmstats.v_active_count;
 
 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
-	       pcount-- > 0 && (inactive_shortage - delta > 0 ||
-				active_shortage > 0))
+	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
+				 inactive_shortage > 0))
 	{
 		vm_page_and_queue_spin_lock(m);
 		if (m != TAILQ_NEXT(&marker, pageq)) {
 			vm_page_and_queue_spin_unlock(m);
-			++pcount;
+			++maxscan;
 			continue;
 		}
 		KKASSERT(m->queue - m->pc == PQ_ACTIVE);
@@ -1285,15 +1292,14 @@ vm_pageout_scan_active(int pass, int q,
 				 * inactive scan, that could lead to
 				 * gigabytes being moved.
 				 */
-				--active_shortage;
-				if (inactive_shortage - delta > 0 ||
+				--inactive_shortage;
+				if (avail_shortage - delta > 0 ||
 				    m->object->ref_count == 0) {
-					if (inactive_shortage - delta > 0)
+					if (avail_shortage - delta > 0)
 						++*recycle_countp;
 					vm_page_protect(m, VM_PROT_NONE);
 					if (m->dirty == 0 &&
-					    inactive_shortage - delta > 0) {
-						++delta;
+					    avail_shortage - delta > 0) {
 						vm_page_cache(m);
 					} else {
 						vm_page_deactivate(m);
@@ -1303,6 +1309,7 @@ vm_pageout_scan_active(int pass, int q,
 					vm_page_deactivate(m);
 					vm_page_wakeup(m);
 				}
+				++delta;
 			} else {
 				vm_page_and_queue_spin_lock(m);
 				if (m->queue - m->pc == PQ_ACTIVE) {
@@ -1359,8 +1366,7 @@ vm_pageout_scan_active(int pass, int q,
  * pages_freed counter.
  */
 static void
-vm_pageout_scan_cache(int inactive_shortage,
-		      int vnodes_skipped, int recycle_count)
+vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count)
 {
 	struct vm_pageout_scan_info info;
 	vm_page_t m;
@@ -1456,7 +1462,7 @@ vm_pageout_scan_cache(int inactive_shortage,
 	 * enough pages to meet bare minimum needs.  This test only
	 * works if the inactive queue is bloated.
 	 *
-	 * - due to a positive inactive_shortage we shifted the remaining
+	 * - due to a positive avail_shortage we shifted the remaining
 	 *   dirty pages from the active queue to the inactive queue
 	 *   trying to find clean ones to free.
 	 */
@@ -1464,7 +1470,7 @@ vm_pageout_scan_cache(int inactive_shortage,
 		kprintf("Warning: system low on memory+swap!\n");
 	if (swap_pager_full && vm_page_count_min(recycle_count) &&
 	    vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
-	    inactive_shortage > 0) {
+	    avail_shortage > 0) {
 		/*
 		 * Kill something.
 		 */
@@ -1554,11 +1560,11 @@ vm_pageout_page_stats(int q)
 	if (page_shortage <= 0)
 		return;
 
-	pcount = vmstats.v_active_count;
+	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
 	fullintervalcount += vm_pageout_stats_interval;
 	if (fullintervalcount < vm_pageout_full_stats_interval) {
-		tpcount = (vm_pageout_stats_max * vmstats.v_active_count) /
-			  vmstats.v_page_count;
+		tpcount = (vm_pageout_stats_max * pcount) /
+			  vmstats.v_page_count + 1;
 		if (pcount > tpcount)
 			pcount = tpcount;
 	} else {
@@ -1823,8 +1829,8 @@ vm_pageout_thread(void)
 	int error;
 	int delta1;
 	int delta2;
+	int avail_shortage;
 	int inactive_shortage;
-	int active_shortage;
 	int vnodes_skipped = 0;
 	int recycle_count = 0;
 	int tmp;
@@ -1841,7 +1847,7 @@ vm_pageout_thread(void)
 		if (error &&
 		    vm_paging_needed() == 0 &&
 		    vm_pages_needed == 0) {
-			for (q = 0; q < PQ_MAXL2_SIZE; ++q)
+			for (q = 0; q < PQ_L2_SIZE; ++q)
 				vm_pageout_page_stats(q);
 			continue;
 		}
@@ -1863,16 +1869,20 @@ vm_pageout_thread(void)
 		 * want to get to.  This is higher then the number that causes
 		 * allocations to stall (severe) in order to provide hysteresis,
 		 * and if we don't make it all the way but get to the minimum
-		 * we're happy.
+		 * we're happy.  Goose it a bit if there are multiple
+		 * requests for memory.
 		 */
-		inactive_shortage = vm_paging_target() + vm_pageout_deficit;
+		avail_shortage = vm_paging_target() + vm_pageout_deficit;
 		vm_pageout_deficit = 0;
 
 		delta1 = 0;
-		for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
-			delta1 += vm_pageout_scan_inactive(
-					pass, q,
-					inactive_shortage / PQ_MAXL2_SIZE + 1,
-					&vnodes_skipped);
+		if (avail_shortage > 0) {
+			for (q = 0; q < PQ_L2_SIZE; ++q) {
+				delta1 += vm_pageout_scan_inactive(
+						pass, q,
+						PQAVERAGE(avail_shortage),
+						&vnodes_skipped);
+			}
+			avail_shortage -= delta1;
 		}
 
@@ -1881,8 +1891,8 @@ vm_pageout_thread(void)
 		 * scan above we limit the number of active pages we
 		 * deactivate to reduce unnecessary work.
 		 */
-		active_shortage = vmstats.v_inactive_target -
-				  vmstats.v_inactive_count;
+		inactive_shortage = vmstats.v_inactive_target -
+				    vmstats.v_inactive_count;
 
 		/*
 		 * If we were unable to free sufficient inactive pages to
@@ -1895,20 +1905,24 @@ vm_pageout_thread(void)
 		 * deactivate more than an additional 1/10 the inactive
 		 * target's worth of active pages.
 		 */
-		if (delta1 < inactive_shortage) {
-			tmp = (inactive_shortage - delta1) * 2;
+		if (avail_shortage > 0) {
+			tmp = avail_shortage * 2;
 			if (tmp > vmstats.v_inactive_target / 10)
 				tmp = vmstats.v_inactive_target / 10;
-			active_shortage += tmp;
+			inactive_shortage += tmp;
 		}
 
-		delta2 = 0;
-		for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
-			delta2 += vm_pageout_scan_active(
-					pass, q,
-					inactive_shortage / PQ_MAXL2_SIZE + 1,
-					active_shortage / PQ_MAXL2_SIZE + 1,
-					&recycle_count);
+		if (avail_shortage > 0 || inactive_shortage > 0) {
+			delta2 = 0;
+			for (q = 0; q < PQ_L2_SIZE; ++q) {
+				delta2 += vm_pageout_scan_active(
+						pass, q,
+						PQAVERAGE(avail_shortage),
+						PQAVERAGE(inactive_shortage),
+						&recycle_count);
+			}
+			inactive_shortage -= delta2;
+			avail_shortage -= delta2;
 		}
 
 		/*
@@ -1916,14 +1930,13 @@ vm_pageout_thread(void)
 		 * requirement and take more drastic measures if we are
 		 * still in trouble.
 		 */
-		inactive_shortage -= delta2;
-		vm_pageout_scan_cache(inactive_shortage, vnodes_skipped,
+		vm_pageout_scan_cache(avail_shortage, vnodes_skipped,
 				      recycle_count);
 
 		/*
 		 * Wait for more work.
 		 */
-		if (inactive_shortage > 0) {
+		if (avail_shortage > 0) {
 			++pass;
 			if (swap_pager_full) {
 				/*
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index 88f73a031c..6291df537a 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -459,7 +459,7 @@ swapoff_one(int index)
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
-	for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+	for (q = 0; q < PQ_L2_SIZE; ++q) {
 		bzero(&marker, sizeof(marker));
 		marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
 		marker.queue = PQ_ACTIVE + q;
diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c
index baf7865319..fe445994d6 100644
--- a/sys/vm/vm_swapcache.c
+++ b/sys/vm/vm_swapcache.c
@@ -159,7 +159,7 @@ vm_swapcached_thread(void)
 {
 	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
 	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
-	static struct vm_page page_marker[PQ_MAXL2_SIZE];
+	static struct vm_page page_marker[PQ_L2_SIZE];
 	static struct vm_object object_marker;
 	int q;
 
@@ -176,7 +176,7 @@ vm_swapcached_thread(void)
 	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
 	 */
 	bzero(&page_marker, sizeof(page_marker));
-	for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+	for (q = 0; q < PQ_L2_SIZE; ++q) {
 		page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
 		page_marker[q].queue = PQ_INACTIVE + q;
 		page_marker[q].pc = q;
@@ -254,7 +254,7 @@ vm_swapcached_thread(void)
 		if (state == SWAPC_WRITING) {
 			if (vm_swapcache_curburst >= vm_swapcache_accrate) {
 				if (burst == SWAPB_BURSTING) {
-					for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+					for (q = 0; q < PQ_L2_SIZE; ++q) {
 						vm_swapcache_writing(
							&page_marker[q]);
 					}
@@ -262,7 +262,7 @@ vm_swapcached_thread(void)
 					burst = SWAPB_RECOVERING;
 				} else if (vm_swapcache_curburst >
 					   vm_swapcache_minburst) {
-					for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+					for (q = 0; q < PQ_L2_SIZE; ++q) {
 						vm_swapcache_writing(
 							&page_marker[q]);
 					}
@@ -277,7 +277,7 @@ vm_swapcached_thread(void)
 		/*
 		 * Cleanup (NOT REACHED)
 		 */
-		for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+		for (q = 0; q < PQ_L2_SIZE; ++q) {
 			vm_page_queues_spin_lock(PQ_INACTIVE + q);
 			TAILQ_REMOVE(
 				&vm_page_queues[PQ_INACTIVE + q].pl,
-- 
2.41.0
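
[Editor's note, not part of the patch: the sketch below is a standalone
userland illustration of the two scan changes described in the commit
message.  PQAVERAGE() is copied verbatim from the patch; pqavg.c, the
lcnt value, and the shortage figure are hypothetical stand-ins for the
kernel's vm_page_queues[] and paging-target bookkeeping.]

/*
 * pqavg.c - standalone sketch, NOT part of the patch.  Shows why the
 * per-queue numbers matter: the old code bounded every queue scan by
 * the aggregate page count and handed every queue the full shortage,
 * while the new code bounds each scan by the queue's own length (lcnt)
 * and asks each queue for only ~1/PQ_L2_SIZE of the shortage.
 *
 * Build: cc -o pqavg pqavg.c && ./pqavg
 */
#include <stdio.h>

#define PQ_L2_SIZE	256		/* fixed number of page colors */

/* Copied from the patch: per-queue share, rounded away from zero. */
static inline int
PQAVERAGE(int n)
{
	if (n >= 0)
		return((n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1);
	else
		return((n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1);
}

int
main(void)
{
	int avail_shortage = 10000;	/* hypothetical global page shortage */
	int lcnt = 1500;		/* hypothetical length of one queue */
	int aggregate = lcnt * PQ_L2_SIZE; /* ~vmstats.v_inactive_count */

	/*
	 * Old behavior: maxscan was the aggregate count, so one queue's
	 * scan loop could keep spinning far past the queue's length.
	 */
	printf("old maxscan per queue: %d (queue holds only %d pages)\n",
	    aggregate, lcnt);

	/*
	 * New behavior: maxscan is the queue's own lcnt, and the +1/-1
	 * rounding in PQAVERAGE() guarantees the per-queue targets sum
	 * to at least the full shortage.
	 */
	printf("new maxscan per queue: %d, per-queue target: %d\n",
	    lcnt, PQAVERAGE(avail_shortage));
	printf("targets summed over all queues: %d (>= shortage %d)\n",
	    PQAVERAGE(avail_shortage) * PQ_L2_SIZE, avail_shortage);
	return 0;
}

With these made-up numbers the old code permitted up to 384000 page
inspections against a 1500-page queue before maxscan expired; the new
code caps the walk at 1500 pages and asks the queue to produce only 41
of them, which is what lets the 256 queues share the work without the
long stalls the commit message describes.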