From 00a3fdca11527a9e70b79270e03d979267907ca4 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 6 Feb 2010 08:57:05 -0800
Subject: [PATCH] kernel - SWAP CACHE part 12/many - Add swapcache cleanup state

* Add a small state machine and hysteresis to flip between swapcache
  writing and swapcache cleaning.  The swapcache is written to until
  (unless) it hits 75% use.  If this occurs it switches to cleaning
  mode and gets rid of swapcache pages until use drops back down to
  70%, then it flips back.  Burst accumulation still occurs while in
  cleaning mode.

  Currently the cleaning mode tries to choose swap meta-blocks which
  are wholly swapped (have no VM pages), running linearly through the
  VM object list in order to try to clean contiguous areas of the
  swapcache.  The idea is to reduce fragmentation that would lead to
  excessive disk seeking.  At the same time the limited cleaning run
  (only 5% of the swap cache) should prevent any large-scale excessive
  deletion of the swapcache.

* Add a new VM object type, OBJT_MARKER, which may be used by
  iterators running through the vm_object_list.
---
 .../linux/i386/linprocfs/linprocfs_misc.c |   5 +-
 sys/emulation/linux/linux_misc.c          |   5 +-
 sys/vm/swap_pager.c                       |  75 ++++
 sys/vm/swap_pager.h                       |   2 +
 sys/vm/vm_meter.c                         |   7 +-
 sys/vm/vm_object.c                        |   4 +
 sys/vm/vm_object.h                        |   3 +-
 sys/vm/vm_swapcache.c                     | 321 ++++++++++++------
 8 files changed, 307 insertions(+), 115 deletions(-)

diff --git a/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c b/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
index 34b3c94b20..6be0e01e9d 100644
--- a/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
+++ b/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
@@ -121,9 +121,12 @@ linprocfs_domeminfo(struct proc *curp, struct proc *p, struct pfsnode *pfs,
 	swapused = swaptotal - swapfree;
 	memshared = 0;
 	for (object = TAILQ_FIRST(&vm_object_list); object != NULL;
-	    object = TAILQ_NEXT(object, object_list))
+	    object = TAILQ_NEXT(object, object_list)) {
+		if (object->type == OBJT_MARKER)
+			continue;
 		if (object->shadow_count > 1)
 			memshared += object->resident_page_count;
+	}
 	memshared *= PAGE_SIZE;
 	/*
 	 * We'd love to be able to write:
diff --git a/sys/emulation/linux/linux_misc.c b/sys/emulation/linux/linux_misc.c
index d282a65edf..5c34294133 100644
--- a/sys/emulation/linux/linux_misc.c
+++ b/sys/emulation/linux/linux_misc.c
@@ -147,9 +147,12 @@ sys_linux_sysinfo(struct linux_sysinfo_args *args)
 	get_mplock();
 	sysinfo.sharedram = 0;
 	for (object = TAILQ_FIRST(&vm_object_list); object != NULL;
-	    object = TAILQ_NEXT(object, object_list))
+	    object = TAILQ_NEXT(object, object_list)) {
+		if (object->type == OBJT_MARKER)
+			continue;
 		if (object->shadow_count > 1)
 			sysinfo.sharedram += object->resident_page_count;
+	}
 	sysinfo.sharedram *= PAGE_SIZE;
 	sysinfo.bufferram = 0;
 
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index d7092d4d5c..e7c0e2410c 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -210,6 +210,17 @@ rb_swblock_scancmp(struct swblock *swb, void *data)
 	return(0);
 }
 
+static
+int
+rb_swblock_condcmp(struct swblock *swb, void *data)
+{
+	struct swfreeinfo *info = data;
+
+	if (swb->swb_index < info->basei)
+		return(-1);
+	return(0);
+}
+
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
@@ -576,6 +587,70 @@ swap_pager_freespace_all(vm_object_t object)
 	crit_exit();
}
 
+/*
+ * This function conditionally frees swap cache swap starting at
+ * (*basei) in the object.  (count) swap blocks will be nominally freed.
+ * The actual number of blocks freed can be more or less than the
+ * requested number.
+ *
+ * This function nominally returns the number of blocks freed.  However,
+ * the actual number of blocks freed may be less than the returned value.
+ * If the function is unable to exhaust the object or if it is able to
+ * free (approximately) the requested number of blocks it returns
+ * a value n > count.
+ *
+ * If we exhaust the object we will return a value n <= count.
+ */
+static int swap_pager_condfree_callback(struct swblock *swap, void *data);
+
+int
+swap_pager_condfree(vm_object_t object, vm_size_t *basei, int count)
+{
+	struct swfreeinfo info;
+
+	info.object = object;
+	info.basei = *basei;	/* skip up to this page index */
+	info.begi = count;	/* max swap pages to destroy */
+	info.endi = count * 8;	/* max swblocks to scan */
+
+	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
+				swap_pager_condfree_callback, &info);
+	*basei = info.basei;
+	if (info.endi < 0 && info.begi <= count)
+		info.begi = count + 1;
+	return(count - (int)info.begi);
+}
+
+/*
+ * The idea is to free whole meta-blocks to avoid fragmenting
+ * the swap space or disk I/O.  We only do this if NO VM pages
+ * are present.
+ *
+ * We do not have to deal with clearing PG_SWAPPED in related VM
+ * pages because there are no related VM pages.
+ */
+static int
+swap_pager_condfree_callback(struct swblock *swap, void *data)
+{
+	struct swfreeinfo *info = data;
+	vm_object_t object = info->object;
+	int i;
+
+	for (i = 0; i < SWAP_META_PAGES; ++i) {
+		if (vm_page_lookup(object, swap->swb_index + i))
+			break;
+	}
+	info->basei = swap->swb_index + SWAP_META_PAGES;
+	if (i == SWAP_META_PAGES) {
+		info->begi -= swap->swb_count;
+		swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
+	}
+	--info->endi;
+	if ((int)info->begi < 0 || (int)info->endi < 0)
+		return(-1);
+	return(0);
+}
+
 /*
  * Called by vm_page_alloc() when a new VM page is inserted
  * into a VM object.  Checks whether swap has been assigned to
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 9900d6f687..3a995a094a 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -101,6 +101,8 @@ int swap_pager_swp_alloc (vm_object_t, int);
 void swap_pager_copy (vm_object_t, vm_object_t, vm_pindex_t, int);
 void swap_pager_freespace (vm_object_t, vm_pindex_t, vm_pindex_t);
 void swap_pager_freespace_all (vm_object_t);
+int swap_pager_condfree(vm_object_t, vm_size_t *, int);
+
 void swap_pager_page_inserted(vm_page_t);
 void swap_pager_swap_init (void);
 void swap_pager_newswap (void);
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 7a92f18178..3a436c4a79 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -94,6 +94,8 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
 	for (object = TAILQ_FIRST(&vm_object_list);
 	    object != NULL;
 	    object = TAILQ_NEXT(object,object_list)) {
+		if (object->type == OBJT_MARKER)
+			continue;
 		vm_object_clear_flag(object, OBJ_ACTIVE);
 	}
 
@@ -109,8 +111,11 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
 	    object != NULL;
 	    object = TAILQ_NEXT(object, object_list)) {
 		/*
-		 * devices, like /dev/mem, will badly skew our totals
+		 * devices, like /dev/mem, will badly skew our totals.
+		 * markers aren't real objects.
*/ + if (object->type == OBJT_MARKER) + continue; if (object->type == OBJT_DEVICE) continue; totalp->t_vm += object->size; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 1d38824a10..4ae7fb329e 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -1846,6 +1846,8 @@ DB_SHOW_COMMAND(vmochk, vm_object_check) for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { + if (object->type == OBJT_MARKER) + continue; if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { @@ -1946,6 +1948,8 @@ DB_SHOW_COMMAND(vmopag, vm_object_print_pages) int rcount; vm_page_t m; + if (object->type == OBJT_MARKER) + continue; db_printf("new object: %p\n", (void *)object); if ( nl > 18) { c = cngetc(); diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 06540905d2..e5deda15a7 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -115,7 +115,8 @@ enum obj_type { OBJT_VNODE, /* object backed by file pages (vnode) */ OBJT_DEVICE, /* object backed by device pages */ OBJT_PHYS, /* object backed by physical pages */ - OBJT_DEAD /* dead object */ + OBJT_DEAD, /* dead object */ + OBJT_MARKER /* marker object */ }; typedef u_char objtype_t; diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c index d603fb8157..39f2d1470a 100644 --- a/sys/vm/vm_swapcache.c +++ b/sys/vm/vm_swapcache.c @@ -80,6 +80,8 @@ /* the kernel process "vm_pageout"*/ static void vm_swapcached (void); static void vm_swapcached_flush (vm_page_t m); +static void vm_swapcache_writing(vm_page_t marker); +static void vm_swapcache_cleaning(vm_object_t marker); struct thread *swapcached_thread; static struct kproc_desc swpc_kp = { @@ -126,27 +128,31 @@ SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count, static void vm_swapcached(void) { - struct vm_page marker; - vm_object_t object; - struct vnode *vp; - vm_page_t m; - int count; + enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING; + struct vm_page page_marker; + struct vm_object object_marker; /* * Thread setup */ curthread->td_flags |= TDF_SYSTHREAD; + crit_enter(); /* - * Initialize our marker + * Initialize our marker for the inactive scan (SWAPC_WRITING) */ - bzero(&marker, sizeof(marker)); - marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; - marker.queue = PQ_INACTIVE; - marker.wire_count = 1; + bzero(&page_marker, sizeof(page_marker)); + page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; + page_marker.queue = PQ_INACTIVE; + page_marker.wire_count = 1; + TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq); - crit_enter(); - TAILQ_INSERT_HEAD(INACTIVE_LIST, &marker, pageq); + /* + * Initialize our marker for the vm_object scan (SWAPC_CLEANING) + */ + bzero(&object_marker, sizeof(object_marker)); + object_marker.type = OBJT_MARKER; + TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list); for (;;) { /* @@ -159,124 +165,142 @@ vm_swapcached(void) } /* - * Polling rate when enabled is 10 hz. Deal with write - * bandwidth limits. - * - * We don't want to nickle-and-dime the scan as that will - * create unnecessary fragmentation. + * Polling rate when enabled is 10 hz. */ tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10); - vm_swapcache_curburst += vm_swapcache_accrate / 10; - if (vm_swapcache_curburst > vm_swapcache_maxburst) - vm_swapcache_curburst = vm_swapcache_maxburst; - if (vm_swapcache_curburst < vm_swapcache_accrate) - continue; /* - * Don't load any more into the cache once we have exceeded - * 3/4 of available swap space. 
XXX need to start cleaning
-		 * it out, though vnode recycling will accomplish that to
-		 * some degree.
+		 * State hysteresis.  Generate write activity up to 75% of
+		 * swap, then clean out swap assignments down to 70%, then
+		 * repeat.
 		 */
-		if (vm_swap_cache_use > vm_swap_max * 3 / 4)
-			continue;
+		if (state == SWAPC_WRITING) {
+			if (vm_swap_cache_use > (int64_t)vm_swap_max * 75 / 100)
+				state = SWAPC_CLEANING;
+		} else {
+			if (vm_swap_cache_use < (int64_t)vm_swap_max * 70 / 100)
+				state = SWAPC_WRITING;
+		}
 
 		/*
-		 * Calculate the number of pages to test.  We don't want
-		 * to get into a cpu-bound loop.
+		 * We are allowed to continue accumulating burst value
+		 * in either state.
 		 */
-		count = vmstats.v_inactive_count;
-		if (count > vm_swapcache_maxlaunder)
-			count = vm_swapcache_maxlaunder;
+		vm_swapcache_curburst += vm_swapcache_accrate / 10;
+		if (vm_swapcache_curburst > vm_swapcache_maxburst)
+			vm_swapcache_curburst = vm_swapcache_maxburst;
 
 		/*
-		 * Scan the inactive queue from our marker to locate
-		 * suitable pages to push to the swap cache.
-		 *
-		 * We are looking for clean vnode-backed pages.
-		 *
-		 * NOTE: PG_SWAPPED pages in particular are not part of
-		 *	 our count because once the cache stabilizes we
-		 *	 can end up with a very high datarate of VM pages
-		 *	 cycling from it.
+		 * We don't want to nickel-and-dime the scan as that will
+		 * create unnecessary fragmentation.  The minimum burst
+		 * is one second's worth of accumulation.
 		 */
-		m = &marker;
-		while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
-			if (m->flags & (PG_MARKER | PG_SWAPPED)) {
-				++count;
-				continue;
-			}
-			if (vm_swapcache_curburst < 0)
-				break;
-			if (m->flags & (PG_BUSY | PG_UNMANAGED))
-				continue;
-			if (m->busy || m->hold_count || m->wire_count)
-				continue;
-			if (m->valid != VM_PAGE_BITS_ALL)
-				continue;
-			if (m->dirty & m->valid)
-				continue;
-			if ((object = m->object) == NULL)
-				continue;
-			if (object->type != OBJT_VNODE ||
-			    (object->flags & OBJ_DEAD)) {
-				continue;
-			}
-			vm_page_test_dirty(m);
-			if (m->dirty & m->valid)
-				continue;
-			vp = object->handle;
-			if (vp == NULL)
+		if (state == SWAPC_WRITING) {
+			if (vm_swapcache_curburst >= vm_swapcache_accrate)
+				vm_swapcache_writing(&page_marker);
+		} else {
+			vm_swapcache_cleaning(&object_marker);
+		}
+	}
+	TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
+	TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
+	crit_exit();
+}
+
+static void
+vm_swapcache_writing(vm_page_t marker)
+{
+	vm_object_t object;
+	struct vnode *vp;
+	vm_page_t m;
+	int count;
+
+	/*
+	 * Scan the inactive queue from our marker to locate
+	 * suitable pages to push to the swap cache.
+	 *
+	 * We are looking for clean vnode-backed pages.
+	 *
+	 * NOTE: PG_SWAPPED pages in particular are not part of
+	 *	 our count because once the cache stabilizes we
+	 *	 can end up with a very high datarate of VM pages
+	 *	 cycling from it.
+ */ + m = marker; + count = vm_swapcache_maxlaunder; + + while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) { + if (m->flags & (PG_MARKER | PG_SWAPPED)) { + ++count; + continue; + } + if (vm_swapcache_curburst < 0) + break; + if (m->flags & (PG_BUSY | PG_UNMANAGED)) + continue; + if (m->busy || m->hold_count || m->wire_count) + continue; + if (m->valid != VM_PAGE_BITS_ALL) + continue; + if (m->dirty & m->valid) + continue; + if ((object = m->object) == NULL) + continue; + if (object->type != OBJT_VNODE || + (object->flags & OBJ_DEAD)) { + continue; + } + vm_page_test_dirty(m); + if (m->dirty & m->valid) + continue; + vp = object->handle; + if (vp == NULL) + continue; + switch(vp->v_type) { + case VREG: + if (vm_swapcache_data_enable == 0) continue; - switch(vp->v_type) { - case VREG: - if (vm_swapcache_data_enable == 0) - continue; - break; - case VCHR: - if (vm_swapcache_meta_enable == 0) - continue; - break; - default: + break; + case VCHR: + if (vm_swapcache_meta_enable == 0) continue; - } - - /* - * Ok, move the marker and soft-busy the page. - */ - TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); - TAILQ_INSERT_AFTER(INACTIVE_LIST, m, &marker, pageq); - - /* - * Assign swap and initiate I/O - */ - vm_swapcached_flush(m); - - /* - * Setup for next loop using marker. - */ - m = ▮ + break; + default: + continue; } /* - * Cleanup marker position. If we hit the end of the - * list the marker is placed at the tail. Newly deactivated - * pages will be placed after it. - * - * Earlier inactive pages that were dirty and become clean - * are typically moved to the end of PQ_INACTIVE by virtue - * of vfs_vmio_release() when they become unwired from the - * buffer cache. + * Ok, move the marker and soft-busy the page. */ - TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); - if (m) - TAILQ_INSERT_BEFORE(m, &marker, pageq); - else - TAILQ_INSERT_TAIL(INACTIVE_LIST, &marker, pageq); + TAILQ_REMOVE(INACTIVE_LIST, marker, pageq); + TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq); + /* + * Assign swap and initiate I/O + */ + vm_swapcached_flush(m); + + /* + * Setup for next loop using marker. + */ + m = marker; } - TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); - crit_exit(); + + /* + * Cleanup marker position. If we hit the end of the + * list the marker is placed at the tail. Newly deactivated + * pages will be placed after it. + * + * Earlier inactive pages that were dirty and become clean + * are typically moved to the end of PQ_INACTIVE by virtue + * of vfs_vmio_release() when they become unwired from the + * buffer cache. + */ + TAILQ_REMOVE(INACTIVE_LIST, marker, pageq); + if (m) + TAILQ_INSERT_BEFORE(m, marker, pageq); + else + TAILQ_INSERT_TAIL(INACTIVE_LIST, marker, pageq); } /* @@ -303,3 +327,78 @@ vm_swapcached_flush(vm_page_t m) vm_page_io_finish(m); } } + +static +void +vm_swapcache_cleaning(vm_object_t marker) +{ + vm_object_t object; + struct vnode *vp; + int count; + int n; + + object = marker; + count = vm_swapcache_maxlaunder; + + /* + * Look for vnode objects + */ + while ((object = TAILQ_NEXT(object, object_list)) != NULL && count--) { + if (object->type != OBJT_VNODE) + continue; + if ((object->flags & OBJ_DEAD) || object->swblock_count == 0) + continue; + if ((vp = object->handle) == NULL) + continue; + if (vp->v_type != VREG && vp->v_type != VCHR) + continue; + + /* + * Adjust iterator. 
+ */ + if (marker->backing_object != object) + marker->size = 0; + + /* + * Move the marker so we can work on the VM object + */ + TAILQ_REMOVE(&vm_object_list, marker, object_list); + TAILQ_INSERT_AFTER(&vm_object_list, object, + marker, object_list); + + /* + * Look for swblocks starting at our iterator. + * + * The swap_pager_condfree() function attempts to free + * swap space starting at the specified index. The index + * will be updated on return. The function will return + * a scan factor (NOT the number of blocks freed). + * + * If it must cut its scan of the object short due to an + * excessive number of swblocks, or is able to free the + * requested number of blocks, it will return n >= count + * and we break and pick it back up on a future attempt. + */ + n = swap_pager_condfree(object, &marker->size, count); + count -= n; + if (count < 0) + break; + + /* + * Setup for loop. + */ + marker->size = 0; + object = marker; + } + + /* + * Adjust marker so we continue the scan from where we left off. + * When we reach the end we start back at the beginning. + */ + TAILQ_REMOVE(&vm_object_list, marker, object_list); + if (object) + TAILQ_INSERT_BEFORE(object, marker, object_list); + else + TAILQ_INSERT_HEAD(&vm_object_list, marker, object_list); + marker->backing_object = object; +} -- 2.41.0