kernel - SWAP CACHE part 12/many - Add swapcache cleanup state
author		Matthew Dillon <dillon@apollo.backplane.com>
		Sat, 6 Feb 2010 16:57:05 +0000 (08:57 -0800)
committer	Matthew Dillon <dillon@apollo.backplane.com>
		Sat, 6 Feb 2010 16:57:05 +0000 (08:57 -0800)
* Add a small state machine with hysteresis to flip between swapcache
  writing and swapcache cleaning.  The swapcache is written to until
  it reaches 75% of available swap.  At that point it switches to
  cleaning mode and removes swapcache pages until usage drops to 70%,
  then flips back to writing.  Burst accumulation continues even while
  in cleaning mode.

  Currently the cleaning mode tries to choose swap meta-blocks which
  are wholly swapped (have no resident VM pages), running linearly
  through the VM object list so that contiguous areas of the swapcache
  are cleaned together.  The idea is to reduce fragmentation that
  would lead to excessive disk seeking.  At the same time the limited
  cleaning run (only 5% of the swap cache) should prevent large-scale
  excessive deletion of the swapcache.
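
  As an illustration only, a minimal sketch of the state flip as it is
  implemented in vm_swapcached() below.  The variables vm_swap_cache_use
  and vm_swap_max are the ones the patch uses; swapc_next_state() is a
  hypothetical helper that exists only for this sketch:

	enum swapc_state { SWAPC_WRITING, SWAPC_CLEANING };

	static enum swapc_state
	swapc_next_state(enum swapc_state state, int64_t use, int64_t max)
	{
		/* writing -> cleaning once the swapcache exceeds 75% of swap */
		if (state == SWAPC_WRITING && use > max * 75 / 100)
			return (SWAPC_CLEANING);
		/* cleaning -> writing once usage falls back below 70% */
		if (state == SWAPC_CLEANING && use < max * 70 / 100)
			return (SWAPC_WRITING);
		/* otherwise stay put; the 70-75% gap is the hysteresis */
		return (state);
	}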

* Add a new VM object type, OBJT_MARKER, which may be used by iterators
  running through the vm_object_list.
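
  For illustration only, a rough sketch of the marker idiom (the real
  user is vm_swapcache_cleaning() in this commit; example_object_scan()
  is a hypothetical name, and the caller is assumed to already hold
  whatever serialization the list requires, e.g. a critical section):

	static void
	example_object_scan(void)
	{
		struct vm_object marker;
		vm_object_t object;

		bzero(&marker, sizeof(marker));
		marker.type = OBJT_MARKER;
		TAILQ_INSERT_HEAD(&vm_object_list, &marker, object_list);

		while ((object = TAILQ_NEXT(&marker, object_list)) != NULL) {
			/*
			 * Move the marker past the object so the scan can
			 * be resumed here even if we block and the list
			 * changes underneath us.
			 */
			TAILQ_REMOVE(&vm_object_list, &marker, object_list);
			TAILQ_INSERT_AFTER(&vm_object_list, object,
					   &marker, object_list);
			if (object->type == OBJT_MARKER)
				continue;	/* skip other iterators' markers */
			/* ... examine the object here ... */
		}
		TAILQ_REMOVE(&vm_object_list, &marker, object_list);
	}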

sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
sys/emulation/linux/linux_misc.c
sys/vm/swap_pager.c
sys/vm/swap_pager.h
sys/vm/vm_meter.c
sys/vm/vm_object.c
sys/vm/vm_object.h
sys/vm/vm_swapcache.c

diff --git a/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c b/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
index 34b3c94..6be0e01 100644
--- a/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
+++ b/sys/emulation/linux/i386/linprocfs/linprocfs_misc.c
@@ -121,9 +121,12 @@ linprocfs_domeminfo(struct proc *curp, struct proc *p, struct pfsnode *pfs,
        swapused = swaptotal - swapfree;
        memshared = 0;
        for (object = TAILQ_FIRST(&vm_object_list); object != NULL;
-           object = TAILQ_NEXT(object, object_list))
+           object = TAILQ_NEXT(object, object_list)) {
+               if (object->type == OBJT_MARKER)
+                       continue;
                if (object->shadow_count > 1)
                        memshared += object->resident_page_count;
+       }
        memshared *= PAGE_SIZE;
        /*
         * We'd love to be able to write:
diff --git a/sys/emulation/linux/linux_misc.c b/sys/emulation/linux/linux_misc.c
index d282a65..5c34294 100644
--- a/sys/emulation/linux/linux_misc.c
+++ b/sys/emulation/linux/linux_misc.c
@@ -147,9 +147,12 @@ sys_linux_sysinfo(struct linux_sysinfo_args *args)
        get_mplock();
        sysinfo.sharedram = 0;
        for (object = TAILQ_FIRST(&vm_object_list); object != NULL;
-            object = TAILQ_NEXT(object, object_list))
+            object = TAILQ_NEXT(object, object_list)) {
+               if (object->type == OBJT_MARKER)
+                       continue;
                if (object->shadow_count > 1)
                        sysinfo.sharedram += object->resident_page_count;
+       }
 
        sysinfo.sharedram *= PAGE_SIZE;
        sysinfo.bufferram = 0;
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index d7092d4..e7c0e24 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -210,6 +210,17 @@ rb_swblock_scancmp(struct swblock *swb, void *data)
        return(0);
 }
 
+static
+int
+rb_swblock_condcmp(struct swblock *swb, void *data)
+{
+       struct swfreeinfo *info = data;
+
+       if (swb->swb_index < info->basei)
+               return(-1);
+       return(0);
+}
+
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
@@ -576,6 +587,70 @@ swap_pager_freespace_all(vm_object_t object)
        crit_exit();
 }
 
+/*
+ * This function conditionally frees swap cache swap starting at
+ * (*basei) in the object.  (count) swap blocks will be nominally freed.
+ * The actual number of blocks freed can be more or less than the
+ * requested number.
+ *
+ * This function nominally returns the number of blocks freed.  However,
+ * the actual number of blocks freed may be less than the returned value.
+ * If the function is unable to exhaust the object or if it is able to
+ * free (approximately) the requested number of blocks it returns
+ * a value n > count.
+ *
+ * If we exhaust the object we will return a value n <= count.
+ */
+static int swap_pager_condfree_callback(struct swblock *swap, void *data);
+
+int
+swap_pager_condfree(vm_object_t object, vm_size_t *basei, int count)
+{
+       struct swfreeinfo info;
+
+       info.object = object;
+       info.basei = *basei;    /* skip up to this page index */
+       info.begi = count;      /* max swap pages to destroy */
+       info.endi = count * 8;  /* max swblocks to scan */
+
+       swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
+                               swap_pager_condfree_callback, &info);
+       *basei = info.basei;
+       if (info.endi < 0 && info.begi <= count)
+               info.begi = count + 1;
+       return(count - (int)info.begi);
+}
+
+/*
+ * The idea is to free the whole meta-block to avoid fragmenting
+ * the swap space or disk I/O.  We only do this if NO VM pages
+ * are present.
+ *
+ * We do not have to deal with clearing PG_SWAPPED in related VM
+ * pages because there are no related VM pages.
+ */
+static int
+swap_pager_condfree_callback(struct swblock *swap, void *data)
+{
+       struct swfreeinfo *info = data;
+       vm_object_t object = info->object;
+       int i;
+
+       for (i = 0; i < SWAP_META_PAGES; ++i) {
+               if (vm_page_lookup(object, swap->swb_index + i))
+                       break;
+       }
+       info->basei = swap->swb_index + SWAP_META_PAGES;
+       if (i == SWAP_META_PAGES) {
+               info->begi -= swap->swb_count;
+               swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
+       }
+       --info->endi;
+       if ((int)info->begi < 0 || (int)info->endi < 0)
+               return(-1);
+       return(0);
+}
+
 /*
  * Called by vm_page_alloc() when a new VM page is inserted
  * into a VM object.  Checks whether swap has been assigned to
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 9900d6f..3a995a0 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -101,6 +101,8 @@ int swap_pager_swp_alloc (vm_object_t, int);
 void swap_pager_copy (vm_object_t, vm_object_t, vm_pindex_t, int);
 void swap_pager_freespace (vm_object_t, vm_pindex_t, vm_pindex_t);
 void swap_pager_freespace_all (vm_object_t);
+int swap_pager_condfree(vm_object_t, vm_size_t *, int);
+
 void swap_pager_page_inserted(vm_page_t);
 void swap_pager_swap_init (void);
 void swap_pager_newswap (void);
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 7a92f18..3a436c4 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -94,6 +94,8 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
        for (object = TAILQ_FIRST(&vm_object_list);
            object != NULL;
            object = TAILQ_NEXT(object,object_list)) {
+               if (object->type == OBJT_MARKER)
+                       continue;
                vm_object_clear_flag(object, OBJ_ACTIVE);
        }
 
@@ -109,8 +111,11 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
            object != NULL;
            object = TAILQ_NEXT(object, object_list)) {
                /*
-                * devices, like /dev/mem, will badly skew our totals
+                * devices, like /dev/mem, will badly skew our totals.
+                * Markers aren't real objects.
                 */
+               if (object->type == OBJT_MARKER)
+                       continue;
                if (object->type == OBJT_DEVICE)
                        continue;
                totalp->t_vm += object->size;
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1d38824..4ae7fb3 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1846,6 +1846,8 @@ DB_SHOW_COMMAND(vmochk, vm_object_check)
        for (object = TAILQ_FIRST(&vm_object_list);
                        object != NULL;
                        object = TAILQ_NEXT(object, object_list)) {
+               if (object->type == OBJT_MARKER)
+                       continue;
                if (object->handle == NULL &&
                    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
                        if (object->ref_count == 0) {
@@ -1946,6 +1948,8 @@ DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
                int rcount;
                vm_page_t m;
 
+               if (object->type == OBJT_MARKER)
+                       continue;
                db_printf("new object: %p\n", (void *)object);
                if ( nl > 18) {
                        c = cngetc();
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 0654090..e5deda1 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -115,7 +115,8 @@ enum obj_type {
        OBJT_VNODE,     /* object backed by file pages (vnode) */
        OBJT_DEVICE,    /* object backed by device pages */
        OBJT_PHYS,      /* object backed by physical pages */
-       OBJT_DEAD       /* dead object */
+       OBJT_DEAD,      /* dead object */
+       OBJT_MARKER     /* marker object */
 };
 typedef u_char objtype_t;
 
diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c
index d603fb8..39f2d14 100644
--- a/sys/vm/vm_swapcache.c
+++ b/sys/vm/vm_swapcache.c
@@ -80,6 +80,8 @@
 /* the kernel process "vm_pageout"*/
 static void vm_swapcached (void);
 static void vm_swapcached_flush (vm_page_t m);
+static void vm_swapcache_writing(vm_page_t marker);
+static void vm_swapcache_cleaning(vm_object_t marker);
 struct thread *swapcached_thread;
 
 static struct kproc_desc swpc_kp = {
@@ -126,27 +128,31 @@ SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
 static void
 vm_swapcached(void)
 {
-       struct vm_page marker;
-       vm_object_t object;
-       struct vnode *vp;
-       vm_page_t m;
-       int count;
+       enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
+       struct vm_page page_marker;
+       struct vm_object object_marker;
 
        /*
         * Thread setup
         */
        curthread->td_flags |= TDF_SYSTHREAD;
+       crit_enter();
 
        /*
-        * Initialize our marker
+        * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
-       bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
-       marker.queue = PQ_INACTIVE;
-       marker.wire_count = 1;
+       bzero(&page_marker, sizeof(page_marker));
+       page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       page_marker.queue = PQ_INACTIVE;
+       page_marker.wire_count = 1;
+       TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq);
 
-       crit_enter();
-       TAILQ_INSERT_HEAD(INACTIVE_LIST, &marker, pageq);
+       /*
+        * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
+        */
+       bzero(&object_marker, sizeof(object_marker));
+       object_marker.type = OBJT_MARKER;
+       TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
 
        for (;;) {
                /*
@@ -159,124 +165,142 @@ vm_swapcached(void)
                }
 
                /*
-                * Polling rate when enabled is 10 hz.  Deal with write
-                * bandwidth limits.
-                *
-                * We don't want to nickle-and-dime the scan as that will
-                * create unnecessary fragmentation.
+                * Polling rate when enabled is 10 hz.
                 */
                tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
-               vm_swapcache_curburst += vm_swapcache_accrate / 10;
-               if (vm_swapcache_curburst > vm_swapcache_maxburst)
-                       vm_swapcache_curburst = vm_swapcache_maxburst;
-               if (vm_swapcache_curburst < vm_swapcache_accrate)
-                       continue;
 
                /*
-                * Don't load any more into the cache once we have exceeded
-                * 3/4 of available swap space.  XXX need to start cleaning
-                * it out, though vnode recycling will accomplish that to
-                * some degree.
+                * State hysteresis.  Generate write activity up to 75% of
+                * swap, then clean out swap assignments down to 70%, then
+                * repeat.
                 */
-               if (vm_swap_cache_use > vm_swap_max * 3 / 4)
-                       continue;
+               if (state == SWAPC_WRITING) {
+                       if (vm_swap_cache_use > (int64_t)vm_swap_max * 75 / 100)
+                               state = SWAPC_CLEANING;
+               } else {
+                       if (vm_swap_cache_use < (int64_t)vm_swap_max * 70 / 100)
+                               state = SWAPC_WRITING;
+               }
 
                /*
-                * Calculate the number of pages to test.  We don't want
-                * to get into a cpu-bound loop.
+                * We are allowed to continue accumulating burst value
+                * in either state.
                 */
-               count = vmstats.v_inactive_count;
-               if (count > vm_swapcache_maxlaunder)
-                       count = vm_swapcache_maxlaunder;
+               vm_swapcache_curburst += vm_swapcache_accrate / 10;
+               if (vm_swapcache_curburst > vm_swapcache_maxburst)
+                       vm_swapcache_curburst = vm_swapcache_maxburst;
 
                /*
-                * Scan the inactive queue from our marker to locate
-                * suitable pages to push to the swap cache.
-                *
-                * We are looking for clean vnode-backed pages.
-                *
-                * NOTE: PG_SWAPPED pages in particular are not part of
-                *       our count because once the cache stabilizes we
-                *       can end up with a very high datarate of VM pages
-                *       cycling from it.
+                * We don't want to nickel-and-dime the scan as that will
+                * create unnecessary fragmentation.  The minimum burst
+                * is one second's worth of accumulation.
                 */
-               m = &marker;
-               while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
-                       if (m->flags & (PG_MARKER | PG_SWAPPED)) {
-                               ++count;
-                               continue;
-                       }
-                       if (vm_swapcache_curburst < 0)
-                               break;
-                       if (m->flags & (PG_BUSY | PG_UNMANAGED))
-                               continue;
-                       if (m->busy || m->hold_count || m->wire_count)
-                               continue;
-                       if (m->valid != VM_PAGE_BITS_ALL)
-                               continue;
-                       if (m->dirty & m->valid)
-                               continue;
-                       if ((object = m->object) == NULL)
-                               continue;
-                       if (object->type != OBJT_VNODE ||
-                           (object->flags & OBJ_DEAD)) {
-                               continue;
-                       }
-                       vm_page_test_dirty(m);
-                       if (m->dirty & m->valid)
-                               continue;
-                       vp = object->handle;
-                       if (vp == NULL)
+               if (state == SWAPC_WRITING) {
+                       if (vm_swapcache_curburst >= vm_swapcache_accrate)
+                               vm_swapcache_writing(&page_marker);
+               } else {
+                       vm_swapcache_cleaning(&object_marker);
+               }
+       }
+       TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
+       TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
+       crit_exit();
+}
+
+static void
+vm_swapcache_writing(vm_page_t marker)
+{
+       vm_object_t object;
+       struct vnode *vp;
+       vm_page_t m;
+       int count;
+
+       /*
+        * Scan the inactive queue from our marker to locate
+        * suitable pages to push to the swap cache.
+        *
+        * We are looking for clean vnode-backed pages.
+        *
+        * NOTE: PG_SWAPPED pages in particular are not part of
+        *       our count because once the cache stabilizes we
+        *       can end up with a very high datarate of VM pages
+        *       cycling from it.
+        */
+       m = marker;
+       count = vm_swapcache_maxlaunder;
+
+       while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
+               if (m->flags & (PG_MARKER | PG_SWAPPED)) {
+                       ++count;
+                       continue;
+               }
+               if (vm_swapcache_curburst < 0)
+                       break;
+               if (m->flags & (PG_BUSY | PG_UNMANAGED))
+                       continue;
+               if (m->busy || m->hold_count || m->wire_count)
+                       continue;
+               if (m->valid != VM_PAGE_BITS_ALL)
+                       continue;
+               if (m->dirty & m->valid)
+                       continue;
+               if ((object = m->object) == NULL)
+                       continue;
+               if (object->type != OBJT_VNODE ||
+                   (object->flags & OBJ_DEAD)) {
+                       continue;
+               }
+               vm_page_test_dirty(m);
+               if (m->dirty & m->valid)
+                       continue;
+               vp = object->handle;
+               if (vp == NULL)
+                       continue;
+               switch(vp->v_type) {
+               case VREG:
+                       if (vm_swapcache_data_enable == 0)
                                continue;
-                       switch(vp->v_type) {
-                       case VREG:
-                               if (vm_swapcache_data_enable == 0)
-                                       continue;
-                               break;
-                       case VCHR:
-                               if (vm_swapcache_meta_enable == 0)
-                                       continue;
-                               break;
-                       default:
+                       break;
+               case VCHR:
+                       if (vm_swapcache_meta_enable == 0)
                                continue;
-                       }
-
-                       /*
-                        * Ok, move the marker and soft-busy the page.
-                        */
-                       TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
-                       TAILQ_INSERT_AFTER(INACTIVE_LIST, m, &marker, pageq);
-
-                       /*
-                        * Assign swap and initiate I/O
-                        */
-                       vm_swapcached_flush(m);
-
-                       /*
-                        * Setup for next loop using marker.
-                        */
-                       m = &marker;
+                       break;
+               default:
+                       continue;
                }
 
                /*
-                * Cleanup marker position.  If we hit the end of the
-                * list the marker is placed at the tail.  Newly deactivated
-                * pages will be placed after it.
-                *
-                * Earlier inactive pages that were dirty and become clean
-                * are typically moved to the end of PQ_INACTIVE by virtue
-                * of vfs_vmio_release() when they become unwired from the
-                * buffer cache.
+                * Ok, move the marker and soft-busy the page.
                 */
-               TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
-               if (m)
-                       TAILQ_INSERT_BEFORE(m, &marker, pageq);
-               else
-                       TAILQ_INSERT_TAIL(INACTIVE_LIST, &marker, pageq);
+               TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
+               TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq);
 
+               /*
+                * Assign swap and initiate I/O
+                */
+               vm_swapcached_flush(m);
+
+               /*
+                * Setup for next loop using marker.
+                */
+               m = marker;
        }
-       TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
-       crit_exit();
+
+       /*
+        * Cleanup marker position.  If we hit the end of the
+        * list the marker is placed at the tail.  Newly deactivated
+        * pages will be placed after it.
+        *
+        * Earlier inactive pages that were dirty and become clean
+        * are typically moved to the end of PQ_INACTIVE by virtue
+        * of vfs_vmio_release() when they become unwired from the
+        * buffer cache.
+        */
+       TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
+       if (m)
+               TAILQ_INSERT_BEFORE(m, marker, pageq);
+       else
+               TAILQ_INSERT_TAIL(INACTIVE_LIST, marker, pageq);
 }
 
 /*
@@ -303,3 +327,78 @@ vm_swapcached_flush(vm_page_t m)
                vm_page_io_finish(m);
        }
 }
+
+static
+void
+vm_swapcache_cleaning(vm_object_t marker)
+{
+       vm_object_t object;
+       struct vnode *vp;
+       int count;
+       int n;
+
+       object = marker;
+       count = vm_swapcache_maxlaunder;
+
+       /*
+        * Look for vnode objects
+        */
+       while ((object = TAILQ_NEXT(object, object_list)) != NULL && count--) {
+               if (object->type != OBJT_VNODE)
+                       continue;
+               if ((object->flags & OBJ_DEAD) || object->swblock_count == 0)
+                       continue;
+               if ((vp = object->handle) == NULL)
+                       continue;
+               if (vp->v_type != VREG && vp->v_type != VCHR)
+                       continue;
+
+               /*
+                * Adjust iterator.
+                */
+               if (marker->backing_object != object)
+                       marker->size = 0;
+
+               /*
+                * Move the marker so we can work on the VM object
+                */
+               TAILQ_REMOVE(&vm_object_list, marker, object_list);
+               TAILQ_INSERT_AFTER(&vm_object_list, object,
+                                  marker, object_list);
+
+               /*
+                * Look for swblocks starting at our iterator.
+                *
+                * The swap_pager_condfree() function attempts to free
+                * swap space starting at the specified index.  The index
+                * will be updated on return.  The function will return
+                * a scan factor (NOT the number of blocks freed).
+                *
+                * If it must cut its scan of the object short due to an
+                * excessive number of swblocks, or is able to free the
+                * requested number of blocks, it will return n >= count
+                * and we break and pick it back up on a future attempt.
+                */
+               n = swap_pager_condfree(object, &marker->size, count);
+               count -= n;
+               if (count < 0)
+                       break;
+
+               /*
+                * Setup for loop.
+                */
+               marker->size = 0;
+               object = marker;
+       }
+
+       /*
+        * Adjust marker so we continue the scan from where we left off.
+        * When we reach the end we start back at the beginning.
+        */
+       TAILQ_REMOVE(&vm_object_list, marker, object_list);
+       if (object)
+               TAILQ_INSERT_BEFORE(object, marker, object_list);
+       else
+               TAILQ_INSERT_HEAD(&vm_object_list, marker, object_list);
+       marker->backing_object = object;
+}