kernel - Fix longstanding VM long-duration stall issues (2)
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 30 Oct 2010 05:46:29 +0000 (22:46 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 30 Oct 2010 05:51:28 +0000 (22:51 -0700)
* Refactor the pageout daemon's hysteresis, in particular the
  free page queue had no real hysteresis and could cause excessive
  pagedaemon wakeups.

  Use a calculation that maintains a free page queue about half the
  size of the minimum cache queue, giving us pretty good pipelining
  when a system is under constant memory pressure.

* Add a sysctl for monitoring ppwakeups (wakeups of processes waiting
  for memory).

sys/sys/vmmeter.h
sys/vm/vm_meter.c
sys/vm/vm_page.c
sys/vm/vm_page2.h
sys/vm/vm_pageout.c

index 1baf8a6..db5e67e 100644 (file)
@@ -101,7 +101,7 @@ struct vmmeter {
        u_int v_sendsys;        /* calls to sendsys() */
        u_int v_waitsys;        /* calls to waitsys() */
        u_int v_smpinvltlb;     /* nasty global invltlbs */
-       u_int v_reserved0;
+       u_int v_ppwakeups;      /* wakeups on processes stalled on VM */
        u_int v_reserved1;
        u_int v_reserved2;
        u_int v_reserved3;
index 6acc877..29efb18 100644 (file)
@@ -385,6 +385,8 @@ SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_reactivated, CTLTYPE_UINT|CTLFLAG_RD,
        0, VMMETEROFF(v_reactivated), vcnt, "IU", "Reactivated pages");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLTYPE_UINT|CTLFLAG_RD,
        0, VMMETEROFF(v_pdwakeups), vcnt, "IU", "Pagedaemon wakeups");
+SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_ppwakeups, CTLTYPE_UINT|CTLFLAG_RD,
+       0, VMMETEROFF(v_ppwakeups), vcnt, "IU", "vm_wait wakeups");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD,
        0, VMMETEROFF(v_pdpages), vcnt, "IU", "Pagedaemon page scans");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD,
index b43417c..5e87931 100644 (file)
@@ -77,6 +77,7 @@
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
+#include <sys/kernel.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -108,6 +109,7 @@ struct vpgqueues vm_page_queues[PQ_COUNT]; /* Array of tailq lists */
 
 LIST_HEAD(vm_page_action_list, vm_page_action);
 struct vm_page_action_list     action_list[VMACTION_HSIZE];
+static volatile int vm_pages_waiting;
 
 
 #define ASSERT_IN_CRIT_SECTION()       KKASSERT(crit_test(curthread));
@@ -916,44 +918,68 @@ vm_test_nominal(void)
 /*
  * Block until free pages are available for allocation, called in various
  * places before memory allocations.
+ *
+ * The caller may loop if vm_page_count_min() == FALSE so we cannot be
+ * more generous than that.
  */
 void
 vm_wait(int timo)
 {
-       crit_enter();
+       /*
+        * never wait forever
+        */
+       if (timo == 0)
+               timo = hz;
        lwkt_gettoken(&vm_token);
+
        if (curthread == pagethread) {
-               vm_pageout_pages_needed = 1;
-               tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
+               /*
+                * The pageout daemon itself needs pages, this is bad.
+                */
+               if (vm_page_count_min(0)) {
+                       vm_pageout_pages_needed = 1;
+                       tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
+               }
        } else {
-               if (vm_pages_needed == 0) {
-                       vm_pages_needed = 1;
-                       wakeup(&vm_pages_needed);
+               /*
+                * Wakeup the pageout daemon if necessary and wait.
+                */
+               if (vm_page_count_target()) {
+                       if (vm_pages_needed == 0) {
+                               vm_pages_needed = 1;
+                               wakeup(&vm_pages_needed);
+                       }
+                       ++vm_pages_waiting;     /* SMP race ok */
+                       tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
                }
-               tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
  * Block until free pages are available for allocation
  *
- * Called only in vm_fault so that processes page faulting can be
+ * Called only from vm_fault so that processes page faulting can be
  * easily tracked.
  */
 void
 vm_waitpfault(void)
 {
-       crit_enter();
-       lwkt_gettoken(&vm_token);
-       if (vm_pages_needed == 0) {
-               vm_pages_needed = 1;
-               wakeup(&vm_pages_needed);
+       /*
+        * Wakeup the pageout daemon if necessary and wait.
+        */
+       if (vm_page_count_target()) {
+               lwkt_gettoken(&vm_token);
+               if (vm_page_count_target()) {
+                       if (vm_pages_needed == 0) {
+                               vm_pages_needed = 1;
+                               wakeup(&vm_pages_needed);
+                       }
+                       ++vm_pages_waiting;     /* SMP race ok */
+                       tsleep(&vmstats.v_free_count, 0, "pfault", hz);
+               }
+               lwkt_reltoken(&vm_token);
        }
-       tsleep(&vmstats.v_free_count, 0, "pfault", 0);
-       lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -1003,8 +1029,8 @@ static __inline void
 vm_page_free_wakeup(void)
 {
        /*
-        * if pageout daemon needs pages, then tell it that there are
-        * some free.
+        * If the pageout daemon itself needs pages, then tell it that
+        * there are some free.
         */
        if (vm_pageout_pages_needed &&
            vmstats.v_cache_count + vmstats.v_free_count >= 
@@ -1015,13 +1041,32 @@ vm_page_free_wakeup(void)
        }
 
        /*
-        * wakeup processes that are waiting on memory if we hit a
-        * high water mark. And wakeup scheduler process if we have
-        * lots of memory. this process will swapin processes.
+        * Wakeup processes that are waiting on memory.
+        *
+        * NOTE: vm_paging_target() is the pageout daemon's target, while
+ *       vm_page_count_target() is somewhere in between.  We want
+        *       to wake processes up prior to the pageout daemon reaching
+        *       its target to provide some hysteresis.
         */
-       if (vm_pages_needed && !vm_page_count_min(0)) {
-               vm_pages_needed = 0;
-               wakeup(&vmstats.v_free_count);
+       if (vm_pages_waiting) {
+               if (!vm_page_count_target()) {
+                       /*
+                        * Plenty of pages are free, wakeup everyone.
+                        */
+                       vm_pages_waiting = 0;
+                       wakeup(&vmstats.v_free_count);
+                       ++mycpu->gd_cnt.v_ppwakeups;
+               } else if (!vm_page_count_min(0)) {
+                       /*
+                        * Some pages are free, wakeup someone.
+                        */
+                       int wcount = vm_pages_waiting;
+                       if (wcount > 0)
+                               --wcount;
+                       vm_pages_waiting = wcount;
+                       wakeup_one(&vmstats.v_free_count);
+                       ++mycpu->gd_cnt.v_ppwakeups;
+               }
        }
 }
 
index 5185955..354192d 100644 (file)
@@ -63,7 +63,8 @@ int
 vm_page_count_severe(void)
 {
     return (vmstats.v_free_severe >
-           vmstats.v_free_count + vmstats.v_cache_count);
+           vmstats.v_free_count + vmstats.v_cache_count ||
+           vmstats.v_free_reserved > vmstats.v_free_count);
 }
 
 /*
@@ -91,13 +92,20 @@ int
 vm_page_count_target(void)
 {
     return (vmstats.v_free_target >
-           (vmstats.v_free_count + vmstats.v_cache_count));
+           (vmstats.v_free_count + vmstats.v_cache_count) ||
+           vmstats.v_free_reserved > vmstats.v_free_count);
 }
 
 /*
  * Return the number of pages the pageout daemon needs to move into the
  * cache or free lists.  A negative number means we have sufficient free
  * pages.
+ *
+ * The target free+cache is greater than vm_page_count_target().  The
+ * frontend uses vm_page_count_target() while the backend continues freeing
+ * based on vm_paging_target().
+ *
+ * This function DOES NOT return TRUE or FALSE.
  */
 static __inline 
 int
@@ -110,30 +118,24 @@ vm_paging_target(void)
 }
 
 /*
- * Return TRUE if we need to start paging.  This routine should not be
- * used to determine when to block on the VM system.  It supplies hysteresis
- * to the pageout code.
+ * Return TRUE if hysteresis dictates we should nominally wakeup the
+ * pageout daemon to start working on freeing up some memory.  This
+ * routine should NOT be used to determine when to block on the VM system.
+ * We want to wakeup the pageout daemon before we might otherwise block.
  *
- * XXX this triggers a wakeup of the pagedaemon.  As part of its work
- * the pagedaemon tries to maintain v_free_reserved worth of truely
- * free pages.  Set the trigger point a bit lower so we have some hystereis.
+ * Paging begins when cache+free drops below cache_min + free_min.
  */
 static __inline 
 int
 vm_paging_needed(void)
 {
-    int trigger;
-
-    trigger = vmstats.v_interrupt_free_min +
-             (vmstats.v_free_reserved - vmstats.v_interrupt_free_min) / 2;
-    if (trigger < 10)  /* safety */
-       trigger = 10;
-
-    return (
-       (vmstats.v_free_min + vmstats.v_cache_min) >
-       (vmstats.v_free_count + vmstats.v_cache_count) ||
-       trigger > vmstats.v_free_count
-    );
+    if (vmstats.v_free_min + vmstats.v_cache_min >
+       vmstats.v_free_count + vmstats.v_cache_count) {
+               return 1;
+    }
+    if (vmstats.v_free_min > vmstats.v_free_count)
+               return 1;
+    return 0;
 }
 
 static __inline
index 999d05c..b1b0b00 100644 (file)
@@ -1187,17 +1187,39 @@ rescan0:
        }
 
        /*
-        * We try to maintain some *really* free pages, this allows interrupt
-        * code to be guaranteed space.  Since both cache and free queues 
-        * are considered basically 'free', moving pages from cache to free
-        * does not effect other calculations.
+        * The number of actually free pages can drop down to v_free_reserved,
+        * we try to build the free count back above v_free_min.  Note that
+        * vm_paging_needed() also returns TRUE if v_free_count is not at
+        * least v_free_min so that is the minimum we must build the free
+        * count to.
+        *
+        * We use a slightly higher target to improve hysteresis,
+        * ((v_free_target + v_free_min) / 2).  Since v_free_target
+        * is usually the same as v_cache_min, this maintains about
+        * half the pages in the free queue as are in the cache queue,
+        * providing pretty good pipelining for pageout operation.
+        *
+        * The system operator can manipulate vm.v_cache_min and
+        * vm.v_free_target to tune the pageout daemon.  Be sure
+        * to keep vm.v_free_min < vm.v_free_target.
+        *
+        * Note that the original paging target is to get at least
+        * (free_min + cache_min) into (free + cache).  The slightly
+        * higher target will shift additional pages from cache to free
+        * without affecting the original paging target in order to
+        * maintain better hysteresis and not have the free count always
+        * be dead-on v_free_min.
         *
         * NOTE: we are still in a critical section.
         *
         * Pages moved from PQ_CACHE to totally free are not counted in the
         * pages_freed counter.
         */
-       while (vmstats.v_free_count < vmstats.v_free_reserved) {
+       while (vmstats.v_free_count <
+              (vmstats.v_free_min + vmstats.v_free_target) / 2) {
+               /*
+                * Move a page from PQ_CACHE to the free queue to build
+                * the free count up toward the hysteresis target.
+                */
                static int cache_rover = 0;
                m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE);
                if (m == NULL)
@@ -1584,7 +1606,6 @@ vm_pageout_thread(void)
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
-               crit_enter();
                if (vm_pages_needed == 0) {
                        error = tsleep(&vm_pages_needed,
                                       0, "psleep",
@@ -1597,17 +1618,8 @@ vm_pageout_thread(void)
                        }
                        vm_pages_needed = 1;
                }
-               crit_exit();
 
-               /*
-                * If we have enough free memory, wakeup waiters.
-                * (This is optional here)
-                */
-               crit_enter();
-               if (!vm_page_count_min(0))
-                       wakeup(&vmstats.v_free_count);
                mycpu->gd_cnt.v_pdwakeups++;
-               crit_exit();
 
                /*
                 * Scan for pageout.  Try to avoid thrashing the system