This is a major revamping of the pageout and low-memory handling code.
author     Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 18 Dec 2008 21:27:20 +0000 (13:27 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 18 Dec 2008 21:27:20 +0000 (13:27 -0800)
The pageout daemon now detects out-of-memory conditions and properly
kills the largest process(es).  This condition occurs when swap is
full (or you have no swap) and most of the remaining VM pages in memory
have become dirty.  With no swap to page to, the dirty pages squeeze out
the clean ones.  The pageout daemon detects this case and starts killing
processes.
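
As a minimal userland sketch of the selection policy (the kernel version
is the vm_pageout_scan_callback() change in the vm_pageout.c diff below;
the struct and helper names here are illustrative stand-ins, not kernel
API):

    #include <stddef.h>

    struct fake_proc {
            int  pid;
            int  is_system;         /* P_SYSTEM analog */
            long anon_pages;        /* vmspace_anonymous_count() analog */
            long swap_pages;        /* vmspace_swap_count() analog */
    };

    /* Pick the largest killable process by anonymous + swap footprint. */
    static struct fake_proc *
    pick_victim(struct fake_proc *procs, size_t n, long swap_size)
    {
            struct fake_proc *big = NULL;
            long bigsize = 0;
            size_t i;

            for (i = 0; i < n; ++i) {
                    struct fake_proc *p = &procs[i];
                    long size;

                    /*
                     * Never kill system processes or init; when swap is
                     * configured, also spare low-numbered pids.
                     */
                    if (p->is_system || p->pid == 1 ||
                        (p->pid < 48 && swap_size != 0))
                            continue;
                    size = p->anon_pages + p->swap_pages;
                    if (bigsize < size) {
                            bigsize = size;
                            big = p;
                    }
            }
            return (big);
    }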

The pageout daemon now detects stress in the form of excess cpu use
and tries to reduce its cpu footprint when that occurs.  Excess cpu use
can occur when the only pages left in-core are dirty and there is nowhere
to swap them to.  Previously, if this case occurred, the system would
basically just stop working.
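
A rough userland sketch of that back-off, mirroring the pass-based
"pdelay" sleeps added to the vm_pageout() loop in the vm_pageout.c diff
below (sleep_ticks() and HZ are stand-ins for tsleep() and hz):

    #include <time.h>

    #define HZ 100                  /* assumed tick rate for this sketch */

    static void
    sleep_ticks(int ticks)
    {
            struct timespec ts;

            ts.tv_sec = ticks / HZ;
            ts.tv_nsec = (long)(ticks % HZ) * (1000000000L / HZ);
            nanosleep(&ts, NULL);
    }

    /*
     * After each scan: reset on success, otherwise back off harder the
     * longer the shortage persists, and catastrophically (one second)
     * once swap is completely full.
     */
    static void
    pageout_backoff(int *pass, int shortage, int swap_full, int pending)
    {
            if (shortage <= 0) {
                    *pass = 0;
                    return;
            }
            ++*pass;
            if (swap_full)
                    sleep_ticks(HZ);        /* out of memory+swap */
            else if (*pass < 10 && pending > 1)
                    ;                       /* more waiters queued, retry now */
            else
                    sleep_ticks(HZ / 10);   /* brief delay to shed cpu */
    }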

These changes make the system truly have VM = RAM + SWAP.  If you have
1G of RAM and 1G of swap, the system can run up to 2G worth of processes.

sys/kern/vfs_bio.c
sys/platform/pc32/i386/pmap.c
sys/platform/pc64/amd64/pmap.c
sys/platform/vkernel/platform/pmap.c
sys/vm/vm_glue.c
sys/vm/vm_map.c
sys/vm/vm_map.h
sys/vm/vm_page.c
sys/vm/vm_page2.h
sys/vm/vm_pageout.c

diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index af02f56..e1237df 100644
@@ -2930,17 +2930,6 @@ allocbuf(struct buf *bp, int size)
 
                                if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
                                        continue;
-
-                               /*
-                                * We have a good page.  Should we wakeup the
-                                * page daemon?
-                                */
-                               if ((curthread != pagethread) &&
-                                   ((m->queue - m->pc) == PQ_CACHE) &&
-                                   ((vmstats.v_free_count + vmstats.v_cache_count) <
-                                       (vmstats.v_free_min + vmstats.v_cache_min))) {
-                                       pagedaemon_wakeup();
-                               }
                                vm_page_flag_clear(m, PG_ZERO);
                                vm_page_wire(m);
                                bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c
index c3e4fbc..2102002 100644
@@ -1495,8 +1495,8 @@ get_pv_entry(void)
 {
        pv_entry_count++;
        if (pv_entry_high_water &&
-               (pv_entry_count > pv_entry_high_water) &&
-               (pmap_pagedaemon_waken == 0)) {
+           (pv_entry_count > pv_entry_high_water) &&
+           (pmap_pagedaemon_waken == 0)) {
                pmap_pagedaemon_waken = 1;
                wakeup (&vm_pages_needed);
        }
@@ -1516,6 +1516,7 @@ pmap_collect(void)
 
        if (pmap_pagedaemon_waken == 0)
                return;
+       pmap_pagedaemon_waken = 0;
 
        if (warningdone < 5) {
                kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
@@ -1529,7 +1530,6 @@ pmap_collect(void)
                        continue;
                pmap_remove_all(m);
        }
-       pmap_pagedaemon_waken = 0;
 }
        
 
diff --git a/sys/platform/pc64/amd64/pmap.c b/sys/platform/pc64/amd64/pmap.c
index fb1c753..3e5637f 100644
@@ -1612,8 +1612,8 @@ get_pv_entry(void)
 {
        pv_entry_count++;
        if (pv_entry_high_water &&
-               (pv_entry_count > pv_entry_high_water) &&
-               (pmap_pagedaemon_waken == 0)) {
+           (pv_entry_count > pv_entry_high_water) &&
+           (pmap_pagedaemon_waken == 0)) {
                pmap_pagedaemon_waken = 1;
                wakeup (&vm_pages_needed);
        }
@@ -1633,6 +1633,7 @@ pmap_collect(void)
 
        if (pmap_pagedaemon_waken == 0)
                return;
+       pmap_pagedaemon_waken = 0;
 
        if (warningdone < 5) {
                kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
@@ -1646,7 +1647,6 @@ pmap_collect(void)
                        continue;
                pmap_remove_all(m);
        }
-       pmap_pagedaemon_waken = 0;
 }
        
 
diff --git a/sys/platform/vkernel/platform/pmap.c b/sys/platform/vkernel/platform/pmap.c
index e9d3a96..8f65346 100644
@@ -1278,8 +1278,8 @@ get_pv_entry(void)
 {
        pv_entry_count++;
        if (pv_entry_high_water &&
-               (pv_entry_count > pv_entry_high_water) &&
-               (pmap_pagedaemon_waken == 0)) {
+           (pv_entry_count > pv_entry_high_water) &&
+           (pmap_pagedaemon_waken == 0)) {
                pmap_pagedaemon_waken = 1;
                wakeup (&vm_pages_needed);
        }
@@ -1299,6 +1299,7 @@ pmap_collect(void)
 
        if (pmap_pagedaemon_waken == 0)
                return;
+       pmap_pagedaemon_waken = 0;
 
        if (warningdone < 5) {
                kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
@@ -1312,7 +1313,6 @@ pmap_collect(void)
                        continue;
                pmap_remove_all(m);
        }
-       pmap_pagedaemon_waken = 0;
 }
        
 /*
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 14db5d1..0dcfc36 100644
@@ -344,7 +344,7 @@ loop:
        /*
         * Don't try to swap anything in if we are low on memory.
         */
-       if (vm_page_count_min()) {
+       if (vm_page_count_severe()) {
                vm_wait(0);
                goto loop;
        }
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 762c774..0e994ef 100644
@@ -331,8 +331,7 @@ vmspace_exitfree(struct proc *p)
 }
 
 /*
- * vmspace_swap_count() - count the approximate swap useage in pages for a
- *                       vmspace.
+ * vmspace_swap_count()
  *
  *     Swap useage is determined by taking the proportional swap used by
  *     VM objects backing the VM map.  To make up for fractional losses,
@@ -369,6 +368,42 @@ vmspace_swap_count(struct vmspace *vmspace)
        return(count);
 }
 
+/*
+ * vmspace_anonymous_count()
+ *
+ *     Calculate the approximate number of anonymous pages in use by
+ *     this vmspace.  To make up for fractional losses, we count each
+ *     VM object as having at least 1 anonymous page.
+ */
+int
+vmspace_anonymous_count(struct vmspace *vmspace)
+{
+       vm_map_t map = &vmspace->vm_map;
+       vm_map_entry_t cur;
+       vm_object_t object;
+       int count = 0;
+
+       for (cur = map->header.next; cur != &map->header; cur = cur->next) {
+               switch(cur->maptype) {
+               case VM_MAPTYPE_NORMAL:
+               case VM_MAPTYPE_VPAGETABLE:
+                       if ((object = cur->object.vm_object) == NULL)
+                               break;
+                       if (object->type != OBJT_DEFAULT &&
+                           object->type != OBJT_SWAP) {
+                               break;
+                       }
+                       count += object->resident_page_count;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return(count);
+}
+
+
+
 
 /*
  *     vm_map_create:
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 6de62cf..4990221 100644
@@ -470,6 +470,7 @@ int vm_uiomove (vm_map_t, vm_object_t, off_t, int, vm_offset_t, int *);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_growstack (struct proc *p, vm_offset_t addr);
 int vmspace_swap_count (struct vmspace *vmspace);
+int vmspace_anonymous_count (struct vmspace *vmspace);
 void vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, int *);
 
 #endif
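
The intended consumer of this new prototype is the out-of-memory victim
scan in the vm_pageout.c diff below, which estimates a process footprint
as anonymous pages plus swapped pages:

        size = vmspace_anonymous_count(p->p_vmspace) +
                vmspace_swap_count(p->p_vmspace);
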
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 6ce5fd5..3450362 100644
@@ -526,10 +526,8 @@ vm_page_unqueue(vm_page_t m)
                TAILQ_REMOVE(&pq->pl, m, pageq);
                (*pq->cnt)--;
                pq->lcnt--;
-               if ((queue - m->pc) == PQ_CACHE) {
-                       if (vm_paging_needed())
-                               pagedaemon_wakeup();
-               }
+               if ((queue - m->pc) == PQ_CACHE || (queue - m->pc) == PQ_FREE)
+                       pagedaemon_wakeup();
        }
 }
 
@@ -790,8 +788,7 @@ loop:
         * Don't wakeup too often - wakeup the pageout daemon when
         * we would be nearly out of memory.
         */
-       if (vm_paging_needed())
-               pagedaemon_wakeup();
+       pagedaemon_wakeup();
 
        crit_exit();
 
@@ -813,7 +810,7 @@ vm_wait(int timo)
                vm_pageout_pages_needed = 1;
                tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
        } else {
-               if (!vm_pages_needed) {
+               if (vm_pages_needed == 0) {
                        vm_pages_needed = 1;
                        wakeup(&vm_pages_needed);
                }
@@ -827,16 +824,12 @@ vm_wait(int timo)
  *
  * Called only in vm_fault so that processes page faulting can be
  * easily tracked.
- *
- * Sleeps at a lower priority than vm_wait() so that vm_wait()ing
- * processes will be able to grab memory first.  Do not change
- * this balance without careful testing first.
  */
 void
 vm_waitpfault(void)
 {
        crit_enter();
-       if (!vm_pages_needed) {
+       if (vm_pages_needed == 0) {
                vm_pages_needed = 1;
                wakeup(&vm_pages_needed);
        }
@@ -905,7 +898,7 @@ vm_page_free_wakeup(void)
         * high water mark. And wakeup scheduler process if we have
         * lots of memory. this process will swapin processes.
         */
-       if (vm_pages_needed && !vm_page_count_min()) {
+       if (vm_pages_needed && !vm_page_count_min(0)) {
                vm_pages_needed = 0;
                wakeup(&vmstats.v_free_count);
        }
diff --git a/sys/vm/vm_page2.h b/sys/vm/vm_page2.h
index 63dfc62..e4f2762 100644
 #ifdef _KERNEL
 
 /*
- * Return TRUE if we are under our reserved low-free-pages threshold
- */
-
-static __inline 
-int
-vm_page_count_reserved(void)
-{
-    return (vmstats.v_free_reserved > 
-       (vmstats.v_free_count + vmstats.v_cache_count));
-}
-
-/*
  * Return TRUE if we are under our severe low-free-pages threshold
  *
- * This routine is typically used at the user<->system interface to determine
- * whether we need to block in order to avoid a low memory deadlock.
+ * This causes user processes to stall to avoid exhausting memory that
+ * the kernel might need.
+ *
+ * reserved < severe < minimum < target < paging_target
  */
-
 static __inline 
 int
 vm_page_count_severe(void)
 {
     return (vmstats.v_free_severe >
-       (vmstats.v_free_count + vmstats.v_cache_count));
+           vmstats.v_free_count + vmstats.v_cache_count);
 }
 
 /*
  * Return TRUE if we are under our minimum low-free-pages threshold.
+ * This activates the pageout daemon.  The pageout daemon tries to
+ * reach the target but may stop once it satisfies the minimum.
  *
- * This routine is typically used within the system to determine whether
- * we can execute potentially very expensive code in terms of memory.  It
- * is also used by the pageout daemon to calculate when to sleep, when
- * to wake waiters up, and when (after making a pass) to become more
- * desparate.
+ * reserved < severe < minimum < target < paging_target
  */
-
 static __inline 
 int
-vm_page_count_min(void)
+vm_page_count_min(int donotcount)
 {
-    return (vmstats.v_free_min >
-       (vmstats.v_free_count + vmstats.v_cache_count));
+    return (vmstats.v_free_min + donotcount >
+           (vmstats.v_free_count + vmstats.v_cache_count) ||
+           vmstats.v_free_reserved > vmstats.v_free_count);
 }
 
 /*
- * Return TRUE if we have not reached our free page target during
- * free page recovery operations.
+ * Return TRUE if we are under our free page target.  The pageout daemon
+ * tries to reach the target but may stop once it gets past the min.
  */
-
 static __inline 
 int
 vm_page_count_target(void)
 {
     return (vmstats.v_free_target >
-       (vmstats.v_free_count + vmstats.v_cache_count));
+           (vmstats.v_free_count + vmstats.v_cache_count));
 }
 
 /*
- * Return the number of pages we need to free-up or cache
- * A positive number indicates that we do not have enough free pages.
+ * Return the number of pages the pageout daemon needs to move into the
+ * cache or free lists.  A negative number means we have sufficient free
+ * pages.
  */
-
 static __inline 
 int
 vm_paging_target(void)
@@ -124,16 +110,29 @@ vm_paging_target(void)
 }
 
 /*
- * Return a positive number if the pagedaemon needs to be woken up.
+ * Return TRUE if we need to start paging.  This routine should not be
+ * used to determine when to block on the VM system.  It supplies hysteresis
+ * to the pageout code.
+ *
+ * XXX this triggers a wakeup of the pagedaemon.  As part of its work
+ * the pagedaemon tries to maintain v_free_reserved worth of truly
+ * free pages.  Set the trigger point a bit lower so we have some hysteresis.
  */
-
 static __inline 
 int
 vm_paging_needed(void)
 {
+    int trigger;
+
+    trigger = vmstats.v_interrupt_free_min +
+             (vmstats.v_free_reserved - vmstats.v_interrupt_free_min) / 2;
+    if (trigger < 10)  /* safety */
+       trigger = 10;
+
     return (
-       (vmstats.v_free_reserved + vmstats.v_cache_min) >
-       (vmstats.v_free_count + vmstats.v_cache_count)
+       (vmstats.v_free_min + vmstats.v_cache_min) >
+       (vmstats.v_free_count + vmstats.v_cache_count) ||
+       trigger > vmstats.v_free_count
     );
 }
 
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index ed58182..0e40eec 100644
 /* the kernel process "vm_pageout"*/
 static void vm_pageout (void);
 static int vm_pageout_clean (vm_page_t);
-static void vm_pageout_scan (int pass);
+static int vm_pageout_scan (int pass);
 static int vm_pageout_free_page_calc (vm_size_t count);
 struct thread *pagethread;
 
@@ -221,7 +221,7 @@ static void vm_req_vmdaemon (void);
 static void vm_pageout_page_stats(void);
 
 /*
- * Update
+ * Update vm_load to slow down faulting processes.
  */
 void
 vm_fault_ratecheck(void)
@@ -683,9 +683,8 @@ vm_pageout_page_free(vm_page_t m)
 }
 
 /*
- *     vm_pageout_scan does the dirty work for the pageout daemon.
+ * vm_pageout_scan does the dirty work for the pageout daemon.
  */
-
 struct vm_pageout_scan_info {
        struct proc *bigproc;
        vm_offset_t bigsize;
@@ -693,18 +692,18 @@ struct vm_pageout_scan_info {
 
 static int vm_pageout_scan_callback(struct proc *p, void *data);
 
-static void
+static int
 vm_pageout_scan(int pass)
 {
        struct vm_pageout_scan_info info;
        vm_page_t m, next;
        struct vm_page marker;
-       int page_shortage, maxscan, pcount;
-       int addl_page_shortage, addl_page_shortage_init;
+       int maxscan, pcount;
+       int recycle_count;
+       int inactive_shortage, active_shortage;
        vm_object_t object;
        int actcount;
        int vnodes_skipped = 0;
-       int pages_freed = 0;
        int maxlaunder;
 
        /*
@@ -712,14 +711,15 @@ vm_pageout_scan(int pass)
         */
        pmap_collect();
 
-       addl_page_shortage_init = vm_pageout_deficit;
-       vm_pageout_deficit = 0;
-
        /*
-        * Calculate the number of pages we want to either free or move
-        * to the cache.
+        * Calculate our target for the number of free+cache pages we
+        * want to get to.  This is higher than the number that causes
+        * allocations to stall (severe) in order to provide hysteresis,
+        * and if we don't make it all the way but get to the minimum
+        * we're happy.
         */
-       page_shortage = vm_paging_target() + addl_page_shortage_init;
+       inactive_shortage = vm_paging_target() + vm_pageout_deficit;
+       vm_pageout_deficit = 0;
 
        /*
         * Initialize our marker
@@ -760,10 +760,9 @@ vm_pageout_scan(int pass)
         */
        crit_enter();
 rescan0:
-       addl_page_shortage = addl_page_shortage_init;
        maxscan = vmstats.v_inactive_count;
        for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
-            m != NULL && maxscan-- > 0 && page_shortage > 0;
+            m != NULL && maxscan-- > 0 && inactive_shortage > 0;
             m = next
         ) {
                mycpu->gd_cnt.v_pdpages++;
@@ -795,7 +794,6 @@ rescan0:
                if (m->hold_count) {
                        TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
                        TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-                       addl_page_shortage++;
                        continue;
                }
 
@@ -804,7 +802,6 @@ rescan0:
                 * queue, most likely are being paged out.
                 */
                if (m->busy || (m->flags & PG_BUSY)) {
-                       addl_page_shortage++;
                        continue;
                }
 
@@ -872,16 +869,14 @@ rescan0:
                         */
                        vm_pageout_page_free(m);
                        mycpu->gd_cnt.v_dfree++;
-                       --page_shortage;
-                       ++pages_freed;
+                       --inactive_shortage;
                } else if (m->dirty == 0) {
                        /*
                         * Clean pages can be placed onto the cache queue.
                         * This effectively frees them.
                         */
                        vm_page_cache(m);
-                       --page_shortage;
-                       ++pages_freed;
+                       --inactive_shortage;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /*
                         * Dirty pages need to be paged out, but flushing
@@ -916,7 +911,7 @@ rescan0:
                        } else {
                                swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
                                swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
-                               vm_page_count_min());
+                               vm_page_count_min(0));
                                                                                
                        }
 
@@ -1011,16 +1006,15 @@ rescan0:
                         * pointer.  To prevent an inordinate number of
                         * restarts we use our marker to remember our place.
                         *
-                        * decrement page_shortage on success to account for
-                        * the (future) cleaned page.  Otherwise we could wind
-                        * up laundering or cleaning too many pages.
+                        * decrement inactive_shortage on success to account
+                        * for the (future) cleaned page.  Otherwise we
+                        * could wind up laundering or cleaning too many
+                        * pages.
                         */
                        TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
                        if (vm_pageout_clean(m) != 0) {
-                               --page_shortage;
+                               --inactive_shortage;
                                --maxlaunder;
-                       } else {
-                               addl_page_shortage++;
                        }
                        next = TAILQ_NEXT(&marker, pageq);
                        TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
@@ -1030,34 +1024,32 @@ rescan0:
        }
 
        /*
-        * Compute the number of pages we want to try to move from the
-        * active queue to the inactive queue.
-        */
-       page_shortage = vm_paging_target() +
-                       vmstats.v_inactive_target - vmstats.v_inactive_count;
-       page_shortage += addl_page_shortage;
-
-       /*
-        * If the system is running out of swap or has none a large backlog
-        * can accumulate in the inactive list.  Continue moving pages to
-        * the inactive list even though its 'target' has been met due to
-        * being unable to drain.  We can then use a low active count to
-        * measure stress and out-of-memory conditions.
-        */
-       if (page_shortage < addl_page_shortage)
-               page_shortage = addl_page_shortage;
-
-       /*
-        * Scan the active queue for things we can deactivate. We nominally
-        * track the per-page activity counter and use it to locate 
-        * deactivation candidates.
+        * We want to move pages from the active queue to the inactive
+        * queue to get the inactive queue to the inactive target.  If
+        * we still have a page shortage from above we try to directly free
+        * clean pages instead of moving them.
         *
-        * NOTE: we are still in a critical section.
+        * If we do still have a shortage we keep track of the number of
+        * pages we free or cache (recycle_count) as a measure of thrashing
+        * between the active and inactive queues.
+        *
+        * We do not do this if we were able to satisfy the requirement
+        * entirely from the inactive queue.
+        *
+        * NOTE: Both variables can end up negative.
+        * NOTE: We are still in a critical section.
         */
+       active_shortage = vmstats.v_inactive_target - vmstats.v_inactive_count;
+       if (inactive_shortage <= 0)
+               active_shortage = 0;
+
        pcount = vmstats.v_active_count;
+       recycle_count = 0;
        m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
 
-       while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
+       while ((m != NULL) && (pcount-- > 0) &&
+              (inactive_shortage > 0 || active_shortage > 0)
+       ) {
                /*
                 * Give interrupts a chance.
                 */
@@ -1090,13 +1082,14 @@ rescan0:
                mycpu->gd_cnt.v_pdpages++;
 
                /*
-                * Check to see "how much" the page has been used.
+                * Check to see "how much" the page has been used and clear
+                * the tracking access bits.  If the object has no references
+                * don't bother paying the expense.
                 */
                actcount = 0;
                if (m->object->ref_count != 0) {
-                       if (m->flags & PG_REFERENCED) {
-                               actcount += 1;
-                       }
+                       if (m->flags & PG_REFERENCED)
+                               ++actcount;
                        actcount += pmap_ts_referenced(m);
                        if (actcount) {
                                m->act_count += ACT_ADVANCE + actcount;
@@ -1104,31 +1097,35 @@ rescan0:
                                        m->act_count = ACT_MAX;
                        }
                }
-
-               /*
-                * Since we have "tested" this bit, we need to clear it now.
-                */
                vm_page_flag_clear(m, PG_REFERENCED);
 
                /*
-                * Only if an object is currently being used, do we use the
-                * page activation count stats.
+                * actcount is only valid if the object ref_count is non-zero.
                 */
-               if (actcount && (m->object->ref_count != 0)) {
+               if (actcount && m->object->ref_count != 0) {
                        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
                        TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        if (vm_pageout_algorithm ||
                            m->object->ref_count == 0 ||
-                           m->act_count < pass) {
-                               page_shortage--;
-                               if (m->object->ref_count == 0) {
+                           m->act_count < pass + 1
+                       ) {
+                               /*
+                                * Deactivate the page.  If we had a
+                                * shortage from our inactive scan try to
+                                * free (cache) the page instead.
+                                */
+                               --active_shortage;
+                               if (inactive_shortage > 0 ||
+                                   m->object->ref_count == 0) {
+                                       if (inactive_shortage > 0)
+                                               ++recycle_count;
                                        vm_page_busy(m);
                                        vm_page_protect(m, VM_PROT_NONE);
                                        vm_page_wakeup(m);
                                        if (m->dirty == 0) {
-                                               ++pages_freed;
+                                               --inactive_shortage;
                                                vm_page_cache(m);
                                        } else {
                                                vm_page_deactivate(m);
@@ -1155,11 +1152,10 @@ rescan0:
         * Pages moved from PQ_CACHE to totally free are not counted in the
         * pages_freed counter.
         */
-
        while (vmstats.v_free_count < vmstats.v_free_reserved) {
                static int cache_rover = 0;
                m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE);
-               if (!m)
+               if (m == NULL)
                        break;
                if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || 
                    m->busy || 
@@ -1200,7 +1196,7 @@ rescan0:
         * if we did not get enough free pages.
         */
        if (vm_paging_target() > 0) {
-               if (vnodes_skipped && vm_page_count_min())
+               if (vnodes_skipped && vm_page_count_min(0))
                        speedup_syncer();
 #if !defined(NO_SWAPPING)
                if (vm_swap_enabled && vm_page_count_target()) {
@@ -1211,33 +1207,37 @@ rescan0:
        }
 
        /*
-        * If we are out of swap space (or have no swap) then we
-        * can detect when the system has completely run out of
-        * memory by observing several variables.
+        * Handle catastrophic conditions.  Under good conditions we should
+        * be at the target, well beyond our minimum.  If we could not even
+        * reach our minimum the system is under heavy stress.
+        *
+        * Determine whether we have run out of memory.  This occurs when
+        * swap_pager_full is TRUE and the only pages left in the page
+        * queues are dirty.  We will still likely have page shortages.
         *
         * - swap_pager_full is set if insufficient swap was
         *   available to satisfy a requested pageout.
         *
-        * - vm_page_count_min() means we could not recover
-        *   enough pages to meet bare minimum needs.
-        *
-        * - vm_active_count
+        * - the inactive queue is bloated (4 x size of active queue),
+        *   meaning it is unable to get rid of dirty pages.
         *
-        *and we were
-        * not able to reach our minimum free page count target,
-        * then we can detect whether we have run out of memory
-        * by observing the active count.  A memory starved
-        * system will reduce the active count
+        * - vm_page_count_min() without counting pages recycled from the
+        *   active queue (recycle_count) means we could not recover
+        *   enough pages to meet bare minimum needs.  This test only
+        *   works if the inactive queue is bloated.
         *
-        * If under these circumstances our paging target exceeds
-        * 1/2 the number of active pages we have a very serious
-        * problem that the deactivation of pages failed to solve
-        * and must start killing things.
+        * - due to a positive inactive_shortage we shifted the remaining
+        *   dirty pages from the active queue to the inactive queue
+        *   trying to find clean ones to free.
         */
-       if (swap_pager_full && vm_page_count_min())
+       if (swap_pager_full && vm_page_count_min(recycle_count))
                kprintf("Warning: system low on memory+swap!\n");
-       if (swap_pager_full && vm_page_count_min() &&
-           vm_paging_target() > vmstats.v_active_count / 4) {
+       if (swap_pager_full && vm_page_count_min(recycle_count) &&
+           vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
+           inactive_shortage > 0) {
+               /*
+                * Kill something.
+                */
                info.bigproc = NULL;
                info.bigsize = 0;
                allproc_scan(vm_pageout_scan_callback, &info);
@@ -1250,6 +1250,7 @@ rescan0:
                        PRELE(info.bigproc);
                }
        }
+       return(inactive_shortage);
 }
 
 static int
@@ -1259,7 +1260,8 @@ vm_pageout_scan_callback(struct proc *p, void *data)
        vm_offset_t size;
 
        /*
-        * if this is a system process, skip it
+        * Never kill system processes or init.  If we have configured swap
+        * then try to avoid killing low-numbered pids.
         */
        if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
            ((p->p_pid < 48) && (vm_swap_size != 0))) {
@@ -1270,21 +1272,23 @@ vm_pageout_scan_callback(struct proc *p, void *data)
         * if the process is in a non-running type state,
         * don't touch it.
         */
-       if (p->p_stat != SACTIVE && p->p_stat != SSTOP) {
+       if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
                return (0);
-       }
 
        /*
-        * get the process size
+        * Get the approximate process size.  Note that anonymous pages
+        * with backing swap will be counted twice, but there should not
+        * be too many such pages due to the stress the VM system is
+        * under at this point.
         */
-       size = vmspace_resident_count(p->p_vmspace) +
+       size = vmspace_anonymous_count(p->p_vmspace) +
                vmspace_swap_count(p->p_vmspace);
 
        /*
         * If the this process is bigger than the biggest one
         * remember it.
         */
-       if (size > info->bigsize) {
+       if (info->bigsize < size) {
                if (info->bigproc)
                        PRELE(info->bigproc);
                PHOLD(p);
@@ -1413,12 +1417,13 @@ vm_pageout_free_page_calc(vm_size_t count)
 
 
 /*
- *     vm_pageout is the high level pageout daemon.
+ * vm_pageout is the high level pageout daemon.
  */
 static void
 vm_pageout(void)
 {
        int pass;
+       int inactive_shortage;
 
        /*
         * Initialize some paging parameters.
@@ -1430,6 +1435,7 @@ vm_pageout(void)
                vm_pageout_page_count = 8;
 
        vm_pageout_free_page_calc(vmstats.v_page_count);
+
        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
@@ -1482,66 +1488,96 @@ vm_pageout(void)
 
        swap_pager_swap_init();
        pass = 0;
+
        /*
         * The pageout daemon is never done, so loop forever.
         */
        while (TRUE) {
                int error;
 
-               /*
-                * If we have enough free memory, wakeup waiters.  Do
-                * not clear vm_pages_needed until we reach our target,
-                * otherwise we may be woken up over and over again and
-                * waste a lot of cpu.
-                */
-               crit_enter();
-               if (vm_pages_needed && !vm_page_count_min()) {
-                       if (vm_paging_needed() <= 0)
-                               vm_pages_needed = 0;
-                       wakeup(&vmstats.v_free_count);
-               }
-               if (vm_pages_needed) {
+               if (vm_pages_needed == 0) {
                        /*
-                        * Still not done, take a second pass without waiting
-                        * (unlimited dirty cleaning), otherwise sleep a bit
-                        * and try again.
+                        * Wait for an action request
                         */
-                       ++pass;
-                       if (pass > 1)
-                               tsleep(&vm_pages_needed, 0, "psleep", hz/2);
-               } else {
-                       /*
-                        * Good enough, sleep & handle stats.  Prime the pass
-                        * for the next run.
-                        */
-                       if (pass > 1)
-                               pass = 1;
-                       else
-                               pass = 0;
                        error = tsleep(&vm_pages_needed,
-                               0, "psleep", vm_pageout_stats_interval * hz);
-                       if (error && !vm_pages_needed) {
-                               crit_exit();
-                               pass = 0;
+                                      0, "psleep",
+                                      vm_pageout_stats_interval * hz);
+                       if (error && vm_pages_needed == 0) {
                                vm_pageout_page_stats();
                                continue;
                        }
+                       vm_pages_needed = 1;
                }
 
-               if (vm_pages_needed)
-                       mycpu->gd_cnt.v_pdwakeups++;
+               /*
+                * If we have enough free memory, wakeup waiters.
+                */
+               crit_enter();
+               if (!vm_page_count_min(0))
+                       wakeup(&vmstats.v_free_count);
+               mycpu->gd_cnt.v_pdwakeups++;
                crit_exit();
-               vm_pageout_scan(pass);
-               vm_pageout_deficit = 0;
+               inactive_shortage = vm_pageout_scan(pass);
+
+               /*
+                * Try to avoid thrashing the system with activity.
+                */
+               if (inactive_shortage > 0) {
+                       ++pass;
+                       if (swap_pager_full) {
+                               /*
+                                * Running out of memory, catastrophic back-off
+                                * to one-second intervals.
+                                */
+                               tsleep(&vm_pages_needed, 0, "pdelay", hz);
+                       } else if (pass < 10 && vm_pages_needed > 1) {
+                               /*
+                                * Normal operation, additional processes
+                                * have already kicked us.  Retry immediately.
+                                */
+                       } else if (pass < 10) {
+                               /*
+                                * Normal operation, fewer processes.  Delay
+                                * a bit but allow wakeups.
+                                */
+                               vm_pages_needed = 0;
+                               tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
+                               vm_pages_needed = 1;
+                       } else {
+                               /*
+                                * We've taken too many passes, forced delay.
+                                */
+                               tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
+                       }
+               } else {
+                       pass = 0;
+                       vm_pages_needed = 0;
+               }
        }
 }
 
+/*
+ * Called after allocating a page out of the cache or free queue
+ * to possibly wake the pagedaemon up to replenish our supply.
+ *
+ * We try to generate some hysteresis by waking the pagedaemon up
+ * when our free+cache pages go below the severe level.  The pagedaemon
+ * tries to get the count back up to at least the minimum, and through
+ * to the target level if possible.
+ *
+ * If the pagedaemon is already active bump vm_pages_needed as a hint
+ * that there are even more requests pending.
+ */
 void
 pagedaemon_wakeup(void)
 {
-       if (!vm_pages_needed && curthread != pagethread) {
-               vm_pages_needed++;
-               wakeup(&vm_pages_needed);
+       if (vm_page_count_severe() && curthread != pagethread) {
+               if (vm_pages_needed == 0) {
+                       vm_pages_needed = 1;
+                       wakeup(&vm_pages_needed);
+               } else if (vm_page_count_min(0)) {
+                       ++vm_pages_needed;
+               }
        }
 }