kernel - Another huge HUGE VM performance improvement for many-cores
author    Matthew Dillon <dillon@apollo.backplane.com>
          Fri, 28 Oct 2011 16:32:51 +0000 (09:32 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Fri, 28 Oct 2011 16:32:51 +0000 (09:32 -0700)
This requires a bit of explanation.  The last single-point spinlocks in the
VM system were the spinlocks for the inactive and active page queues.  Even
though these two spinlocks are only held for a very short period of time,
they can create a major point of contention when one has (e.g.) 48 cores
all trying to run a VM fault at the same time.  This is primarily an issue
on multi-socket/many-cores systems and much less of an issue on
single-socket systems.

On many-cores systems the global VM fault rate was limited to roughly
200-250K zfod faults per second prior to this commit on our 48-core
opteron test box.  Since any single compiler process can run ~35K zfod
faults per second, the maximum concurrency topped out at around 7
concurrent processes.

With this commit the global VM fault rate was measured at almost 900K zfod
faults per second.  That's 900,000 page faults per second (about 3.5 GBytes
per second).  Typical operation was consistently above 750K zfod faults per
second.  Maximum concurrency at a 35K fault rate per process thus increases
from 7 processes to over 25 processes, and is probably approaching the
physical memory bus limit, considering that one also has to account for
generic page-fault overhead above and beyond the memory impact on the page
itself.

I can't stress enough how important it is to avoid contention entirely,
when possible, on a many-cores system.  In this case, even though the VM
page queue spinlocks are only held for a very short period of time, the
thrashing of cache-coherency state between physical cpu sockets when all
the cores need to use the spinlock still created an enormous bottleneck.
Fixing this one contention point easily doubled concurrent compiler
performance on our 48-core opteron.

* Fan out the PQ_INACTIVE and PQ_ACTIVE page queues from 1 queue to
  256 queues, each with its own spin lock (a minimal sketch of the idea
  follows the results below).

* This removes the last major contention point in the VM system.

* -j48 buildkernel test on monster (48-core opteron) now runs in 55 seconds.
  It was originally 167 seconds, and 101 seconds just prior to this commit.

  Concurrent compiles are now three times faster (a +200% improvement) on
  a many-cores box, with virtually no contention at all.
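
To make the fan-out concrete, here is a minimal userland C sketch of the
idea.  It is illustrative only: the type and function names are made up
and pthread spinlocks stand in for the kernel's spinlocks, but the core
move matches what the diff does with PQ_ACTIVE + m->pc and
PQ_INACTIVE + m->pc -- a page's color selects one of 256 sub-queues, each
with its own lock, so concurrently faulting cpus rarely contend on the
same lock or cache line.

    #include <stddef.h>
    #include <pthread.h>

    #define NQ      256                     /* PQ_L2_SIZE in this commit */

    struct page {
            struct page        *next;
            unsigned short      pc;         /* page color, 0..NQ-1 */
    };

    struct subqueue {
            pthread_spinlock_t  lock;       /* one spinlock per sub-queue */
            struct page        *head;
    };

    struct fanout_queue {
            struct subqueue     q[NQ];
    };

    static void
    fanout_init(struct fanout_queue *fq)
    {
            int i;

            for (i = 0; i < NQ; i++) {
                    pthread_spin_init(&fq->q[i].lock,
                                      PTHREAD_PROCESS_PRIVATE);
                    fq->q[i].head = NULL;
            }
    }

    /*
     * Insert a page on the sub-queue selected by its color.  CPUs
     * operating on pages of different colors take different spinlocks,
     * so the old single point of contention disappears.
     */
    static void
    fanout_enqueue(struct fanout_queue *fq, struct page *m)
    {
            struct subqueue *sq = &fq->q[m->pc % NQ];

            pthread_spin_lock(&sq->lock);
            m->next = sq->head;
            sq->head = m;
            pthread_spin_unlock(&sq->lock);
    }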

sys/vm/vm_contig.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_pageout.c
sys/vm/vm_swap.c
sys/vm/vm_swapcache.c

diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c
index 811abc6..a428616 100644
@@ -244,7 +244,7 @@ static int
 vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
                   unsigned long alignment, unsigned long boundary, int mflags)
 {
-       int i, start, pass;
+       int i, q, start, pass;
        vm_offset_t phys;
        vm_page_t pga = vm_page_array;
        vm_page_t m;
@@ -302,8 +302,11 @@ again:
                         * This is quite quick, for now stall all
                         * callers, even if they've specified M_NOWAIT.
                         */
-                       vm_contig_pg_clean(PQ_INACTIVE,
-                                          vmstats.v_inactive_count);
+                       for (q = 0; q < PQ_L2_SIZE; ++q) {
+                               vm_contig_pg_clean(PQ_INACTIVE + q,
+                                                  vmstats.v_inactive_count);
+                               lwkt_yield();
+                       }
 
                        /*
                         * Best effort flush of active pages.
@@ -316,8 +319,11 @@ again:
                         * will fail in the index < 0 case.
                         */
                        if (pass > 0 && (mflags & M_WAITOK)) {
-                               vm_contig_pg_clean(PQ_ACTIVE,
-                                                  vmstats.v_active_count);
+                               for (q = 0; q < PQ_L2_SIZE; ++q) {
+                                       vm_contig_pg_clean(PQ_ACTIVE + q,
+                                                      vmstats.v_active_count);
+                               }
+                               lwkt_yield();
                        }
 
                        /*
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index c113d77..dcb3faa 100644
@@ -124,10 +124,12 @@ vm_page_queue_init(void)
                vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
        for (i = 0; i < PQ_L2_SIZE; i++)
                vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;
-
-       vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count;
-       vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count;
-       vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count;
+       for (i = 0; i < PQ_L2_SIZE; i++)
+               vm_page_queues[PQ_INACTIVE+i].cnt = &vmstats.v_inactive_count;
+       for (i = 0; i < PQ_L2_SIZE; i++)
+               vm_page_queues[PQ_ACTIVE+i].cnt = &vmstats.v_active_count;
+       for (i = 0; i < PQ_L2_SIZE; i++)
+               vm_page_queues[PQ_HOLD+i].cnt = &vmstats.v_active_count;
        /* PQ_NONE has no queue */
 
        for (i = 0; i < PQ_COUNT; i++) {
@@ -719,7 +721,7 @@ vm_page_hold(vm_page_t m)
        if (m->queue - m->pc == PQ_FREE) {
                _vm_page_queue_spin_lock(m);
                _vm_page_rem_queue_spinlocked(m);
-               _vm_page_add_queue_spinlocked(m, PQ_HOLD, 0);
+               _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
                _vm_page_queue_spin_unlock(m);
        }
        vm_page_spin_unlock(m);
@@ -736,7 +738,7 @@ vm_page_unhold(vm_page_t m)
 {
        vm_page_spin_lock(m);
        atomic_add_int(&m->hold_count, -1);
-       if (m->hold_count == 0 && m->queue == PQ_HOLD) {
+       if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
                _vm_page_queue_spin_lock(m);
                _vm_page_rem_queue_spinlocked(m);
                _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
@@ -1527,7 +1529,7 @@ vm_page_activate(vm_page_t m)
        u_short oqueue;
 
        vm_page_spin_lock(m);
-       if (m->queue != PQ_ACTIVE) {
+       if (m->queue - m->pc != PQ_ACTIVE) {
                _vm_page_queue_spin_lock(m);
                oqueue = _vm_page_rem_queue_spinlocked(m);
                /* page is left spinlocked, queue is unlocked */
@@ -1537,7 +1539,7 @@ vm_page_activate(vm_page_t m)
                if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
                        if (m->act_count < ACT_INIT)
                                m->act_count = ACT_INIT;
-                       _vm_page_add_queue_spinlocked(m, PQ_ACTIVE, 0);
+                       _vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
                }
                _vm_page_and_queue_spin_unlock(m);
                if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
@@ -1667,7 +1669,7 @@ vm_page_free_toq(vm_page_t m)
 
        if (m->hold_count != 0) {
                vm_page_flag_clear(m, PG_ZERO);
-               _vm_page_add_queue_spinlocked(m, PQ_HOLD, 0);
+               _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
        } else {
                _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
        }
@@ -1838,13 +1840,14 @@ vm_page_unwire(vm_page_t m, int activate)
                                ;
                        } else if (activate) {
                                vm_page_spin_lock(m);
-                               _vm_page_add_queue_spinlocked(m, PQ_ACTIVE, 0);
+                               _vm_page_add_queue_spinlocked(m,
+                                                       PQ_ACTIVE + m->pc, 0);
                                _vm_page_and_queue_spin_unlock(m);
                        } else {
                                vm_page_spin_lock(m);
                                vm_page_flag_clear(m, PG_WINATCFLS);
-                               _vm_page_add_queue_spinlocked(m, PQ_INACTIVE,
-                                                             0);
+                               _vm_page_add_queue_spinlocked(m,
+                                                       PQ_INACTIVE + m->pc, 0);
                                ++vm_swapcache_inactive_heuristic;
                                _vm_page_and_queue_spin_unlock(m);
                        }
@@ -1871,7 +1874,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
        /*
         * Ignore if already inactive.
         */
-       if (m->queue == PQ_INACTIVE)
+       if (m->queue - m->pc == PQ_INACTIVE)
                return;
        _vm_page_queue_spin_lock(m);
        oqueue = _vm_page_rem_queue_spinlocked(m);
@@ -1880,7 +1883,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
                if (oqueue == PQ_CACHE)
                        mycpu->gd_cnt.v_reactivated++;
                vm_page_flag_clear(m, PG_WINATCFLS);
-               _vm_page_add_queue_spinlocked(m, PQ_INACTIVE, athead);
+               _vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
                if (athead == 0)
                        ++vm_swapcache_inactive_heuristic;
        }
@@ -2094,7 +2097,7 @@ vm_page_dontneed(vm_page_t m)
         * occassionally leave the page alone
         */
        if ((dnw & 0x01F0) == 0 ||
-           m->queue == PQ_INACTIVE || 
+           m->queue - m->pc == PQ_INACTIVE ||
            m->queue - m->pc == PQ_CACHE
        ) {
                if (m->act_count >= ACT_INIT)
@@ -2653,8 +2656,16 @@ DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
        }
        db_printf("\n");
 
-       db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
-               vm_page_queues[PQ_ACTIVE].lcnt,
-               vm_page_queues[PQ_INACTIVE].lcnt);
+       db_printf("PQ_ACTIVE:");
+       for(i=0;i<PQ_L2_SIZE;i++) {
+               db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
+       }
+       db_printf("\n");
+
+       db_printf("PQ_INACTIVE:");
+       for(i=0;i<PQ_L2_SIZE;i++) {
+               db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
+       }
+       db_printf("\n");
 }
 #endif /* DDB */
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 56f905c..0ba7e75 100644
@@ -258,12 +258,12 @@ typedef struct vm_page *vm_page_t;
 #endif
 
 #define PQ_NONE                0
-#define PQ_FREE                1
+#define PQ_FREE                (1 + 0*PQ_MAXL2_SIZE)
 #define PQ_INACTIVE    (1 + 1*PQ_MAXL2_SIZE)
-#define PQ_ACTIVE      (2 + 1*PQ_MAXL2_SIZE)
-#define PQ_CACHE       (3 + 1*PQ_MAXL2_SIZE)
-#define PQ_HOLD                (3 + 2*PQ_MAXL2_SIZE)
-#define PQ_COUNT       (4 + 2*PQ_MAXL2_SIZE)
+#define PQ_ACTIVE      (1 + 2*PQ_MAXL2_SIZE)
+#define PQ_CACHE       (1 + 3*PQ_MAXL2_SIZE)
+#define PQ_HOLD                (1 + 4*PQ_MAXL2_SIZE)
+#define PQ_COUNT       (1 + 5*PQ_MAXL2_SIZE)
 
 /*
  * Scan support
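
For reference, the point of spacing the PQ_* bases PQ_MAXL2_SIZE apart is
that a page of color pc can sit at absolute queue index (base + pc) without
colliding with the next queue type, and membership can be tested by
subtracting the color back out.  A small sketch of that invariant (the
constants and helper are illustrative, assuming a PQ_MAXL2_SIZE of 256; see
the real defines in the hunk above):

    #define PQ_MAXL2_SIZE   256
    #define PQ_INACTIVE     (1 + 1*PQ_MAXL2_SIZE)   /* 257 */
    #define PQ_ACTIVE       (1 + 2*PQ_MAXL2_SIZE)   /* 513 */

    /*
     * A page of color pc on the inactive queue has
     * m->queue == PQ_INACTIVE + pc.  Since pc < PQ_MAXL2_SIZE the
     * per-type ranges never overlap, and "is this page inactive?"
     * becomes (m->queue - m->pc == PQ_INACTIVE), which is the
     * substitution made throughout the hunks below.
     */
    static int
    page_on_inactive_queue(unsigned short queue, unsigned short pc)
    {
            return (queue - pc == PQ_INACTIVE);
    }
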
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 28cad92..5d7dcb0 100644
 
 /* the kernel process "vm_pageout"*/
 static int vm_pageout_clean (vm_page_t);
-static int vm_pageout_scan (int pass);
 static int vm_pageout_free_page_calc (vm_size_t count);
 struct thread *pagethread;
 
@@ -207,7 +206,7 @@ static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
 static freeer_fcn_t vm_pageout_object_deactivate_pages;
 static void vm_req_vmdaemon (void);
 #endif
-static void vm_pageout_page_stats(void);
+static void vm_pageout_page_stats(int q);
 
 /*
  * Update vm_load to slow down faulting processes.
@@ -319,7 +318,7 @@ more:
                }
                vm_page_test_dirty(p);
                if ((p->dirty & p->valid) == 0 ||
-                   p->queue != PQ_INACTIVE ||
+                   p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
@@ -351,7 +350,7 @@ more:
                }
                vm_page_test_dirty(p);
                if ((p->dirty & p->valid) == 0 ||
-                   p->queue != PQ_INACTIVE ||
+                   p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
@@ -589,12 +588,12 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
        }
 
        vm_page_and_queue_spin_lock(p);
-       if (p->queue != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
+       if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_activate(p);
                p->act_count += actcount;
                vm_page_flag_clear(p, PG_REFERENCED);
-       } else if (p->queue == PQ_ACTIVE) {
+       } else if (p->queue - p->pc == PQ_ACTIVE) {
                if ((p->flags & PG_REFERENCED) == 0) {
                        p->act_count -= min(p->act_count, ACT_DECLINE);
                        if (!info->limit &&
@@ -603,8 +602,10 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
                                vm_page_protect(p, VM_PROT_NONE);
                                vm_page_deactivate(p);
                        } else {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
+                               TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
+                                            p, pageq);
+                               TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
+                                                 p, pageq);
                                vm_page_and_queue_spin_unlock(p);
                        }
                } else {
@@ -613,15 +614,17 @@ vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
                        vm_page_flag_clear(p, PG_REFERENCED);
 
                        vm_page_and_queue_spin_lock(p);
-                       if (p->queue == PQ_ACTIVE) {
+                       if (p->queue - p->pc == PQ_ACTIVE) {
                                if (p->act_count < (ACT_MAX - ACT_ADVANCE))
                                        p->act_count += ACT_ADVANCE;
-                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
+                               TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
+                                            p, pageq);
+                               TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
+                                                 p, pageq);
                        }
                        vm_page_and_queue_spin_unlock(p);
                }
-       } else if (p->queue == PQ_INACTIVE) {
+       } else if (p->queue - p->pc == PQ_INACTIVE) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_protect(p, VM_PROT_NONE);
        } else {
@@ -736,38 +739,19 @@ struct vm_pageout_scan_info {
 static int vm_pageout_scan_callback(struct proc *p, void *data);
 
 static int
-vm_pageout_scan(int pass)
+vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
+                        int *vnodes_skippedp)
 {
-       struct vm_pageout_scan_info info;
        vm_page_t m;
        struct vm_page marker;
        struct vnode *vpfailed;         /* warning, allowed to be stale */
-       int maxscan, pcount;
-       int recycle_count;
-       int inactive_shortage, active_shortage;
-       int inactive_original_shortage;
+       int maxscan;
+       int delta = 0;
        vm_object_t object;
        int actcount;
-       int vnodes_skipped = 0;
        int maxlaunder;
 
        /*
-        * Do whatever cleanup that the pmap code can.
-        */
-       pmap_collect();
-
-       /*
-        * Calculate our target for the number of free+cache pages we
-        * want to get to.  This is higher then the number that causes
-        * allocations to stall (severe) in order to provide hysteresis,
-        * and if we don't make it all the way but get to the minimum
-        * we're happy.
-        */
-       inactive_shortage = vm_paging_target() + vm_pageout_deficit;
-       inactive_original_shortage = inactive_shortage;
-       vm_pageout_deficit = 0;
-
-       /*
         * Start scanning the inactive queue for pages we can move to the
         * cache or free.  The scan will stop when the target is reached or
         * we have scanned the entire inactive queue.  Note that m->act_count
@@ -793,7 +777,8 @@ vm_pageout_scan(int pass)
         */
        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
-       marker.queue = PQ_INACTIVE;
+       marker.queue = PQ_INACTIVE + q;
+       marker.pc = q;
        marker.wire_count = 1;
 
        /*
@@ -805,13 +790,13 @@ vm_pageout_scan(int pass)
         */
        vpfailed = NULL;
 
-       vm_page_queues_spin_lock(PQ_INACTIVE);
-       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
+       vm_page_queues_spin_lock(PQ_INACTIVE + q);
+       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        maxscan = vmstats.v_inactive_count;
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
+       vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
-              maxscan-- > 0 && inactive_shortage > 0)
+              maxscan-- > 0 && inactive_shortage - delta > 0)
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
@@ -819,10 +804,10 @@ vm_pageout_scan(int pass)
                        ++maxscan;
                        continue;
                }
-               KKASSERT(m->queue == PQ_INACTIVE);
-               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
+               KKASSERT(m->queue - m->pc == PQ_INACTIVE);
+               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
                             &marker, pageq);
-               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m,
+               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
                                   &marker, pageq);
                mycpu->gd_cnt.v_pdpages++;
 
@@ -843,7 +828,7 @@ vm_pageout_scan(int pass)
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
-               KKASSERT(m->queue == PQ_INACTIVE);
+               KKASSERT(m->queue - m->pc == PQ_INACTIVE);
 
                lwkt_yield();
 
@@ -858,11 +843,13 @@ vm_pageout_scan(int pass)
                 */
                if (m->hold_count) {
                        vm_page_and_queue_spin_lock(m);
-                       if (m->queue == PQ_INACTIVE) {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
-                                            m, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl,
-                                                 m, pageq);
+                       if (m->queue - m->pc == PQ_INACTIVE) {
+                               TAILQ_REMOVE(
+                                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                                       m, pageq);
+                               TAILQ_INSERT_TAIL(
+                                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                                       m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        ++vm_swapcache_inactive_heuristic;
@@ -938,14 +925,14 @@ vm_pageout_scan(int pass)
                         */
                        vm_pageout_page_free(m);
                        mycpu->gd_cnt.v_dfree++;
-                       --inactive_shortage;
+                       ++delta;
                } else if (m->dirty == 0) {
                        /*
                         * Clean pages can be placed onto the cache queue.
                         * This effectively frees them.
                         */
                        vm_page_cache(m);
-                       --inactive_shortage;
+                       ++delta;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /*
                         * Dirty pages need to be paged out, but flushing
@@ -961,9 +948,13 @@ vm_pageout_scan(int pass)
                         */
                        vm_page_flag_set(m, PG_WINATCFLS);
                        vm_page_and_queue_spin_lock(m);
-                       if (m->queue == PQ_INACTIVE) {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+                       if (m->queue - m->pc == PQ_INACTIVE) {
+                               TAILQ_REMOVE(
+                                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                                       m, pageq);
+                               TAILQ_INSERT_TAIL(
+                                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                                       m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        ++vm_swapcache_inactive_heuristic;
@@ -996,9 +987,13 @@ vm_pageout_scan(int pass)
                         */
                        if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
                                vm_page_and_queue_spin_lock(m);
-                               if (m->queue == PQ_INACTIVE) {
-                                       TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-                                       TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+                               if (m->queue - m->pc == PQ_INACTIVE) {
+                                       TAILQ_REMOVE(
+                                           &vm_page_queues[PQ_INACTIVE + q].pl,
+                                           m, pageq);
+                                       TAILQ_INSERT_TAIL(
+                                           &vm_page_queues[PQ_INACTIVE + q].pl,
+                                           m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                ++vm_swapcache_inactive_heuristic;
@@ -1053,7 +1048,7 @@ vm_pageout_scan(int pass)
                                        vpfailed = vp;
                                        ++pageout_lock_miss;
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
-                                                   vnodes_skipped++;
+                                                   ++*vnodes_skippedp;
                                        vm_page_unhold(m);
                                        continue;
                                }
@@ -1065,11 +1060,11 @@ vm_pageout_scan(int pass)
                                 * reused for another vnode.  The object might
                                 * have been reused for another vnode.
                                 */
-                               if (m->queue != PQ_INACTIVE ||
+                               if (m->queue - m->pc != PQ_INACTIVE ||
                                    m->object != object ||
                                    object->handle != vp) {
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
-                                               vnodes_skipped++;
+                                               ++*vnodes_skippedp;
                                        vput(vp);
                                        vm_page_unhold(m);
                                        continue;
@@ -1097,14 +1092,14 @@ vm_pageout_scan(int pass)
                                 */
                                if (m->hold_count) {
                                        vm_page_and_queue_spin_lock(m);
-                                       if (m->queue == PQ_INACTIVE) {
-                                               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-                                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+                                       if (m->queue - m->pc == PQ_INACTIVE) {
+                                               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq);
+                                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq);
                                        }
                                        vm_page_and_queue_spin_unlock(m);
                                        ++vm_swapcache_inactive_heuristic;
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
-                                               vnodes_skipped++;
+                                               ++*vnodes_skippedp;
                                        vm_page_wakeup(m);
                                        vput(vp);
                                        continue;
@@ -1126,7 +1121,7 @@ vm_pageout_scan(int pass)
                         * pages.
                         */
                        if (vm_pageout_clean(m) != 0) {
-                               --inactive_shortage;
+                               ++delta;
                                --maxlaunder;
                        }
                        /* clean ate busy, page no longer accessible */
@@ -1136,9 +1131,23 @@ vm_pageout_scan(int pass)
                        vm_page_wakeup(m);
                }
        }
-       vm_page_queues_spin_lock(PQ_INACTIVE);
-       TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
+       vm_page_queues_spin_lock(PQ_INACTIVE + q);
+       TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
+       vm_page_queues_spin_unlock(PQ_INACTIVE + q);
+
+       return (delta);
+}
+
+static int
+vm_pageout_scan_active(int pass, int q,
+                      int inactive_shortage, int active_shortage,
+                      int *recycle_countp)
+{
+       struct vm_page marker;
+       vm_page_t m;
+       int actcount;
+       int delta = 0;
+       int pcount;
 
        /*
         * We want to move pages from the active queue to the inactive
@@ -1160,24 +1169,21 @@ vm_pageout_scan(int pass)
         * NOTE: Both variables can end up negative.
         * NOTE: We are still in a critical section.
         */
-       active_shortage = vmstats.v_inactive_target - vmstats.v_inactive_count;
-       if (inactive_original_shortage < vmstats.v_inactive_target / 10)
-               inactive_original_shortage = vmstats.v_inactive_target / 10;
-       if (inactive_shortage <= 0 &&
-           active_shortage > inactive_original_shortage * 2) {
-               active_shortage = inactive_original_shortage * 2;
-       }
 
-       recycle_count = 0;
-       marker.queue = PQ_ACTIVE;
+       bzero(&marker, sizeof(marker));
+       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.queue = PQ_ACTIVE + q;
+       marker.pc = q;
+       marker.wire_count = 1;
 
-       vm_page_queues_spin_lock(PQ_ACTIVE);
-       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_ACTIVE);
+       vm_page_queues_spin_lock(PQ_ACTIVE + q);
+       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
+       vm_page_queues_spin_unlock(PQ_ACTIVE + q);
        pcount = vmstats.v_active_count;
 
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
-              pcount-- > 0 && (inactive_shortage > 0 || active_shortage > 0))
+              pcount-- > 0 && (inactive_shortage - delta > 0 ||
+                               active_shortage > 0))
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
@@ -1185,10 +1191,10 @@ vm_pageout_scan(int pass)
                        ++pcount;
                        continue;
                }
-               KKASSERT(m->queue == PQ_ACTIVE);
-               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
+               KKASSERT(m->queue - m->pc == PQ_ACTIVE);
+               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                             &marker, pageq);
-               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE].pl, m,
+               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);
 
                /*
@@ -1213,9 +1219,9 @@ vm_pageout_scan(int pass)
                 * busy them.  (XXX why not?)
                 */
                if (m->hold_count != 0) {
-                       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
+                       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                                     m, pageq);
-                       TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
+                       TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl,
                                          m, pageq);
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
@@ -1258,11 +1264,13 @@ vm_pageout_scan(int pass)
                 */
                if (actcount && m->object->ref_count != 0) {
                        vm_page_and_queue_spin_lock(m);
-                       if (m->queue == PQ_ACTIVE) {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
-                                            m, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
-                                                 m, pageq);
+                       if (m->queue - m->pc == PQ_ACTIVE) {
+                               TAILQ_REMOVE(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
+                               TAILQ_INSERT_TAIL(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
@@ -1283,14 +1291,14 @@ vm_pageout_scan(int pass)
                                 * gigabytes being moved.
                                 */
                                --active_shortage;
-                               if (inactive_shortage > 0 ||
+                               if (inactive_shortage - delta > 0 ||
                                    m->object->ref_count == 0) {
-                                       if (inactive_shortage > 0)
-                                               ++recycle_count;
+                                       if (inactive_shortage - delta > 0)
+                                               ++*recycle_countp;
                                        vm_page_protect(m, VM_PROT_NONE);
                                        if (m->dirty == 0 &&
-                                           inactive_shortage > 0) {
-                                               --inactive_shortage;
+                                           inactive_shortage - delta > 0) {
+                                               ++delta;
                                                vm_page_cache(m);
                                        } else {
                                                vm_page_deactivate(m);
@@ -1302,13 +1310,13 @@ vm_pageout_scan(int pass)
                                }
                        } else {
                                vm_page_and_queue_spin_lock(m);
-                               if (m->queue == PQ_ACTIVE) {
+                               if (m->queue - m->pc == PQ_ACTIVE) {
                                        TAILQ_REMOVE(
-                                               &vm_page_queues[PQ_ACTIVE].pl,
-                                               m, pageq);
+                                           &vm_page_queues[PQ_ACTIVE + q].pl,
+                                           m, pageq);
                                        TAILQ_INSERT_TAIL(
-                                               &vm_page_queues[PQ_ACTIVE].pl,
-                                               m, pageq);
+                                           &vm_page_queues[PQ_ACTIVE + q].pl,
+                                           m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                vm_page_wakeup(m);
@@ -1319,39 +1327,49 @@ vm_pageout_scan(int pass)
        /*
         * Clean out our local marker.
         */
-       vm_page_queues_spin_lock(PQ_ACTIVE);
-       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_ACTIVE);
+       vm_page_queues_spin_lock(PQ_ACTIVE + q);
+       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
+       vm_page_queues_spin_unlock(PQ_ACTIVE + q);
+
+       return (delta);
+}
+
+/*
+ * The number of actually free pages can drop down to v_free_reserved,
+ * we try to build the free count back above v_free_min.  Note that
+ * vm_paging_needed() also returns TRUE if v_free_count is not at
+ * least v_free_min so that is the minimum we must build the free
+ * count to.
+ *
+ * We use a slightly higher target to improve hysteresis,
+ * ((v_free_target + v_free_min) / 2).  Since v_free_target
+ * is usually the same as v_cache_min this maintains about
+ * half the pages in the free queue as are in the cache queue,
+ * providing pretty good pipelining for pageout operation.
+ *
+ * The system operator can manipulate vm.v_cache_min and
+ * vm.v_free_target to tune the pageout demon.  Be sure
+ * to keep vm.v_free_min < vm.v_free_target.
+ *
+ * Note that the original paging target is to get at least
+ * (free_min + cache_min) into (free + cache).  The slightly
+ * higher target will shift additional pages from cache to free
+ * without effecting the original paging target in order to
+ * maintain better hysteresis and not have the free count always
+ * be dead-on v_free_min.
+ *
+ * NOTE: we are still in a critical section.
+ *
+ * Pages moved from PQ_CACHE to totally free are not counted in the
+ * pages_freed counter.
+ */
+static void
+vm_pageout_scan_cache(int inactive_shortage,
+                     int vnodes_skipped, int recycle_count)
+{
+       struct vm_pageout_scan_info info;
+       vm_page_t m;
 
-       /*
-        * The number of actually free pages can drop down to v_free_reserved,
-        * we try to build the free count back above v_free_min.  Note that
-        * vm_paging_needed() also returns TRUE if v_free_count is not at
-        * least v_free_min so that is the minimum we must build the free
-        * count to.
-        *
-        * We use a slightly higher target to improve hysteresis,
-        * ((v_free_target + v_free_min) / 2).  Since v_free_target
-        * is usually the same as v_cache_min this maintains about
-        * half the pages in the free queue as are in the cache queue,
-        * providing pretty good pipelining for pageout operation.
-        *
-        * The system operator can manipulate vm.v_cache_min and
-        * vm.v_free_target to tune the pageout demon.  Be sure
-        * to keep vm.v_free_min < vm.v_free_target.
-        *
-        * Note that the original paging target is to get at least
-        * (free_min + cache_min) into (free + cache).  The slightly
-        * higher target will shift additional pages from cache to free
-        * without effecting the original paging target in order to
-        * maintain better hysteresis and not have the free count always
-        * be dead-on v_free_min.
-        *
-        * NOTE: we are still in a critical section.
-        *
-        * Pages moved from PQ_CACHE to totally free are not counted in the
-        * pages_freed counter.
-        */
        while (vmstats.v_free_count <
               (vmstats.v_free_min + vmstats.v_free_target) / 2) {
                /*
@@ -1467,7 +1485,6 @@ vm_pageout_scan(int pass)
                        PRELE(info.bigproc);
                }
        }
-       return(inactive_shortage);
 }
 
 /*
@@ -1526,7 +1543,7 @@ vm_pageout_scan_callback(struct proc *p, void *data)
  * helps the situation where paging just starts to occur.
  */
 static void
-vm_pageout_page_stats(void)
+vm_pageout_page_stats(int q)
 {
        static int fullintervalcount = 0;
        struct vm_page marker;
@@ -1555,12 +1572,13 @@ vm_pageout_page_stats(void)
 
        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
-       marker.queue = PQ_ACTIVE;
+       marker.queue = PQ_ACTIVE + q;
+       marker.pc = q;
        marker.wire_count = 1;
 
-       vm_page_queues_spin_lock(PQ_ACTIVE);
-       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_ACTIVE);
+       vm_page_queues_spin_lock(PQ_ACTIVE + q);
+       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
+       vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
@@ -1573,9 +1591,9 @@ vm_pageout_page_stats(void)
                        ++pcount;
                        continue;
                }
-               KKASSERT(m->queue == PQ_ACTIVE);
-               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE].pl, m,
+               KKASSERT(m->queue - m->pc == PQ_ACTIVE);
+               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
+               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);
 
                /*
@@ -1594,7 +1612,7 @@ vm_pageout_page_stats(void)
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
-               KKASSERT(m->queue == PQ_ACTIVE);
+               KKASSERT(m->queue - m->pc == PQ_ACTIVE);
 
                /*
                 * We now have a safely busied page, the page and queue
@@ -1625,11 +1643,13 @@ vm_pageout_page_stats(void)
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
-                       if (m->queue == PQ_ACTIVE) {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
-                                            m, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
-                                                 m, pageq);
+                       if (m->queue - m->pc == PQ_ACTIVE) {
+                               TAILQ_REMOVE(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
+                               TAILQ_INSERT_TAIL(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
@@ -1655,11 +1675,13 @@ vm_pageout_page_stats(void)
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
-                       if (m->queue == PQ_ACTIVE) {
-                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
-                                            m, pageq);
-                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
-                                                 m, pageq);
+                       if (m->queue - m->pc == PQ_ACTIVE) {
+                               TAILQ_REMOVE(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
+                               TAILQ_INSERT_TAIL(
+                                       &vm_page_queues[PQ_ACTIVE + q].pl,
+                                       m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
@@ -1669,10 +1691,9 @@ vm_pageout_page_stats(void)
        /*
         * Remove our local marker
         */
-       vm_page_queues_spin_lock(PQ_ACTIVE);
-       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_ACTIVE);
-
+       vm_page_queues_spin_lock(PQ_ACTIVE + q);
+       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
+       vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 }
 
 static int
@@ -1711,7 +1732,7 @@ static void
 vm_pageout_thread(void)
 {
        int pass;
-       int inactive_shortage;
+       int q;
 
        /*
         * Initialize some paging parameters.
@@ -1805,6 +1826,13 @@ vm_pageout_thread(void)
         */
        while (TRUE) {
                int error;
+               int delta1;
+               int delta2;
+               int inactive_shortage;
+               int active_shortage;
+               int vnodes_skipped = 0;
+               int recycle_count = 0;
+               int tmp;
 
                /*
                 * Wait for an action request.  If we timeout check to
@@ -1818,7 +1846,8 @@ vm_pageout_thread(void)
                        if (error &&
                            vm_paging_needed() == 0 &&
                            vm_pages_needed == 0) {
-                               vm_pageout_page_stats();
+                               for (q = 0; q < PQ_MAXL2_SIZE; ++q)
+                                       vm_pageout_page_stats(q);
                                continue;
                        }
                        vm_pages_needed = 1;
@@ -1827,10 +1856,67 @@ vm_pageout_thread(void)
                mycpu->gd_cnt.v_pdwakeups++;
 
                /*
+                * Do whatever cleanup that the pmap code can.
+                */
+               pmap_collect();
+
+               /*
                 * Scan for pageout.  Try to avoid thrashing the system
                 * with activity.
+                *
+                * Calculate our target for the number of free+cache pages we
+                * want to get to.  This is higher then the number that causes
+                * allocations to stall (severe) in order to provide hysteresis,
+                * and if we don't make it all the way but get to the minimum
+                * we're happy.
+                */
+               inactive_shortage = vm_paging_target() + vm_pageout_deficit;
+               vm_pageout_deficit = 0;
+               delta1 = 0;
+               for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+                       delta1 += vm_pageout_scan_inactive(
+                                       pass, q,
+                                       inactive_shortage / PQ_MAXL2_SIZE + 1,
+                                       &vnodes_skipped);
+               }
+
+               /*
+                * Figure out how many active pages we must deactivate.  If
+                * we were able to reach our target with just the inactive
+                * scan above we limit the number of active pages we
+                * deactivate to reduce unnecessary work.
+                */
+               active_shortage = vmstats.v_inactive_target -
+                                 vmstats.v_inactive_count;
+
+               tmp = inactive_shortage;
+               if (tmp < vmstats.v_inactive_target / 10)
+                       tmp = vmstats.v_inactive_target / 10;
+               inactive_shortage -= delta1;
+               if (inactive_shortage <= 0 && active_shortage > tmp * 2)
+                       active_shortage = tmp * 2;
+
+               delta2 = 0;
+               for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+                       delta2 += vm_pageout_scan_active(
+                                       pass, q,
+                                       inactive_shortage / PQ_MAXL2_SIZE + 1,
+                                       active_shortage / PQ_MAXL2_SIZE + 1,
+                                       &recycle_count);
+               }
+
+               /*
+                * Finally free enough cache pages to meet our free page
+                * requirement and take more drastic measures if we are
+                * still in trouble.
+                */
+               inactive_shortage -= delta2;
+               vm_pageout_scan_cache(inactive_shortage, vnodes_skipped,
+                                     recycle_count);
+
+               /*
+                * Wait for more work.
                 */
-               inactive_shortage = vm_pageout_scan(pass);
                if (inactive_shortage > 0) {
                        ++pass;
                        if (swap_pager_full) {
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index 82e794a..88f73a0 100644
@@ -82,7 +82,7 @@ int nswdev = NSWAPDEV;                                /* exported to pstat/systat */
 int vm_swap_size;
 int vm_swap_max;
 
-static int swapoff_one (int index);
+static int swapoff_one(int index);
 struct vnode *swapdev_vp;
 
 /*
@@ -445,6 +445,7 @@ swapoff_one(int index)
        struct swdevt *sp;
        struct vm_page marker;
        vm_page_t m;
+       int q;
 
        mtx_lock(&swap_mtx);
 
@@ -458,61 +459,70 @@ swapoff_one(int index)
         * of data we will have to page back in, plus an epsilon so
         * the system doesn't become critically low on swap space.
         */
-       bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
-       marker.queue = PQ_ACTIVE;
-       marker.wire_count = 1;
-
-       vm_page_queues_spin_lock(PQ_ACTIVE);
-       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-
-       while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
-               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
-                            &marker, pageq);
-               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE].pl, m,
-                                  &marker, pageq);
-               if (m->flags & (PG_MARKER | PG_FICTITIOUS))
-                       continue;
-
-               if (vm_page_busy_try(m, FALSE) == 0) {
-                       vm_page_queues_spin_unlock(PQ_ACTIVE);
-                       if (m->dirty == 0) {
-                               vm_page_test_dirty(m);
-                               if (m->dirty == 0)
-                                       ++pq_active_clean;
+       for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+               bzero(&marker, sizeof(marker));
+               marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+               marker.queue = PQ_ACTIVE + q;
+               marker.pc = q;
+               marker.wire_count = 1;
+
+               vm_page_queues_spin_lock(marker.queue);
+               TAILQ_INSERT_HEAD(&vm_page_queues[marker.queue].pl,
+                                 &marker, pageq);
+
+               while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
+                       TAILQ_REMOVE(&vm_page_queues[marker.queue].pl,
+                                    &marker, pageq);
+                       TAILQ_INSERT_AFTER(&vm_page_queues[marker.queue].pl, m,
+                                          &marker, pageq);
+                       if (m->flags & (PG_MARKER | PG_FICTITIOUS))
+                               continue;
+
+                       if (vm_page_busy_try(m, FALSE) == 0) {
+                               vm_page_queues_spin_unlock(marker.queue);
+                               if (m->dirty == 0) {
+                                       vm_page_test_dirty(m);
+                                       if (m->dirty == 0)
+                                               ++pq_active_clean;
+                               }
+                               vm_page_wakeup(m);
+                               vm_page_queues_spin_lock(marker.queue);
                        }
-                       vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_ACTIVE);
                }
-       }
-       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_ACTIVE);
-
-       marker.queue = PQ_INACTIVE;
-       vm_page_queues_spin_lock(PQ_INACTIVE);
-       TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
-
-       while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
-               TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
-                            &marker, pageq);
-               TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m,
-                                  &marker, pageq);
-               if (m->flags & (PG_MARKER | PG_FICTITIOUS))
-                       continue;
-
-               if (vm_page_busy_try(m, FALSE) == 0) {
-                       vm_page_queues_spin_unlock(PQ_INACTIVE);
-                       if (m->dirty == 0) {
-                               vm_page_test_dirty(m);
-                               if (m->dirty == 0)
-                                       ++pq_inactive_clean;
+               TAILQ_REMOVE(&vm_page_queues[marker.queue].pl, &marker, pageq);
+               vm_page_queues_spin_unlock(marker.queue);
+
+               marker.queue = PQ_INACTIVE + q;
+               marker.pc = q;
+               vm_page_queues_spin_lock(marker.queue);
+               TAILQ_INSERT_HEAD(&vm_page_queues[marker.queue].pl,
+                                 &marker, pageq);
+
+               while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
+                       TAILQ_REMOVE(
+                               &vm_page_queues[marker.queue].pl,
+                               &marker, pageq);
+                       TAILQ_INSERT_AFTER(
+                               &vm_page_queues[marker.queue].pl,
+                               m, &marker, pageq);
+                       if (m->flags & (PG_MARKER | PG_FICTITIOUS))
+                               continue;
+
+                       if (vm_page_busy_try(m, FALSE) == 0) {
+                               vm_page_queues_spin_unlock(marker.queue);
+                               if (m->dirty == 0) {
+                                       vm_page_test_dirty(m);
+                                       if (m->dirty == 0)
+                                               ++pq_inactive_clean;
+                               }
+                               vm_page_wakeup(m);
+                               vm_page_queues_spin_lock(marker.queue);
                        }
-                       vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
                }
+               TAILQ_REMOVE(&vm_page_queues[marker.queue].pl,
+                            &marker, pageq);
+               vm_page_queues_spin_unlock(marker.queue);
        }
-       TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
 
        if (vmstats.v_free_count + vmstats.v_cache_count + pq_active_clean +
            pq_inactive_clean + vm_swap_size < aligned_nblks + nswap_lowat) {
diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c
index c52d0e8..baf7865 100644
@@ -79,8 +79,6 @@
 #include <sys/spinlock2.h>
 #include <vm/vm_page2.h>
 
-#define INACTIVE_LIST  (&vm_page_queues[PQ_INACTIVE].pl)
-
 /* the kernel process "vm_pageout"*/
 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
 static int vm_swapcache_test(vm_page_t m);
@@ -161,8 +159,9 @@ vm_swapcached_thread(void)
 {
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
-       struct vm_page page_marker;
-       struct vm_object object_marker;
+       static struct vm_page page_marker[PQ_MAXL2_SIZE];
+       static struct vm_object object_marker;
+       int q;
 
        /*
         * Thread setup
@@ -177,13 +176,17 @@ vm_swapcached_thread(void)
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
        bzero(&page_marker, sizeof(page_marker));
-       page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
-       page_marker.queue = PQ_INACTIVE;
-       page_marker.wire_count = 1;
-
-       vm_page_queues_spin_lock(PQ_INACTIVE);
-       TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq);
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
+       for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+               page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+               page_marker[q].queue = PQ_INACTIVE + q;
+               page_marker[q].pc = q;
+               page_marker[q].wire_count = 1;
+               vm_page_queues_spin_lock(PQ_INACTIVE + q);
+               TAILQ_INSERT_HEAD(
+                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                       &page_marker[q], pageq);
+               vm_page_queues_spin_unlock(PQ_INACTIVE + q);
+       }
 
        vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
        vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
@@ -251,12 +254,18 @@ vm_swapcached_thread(void)
                if (state == SWAPC_WRITING) {
                        if (vm_swapcache_curburst >= vm_swapcache_accrate) {
                                if (burst == SWAPB_BURSTING) {
-                                       vm_swapcache_writing(&page_marker);
+                                       for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+                                               vm_swapcache_writing(
+                                                       &page_marker[q]);
+                                       }
                                        if (vm_swapcache_curburst <= 0)
                                                burst = SWAPB_RECOVERING;
                                } else if (vm_swapcache_curburst >
                                           vm_swapcache_minburst) {
-                                       vm_swapcache_writing(&page_marker);
+                                       for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+                                               vm_swapcache_writing(
+                                                       &page_marker[q]);
+                                       }
                                        burst = SWAPB_BURSTING;
                                }
                        }
@@ -268,9 +277,13 @@ vm_swapcached_thread(void)
        /*
         * Cleanup (NOT REACHED)
         */
-       vm_page_queues_spin_lock(PQ_INACTIVE);
-       TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
+       for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
+               vm_page_queues_spin_lock(PQ_INACTIVE + q);
+               TAILQ_REMOVE(
+                       &vm_page_queues[PQ_INACTIVE + q].pl,
+                       &page_marker[q], pageq);
+               vm_page_queues_spin_unlock(PQ_INACTIVE + q);
+       }
 
        lwkt_gettoken(&vmobj_token);
        TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
@@ -320,38 +333,40 @@ vm_swapcache_writing(vm_page_t marker)
         */
        count = vm_swapcache_maxlaunder;
 
-       vm_page_queues_spin_lock(PQ_INACTIVE);
+       vm_page_queues_spin_lock(marker->queue);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL && count-- > 0) {
-               KKASSERT(m->queue == PQ_INACTIVE);
+               KKASSERT(m->queue == marker->queue);
 
                if (vm_swapcache_curburst < 0)
                        break;
-               TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
-               TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq);
+               TAILQ_REMOVE(
+                       &vm_page_queues[marker->queue].pl, marker, pageq);
+               TAILQ_INSERT_AFTER(
+                       &vm_page_queues[marker->queue].pl, m, marker, pageq);
                if (m->flags & (PG_MARKER | PG_SWAPPED)) {
                        ++count;
                        continue;
                }
                if (vm_page_busy_try(m, TRUE))
                        continue;
-               vm_page_queues_spin_unlock(PQ_INACTIVE);
+               vm_page_queues_spin_unlock(marker->queue);
 
                if ((object = m->object) == NULL) {
                        vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
+                       vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                vm_object_hold(object);
                if (m->object != object) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
+                       vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                if (vm_swapcache_test(m)) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
+                       vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
 
@@ -359,7 +374,7 @@ vm_swapcache_writing(vm_page_t marker)
                if (vp == NULL) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
+                       vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
 
@@ -374,7 +389,7 @@ vm_swapcache_writing(vm_page_t marker)
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
-                               vm_page_queues_spin_lock(PQ_INACTIVE);
+                               vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
 
@@ -388,7 +403,7 @@ vm_swapcache_writing(vm_page_t marker)
                             vm_swapcache_use_chflags)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
-                               vm_page_queues_spin_lock(PQ_INACTIVE);
+                               vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_maxfilesize &&
@@ -396,7 +411,7 @@ vm_swapcache_writing(vm_page_t marker)
                            (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
-                               vm_page_queues_spin_lock(PQ_INACTIVE);
+                               vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 0;
@@ -411,13 +426,13 @@ vm_swapcache_writing(vm_page_t marker)
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
-                               vm_page_queues_spin_lock(PQ_INACTIVE);
+                               vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_meta_enable == 0) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
-                               vm_page_queues_spin_lock(PQ_INACTIVE);
+                               vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 1;
@@ -425,7 +440,7 @@ vm_swapcache_writing(vm_page_t marker)
                default:
                        vm_object_drop(object);
                        vm_page_wakeup(m);
-                       vm_page_queues_spin_lock(PQ_INACTIVE);
+                       vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
 
@@ -441,7 +456,7 @@ vm_swapcache_writing(vm_page_t marker)
                 * Setup for next loop using marker.
                 */
                vm_object_drop(object);
-               vm_page_queues_spin_lock(PQ_INACTIVE);
+               vm_page_queues_spin_lock(marker->queue);
        }
 
        /*
@@ -455,7 +470,7 @@ vm_swapcache_writing(vm_page_t marker)
         */
        if (m == NULL)
                vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
-       vm_page_queues_spin_unlock(PQ_INACTIVE);
+       vm_page_queues_spin_unlock(marker->queue);
 }
 
 /*