kernel - Remove P_SWAPPEDOUT flag and paging mode
[dragonfly.git] / sys / vm / vm_pageout.c
1 /*
2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1991 Regents of the University of California.
35  * All rights reserved.
36  * Copyright (c) 1994 John S. Dyson
37  * All rights reserved.
38  * Copyright (c) 1994 David Greenman
39  * All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * The Mach Operating System project at Carnegie-Mellon University.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
69  *
70  *
71  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96
97 /*
98  * The proverbial page-out daemon, rewritten many times over the decades.
99  */
100
101 #include "opt_vm.h"
102 #include <sys/param.h>
103 #include <sys/systm.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/kthread.h>
107 #include <sys/resourcevar.h>
108 #include <sys/signalvar.h>
109 #include <sys/vnode.h>
110 #include <sys/vmmeter.h>
111 #include <sys/conf.h>
112 #include <sys/sysctl.h>
113
114 #include <vm/vm.h>
115 #include <vm/vm_param.h>
116 #include <sys/lock.h>
117 #include <vm/vm_object.h>
118 #include <vm/vm_page.h>
119 #include <vm/vm_map.h>
120 #include <vm/vm_pageout.h>
121 #include <vm/vm_pager.h>
122 #include <vm/swap_pager.h>
123 #include <vm/vm_extern.h>
124
125 #include <sys/spinlock2.h>
126 #include <vm/vm_page2.h>
127
128 /*
129  * System initialization
130  */
131
132 /* the kernel process "vm_pageout"*/
133 static int vm_pageout_page(vm_page_t m, long *max_launderp,
134                            long *vnodes_skippedp, struct vnode **vpfailedp,
135                            int pass, int vmflush_flags, long *counts);
136 static int vm_pageout_clean_helper (vm_page_t, int);
137 static void vm_pageout_free_page_calc (vm_size_t count);
138 static void vm_pageout_page_free(vm_page_t m) ;
139 __read_frequently struct thread *emergpager;
140 __read_frequently struct thread *pagethread;
141 static int sequence_emerg_pager;
142
143 #if !defined(NO_SWAPPING)
144 /* the kernel process "vm_daemon"*/
145 static void vm_daemon (void);
146 static struct   thread *vmthread;
147
148 static struct kproc_desc vm_kp = {
149         "vmdaemon",
150         vm_daemon,
151         &vmthread
152 };
153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
154 #endif
155
156 __read_mostly int vm_pages_needed = 0;  /* pageout daemon tsleep event */
157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
159 __read_mostly int vm_page_free_hysteresis = 16;
160 __read_mostly static int vm_pagedaemon_time;
161
162 #if !defined(NO_SWAPPING)
163 static int vm_daemon_needed;
164 #endif
165 __read_mostly static int vm_max_launder = 0;
166 __read_mostly static int vm_emerg_launder = 100;
167 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
168 __read_mostly static int vm_pageout_full_stats_interval = 0;
169 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
170 __read_mostly static int defer_swap_pageouts=0;
171 __read_mostly static int disable_swap_pageouts=0;
172 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
173 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
174 __read_mostly static int vm_pageout_debug;
175
176 #if defined(NO_SWAPPING)
177 __read_mostly static int vm_swap_enabled=0;
178 #else
179 __read_mostly static int vm_swap_enabled=1;
180 #endif
181
182 /* 0-disable, 1-passive, 2-active swp, 3-acive swp + single-queue dirty pages*/
183 __read_mostly int vm_pageout_memuse_mode=2;
184 __read_mostly int vm_pageout_allow_active=1;
185
186 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
187         CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
188
189 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
190         CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
191
192 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
193         CTLFLAG_RW, &vm_page_free_hysteresis, 0,
194         "Free more pages than the minimum required");
195
196 SYSCTL_INT(_vm, OID_AUTO, max_launder,
197         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
198 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
199         CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
200
201 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
202         CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
203
204 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
205         CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
206
207 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
208         CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
209
210 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
211         CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
212 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
213         CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
214 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
215         CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
216 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
217         CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
218
219
220 #if defined(NO_SWAPPING)
221 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
222         CTLFLAG_RD, &vm_swap_enabled, 0, "");
223 #else
224 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
225         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
226 #endif
227
228 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
229         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
230
231 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
232         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
233
234 static int pageout_lock_miss;
235 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
236         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
237
238 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
239
240 #if !defined(NO_SWAPPING)
241 static void vm_req_vmdaemon (void);
242 #endif
243 static void vm_pageout_page_stats(int q);
244
245 #define MAXSCAN_DIVIDER         10
246
247 /*
248  * Calculate approximately how many pages on each queue to try to
249  * clean.  An exact calculation creates an edge condition when the
250  * queues are unbalanced so add significant slop.  The queue scans
251  * will stop early when targets are reached and will start where they
252  * left off on the next pass.
253  *
254  * We need to be generous here because there are all sorts of loading
255  * conditions that can cause edge cases if try to average over all queues.
256  * In particular, storage subsystems have become so fast that paging
257  * activity can become quite frantic.  Eventually we will probably need
258  * two paging threads, one for dirty pages and one for clean, to deal
259  * with the bandwidth requirements.
260
261  * So what we do is calculate a value that can be satisfied nominally by
262  * only having to scan half the queues.
263  */
264 static __inline long
265 PQAVERAGE(long n)
266 {
267         long avg;
268
269         if (n >= 0) {
270                 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
271         } else {
272                 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
273         }
274         return avg;
275 }
276
277 /*
278  * vm_pageout_clean_helper:
279  *
280  * Clean the page and remove it from the laundry.  The page must be busied
281  * by the caller and will be disposed of (put away, flushed) by this routine.
282  */
283 static int
284 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
285 {
286         vm_object_t object;
287         vm_page_t mc[BLIST_MAX_ALLOC];
288         int error;
289         int ib, is, page_base;
290         vm_pindex_t pindex = m->pindex;
291
292         object = m->object;
293
294         /*
295          * Don't mess with the page if it's held or special.  Theoretically
296          * we can pageout held pages but there is no real need to press our
297          * luck, so don't.
298          */
299         if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
300                 vm_page_wakeup(m);
301                 return 0;
302         }
303
304         /*
305          * Place page in cluster.  Align cluster for optimal swap space
306          * allocation (whether it is swap or not).  This is typically ~16-32
307          * pages, which also tends to align the cluster to multiples of the
308          * filesystem block size if backed by a filesystem.
309          */
310         page_base = pindex % BLIST_MAX_ALLOC;
311         mc[page_base] = m;
312         ib = page_base - 1;
313         is = page_base + 1;
314
315         /*
316          * Scan object for clusterable pages.
317          *
318          * We can cluster ONLY if: ->> the page is NOT
319          * clean, wired, busy, held, or mapped into a
320          * buffer, and one of the following:
321          * 1) The page is inactive, or a seldom used
322          *    active page.
323          * -or-
324          * 2) we force the issue.
325          *
326          * During heavy mmap/modification loads the pageout
327          * daemon can really fragment the underlying file
328          * due to flushing pages out of order and not trying
329          * align the clusters (which leave sporatic out-of-order
330          * holes).  To solve this problem we do the reverse scan
331          * first and attempt to align our cluster, then do a 
332          * forward scan if room remains.
333          */
334         vm_object_hold(object);
335
336         while (ib >= 0) {
337                 vm_page_t p;
338
339                 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
340                                             TRUE, &error);
341                 if (error || p == NULL)
342                         break;
343                 if ((p->queue - p->pc) == PQ_CACHE ||
344                     (p->flags & PG_UNQUEUED)) {
345                         vm_page_wakeup(p);
346                         break;
347                 }
348                 vm_page_test_dirty(p);
349                 if (((p->dirty & p->valid) == 0 &&
350                      (p->flags & PG_NEED_COMMIT) == 0) ||
351                     p->wire_count != 0 ||       /* may be held by buf cache */
352                     p->hold_count != 0) {       /* may be undergoing I/O */
353                         vm_page_wakeup(p);
354                         break;
355                 }
356                 if (p->queue - p->pc != PQ_INACTIVE) {
357                         if (p->queue - p->pc != PQ_ACTIVE ||
358                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
359                                 vm_page_wakeup(p);
360                                 break;
361                         }
362                 }
363
364                 /*
365                  * Try to maintain page groupings in the cluster.
366                  */
367                 if (m->flags & PG_WINATCFLS)
368                         vm_page_flag_set(p, PG_WINATCFLS);
369                 else
370                         vm_page_flag_clear(p, PG_WINATCFLS);
371                 p->act_count = m->act_count;
372
373                 mc[ib] = p;
374                 --ib;
375         }
376         ++ib;   /* fixup */
377
378         while (is < BLIST_MAX_ALLOC &&
379                pindex - page_base + is < object->size) {
380                 vm_page_t p;
381
382                 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
383                                             TRUE, &error);
384                 if (error || p == NULL)
385                         break;
386                 if (((p->queue - p->pc) == PQ_CACHE) ||
387                     (p->flags & PG_UNQUEUED)) {
388                         vm_page_wakeup(p);
389                         break;
390                 }
391                 vm_page_test_dirty(p);
392                 if (((p->dirty & p->valid) == 0 &&
393                      (p->flags & PG_NEED_COMMIT) == 0) ||
394                     p->wire_count != 0 ||       /* may be held by buf cache */
395                     p->hold_count != 0) {       /* may be undergoing I/O */
396                         vm_page_wakeup(p);
397                         break;
398                 }
399                 if (p->queue - p->pc != PQ_INACTIVE) {
400                         if (p->queue - p->pc != PQ_ACTIVE ||
401                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
402                                 vm_page_wakeup(p);
403                                 break;
404                         }
405                 }
406
407                 /*
408                  * Try to maintain page groupings in the cluster.
409                  */
410                 if (m->flags & PG_WINATCFLS)
411                         vm_page_flag_set(p, PG_WINATCFLS);
412                 else
413                         vm_page_flag_clear(p, PG_WINATCFLS);
414                 p->act_count = m->act_count;
415
416                 mc[is] = p;
417                 ++is;
418         }
419
420         vm_object_drop(object);
421
422         /*
423          * we allow reads during pageouts...
424          */
425         return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
426 }
427
428 /*
429  * vm_pageout_flush() - launder the given pages
430  *
431  *      The given pages are laundered.  Note that we setup for the start of
432  *      I/O ( i.e. busy the page ), mark it read-only, and bump the object
433  *      reference count all in here rather then in the parent.  If we want
434  *      the parent to do more sophisticated things we may have to change
435  *      the ordering.
436  *
437  *      The pages in the array must be busied by the caller and will be
438  *      unbusied by this function.
439  */
440 int
441 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
442 {
443         vm_object_t object;
444         int pageout_status[count];
445         int numpagedout = 0;
446         int i;
447
448         /*
449          * Initiate I/O.  Bump the vm_page_t->busy counter.
450          */
451         for (i = 0; i < count; i++) {
452                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
453                         ("vm_pageout_flush page %p index %d/%d: partially "
454                          "invalid page", mc[i], i, count));
455                 vm_page_io_start(mc[i]);
456         }
457
458         /*
459          * We must make the pages read-only.  This will also force the
460          * modified bit in the related pmaps to be cleared.  The pager
461          * cannot clear the bit for us since the I/O completion code
462          * typically runs from an interrupt.  The act of making the page
463          * read-only handles the case for us.
464          *
465          * Then we can unbusy the pages, we still hold a reference by virtue
466          * of our soft-busy.
467          */
468         for (i = 0; i < count; i++) {
469                 if (vmflush_flags & OBJPC_TRY_TO_CACHE)
470                         vm_page_protect(mc[i], VM_PROT_NONE);
471                 else
472                         vm_page_protect(mc[i], VM_PROT_READ);
473                 vm_page_wakeup(mc[i]);
474         }
475
476         object = mc[0]->object;
477         vm_object_pip_add(object, count);
478
479         vm_pager_put_pages(object, mc, count,
480                            (vmflush_flags |
481                             ((object == &kernel_object) ?
482                                 OBJPC_SYNC : 0)),
483                            pageout_status);
484
485         for (i = 0; i < count; i++) {
486                 vm_page_t mt = mc[i];
487
488                 switch (pageout_status[i]) {
489                 case VM_PAGER_OK:
490                         numpagedout++;
491                         break;
492                 case VM_PAGER_PEND:
493                         numpagedout++;
494                         break;
495                 case VM_PAGER_BAD:
496                         /*
497                          * Page outside of range of object. Right now we
498                          * essentially lose the changes by pretending it
499                          * worked.
500                          */
501                         vm_page_busy_wait(mt, FALSE, "pgbad");
502                         pmap_clear_modify(mt);
503                         vm_page_undirty(mt);
504                         vm_page_wakeup(mt);
505                         break;
506                 case VM_PAGER_ERROR:
507                 case VM_PAGER_FAIL:
508                         /*
509                          * A page typically cannot be paged out when we
510                          * have run out of swap.  We leave the page
511                          * marked inactive and will try to page it out
512                          * again later.
513                          *
514                          * Starvation of the active page list is used to
515                          * determine when the system is massively memory
516                          * starved.
517                          */
518                         break;
519                 case VM_PAGER_AGAIN:
520                         break;
521                 }
522
523                 /*
524                  * If not PENDing this was a synchronous operation and we
525                  * clean up after the I/O.  If it is PENDing the mess is
526                  * cleaned up asynchronously.
527                  *
528                  * Also nominally act on the caller's wishes if the caller
529                  * wants to try to really clean (cache or free) the page.
530                  *
531                  * Also nominally deactivate the page if the system is
532                  * memory-stressed.
533                  */
534                 if (pageout_status[i] != VM_PAGER_PEND) {
535                         vm_page_busy_wait(mt, FALSE, "pgouw");
536                         vm_page_io_finish(mt);
537                         if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
538                                 vm_page_try_to_cache(mt);
539                         } else if (vm_page_count_severe()) {
540                                 vm_page_deactivate(mt);
541                                 vm_page_wakeup(mt);
542                         } else {
543                                 vm_page_wakeup(mt);
544                         }
545                         vm_object_pip_wakeup(object);
546                 }
547         }
548         return numpagedout;
549 }
550
551 #if !defined(NO_SWAPPING)
552
553 /*
554  * Callback function, page busied for us.  We must dispose of the busy
555  * condition.  Any related pmap pages may be held but will not be locked.
556  */
557 static
558 int
559 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
560                         vm_page_t p)
561 {
562         int actcount;
563         int cleanit = 0;
564
565         /*
566          * Basic tests - There should never be a marker, and we can stop
567          *               once the RSS is below the required level.
568          */
569         KKASSERT((p->flags & PG_MARKER) == 0);
570         if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
571                 vm_page_wakeup(p);
572                 return(-1);
573         }
574
575         mycpu->gd_cnt.v_pdpages++;
576
577         if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
578                 vm_page_wakeup(p);
579                 goto done;
580         }
581
582         ++info->actioncount;
583
584         /*
585          * Check if the page has been referened recently.  If it has,
586          * activate it and skip.
587          */
588         actcount = pmap_ts_referenced(p);
589         if (actcount) {
590                 vm_page_flag_set(p, PG_REFERENCED);
591         } else if (p->flags & PG_REFERENCED) {
592                 actcount = 1;
593         }
594
595         if (actcount) {
596                 if (p->queue - p->pc != PQ_ACTIVE) {
597                         vm_page_and_queue_spin_lock(p);
598                         if (p->queue - p->pc != PQ_ACTIVE) {
599                                 vm_page_and_queue_spin_unlock(p);
600                                 vm_page_activate(p);
601                         } else {
602                                 vm_page_and_queue_spin_unlock(p);
603                         }
604                 } else {
605                         p->act_count += actcount;
606                         if (p->act_count > ACT_MAX)
607                                 p->act_count = ACT_MAX;
608                 }
609                 vm_page_flag_clear(p, PG_REFERENCED);
610                 vm_page_wakeup(p);
611                 goto done;
612         }
613
614         /*
615          * Remove the page from this particular pmap.  Once we do this, our
616          * pmap scans will not see it again (unless it gets faulted in), so
617          * we must actively dispose of or deal with the page.
618          */
619         pmap_remove_specific(info->pmap, p);
620
621         /*
622          * If the page is not mapped to another process (i.e. as would be
623          * typical if this were a shared page from a library) then deactivate
624          * the page and clean it in two passes only.
625          *
626          * If the page hasn't been referenced since the last check, remove it
627          * from the pmap.  If it is no longer mapped, deactivate it
628          * immediately, accelerating the normal decline.
629          *
630          * Once the page has been removed from the pmap the RSS code no
631          * longer tracks it so we have to make sure that it is staged for
632          * potential flush action.
633          *
634          * XXX
635          */
636         if ((p->flags & PG_MAPPED) == 0 ||
637             (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
638                 if (p->queue - p->pc == PQ_ACTIVE) {
639                         vm_page_deactivate(p);
640                 }
641                 if (p->queue - p->pc == PQ_INACTIVE) {
642                         cleanit = 1;
643                 }
644         }
645
646         /*
647          * Ok, try to fully clean the page and any nearby pages such that at
648          * least the requested page is freed or moved to the cache queue.
649          *
650          * We usually do this synchronously to allow us to get the page into
651          * the CACHE queue quickly, which will prevent memory exhaustion if
652          * a process with a memoryuse limit is running away.  However, the
653          * sysadmin may desire to set vm.swap_user_async which relaxes this
654          * and improves write performance.
655          */
656         if (cleanit) {
657                 long max_launder = 0x7FFF;
658                 long vnodes_skipped = 0;
659                 long counts[4] = { 0, 0, 0, 0 };
660                 int vmflush_flags;
661                 struct vnode *vpfailed = NULL;
662
663                 info->offset = va;
664
665                 if (vm_pageout_memuse_mode >= 2) {
666                         vmflush_flags = OBJPC_TRY_TO_CACHE |
667                                         OBJPC_ALLOW_ACTIVE;
668                         if (swap_user_async == 0)
669                                 vmflush_flags |= OBJPC_SYNC;
670                         vm_page_flag_set(p, PG_WINATCFLS);
671                         info->cleancount +=
672                                 vm_pageout_page(p, &max_launder,
673                                                 &vnodes_skipped,
674                                                 &vpfailed, 1, vmflush_flags,
675                                                 counts);
676                 } else {
677                         vm_page_wakeup(p);
678                         ++info->cleancount;
679                 }
680         } else {
681                 vm_page_wakeup(p);
682         }
683
684         /*
685          * Must be at end to avoid SMP races.
686          */
687 done:
688         lwkt_user_yield();
689         return 0;
690 }
691
692 /*
693  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits.
694  * that is relatively difficult to do.  We try to keep track of where we
695  * left off last time to reduce scan overhead.
696  *
697  * Called when vm_pageout_memuse_mode is >= 1.
698  */
699 void
700 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
701 {
702         vm_offset_t pgout_offset;
703         struct pmap_pgscan_info info;
704         int retries = 3;
705
706         pgout_offset = map->pgout_offset;
707 again:
708 #if 0
709         kprintf("%016jx ", pgout_offset);
710 #endif
711         if (pgout_offset < VM_MIN_USER_ADDRESS)
712                 pgout_offset = VM_MIN_USER_ADDRESS;
713         if (pgout_offset >= VM_MAX_USER_ADDRESS)
714                 pgout_offset = 0;
715         info.pmap = vm_map_pmap(map);
716         info.limit = limit;
717         info.beg_addr = pgout_offset;
718         info.end_addr = VM_MAX_USER_ADDRESS;
719         info.callback = vm_pageout_mdp_callback;
720         info.cleancount = 0;
721         info.actioncount = 0;
722         info.busycount = 0;
723
724         pmap_pgscan(&info);
725         pgout_offset = info.offset;
726 #if 0
727         kprintf("%016jx %08lx %08lx\n", pgout_offset,
728                 info.cleancount, info.actioncount);
729 #endif
730
731         if (pgout_offset != VM_MAX_USER_ADDRESS &&
732             pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
733                 goto again;
734         } else if (retries &&
735                    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
736                 --retries;
737                 goto again;
738         }
739         map->pgout_offset = pgout_offset;
740 }
741 #endif
742
743 /*
744  * Called when the pageout scan wants to free a page.  We no longer
745  * try to cycle the vm_object here with a reference & dealloc, which can
746  * cause a non-trivial object collapse in a critical path.
747  *
748  * It is unclear why we cycled the ref_count in the past, perhaps to try
749  * to optimize shadow chain collapses but I don't quite see why it would
750  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
751  * synchronously and not have to be kicked-start.
752  */
753 static void
754 vm_pageout_page_free(vm_page_t m) 
755 {
756         vm_page_protect(m, VM_PROT_NONE);
757         vm_page_free(m);
758 }
759
760 /*
761  * vm_pageout_scan does the dirty work for the pageout daemon.
762  */
763 struct vm_pageout_scan_info {
764         struct proc *bigproc;
765         vm_offset_t bigsize;
766 };
767
768 static int vm_pageout_scan_callback(struct proc *p, void *data);
769
770 /*
771  * Scan inactive queue
772  *
773  * WARNING! Can be called from two pagedaemon threads simultaneously.
774  */
775 static int
776 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
777                          long *vnodes_skipped, long *counts)
778 {
779         vm_page_t m;
780         struct vm_page marker;
781         struct vnode *vpfailed;         /* warning, allowed to be stale */
782         long maxscan;
783         long delta = 0;
784         long max_launder;
785         int isep;
786         int vmflush_flags;
787
788         isep = (curthread == emergpager);
789         if ((unsigned)pass > 1000)
790                 pass = 1000;
791
792         /*
793          * This routine is called for each of PQ_L2_SIZE inactive queues.
794          * We want the vm_max_launder parameter to apply to the whole
795          * queue (i.e. per-whole-queue pass, not per-sub-queue).
796          *
797          * In each successive full-pass when the page target is not met we
798          * allow the per-queue max_launder to increase up to a maximum of
799          * vm_max_launder / 16.
800          */
801         if (pass)
802                 max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
803         else
804                 max_launder = (long)vm_max_launder / PQ_L2_SIZE;
805         max_launder /= MAXSCAN_DIVIDER;
806
807         if (max_launder <= 1)
808                 max_launder = 1;
809         if (max_launder >= vm_max_launder / 16)
810                 max_launder = vm_max_launder / 16 + 1;
811
812         /*
813          * Start scanning the inactive queue for pages we can move to the
814          * cache or free.  The scan will stop when the target is reached or
815          * we have scanned the entire inactive queue.  Note that m->act_count
816          * is not used to form decisions for the inactive queue, only for the
817          * active queue.
818          *
819          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
820          *        PAGES.
821          */
822
823         /*
824          * Initialize our marker
825          */
826         bzero(&marker, sizeof(marker));
827         marker.flags = PG_FICTITIOUS | PG_MARKER;
828         marker.busy_count = PBUSY_LOCKED;
829         marker.queue = PQ_INACTIVE + q;
830         marker.pc = q;
831         marker.wire_count = 1;
832
833         /*
834          * Inactive queue scan.
835          *
836          * We pick off approximately 1/10 of each queue.  Each queue is
837          * effectively organized LRU so scanning the entire queue would
838          * improperly pick up pages that might still be in regular use.
839          *
840          * NOTE: The vm_page must be spinlocked before the queue to avoid
841          *       deadlocks, so it is easiest to simply iterate the loop
842          *       with the queue unlocked at the top.
843          */
844         vpfailed = NULL;
845
846         vm_page_queues_spin_lock(PQ_INACTIVE + q);
847         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
848         maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
849
850         /*
851          * Queue locked at top of loop to avoid stack marker issues.
852          */
853         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
854                maxscan-- > 0 && avail_shortage - delta > 0)
855         {
856                 int count;
857
858                 KKASSERT(m->queue == PQ_INACTIVE + q);
859                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
860                              &marker, pageq);
861                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
862                                    &marker, pageq);
863                 mycpu->gd_cnt.v_pdpages++;
864
865                 /*
866                  * Skip marker pages (atomic against other markers to avoid
867                  * infinite hop-over scans).
868                  */
869                 if (m->flags & PG_MARKER)
870                         continue;
871
872                 /*
873                  * Try to busy the page.  Don't mess with pages which are
874                  * already busy or reorder them in the queue.
875                  */
876                 if (vm_page_busy_try(m, TRUE))
877                         continue;
878
879                 /*
880                  * Remaining operations run with the page busy and neither
881                  * the page or the queue will be spin-locked.
882                  */
883                 KKASSERT(m->queue == PQ_INACTIVE + q);
884                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
885
886                 /*
887                  * The emergency pager runs when the primary pager gets
888                  * stuck, which typically means the primary pager deadlocked
889                  * on a vnode-backed page.  Therefore, the emergency pager
890                  * must skip any complex objects.
891                  *
892                  * We disallow VNODEs unless they are VCHR whos device ops
893                  * does not flag D_NOEMERGPGR.
894                  */
895                 if (isep && m->object) {
896                         struct vnode *vp;
897
898                         switch(m->object->type) {
899                         case OBJT_DEFAULT:
900                         case OBJT_SWAP:
901                                 /*
902                                  * Allow anonymous memory and assume that
903                                  * swap devices are not complex, since its
904                                  * kinda worthless if we can't swap out dirty
905                                  * anonymous pages.
906                                  */
907                                 break;
908                         case OBJT_VNODE:
909                                 /*
910                                  * Allow VCHR device if the D_NOEMERGPGR
911                                  * flag is not set, deny other vnode types
912                                  * as being too complex.
913                                  */
914                                 vp = m->object->handle;
915                                 if (vp && vp->v_type == VCHR &&
916                                     vp->v_rdev && vp->v_rdev->si_ops &&
917                                     (vp->v_rdev->si_ops->head.flags &
918                                      D_NOEMERGPGR) == 0) {
919                                         break;
920                                 }
921                                 /* Deny - fall through */
922                         default:
923                                 /*
924                                  * Deny
925                                  */
926                                 vm_page_wakeup(m);
927                                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
928                                 lwkt_yield();
929                                 continue;
930                         }
931                 }
932
933                 /*
934                  * Try to pageout the page and perhaps other nearby pages.
935                  * We want to get the pages into the cache eventually (
936                  * first or second pass).  Otherwise the pages can wind up
937                  * just cycling in the inactive queue, getting flushed over
938                  * and over again.
939                  *
940                  * Generally speaking we recycle dirty pages within PQ_INACTIVE
941                  * twice (double LRU) before paging them out.  If the
942                  * memuse_mode is >= 3 we run them single-LRU like we do clean
943                  * pages.
944                  */
945                 if (vm_pageout_memuse_mode >= 3)
946                         vm_page_flag_set(m, PG_WINATCFLS);
947
948                 vmflush_flags = 0;
949                 if (vm_pageout_allow_active)
950                         vmflush_flags |= OBJPC_ALLOW_ACTIVE;
951                 if (m->flags & PG_WINATCFLS)
952                         vmflush_flags |= OBJPC_TRY_TO_CACHE;
953                 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
954                                         &vpfailed, pass, vmflush_flags, counts);
955                 delta += count;
956
957                 /*
958                  * Systems with a ton of memory can wind up with huge
959                  * deactivation counts.  Because the inactive scan is
960                  * doing a lot of flushing, the combination can result
961                  * in excessive paging even in situations where other
962                  * unrelated threads free up sufficient VM.
963                  *
964                  * To deal with this we abort the nominal active->inactive
965                  * scan before we hit the inactive target when free+cache
966                  * levels have reached a reasonable target.
967                  *
968                  * When deciding to stop early we need to add some slop to
969                  * the test and we need to return full completion to the caller
970                  * to prevent the caller from thinking there is something
971                  * wrong and issuing a low-memory+swap warning or pkill.
972                  *
973                  * A deficit forces paging regardless of the state of the
974                  * VM page queues (used for RSS enforcement).
975                  */
976                 lwkt_yield();
977                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
978                 if (vm_paging_target() < -vm_max_launder) {
979                         /*
980                          * Stopping early, return full completion to caller.
981                          */
982                         if (delta < avail_shortage)
983                                 delta = avail_shortage;
984                         break;
985                 }
986         }
987
988         /* page queue still spin-locked */
989         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
990         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
991
992         return (delta);
993 }
994
995 /*
996  * Pageout the specified page, return the total number of pages paged out
997  * (this routine may cluster).
998  *
999  * The page must be busied and soft-busied by the caller and will be disposed
1000  * of by this function.
1001  */
1002 static int
1003 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1004                 struct vnode **vpfailedp, int pass, int vmflush_flags,
1005                 long *counts)
1006 {
1007         vm_object_t object;
1008         int actcount;
1009         int count = 0;
1010
1011         /*
1012          * Wiring no longer removes a page from its queue.  The last unwiring
1013          * will requeue the page.  Obviously wired pages cannot be paged out
1014          * so unqueue it and return.
1015          */
1016         if (m->wire_count) {
1017                 vm_page_unqueue_nowakeup(m);
1018                 vm_page_wakeup(m);
1019                 return 0;
1020         }
1021
1022         /*
1023          * A held page may be undergoing I/O, so skip it.
1024          */
1025         if (m->hold_count) {
1026                 vm_page_and_queue_spin_lock(m);
1027                 if (m->queue - m->pc == PQ_INACTIVE) {
1028                         TAILQ_REMOVE(
1029                                 &vm_page_queues[m->queue].pl, m, pageq);
1030                         TAILQ_INSERT_TAIL(
1031                                 &vm_page_queues[m->queue].pl, m, pageq);
1032                 }
1033                 vm_page_and_queue_spin_unlock(m);
1034                 vm_page_wakeup(m);
1035                 return 0;
1036         }
1037
1038         if (m->object == NULL || m->object->ref_count == 0) {
1039                 /*
1040                  * If the object is not being used, we ignore previous
1041                  * references.
1042                  */
1043                 vm_page_flag_clear(m, PG_REFERENCED);
1044                 pmap_clear_reference(m);
1045                 /* fall through to end */
1046         } else if (((m->flags & PG_REFERENCED) == 0) &&
1047                     (actcount = pmap_ts_referenced(m))) {
1048                 /*
1049                  * Otherwise, if the page has been referenced while
1050                  * in the inactive queue, we bump the "activation
1051                  * count" upwards, making it less likely that the
1052                  * page will be added back to the inactive queue
1053                  * prematurely again.  Here we check the page tables
1054                  * (or emulated bits, if any), given the upper level
1055                  * VM system not knowing anything about existing
1056                  * references.
1057                  */
1058                 ++counts[3];
1059                 vm_page_activate(m);
1060                 m->act_count += (actcount + ACT_ADVANCE);
1061                 vm_page_wakeup(m);
1062                 return 0;
1063         }
1064
1065         /*
1066          * (m) is still busied.
1067          *
1068          * If the upper level VM system knows about any page
1069          * references, we activate the page.  We also set the
1070          * "activation count" higher than normal so that we will less
1071          * likely place pages back onto the inactive queue again.
1072          */
1073         if ((m->flags & PG_REFERENCED) != 0) {
1074                 vm_page_flag_clear(m, PG_REFERENCED);
1075                 actcount = pmap_ts_referenced(m);
1076                 vm_page_activate(m);
1077                 m->act_count += (actcount + ACT_ADVANCE + 1);
1078                 vm_page_wakeup(m);
1079                 ++counts[3];
1080                 return 0;
1081         }
1082
1083         /*
1084          * If the upper level VM system doesn't know anything about
1085          * the page being dirty, we have to check for it again.  As
1086          * far as the VM code knows, any partially dirty pages are
1087          * fully dirty.
1088          *
1089          * Pages marked PG_WRITEABLE may be mapped into the user
1090          * address space of a process running on another cpu.  A
1091          * user process (without holding the MP lock) running on
1092          * another cpu may be able to touch the page while we are
1093          * trying to remove it.  vm_page_cache() will handle this
1094          * case for us.
1095          */
1096         if (m->dirty == 0) {
1097                 vm_page_test_dirty(m);
1098         } else {
1099                 vm_page_dirty(m);
1100         }
1101
1102         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1103                 /*
1104                  * Invalid pages can be easily freed
1105                  */
1106                 vm_pageout_page_free(m);
1107                 mycpu->gd_cnt.v_dfree++;
1108                 ++count;
1109                 ++counts[1];
1110         } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1111                 /*
1112                  * Clean pages can be placed onto the cache queue.
1113                  * This effectively frees them.
1114                  */
1115                 vm_page_cache(m);
1116                 ++count;
1117                 ++counts[1];
1118         } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1119                 /*
1120                  * Dirty pages need to be paged out, but flushing
1121                  * a page is extremely expensive verses freeing
1122                  * a clean page.  Rather then artificially limiting
1123                  * the number of pages we can flush, we instead give
1124                  * dirty pages extra priority on the inactive queue
1125                  * by forcing them to be cycled through the queue
1126                  * twice before being flushed, after which the
1127                  * (now clean) page will cycle through once more
1128                  * before being freed.  This significantly extends
1129                  * the thrash point for a heavily loaded machine.
1130                  */
1131                 ++counts[2];
1132                 vm_page_flag_set(m, PG_WINATCFLS);
1133                 vm_page_and_queue_spin_lock(m);
1134                 if (m->queue - m->pc == PQ_INACTIVE) {
1135                         TAILQ_REMOVE(
1136                                 &vm_page_queues[m->queue].pl, m, pageq);
1137                         TAILQ_INSERT_TAIL(
1138                                 &vm_page_queues[m->queue].pl, m, pageq);
1139                 }
1140                 vm_page_and_queue_spin_unlock(m);
1141                 vm_page_wakeup(m);
1142         } else if (*max_launderp > 0) {
1143                 /*
1144                  * We always want to try to flush some dirty pages if
1145                  * we encounter them, to keep the system stable.
1146                  * Normally this number is small, but under extreme
1147                  * pressure where there are insufficient clean pages
1148                  * on the inactive queue, we may have to go all out.
1149                  */
1150                 int swap_pageouts_ok;
1151                 struct vnode *vp = NULL;
1152
1153                 if ((m->flags & PG_WINATCFLS) == 0)
1154                         vm_page_flag_set(m, PG_WINATCFLS);
1155                 swap_pageouts_ok = 0;
1156                 object = m->object;
1157                 if (object &&
1158                     (object->type != OBJT_SWAP) &&
1159                     (object->type != OBJT_DEFAULT)) {
1160                         swap_pageouts_ok = 1;
1161                 } else {
1162                         swap_pageouts_ok = !(defer_swap_pageouts ||
1163                                              disable_swap_pageouts);
1164                         swap_pageouts_ok |= (!disable_swap_pageouts &&
1165                                              defer_swap_pageouts &&
1166                                              vm_page_count_min(0));
1167                 }
1168
1169                 /*
1170                  * We don't bother paging objects that are "dead".
1171                  * Those objects are in a "rundown" state.
1172                  */
1173                 if (!swap_pageouts_ok ||
1174                     (object == NULL) ||
1175                     (object->flags & OBJ_DEAD)) {
1176                         vm_page_and_queue_spin_lock(m);
1177                         if (m->queue - m->pc == PQ_INACTIVE) {
1178                                 TAILQ_REMOVE(
1179                                     &vm_page_queues[m->queue].pl,
1180                                     m, pageq);
1181                                 TAILQ_INSERT_TAIL(
1182                                     &vm_page_queues[m->queue].pl,
1183                                     m, pageq);
1184                         }
1185                         vm_page_and_queue_spin_unlock(m);
1186                         vm_page_wakeup(m);
1187                         return 0;
1188                 }
1189
1190                 /*
1191                  * (m) is still busied.
1192                  *
1193                  * The object is already known NOT to be dead.   It
1194                  * is possible for the vget() to block the whole
1195                  * pageout daemon, but the new low-memory handling
1196                  * code should prevent it.
1197                  *
1198                  * The previous code skipped locked vnodes and, worse,
1199                  * reordered pages in the queue.  This results in
1200                  * completely non-deterministic operation because,
1201                  * quite often, a vm_fault has initiated an I/O and
1202                  * is holding a locked vnode at just the point where
1203                  * the pageout daemon is woken up.
1204                  *
1205                  * We can't wait forever for the vnode lock, we might
1206                  * deadlock due to a vn_read() getting stuck in
1207                  * vm_wait while holding this vnode.  We skip the
1208                  * vnode if we can't get it in a reasonable amount
1209                  * of time.
1210                  *
1211                  * vpfailed is used to (try to) avoid the case where
1212                  * a large number of pages are associated with a
1213                  * locked vnode, which could cause the pageout daemon
1214                  * to stall for an excessive amount of time.
1215                  */
1216                 if (object->type == OBJT_VNODE) {
1217                         int flags;
1218
1219                         vp = object->handle;
1220                         flags = LK_EXCLUSIVE;
1221                         if (vp == *vpfailedp)
1222                                 flags |= LK_NOWAIT;
1223                         else
1224                                 flags |= LK_TIMELOCK;
1225                         vm_page_hold(m);
1226                         vm_page_wakeup(m);
1227
1228                         /*
1229                          * We have unbusied (m) temporarily so we can
1230                          * acquire the vp lock without deadlocking.
1231                          * (m) is held to prevent destruction.
1232                          */
1233                         if (vget(vp, flags) != 0) {
1234                                 *vpfailedp = vp;
1235                                 ++pageout_lock_miss;
1236                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1237                                             ++*vnodes_skippedp;
1238                                 vm_page_unhold(m);
1239                                 return 0;
1240                         }
1241
1242                         /*
1243                          * The page might have been moved to another
1244                          * queue during potential blocking in vget()
1245                          * above.  The page might have been freed and
1246                          * reused for another vnode.  The object might
1247                          * have been reused for another vnode.
1248                          */
1249                         if (m->queue - m->pc != PQ_INACTIVE ||
1250                             m->object != object ||
1251                             object->handle != vp) {
1252                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1253                                         ++*vnodes_skippedp;
1254                                 vput(vp);
1255                                 vm_page_unhold(m);
1256                                 return 0;
1257                         }
1258
1259                         /*
1260                          * The page may have been busied during the
1261                          * blocking in vput();  We don't move the
1262                          * page back onto the end of the queue so that
1263                          * statistics are more correct if we don't.
1264                          */
1265                         if (vm_page_busy_try(m, TRUE)) {
1266                                 vput(vp);
1267                                 vm_page_unhold(m);
1268                                 return 0;
1269                         }
1270                         vm_page_unhold(m);
1271
1272                         /*
1273                          * If it was wired while we didn't own it.
1274                          */
1275                         if (m->wire_count) {
1276                                 vm_page_unqueue_nowakeup(m);
1277                                 vput(vp);
1278                                 vm_page_wakeup(m);
1279                                 return 0;
1280                         }
1281
1282                         /*
1283                          * (m) is busied again
1284                          *
1285                          * We own the busy bit and remove our hold
1286                          * bit.  If the page is still held it
1287                          * might be undergoing I/O, so skip it.
1288                          */
1289                         if (m->hold_count) {
1290 rebusy_failed:
1291                                 vm_page_and_queue_spin_lock(m);
1292                                 if (m->queue - m->pc == PQ_INACTIVE) {
1293                                         TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1294                                         TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1295                                 }
1296                                 vm_page_and_queue_spin_unlock(m);
1297                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1298                                         ++*vnodes_skippedp;
1299                                 vm_page_wakeup(m);
1300                                 vput(vp);
1301                                 return 0;
1302                         }
1303
1304                         /*
1305                          * Recheck queue, object, and vp now that we have
1306                          * rebusied the page.
1307                          */
1308                         if (m->queue - m->pc != PQ_INACTIVE ||
1309                             m->object != object ||
1310                             object->handle != vp) {
1311                                 kprintf("vm_pageout_page: "
1312                                         "rebusy %p failed(A)\n",
1313                                         m);
1314                                 goto rebusy_failed;
1315                         }
1316
1317                         /*
1318                          * Check page validity
1319                          */
1320                         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1321                                 kprintf("vm_pageout_page: "
1322                                         "rebusy %p failed(B)\n",
1323                                         m);
1324                                 goto rebusy_failed;
1325                         }
1326                         if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1327                                 kprintf("vm_pageout_page: "
1328                                         "rebusy %p failed(C)\n",
1329                                         m);
1330                                 goto rebusy_failed;
1331                         }
1332
1333                         /* (m) is left busied as we fall through */
1334                 }
1335
1336                 /*
1337                  * page is busy and not held here.
1338                  *
1339                  * If a page is dirty, then it is either being washed
1340                  * (but not yet cleaned) or it is still in the
1341                  * laundry.  If it is still in the laundry, then we
1342                  * start the cleaning operation.
1343                  *
1344                  * decrement inactive_shortage on success to account
1345                  * for the (future) cleaned page.  Otherwise we
1346                  * could wind up laundering or cleaning too many
1347                  * pages.
1348                  *
1349                  * NOTE: Cleaning the page here does not cause
1350                  *       force_deficit to be adjusted, because the
1351                  *       page is not being freed or moved to the
1352                  *       cache.
1353                  */
1354                 count = vm_pageout_clean_helper(m, vmflush_flags);
1355                 counts[0] += count;
1356                 *max_launderp -= count;
1357
1358                 /*
1359                  * Clean ate busy, page no longer accessible
1360                  */
1361                 if (vp != NULL)
1362                         vput(vp);
1363         } else {
1364                 vm_page_wakeup(m);
1365         }
1366         return count;
1367 }
1368
1369 /*
1370  * Scan active queue
1371  *
1372  * WARNING! Can be called from two pagedaemon threads simultaneously.
1373  */
1374 static int
1375 vm_pageout_scan_active(int pass, int q,
1376                        long avail_shortage, long inactive_shortage,
1377                        long *recycle_countp)
1378 {
1379         struct vm_page marker;
1380         vm_page_t m;
1381         int actcount;
1382         long delta = 0;
1383         long maxscan;
1384         int isep;
1385
1386         isep = (curthread == emergpager);
1387
1388         /*
1389          * We want to move pages from the active queue to the inactive
1390          * queue to get the inactive queue to the inactive target.  If
1391          * we still have a page shortage from above we try to directly free
1392          * clean pages instead of moving them.
1393          *
1394          * If we do still have a shortage we keep track of the number of
1395          * pages we free or cache (recycle_count) as a measure of thrashing
1396          * between the active and inactive queues.
1397          *
1398          * If we were able to completely satisfy the free+cache targets
1399          * from the inactive pool we limit the number of pages we move
1400          * from the active pool to the inactive pool to 2x the pages we
1401          * had removed from the inactive pool (with a minimum of 1/5 the
1402          * inactive target).  If we were not able to completely satisfy
1403          * the free+cache targets we go for the whole target aggressively.
1404          *
1405          * NOTE: Both variables can end up negative.
1406          * NOTE: We are still in a critical section.
1407          *
1408          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1409          *        PAGES.
1410          */
1411
1412         bzero(&marker, sizeof(marker));
1413         marker.flags = PG_FICTITIOUS | PG_MARKER;
1414         marker.busy_count = PBUSY_LOCKED;
1415         marker.queue = PQ_ACTIVE + q;
1416         marker.pc = q;
1417         marker.wire_count = 1;
1418
1419         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1420         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1421         maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
1422
1423         /*
1424          * Queue locked at top of loop to avoid stack marker issues.
1425          */
1426         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1427                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1428                                 inactive_shortage > 0))
1429         {
1430                 KKASSERT(m->queue == PQ_ACTIVE + q);
1431                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1432                              &marker, pageq);
1433                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1434                                    &marker, pageq);
1435
1436                 /*
1437                  * Skip marker pages (atomic against other markers to avoid
1438                  * infinite hop-over scans).
1439                  */
1440                 if (m->flags & PG_MARKER)
1441                         continue;
1442
1443                 /*
1444                  * Try to busy the page.  Don't mess with pages which are
1445                  * already busy or reorder them in the queue.
1446                  */
1447                 if (vm_page_busy_try(m, TRUE))
1448                         continue;
1449
1450                 /*
1451                  * Remaining operations run with the page busy and neither
1452                  * the page or the queue will be spin-locked.
1453                  */
1454                 KKASSERT(m->queue == PQ_ACTIVE + q);
1455                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1456
1457 #if 0
1458                 /*
1459                  * Don't deactivate pages that are held, even if we can
1460                  * busy them.  (XXX why not?)
1461                  */
1462                 if (m->hold_count) {
1463                         vm_page_and_queue_spin_lock(m);
1464                         if (m->queue - m->pc == PQ_ACTIVE) {
1465                                 TAILQ_REMOVE(
1466                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1467                                         m, pageq);
1468                                 TAILQ_INSERT_TAIL(
1469                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1470                                         m, pageq);
1471                         }
1472                         vm_page_and_queue_spin_unlock(m);
1473                         vm_page_wakeup(m);
1474                         goto next;
1475                 }
1476 #endif
1477                 /*
1478                  * We can just remove wired pages from the queue
1479                  */
1480                 if (m->wire_count) {
1481                         vm_page_unqueue_nowakeup(m);
1482                         vm_page_wakeup(m);
1483                         goto next;
1484                 }
1485
1486                 /*
1487                  * The emergency pager ignores vnode-backed pages as these
1488                  * are the pages that probably bricked the main pager.
1489                  */
1490                 if (isep && m->object && m->object->type == OBJT_VNODE) {
1491                         vm_page_and_queue_spin_lock(m);
1492                         if (m->queue - m->pc == PQ_ACTIVE) {
1493                                 TAILQ_REMOVE(
1494                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1495                                         m, pageq);
1496                                 TAILQ_INSERT_TAIL(
1497                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1498                                         m, pageq);
1499                         }
1500                         vm_page_and_queue_spin_unlock(m);
1501                         vm_page_wakeup(m);
1502                         goto next;
1503                 }
1504
1505                 /*
1506                  * The count for pagedaemon pages is done after checking the
1507                  * page for eligibility...
1508                  */
1509                 mycpu->gd_cnt.v_pdpages++;
1510
1511                 /*
1512                  * Check to see "how much" the page has been used and clear
1513                  * the tracking access bits.  If the object has no references
1514                  * don't bother paying the expense.
1515                  */
1516                 actcount = 0;
1517                 if (m->object && m->object->ref_count != 0) {
1518                         if (m->flags & PG_REFERENCED)
1519                                 ++actcount;
1520                         actcount += pmap_ts_referenced(m);
1521                         if (actcount) {
1522                                 m->act_count += ACT_ADVANCE + actcount;
1523                                 if (m->act_count > ACT_MAX)
1524                                         m->act_count = ACT_MAX;
1525                         }
1526                 }
1527                 vm_page_flag_clear(m, PG_REFERENCED);
1528
1529                 /*
1530                  * actcount is only valid if the object ref_count is non-zero.
1531                  * If the page does not have an object, actcount will be zero.
1532                  */
1533                 if (actcount && m->object->ref_count != 0) {
1534                         vm_page_and_queue_spin_lock(m);
1535                         if (m->queue - m->pc == PQ_ACTIVE) {
1536                                 TAILQ_REMOVE(
1537                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1538                                         m, pageq);
1539                                 TAILQ_INSERT_TAIL(
1540                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1541                                         m, pageq);
1542                         }
1543                         vm_page_and_queue_spin_unlock(m);
1544                         vm_page_wakeup(m);
1545                 } else {
1546                         switch(m->object->type) {
1547                         case OBJT_DEFAULT:
1548                         case OBJT_SWAP:
1549                                 m->act_count -= min(m->act_count,
1550                                                     vm_anonmem_decline);
1551                                 break;
1552                         default:
1553                                 m->act_count -= min(m->act_count,
1554                                                     vm_filemem_decline);
1555                                 break;
1556                         }
1557                         if (vm_pageout_algorithm ||
1558                             (m->object == NULL) ||
1559                             (m->object && (m->object->ref_count == 0)) ||
1560                             m->act_count < pass + 1
1561                         ) {
1562                                 /*
1563                                  * Deactivate the page.  If we had a
1564                                  * shortage from our inactive scan try to
1565                                  * free (cache) the page instead.
1566                                  *
1567                                  * Don't just blindly cache the page if
1568                                  * we do not have a shortage from the
1569                                  * inactive scan, that could lead to
1570                                  * gigabytes being moved.
1571                                  */
1572                                 --inactive_shortage;
1573                                 if (avail_shortage - delta > 0 ||
1574                                     (m->object && (m->object->ref_count == 0)))
1575                                 {
1576                                         if (avail_shortage - delta > 0)
1577                                                 ++*recycle_countp;
1578                                         vm_page_protect(m, VM_PROT_NONE);
1579                                         if (m->dirty == 0 &&
1580                                             (m->flags & PG_NEED_COMMIT) == 0 &&
1581                                             avail_shortage - delta > 0) {
1582                                                 vm_page_cache(m);
1583                                         } else {
1584                                                 vm_page_deactivate(m);
1585                                                 vm_page_wakeup(m);
1586                                         }
1587                                 } else {
1588                                         vm_page_deactivate(m);
1589                                         vm_page_wakeup(m);
1590                                 }
1591                                 ++delta;
1592                         } else {
1593                                 vm_page_and_queue_spin_lock(m);
1594                                 if (m->queue - m->pc == PQ_ACTIVE) {
1595                                         TAILQ_REMOVE(
1596                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1597                                             m, pageq);
1598                                         TAILQ_INSERT_TAIL(
1599                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1600                                             m, pageq);
1601                                 }
1602                                 vm_page_and_queue_spin_unlock(m);
1603                                 vm_page_wakeup(m);
1604                         }
1605                 }
1606 next:
1607                 lwkt_yield();
1608                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1609         }
1610
1611         /*
1612          * Clean out our local marker.
1613          *
1614          * Page queue still spin-locked.
1615          */
1616         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1617         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1618
1619         return (delta);
1620 }
1621
1622 /*
1623  * The number of actually free pages can drop down to v_free_reserved,
1624  * we try to build the free count back above v_free_min.  Note that
1625  * vm_paging_needed() also returns TRUE if v_free_count is not at
1626  * least v_free_min so that is the minimum we must build the free
1627  * count to.
1628  *
1629  * We use a slightly higher target to improve hysteresis,
1630  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1631  * is usually the same as v_cache_min this maintains about
1632  * half the pages in the free queue as are in the cache queue,
1633  * providing pretty good pipelining for pageout operation.
1634  *
1635  * The system operator can manipulate vm.v_cache_min and
1636  * vm.v_free_target to tune the pageout demon.  Be sure
1637  * to keep vm.v_free_min < vm.v_free_target.
1638  *
1639  * Note that the original paging target is to get at least
1640  * (free_min + cache_min) into (free + cache).  The slightly
1641  * higher target will shift additional pages from cache to free
1642  * without effecting the original paging target in order to
1643  * maintain better hysteresis and not have the free count always
1644  * be dead-on v_free_min.
1645  *
1646  * NOTE: we are still in a critical section.
1647  *
1648  * Pages moved from PQ_CACHE to totally free are not counted in the
1649  * pages_freed counter.
1650  *
1651  * WARNING! Can be called from two pagedaemon threads simultaneously.
1652  */
1653 static void
1654 vm_pageout_scan_cache(long avail_shortage, int pass,
1655                       long vnodes_skipped, long recycle_count)
1656 {
1657         static int lastkillticks;
1658         struct vm_pageout_scan_info info;
1659         vm_page_t m;
1660         int isep;
1661
1662         isep = (curthread == emergpager);
1663
1664         while (vmstats.v_free_count <
1665                (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1666                 /*
1667                  * This steals some code from vm/vm_page.c
1668                  *
1669                  * Create two rovers and adjust the code to reduce
1670                  * chances of them winding up at the same index (which
1671                  * can cause a lot of contention).
1672                  */
1673                 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1674
1675                 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1676                         goto next_rover;
1677
1678                 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1679                 if (m == NULL)
1680                         break;
1681                 /*
1682                  * page is returned removed from its queue and spinlocked
1683                  *
1684                  * If the busy attempt fails we can still deactivate the page.
1685                  */
1686                 if (vm_page_busy_try(m, TRUE)) {
1687                         vm_page_deactivate_locked(m);
1688                         vm_page_spin_unlock(m);
1689                         continue;
1690                 }
1691                 vm_page_spin_unlock(m);
1692                 pagedaemon_wakeup();
1693                 lwkt_yield();
1694
1695                 /*
1696                  * Remaining operations run with the page busy and neither
1697                  * the page or the queue will be spin-locked.
1698                  */
1699                 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
1700                     m->hold_count ||
1701                     m->wire_count) {
1702                         vm_page_deactivate(m);
1703                         vm_page_wakeup(m);
1704                         continue;
1705                 }
1706
1707                 /*
1708                  * Because the page is in the cache, it shouldn't be mapped.
1709                  */
1710                 pmap_mapped_sync(m);
1711                 KKASSERT((m->flags & PG_MAPPED) == 0);
1712                 KKASSERT(m->dirty == 0);
1713                 vm_pageout_page_free(m);
1714                 mycpu->gd_cnt.v_dfree++;
1715 next_rover:
1716                 if (isep)
1717                         cache_rover[1] -= PQ_PRIME2;
1718                 else
1719                         cache_rover[0] += PQ_PRIME2;
1720         }
1721
1722         /*
1723          * If we didn't get enough free pages, and we have skipped a vnode
1724          * in a writeable object, wakeup the sync daemon.  And kick swapout
1725          * if we did not get enough free pages.
1726          */
1727         if (vm_paging_target() > 0) {
1728                 if (vnodes_skipped && vm_page_count_min(0))
1729                         speedup_syncer(NULL);
1730 #if !defined(NO_SWAPPING)
1731                 if (vm_swap_enabled && vm_page_count_target())
1732                         vm_req_vmdaemon();
1733 #endif
1734         }
1735
1736         /*
1737          * Handle catastrophic conditions.  Under good conditions we should
1738          * be at the target, well beyond our minimum.  If we could not even
1739          * reach our minimum the system is under heavy stress.  But just being
1740          * under heavy stress does not trigger process killing.
1741          *
1742          * We consider ourselves to have run out of memory if the swap pager
1743          * is full and avail_shortage is still positive.  The secondary check
1744          * ensures that we do not kill processes if the instantanious
1745          * availability is good, even if the pageout demon pass says it
1746          * couldn't get to the target.
1747          *
1748          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1749          *        SITUATIONS.
1750          */
1751         if (swap_pager_almost_full &&
1752             pass > 0 &&
1753             isep == 0 &&
1754             (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1755                 kprintf("Warning: system low on memory+swap "
1756                         "shortage %ld for %d ticks!\n",
1757                         avail_shortage, ticks - swap_fail_ticks);
1758                 if (bootverbose)
1759                 kprintf("Metrics: spaf=%d spf=%d pass=%d "
1760                         "avail=%ld target=%ld last=%u\n",
1761                         swap_pager_almost_full,
1762                         swap_pager_full,
1763                         pass,
1764                         avail_shortage,
1765                         vm_paging_target(),
1766                         (unsigned int)(ticks - lastkillticks));
1767         }
1768         if (swap_pager_full &&
1769             pass > 1 &&
1770             isep == 0 &&
1771             avail_shortage > 0 &&
1772             vm_paging_target() > 0 &&
1773             (unsigned int)(ticks - lastkillticks) >= hz) {
1774                 /*
1775                  * Kill something, maximum rate once per second to give
1776                  * the process time to free up sufficient memory.
1777                  */
1778                 lastkillticks = ticks;
1779                 info.bigproc = NULL;
1780                 info.bigsize = 0;
1781                 allproc_scan(vm_pageout_scan_callback, &info, 0);
1782                 if (info.bigproc != NULL) {
1783                         kprintf("Try to kill process %d %s\n",
1784                                 info.bigproc->p_pid, info.bigproc->p_comm);
1785                         info.bigproc->p_nice = PRIO_MIN;
1786                         info.bigproc->p_usched->resetpriority(
1787                                 FIRST_LWP_IN_PROC(info.bigproc));
1788                         atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1789                         killproc(info.bigproc, "out of swap space");
1790                         wakeup(&vmstats.v_free_count);
1791                         PRELE(info.bigproc);
1792                 }
1793         }
1794 }
1795
1796 static int
1797 vm_pageout_scan_callback(struct proc *p, void *data)
1798 {
1799         struct vm_pageout_scan_info *info = data;
1800         vm_offset_t size;
1801
1802         /*
1803          * Never kill system processes or init.  If we have configured swap
1804          * then try to avoid killing low-numbered pids.
1805          */
1806         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1807             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1808                 return (0);
1809         }
1810
1811         lwkt_gettoken(&p->p_token);
1812
1813         /*
1814          * if the process is in a non-running type state,
1815          * don't touch it.
1816          */
1817         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1818                 lwkt_reltoken(&p->p_token);
1819                 return (0);
1820         }
1821
1822         /*
1823          * Get the approximate process size.  Note that anonymous pages
1824          * with backing swap will be counted twice, but there should not
1825          * be too many such pages due to the stress the VM system is
1826          * under at this point.
1827          */
1828         size = vmspace_anonymous_count(p->p_vmspace) +
1829                 vmspace_swap_count(p->p_vmspace);
1830
1831         /*
1832          * If the this process is bigger than the biggest one
1833          * remember it.
1834          */
1835         if (info->bigsize < size) {
1836                 if (info->bigproc)
1837                         PRELE(info->bigproc);
1838                 PHOLD(p);
1839                 info->bigproc = p;
1840                 info->bigsize = size;
1841         }
1842         lwkt_reltoken(&p->p_token);
1843         lwkt_yield();
1844
1845         return(0);
1846 }
1847
1848 /*
1849  * This old guy slowly walks PQ_HOLD looking for pages which need to be
1850  * moved back to PQ_FREE.  It is possible for pages to accumulate here
1851  * when vm_page_free() races against vm_page_unhold(), resulting in a
1852  * page being left on a PQ_HOLD queue with hold_count == 0.
1853  *
1854  * It is easier to handle this edge condition here, in non-critical code,
1855  * rather than enforce a spin-lock for every 1->0 transition in
1856  * vm_page_unhold().
1857  *
1858  * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1859  */
1860 static void
1861 vm_pageout_scan_hold(int q)
1862 {
1863         vm_page_t m;
1864
1865         vm_page_queues_spin_lock(PQ_HOLD + q);
1866         TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
1867                 if (m->flags & PG_MARKER)
1868                         continue;
1869
1870                 /*
1871                  * Process one page and return
1872                  */
1873                 if (m->hold_count)
1874                         break;
1875                 kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
1876                 vm_page_hold(m);
1877                 vm_page_queues_spin_unlock(PQ_HOLD + q);
1878                 vm_page_unhold(m);      /* reprocess */
1879                 return;
1880         }
1881         vm_page_queues_spin_unlock(PQ_HOLD + q);
1882 }
1883
1884 /*
1885  * This routine tries to maintain the pseudo LRU active queue,
1886  * so that during long periods of time where there is no paging,
1887  * that some statistic accumulation still occurs.  This code
1888  * helps the situation where paging just starts to occur.
1889  */
1890 static void
1891 vm_pageout_page_stats(int q)
1892 {
1893         static int fullintervalcount = 0;
1894         struct vm_page marker;
1895         vm_page_t m;
1896         long pcount, tpcount;           /* Number of pages to check */
1897         long page_shortage;
1898
1899         page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1900                          vmstats.v_free_min) -
1901                         (vmstats.v_free_count + vmstats.v_inactive_count +
1902                          vmstats.v_cache_count);
1903
1904         if (page_shortage <= 0)
1905                 return;
1906
1907         pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1908         fullintervalcount += vm_pageout_stats_interval;
1909         if (fullintervalcount < vm_pageout_full_stats_interval) {
1910                 tpcount = (vm_pageout_stats_max * pcount) /
1911                           vmstats.v_page_count + 1;
1912                 if (pcount > tpcount)
1913                         pcount = tpcount;
1914         } else {
1915                 fullintervalcount = 0;
1916         }
1917
1918         bzero(&marker, sizeof(marker));
1919         marker.flags = PG_FICTITIOUS | PG_MARKER;
1920         marker.busy_count = PBUSY_LOCKED;
1921         marker.queue = PQ_ACTIVE + q;
1922         marker.pc = q;
1923         marker.wire_count = 1;
1924
1925         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1926         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1927
1928         /*
1929          * Queue locked at top of loop to avoid stack marker issues.
1930          */
1931         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1932                pcount-- > 0)
1933         {
1934                 int actcount;
1935
1936                 KKASSERT(m->queue == PQ_ACTIVE + q);
1937                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1938                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1939                                    &marker, pageq);
1940
1941                 /*
1942                  * Skip marker pages (atomic against other markers to avoid
1943                  * infinite hop-over scans).
1944                  */
1945                 if (m->flags & PG_MARKER)
1946                         continue;
1947
1948                 /*
1949                  * Ignore pages we can't busy
1950                  */
1951                 if (vm_page_busy_try(m, TRUE))
1952                         continue;
1953
1954                 /*
1955                  * Remaining operations run with the page busy and neither
1956                  * the page or the queue will be spin-locked.
1957                  */
1958                 KKASSERT(m->queue == PQ_ACTIVE + q);
1959                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1960
1961                 /*
1962                  * We can just remove wired pages from the queue
1963                  */
1964                 if (m->wire_count) {
1965                         vm_page_unqueue_nowakeup(m);
1966                         vm_page_wakeup(m);
1967                         goto next;
1968                 }
1969
1970
1971                 /*
1972                  * We now have a safely busied page, the page and queue
1973                  * spinlocks have been released.
1974                  *
1975                  * Ignore held and wired pages
1976                  */
1977                 if (m->hold_count || m->wire_count) {
1978                         vm_page_wakeup(m);
1979                         goto next;
1980                 }
1981
1982                 /*
1983                  * Calculate activity
1984                  */
1985                 actcount = 0;
1986                 if (m->flags & PG_REFERENCED) {
1987                         vm_page_flag_clear(m, PG_REFERENCED);
1988                         actcount += 1;
1989                 }
1990                 actcount += pmap_ts_referenced(m);
1991
1992                 /*
1993                  * Update act_count and move page to end of queue.
1994                  */
1995                 if (actcount) {
1996                         m->act_count += ACT_ADVANCE + actcount;
1997                         if (m->act_count > ACT_MAX)
1998                                 m->act_count = ACT_MAX;
1999                         vm_page_and_queue_spin_lock(m);
2000                         if (m->queue - m->pc == PQ_ACTIVE) {
2001                                 TAILQ_REMOVE(
2002                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2003                                         m, pageq);
2004                                 TAILQ_INSERT_TAIL(
2005                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2006                                         m, pageq);
2007                         }
2008                         vm_page_and_queue_spin_unlock(m);
2009                         vm_page_wakeup(m);
2010                         goto next;
2011                 }
2012
2013                 if (m->act_count == 0) {
2014                         /*
2015                          * We turn off page access, so that we have
2016                          * more accurate RSS stats.  We don't do this
2017                          * in the normal page deactivation when the
2018                          * system is loaded VM wise, because the
2019                          * cost of the large number of page protect
2020                          * operations would be higher than the value
2021                          * of doing the operation.
2022                          *
2023                          * We use the marker to save our place so
2024                          * we can release the spin lock.  both (m)
2025                          * and (next) will be invalid.
2026                          */
2027                         vm_page_protect(m, VM_PROT_NONE);
2028                         vm_page_deactivate(m);
2029                 } else {
2030                         m->act_count -= min(m->act_count, ACT_DECLINE);
2031                         vm_page_and_queue_spin_lock(m);
2032                         if (m->queue - m->pc == PQ_ACTIVE) {
2033                                 TAILQ_REMOVE(
2034                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2035                                         m, pageq);
2036                                 TAILQ_INSERT_TAIL(
2037                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2038                                         m, pageq);
2039                         }
2040                         vm_page_and_queue_spin_unlock(m);
2041                 }
2042                 vm_page_wakeup(m);
2043 next:
2044                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2045         }
2046
2047         /*
2048          * Remove our local marker
2049          *
2050          * Page queue still spin-locked.
2051          */
2052         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
2053         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2054 }
2055
2056 static void
2057 vm_pageout_free_page_calc(vm_size_t count)
2058 {
2059         /*
2060          * v_free_min           normal allocations
2061          * v_free_reserved      system allocations
2062          * v_pageout_free_min   allocations by pageout daemon
2063          * v_interrupt_free_min low level allocations (e.g swap structures)
2064          *
2065          * v_free_min is used to generate several other baselines, and they
2066          * can get pretty silly on systems with a lot of memory.
2067          */
2068         vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2069         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2070         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2071         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2072         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
2073 }
2074
2075
2076 /*
2077  * vm_pageout is the high level pageout daemon.  TWO kernel threads run
2078  * this daemon, the primary pageout daemon and the emergency pageout daemon.
2079  *
2080  * The emergency pageout daemon takes over when the primary pageout daemon
2081  * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
2082  * avoiding the many low-memory deadlocks which can occur when paging out
2083  * to VFS's.
2084  */
2085 static void
2086 vm_pageout_thread(void)
2087 {
2088         int pass;
2089         int q;
2090         int q1iterator = 0;
2091         int q2iterator = 0;
2092         int q3iterator = 0;
2093         int isep;
2094
2095         curthread->td_flags |= TDF_SYSTHREAD;
2096
2097         /*
2098          * We only need to setup once.
2099          */
2100         isep = 0;
2101         if (curthread == emergpager) {
2102                 isep = 1;
2103                 goto skip_setup;
2104         }
2105
2106         /*
2107          * Initialize vm_max_launder per pageout pass to be 1/16
2108          * of total physical memory, plus a little slop.
2109          */
2110         if (vm_max_launder == 0)
2111                 vm_max_launder = physmem / 256 + 16;
2112
2113         /*
2114          * Initialize some paging parameters.
2115          */
2116         vm_pageout_free_page_calc(vmstats.v_page_count);
2117
2118         /*
2119          * v_free_target and v_cache_min control pageout hysteresis.  Note
2120          * that these are more a measure of the VM cache queue hysteresis
2121          * then the VM free queue.  Specifically, v_free_target is the
2122          * high water mark (free+cache pages).
2123          *
2124          * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2125          * low water mark, while v_free_min is the stop.  v_cache_min must
2126          * be big enough to handle memory needs while the pageout daemon
2127          * is signalled and run to free more pages.
2128          */
2129         vmstats.v_free_target = 4 * vmstats.v_free_min +
2130                                 vmstats.v_free_reserved;
2131
2132         /*
2133          * NOTE: With the new buffer cache b_act_count we want the default
2134          *       inactive target to be a percentage of available memory.
2135          *
2136          *       The inactive target essentially determines the minimum
2137          *       number of 'temporary' pages capable of caching one-time-use
2138          *       files when the VM system is otherwise full of pages
2139          *       belonging to multi-time-use files or active program data.
2140          *
2141          * NOTE: The inactive target is aggressively persued only if the
2142          *       inactive queue becomes too small.  If the inactive queue
2143          *       is large enough to satisfy page movement to free+cache
2144          *       then it is repopulated more slowly from the active queue.
2145          *       This allows a general inactive_target default to be set.
2146          *
2147          *       There is an issue here for processes which sit mostly idle
2148          *       'overnight', such as sshd, tcsh, and X.  Any movement from
2149          *       the active queue will eventually cause such pages to
2150          *       recycle eventually causing a lot of paging in the morning.
2151          *       To reduce the incidence of this pages cycled out of the
2152          *       buffer cache are moved directly to the inactive queue if
2153          *       they were only used once or twice.
2154          *
2155          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
2156          *       Increasing the value (up to 64) increases the number of
2157          *       buffer recyclements which go directly to the inactive queue.
2158          */
2159         if (vmstats.v_free_count > 2048) {
2160                 vmstats.v_cache_min = vmstats.v_free_target;
2161                 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2162         } else {
2163                 vmstats.v_cache_min = 0;
2164                 vmstats.v_cache_max = 0;
2165         }
2166         vmstats.v_inactive_target = vmstats.v_free_count / 4;
2167
2168         /* XXX does not really belong here */
2169         if (vm_page_max_wired == 0)
2170                 vm_page_max_wired = vmstats.v_free_count / 3;
2171
2172         if (vm_pageout_stats_max == 0)
2173                 vm_pageout_stats_max = vmstats.v_free_target;
2174
2175         /*
2176          * Set interval in seconds for stats scan.
2177          */
2178         if (vm_pageout_stats_interval == 0)
2179                 vm_pageout_stats_interval = 5;
2180         if (vm_pageout_full_stats_interval == 0)
2181                 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2182         
2183
2184         /*
2185          * Set maximum free per pass
2186          */
2187         if (vm_pageout_stats_free_max == 0)
2188                 vm_pageout_stats_free_max = 5;
2189
2190         swap_pager_swap_init();
2191         pass = 0;
2192
2193         atomic_swap_int(&sequence_emerg_pager, 1);
2194         wakeup(&sequence_emerg_pager);
2195
2196 skip_setup:
2197         /*
2198          * Sequence emergency pager startup
2199          */
2200         if (isep) {
2201                 while (sequence_emerg_pager == 0)
2202                         tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2203         }
2204
2205         /*
2206          * The pageout daemon is never done, so loop forever.
2207          *
2208          * WARNING!  This code is being executed by two kernel threads
2209          *           potentially simultaneously.
2210          */
2211         while (TRUE) {
2212                 int error;
2213                 long avail_shortage;
2214                 long inactive_shortage;
2215                 long vnodes_skipped = 0;
2216                 long recycle_count = 0;
2217                 long tmp;
2218
2219                 /*
2220                  * Wait for an action request.  If we timeout check to
2221                  * see if paging is needed (in case the normal wakeup
2222                  * code raced us).
2223                  */
2224                 if (isep) {
2225                         /*
2226                          * Emergency pagedaemon monitors the primary
2227                          * pagedaemon while vm_pages_needed != 0.
2228                          *
2229                          * The emergency pagedaemon only runs if VM paging
2230                          * is needed and the primary pagedaemon has not
2231                          * updated vm_pagedaemon_time for more than 2 seconds.
2232                          */
2233                         if (vm_pages_needed)
2234                                 tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
2235                         else
2236                                 tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
2237                         if (vm_pages_needed == 0) {
2238                                 pass = 0;
2239                                 continue;
2240                         }
2241                         if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
2242                                 pass = 0;
2243                                 continue;
2244                         }
2245                 } else {
2246                         /*
2247                          * Primary pagedaemon
2248                          *
2249                          * NOTE: We unconditionally cleanup PQ_HOLD even
2250                          *       when there is no work to do.
2251                          */
2252                         vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
2253                         ++q3iterator;
2254
2255                         if (vm_pages_needed == 0) {
2256                                 error = tsleep(&vm_pages_needed,
2257                                                0, "psleep",
2258                                                vm_pageout_stats_interval * hz);
2259                                 if (error &&
2260                                     vm_paging_needed(0) == 0 &&
2261                                     vm_pages_needed == 0) {
2262                                         for (q = 0; q < PQ_L2_SIZE; ++q)
2263                                                 vm_pageout_page_stats(q);
2264                                         continue;
2265                                 }
2266                                 vm_pagedaemon_time = ticks;
2267                                 vm_pages_needed = 1;
2268
2269                                 /*
2270                                  * Wake the emergency pagedaemon up so it
2271                                  * can monitor us.  It will automatically
2272                                  * go back into a long sleep when
2273                                  * vm_pages_needed returns to 0.
2274                                  */
2275                                 wakeup(&vm_pagedaemon_time);
2276                         }
2277                 }
2278
2279                 mycpu->gd_cnt.v_pdwakeups++;
2280
2281                 /*
2282                  * Scan for INACTIVE->CLEAN/PAGEOUT
2283                  *
2284                  * This routine tries to avoid thrashing the system with
2285                  * unnecessary activity.
2286                  *
2287                  * Calculate our target for the number of free+cache pages we
2288                  * want to get to.  This is higher then the number that causes
2289                  * allocations to stall (severe) in order to provide hysteresis,
2290                  * and if we don't make it all the way but get to the minimum
2291                  * we're happy.  Goose it a bit if there are multiple requests
2292                  * for memory.
2293                  *
2294                  * Don't reduce avail_shortage inside the loop or the
2295                  * PQAVERAGE() calculation will break.
2296                  *
2297                  * NOTE! deficit is differentiated from avail_shortage as
2298                  *       REQUIRING at least (deficit) pages to be cleaned,
2299                  *       even if the page queues are in good shape.  This
2300                  *       is used primarily for handling per-process
2301                  *       RLIMIT_RSS and may also see small values when
2302                  *       processes block due to low memory.
2303                  */
2304                 vmstats_rollup();
2305                 if (isep == 0)
2306                         vm_pagedaemon_time = ticks;
2307                 avail_shortage = vm_paging_target() + vm_pageout_deficit;
2308                 vm_pageout_deficit = 0;
2309
2310                 if (avail_shortage > 0) {
2311                         long delta = 0;
2312                         long counts[4] = { 0, 0, 0, 0 };
2313                         int qq;
2314
2315                         if (vm_pageout_debug) {
2316                                 kprintf("scan_inactive pass %d isep=%d\t",
2317                                         pass / MAXSCAN_DIVIDER, isep);
2318                         }
2319
2320                         qq = q1iterator;
2321                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2322                                 delta += vm_pageout_scan_inactive(
2323                                             pass / MAXSCAN_DIVIDER,
2324                                             qq & PQ_L2_MASK,
2325                                             PQAVERAGE(avail_shortage),
2326                                             &vnodes_skipped, counts);
2327                                 if (isep)
2328                                         --qq;
2329                                 else
2330                                         ++qq;
2331                                 if (avail_shortage - delta <= 0)
2332                                         break;
2333
2334                                 /*
2335                                  * It is possible for avail_shortage to be
2336                                  * very large.  If a large program exits or
2337                                  * frees a ton of memory all at once, we do
2338                                  * not have to continue deactivations.
2339                                  *
2340                                  * (We will still run the active->inactive
2341                                  * target, however).
2342                                  */
2343                                 if (!vm_page_count_target() &&
2344                                     !vm_page_count_min(
2345                                                 vm_page_free_hysteresis)) {
2346                                         avail_shortage = 0;
2347                                         break;
2348                                 }
2349                         }
2350                         if (vm_pageout_debug) {
2351                                 kprintf("flushed %ld cleaned %ld "
2352                                         "lru2 %ld react %ld "
2353                                         "delta %ld\n",
2354                                         counts[0], counts[1],
2355                                         counts[2], counts[3],
2356                                         delta);
2357                         }
2358                         avail_shortage -= delta;
2359                         q1iterator = qq;
2360                 }
2361
2362                 /*
2363                  * Figure out how many active pages we must deactivate.  If
2364                  * we were able to reach our target with just the inactive
2365                  * scan above we limit the number of active pages we
2366                  * deactivate to reduce unnecessary work.
2367                  */
2368                 vmstats_rollup();
2369                 if (isep == 0)
2370                         vm_pagedaemon_time = ticks;
2371                 inactive_shortage = vmstats.v_inactive_target -
2372                                     vmstats.v_inactive_count;
2373
2374                 /*
2375                  * If we were unable to free sufficient inactive pages to
2376                  * satisfy the free/cache queue requirements then simply
2377                  * reaching the inactive target may not be good enough.
2378                  * Try to deactivate pages in excess of the target based
2379                  * on the shortfall.
2380                  *
2381                  * However to prevent thrashing the VM system do not
2382                  * deactivate more than an additional 1/10 the inactive
2383                  * target's worth of active pages.
2384                  */
2385                 if (avail_shortage > 0) {
2386                         tmp = avail_shortage * 2;
2387                         if (tmp > vmstats.v_inactive_target / 10)
2388                                 tmp = vmstats.v_inactive_target / 10;
2389                         inactive_shortage += tmp;
2390                 }
2391
2392                 /*
2393                  * Only trigger a pmap cleanup on inactive shortage.
2394                  */
2395                 if (isep == 0 && inactive_shortage > 0) {
2396                         pmap_collect();
2397                 }
2398
2399                 /*
2400                  * Scan for ACTIVE->INACTIVE
2401                  *
2402                  * Only trigger on inactive shortage.  Triggering on
2403                  * avail_shortage can starve the active queue with
2404                  * unnecessary active->inactive transitions and destroy
2405                  * performance.
2406                  *
2407                  * If this is the emergency pager, always try to move
2408                  * a few pages from active to inactive because the inactive
2409                  * queue might have enough pages, but not enough anonymous
2410                  * pages.
2411                  */
2412                 if (isep && inactive_shortage < vm_emerg_launder)
2413                         inactive_shortage = vm_emerg_launder;
2414
2415                 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2416                         long delta = 0;
2417                         int qq;
2418
2419                         qq = q2iterator;
2420                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2421                                 delta += vm_pageout_scan_active(
2422                                                 pass / MAXSCAN_DIVIDER,
2423                                                 qq & PQ_L2_MASK,
2424                                                 PQAVERAGE(avail_shortage),
2425                                                 PQAVERAGE(inactive_shortage),
2426                                                 &recycle_count);
2427                                 if (isep)
2428                                         --qq;
2429                                 else
2430                                         ++qq;
2431                                 if (inactive_shortage - delta <= 0 &&
2432                                     avail_shortage - delta <= 0) {
2433                                         break;
2434                                 }
2435
2436                                 /*
2437                                  * inactive_shortage can be a very large
2438                                  * number.  This is intended to break out
2439                                  * early if our inactive_target has been
2440                                  * reached due to other system activity.
2441                                  */
2442                                 if (vmstats.v_inactive_count >
2443                                     vmstats.v_inactive_target) {
2444                                         inactive_shortage = 0;
2445                                         break;
2446                                 }
2447                         }
2448                         inactive_shortage -= delta;
2449                         avail_shortage -= delta;
2450                         q2iterator = qq;
2451                 }
2452
2453                 /*
2454                  * Scan for CACHE->FREE
2455                  *
2456                  * Finally free enough cache pages to meet our free page
2457                  * requirement and take more drastic measures if we are
2458                  * still in trouble.
2459                  */
2460                 vmstats_rollup();
2461                 if (isep == 0)
2462                         vm_pagedaemon_time = ticks;
2463                 vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
2464                                       vnodes_skipped, recycle_count);
2465
2466                 /*
2467                  * This is a bit sophisticated because we do not necessarily
2468                  * want to force paging until our targets are reached if we
2469                  * were able to successfully retire the shortage we calculated.
2470                  */
2471                 if (avail_shortage > 0) {
2472                         /*
2473                          * If we did not retire enough pages continue the
2474                          * pageout operation until we are able to.  It
2475                          * takes MAXSCAN_DIVIDER passes to cover the entire
2476                          * inactive list.
2477                          */
2478                         ++pass;
2479
2480                         if (pass / MAXSCAN_DIVIDER < 10 &&
2481                             vm_pages_needed > 1) {
2482                                 /*
2483                                  * Normal operation, additional processes
2484                                  * have already kicked us.  Retry immediately
2485                                  * unless swap space is completely full in
2486                                  * which case delay a bit.
2487                                  */
2488                                 if (swap_pager_full) {
2489                                         tsleep(&vm_pages_needed, 0, "pdelay",
2490                                                 hz / 5);
2491                                 } /* else immediate retry */
2492                         } else if (pass / MAXSCAN_DIVIDER < 10) {
2493                                 /*
2494                                  * Do a short sleep for the first 10 passes,
2495                                  * allow the sleep to be woken up by resetting
2496                                  * vm_pages_needed to 1 (NOTE: we are still
2497                                  * active paging!).
2498                                  */
2499                                 if (isep == 0)
2500                                         vm_pages_needed = 1;
2501                                 tsleep(&vm_pages_needed, 0, "pdelay", 2);
2502                         } else if (swap_pager_full == 0) {
2503                                 /*
2504                                  * We've taken too many passes, force a
2505                                  * longer delay.
2506                                  */
2507                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2508                         } else {
2509                                 /*
2510                                  * Running out of memory, catastrophic
2511                                  * back-off to one-second intervals.
2512                                  */
2513                                 tsleep(&vm_pages_needed, 0, "pdelay", hz);
2514                         }
2515                 } else if (vm_pages_needed) {
2516                         /*
2517                          * We retired our calculated shortage but we may have
2518                          * to continue paging if threads drain memory too far
2519                          * below our target.
2520                          *
2521                          * Similar to vm_page_free_wakeup() in vm_page.c.
2522                          */
2523                         pass = 0;
2524                         if (!vm_paging_needed(0)) {
2525                                 /* still more than half-way to our target */
2526                                 vm_pages_needed = 0;
2527                                 wakeup(&vmstats.v_free_count);
2528                         } else
2529                         if (!vm_page_count_min(vm_page_free_hysteresis)) {
2530                                 /*
2531                                  * Continue operations with wakeup
2532                                  * (set variable to avoid overflow)
2533                                  */
2534                                 vm_pages_needed = 2;
2535                                 wakeup(&vmstats.v_free_count);
2536                         } else {
2537                                 /*
2538                                  * No wakeup() needed, continue operations.
2539                                  * (set variable to avoid overflow)
2540                                  */
2541                                 vm_pages_needed = 2;
2542                         }
2543                 } else {
2544                         /*
2545                          * Turn paging back on immediately if we are under
2546                          * minimum.
2547                          */
2548                         pass = 0;
2549                 }
2550         }
2551 }
2552
2553 static struct kproc_desc pg1_kp = {
2554         "pagedaemon",
2555         vm_pageout_thread,
2556         &pagethread
2557 };
2558 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2559
2560 static struct kproc_desc pg2_kp = {
2561         "emergpager",
2562         vm_pageout_thread,
2563         &emergpager
2564 };
2565 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
2566
2567
2568 /*
2569  * Called after allocating a page out of the cache or free queue
2570  * to possibly wake the pagedaemon up to replentish our supply.
2571  *
2572  * We try to generate some hysteresis by waking the pagedaemon up
2573  * when our free+cache pages go below the free_min+cache_min level.
2574  * The pagedaemon tries to get the count back up to at least the
2575  * minimum, and through to the target level if possible.
2576  *
2577  * If the pagedaemon is already active bump vm_pages_needed as a hint
2578  * that there are even more requests pending.
2579  *
2580  * SMP races ok?
2581  * No requirements.
2582  */
2583 void
2584 pagedaemon_wakeup(void)
2585 {
2586         if (vm_paging_needed(0) && curthread != pagethread) {
2587                 if (vm_pages_needed <= 1) {
2588                         vm_pages_needed = 1;            /* SMP race ok */
2589                         wakeup(&vm_pages_needed);       /* tickle pageout */
2590                 } else if (vm_page_count_min(0)) {
2591                         ++vm_pages_needed;              /* SMP race ok */
2592                         /* a wakeup() would be wasted here */
2593                 }
2594         }
2595 }
2596
2597 #if !defined(NO_SWAPPING)
2598
2599 /*
2600  * SMP races ok?
2601  * No requirements.
2602  */
2603 static void
2604 vm_req_vmdaemon(void)
2605 {
2606         static int lastrun = 0;
2607
2608         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2609                 wakeup(&vm_daemon_needed);
2610                 lastrun = ticks;
2611         }
2612 }
2613
2614 static int vm_daemon_callback(struct proc *p, void *data __unused);
2615
2616 /*
2617  * No requirements.
2618  *
2619  * Scan processes for exceeding their rlimits, deactivate pages
2620  * when RSS is exceeded.
2621  */
2622 static void
2623 vm_daemon(void)
2624 {
2625         while (TRUE) {
2626                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2627                 allproc_scan(vm_daemon_callback, NULL, 0);
2628         }
2629 }
2630
2631 static int
2632 vm_daemon_callback(struct proc *p, void *data __unused)
2633 {
2634         struct vmspace *vm;
2635         vm_pindex_t limit, size;
2636
2637         /*
2638          * if this is a system process or if we have already
2639          * looked at this process, skip it.
2640          */
2641         lwkt_gettoken(&p->p_token);
2642
2643         if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2644                 lwkt_reltoken(&p->p_token);
2645                 return (0);
2646         }
2647
2648         /*
2649          * if the process is in a non-running type state,
2650          * don't touch it.
2651          */
2652         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2653                 lwkt_reltoken(&p->p_token);
2654                 return (0);
2655         }
2656
2657         /*
2658          * get a limit
2659          */
2660         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2661                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
2662
2663         vm = p->p_vmspace;
2664         vmspace_hold(vm);
2665         size = pmap_resident_tlnw_count(&vm->vm_pmap);
2666         if (limit >= 0 && size > 4096 &&
2667             size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2668                 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2669         }
2670         vmspace_drop(vm);
2671
2672         lwkt_reltoken(&p->p_token);
2673
2674         return (0);
2675 }
2676
2677 #endif