kernel - Major signal path adjustments to fix races, tsleep race fixes, +more
[dragonfly.git] / sys / vm / vm_pageout.c
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * The Mach Operating System project at Carnegie-Mellon University.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
39  *
40  *
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  *
66  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
67  */
68
69 /*
70  *      The proverbial page-out daemon.
71  */
72
73 #include "opt_vm.h"
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/kernel.h>
77 #include <sys/proc.h>
78 #include <sys/kthread.h>
79 #include <sys/resourcevar.h>
80 #include <sys/signalvar.h>
81 #include <sys/vnode.h>
82 #include <sys/vmmeter.h>
83 #include <sys/sysctl.h>
84
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <sys/lock.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_pager.h>
93 #include <vm/swap_pager.h>
94 #include <vm/vm_extern.h>
95
96 #include <sys/thread2.h>
97 #include <sys/spinlock2.h>
98 #include <vm/vm_page2.h>
99
100 /*
101  * System initialization
102  */
103
104 /* Forward declarations and thread pointer for the kernel process "vm_pageout" */
105 static int vm_pageout_clean (vm_page_t);
106 static int vm_pageout_free_page_calc (vm_size_t count);
107 struct thread *pagethread;
108
109 #if !defined(NO_SWAPPING)
110 /* the kernel process "vm_daemon"; only built when swapping is compiled in */
111 static void vm_daemon (void);
112 static struct   thread *vmthread;
113 /* Descriptor handed to kproc_start() by the SYSINIT below. */
114 static struct kproc_desc vm_kp = {
115         "vmdaemon",
116         vm_daemon,
117         &vmthread
118 };
119 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
120 #endif
121
122 int vm_pages_needed=0;          /* Event on which pageout daemon sleeps */
123 int vm_pageout_deficit=0;       /* Estimated page deficit */
124 int vm_pageout_pages_needed=0;  /* flag saying that the pageout daemon needs pages */
125 /* File-local state that exists only when swapping is compiled in. */
126 #if !defined(NO_SWAPPING)
127 static int vm_pageout_req_swapout;      /* XXX */
128 static int vm_daemon_needed;
129 #endif
130 static int vm_max_launder = 32; /* default limit on dirty flushes (vm.max_launder) */
131 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
132 static int vm_pageout_full_stats_interval = 0;
133 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
134 static int defer_swap_pageouts=0;
135 static int disable_swap_pageouts=0;
136 /* vm_swap_enabled defaults to 0 when built with NO_SWAPPING, otherwise to 1. */
137 #if defined(NO_SWAPPING)
138 static int vm_swap_enabled=0;
139 static int vm_swap_idle_enabled=0;
140 #else
141 static int vm_swap_enabled=1;
142 static int vm_swap_idle_enabled=0;
143 #endif
144
145 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
146         CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
147
148 SYSCTL_INT(_vm, OID_AUTO, max_launder,
149         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
150 /* Read-write knobs for the page statistics scans. */
151 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
152         CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
153
154 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
155         CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
156
157 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
158         CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
159
160 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
161         CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
162 /* The two swap knobs are exported read-only when built with NO_SWAPPING. */
163 #if defined(NO_SWAPPING)
164 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
165         CTLFLAG_RD, &vm_swap_enabled, 0, "");
166 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
167         CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
168 #else
169 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
170         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
171 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
172         CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
173 #endif
174 /* Knobs governing pageouts of dirty pages to swap space. */
175 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
176         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
177
178 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
179         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
180 /* Diagnostic counter, exported read-only. */
181 static int pageout_lock_miss;
182 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
183         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
184
185 #define VM_PAGEOUT_PAGE_COUNT 16
186 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; /* max pages per vm_pageout_clean() cluster */
187
188 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
189 /* Prototypes for the helpers that exist only when swapping is compiled in. */
190 #if !defined(NO_SWAPPING)
191 typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int);
192 static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
193 static freeer_fcn_t vm_pageout_object_deactivate_pages;
194 static void vm_req_vmdaemon (void);
195 #endif
196 static void vm_pageout_page_stats(int q);
197
198 static __inline int
199 PQAVERAGE(int n)
200 {
201         /* n / PQ_L2_SIZE rounded away from zero, then pushed one further out */
202         return (n < 0)
203             ? (n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1
204             : (n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1;
205 }
206
207 /*
208  * vm_pageout_clean:
209  *
210  * Clean the page and remove it from the laundry.  The page must be
211  * busied by the caller; it is unbusied on every path, either here or
212  * by vm_pageout_flush().  Dirty inactive neighbors in the same object
213  * are busied and flushed along with it as one cluster.
214  *
215  * Returns 0 if the page was skipped, else vm_pageout_flush()'s count.
216  */
217 static int
218 vm_pageout_clean(vm_page_t m)
219 {
220         vm_object_t object;
221         vm_page_t mc[2*vm_pageout_page_count];
222         int pageout_count;
223         int error;
224         int ib, is, page_base;
225         vm_pindex_t pindex = m->pindex;
226
227         object = m->object;
228
229         /*
230          * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
231          * with the new swapper, but we could have serious problems paging
232          * out other object types if there is insufficient memory.  
233          *
234          * Unfortunately, checking free memory here is far too late, so the
235          * check has been moved up a procedural level.
236          */
237
238         /*
239          * Don't mess with the page if it's held or unmanaged
240          *
241          * XXX do we really need to check hold_count here?  hold_count
242          * isn't supposed to mess with vm_page ops except prevent the
243          * page from being reused.
244          */
245         if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
246                 vm_page_wakeup(m);
247                 return 0;
248         }
249         /* m sits mid-array; the cluster grows down (reverse) and up (forward) */
250         mc[vm_pageout_page_count] = m;
251         pageout_count = 1;
252         page_base = vm_pageout_page_count;
253         ib = 1;
254         is = 1;
255
256         /*
257          * Scan object for clusterable pages.
258          *
259          * We can cluster ONLY if: ->> the page is NOT
260          * clean, wired, busy, held, or mapped into a
261          * buffer, and one of the following:
262          * 1) The page is inactive, or a seldom used
263          *    active page.
264          * -or-
265          * 2) we force the issue.
266          *
267          * During heavy mmap/modification loads the pageout
268          * daemon can really fragment the underlying file
269          * due to flushing pages out of order and not trying to
270          * align the clusters (which leave sporadic out-of-order
271          * holes).  To solve this problem we do the reverse scan
272          * first and attempt to align our cluster, then do a 
273          * forward scan if room remains.
274          */
275
276         vm_object_hold(object);
277 more:
278         while (ib && pageout_count < vm_pageout_page_count) {
279                 vm_page_t p;
280
281                 if (ib > pindex) {
282                         ib = 0;
283                         break;
284                 }
285
286                 p = vm_page_lookup_busy_try(object, pindex - ib, TRUE, &error);
287                 if (error || p == NULL) {
288                         ib = 0;
289                         break;
290                 }
291                 if ((p->queue - p->pc) == PQ_CACHE ||
292                     (p->flags & PG_UNMANAGED)) {
293                         vm_page_wakeup(p);
294                         ib = 0;
295                         break;
296                 }
297                 vm_page_test_dirty(p);
298                 if ((p->dirty & p->valid) == 0 ||
299                     p->queue - p->pc != PQ_INACTIVE ||
300                     p->wire_count != 0 ||       /* may be held by buf cache */
301                     p->hold_count != 0) {       /* may be undergoing I/O */
302                         vm_page_wakeup(p);
303                         ib = 0;
304                         break;
305                 }
306                 mc[--page_base] = p;
307                 ++pageout_count;
308                 ++ib;
309                 /*
310                  * alignment boundary, stop here and switch directions.  Do
311                  * not clear ib.
312                  */
313                 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
314                         break;
315         }
316
317         while (pageout_count < vm_pageout_page_count && 
318             pindex + is < object->size) {
319                 vm_page_t p;
320                 /* p is returned busied by us; testing PG_BUSY would always bail */
321                 p = vm_page_lookup_busy_try(object, pindex + is, TRUE, &error);
322                 if (error || p == NULL)
323                         break;
324                 if ((p->queue - p->pc) == PQ_CACHE ||
325                     (p->flags & PG_UNMANAGED)) {
326                         vm_page_wakeup(p);
327                         break;
328                 }
329                 vm_page_test_dirty(p);
330                 if ((p->dirty & p->valid) == 0 ||
331                     p->queue - p->pc != PQ_INACTIVE ||
332                     p->wire_count != 0 ||       /* may be held by buf cache */
333                     p->hold_count != 0) {       /* may be undergoing I/O */
334                         vm_page_wakeup(p);
335                         break;
336                 }
337                 mc[page_base + pageout_count] = p;
338                 ++pageout_count;
339                 ++is;
340         }
341
342         /*
343          * If we exhausted our forward scan, continue with the reverse scan
344          * when possible, even past a page boundary.  This catches boundary
345          * conditions.
346          */
347         if (ib && pageout_count < vm_pageout_page_count)
348                 goto more;
349
350         vm_object_drop(object);
351
352         /*
353          * we allow reads during pageouts...
354          */
355         return vm_pageout_flush(&mc[page_base], pageout_count, 0);
356 }
357
358 /*
359  * vm_pageout_flush() - launder the given pages
360  *
361  *      mc[0..count-1] must be busied by the caller and are unbusied by
362  *      this function.  We set up for the start of I/O (soft-busy the
363  *      pages), mark them read-only, and bump the object's paging-in-progress
364  *      count all in here rather than in the parent, then hand the
365  *      whole array to the pager of mc[0]->object in a single call.  flags
366  *      is passed through, with VM_PAGER_PUT_SYNC added for kernel_object.
367  *
368  *      Returns how many pages came back VM_PAGER_OK or VM_PAGER_PEND.
369  */
370 int
371 vm_pageout_flush(vm_page_t *mc, int count, int flags)
372 {
373         vm_object_t object;
374         int pageout_status[count];
375         int numpagedout = 0;
376         int i;
377
378         /*
379          * Initiate I/O.  Bump the vm_page_t->busy counter.
380          */
381         for (i = 0; i < count; i++) {
382                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
383                         ("vm_pageout_flush page %p index %d/%d: partially "
384                          "invalid page", mc[i], i, count));
385                 vm_page_io_start(mc[i]);
386         }
387
388         /*
389          * We must make the pages read-only.  This will also force the
390          * modified bit in the related pmaps to be cleared.  The pager
391          * cannot clear the bit for us since the I/O completion code
392          * typically runs from an interrupt.  The act of making the page
393          * read-only handles the case for us.
394          *
395          * Then we can unbusy the pages, we still hold a reference by virtue
396          * of our soft-busy.
397          */
398         for (i = 0; i < count; i++) {
399                 vm_page_protect(mc[i], VM_PROT_READ);
400                 vm_page_wakeup(mc[i]);
401         }
402         /* Every page in mc[] is treated as belonging to mc[0]'s object. */
403         object = mc[0]->object;
404         vm_object_pip_add(object, count);
405
406         vm_pager_put_pages(object, mc, count,
407             (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
408             pageout_status);
409
410         for (i = 0; i < count; i++) {
411                 vm_page_t mt = mc[i];
412
413                 switch (pageout_status[i]) {
414                 case VM_PAGER_OK:
415                         numpagedout++;
416                         break;
417                 case VM_PAGER_PEND:
418                         numpagedout++;
419                         break;
420                 case VM_PAGER_BAD:
421                         /*
422                          * Page outside of range of object. Right now we
423                          * essentially lose the changes by pretending it
424                          * worked.
425                          */
426                         vm_page_busy_wait(mt, FALSE, "pgbad");
427                         pmap_clear_modify(mt);
428                         vm_page_undirty(mt);
429                         vm_page_wakeup(mt);
430                         break;
431                 case VM_PAGER_ERROR:
432                 case VM_PAGER_FAIL:
433                         /*
434                          * A page typically cannot be paged out when we
435                          * have run out of swap.  We leave the page
436                          * marked inactive and will try to page it out
437                          * again later.
438                          *
439                          * Starvation of the active page list is used to
440                          * determine when the system is massively memory
441                          * starved.
442                          */
443                         break;
444                 case VM_PAGER_AGAIN:
445                         break;
446                 }
447
448                 /*
449                  * If the operation is still going, leave the page busy to
450                  * block all other accesses. Also, leave the paging in
451                  * progress indicator set so that we don't attempt an object
452                  * collapse.
453                  *
454                  * For any pages which have completed synchronously, 
455                  * deactivate the page if we are under a severe deficit.
456                  * Do not try to enter them into the cache, though, they
457                  * might still be read-heavy.
458                  */
459                 if (pageout_status[i] != VM_PAGER_PEND) {
460                         vm_page_busy_wait(mt, FALSE, "pgouw");
461                         if (vm_page_count_severe())
462                                 vm_page_deactivate(mt);
463 #if 0
464                         if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
465                                 vm_page_protect(mt, VM_PROT_READ);
466 #endif
467                         vm_page_io_finish(mt);
468                         vm_page_wakeup(mt);
469                         vm_object_pip_wakeup(object);
470                 }
471         }
472         return numpagedout;
473 }
474
475 #if !defined(NO_SWAPPING)
476 /*
477  * Scan object and then each backing object behind it, running the
478  * deactivation callback on every resident page, until the map's pmap
479  * resident count is <= desired, a device/phys object or an object with
480  * paging in progress is reached, or the backing chain ends.
481  *
482  * The map must be locked.
483  * The caller must hold the vm_object.
484  */
485 static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
486
487 static void
488 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
489                                    vm_pindex_t desired, int map_remove_only)
490 {
491         struct rb_vm_page_scan_info info;
492         vm_object_t lobject;
493         vm_object_t tobject;
494         int remove_mode;
495
496         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
497         lobject = object;
498
499         while (lobject) {
500                 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
501                         break;
502                 if (lobject->type == OBJT_DEVICE || lobject->type == OBJT_PHYS)
503                         break;
504                 if (lobject->paging_in_progress)
505                         break;
506                 /* a nonzero info.limit keeps the callback from deactivating active pages */
507                 remove_mode = map_remove_only;
508                 if (lobject->shadow_count > 1)
509                         remove_mode = 1;
510
511                 /*
512                  * scan the objects entire memory queue.  We hold the
513                  * object's token so the scan should not race anything.
514                  */
515                 info.limit = remove_mode;
516                 info.map = map;
517                 info.desired = desired;
518                 vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL,
519                                 vm_pageout_object_deactivate_pages_callback,
520                                 &info
521                 );
522                 while ((tobject = lobject->backing_object) != NULL) {
523                         KKASSERT(tobject != object);
524                         vm_object_hold(tobject);
525                         if (tobject == lobject->backing_object)
526                                 break;  /* still the backing object once held */
527                         vm_object_drop(tobject);        /* link changed, retry */
528                 }
529                 if (lobject != object) {
530                         vm_object_lock_swap();
531                         vm_object_drop(lobject);
532                 }
533                 lobject = tobject;
534         }
535         if (lobject != object)  /* NOTE(review): lobject may be NULL here; */
536                 vm_object_drop(lobject);        /* assumes drop(NULL) is a no-op - confirm */
537 }
538
539 /*
540  * Per-page worker for vm_pageout_object_deactivate_pages(); the caller must hold the vm_object.
541  */
542 static int
543 vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
544 {
545         struct rb_vm_page_scan_info *info = data;
546         int actcount;
547         /* Target already met: return -1 (presumably aborts the RB_SCAN; confirm). */
548         if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
549                 return(-1);
550         }
551         mycpu->gd_cnt.v_pdpages++;
552         /* Leave alone pages we cannot busy, wired/held/unmanaged pages, and pages not in this pmap. */
553         if (vm_page_busy_try(p, TRUE))
554                 return(0);
555         if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
556                 vm_page_wakeup(p);
557                 return(0);
558         }
559         if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
560                 vm_page_wakeup(p);
561                 return(0);
562         }
563         /* Merge the pmap reference state with any earlier PG_REFERENCED flag. */
564         actcount = pmap_ts_referenced(p);
565         if (actcount) {
566                 vm_page_flag_set(p, PG_REFERENCED);
567         } else if (p->flags & PG_REFERENCED) {
568                 actcount = 1;
569         }
570         /* The queue the page is on is examined under the page+queue spinlock. */
571         vm_page_and_queue_spin_lock(p);
572         if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
573                 vm_page_and_queue_spin_unlock(p);
574                 vm_page_activate(p);
575                 p->act_count += actcount;
576                 vm_page_flag_clear(p, PG_REFERENCED);
577         } else if (p->queue - p->pc == PQ_ACTIVE) {
578                 if ((p->flags & PG_REFERENCED) == 0) {
579                         p->act_count -= min(p->act_count, ACT_DECLINE);
580                         if (!info->limit &&
581                             (vm_pageout_algorithm || (p->act_count == 0))) {
582                                 vm_page_and_queue_spin_unlock(p);
583                                 vm_page_protect(p, VM_PROT_NONE);
584                                 vm_page_deactivate(p);
585                         } else {
586                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
587                                              p, pageq);
588                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
589                                                   p, pageq);
590                                 vm_page_and_queue_spin_unlock(p);
591                         }
592                 } else {
593                         vm_page_and_queue_spin_unlock(p);
594                         vm_page_activate(p);
595                         vm_page_flag_clear(p, PG_REFERENCED);
596                         /* Re-check the queue under the spinlock before touching act_count. */
597                         vm_page_and_queue_spin_lock(p);
598                         if (p->queue - p->pc == PQ_ACTIVE) {
599                                 if (p->act_count < (ACT_MAX - ACT_ADVANCE))
600                                         p->act_count += ACT_ADVANCE;
601                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
602                                              p, pageq);
603                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
604                                                   p, pageq);
605                         }
606                         vm_page_and_queue_spin_unlock(p);
607                 }
608         } else if (p->queue - p->pc == PQ_INACTIVE) {
609                 vm_page_and_queue_spin_unlock(p);
610                 vm_page_protect(p, VM_PROT_NONE);
611         } else {
612                 vm_page_and_queue_spin_unlock(p);
613         }
614         vm_page_wakeup(p);
615         return(0);
616 }
617
618 /*
619  * Try to shrink the resident set of a map down to 'desired' pages by
620  * deactivating pages of the objects it maps.  Fairness is best-effort
621  * only.
622  *
623  * The map lock is taken exclusively without waiting; if it cannot be
624  * had the function returns without doing anything.
625  */
626 static void
627 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
628 {
629         vm_map_entry_t entry;
630         vm_object_t cand;
631         vm_object_t largest;
632         int any_wired;
633
634         if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT) != 0)
635                 return;
636
637         largest = NULL;
638         any_wired = FALSE;
639
640         /*
641          * Pass 1: pick the object with the most resident pages among the
642          * normal/vpagetable entries whose object has shadow_count <= 1,
643          * and note whether any entry at all is wired.
644          */
645         for (entry = map->header.next; entry != &map->header;
646              entry = entry->next) {
647                 if (entry->wired_count > 0)
648                         any_wired = TRUE;
649                 if (entry->maptype != VM_MAPTYPE_NORMAL &&
650                     entry->maptype != VM_MAPTYPE_VPAGETABLE)
651                         continue;
652                 cand = entry->object.vm_object;
653                 if (cand == NULL || cand->shadow_count > 1)
654                         continue;
655                 if (largest == NULL ||
656                     largest->resident_page_count < cand->resident_page_count)
657                         largest = cand;
658         }
659
660         /* Give the largest candidate the first shot. */
661         if (largest != NULL) {
662                 vm_object_hold(largest);
663                 vm_pageout_object_deactivate_pages(map, largest, desired, 0);
664                 vm_object_drop(largest);
665         }
666
667         /*
668          * Pass 2: visit the objects in map order until the resident count
669          * reaches the target.  This order is not ideal -- .text comes
670          * first.
671          */
672         for (entry = map->header.next; entry != &map->header;
673              entry = entry->next) {
674                 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
675                         break;
676                 if (entry->maptype != VM_MAPTYPE_NORMAL &&
677                     entry->maptype != VM_MAPTYPE_VPAGETABLE)
678                         continue;
679                 cand = entry->object.vm_object;
680                 if (cand == NULL)
681                         continue;
682                 vm_object_hold(cand);
683                 vm_pageout_object_deactivate_pages(map, cand, desired, 0);
684                 vm_object_drop(cand);
685         }
686
687         /*
688          * Remove all mappings if a process is swapped out (target of
689          * zero), this will free page table pages.  Skipped when any map
690          * entry is wired.
691          */
692         if (desired == 0 && !any_wired) {
693                 pmap_remove(vm_map_pmap(map),
694                             VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
695         }
696
697         vm_map_unlock(map);
698 }
699 #endif
700
701 /*
702  * Called when the pageout scan wants to free a page: the page is first
703  * protected with VM_PROT_NONE and then freed.  We no longer
704  * try to cycle the vm_object here with a reference & dealloc, which can
705  * cause a non-trivial object collapse in a critical path.
706  *
707  * It is unclear why we cycled the ref_count in the past, perhaps to try
708  * to optimize shadow chain collapses.  An OBJ_DEAD object should terminate
709  * any and all vm_pages synchronously and not have to be kick-started.
710  */
711 static void
712 vm_pageout_page_free(vm_page_t m) 
713 {
714         vm_page_protect(m, VM_PROT_NONE);
715         vm_page_free(m);
716 }
717
718 /*
719  * vm_pageout_scan does the dirty work for the pageout daemon.
720  */
721 struct vm_pageout_scan_info {
722         struct proc *bigproc;   /* presumably the largest process seen so far - TODO confirm */
723         vm_offset_t bigsize;    /* presumably the size recorded for bigproc - TODO confirm */
724 };
725 /* Per-process callback; presumably handed a vm_pageout_scan_info through data (confirm). */
726 static int vm_pageout_scan_callback(struct proc *p, void *data);
727
728 static int
729 vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
730                          int *vnodes_skippedp)
731 {
732         vm_page_t m;
733         struct vm_page marker;
734         struct vnode *vpfailed;         /* warning, allowed to be stale */
735         int maxscan;
736         int delta = 0;
737         vm_object_t object;
738         int actcount;
739         int maxlaunder;
740
741         /*
742          * Start scanning the inactive queue for pages we can move to the
743          * cache or free.  The scan will stop when the target is reached or
744          * we have scanned the entire inactive queue.  Note that m->act_count
745          * is not used to form decisions for the inactive queue, only for the
746          * active queue.
747          *
748          * maxlaunder limits the number of dirty pages we flush per scan.
749          * For most systems a smaller value (16 or 32) is more robust under
750          * extreme memory and disk pressure because any unnecessary writes
751          * to disk can result in extreme performance degredation.  However,
752          * systems with excessive dirty pages (especially when MAP_NOSYNC is
753          * used) will die horribly with limited laundering.  If the pageout
754          * daemon cannot clean enough pages in the first pass, we let it go
755          * all out in succeeding passes.
756          */
757         if ((maxlaunder = vm_max_launder) <= 1)
758                 maxlaunder = 1;
759         if (pass)
760                 maxlaunder = 10000;
761
762         /*
763          * Initialize our marker
764          */
765         bzero(&marker, sizeof(marker));
766         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
767         marker.queue = PQ_INACTIVE + q;
768         marker.pc = q;
769         marker.wire_count = 1;
770
771         /*
772          * Inactive queue scan.
773          *
774          * NOTE: The vm_page must be spinlocked before the queue to avoid
775          *       deadlocks, so it is easiest to simply iterate the loop
776          *       with the queue unlocked at the top.
777          */
778         vpfailed = NULL;
779
780         vm_page_queues_spin_lock(PQ_INACTIVE + q);
781         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
782         maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
783         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
784
785         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
786                maxscan-- > 0 && avail_shortage - delta > 0)
787         {
788                 vm_page_and_queue_spin_lock(m);
789                 if (m != TAILQ_NEXT(&marker, pageq)) {
790                         vm_page_and_queue_spin_unlock(m);
791                         ++maxscan;
792                         continue;
793                 }
794                 KKASSERT(m->queue - m->pc == PQ_INACTIVE);
795                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
796                              &marker, pageq);
797                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
798                                    &marker, pageq);
799                 mycpu->gd_cnt.v_pdpages++;
800
801                 /*
802                  * Skip marker pages
803                  */
804                 if (m->flags & PG_MARKER) {
805                         vm_page_and_queue_spin_unlock(m);
806                         continue;
807                 }
808
809                 /*
810                  * Try to busy the page.  Don't mess with pages which are
811                  * already busy or reorder them in the queue.
812                  */
813                 if (vm_page_busy_try(m, TRUE)) {
814                         vm_page_and_queue_spin_unlock(m);
815                         continue;
816                 }
817                 vm_page_and_queue_spin_unlock(m);
818                 KKASSERT(m->queue - m->pc == PQ_INACTIVE);
819
820                 lwkt_yield();
821
822                 /*
823                  * The page has been successfully busied and is now no
824                  * longer spinlocked.  The queue is no longer spinlocked
825                  * either.
826                  */
827
828                 /*
829                  * It is possible for a page to be busied ad-hoc (e.g. the
830                  * pmap_collect() code) and wired and race against the
831                  * allocation of a new page.  vm_page_alloc() may be forced
832                  * to deactivate the wired page in which case it winds up
833                  * on the inactive queue and must be handled here.  We
834                  * correct the problem simply by unqueuing the page.
835                  */
836                 if (m->wire_count) {
837                         vm_page_unqueue_nowakeup(m);
838                         vm_page_wakeup(m);
839                         kprintf("WARNING: pagedaemon: wired page on "
840                                 "inactive queue %p\n", m);
841                         continue;
842                 }
843
844                 /*
845                  * A held page may be undergoing I/O, so skip it.
846                  */
847                 if (m->hold_count) {
848                         vm_page_and_queue_spin_lock(m);
849                         if (m->queue - m->pc == PQ_INACTIVE) {
850                                 TAILQ_REMOVE(
851                                         &vm_page_queues[PQ_INACTIVE + q].pl,
852                                         m, pageq);
853                                 TAILQ_INSERT_TAIL(
854                                         &vm_page_queues[PQ_INACTIVE + q].pl,
855                                         m, pageq);
856                         }
857                         vm_page_and_queue_spin_unlock(m);
858                         ++vm_swapcache_inactive_heuristic;
859                         vm_page_wakeup(m);
860                         continue;
861                 }
862
863                 if (m->object->ref_count == 0) {
864                         /*
865                          * If the object is not being used, we ignore previous 
866                          * references.
867                          */
868                         vm_page_flag_clear(m, PG_REFERENCED);
869                         pmap_clear_reference(m);
870                         /* fall through to end */
871                 } else if (((m->flags & PG_REFERENCED) == 0) &&
872                             (actcount = pmap_ts_referenced(m))) {
873                         /*
874                          * Otherwise, if the page has been referenced while 
875                          * in the inactive queue, we bump the "activation
876                          * count" upwards, making it less likely that the
877                          * page will be added back to the inactive queue
878                          * prematurely again.  Here we check the page tables
879                          * (or emulated bits, if any), given the upper level
880                          * VM system not knowing anything about existing 
881                          * references.
882                          */
883                         vm_page_activate(m);
884                         m->act_count += (actcount + ACT_ADVANCE);
885                         vm_page_wakeup(m);
886                         continue;
887                 }
888
889                 /*
890                  * (m) is still busied.
891                  *
892                  * If the upper level VM system knows about any page 
893                  * references, we activate the page.  We also set the 
894                  * "activation count" higher than normal so that we will less 
895                  * likely place pages back onto the inactive queue again.
896                  */
897                 if ((m->flags & PG_REFERENCED) != 0) {
898                         vm_page_flag_clear(m, PG_REFERENCED);
899                         actcount = pmap_ts_referenced(m);
900                         vm_page_activate(m);
901                         m->act_count += (actcount + ACT_ADVANCE + 1);
902                         vm_page_wakeup(m);
903                         continue;
904                 }
905
906                 /*
907                  * If the upper level VM system doesn't know anything about 
908                  * the page being dirty, we have to check for it again.  As 
909                  * far as the VM code knows, any partially dirty pages are 
910                  * fully dirty.
911                  *
912                  * Pages marked PG_WRITEABLE may be mapped into the user
913                  * address space of a process running on another cpu.  A
914                  * user process (without holding the MP lock) running on
915                  * another cpu may be able to touch the page while we are
916                  * trying to remove it.  vm_page_cache() will handle this
917                  * case for us.
918                  */
919                 if (m->dirty == 0) {
920                         vm_page_test_dirty(m);
921                 } else {
922                         vm_page_dirty(m);
923                 }
924
925                 if (m->valid == 0) {
926                         /*
927                          * Invalid pages can be easily freed
928                          */
929                         vm_pageout_page_free(m);
930                         mycpu->gd_cnt.v_dfree++;
931                         ++delta;
932                 } else if (m->dirty == 0) {
933                         /*
934                          * Clean pages can be placed onto the cache queue.
935                          * This effectively frees them.
936                          */
937                         vm_page_cache(m);
938                         ++delta;
939                 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
940                         /*
941                          * Dirty pages need to be paged out, but flushing
942                          * a page is extremely expensive verses freeing
943                          * a clean page.  Rather then artificially limiting
944                          * the number of pages we can flush, we instead give
945                          * dirty pages extra priority on the inactive queue
946                          * by forcing them to be cycled through the queue
947                          * twice before being flushed, after which the 
948                          * (now clean) page will cycle through once more
949                          * before being freed.  This significantly extends
950                          * the thrash point for a heavily loaded machine.
951                          */
952                         vm_page_flag_set(m, PG_WINATCFLS);
953                         vm_page_and_queue_spin_lock(m);
954                         if (m->queue - m->pc == PQ_INACTIVE) {
955                                 TAILQ_REMOVE(
956                                         &vm_page_queues[PQ_INACTIVE + q].pl,
957                                         m, pageq);
958                                 TAILQ_INSERT_TAIL(
959                                         &vm_page_queues[PQ_INACTIVE + q].pl,
960                                         m, pageq);
961                         }
962                         vm_page_and_queue_spin_unlock(m);
963                         ++vm_swapcache_inactive_heuristic;
964                         vm_page_wakeup(m);
965                 } else if (maxlaunder > 0) {
966                         /*
967                          * We always want to try to flush some dirty pages if
968                          * we encounter them, to keep the system stable.
969                          * Normally this number is small, but under extreme
970                          * pressure where there are insufficient clean pages
971                          * on the inactive queue, we may have to go all out.
972                          */
973                         int swap_pageouts_ok;
974                         struct vnode *vp = NULL;
975
976                         object = m->object;
977
978                         if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
979                                 swap_pageouts_ok = 1;
980                         } else {
981                                 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
982                                 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
983                                 vm_page_count_min(0));
984                                                                                 
985                         }
986
987                         /*
988                          * We don't bother paging objects that are "dead".  
989                          * Those objects are in a "rundown" state.
990                          */
991                         if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
992                                 vm_page_and_queue_spin_lock(m);
993                                 if (m->queue - m->pc == PQ_INACTIVE) {
994                                         TAILQ_REMOVE(
995                                             &vm_page_queues[PQ_INACTIVE + q].pl,
996                                             m, pageq);
997                                         TAILQ_INSERT_TAIL(
998                                             &vm_page_queues[PQ_INACTIVE + q].pl,
999                                             m, pageq);
1000                                 }
1001                                 vm_page_and_queue_spin_unlock(m);
1002                                 ++vm_swapcache_inactive_heuristic;
1003                                 vm_page_wakeup(m);
1004                                 continue;
1005                         }
1006
1007                         /*
1008                          * (m) is still busied.
1009                          *
1010                          * The object is already known NOT to be dead.   It
1011                          * is possible for the vget() to block the whole
1012                          * pageout daemon, but the new low-memory handling
1013                          * code should prevent it.
1014                          *
1015                          * The previous code skipped locked vnodes and, worse,
1016                          * reordered pages in the queue.  This results in
1017                          * completely non-deterministic operation because,
1018                          * quite often, a vm_fault has initiated an I/O and
1019                          * is holding a locked vnode at just the point where
1020                          * the pageout daemon is woken up.
1021                          *
1022                          * We can't wait forever for the vnode lock, we might
1023                          * deadlock due to a vn_read() getting stuck in
1024                          * vm_wait while holding this vnode.  We skip the 
1025                          * vnode if we can't get it in a reasonable amount
1026                          * of time.
1027                          *
1028                          * vpfailed is used to (try to) avoid the case where
1029                          * a large number of pages are associated with a
1030                          * locked vnode, which could cause the pageout daemon
1031                          * to stall for an excessive amount of time.
1032                          */
1033                         if (object->type == OBJT_VNODE) {
1034                                 int flags;
1035
1036                                 vp = object->handle;
1037                                 flags = LK_EXCLUSIVE | LK_NOOBJ;
1038                                 if (vp == vpfailed)
1039                                         flags |= LK_NOWAIT;
1040                                 else
1041                                         flags |= LK_TIMELOCK;
1042                                 vm_page_hold(m);
1043                                 vm_page_wakeup(m);
1044
1045                                 /*
1046                                  * We have unbusied (m) temporarily so we can
1047                                  * acquire the vp lock without deadlocking.
1048                                  * (m) is held to prevent destruction.
1049                                  */
1050                                 if (vget(vp, flags) != 0) {
1051                                         vpfailed = vp;
1052                                         ++pageout_lock_miss;
1053                                         if (object->flags & OBJ_MIGHTBEDIRTY)
1054                                                     ++*vnodes_skippedp;
1055                                         vm_page_unhold(m);
1056                                         continue;
1057                                 }
1058
1059                                 /*
1060                                  * The page might have been moved to another
1061                                  * queue during potential blocking in vget()
1062                                  * above.  The page might have been freed and
1063                                  * reused for another vnode.  The object might
1064                                  * have been reused for another vnode.
1065                                  */
1066                                 if (m->queue - m->pc != PQ_INACTIVE ||
1067                                     m->object != object ||
1068                                     object->handle != vp) {
1069                                         if (object->flags & OBJ_MIGHTBEDIRTY)
1070                                                 ++*vnodes_skippedp;
1071                                         vput(vp);
1072                                         vm_page_unhold(m);
1073                                         continue;
1074                                 }
1075         
1076                                 /*
1077                                  * The page may have been busied during the
1078                                  * blocking in vput();  We don't move the
1079                                  * page back onto the end of the queue so that
1080                                  * statistics are more correct if we don't.
1081                                  */
1082                                 if (vm_page_busy_try(m, TRUE)) {
1083                                         vput(vp);
1084                                         vm_page_unhold(m);
1085                                         continue;
1086                                 }
1087                                 vm_page_unhold(m);
1088
1089                                 /*
1090                                  * (m) is busied again
1091                                  *
1092                                  * We own the busy bit and remove our hold
1093                                  * bit.  If the page is still held it
1094                                  * might be undergoing I/O, so skip it.
1095                                  */
1096                                 if (m->hold_count) {
1097                                         vm_page_and_queue_spin_lock(m);
1098                                         if (m->queue - m->pc == PQ_INACTIVE) {
1099                                                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq);
1100                                                 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq);
1101                                         }
1102                                         vm_page_and_queue_spin_unlock(m);
1103                                         ++vm_swapcache_inactive_heuristic;
1104                                         if (object->flags & OBJ_MIGHTBEDIRTY)
1105                                                 ++*vnodes_skippedp;
1106                                         vm_page_wakeup(m);
1107                                         vput(vp);
1108                                         continue;
1109                                 }
1110                                 /* (m) is left busied as we fall through */
1111                         }
1112
1113                         /*
1114                          * page is busy and not held here.
1115                          *
1116                          * If a page is dirty, then it is either being washed
1117                          * (but not yet cleaned) or it is still in the
1118                          * laundry.  If it is still in the laundry, then we
1119                          * start the cleaning operation. 
1120                          *
1121                          * decrement inactive_shortage on success to account
1122                          * for the (future) cleaned page.  Otherwise we
1123                          * could wind up laundering or cleaning too many
1124                          * pages.
1125                          */
1126                         if (vm_pageout_clean(m) != 0) {
1127                                 ++delta;
1128                                 --maxlaunder;
1129                         }
1130                         /* clean ate busy, page no longer accessible */
1131                         if (vp != NULL)
1132                                 vput(vp);
1133                 } else {
1134                         vm_page_wakeup(m);
1135                 }
1136         }
1137         vm_page_queues_spin_lock(PQ_INACTIVE + q);
1138         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
1139         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1140         return (delta);
1141 }
1142
1143 static int
1144 vm_pageout_scan_active(int pass, int q,
1145                        int avail_shortage, int inactive_shortage,
1146                        int *recycle_countp)
1147 {
1148         struct vm_page marker;
1149         vm_page_t m;
1150         int actcount;
1151         int delta = 0;
1152         int maxscan;
1153
1154         /*
1155          * We want to move pages from the active queue to the inactive
1156          * queue to get the inactive queue to the inactive target.  If
1157          * we still have a page shortage from above we try to directly free
1158          * clean pages instead of moving them.
1159          *
1160          * If we do still have a shortage we keep track of the number of
1161          * pages we free or cache (recycle_count) as a measure of thrashing
1162          * between the active and inactive queues.
1163          *
1164          * If we were able to completely satisfy the free+cache targets
1165          * from the inactive pool we limit the number of pages we move
1166          * from the active pool to the inactive pool to 2x the pages we
1167          * had removed from the inactive pool (with a minimum of 1/5 the
1168          * inactive target).  If we were not able to completely satisfy
1169          * the free+cache targets we go for the whole target aggressively.
1170          *
1171          * NOTE: Both variables can end up negative.
1172          * NOTE: We are still in a critical section.
1173          */
1174
1175         bzero(&marker, sizeof(marker));
1176         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1177         marker.queue = PQ_ACTIVE + q;
1178         marker.pc = q;
1179         marker.wire_count = 1;
1180
1181         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1182         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1183         maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
1184         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1185
1186         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1187                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1188                                 inactive_shortage > 0))
1189         {
1190                 vm_page_and_queue_spin_lock(m);
1191                 if (m != TAILQ_NEXT(&marker, pageq)) {
1192                         vm_page_and_queue_spin_unlock(m);
1193                         ++maxscan;
1194                         continue;
1195                 }
1196                 KKASSERT(m->queue - m->pc == PQ_ACTIVE);
1197                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1198                              &marker, pageq);
1199                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1200                                    &marker, pageq);
1201
1202                 /*
1203                  * Skip marker pages
1204                  */
1205                 if (m->flags & PG_MARKER) {
1206                         vm_page_and_queue_spin_unlock(m);
1207                         continue;
1208                 }
1209
1210                 /*
1211                  * Try to busy the page.  Don't mess with pages which are
1212                  * already busy or reorder them in the queue.
1213                  */
1214                 if (vm_page_busy_try(m, TRUE)) {
1215                         vm_page_and_queue_spin_unlock(m);
1216                         continue;
1217                 }
1218
1219                 /*
1220                  * Don't deactivate pages that are held, even if we can
1221                  * busy them.  (XXX why not?)
1222                  */
1223                 if (m->hold_count != 0) {
1224                         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1225                                      m, pageq);
1226                         TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl,
1227                                           m, pageq);
1228                         vm_page_and_queue_spin_unlock(m);
1229                         vm_page_wakeup(m);
1230                         continue;
1231                 }
1232                 vm_page_and_queue_spin_unlock(m);
1233                 lwkt_yield();
1234
1235                 /*
1236                  * The page has been successfully busied and the page and
1237                  * queue are no longer locked.
1238                  */
1239
1240                 /*
1241                  * The count for pagedaemon pages is done after checking the
1242                  * page for eligibility...
1243                  */
1244                 mycpu->gd_cnt.v_pdpages++;
1245
1246                 /*
1247                  * Check to see "how much" the page has been used and clear
1248                  * the tracking access bits.  If the object has no references
1249                  * don't bother paying the expense.
1250                  */
1251                 actcount = 0;
1252                 if (m->object->ref_count != 0) {
1253                         if (m->flags & PG_REFERENCED)
1254                                 ++actcount;
1255                         actcount += pmap_ts_referenced(m);
1256                         if (actcount) {
1257                                 m->act_count += ACT_ADVANCE + actcount;
1258                                 if (m->act_count > ACT_MAX)
1259                                         m->act_count = ACT_MAX;
1260                         }
1261                 }
1262                 vm_page_flag_clear(m, PG_REFERENCED);
1263
1264                 /*
1265                  * actcount is only valid if the object ref_count is non-zero.
1266                  */
1267                 if (actcount && m->object->ref_count != 0) {
1268                         vm_page_and_queue_spin_lock(m);
1269                         if (m->queue - m->pc == PQ_ACTIVE) {
1270                                 TAILQ_REMOVE(
1271                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1272                                         m, pageq);
1273                                 TAILQ_INSERT_TAIL(
1274                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1275                                         m, pageq);
1276                         }
1277                         vm_page_and_queue_spin_unlock(m);
1278                         vm_page_wakeup(m);
1279                 } else {
1280                         m->act_count -= min(m->act_count, ACT_DECLINE);
1281                         if (vm_pageout_algorithm ||
1282                             m->object->ref_count == 0 ||
1283                             m->act_count < pass + 1
1284                         ) {
1285                                 /*
1286                                  * Deactivate the page.  If we had a
1287                                  * shortage from our inactive scan try to
1288                                  * free (cache) the page instead.
1289                                  *
1290                                  * Don't just blindly cache the page if
1291                                  * we do not have a shortage from the
1292                                  * inactive scan, that could lead to
1293                                  * gigabytes being moved.
1294                                  */
1295                                 --inactive_shortage;
1296                                 if (avail_shortage - delta > 0 ||
1297                                     m->object->ref_count == 0) {
1298                                         if (avail_shortage - delta > 0)
1299                                                 ++*recycle_countp;
1300                                         vm_page_protect(m, VM_PROT_NONE);
1301                                         if (m->dirty == 0 &&
1302                                             avail_shortage - delta > 0) {
1303                                                 vm_page_cache(m);
1304                                         } else {
1305                                                 vm_page_deactivate(m);
1306                                                 vm_page_wakeup(m);
1307                                         }
1308                                 } else {
1309                                         vm_page_deactivate(m);
1310                                         vm_page_wakeup(m);
1311                                 }
1312                                 ++delta;
1313                         } else {
1314                                 vm_page_and_queue_spin_lock(m);
1315                                 if (m->queue - m->pc == PQ_ACTIVE) {
1316                                         TAILQ_REMOVE(
1317                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1318                                             m, pageq);
1319                                         TAILQ_INSERT_TAIL(
1320                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1321                                             m, pageq);
1322                                 }
1323                                 vm_page_and_queue_spin_unlock(m);
1324                                 vm_page_wakeup(m);
1325                         }
1326                 }
1327         }
1328
1329         /*
1330          * Clean out our local marker.
1331          */
1332         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1333         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1334         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1335
1336         return (delta);
1337 }
1338
1339 /*
1340  * The number of actually free pages can drop down to v_free_reserved,
1341  * we try to build the free count back above v_free_min.  Note that
1342  * vm_paging_needed() also returns TRUE if v_free_count is not at
1343  * least v_free_min so that is the minimum we must build the free
1344  * count to.
1345  *
1346  * We use a slightly higher target to improve hysteresis,
1347  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1348  * is usually the same as v_cache_min this maintains about
1349  * half the pages in the free queue as are in the cache queue,
1350  * providing pretty good pipelining for pageout operation.
1351  *
1352  * The system operator can manipulate vm.v_cache_min and
1353  * vm.v_free_target to tune the pageout daemon.  Be sure
1354  * to keep vm.v_free_min < vm.v_free_target.
1355  *
1356  * Note that the original paging target is to get at least
1357  * (free_min + cache_min) into (free + cache).  The slightly
1358  * higher target will shift additional pages from cache to free
1359  * without affecting the original paging target in order to
1360  * maintain better hysteresis and not have the free count always
1361  * be dead-on v_free_min.
1362  *
1363  * NOTE: we are still in a critical section.
1364  *
1365  * Pages moved from PQ_CACHE to totally free are not counted in the
1366  * pages_freed counter.
1367  */
1368 static void
1369 vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count)
1370 {
1371         struct vm_pageout_scan_info info;
1372         vm_page_t m;
1373
1374         while (vmstats.v_free_count <
1375                (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1376                 /*
1377                  * This steals some code from vm/vm_page.c
1378                  */
1379                 static int cache_rover = 0;
1380
1381                 m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK, FALSE);
1382                 if (m == NULL)
1383                         break;
1384                 /* page is returned removed from its queue and spinlocked */
1385                 if (vm_page_busy_try(m, TRUE)) {
1386                         vm_page_deactivate_locked(m);
1387                         vm_page_spin_unlock(m);
1388 #ifdef INVARIANTS
1389                         kprintf("Warning: busy page %p found in cache\n", m);
1390 #endif
1391                         continue;
1392                 }
1393                 vm_page_spin_unlock(m);
1394                 pagedaemon_wakeup();
1395                 lwkt_yield();
1396
1397                 /*
1398                  * Page has been successfully busied and it and its queue
1399                  * is no longer spinlocked.
1400                  */
1401                 if ((m->flags & PG_UNMANAGED) ||
1402                     m->hold_count ||
1403                     m->wire_count) {
1404                         vm_page_deactivate(m);
1405                         vm_page_wakeup(m);
1406                         continue;
1407                 }
1408                 KKASSERT((m->flags & PG_MAPPED) == 0);
1409                 KKASSERT(m->dirty == 0);
1410                 cache_rover += PQ_PRIME2;
1411                 vm_pageout_page_free(m);
1412                 mycpu->gd_cnt.v_dfree++;
1413         }
1414
1415 #if !defined(NO_SWAPPING)
1416         /*
1417          * Idle process swapout -- run once per second.
1418          */
1419         if (vm_swap_idle_enabled) {
1420                 static long lsec;
1421                 if (time_second != lsec) {
1422                         vm_pageout_req_swapout |= VM_SWAP_IDLE;
1423                         vm_req_vmdaemon();
1424                         lsec = time_second;
1425                 }
1426         }
1427 #endif
1428                 
1429         /*
1430          * If we didn't get enough free pages, and we have skipped a vnode
1431          * in a writeable object, wakeup the sync daemon.  And kick swapout
1432          * if we did not get enough free pages.
1433          */
1434         if (vm_paging_target() > 0) {
1435                 if (vnodes_skipped && vm_page_count_min(0))
1436                         speedup_syncer();
1437 #if !defined(NO_SWAPPING)
1438                 if (vm_swap_enabled && vm_page_count_target()) {
1439                         vm_req_vmdaemon();
1440                         vm_pageout_req_swapout |= VM_SWAP_NORMAL;
1441                 }
1442 #endif
1443         }
1444
1445         /*
1446          * Handle catastrophic conditions.  Under good conditions we should
1447          * be at the target, well beyond our minimum.  If we could not even
1448          * reach our minimum the system is under heavy stress.
1449          *
1450          * Determine whether we have run out of memory.  This occurs when
1451          * swap_pager_full is TRUE and the only pages left in the page
1452          * queues are dirty.  We will still likely have page shortages.
1453          *
1454          * - swap_pager_full is set if insufficient swap was
1455          *   available to satisfy a requested pageout.
1456          *
1457          * - the inactive queue is bloated (4 x size of active queue),
1458          *   meaning it is unable to get rid of dirty pages and.
1459          *
1460          * - vm_page_count_min() without counting pages recycled from the
1461          *   active queue (recycle_count) means we could not recover
1462          *   enough pages to meet bare minimum needs.  This test only
1463          *   works if the inactive queue is bloated.
1464          *
1465          * - due to a positive avail_shortage we shifted the remaining
1466          *   dirty pages from the active queue to the inactive queue
1467          *   trying to find clean ones to free.
1468          */
1469         if (swap_pager_full && vm_page_count_min(recycle_count))
1470                 kprintf("Warning: system low on memory+swap!\n");
1471         if (swap_pager_full && vm_page_count_min(recycle_count) &&
1472             vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
1473             avail_shortage > 0) {
1474                 /*
1475                  * Kill something.
1476                  */
1477                 info.bigproc = NULL;
1478                 info.bigsize = 0;
1479                 allproc_scan(vm_pageout_scan_callback, &info);
1480                 if (info.bigproc != NULL) {
1481                         killproc(info.bigproc, "out of swap space");
1482                         info.bigproc->p_nice = PRIO_MIN;
1483                         info.bigproc->p_usched->resetpriority(
1484                                 FIRST_LWP_IN_PROC(info.bigproc));
1485                         wakeup(&vmstats.v_free_count);
1486                         PRELE(info.bigproc);
1487                 }
1488         }
1489 }
1490
1491 /*
1492  * The caller must hold proc_token.
1493  */
1494 static int
1495 vm_pageout_scan_callback(struct proc *p, void *data)
1496 {
1497         struct vm_pageout_scan_info *info = data;
1498         vm_offset_t size;
1499
1500         /*
1501          * Never kill system processes or init.  If we have configured swap
1502          * then try to avoid killing low-numbered pids.
1503          */
1504         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1505             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1506                 return (0);
1507         }
1508
1509         /*
1510          * if the process is in a non-running type state,
1511          * don't touch it.
1512          */
1513         if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
1514                 return (0);
1515
1516         /*
1517          * Get the approximate process size.  Note that anonymous pages
1518          * with backing swap will be counted twice, but there should not
1519          * be too many such pages due to the stress the VM system is
1520          * under at this point.
1521          */
1522         size = vmspace_anonymous_count(p->p_vmspace) +
1523                 vmspace_swap_count(p->p_vmspace);
1524
1525         /*
1526          * If the this process is bigger than the biggest one
1527          * remember it.
1528          */
1529         if (info->bigsize < size) {
1530                 if (info->bigproc)
1531                         PRELE(info->bigproc);
1532                 PHOLD(p);
1533                 info->bigproc = p;
1534                 info->bigsize = size;
1535         }
1536         lwkt_yield();
1537         return(0);
1538 }
1539
1540 /*
1541  * This routine tries to maintain the pseudo LRU active queue,
1542  * so that during long periods of time where there is no paging,
1543  * that some statistic accumulation still occurs.  This code
1544  * helps the situation where paging just starts to occur.
1545  */
1546 static void
1547 vm_pageout_page_stats(int q)
1548 {
1549         static int fullintervalcount = 0;
1550         struct vm_page marker;
1551         vm_page_t m;
1552         int pcount, tpcount;            /* Number of pages to check */
1553         int page_shortage;
1554
1555         page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1556                          vmstats.v_free_min) -
1557                         (vmstats.v_free_count + vmstats.v_inactive_count +
1558                          vmstats.v_cache_count);
1559
1560         if (page_shortage <= 0)
1561                 return;
1562
1563         pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1564         fullintervalcount += vm_pageout_stats_interval;
1565         if (fullintervalcount < vm_pageout_full_stats_interval) {
1566                 tpcount = (vm_pageout_stats_max * pcount) /
1567                           vmstats.v_page_count + 1;
1568                 if (pcount > tpcount)
1569                         pcount = tpcount;
1570         } else {
1571                 fullintervalcount = 0;
1572         }
1573
1574         bzero(&marker, sizeof(marker));
1575         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1576         marker.queue = PQ_ACTIVE + q;
1577         marker.pc = q;
1578         marker.wire_count = 1;
1579
1580         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1581         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1582         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1583
1584         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1585                pcount-- > 0)
1586         {
1587                 int actcount;
1588
1589                 vm_page_and_queue_spin_lock(m);
1590                 if (m != TAILQ_NEXT(&marker, pageq)) {
1591                         vm_page_and_queue_spin_unlock(m);
1592                         ++pcount;
1593                         continue;
1594                 }
1595                 KKASSERT(m->queue - m->pc == PQ_ACTIVE);
1596                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1597                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1598                                    &marker, pageq);
1599
1600                 /*
1601                  * Ignore markers
1602                  */
1603                 if (m->flags & PG_MARKER) {
1604                         vm_page_and_queue_spin_unlock(m);
1605                         continue;
1606                 }
1607
1608                 /*
1609                  * Ignore pages we can't busy
1610                  */
1611                 if (vm_page_busy_try(m, TRUE)) {
1612                         vm_page_and_queue_spin_unlock(m);
1613                         continue;
1614                 }
1615                 vm_page_and_queue_spin_unlock(m);
1616                 KKASSERT(m->queue - m->pc == PQ_ACTIVE);
1617
1618                 /*
1619                  * We now have a safely busied page, the page and queue
1620                  * spinlocks have been released.
1621                  *
1622                  * Ignore held pages
1623                  */
1624                 if (m->hold_count) {
1625                         vm_page_wakeup(m);
1626                         continue;
1627                 }
1628
1629                 /*
1630                  * Calculate activity
1631                  */
1632                 actcount = 0;
1633                 if (m->flags & PG_REFERENCED) {
1634                         vm_page_flag_clear(m, PG_REFERENCED);
1635                         actcount += 1;
1636                 }
1637                 actcount += pmap_ts_referenced(m);
1638
1639                 /*
1640                  * Update act_count and move page to end of queue.
1641                  */
1642                 if (actcount) {
1643                         m->act_count += ACT_ADVANCE + actcount;
1644                         if (m->act_count > ACT_MAX)
1645                                 m->act_count = ACT_MAX;
1646                         vm_page_and_queue_spin_lock(m);
1647                         if (m->queue - m->pc == PQ_ACTIVE) {
1648                                 TAILQ_REMOVE(
1649                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1650                                         m, pageq);
1651                                 TAILQ_INSERT_TAIL(
1652                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1653                                         m, pageq);
1654                         }
1655                         vm_page_and_queue_spin_unlock(m);
1656                         vm_page_wakeup(m);
1657                         continue;
1658                 }
1659
1660                 if (m->act_count == 0) {
1661                         /*
1662                          * We turn off page access, so that we have
1663                          * more accurate RSS stats.  We don't do this
1664                          * in the normal page deactivation when the
1665                          * system is loaded VM wise, because the
1666                          * cost of the large number of page protect
1667                          * operations would be higher than the value
1668                          * of doing the operation.
1669                          *
1670                          * We use the marker to save our place so
1671                          * we can release the spin lock.  both (m)
1672                          * and (next) will be invalid.
1673                          */
1674                         vm_page_protect(m, VM_PROT_NONE);
1675                         vm_page_deactivate(m);
1676                 } else {
1677                         m->act_count -= min(m->act_count, ACT_DECLINE);
1678                         vm_page_and_queue_spin_lock(m);
1679                         if (m->queue - m->pc == PQ_ACTIVE) {
1680                                 TAILQ_REMOVE(
1681                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1682                                         m, pageq);
1683                                 TAILQ_INSERT_TAIL(
1684                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1685                                         m, pageq);
1686                         }
1687                         vm_page_and_queue_spin_unlock(m);
1688                 }
1689                 vm_page_wakeup(m);
1690         }
1691
1692         /*
1693          * Remove our local marker
1694          */
1695         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1696         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1697         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1698 }
1699
1700 static int
1701 vm_pageout_free_page_calc(vm_size_t count)
1702 {
1703         if (count < vmstats.v_page_count)
1704                  return 0;
1705         /*
1706          * free_reserved needs to include enough for the largest swap pager
1707          * structures plus enough for any pv_entry structs when paging.
1708          *
1709          * v_free_min           normal allocations
1710          * v_free_reserved      system allocations
1711          * v_pageout_free_min   allocations by pageout daemon
1712          * v_interrupt_free_min low level allocations (e.g swap structures)
1713          */
1714         if (vmstats.v_page_count > 1024)
1715                 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
1716         else
1717                 vmstats.v_free_min = 64;
1718         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
1719         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
1720         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
1721         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
1722
1723         return 1;
1724 }
1725
1726
1727 /*
1728  * vm_pageout is the high level pageout daemon.
1729  *
1730  * No requirements.
1731  */
1732 static void
1733 vm_pageout_thread(void)
1734 {
1735         int pass;
1736         int q;
1737
1738         /*
1739          * Initialize some paging parameters.
1740          */
1741         curthread->td_flags |= TDF_SYSTHREAD;
1742
1743         if (vmstats.v_page_count < 2000)
1744                 vm_pageout_page_count = 8;
1745
1746         vm_pageout_free_page_calc(vmstats.v_page_count);
1747
1748         /*
1749          * v_free_target and v_cache_min control pageout hysteresis.  Note
1750          * that these are more a measure of the VM cache queue hysteresis
1751          * then the VM free queue.  Specifically, v_free_target is the
1752          * high water mark (free+cache pages).
1753          *
1754          * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1755          * low water mark, while v_free_min is the stop.  v_cache_min must
1756          * be big enough to handle memory needs while the pageout daemon
1757          * is signalled and run to free more pages.
1758          */
1759         if (vmstats.v_free_count > 6144)
1760                 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
1761         else
1762                 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
1763
1764         /*
1765          * NOTE: With the new buffer cache b_act_count we want the default
1766          *       inactive target to be a percentage of available memory.
1767          *
1768          *       The inactive target essentially determines the minimum
1769          *       number of 'temporary' pages capable of caching one-time-use
1770          *       files when the VM system is otherwise full of pages
1771          *       belonging to multi-time-use files or active program data.
1772          *
1773          * NOTE: The inactive target is aggressively persued only if the
1774          *       inactive queue becomes too small.  If the inactive queue
1775          *       is large enough to satisfy page movement to free+cache
1776          *       then it is repopulated more slowly from the active queue.
1777          *       This allows a general inactive_target default to be set.
1778          *
1779          *       There is an issue here for processes which sit mostly idle
1780          *       'overnight', such as sshd, tcsh, and X.  Any movement from
1781          *       the active queue will eventually cause such pages to
1782          *       recycle eventually causing a lot of paging in the morning.
1783          *       To reduce the incidence of this pages cycled out of the
1784          *       buffer cache are moved directly to the inactive queue if
1785          *       they were only used once or twice.
1786          *
1787          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
1788          *       Increasing the value (up to 64) increases the number of
1789          *       buffer recyclements which go directly to the inactive queue.
1790          */
1791         if (vmstats.v_free_count > 2048) {
1792                 vmstats.v_cache_min = vmstats.v_free_target;
1793                 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
1794         } else {
1795                 vmstats.v_cache_min = 0;
1796                 vmstats.v_cache_max = 0;
1797         }
1798         vmstats.v_inactive_target = vmstats.v_free_count / 4;
1799
1800         /* XXX does not really belong here */
1801         if (vm_page_max_wired == 0)
1802                 vm_page_max_wired = vmstats.v_free_count / 3;
1803
1804         if (vm_pageout_stats_max == 0)
1805                 vm_pageout_stats_max = vmstats.v_free_target;
1806
1807         /*
1808          * Set interval in seconds for stats scan.
1809          */
1810         if (vm_pageout_stats_interval == 0)
1811                 vm_pageout_stats_interval = 5;
1812         if (vm_pageout_full_stats_interval == 0)
1813                 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1814         
1815
1816         /*
1817          * Set maximum free per pass
1818          */
1819         if (vm_pageout_stats_free_max == 0)
1820                 vm_pageout_stats_free_max = 5;
1821
1822         swap_pager_swap_init();
1823         pass = 0;
1824
1825         /*
1826          * The pageout daemon is never done, so loop forever.
1827          */
1828         while (TRUE) {
1829                 int error;
1830                 int delta1;
1831                 int delta2;
1832                 int avail_shortage;
1833                 int inactive_shortage;
1834                 int vnodes_skipped = 0;
1835                 int recycle_count = 0;
1836                 int tmp;
1837
1838                 /*
1839                  * Wait for an action request.  If we timeout check to
1840                  * see if paging is needed (in case the normal wakeup
1841                  * code raced us).
1842                  */
1843                 if (vm_pages_needed == 0) {
1844                         error = tsleep(&vm_pages_needed,
1845                                        0, "psleep",
1846                                        vm_pageout_stats_interval * hz);
1847                         if (error &&
1848                             vm_paging_needed() == 0 &&
1849                             vm_pages_needed == 0) {
1850                                 for (q = 0; q < PQ_L2_SIZE; ++q)
1851                                         vm_pageout_page_stats(q);
1852                                 continue;
1853                         }
1854                         vm_pages_needed = 1;
1855                 }
1856
1857                 mycpu->gd_cnt.v_pdwakeups++;
1858
1859                 /*
1860                  * Do whatever cleanup that the pmap code can.
1861                  */
1862                 pmap_collect();
1863
1864                 /*
1865                  * Scan for pageout.  Try to avoid thrashing the system
1866                  * with activity.
1867                  *
1868                  * Calculate our target for the number of free+cache pages we
1869                  * want to get to.  This is higher then the number that causes
1870                  * allocations to stall (severe) in order to provide hysteresis,
1871                  * and if we don't make it all the way but get to the minimum
1872                  * we're happy.  Goose it a bit if there are multipler
1873                  * requests for memory.
1874                  */
1875                 avail_shortage = vm_paging_target() + vm_pageout_deficit;
1876                 vm_pageout_deficit = 0;
1877                 delta1 = 0;
1878                 if (avail_shortage > 0) {
1879                         for (q = 0; q < PQ_L2_SIZE; ++q) {
1880                                 delta1 += vm_pageout_scan_inactive(
1881                                             pass, q,
1882                                             PQAVERAGE(avail_shortage),
1883                                             &vnodes_skipped);
1884                         }
1885                         avail_shortage -= delta1;
1886                 }
1887
1888                 /*
1889                  * Figure out how many active pages we must deactivate.  If
1890                  * we were able to reach our target with just the inactive
1891                  * scan above we limit the number of active pages we
1892                  * deactivate to reduce unnecessary work.
1893                  */
1894                 inactive_shortage = vmstats.v_inactive_target -
1895                                     vmstats.v_inactive_count;
1896
1897                 /*
1898                  * If we were unable to free sufficient inactive pages to
1899                  * satisfy the free/cache queue requirements then simply
1900                  * reaching the inactive target may not be good enough.
1901                  * Try to deactivate pages in excess of the target based
1902                  * on the shortfall.
1903                  *
1904                  * However to prevent thrashing the VM system do not
1905                  * deactivate more than an additional 1/10 the inactive
1906                  * target's worth of active pages.
1907                  */
1908                 if (avail_shortage > 0) {
1909                         tmp = avail_shortage * 2;
1910                         if (tmp > vmstats.v_inactive_target / 10)
1911                                 tmp = vmstats.v_inactive_target / 10;
1912                         inactive_shortage += tmp;
1913                 }
1914
1915                 if (avail_shortage > 0 || inactive_shortage > 0) {
1916                         delta2 = 0;
1917                         for (q = 0; q < PQ_L2_SIZE; ++q) {
1918                                 delta2 += vm_pageout_scan_active(
1919                                                 pass, q,
1920                                                 PQAVERAGE(avail_shortage),
1921                                                 PQAVERAGE(inactive_shortage),
1922                                                 &recycle_count);
1923                         }
1924                         inactive_shortage -= delta2;
1925                         avail_shortage -= delta2;
1926                 }
1927
1928                 /*
1929                  * Finally free enough cache pages to meet our free page
1930                  * requirement and take more drastic measures if we are
1931                  * still in trouble.
1932                  */
1933                 vm_pageout_scan_cache(avail_shortage, vnodes_skipped,
1934                                       recycle_count);
1935
1936                 /*
1937                  * Wait for more work.
1938                  */
1939                 if (avail_shortage > 0) {
1940                         ++pass;
1941                         if (swap_pager_full) {
1942                                 /*
1943                                  * Running out of memory, catastrophic back-off
1944                                  * to one-second intervals.
1945                                  */
1946                                 tsleep(&vm_pages_needed, 0, "pdelay", hz);
1947                         } else if (pass < 10 && vm_pages_needed > 1) {
1948                                 /*
1949                                  * Normal operation, additional processes
1950                                  * have already kicked us.  Retry immediately.
1951                                  */
1952                         } else if (pass < 10) {
1953                                 /*
1954                                  * Normal operation, fewer processes.  Delay
1955                                  * a bit but allow wakeups.
1956                                  */
1957                                 vm_pages_needed = 0;
1958                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1959                                 vm_pages_needed = 1;
1960                         } else {
1961                                 /*
1962                                  * We've taken too many passes, forced delay.
1963                                  */
1964                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1965                         }
1966                 } else {
1967                         /*
1968                          * Interlocked wakeup of waiters (non-optional)
1969                          */
1970                         pass = 0;
1971                         if (vm_pages_needed && !vm_page_count_min(0)) {
1972                                 wakeup(&vmstats.v_free_count);
1973                                 vm_pages_needed = 0;
1974                         }
1975                 }
1976         }
1977 }
1978
1979 static struct kproc_desc page_kp = {
1980         "pagedaemon",
1981         vm_pageout_thread,
1982         &pagethread
1983 };
1984 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
1985
1986
1987 /*
1988  * Called after allocating a page out of the cache or free queue
1989  * to possibly wake the pagedaemon up to replentish our supply.
1990  *
1991  * We try to generate some hysteresis by waking the pagedaemon up
1992  * when our free+cache pages go below the free_min+cache_min level.
1993  * The pagedaemon tries to get the count back up to at least the
1994  * minimum, and through to the target level if possible.
1995  *
1996  * If the pagedaemon is already active bump vm_pages_needed as a hint
1997  * that there are even more requests pending.
1998  *
1999  * SMP races ok?
2000  * No requirements.
2001  */
2002 void
2003 pagedaemon_wakeup(void)
2004 {
2005         if (vm_paging_needed() && curthread != pagethread) {
2006                 if (vm_pages_needed == 0) {
2007                         vm_pages_needed = 1;    /* SMP race ok */
2008                         wakeup(&vm_pages_needed);
2009                 } else if (vm_page_count_min(0)) {
2010                         ++vm_pages_needed;      /* SMP race ok */
2011                 }
2012         }
2013 }
2014
2015 #if !defined(NO_SWAPPING)
2016
2017 /*
2018  * SMP races ok?
2019  * No requirements.
2020  */
2021 static void
2022 vm_req_vmdaemon(void)
2023 {
2024         static int lastrun = 0;
2025
2026         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2027                 wakeup(&vm_daemon_needed);
2028                 lastrun = ticks;
2029         }
2030 }
2031
2032 static int vm_daemon_callback(struct proc *p, void *data __unused);
2033
2034 /*
2035  * No requirements.
2036  */
2037 static void
2038 vm_daemon(void)
2039 {
2040         /*
2041          * XXX vm_daemon_needed specific token?
2042          */
2043         while (TRUE) {
2044                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2045                 if (vm_pageout_req_swapout) {
2046                         swapout_procs(vm_pageout_req_swapout);
2047                         vm_pageout_req_swapout = 0;
2048                 }
2049                 /*
2050                  * scan the processes for exceeding their rlimits or if
2051                  * process is swapped out -- deactivate pages
2052                  */
2053                 allproc_scan(vm_daemon_callback, NULL);
2054         }
2055 }
2056
2057 /*
2058  * Caller must hold proc_token.
2059  */
2060 static int
2061 vm_daemon_callback(struct proc *p, void *data __unused)
2062 {
2063         vm_pindex_t limit, size;
2064
2065         /*
2066          * if this is a system process or if we have already
2067          * looked at this process, skip it.
2068          */
2069         if (p->p_flags & (P_SYSTEM | P_WEXIT))
2070                 return (0);
2071
2072         /*
2073          * if the process is in a non-running type state,
2074          * don't touch it.
2075          */
2076         if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
2077                 return (0);
2078
2079         /*
2080          * get a limit
2081          */
2082         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2083                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
2084
2085         /*
2086          * let processes that are swapped out really be
2087          * swapped out.  Set the limit to nothing to get as
2088          * many pages out to swap as possible.
2089          */
2090         if (p->p_flags & P_SWAPPEDOUT)
2091                 limit = 0;
2092
2093         lwkt_gettoken(&p->p_vmspace->vm_map.token);
2094         size = vmspace_resident_count(p->p_vmspace);
2095         if (limit >= 0 && size >= limit) {
2096                 vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, limit);
2097         }
2098         lwkt_reltoken(&p->p_vmspace->vm_map.token);
2099         return (0);
2100 }
2101
2102 #endif