kernel - Add flexibility to the RSS rlimit
[dragonfly.git] / sys / vm / vm_pageout.c
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
65  */
66
67 /*
68  *      The proverbial page-out daemon.
69  */
70
71 #include "opt_vm.h"
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/proc.h>
76 #include <sys/kthread.h>
77 #include <sys/resourcevar.h>
78 #include <sys/signalvar.h>
79 #include <sys/vnode.h>
80 #include <sys/vmmeter.h>
81 #include <sys/sysctl.h>
82
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <sys/lock.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_pager.h>
91 #include <vm/swap_pager.h>
92 #include <vm/vm_extern.h>
93
94 #include <sys/thread2.h>
95 #include <sys/spinlock2.h>
96 #include <vm/vm_page2.h>
97
98 /*
99  * System initialization
100  */
101
102 /* the kernel process "vm_pageout"*/
103 static int vm_pageout_page(vm_page_t m, int *max_launderp,
104                            int *vnodes_skippedp, struct vnode **vpfailedp,
105                            int pass, int vmflush_flags);
106 static int vm_pageout_clean_helper (vm_page_t, int);
107 static int vm_pageout_free_page_calc (vm_size_t count);
108 static void vm_pageout_page_free(vm_page_t m);
109 struct thread *pagethread;
110
111 #if !defined(NO_SWAPPING)
112 /* the kernel process "vm_daemon"*/
113 static void vm_daemon (void);
114 static struct   thread *vmthread;
115
116 static struct kproc_desc vm_kp = {
117         "vmdaemon",
118         vm_daemon,
119         &vmthread
120 };
121 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
122 #endif
123
124 int vm_pages_needed = 0;        /* Event on which pageout daemon sleeps */
125 int vm_pageout_deficit = 0;     /* Estimated number of pages deficit */
126 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
127 int vm_page_free_hysteresis = 16;
128
129 #if !defined(NO_SWAPPING)
130 static int vm_pageout_req_swapout;
131 static int vm_daemon_needed;
132 #endif
133 static int vm_max_launder = 4096;
134 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
135 static int vm_pageout_full_stats_interval = 0;
136 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
137 static int defer_swap_pageouts=0;
138 static int disable_swap_pageouts=0;
139 static u_int vm_anonmem_decline = ACT_DECLINE;
140 static u_int vm_filemem_decline = ACT_DECLINE * 2;
141
142 #if defined(NO_SWAPPING)
143 static int vm_swap_enabled=0;
144 static int vm_swap_idle_enabled=0;
145 #else
146 static int vm_swap_enabled=1;
147 static int vm_swap_idle_enabled=0;
148 #endif
149 int vm_pageout_memuse_mode=1;   /* 0-disable, 1-passive, 2-active swp*/
150
151 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
152         CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
153
154 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
155         CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
156
157 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
158         CTLFLAG_RW, &vm_page_free_hysteresis, 0,
159         "Free more pages than the minimum required");
160
161 SYSCTL_INT(_vm, OID_AUTO, max_launder,
162         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
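/*
 * Runtime tunable, e.g. "sysctl vm.max_launder=32" caps the number of
 * dirty-page flushes attempted during an inactive-queue scan pass.
 */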
163
164 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
165         CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
166
167 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
168         CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
169
170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
171         CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
172
173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
174         CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
175 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
176         CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
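/*
 * vm.pageout_memuse_mode selects how RLIMIT_RSS overage is handled:
 * 0 disables enforcement, 1 passively unmaps pages from the offending
 * pmap, and 2 additionally pages them out (see
 * vm_pageout_object_deactivate_pages_callback()).
 */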
177
178 #if defined(NO_SWAPPING)
179 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
180         CTLFLAG_RD, &vm_swap_enabled, 0, "");
181 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
182         CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
183 #else
184 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
185         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
186 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
187         CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
188 #endif
189
190 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
191         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
192
193 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
194         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
195
196 static int pageout_lock_miss;
197 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
198         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
199
200 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
201
202 #if !defined(NO_SWAPPING)
203 static vm_pindex_t vm_pageout_object_deactivate_pages(vm_map_t map,
204                         vm_object_t object, vm_pindex_t limit,
205                         vm_pindex_t obj_beg, vm_pindex_t obj_end);
206 static void vm_req_vmdaemon (void);
207 #endif
208 static void vm_pageout_page_stats(int q);
209
210 /*
211  * Calculate approximately how many pages on each queue to try to
212  * clean.  An exact calculation creates an edge condition when the
213  * queues are unbalanced so add significant slop.  The queue scans
214  * will stop early when targets are reached and will start where they
215  * left off on the next pass.
216  *
217  * We need to be generous here because there are all sorts of loading
218  * conditions that can cause edge cases if we try to average over all queues.
219  * In particular, storage subsystems have become so fast that paging
220  * activity can become quite frantic.  Eventually we will probably need
221  * two paging threads, one for dirty pages and one for clean, to deal
222  * with the bandwidth requirements.
223  *
224  * So what we do is calculate a value that can be satisfied nominally by
225  * only having to scan half the queues.
226  */
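/*
 * For example, assuming PQ_L2_SIZE is 256: a shortage of n = 1000 yields
 * (1000 + 255) / 128 + 1 = 10 pages per queue, so scanning roughly half
 * of the 256 queues (128 * 10 = 1280) nominally covers the shortage.
 */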
227 static __inline int
228 PQAVERAGE(int n)
229 {
230         int avg;
231
232         if (n >= 0) {
233                 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
234         } else {
235                 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
236         }
237         return avg;
238 }
239
240 /*
241  * vm_pageout_clean_helper:
242  *
243  * Clean the page and remove it from the laundry.  The page must be
244  * busied by the caller and will be disposed of by this function.
245  *
246  * The busy bit causes potential page faults on this page to block while
247  * we build the flush cluster.  Note the careful timing: we cannot do
248  * anything that might otherwise mess with the page.
249  */
250 static int
251 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
252 {
253         vm_object_t object;
254         vm_page_t mc[BLIST_MAX_ALLOC];
255         int error;
256         int ib, is, page_base;
257         vm_pindex_t pindex = m->pindex;
258
259         object = m->object;
260
261         /*
262          * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
263          * with the new swapper, but we could have serious problems paging
264          * out other object types if there is insufficient memory.  
265          *
266          * Unfortunately, checking free memory here is far too late, so the
267          * check has been moved up a procedural level.
268          */
269
270         /*
271          * Don't mess with the page if it's busy, held, or special
272          *
273          * XXX do we really need to check hold_count here?  hold_count
274          * isn't supposed to mess with vm_page ops except prevent the
275          * page from being reused.
276          */
277         if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
278                 vm_page_wakeup(m);
279                 return 0;
280         }
281
282         /*
283          * Place page in cluster.  Align cluster for optimal swap space
284          * allocation (whether it is swap or not).  This is typically ~16-32
285          * pages, which also tends to align the cluster to multiples of the
286          * filesystem block size if backed by a filesystem.
287          */
288         page_base = pindex % BLIST_MAX_ALLOC;
289         mc[page_base] = m;
290         ib = page_base - 1;
291         is = page_base + 1;
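	/*
	 * Example, assuming BLIST_MAX_ALLOC is 16: a page at pindex 37 is
	 * placed in slot 37 % 16 = 5, so the reverse scan below covers
	 * pindexes 36..32 and the forward scan covers 38..47, keeping the
	 * flush aligned to a 16-page swap allocation boundary.
	 */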
292
293         /*
294          * Scan object for clusterable pages.
295          *
296          * We can cluster ONLY if the page is NOT
297          * clean, wired, busy, held, or mapped into a
298          * buffer, and one of the following:
299          * 1) The page is inactive, or a seldom used
300          *    active page.
301          * -or-
302          * 2) we force the issue.
303          *
304          * During heavy mmap/modification loads the pageout
305          * daemon can really fragment the underlying file
306          * due to flushing pages out of order and not trying to
307          * align the clusters (which leaves sporadic out-of-order
308          * holes).  To solve this problem we do the reverse scan
309          * first and attempt to align our cluster, then do a 
310          * forward scan if room remains.
311          */
312         vm_object_hold(object);
313
314         while (ib >= 0) {
315                 vm_page_t p;
316
317                 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
318                                             TRUE, &error);
319                 if (error || p == NULL)
320                         break;
321                 if ((p->queue - p->pc) == PQ_CACHE ||
322                     (p->flags & PG_UNMANAGED)) {
323                         vm_page_wakeup(p);
324                         break;
325                 }
326                 vm_page_test_dirty(p);
327                 if (((p->dirty & p->valid) == 0 &&
328                      (p->flags & PG_NEED_COMMIT) == 0) ||
329                     p->wire_count != 0 ||       /* may be held by buf cache */
330                     p->hold_count != 0) {       /* may be undergoing I/O */
331                         vm_page_wakeup(p);
332                         break;
333                 }
334                 if (p->queue - p->pc != PQ_INACTIVE) {
335                         if (p->queue - p->pc != PQ_ACTIVE ||
336                             (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
337                                 vm_page_wakeup(p);
338                                 break;
339                         }
340                 }
341
342                 /*
343                  * Try to maintain page groupings in the cluster.
344                  */
345                 if (m->flags & PG_WINATCFLS)
346                         vm_page_flag_set(p, PG_WINATCFLS);
347                 else
348                         vm_page_flag_clear(p, PG_WINATCFLS);
349                 p->act_count = m->act_count;
350
351                 mc[ib] = p;
352                 --ib;
353         }
354         ++ib;   /* fixup */
355
356         while (is < BLIST_MAX_ALLOC &&
357                pindex - page_base + is < object->size) {
358                 vm_page_t p;
359
360                 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
361                                             TRUE, &error);
362                 if (error || p == NULL)
363                         break;
364                 if (((p->queue - p->pc) == PQ_CACHE) ||
365                     (p->flags & PG_UNMANAGED)) {
366                         vm_page_wakeup(p);
367                         break;
368                 }
369                 vm_page_test_dirty(p);
370                 if (((p->dirty & p->valid) == 0 &&
371                      (p->flags & PG_NEED_COMMIT) == 0) ||
372                     p->wire_count != 0 ||       /* may be held by buf cache */
373                     p->hold_count != 0) {       /* may be undergoing I/O */
374                         vm_page_wakeup(p);
375                         break;
376                 }
377                 if (p->queue - p->pc != PQ_INACTIVE) {
378                         if (p->queue - p->pc != PQ_ACTIVE ||
379                             (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
380                                 vm_page_wakeup(p);
381                                 break;
382                         }
383                 }
384
385                 /*
386                  * Try to maintain page groupings in the cluster.
387                  */
388                 if (m->flags & PG_WINATCFLS)
389                         vm_page_flag_set(p, PG_WINATCFLS);
390                 else
391                         vm_page_flag_clear(p, PG_WINATCFLS);
392                 p->act_count = m->act_count;
393
394                 mc[is] = p;
395                 ++is;
396         }
397
398         vm_object_drop(object);
399
400         /*
401          * we allow reads during pageouts...
402          */
403         return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
404 }
405
406 /*
407  * vm_pageout_flush() - launder the given pages
408  *
409  *      The given pages are laundered.  Note that we set up for the start of
410  *      I/O (i.e. busy the page), mark it read-only, and bump the object
411  *      reference count all in here rather than in the parent.  If we want
412  *      the parent to do more sophisticated things we may have to change
413  *      the ordering.
414  *
415  *      The pages in the array must be busied by the caller and will be
416  *      unbusied by this function.
417  */
418 int
419 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
420 {
421         vm_object_t object;
422         int pageout_status[count];
423         int numpagedout = 0;
424         int i;
425
426         /*
427          * Initiate I/O.  Bump the vm_page_t->busy counter.
428          */
429         for (i = 0; i < count; i++) {
430                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
431                         ("vm_pageout_flush page %p index %d/%d: partially "
432                          "invalid page", mc[i], i, count));
433                 vm_page_io_start(mc[i]);
434         }
435
436         /*
437          * We must make the pages read-only.  This will also force the
438          * modified bit in the related pmaps to be cleared.  The pager
439          * cannot clear the bit for us since the I/O completion code
440          * typically runs from an interrupt.  The act of making the page
441          * read-only handles the case for us.
442          *
443          * Then we can unbusy the pages, we still hold a reference by virtue
444          * of our soft-busy.
445          */
446         for (i = 0; i < count; i++) {
447                 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
448                         vm_page_protect(mc[i], VM_PROT_NONE);
449                 else
450                         vm_page_protect(mc[i], VM_PROT_READ);
451                 vm_page_wakeup(mc[i]);
452         }
453
454         object = mc[0]->object;
455         vm_object_pip_add(object, count);
456
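	/*
	 * Pageouts of kernel_object pages are forced synchronous; everything
	 * else honors the caller's vmflush_flags.
	 */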
457         vm_pager_put_pages(object, mc, count,
458             (vmflush_flags |
459              ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
460             pageout_status);
461
462         for (i = 0; i < count; i++) {
463                 vm_page_t mt = mc[i];
464
465                 switch (pageout_status[i]) {
466                 case VM_PAGER_OK:
467                         numpagedout++;
468                         break;
469                 case VM_PAGER_PEND:
470                         numpagedout++;
471                         break;
472                 case VM_PAGER_BAD:
473                         /*
474                          * Page outside of range of object. Right now we
475                          * essentially lose the changes by pretending it
476                          * worked.
477                          */
478                         vm_page_busy_wait(mt, FALSE, "pgbad");
479                         pmap_clear_modify(mt);
480                         vm_page_undirty(mt);
481                         vm_page_wakeup(mt);
482                         break;
483                 case VM_PAGER_ERROR:
484                 case VM_PAGER_FAIL:
485                         /*
486                          * A page typically cannot be paged out when we
487                          * have run out of swap.  We leave the page
488                          * marked inactive and will try to page it out
489                          * again later.
490                          *
491                          * Starvation of the active page list is used to
492                          * determine when the system is massively memory
493                          * starved.
494                          */
495                         break;
496                 case VM_PAGER_AGAIN:
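			/*
			 * Transient pager failure; the page is left alone
			 * and will be revisited on a later pass.
			 */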
497                         break;
498                 }
499
500                 /*
501                  * If not PENDing this was a synchronous operation and we
502                  * clean up after the I/O.  If it is PENDing the mess is
503                  * cleaned up asynchronously.
504                  *
505                  * Also nominally act on the caller's wishes if the caller
506                  * wants to try to really clean (cache or free) the page.
507                  *
508                  * Also nominally deactivate the page if the system is
509                  * memory-stressed.
510                  */
511                 if (pageout_status[i] != VM_PAGER_PEND) {
512                         vm_page_busy_wait(mt, FALSE, "pgouw");
513                         vm_page_io_finish(mt);
514                         if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
515                                 vm_page_try_to_cache(mt);
516                         } else if (vm_page_count_severe()) {
517                                 vm_page_deactivate(mt);
518                                 vm_page_wakeup(mt);
519                         } else {
520                                 vm_page_wakeup(mt);
521                         }
522                         vm_object_pip_wakeup(object);
523                 }
524         }
525         return numpagedout;
526 }
527
528 #if !defined(NO_SWAPPING)
529
530 /*
531  * Deactivate pages until the map RSS falls below the specified limit.
532  *
533  * This code is part of the process rlimit and vm_daemon handler and not
534  * part of the normal demand-paging code.  We only check the top-level
535  * object.
536  *
537  * The map must be locked.
538  * The caller must hold the vm_object.
539  */
540 static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
541 static int vm_pageout_object_deactivate_pages_cmp(vm_page_t, void *);
542
543 static vm_pindex_t
544 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
545                                    vm_pindex_t limit,
546                                    vm_pindex_t obj_beg,
547                                    vm_pindex_t obj_end)
548 {
549         struct rb_vm_page_scan_info info;
550         int remove_mode;
551
552         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
553
554         info.count = 0;
555         info.backing_offset_index = obj_beg;
556         info.backing_object = object;
557
558         for (;;) {
559                 vm_pindex_t advance;
560
561                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) <= limit)
562                         break;
563                 if (object->type == OBJT_DEVICE ||
564                     object->type == OBJT_MGTDEVICE ||
565                     object->type == OBJT_PHYS) {
566                         break;
567                 }
568 #if 0
569                 if (object->paging_in_progress)
570                         break;
571 #endif
572
573                 remove_mode = 0;
574                 if (object->shadow_count > 1)
575                         remove_mode = 1;
576
577                 /*
578                  * Scan the object's entire memory queue.  We hold the
579                  * object's token so the scan should not race anything.
580                  *
581                  * The callback will adjust backing_offset_index past the
582                  * last index scanned.  This value only matters if we
583                  * terminate early.
584                  */
585                 info.limit = remove_mode;
586                 info.map = map;
587                 info.desired = limit;
588                 info.start_pindex = obj_beg;
589                 info.end_pindex = obj_end;
590                 info.object = object;
591
592                 vm_page_rb_tree_RB_SCAN(&object->rb_memq,
593                                 vm_pageout_object_deactivate_pages_cmp,
594                                 vm_pageout_object_deactivate_pages_callback,
595                                 &info);
596
597                 /*
598                  * Backing object recursion (we will loop up).
599                  */
600                 while ((object = info.object->backing_object) != NULL) {
601                         vm_object_hold(object);
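			/*
			 * vm_object_hold() can block, so the backing object
			 * may have changed underneath us; if so, drop the
			 * hold and re-resolve it.
			 */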
602                         if (object != info.object->backing_object) {
603                                 vm_object_drop(object);
604                                 continue;
605                         }
606                         break;
607                 }
608                 if (object == NULL) {
609                         if (info.object != info.backing_object)
610                                 vm_object_drop(info.object);
611                         break;
612                 }
613                 advance = OFF_TO_IDX(info.object->backing_object_offset);
614                 info.start_pindex += advance;
615                 info.end_pindex += advance;
616                 info.backing_offset_index += advance;
617                 if (info.object != info.backing_object) {
618                         vm_object_lock_swap();
619                         vm_object_drop(info.object);
620                 }
621                 info.object = object;
622         }
623
624         /*
625          * Return how far we want the caller to advance.  The caller will
626          * ignore this value and use obj_end if the RSS limit is still not
627          * satisfied.
628          */
629         return (info.backing_offset_index - info.start_pindex);
630 }
631
632 /*
633  * Only page indices above start_pindex
634  */
635 static
636 int
637 vm_pageout_object_deactivate_pages_cmp(vm_page_t p, void *data)
638 {
639         struct rb_vm_page_scan_info *info = data;
640
641         if (p->pindex < info->start_pindex)
642                 return -1;
643         if (p->pindex >= info->end_pindex)
644                 return +1;
645         return 0;
646 }
647
648 /*
649  * The caller must hold the vm_object.
650  *
651  * info->count is bumped for every page removed from the process pmap.
652  *
653  * info->backing_offset_index is updated past the last scanned page.
654  * This value will be ignored and the scan forced to the mapent boundary
655  * by the caller if the resident count remains too high.
656  */
657 static int
658 vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
659 {
660         struct rb_vm_page_scan_info *info = data;
661         int actcount;
662         int cleanit = 0;
663
664         /*
665          * Basic tests - There should never be a marker, and we can stop
666          *               once the RSS is below the required level.
667          */
668         KKASSERT((p->flags & PG_MARKER) == 0);
669         if (pmap_resident_tlnw_count(vm_map_pmap(info->map)) <=
670             info->desired) {
671                 return(-1);
672         }
673
674         mycpu->gd_cnt.v_pdpages++;
675         info->backing_offset_index = p->pindex + 1;
676
677         if (vm_page_busy_try(p, TRUE))
678                 return(0);
679
680         if (p->object != info->object) {
681                 vm_page_wakeup(p);
682                 return(0);
683         }
684         if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
685                 vm_page_wakeup(p);
686                 goto done;
687         }
688         if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
689                 vm_page_wakeup(p);
690                 goto done;
691         }
692
693         actcount = pmap_ts_referenced(p);
694         if (actcount) {
695                 vm_page_flag_set(p, PG_REFERENCED);
696         } else if (p->flags & PG_REFERENCED) {
697                 actcount = 1;
698         }
699
700         vm_page_and_queue_spin_lock(p);
701         if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
702                 vm_page_and_queue_spin_unlock(p);
703                 vm_page_activate(p);
704                 p->act_count += actcount;
705                 vm_page_flag_clear(p, PG_REFERENCED);
706         } else if (p->queue - p->pc == PQ_ACTIVE) {
707                 if ((p->flags & PG_REFERENCED) == 0) {
708                         /* use ACT_ADVANCE for a faster decline */
709                         p->act_count -= min(p->act_count, ACT_ADVANCE);
710                         if (!info->limit &&
711                             (vm_pageout_algorithm || (p->act_count == 0))) {
712                                 vm_page_and_queue_spin_unlock(p);
713                                 vm_page_deactivate(p);
714                                 cleanit = 1;
715                         } else {
716                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
717                                              p, pageq);
718                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
719                                                   p, pageq);
720                                 vm_page_and_queue_spin_unlock(p);
721                         }
722                 } else {
723                         vm_page_and_queue_spin_unlock(p);
724                         vm_page_activate(p);
725                         vm_page_flag_clear(p, PG_REFERENCED);
726
727                         vm_page_and_queue_spin_lock(p);
728                         if (p->queue - p->pc == PQ_ACTIVE) {
729                                 if (p->act_count < (ACT_MAX - ACT_ADVANCE))
730                                         p->act_count += ACT_ADVANCE;
731                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
732                                              p, pageq);
733                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
734                                                   p, pageq);
735                         }
736                         vm_page_and_queue_spin_unlock(p);
737                 }
738         } else if (p->queue - p->pc == PQ_INACTIVE) {
739 #if 0
740                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
741                              p, pageq);
742                 TAILQ_INSERT_HEAD(&vm_page_queues[p->queue].pl,
743                                   p, pageq);
744 #endif
745                 /* use ACT_ADVANCE for a faster decline */
746                 p->act_count -= min(p->act_count, ACT_ADVANCE);
747                 vm_page_and_queue_spin_unlock(p);
748                 if (p->act_count == 0) {
749                         cleanit = 1;
750                 }
751         } else {
752                 vm_page_and_queue_spin_unlock(p);
753         }
754
755         /*
756          * Ok, try to fully clean the page and any nearby pages such that at
757          * least the requested page is freed or moved to the cache queue.
758          *
759          * We usually do this synchronously to allow us to get the page into
760          * the CACHE queue quickly, which will prevent memory exhaustion if
761          * a process with a memoryuse limit is running away.  However, the
762          * sysadmin may desire to set vm.swap_user_async which relaxes this
763          * and improves write performance.
764          */
765         if (cleanit) {
766                 int max_launder = 0x7FFF;
767                 int vnodes_skipped = 0;
768                 int vmflush_flags;
769                 struct vnode *vpfailed = NULL;
770
771                 vmflush_flags = VM_PAGER_TRY_TO_CACHE | VM_PAGER_ALLOW_ACTIVE;
772                 if (swap_user_async == 0)
773                         vmflush_flags |= VM_PAGER_PUT_SYNC;
774
775                 if (vm_pageout_memuse_mode >= 1)
776                         vm_page_protect(p, VM_PROT_NONE);
777                 if (vm_pageout_memuse_mode >= 2) {
778                         vm_page_flag_set(p, PG_WINATCFLS);
779                         info->count += vm_pageout_page(p, &max_launder,
780                                                        &vnodes_skipped,
781                                                        &vpfailed, 1, vmflush_flags);
782                 } else {
783                         ++info->count;
784                         vm_page_wakeup(p);
785                 }
786         } else {
787                 vm_page_wakeup(p);
788         }
789
790 done:
791         lwkt_user_yield();
792         return(0);
793 }
794
795 /*
796  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits;
797  * this is relatively difficult to do.
798  *
799  * Called when vm_pageout_memuse_mode is >= 1.
800  */
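/*
 * Illustrative userland usage (not part of this file): a process typically
 * becomes subject to this enforcement by setting an RSS limit, e.g.
 *
 *	struct rlimit rl = { .rlim_cur = 64UL << 20, .rlim_max = 64UL << 20 };
 *	setrlimit(RLIMIT_RSS, &rl);
 *
 * after which the vm_daemon may invoke this routine on the process' map
 * once its resident set exceeds the limit and vm.pageout_memuse_mode >= 1.
 */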
801 void
802 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
803 {
804         vm_map_entry_t tmpe;
805         vm_object_t obj;
806         vm_ooffset_t pgout_offset;
807         vm_ooffset_t tmpe_end;
808         vm_pindex_t obj_beg;
809         vm_pindex_t obj_end;
810         vm_pindex_t count;
811         int retries = 3;
812
813         lockmgr(&map->lock, LK_EXCLUSIVE);
814
815         /*
816          * Scan the map incrementally.
817          */
818         pgout_offset = map->pgout_offset;
819 again:
820         tmpe = map->header.next;
821         obj_beg = 0;
822         obj_end = 0;
823         tmpe_end = 0;
824         obj = NULL;
825
826         while (tmpe != &map->header) {
827                 if (tmpe->end <= pgout_offset) {
828                         tmpe = tmpe->next;
829                         continue;
830                 }
831                 if (tmpe->maptype == VM_MAPTYPE_NORMAL ||
832                     tmpe->maptype == VM_MAPTYPE_VPAGETABLE) {
833                         obj = tmpe->object.vm_object;
834                         if (obj && obj->shadow_count <= 1) {
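				/*
				 * Convert the map entry's address range,
				 * clipped to the scan resume point, into page
				 * indices within the backing object using the
				 * entry's offset into that object.
				 */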
835                                 if (pgout_offset < tmpe->start) {
836                                         obj_beg = tmpe->offset >> PAGE_SHIFT;
837                                         obj_end = ((tmpe->end - tmpe->start) +
838                                                    tmpe->offset) >> PAGE_SHIFT;
839                                 } else {
840                                         obj_beg = (pgout_offset - tmpe->start +
841                                                    tmpe->offset) >> PAGE_SHIFT;
842                                         obj_end = (tmpe->end - tmpe->start +
843                                                    tmpe->offset) >> PAGE_SHIFT;
844                                 }
845                                 tmpe_end = tmpe->end;
846                                 break;
847                         }
848                         obj = NULL;
849                 }
850                 tmpe = tmpe->next;
851         }
852
853         /*
854          * Attempt to continue where we left off until the RLIMIT is
855          * satisfied or we run out of retries.  Note that the map remains
856          * locked, so the program is not going to be taking any faults
857          * while we are doing this.
858          *
859          * Only circle around in this particular function when the
860          * memuse_mode is >= 2.
861          */
862         if (obj)  {
863                 vm_object_hold(obj);
864                 count = vm_pageout_object_deactivate_pages(map, obj, limit,
865                                                    obj_beg, obj_end);
866                 vm_object_drop(obj);
867                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
868                         pgout_offset = tmpe_end;
869                         goto again;
870                 }
871                 pgout_offset += count << PAGE_SHIFT;
872         } else {
873                 pgout_offset = 0;
874                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
875                         if (retries && vm_pageout_memuse_mode >= 2) {
876                                 --retries;
877                                 goto again;
878                         }
879                 }
880         }
881
882         map->pgout_offset = pgout_offset;
883
884         vm_map_unlock(map);
885 }
886 #endif
887
888 /*
889  * Called when the pageout scan wants to free a page.  We no longer
890  * try to cycle the vm_object here with a reference & dealloc, which can
891  * cause a non-trivial object collapse in a critical path.
892  *
893  * It is unclear why we cycled the ref_count in the past, perhaps to try
894  * to optimize shadow chain collapses, but I don't quite see why it would
895  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
896  * synchronously and not have to be kick-started.
897  */
898 static void
899 vm_pageout_page_free(vm_page_t m) 
900 {
901         vm_page_protect(m, VM_PROT_NONE);
902         vm_page_free(m);
903 }
904
905 /*
906  * vm_pageout_scan does the dirty work for the pageout daemon.
907  */
908 struct vm_pageout_scan_info {
909         struct proc *bigproc;
910         vm_offset_t bigsize;
911 };
912
913 static int vm_pageout_scan_callback(struct proc *p, void *data);
914
915 static int
916 vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
917                          int *vnodes_skipped)
918 {
919         vm_page_t m;
920         struct vm_page marker;
921         struct vnode *vpfailed;         /* warning, allowed to be stale */
922         int maxscan;
923         int delta = 0;
924         int max_launder;
925
926         /*
927          * Start scanning the inactive queue for pages we can move to the
928          * cache or free.  The scan will stop when the target is reached or
929          * we have scanned the entire inactive queue.  Note that m->act_count
930          * is not used to form decisions for the inactive queue, only for the
931          * active queue.
932          *
933          * max_launder limits the number of dirty pages we flush per scan.
934          * For most systems a smaller value (16 or 32) is more robust under
935          * extreme memory and disk pressure because any unnecessary writes
936          * to disk can result in extreme performance degradation.  However,
937          * systems with excessive dirty pages (especially when MAP_NOSYNC is
938          * used) will die horribly with limited laundering.  If the pageout
939          * daemon cannot clean enough pages in the first pass, we let it go
940          * all out in succeeding passes.
941          */
942         if ((max_launder = vm_max_launder) <= 1)
943                 max_launder = 1;
944         if (pass)
945                 max_launder = 10000;
946
947         /*
948          * Initialize our marker
949          */
950         bzero(&marker, sizeof(marker));
951         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
952         marker.queue = PQ_INACTIVE + q;
953         marker.pc = q;
954         marker.wire_count = 1;
955
956         /*
957          * Inactive queue scan.
958          *
959          * NOTE: The vm_page must be spinlocked before the queue to avoid
960          *       deadlocks, so it is easiest to simply iterate the loop
961          *       with the queue unlocked at the top.
962          */
963         vpfailed = NULL;
964
965         vm_page_queues_spin_lock(PQ_INACTIVE + q);
966         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
967         maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
968
969         /*
970          * Queue locked at top of loop to avoid stack marker issues.
971          */
972         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
973                maxscan-- > 0 && avail_shortage - delta > 0)
974         {
975                 int count;
976
977                 KKASSERT(m->queue == PQ_INACTIVE + q);
978                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
979                              &marker, pageq);
980                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
981                                    &marker, pageq);
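		/*
		 * The marker now trails (m); if we block below, the next
		 * iteration resumes the scan right after the page we are
		 * about to process.
		 */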
982                 mycpu->gd_cnt.v_pdpages++;
983
984                 /*
985                  * Skip marker pages (atomic against other markers to avoid
986                  * infinite hop-over scans).
987                  */
988                 if (m->flags & PG_MARKER)
989                         continue;
990
991                 /*
992                  * Try to busy the page.  Don't mess with pages which are
993                  * already busy or reorder them in the queue.
994                  */
995                 if (vm_page_busy_try(m, TRUE))
996                         continue;
997
998                 /*
999                  * Remaining operations run with the page busy and neither
1000                  * the page or the queue will be spin-locked.
1001                  */
1002                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1003                 KKASSERT(m->queue == PQ_INACTIVE + q);
1004
1005                 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
1006                                         &vpfailed, pass, 0);
1007                 delta += count;
1008
1009                 /*
1010                  * Systems with a ton of memory can wind up with huge
1011                  * deactivation counts.  Because the inactive scan is
1012                  * doing a lot of flushing, the combination can result
1013                  * in excessive paging even in situations where other
1014                  * unrelated threads free up sufficient VM.
1015                  *
1016                  * To deal with this we abort the nominal active->inactive
1017                  * scan before we hit the inactive target when free+cache
1018                  * levels have reached a reasonable target.
1019                  *
1020                  * When deciding to stop early we need to add some slop to
1021                  * the test and we need to return full completion to the caller
1022                  * to prevent the caller from thinking there is something
1023                  * wrong and issuing a low-memory+swap warning or pkill.
1024                  *
1025                  * A deficit forces paging regardless of the state of the
1026                  * VM page queues (used for RSS enforcement).
1027                  */
1028                 lwkt_yield();
1029                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
1030                 if (vm_paging_target() < -vm_max_launder) {
1031                         /*
1032                          * Stopping early, return full completion to caller.
1033                          */
1034                         if (delta < avail_shortage)
1035                                 delta = avail_shortage;
1036                         break;
1037                 }
1038         }
1039
1040         /* page queue still spin-locked */
1041         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
1042         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1043
1044         return (delta);
1045 }
1046
1047 /*
1048  * Pageout the specified page, return the total number of pages paged out
1049  * (this routine may cluster).
1050  *
1051  * The page must be busied and soft-busied by the caller and will be disposed
1052  * of by this function.
1053  */
1054 static int
1055 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp,
1056                 struct vnode **vpfailedp, int pass, int vmflush_flags)
1057 {
1058         vm_object_t object;
1059         int actcount;
1060         int count = 0;
1061
1062         /*
1063          * It is possible for a page to be busied ad-hoc (e.g. the
1064          * pmap_collect() code) and wired and race against the
1065          * allocation of a new page.  vm_page_alloc() may be forced
1066          * to deactivate the wired page in which case it winds up
1067          * on the inactive queue and must be handled here.  We
1068          * correct the problem simply by unqueuing the page.
1069          */
1070         if (m->wire_count) {
1071                 vm_page_unqueue_nowakeup(m);
1072                 vm_page_wakeup(m);
1073                 kprintf("WARNING: pagedaemon: wired page on "
1074                         "inactive queue %p\n", m);
1075                 return 0;
1076         }
1077
1078         /*
1079          * A held page may be undergoing I/O, so skip it.
1080          */
1081         if (m->hold_count) {
1082                 vm_page_and_queue_spin_lock(m);
1083                 if (m->queue - m->pc == PQ_INACTIVE) {
1084                         TAILQ_REMOVE(
1085                                 &vm_page_queues[m->queue].pl, m, pageq);
1086                         TAILQ_INSERT_TAIL(
1087                                 &vm_page_queues[m->queue].pl, m, pageq);
1088                         ++vm_swapcache_inactive_heuristic;
1089                 }
1090                 vm_page_and_queue_spin_unlock(m);
1091                 vm_page_wakeup(m);
1092                 return 0;
1093         }
1094
1095         if (m->object == NULL || m->object->ref_count == 0) {
1096                 /*
1097                  * If the object is not being used, we ignore previous
1098                  * references.
1099                  */
1100                 vm_page_flag_clear(m, PG_REFERENCED);
1101                 pmap_clear_reference(m);
1102                 /* fall through to end */
1103         } else if (((m->flags & PG_REFERENCED) == 0) &&
1104                     (actcount = pmap_ts_referenced(m))) {
1105                 /*
1106                  * Otherwise, if the page has been referenced while
1107                  * in the inactive queue, we bump the "activation
1108                  * count" upwards, making it less likely that the
1109                  * page will be added back to the inactive queue
1110                  * prematurely again.  Here we check the page tables
1111                  * (or emulated bits, if any), since the upper level
1112                  * VM system knows nothing about existing
1113                  * references.
1114                  */
1115                 vm_page_activate(m);
1116                 m->act_count += (actcount + ACT_ADVANCE);
1117                 vm_page_wakeup(m);
1118                 return 0;
1119         }
1120
1121         /*
1122          * (m) is still busied.
1123          *
1124          * If the upper level VM system knows about any page
1125          * references, we activate the page.  We also set the
1126                  * "activation count" higher than normal so that we will be less
1127                  * likely to place pages back onto the inactive queue again.
1128          */
1129         if ((m->flags & PG_REFERENCED) != 0) {
1130                 vm_page_flag_clear(m, PG_REFERENCED);
1131                 actcount = pmap_ts_referenced(m);
1132                 vm_page_activate(m);
1133                 m->act_count += (actcount + ACT_ADVANCE + 1);
1134                 vm_page_wakeup(m);
1135                 return 0;
1136         }
1137
1138         /*
1139          * If the upper level VM system doesn't know anything about
1140          * the page being dirty, we have to check for it again.  As
1141          * far as the VM code knows, any partially dirty pages are
1142          * fully dirty.
1143          *
1144          * Pages marked PG_WRITEABLE may be mapped into the user
1145          * address space of a process running on another cpu.  A
1146          * user process (without holding the MP lock) running on
1147          * another cpu may be able to touch the page while we are
1148          * trying to remove it.  vm_page_cache() will handle this
1149          * case for us.
1150          */
1151         if (m->dirty == 0) {
1152                 vm_page_test_dirty(m);
1153         } else {
1154                 vm_page_dirty(m);
1155         }
1156
1157         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1158                 /*
1159                  * Invalid pages can be easily freed
1160                  */
1161                 vm_pageout_page_free(m);
1162                 mycpu->gd_cnt.v_dfree++;
1163                 ++count;
1164         } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1165                 /*
1166                  * Clean pages can be placed onto the cache queue.
1167                  * This effectively frees them.
1168                  */
1169                 vm_page_cache(m);
1170                 ++count;
1171         } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1172                 /*
1173                  * Dirty pages need to be paged out, but flushing
1174                  * a page is extremely expensive versus freeing
1175                  * a clean page.  Rather than artificially limiting
1176                  * the number of pages we can flush, we instead give
1177                  * dirty pages extra priority on the inactive queue
1178                  * by forcing them to be cycled through the queue
1179                  * twice before being flushed, after which the
1180                  * (now clean) page will cycle through once more
1181                  * before being freed.  This significantly extends
1182                  * the thrash point for a heavily loaded machine.
1183                  */
1184                 vm_page_flag_set(m, PG_WINATCFLS);
1185                 vm_page_and_queue_spin_lock(m);
1186                 if (m->queue - m->pc == PQ_INACTIVE) {
1187                         TAILQ_REMOVE(
1188                                 &vm_page_queues[m->queue].pl, m, pageq);
1189                         TAILQ_INSERT_TAIL(
1190                                 &vm_page_queues[m->queue].pl, m, pageq);
1191                         ++vm_swapcache_inactive_heuristic;
1192                 }
1193                 vm_page_and_queue_spin_unlock(m);
1194                 vm_page_wakeup(m);
1195         } else if (*max_launderp > 0) {
1196                 /*
1197                  * We always want to try to flush some dirty pages if
1198                  * we encounter them, to keep the system stable.
1199                  * Normally this number is small, but under extreme
1200                  * pressure where there are insufficient clean pages
1201                  * on the inactive queue, we may have to go all out.
1202                  */
1203                 int swap_pageouts_ok;
1204                 struct vnode *vp = NULL;
1205
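		/*
		 * Swap-backed objects (OBJT_SWAP/OBJT_DEFAULT) honor the
		 * defer/disable sysctls; when deferred, swap pageouts are
		 * still permitted once free memory becomes critically low.
		 * Other pagers are always eligible.
		 */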
1206                 swap_pageouts_ok = 0;
1207                 object = m->object;
1208                 if (object &&
1209                     (object->type != OBJT_SWAP) &&
1210                     (object->type != OBJT_DEFAULT)) {
1211                         swap_pageouts_ok = 1;
1212                 } else {
1213                         swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
1214                         swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
1215                                              vm_page_count_min(0));
1216                 }
1217
1218                 /*
1219                  * We don't bother paging objects that are "dead".
1220                  * Those objects are in a "rundown" state.
1221                  */
1222                 if (!swap_pageouts_ok ||
1223                     (object == NULL) ||
1224                     (object->flags & OBJ_DEAD)) {
1225                         vm_page_and_queue_spin_lock(m);
1226                         if (m->queue - m->pc == PQ_INACTIVE) {
1227                                 TAILQ_REMOVE(
1228                                     &vm_page_queues[m->queue].pl,
1229                                     m, pageq);
1230                                 TAILQ_INSERT_TAIL(
1231                                     &vm_page_queues[m->queue].pl,
1232                                     m, pageq);
1233                                 ++vm_swapcache_inactive_heuristic;
1234                         }
1235                         vm_page_and_queue_spin_unlock(m);
1236                         vm_page_wakeup(m);
1237                         return 0;
1238                 }
1239
1240                 /*
1241                  * (m) is still busied.
1242                  *
1243                  * The object is already known NOT to be dead.   It
1244                  * is possible for the vget() to block the whole
1245                  * pageout daemon, but the new low-memory handling
1246                  * code should prevent it.
1247                  *
1248                  * The previous code skipped locked vnodes and, worse,
1249                  * reordered pages in the queue.  This results in
1250                  * completely non-deterministic operation because,
1251                  * quite often, a vm_fault has initiated an I/O and
1252                  * is holding a locked vnode at just the point where
1253                  * the pageout daemon is woken up.
1254                  *
1255                  * We can't wait forever for the vnode lock, we might
1256                  * deadlock due to a vn_read() getting stuck in
1257                  * vm_wait while holding this vnode.  We skip the
1258                  * vnode if we can't get it in a reasonable amount
1259                  * of time.
1260                  *
1261                  * vpfailed is used to (try to) avoid the case where
1262                  * a large number of pages are associated with a
1263                  * locked vnode, which could cause the pageout daemon
1264                  * to stall for an excessive amount of time.
1265                  */
1266                 if (object->type == OBJT_VNODE) {
1267                         int flags;
1268
1269                         vp = object->handle;
1270                         flags = LK_EXCLUSIVE;
1271                         if (vp == *vpfailedp)
1272                                 flags |= LK_NOWAIT;
1273                         else
1274                                 flags |= LK_TIMELOCK;
1275                         vm_page_hold(m);
1276                         vm_page_wakeup(m);
1277
1278                         /*
1279                          * We have unbusied (m) temporarily so we can
1280                          * acquire the vp lock without deadlocking.
1281                          * (m) is held to prevent destruction.
1282                          */
1283                         if (vget(vp, flags) != 0) {
1284                                 *vpfailedp = vp;
1285                                 ++pageout_lock_miss;
1286                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1287                                             ++*vnodes_skippedp;
1288                                 vm_page_unhold(m);
1289                                 return 0;
1290                         }
1291
1292                         /*
1293                          * The page might have been moved to another
1294                          * queue during potential blocking in vget()
1295                          * above.  The page might have been freed and
1296                          * reused for another vnode.  The object might
1297                          * have been reused for another vnode.
1298                          */
1299                         if (m->queue - m->pc != PQ_INACTIVE ||
1300                             m->object != object ||
1301                             object->handle != vp) {
1302                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1303                                         ++*vnodes_skippedp;
1304                                 vput(vp);
1305                                 vm_page_unhold(m);
1306                                 return 0;
1307                         }
1308
1309                         /*
1310                          * The page may have been busied during the
1311                          * blocking in vget() above.  We don't move the
1312                          * page back onto the end of the queue;
1313                          * statistics are more correct if we don't.
1314                          */
1315                         if (vm_page_busy_try(m, TRUE)) {
1316                                 vput(vp);
1317                                 vm_page_unhold(m);
1318                                 return 0;
1319                         }
1320                         vm_page_unhold(m);
1321
1322                         /*
1323                          * (m) is busied again
1324                          *
1325                          * We own the busy bit and remove our hold
1326                          * bit.  If the page is still held it
1327                          * might be undergoing I/O, so skip it.
1328                          */
1329                         if (m->hold_count) {
1330                                 vm_page_and_queue_spin_lock(m);
1331                                 if (m->queue - m->pc == PQ_INACTIVE) {
1332                                         TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1333                                         TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1334                                         ++vm_swapcache_inactive_heuristic;
1335                                 }
1336                                 vm_page_and_queue_spin_unlock(m);
1337                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1338                                         ++*vnodes_skippedp;
1339                                 vm_page_wakeup(m);
1340                                 vput(vp);
1341                                 return 0;
1342                         }
1343                         /* (m) is left busied as we fall through */
1344                 }
1345
1346                 /*
1347                  * page is busy and not held here.
1348                  *
1349                  * If a page is dirty, then it is either being washed
1350                  * (but not yet cleaned) or it is still in the
1351                  * laundry.  If it is still in the laundry, then we
1352                  * start the cleaning operation.
1353                  *
1354                  * Decrement the launder budget (*max_launderp) on
1355                  * success to account for the (future) cleaned page.
1356                  * Otherwise we could wind up laundering or cleaning
1357                  * too many pages.
1358                  *
1359                  * NOTE: Cleaning the page here does not cause
1360                  *       force_deficit to be adjusted, because the
1361                  *       page is not being freed or moved to the
1362                  *       cache.
1363                  */
1364                 count = vm_pageout_clean_helper(m, vmflush_flags);
1365                 *max_launderp -= count;
1366
1367                 /*
1368                  * The clean helper consumed the busy state; (m) is no longer accessible here.
1369                  */
1370                 if (vp != NULL)
1371                         vput(vp);
1372         } else {
1373                 vm_page_wakeup(m);
1374         }
1375         return count;
1376 }
1377
1378 static int
1379 vm_pageout_scan_active(int pass, int q,
1380                        int avail_shortage, int inactive_shortage,
1381                        int *recycle_countp)
1382 {
1383         struct vm_page marker;
1384         vm_page_t m;
1385         int actcount;
1386         int delta = 0;
1387         int maxscan;
1388
1389         /*
1390          * We want to move pages from the active queue to the inactive
1391          * queue to get the inactive queue to the inactive target.  If
1392          * we still have a page shortage from above we try to directly free
1393          * clean pages instead of moving them.
1394          *
1395          * If we do still have a shortage we keep track of the number of
1396          * pages we free or cache (recycle_count) as a measure of thrashing
1397          * between the active and inactive queues.
1398          *
1399          * If we were able to completely satisfy the free+cache targets
1400          * from the inactive pool we limit the number of pages we move
1401          * from the active pool to the inactive pool to 2x the pages we
1402          * had removed from the inactive pool (with a minimum of 1/5 the
1403          * inactive target).  If we were not able to completely satisfy
1404          * the free+cache targets we go for the whole target aggressively.
1405          *
1406          * NOTE: Both variables can end up negative.
1407          * NOTE: We are still in a critical section.
1408          */
1409
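        /*
         * Set up a dummy marker page for this scan.  PG_MARKER causes
         * other scans to hop over it, PG_FICTITIOUS | PG_BUSY keep it
         * from being treated as a real page, and the wire_count of 1
         * keeps it from ever being freed.  The marker holds our place
         * in the queue so the queue spinlock can be dropped while we
         * operate on the current page.
         */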
1410         bzero(&marker, sizeof(marker));
1411         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1412         marker.queue = PQ_ACTIVE + q;
1413         marker.pc = q;
1414         marker.wire_count = 1;
1415
1416         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1417         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1418         maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
1419
1420         /*
1421          * Queue locked at top of loop to avoid stack marker issues.
1422          */
1423         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1424                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1425                                 inactive_shortage > 0))
1426         {
1427                 KKASSERT(m->queue == PQ_ACTIVE + q);
1428                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1429                              &marker, pageq);
1430                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1431                                    &marker, pageq);
1432
1433                 /*
1434                  * Skip marker pages (atomic against other markers to avoid
1435                  * infinite hop-over scans).
1436                  */
1437                 if (m->flags & PG_MARKER)
1438                         continue;
1439
1440                 /*
1441                  * Try to busy the page.  Don't mess with pages which are
1442                  * already busy or reorder them in the queue.
1443                  */
1444                 if (vm_page_busy_try(m, TRUE))
1445                         continue;
1446
1447                 /*
1448                  * Remaining operations run with the page busy and neither
1449                  * the page or the queue will be spin-locked.
1450                  */
1451                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1452                 KKASSERT(m->queue == PQ_ACTIVE + q);
1453
1454                 /*
1455                  * Don't deactivate pages that are held, even if we can
1456                  * busy them.  (XXX why not?)
1457                  */
1458                 if (m->hold_count != 0) {
1459                         vm_page_and_queue_spin_lock(m);
1460                         if (m->queue - m->pc == PQ_ACTIVE) {
1461                                 TAILQ_REMOVE(
1462                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1463                                         m, pageq);
1464                                 TAILQ_INSERT_TAIL(
1465                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1466                                         m, pageq);
1467                         }
1468                         vm_page_and_queue_spin_unlock(m);
1469                         vm_page_wakeup(m);
1470                         goto next;
1471                 }
1472
1473                 /*
1474                  * The count for pagedaemon pages is done after checking the
1475                  * page for eligibility...
1476                  */
1477                 mycpu->gd_cnt.v_pdpages++;
1478
1479                 /*
1480                  * Check to see "how much" the page has been used and clear
1481                  * the tracking access bits.  If the object has no references
1482                  * don't bother paying the expense.
1483                  */
1484                 actcount = 0;
1485                 if (m->object && m->object->ref_count != 0) {
1486                         if (m->flags & PG_REFERENCED)
1487                                 ++actcount;
1488                         actcount += pmap_ts_referenced(m);
1489                         if (actcount) {
1490                                 m->act_count += ACT_ADVANCE + actcount;
1491                                 if (m->act_count > ACT_MAX)
1492                                         m->act_count = ACT_MAX;
1493                         }
1494                 }
1495                 vm_page_flag_clear(m, PG_REFERENCED);
1496
1497                 /*
1498                  * actcount is only valid if the object ref_count is non-zero.
1499                  * If the page does not have an object, actcount will be zero.
1500                  */
1501                 if (actcount && m->object->ref_count != 0) {
1502                         vm_page_and_queue_spin_lock(m);
1503                         if (m->queue - m->pc == PQ_ACTIVE) {
1504                                 TAILQ_REMOVE(
1505                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1506                                         m, pageq);
1507                                 TAILQ_INSERT_TAIL(
1508                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1509                                         m, pageq);
1510                         }
1511                         vm_page_and_queue_spin_unlock(m);
1512                         vm_page_wakeup(m);
1513                 } else {
1514                         switch(m->object->type) {
1515                         case OBJT_DEFAULT:
1516                         case OBJT_SWAP:
1517                                 m->act_count -= min(m->act_count,
1518                                                     vm_anonmem_decline);
1519                                 break;
1520                         default:
1521                                 m->act_count -= min(m->act_count,
1522                                                     vm_filemem_decline);
1523                                 break;
1524                         }
1525                         if (vm_pageout_algorithm ||
1526                             (m->object == NULL) ||
1527                             (m->object && (m->object->ref_count == 0)) ||
1528                             m->act_count < pass + 1
1529                         ) {
1530                                 /*
1531                                  * Deactivate the page.  If we had a
1532                                  * shortage from our inactive scan try to
1533                                  * free (cache) the page instead.
1534                                  *
1535                                  * Don't just blindly cache the page if
1536                                  * we do not have a shortage from the
1537                                  * inactive scan, that could lead to
1538                                  * gigabytes being moved.
1539                                  */
1540                                 --inactive_shortage;
1541                                 if (avail_shortage - delta > 0 ||
1542                                     (m->object && (m->object->ref_count == 0)))
1543                                 {
1544                                         if (avail_shortage - delta > 0)
1545                                                 ++*recycle_countp;
1546                                         vm_page_protect(m, VM_PROT_NONE);
1547                                         if (m->dirty == 0 &&
1548                                             (m->flags & PG_NEED_COMMIT) == 0 &&
1549                                             avail_shortage - delta > 0) {
1550                                                 vm_page_cache(m);
1551                                         } else {
1552                                                 vm_page_deactivate(m);
1553                                                 vm_page_wakeup(m);
1554                                         }
1555                                 } else {
1556                                         vm_page_deactivate(m);
1557                                         vm_page_wakeup(m);
1558                                 }
1559                                 ++delta;
1560                         } else {
1561                                 vm_page_and_queue_spin_lock(m);
1562                                 if (m->queue - m->pc == PQ_ACTIVE) {
1563                                         TAILQ_REMOVE(
1564                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1565                                             m, pageq);
1566                                         TAILQ_INSERT_TAIL(
1567                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1568                                             m, pageq);
1569                                 }
1570                                 vm_page_and_queue_spin_unlock(m);
1571                                 vm_page_wakeup(m);
1572                         }
1573                 }
1574 next:
1575                 lwkt_yield();
1576                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1577         }
1578
1579         /*
1580          * Clean out our local marker.
1581          *
1582          * Page queue still spin-locked.
1583          */
1584         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1585         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1586
1587         return (delta);
1588 }
1589
1590 /*
1591  * The number of actually free pages can drop down to v_free_reserved,
1592  * we try to build the free count back above v_free_min.  Note that
1593  * vm_paging_needed() also returns TRUE if v_free_count is not at
1594  * least v_free_min so that is the minimum we must build the free
1595  * count to.
1596  *
1597  * We use a slightly higher target to improve hysteresis,
1598  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1599  * is usually the same as v_cache_min, this maintains about
1600  * half the pages in the free queue as are in the cache queue,
1601  * providing pretty good pipelining for pageout operation.
1602  *
1603  * The system operator can manipulate vm.v_cache_min and
1604  * vm.v_free_target to tune the pageout daemon.  Be sure
1605  * to keep vm.v_free_min < vm.v_free_target.
1606  *
1607  * Note that the original paging target is to get at least
1608  * (free_min + cache_min) into (free + cache).  The slightly
1609  * higher target will shift additional pages from cache to free
1610  * without affecting the original paging target in order to
1611  * maintain better hysteresis and not have the free count always
1612  * be dead-on v_free_min.
1613  *
1614  * NOTE: we are still in a critical section.
1615  *
1616  * Pages moved from PQ_CACHE to totally free are not counted in the
1617  * pages_freed counter.
1618  */
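/*
 * Illustrative example with hypothetical numbers (not taken from a real
 * configuration): with v_free_min = 1365 pages and v_free_target = 5461
 * pages, the loop below keeps moving pages from PQ_CACHE to the free list
 * until v_free_count reaches (1365 + 5461) / 2 = 3413 pages, roughly half
 * way between the hard minimum and the free+cache target, which is what
 * provides the hysteresis described above.  Raising vm.v_free_target
 * (see above) raises this refill point correspondingly.
 */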
1619 static void
1620 vm_pageout_scan_cache(int avail_shortage, int pass,
1621                       int vnodes_skipped, int recycle_count)
1622 {
1623         static int lastkillticks;
1624         struct vm_pageout_scan_info info;
1625         vm_page_t m;
1626
1627         while (vmstats.v_free_count <
1628                (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1629                 /*
1630                  * This steals some code from vm/vm_page.c
1631                  */
1632                 static int cache_rover = 0;
1633
1634                 m = vm_page_list_find(PQ_CACHE,
1635                                       cache_rover & PQ_L2_MASK, FALSE);
1636                 if (m == NULL)
1637                         break;
1638                 /* page is returned removed from its queue and spinlocked */
1639                 if (vm_page_busy_try(m, TRUE)) {
1640                         vm_page_deactivate_locked(m);
1641                         vm_page_spin_unlock(m);
1642                         continue;
1643                 }
1644                 vm_page_spin_unlock(m);
1645                 pagedaemon_wakeup();
1646                 lwkt_yield();
1647
1648                 /*
1649                  * Remaining operations run with the page busy and neither
1650                  * the page or the queue will be spin-locked.
1651                  */
1652                 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1653                     m->hold_count ||
1654                     m->wire_count) {
1655                         vm_page_deactivate(m);
1656                         vm_page_wakeup(m);
1657                         continue;
1658                 }
1659                 KKASSERT((m->flags & PG_MAPPED) == 0);
1660                 KKASSERT(m->dirty == 0);
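                /*
                 * Advance the rover by a prime-sized stride (PQ_PRIME2)
                 * so that successive frees are taken from scattered
                 * PQ_CACHE sub-queues rather than hammering one queue.
                 */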
1661                 cache_rover += PQ_PRIME2;
1662                 vm_pageout_page_free(m);
1663                 mycpu->gd_cnt.v_dfree++;
1664         }
1665
1666 #if !defined(NO_SWAPPING)
1667         /*
1668          * Idle process swapout -- run once per second.
1669          */
1670         if (vm_swap_idle_enabled) {
1671                 static time_t lsec;
1672                 if (time_uptime != lsec) {
1673                         atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
1674                         vm_req_vmdaemon();
1675                         lsec = time_uptime;
1676                 }
1677         }
1678 #endif
1679
1680         /*
1681          * If we didn't get enough free pages, and we have skipped a vnode
1682          * in a writeable object, wakeup the sync daemon.  And kick swapout
1683          * if we did not get enough free pages.
1684          */
1685         if (vm_paging_target() > 0) {
1686                 if (vnodes_skipped && vm_page_count_min(0))
1687                         speedup_syncer(NULL);
1688 #if !defined(NO_SWAPPING)
1689                 if (vm_swap_enabled && vm_page_count_target()) {
1690                         atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
1691                         vm_req_vmdaemon();
1692                 }
1693 #endif
1694         }
1695
1696         /*
1697          * Handle catastrophic conditions.  Under good conditions we should
1698          * be at the target, well beyond our minimum.  If we could not even
1699          * reach our minimum the system is under heavy stress.  But just being
1700          * under heavy stress does not trigger process killing.
1701          *
1702          * We consider ourselves to have run out of memory if the swap pager
1703          * is full and avail_shortage is still positive.  The secondary check
1704          * ensures that we do not kill processes if the instantaneous
1705          * availability is good, even if the pageout daemon pass says it
1706          * couldn't get to the target.
1707          */
1708         if (swap_pager_almost_full &&
1709             pass > 0 &&
1710             (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1711                 kprintf("Warning: system low on memory+swap "
1712                         "shortage %d for %d ticks!\n",
1713                         avail_shortage, ticks - swap_fail_ticks);
1714         }
1715         if (swap_pager_full &&
1716             pass > 1 &&
1717             avail_shortage > 0 &&
1718             vm_paging_target() > 0 &&
1719             (unsigned int)(ticks - lastkillticks) >= hz) {
1720                 /*
1721                  * Kill something, maximum rate once per second to give
1722                  * the process time to free up sufficient memory.
1723                  */
1724                 lastkillticks = ticks;
1725                 info.bigproc = NULL;
1726                 info.bigsize = 0;
1727                 allproc_scan(vm_pageout_scan_callback, &info);
1728                 if (info.bigproc != NULL) {
1729                         info.bigproc->p_nice = PRIO_MIN;
1730                         info.bigproc->p_usched->resetpriority(
1731                                 FIRST_LWP_IN_PROC(info.bigproc));
1732                         atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1733                         killproc(info.bigproc, "out of swap space");
1734                         wakeup(&vmstats.v_free_count);
1735                         PRELE(info.bigproc);
1736                 }
1737         }
1738 }
1739
1740 static int
1741 vm_pageout_scan_callback(struct proc *p, void *data)
1742 {
1743         struct vm_pageout_scan_info *info = data;
1744         vm_offset_t size;
1745
1746         /*
1747          * Never kill system processes or init.  If we have configured swap
1748          * then try to avoid killing low-numbered pids.
1749          */
1750         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1751             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1752                 return (0);
1753         }
1754
1755         lwkt_gettoken(&p->p_token);
1756
1757         /*
1758          * if the process is in a non-running type state,
1759          * don't touch it.
1760          */
1761         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1762                 lwkt_reltoken(&p->p_token);
1763                 return (0);
1764         }
1765
1766         /*
1767          * Get the approximate process size.  Note that anonymous pages
1768          * with backing swap will be counted twice, but there should not
1769          * be too many such pages due to the stress the VM system is
1770          * under at this point.
1771          */
1772         size = vmspace_anonymous_count(p->p_vmspace) +
1773                 vmspace_swap_count(p->p_vmspace);
1774
1775         /*
1776          * If this process is bigger than the biggest one
1777          * remember it.
1778          */
1779         if (info->bigsize < size) {
1780                 if (info->bigproc)
1781                         PRELE(info->bigproc);
1782                 PHOLD(p);
1783                 info->bigproc = p;
1784                 info->bigsize = size;
1785         }
1786         lwkt_reltoken(&p->p_token);
1787         lwkt_yield();
1788
1789         return(0);
1790 }
1791
1792 /*
1793  * This routine tries to maintain the pseudo-LRU active queue so that
1794  * some statistics accumulation still occurs during long periods when
1795  * there is no paging.  This code helps the situation where paging
1796  * just starts to occur.
1797  */
1798 static void
1799 vm_pageout_page_stats(int q)
1800 {
1801         static int fullintervalcount = 0;
1802         struct vm_page marker;
1803         vm_page_t m;
1804         int pcount, tpcount;            /* Number of pages to check */
1805         int page_shortage;
1806
1807         page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1808                          vmstats.v_free_min) -
1809                         (vmstats.v_free_count + vmstats.v_inactive_count +
1810                          vmstats.v_cache_count);
1811
1812         if (page_shortage <= 0)
1813                 return;
1814
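        /*
         * Decide how much of the active queue to scan this round.  Most
         * rounds only a slice proportional to vm_pageout_stats_max is
         * examined; once fullintervalcount accumulates past
         * vm_pageout_full_stats_interval the entire queue is scanned.
         * Rough example with hypothetical numbers: with
         * vm_pageout_stats_max = 5461, v_page_count = 1048576 and an
         * active queue of 20000 pages, tpcount = 5461 * 20000 / 1048576
         * + 1 = 105, so only ~105 pages would be checked this round.
         */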
1815         pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1816         fullintervalcount += vm_pageout_stats_interval;
1817         if (fullintervalcount < vm_pageout_full_stats_interval) {
1818                 tpcount = (vm_pageout_stats_max * pcount) /
1819                           vmstats.v_page_count + 1;
1820                 if (pcount > tpcount)
1821                         pcount = tpcount;
1822         } else {
1823                 fullintervalcount = 0;
1824         }
1825
1826         bzero(&marker, sizeof(marker));
1827         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1828         marker.queue = PQ_ACTIVE + q;
1829         marker.pc = q;
1830         marker.wire_count = 1;
1831
1832         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1833         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1834
1835         /*
1836          * Queue locked at top of loop to avoid stack marker issues.
1837          */
1838         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1839                pcount-- > 0)
1840         {
1841                 int actcount;
1842
1843                 KKASSERT(m->queue == PQ_ACTIVE + q);
1844                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1845                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1846                                    &marker, pageq);
1847
1848                 /*
1849                  * Skip marker pages (atomic against other markers to avoid
1850                  * infinite hop-over scans).
1851                  */
1852                 if (m->flags & PG_MARKER)
1853                         continue;
1854
1855                 /*
1856                  * Ignore pages we can't busy
1857                  */
1858                 if (vm_page_busy_try(m, TRUE))
1859                         continue;
1860
1861                 /*
1862                  * Remaining operations run with the page busy and neither
1863                  * the page or the queue will be spin-locked.
1864                  */
1865                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1866                 KKASSERT(m->queue == PQ_ACTIVE + q);
1867
1868                 /*
1869                  * We now have a safely busied page, the page and queue
1870                  * spinlocks have been released.
1871                  *
1872                  * Ignore held pages
1873                  */
1874                 if (m->hold_count) {
1875                         vm_page_wakeup(m);
1876                         goto next;
1877                 }
1878
1879                 /*
1880                  * Calculate activity
1881                  */
1882                 actcount = 0;
1883                 if (m->flags & PG_REFERENCED) {
1884                         vm_page_flag_clear(m, PG_REFERENCED);
1885                         actcount += 1;
1886                 }
1887                 actcount += pmap_ts_referenced(m);
1888
1889                 /*
1890                  * Update act_count and move page to end of queue.
1891                  */
1892                 if (actcount) {
1893                         m->act_count += ACT_ADVANCE + actcount;
1894                         if (m->act_count > ACT_MAX)
1895                                 m->act_count = ACT_MAX;
1896                         vm_page_and_queue_spin_lock(m);
1897                         if (m->queue - m->pc == PQ_ACTIVE) {
1898                                 TAILQ_REMOVE(
1899                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1900                                         m, pageq);
1901                                 TAILQ_INSERT_TAIL(
1902                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1903                                         m, pageq);
1904                         }
1905                         vm_page_and_queue_spin_unlock(m);
1906                         vm_page_wakeup(m);
1907                         goto next;
1908                 }
1909
1910                 if (m->act_count == 0) {
1911                         /*
1912                          * We turn off page access, so that we have
1913                          * more accurate RSS stats.  We don't do this
1914                          * in the normal page deactivation when the
1915                          * system is under VM load, because the
1916                          * cost of the large number of page protect
1917                          * operations would be higher than the value
1918                          * of doing the operation.
1919                          *
1920                          * We use the marker to save our place so
1921                          * we can release the spin lock.  Both (m)
1922                          * and the page following it may become invalid.
1923                          */
1924                         vm_page_protect(m, VM_PROT_NONE);
1925                         vm_page_deactivate(m);
1926                 } else {
1927                         m->act_count -= min(m->act_count, ACT_DECLINE);
1928                         vm_page_and_queue_spin_lock(m);
1929                         if (m->queue - m->pc == PQ_ACTIVE) {
1930                                 TAILQ_REMOVE(
1931                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1932                                         m, pageq);
1933                                 TAILQ_INSERT_TAIL(
1934                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1935                                         m, pageq);
1936                         }
1937                         vm_page_and_queue_spin_unlock(m);
1938                 }
1939                 vm_page_wakeup(m);
1940 next:
1941                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1942         }
1943
1944         /*
1945          * Remove our local marker
1946          *
1947          * Page queue still spin-locked.
1948          */
1949         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1950         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1951 }
1952
1953 static int
1954 vm_pageout_free_page_calc(vm_size_t count)
1955 {
1956         if (count < vmstats.v_page_count)
1957                 return 0;
1958         /*
1959          * free_reserved needs to include enough for the largest swap pager
1960          * structures plus enough for any pv_entry structs when paging.
1961          *
1962          * v_free_min           normal allocations
1963          * v_free_reserved      system allocations
1964          * v_pageout_free_min   allocations by pageout daemon
1965          * v_interrupt_free_min low level allocations (e.g swap structures)
1966          */
1967         if (vmstats.v_page_count > 1024)
1968                 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
1969         else
1970                 vmstats.v_free_min = 64;
1971         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
1972         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
1973         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
1974         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
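        /*
         * Rough example with hypothetical numbers: on a machine with
         * 1048576 pages (4GB of 4K pages) the above works out to
         * v_free_min ~ 5301 pages (~20MB), v_free_reserved ~ 2657,
         * v_free_severe ~ 2650, v_pageout_free_min ~ 1332 and
         * v_interrupt_free_min ~ 669 pages.
         */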
1975
1976         return 1;
1977 }
1978
1979
1980 /*
1981  * vm_pageout is the high level pageout daemon.
1982  *
1983  * No requirements.
1984  */
1985 static void
1986 vm_pageout_thread(void)
1987 {
1988         int pass;
1989         int q;
1990         int q1iterator = 0;
1991         int q2iterator = 0;
1992
1993         /*
1994          * Initialize some paging parameters.
1995          */
1996         curthread->td_flags |= TDF_SYSTHREAD;
1997
1998         vm_pageout_free_page_calc(vmstats.v_page_count);
1999
2000         /*
2001          * v_free_target and v_cache_min control pageout hysteresis.  Note
2002          * that these are more a measure of the VM cache queue hysteresis
2003          * then the VM free queue.  Specifically, v_free_target is the
2004          * than the VM free queue.  Specifically, v_free_target is the
2005          *
2006          * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2007          * low water mark, while v_free_min is the stop.  v_cache_min must
2008          * be big enough to handle memory needs while the pageout daemon
2009          * is signalled and run to free more pages.
2010          */
2011         if (vmstats.v_free_count > 6144)
2012                 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
2013         else
2014                 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
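        /*
         * Rough example, continuing the hypothetical 1048576-page machine
         * above: v_free_target = 4 * 5301 + 2657 = 23861 pages, i.e. the
         * free+cache high water mark would sit near 93MB.
         */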
2015
2016         /*
2017          * NOTE: With the new buffer cache b_act_count we want the default
2018          *       inactive target to be a percentage of available memory.
2019          *
2020          *       The inactive target essentially determines the minimum
2021          *       number of 'temporary' pages capable of caching one-time-use
2022          *       files when the VM system is otherwise full of pages
2023          *       belonging to multi-time-use files or active program data.
2024          *
2025          * NOTE: The inactive target is aggressively pursued only if the
2026          *       inactive queue becomes too small.  If the inactive queue
2027          *       is large enough to satisfy page movement to free+cache
2028          *       then it is repopulated more slowly from the active queue.
2029          *       This allows a general inactive_target default to be set.
2030          *
2031          *       There is an issue here for processes which sit mostly idle
2032          *       'overnight', such as sshd, tcsh, and X.  Any movement from
2033          *       the active queue will eventually cause such pages to
2034          *       recycle, causing a lot of paging in the morning.  To
2035          *       reduce the incidence of this, pages cycled out of the
2036          *       buffer cache are moved directly to the inactive queue
2037          *       if they were only used once or twice.
2038          *
2039          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
2040          *       Increasing the value (up to 64) increases the number of
2041          *       buffer recyclements which go directly to the inactive queue.
2042          */
2043         if (vmstats.v_free_count > 2048) {
2044                 vmstats.v_cache_min = vmstats.v_free_target;
2045                 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2046         } else {
2047                 vmstats.v_cache_min = 0;
2048                 vmstats.v_cache_max = 0;
2049         }
2050         vmstats.v_inactive_target = vmstats.v_free_count / 4;
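        /*
         * Continuing the same hypothetical example: v_cache_min would be
         * seeded with the 23861-page free target, v_cache_max with twice
         * that, and v_inactive_target with a quarter of whatever is free
         * at this point (most of memory early in boot).
         */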
2051
2052         /* XXX does not really belong here */
2053         if (vm_page_max_wired == 0)
2054                 vm_page_max_wired = vmstats.v_free_count / 3;
2055
2056         if (vm_pageout_stats_max == 0)
2057                 vm_pageout_stats_max = vmstats.v_free_target;
2058
2059         /*
2060          * Set interval in seconds for stats scan.
2061          */
2062         if (vm_pageout_stats_interval == 0)
2063                 vm_pageout_stats_interval = 5;
2064         if (vm_pageout_full_stats_interval == 0)
2065                 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2066
2067
2068         /*
2069          * Set maximum free per pass
2070          */
2071         if (vm_pageout_stats_free_max == 0)
2072                 vm_pageout_stats_free_max = 5;
2073
2074         swap_pager_swap_init();
2075         pass = 0;
2076
2077         /*
2078          * The pageout daemon is never done, so loop forever.
2079          */
2080         while (TRUE) {
2081                 int error;
2082                 int avail_shortage;
2083                 int inactive_shortage;
2084                 int vnodes_skipped = 0;
2085                 int recycle_count = 0;
2086                 int tmp;
2087
2088                 /*
2089                  * Wait for an action request.  If we time out, check to
2090                  * see if paging is needed (in case the normal wakeup
2091                  * code raced us).
2092                  */
2093                 if (vm_pages_needed == 0) {
2094                         error = tsleep(&vm_pages_needed,
2095                                        0, "psleep",
2096                                        vm_pageout_stats_interval * hz);
2097                         if (error &&
2098                             vm_paging_needed() == 0 &&
2099                             vm_pages_needed == 0) {
2100                                 for (q = 0; q < PQ_L2_SIZE; ++q)
2101                                         vm_pageout_page_stats(q);
2102                                 continue;
2103                         }
2104                         vm_pages_needed = 1;
2105                 }
2106
2107                 mycpu->gd_cnt.v_pdwakeups++;
2108
2109                 /*
2110                  * Scan for INACTIVE->CLEAN/PAGEOUT
2111                  *
2112                  * This routine tries to avoid thrashing the system with
2113                  * unnecessary activity.
2114                  *
2115                  * Calculate our target for the number of free+cache pages we
2116                  * want to get to.  This is higher than the number that causes
2117                  * allocations to stall (severe) in order to provide hysteresis,
2118                  * and if we don't make it all the way but get to the minimum
2119                  * we're happy.  Goose it a bit if there are multiple requests
2120                  * for memory.
2121                  *
2122                  * Don't reduce avail_shortage inside the loop or the
2123                  * PQAVERAGE() calculation will break.
2124                  *
2125                  * NOTE! deficit is differentiated from avail_shortage as
2126                  *       REQUIRING at least (deficit) pages to be cleaned,
2127                  *       even if the page queues are in good shape.  This
2128                  *       is used primarily for handling per-process
2129                  *       RLIMIT_RSS and may also see small values when
2130                  *       processes block due to low memory.
2131                  */
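                /*
                 * The shortage is apportioned across the PQ_L2_SIZE
                 * inactive sub-queues via PQAVERAGE() in the loop below,
                 * and the loop bails out early once the cumulative delta
                 * covers the shortage, so a small shortage touches only
                 * a few sub-queues.
                 */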
2132                 avail_shortage = vm_paging_target() + vm_pageout_deficit;
2133                 vm_pageout_deficit = 0;
2134
2135                 if (avail_shortage > 0) {
2136                         int delta = 0;
2137
2138                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2139                                 delta += vm_pageout_scan_inactive(
2140                                             pass,
2141                                             (q + q1iterator) & PQ_L2_MASK,
2142                                             PQAVERAGE(avail_shortage),
2143                                             &vnodes_skipped);
2144                                 if (avail_shortage - delta <= 0)
2145                                         break;
2146                         }
2147                         avail_shortage -= delta;
2148                         q1iterator = q + 1;
2149                 }
2150
2151                 /*
2152                  * Figure out how many active pages we must deactivate.  If
2153                  * we were able to reach our target with just the inactive
2154                  * scan above we limit the number of active pages we
2155                  * deactivate to reduce unnecessary work.
2156                  */
2157                 inactive_shortage = vmstats.v_inactive_target -
2158                                     vmstats.v_inactive_count;
2159
2160                 /*
2161                  * If we were unable to free sufficient inactive pages to
2162                  * satisfy the free/cache queue requirements then simply
2163                  * reaching the inactive target may not be good enough.
2164                  * Try to deactivate pages in excess of the target based
2165                  * on the shortfall.
2166                  *
2167                  * However to prevent thrashing the VM system do not
2168                  * deactivate more than an additional 1/10 the inactive
2169                  * target's worth of active pages.
2170                  */
2171                 if (avail_shortage > 0) {
2172                         tmp = avail_shortage * 2;
2173                         if (tmp > vmstats.v_inactive_target / 10)
2174                                 tmp = vmstats.v_inactive_target / 10;
2175                         inactive_shortage += tmp;
2176                 }
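                /*
                 * Example with hypothetical numbers: if we are still 300
                 * pages short and v_inactive_target is 20000, the extra
                 * deactivation is min(300 * 2, 20000 / 10) = 600 pages on
                 * top of the normal inactive target.
                 */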
2177
2178                 /*
2179                  * Only trigger a pmap cleanup on inactive shortage.
2180                  */
2181                 if (inactive_shortage > 0) {
2182                         pmap_collect();
2183                 }
2184
2185                 /*
2186                  * Scan for ACTIVE->INACTIVE
2187                  *
2188                  * Only trigger on inactive shortage.  Triggering on
2189                  * avail_shortage can starve the active queue with
2190                  * unnecessary active->inactive transitions and destroy
2191                  * performance.
2192                  */
2193                 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2194                         int delta = 0;
2195
2196                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2197                                 delta += vm_pageout_scan_active(
2198                                                 pass,
2199                                                 (q + q2iterator) & PQ_L2_MASK,
2200                                                 PQAVERAGE(avail_shortage),
2201                                                 PQAVERAGE(inactive_shortage),
2202                                                 &recycle_count);
2203                                 if (inactive_shortage - delta <= 0 &&
2204                                     avail_shortage - delta <= 0) {
2205                                         break;
2206                                 }
2207                         }
2208                         inactive_shortage -= delta;
2209                         avail_shortage -= delta;
2210                         q2iterator = q + 1;
2211                 }
2212
2213                 /*
2214                  * Scan for CACHE->FREE
2215                  *
2216                  * Finally free enough cache pages to meet our free page
2217                  * requirement and take more drastic measures if we are
2218                  * still in trouble.
2219                  */
2220                 vm_pageout_scan_cache(avail_shortage, pass,
2221                                       vnodes_skipped, recycle_count);
2222
2223                 /*
2224                  * Wait for more work.
2225                  */
2226                 if (avail_shortage > 0) {
2227                         ++pass;
2228                         if (pass < 10 && vm_pages_needed > 1) {
2229                                 /*
2230                                  * Normal operation, additional processes
2231                                  * have already kicked us.  Retry immediately
2232                                  * unless swap space is completely full in
2233                                  * unless swap space is completely full, in
2234                                  */
2235                                 if (swap_pager_full) {
2236                                         tsleep(&vm_pages_needed, 0, "pdelay",
2237                                                 hz / 5);
2238                                 } /* else immediate retry */
2239                         } else if (pass < 10) {
2240                                 /*
2241                                  * Normal operation, fewer processes.  Delay
2242                                  * a bit but allow wakeups.
2243                                  */
2244                                 vm_pages_needed = 0;
2245                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2246                                 vm_pages_needed = 1;
2247                         } else if (swap_pager_full == 0) {
2248                                 /*
2249                                  * We've taken too many passes, forced delay.
2250                                  */
2251                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2252                         } else {
2253                                 /*
2254                                  * Running out of memory, catastrophic
2255                                  * back-off to one-second intervals.
2256                                  */
2257                                 tsleep(&vm_pages_needed, 0, "pdelay", hz);
2258                         }
2259                 } else if (vm_pages_needed) {
2260                         /*
2261                          * Interlocked wakeup of waiters (non-optional).
2262                          *
2263                          * Similar to vm_page_free_wakeup() in vm_page.c,
2264                          * wake any threads sleeping on v_free_count.
2265                          */
2266                         pass = 0;
2267                         if (!vm_page_count_min(vm_page_free_hysteresis) ||
2268                             !vm_page_count_target()) {
2269                                 vm_pages_needed = 0;
2270                                 wakeup(&vmstats.v_free_count);
2271                         }
2272                 } else {
2273                         pass = 0;
2274                 }
2275         }
2276 }
2277
2278 static struct kproc_desc page_kp = {
2279         "pagedaemon",
2280         vm_pageout_thread,
2281         &pagethread
2282 };
2283 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp);
2284
2285
2286 /*
2287  * Called after allocating a page out of the cache or free queue
2288  * to possibly wake the pagedaemon up to replenish our supply.
2289  *
2290  * We try to generate some hysteresis by waking the pagedaemon up
2291  * when our free+cache pages go below the free_min+cache_min level.
2292  * The pagedaemon tries to get the count back up to at least the
2293  * minimum, and through to the target level if possible.
2294  *
2295  * If the pagedaemon is already active bump vm_pages_needed as a hint
2296  * that there are even more requests pending.
2297  *
2298  * SMP races ok?
2299  * No requirements.
2300  */
2301 void
2302 pagedaemon_wakeup(void)
2303 {
2304         if (vm_paging_needed() && curthread != pagethread) {
2305                 if (vm_pages_needed == 0) {
2306                         vm_pages_needed = 1;    /* SMP race ok */
2307                         wakeup(&vm_pages_needed);
2308                 } else if (vm_page_count_min(0)) {
2309                         ++vm_pages_needed;      /* SMP race ok */
2310                 }
2311         }
2312 }
2313
2314 #if !defined(NO_SWAPPING)
2315
2316 /*
2317  * SMP races ok?
2318  * No requirements.
2319  */
2320 static void
2321 vm_req_vmdaemon(void)
2322 {
2323         static int lastrun = 0;
2324
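        /*
         * Rate-limit wakeups of the vm daemon to roughly once per second.
         * The second comparison below catches the (signed) ticks counter
         * wrapping around, so a wrap cannot suppress wakeups indefinitely.
         */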
2325         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2326                 wakeup(&vm_daemon_needed);
2327                 lastrun = ticks;
2328         }
2329 }
2330
2331 static int vm_daemon_callback(struct proc *p, void *data __unused);
2332
2333 /*
2334  * No requirements.
2335  */
2336 static void
2337 vm_daemon(void)
2338 {
2339         int req_swapout;
2340
2341         while (TRUE) {
2342                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2343                 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);
2344
2345                 /*
2346                  * forced swapouts
2347                  */
2348                 if (req_swapout)
2349                         swapout_procs(vm_pageout_req_swapout);
2350
2351                 /*
2352                  * scan processes which exceed their rlimits or which
2353                  * are swapped out -- deactivate their pages
2354                  */
2355                 allproc_scan(vm_daemon_callback, NULL);
2356         }
2357 }
2358
2359 static int
2360 vm_daemon_callback(struct proc *p, void *data __unused)
2361 {
2362         struct vmspace *vm;
2363         vm_pindex_t limit, size;
2364
2365         /*
2366          * skip system processes and processes which are
2367          * already exiting.
2368          */
2369         lwkt_gettoken(&p->p_token);
2370
2371         if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2372                 lwkt_reltoken(&p->p_token);
2373                 return (0);
2374         }
2375
2376         /*
2377          * if the process is in a non-running type state,
2378          * don't touch it.
2379          */
2380         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2381                 lwkt_reltoken(&p->p_token);
2382                 return (0);
2383         }
2384
2385         /*
2386          * get a limit
2387          */
2388         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2389                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
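        /*
         * Illustrative example (hypothetical limit): an RLIMIT_RSS of
         * 64MB with 4K pages becomes a limit of 16384 resident pages
         * after the OFF_TO_IDX() conversion above.
         */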
2390
2391         /*
2392          * let processes that are swapped out really be
2393          * swapped out.  Set the limit to nothing to get as
2394          * many pages out to swap as possible.
2395          */
2396         if (p->p_flags & P_SWAPPEDOUT)
2397                 limit = 0;
2398
2399         vm = p->p_vmspace;
2400         vmspace_hold(vm);
2401         size = pmap_resident_tlnw_count(&vm->vm_pmap);
2402         if (limit >= 0 && size >= limit && vm_pageout_memuse_mode >= 1) {
2403                 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2404         }
2405         vmspace_drop(vm);
2406
2407         lwkt_reltoken(&p->p_token);
2408
2409         return (0);
2410 }
2411
2412 #endif