kernel - VM rework part 7 - Initial vm_map_backing index
[dragonfly.git] / sys / vm / swap_pager.c
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
5  * 
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  * 
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  * 
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  * 
36  * Copyright (c) 1994 John S. Dyson
37  * Copyright (c) 1990 University of Utah.
38  * Copyright (c) 1991, 1993
39  *      The Regents of the University of California.  All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * the Systems Programming Group of the University of Utah Computer
43  * Science Department.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 3. Neither the name of the University nor the names of its contributors
54  *    may be used to endorse or promote products derived from this software
55  *    without specific prior written permission.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67  * SUCH DAMAGE.
68  *
69  *                              New Swap System
70  *                              Matthew Dillon
71  *
72  * Radix Bitmap 'blists'.
73  *
74  *      - The new swapper uses the new radix bitmap code.  This should scale
75  *        to arbitrarily small or arbitrarily large swap spaces and an almost
76  *        arbitrary degree of fragmentation.
77  *
78  * Features:
79  *
80  *      - on the fly reallocation of swap during putpages.  The new system
81  *        does not try to keep previously allocated swap blocks for dirty
82  *        pages.  
83  *
84  *      - on the fly deallocation of swap
85  *
86  *      - No more garbage collection required.  Unnecessarily allocated swap
87  *        blocks only exist for dirty vm_page_t's now and these are already
88  *        cycled (in a high-load system) by the pager.  We also do on-the-fly
89  *        removal of invalidated swap blocks when a page is destroyed
90  *        or renamed.
91  *
92  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
93  * @(#)swap_pager.c     8.9 (Berkeley) 3/21/94
94  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
95  */
96
97 #include "opt_swap.h"
98 #include <sys/param.h>
99 #include <sys/systm.h>
100 #include <sys/conf.h>
101 #include <sys/kernel.h>
102 #include <sys/proc.h>
103 #include <sys/buf.h>
104 #include <sys/vnode.h>
105 #include <sys/malloc.h>
106 #include <sys/vmmeter.h>
107 #include <sys/sysctl.h>
108 #include <sys/blist.h>
109 #include <sys/lock.h>
110 #include <sys/kcollect.h>
111
112 #include <vm/vm.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115 #include <vm/vm_pager.h>
116 #include <vm/vm_pageout.h>
117 #include <vm/swap_pager.h>
118 #include <vm/vm_extern.h>
119 #include <vm/vm_zone.h>
120 #include <vm/vnode_pager.h>
121
122 #include <sys/buf2.h>
123 #include <vm/vm_page2.h>
124
125 #ifndef MAX_PAGEOUT_CLUSTER
126 #define MAX_PAGEOUT_CLUSTER     SWB_NPAGES
127 #endif
128
129 #define SWM_FREE        0x02    /* free, period                 */
130 #define SWM_POP         0x04    /* pop out                      */
131
132 #define SWBIO_READ      0x01
133 #define SWBIO_WRITE     0x02
134 #define SWBIO_SYNC      0x04
135 #define SWBIO_TTC       0x08    /* for VM_PAGER_TRY_TO_CACHE */
136
/*
 * Scan state passed (as the opaque data pointer) to the swblock RB-tree
 * scan comparators and callbacks in this file.
 *
 * NOTE: swap_pager_condfree() reuses begi/endi as countdown budgets
 * (swap pages left to destroy / swblocks left to scan) rather than as
 * page indices.
 */
137 struct swfreeinfo {
138         vm_object_t     object; /* object whose swblock tree is scanned */
139         vm_pindex_t     basei;  /* swblocks with swb_index below this are skipped */
140         vm_pindex_t     begi;   /* presumably first page index of a range scan - confirm */
141         vm_pindex_t     endi;   /* inclusive */
142 };
143
/*
 * NOTE(review): no user of this structure is visible in this part of the
 * file; presumably it is the scan state for swapoff processing.  Field
 * meanings below are inferred from the names only - confirm against users.
 */
144 struct swswapoffinfo {
145         vm_object_t     object; /* presumably the object being scanned */
146         int             devidx; /* presumably the swap device index */
147         int             shared; /* meaning not visible here - TODO confirm */
148 };
149
150 /*
151  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
152  * in the old system.
153  */
154
/*
 * swap_pager_full is set to 2 by swp_pager_getswapspace() when an
 * allocation fails and is cleared by swp_sizecheck() once free swap is
 * back at or above nswap_lowat.
 */
155 int swap_pager_full;            /* swap space exhaustion (task killing) */
156 int swap_fail_ticks;            /* when we became exhausted */
157 int swap_pager_almost_full;     /* swap space exhaustion (w/ hysteresis)*/
158 swblk_t vm_swap_cache_use;      /* swap pages held by non-OBJT_SWAP objects */
159 swblk_t vm_swap_anon_use;       /* swap pages held by OBJT_SWAP objects */
160 static int vm_report_swap_allocs; /* exported as vm.report_swap_allocs */
161
162 static struct krate kswaprate = { 1 }; /* rate limit for allocation failure messages */
163 static int nsw_rcount;          /* free read buffers                    */
164 static int nsw_wcount_sync;     /* limit write buffers / synchronous    */
165 static int nsw_wcount_async;    /* limit write buffers / asynchronous   */
166 static int nsw_wcount_async_max;/* assigned maximum                     */
167 static int nsw_cluster_max;     /* maximum VOP I/O allowed              */
168
169 struct blist *swapblist;        /* bitmap allocator swap blocks come from */
170 static int swap_async_max = 4;  /* maximum in-progress async I/O's      */
171 static int swap_burst_read = 0; /* allow burst reading */
172 static swblk_t swapiterator;    /* linearize allocations */
173 int swap_user_async = 0;        /* user swap pager operation can be async */
174
175 static struct spinlock swapbp_spin = SPINLOCK_INITIALIZER(&swapbp_spin, "swapbp_spin");
176
177 /* from vm_swap.c */
178 extern struct vnode *swapdev_vp;
179 extern struct swdevt *swdevt;
180 extern int nswdev;
181
/*
 * Map a swap block number to the index of the swap device it lives on.
 * Blocks are interleaved across the nswdev devices in stripes of SWB_DMMAX
 * pages; with a single device the index is always 0.
 *
 * The argument is parenthesized so that expression arguments such as
 * BLK2DEVIDX(blk + n) are evaluated as a unit.
 */
#define BLK2DEVIDX(blk) (nswdev > 1 ? (blk) / SWB_DMMAX % nswdev : 0)
183
184 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
185         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
186 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
187         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
188 SYSCTL_INT(_vm, OID_AUTO, swap_user_async,
189         CTLFLAG_RW, &swap_user_async, 0, "Allow async uuser swap write I/O");
190
191 #if SWBLK_BITS == 64
192 SYSCTL_LONG(_vm, OID_AUTO, swap_cache_use,
193         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
194 SYSCTL_LONG(_vm, OID_AUTO, swap_anon_use,
195         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
196 SYSCTL_LONG(_vm, OID_AUTO, swap_size,
197         CTLFLAG_RD, &vm_swap_size, 0, "");
198 #else
199 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
200         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
201 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
202         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
203 SYSCTL_INT(_vm, OID_AUTO, swap_size,
204         CTLFLAG_RD, &vm_swap_size, 0, "");
205 #endif
206 SYSCTL_INT(_vm, OID_AUTO, report_swap_allocs,
207         CTLFLAG_RW, &vm_report_swap_allocs, 0, "");
208
209 vm_zone_t               swap_zone;
210
211 /*
212  * Red-Black tree for swblock entries
213  *
214  * The caller must hold vm_token
215  */
216 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
217              vm_pindex_t, swb_index);
218
219 int
220 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
221 {
222         if (swb1->swb_index < swb2->swb_index)
223                 return(-1);
224         if (swb1->swb_index > swb2->swb_index)
225                 return(1);
226         return(0);
227 }
228
229 static
230 int
231 rb_swblock_scancmp(struct swblock *swb, void *data)
232 {
233         struct swfreeinfo *info = data;
234
235         if (swb->swb_index < info->basei)
236                 return(-1);
237         if (swb->swb_index > info->endi)
238                 return(1);
239         return(0);
240 }
241
242 static
243 int
244 rb_swblock_condcmp(struct swblock *swb, void *data)
245 {
246         struct swfreeinfo *info = data;
247
248         if (swb->swb_index < info->basei)
249                 return(-1);
250         return(0);
251 }
252
253 /*
254  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
255  * calls hooked from other parts of the VM system and do not appear here.
256  * (see vm/swap_pager.h).
257  */
258
259 static void     swap_pager_dealloc (vm_object_t object);
260 static int      swap_pager_getpage (vm_object_t, vm_page_t *, int);
261 static void     swap_chain_iodone(struct bio *biox);
262
/*
 * Operations vector for the swap pager.  The initializer is positional.
 * The first two entries are static functions declared just above; the
 * last two are not declared in this file (see vm/swap_pager.h).
 */
263 struct pagerops swappagerops = {
264         swap_pager_dealloc,     /* deallocate an OBJT_SWAP object       */
265         swap_pager_getpage,     /* pagein                               */
266         swap_pager_putpages,    /* pageout                              */
267         swap_pager_haspage      /* get backing store status for page    */
268 };
269
270 /*
271  * SWB_DMMAX is in page-sized chunks with the new swap system.  It was
272  * dev-bsized chunks in the old.  SWB_DMMAX is always a power of 2.
273  *
274  * swap_*() routines are externally accessible.  swp_*() routines are
275  * internal.
276  */
277
/*
 * Hysteresis thresholds used by swp_sizecheck(), both in pages of free
 * swap (vm_swap_size).
 */
278 int nswap_lowat = 128;          /* below this: set swap_pager_almost_full and warn */
279 int nswap_hiwat = 512;          /* above this: clear swap_pager_almost_full */
280
281 static __inline void    swp_sizecheck (void);
282 static void     swp_pager_async_iodone (struct bio *bio);
283
284 /*
285  * Swap bitmap functions
286  */
287
288 static __inline void    swp_pager_freeswapspace(vm_object_t object,
289                                                 swblk_t blk, int npages);
290 static __inline swblk_t swp_pager_getswapspace(vm_object_t object, int npages);
291
292 /*
293  * Metadata functions
294  */
295
296 static void swp_pager_meta_convert(vm_object_t);
297 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
298 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
299 static void swp_pager_meta_free_all(vm_object_t);
300 static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
301
302 /*
303  * SWP_SIZECHECK() -    update swap_pager_full indication
304  *      
305  *      update the swap_pager_almost_full indication and warn when we are
306  *      about to run out of swap space, using lowat/hiwat hysteresis.
307  *
308  *      Clear swap_pager_full ( task killing ) indication when lowat is met.
309  *
310  * No restrictions on call
311  * This routine may not block.
312  * SMP races are ok.
313  */
314 static __inline void
315 swp_sizecheck(void)
316 {
317         if (vm_swap_size < nswap_lowat) {
318                 if (swap_pager_almost_full == 0) {
319                         kprintf("swap_pager: out of swap space\n");
320                         swap_pager_almost_full = 1;
321                         swap_fail_ticks = ticks;
322                 }
323         } else {
324                 swap_pager_full = 0;
325                 if (vm_swap_size > nswap_hiwat)
326                         swap_pager_almost_full = 0;
327         }
328 }
329
330 /*
331  * Long-term data collection on 10-second interval.  Return the value
332  * for KCOLLECT_SWAPPCT and set the values for SWAPANO and SWAPCCAC.
333  *
334  * Return total swap in the scale field.  This can change if swap is
335  * regularly added or removed and may cause some historical confusion
336  * in that case, but SWAPPCT will always be historically accurate.
337  */
338
339 #define PTOB(value)     ((uint64_t)(value) << PAGE_SHIFT)
340
341 static uint64_t
342 collect_swap_callback(int n)
343 {
344         uint64_t total = vm_swap_max;
345         uint64_t anon = vm_swap_anon_use;
346         uint64_t cache = vm_swap_cache_use;
347
348         if (total == 0)         /* avoid divide by zero */
349                 total = 1;
350         kcollect_setvalue(KCOLLECT_SWAPANO, PTOB(anon));
351         kcollect_setvalue(KCOLLECT_SWAPCAC, PTOB(cache));
352         kcollect_setscale(KCOLLECT_SWAPANO,
353                           KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, PTOB(total)));
354         kcollect_setscale(KCOLLECT_SWAPCAC,
355                           KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, PTOB(total)));
356         return (((anon + cache) * 10000 + (total >> 1)) / total);
357 }
358
359 /*
360  * SWAP_PAGER_INIT() -  initialize the swap pager!
361  *
362  *      Expected to be started from system init.  NOTE:  This code is run 
363  *      before much else so be careful what you depend on.  Most of the VM
364  *      system has yet to be initialized at this point.
365  *
366  * Called from the low level boot code only.
367  */
368 static void
369 swap_pager_init(void *arg __unused)
370 {
        /*
         * Only SWAPPCT gets a callback.  collect_swap_callback() fills in
         * the SWAPANO and SWAPCAC values and scales itself, so those two
         * are registered with a NULL callback and an initial scale of 0.
         */
371         kcollect_register(KCOLLECT_SWAPPCT, "swapuse", collect_swap_callback,
372                           KCOLLECT_SCALE(KCOLLECT_SWAPPCT_FORMAT, 0));
373         kcollect_register(KCOLLECT_SWAPANO, "swapano", NULL,
374                           KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, 0));
375         kcollect_register(KCOLLECT_SWAPCAC, "swapcac", NULL,
376                           KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, 0));
377 }
/* Hook swap_pager_init() into system initialization. */
378 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL);
379
380 /*
381  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
382  *
383  *      Expected to be started from pageout process once, prior to entering
384  *      its main loop.
385  *
386  * Called from the low level boot code only.
387  */
388 void
389 swap_pager_swap_init(void)
390 {
391         int n, n2;
392
393         /*
394          * Number of in-transit swap bp operations.  Don't
395          * exhaust the pbufs completely.  Make sure we
396          * initialize workable values (0 will work for hysteresis
397          * but it isn't very efficient).
398          *
399          * The nsw_cluster_max is constrained by the number of pages an XIO
400          * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
401          * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
402          * constrained by the swap device interleave stripe size.
403          *
404          * Currently we hardwire nsw_wcount_async to 4.  This limit is 
405          * designed to prevent other I/O from having high latencies due to
406          * our pageout I/O.  The value 4 works well for one or two active swap
407          * devices but is probably a little low if you have more.  Even so,
408          * a higher value would probably generate only a limited improvement
409          * with three or four active swap devices since the system does not
410          * typically have to pageout at extreme bandwidths.   We will want
411          * at least 2 per swap devices, and 4 is a pretty good value if you
412          * have one NFS swap device due to the command/ack latency over NFS.
413          * So it all works out pretty well.
414          */
415
416         nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
417
418         nsw_rcount = (nswbuf_kva + 1) / 2;
419         nsw_wcount_sync = (nswbuf_kva + 3) / 4;
420         nsw_wcount_async = 4;
421         nsw_wcount_async_max = nsw_wcount_async;
422
423         /*
424          * The zone is dynamically allocated so generally size it to
425          * maxswzone (32MB to 256GB of KVM).  Set a minimum size based
426          * on physical memory of around 8x (each swblock can hold 16 pages).
427          *
428          * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
429          * has increased dramatically.
430          */
431         n = vmstats.v_page_count / 2;
432         if (maxswzone && n < maxswzone / sizeof(struct swblock))
433                 n = maxswzone / sizeof(struct swblock);
434         n2 = n;
435
436         do {
437                 swap_zone = zinit(
438                         "SWAPMETA", 
439                         sizeof(struct swblock), 
440                         n,
441                         ZONE_INTERRUPT);
442                 if (swap_zone != NULL)
443                         break;
444                 /*
445                  * if the allocation failed, try a zone two thirds the
446                  * size of the previous attempt.
447                  */
448                 n -= ((n + 2) / 3);
449         } while (n > 0);
450
451         if (swap_zone == NULL)
452                 panic("swap_pager_swap_init: swap_zone == NULL");
453         if (n2 != n)
454                 kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
455 }
456
457 /*
458  * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
459  *                      its metadata structures.
460  *
461  *      This routine is called from the mmap and fork code to create a new
462  *      OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
463  *      and then converting it with swp_pager_meta_convert().
464  *
465  *      We only support unnamed objects.
466  *
467  * No restrictions.
468  */
469 vm_object_t
470 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
471 {
472         vm_object_t object;
473
474         KKASSERT(handle == NULL);
475         object = vm_object_allocate_hold(OBJT_DEFAULT,
476                                          OFF_TO_IDX(offset + PAGE_MASK + size));
477         swp_pager_meta_convert(object);
478         vm_object_drop(object);
479
480         return (object);
481 }
482
483 /*
484  * SWAP_PAGER_DEALLOC() -       remove swap metadata from object
485  *
486  *      The swap backing for the object is destroyed.  The code is 
487  *      designed such that we can reinstantiate it later, but this
488  *      routine is typically called only when the entire object is
489  *      about to be destroyed.
490  *
491  * The object must be locked or unreferenceable.
492  * No other requirements.
493  */
494 static void
495 swap_pager_dealloc(vm_object_t object)
496 {
        /* hold the object across the wait and the metadata teardown */
497         vm_object_hold(object);
        /* presumably waits for paging-in-progress to drain - confirm */
498         vm_object_pip_wait(object, "swpdea");
499
500         /*
501          * Free all remaining metadata.  We only bother to free it from 
502          * the swap meta data.  We do not attempt to free swapblk's still
503          * associated with vm_page_t's for this object.  We do not care
504          * if paging is still in progress on some objects.
505          */
506         swp_pager_meta_free_all(object);
507         vm_object_drop(object);
508 }
509
510 /************************************************************************
511  *                      SWAP PAGER BITMAP ROUTINES                      *
512  ************************************************************************/
513
514 /*
515  * SWP_PAGER_GETSWAPSPACE() -   allocate raw swap space
516  *
517  *      Allocate swap for the requested number of pages.  The starting
518  *      swap block number (a page index) is returned or SWAPBLK_NONE
519  *      if the allocation failed.
520  *
521  *      Also has the side effect of advising that somebody made a mistake
522  *      when they configured swap and didn't configure enough.
523  *
524  * The caller must hold the object.
525  * This routine may not block.
526  */
527 static __inline swblk_t
528 swp_pager_getswapspace(vm_object_t object, int npages)
529 {
530         swblk_t blk;
531
532         lwkt_gettoken(&vm_token);
533         blk = blist_allocat(swapblist, npages, swapiterator);
534         if (blk == SWAPBLK_NONE)
535                 blk = blist_allocat(swapblist, npages, 0);
536         if (blk == SWAPBLK_NONE) {
537                 if (swap_pager_full != 2) {
538                         if (vm_swap_max == 0) {
539                                 krateprintf(&kswaprate,
540                                         "Warning: The system would like to "
541                                         "page to swap but no swap space "
542                                         "is configured!\n");
543                         } else {
544                                 krateprintf(&kswaprate,
545                                         "swap_pager_getswapspace: "
546                                         "swap full allocating %d pages\n",
547                                         npages);
548                         }
549                         swap_pager_full = 2;
550                         if (swap_pager_almost_full == 0)
551                                 swap_fail_ticks = ticks;
552                         swap_pager_almost_full = 1;
553                 }
554         } else {
555                 /* swapiterator = blk; disable for now, doesn't work well */
556                 swapacctspace(blk, -npages);
557                 if (object->type == OBJT_SWAP)
558                         vm_swap_anon_use += npages;
559                 else
560                         vm_swap_cache_use += npages;
561                 swp_sizecheck();
562         }
563         lwkt_reltoken(&vm_token);
564         return(blk);
565 }
566
567 /*
568  * SWP_PAGER_FREESWAPSPACE() -  free raw swap space 
569  *
570  *      This routine returns the specified swap blocks back to the bitmap.
571  *
572  *      Note:  This routine may not block (it could in the old swap code),
573  *      and through the use of the new blist routines it does not block.
574  *
575  * This routine may not block.
576  */
577
578 static __inline void
579 swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
580 {
581         struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
582
583         lwkt_gettoken(&vm_token);
584         sp->sw_nused -= npages;
585         if (object->type == OBJT_SWAP)
586                 vm_swap_anon_use -= npages;
587         else
588                 vm_swap_cache_use -= npages;
589
590         if (sp->sw_flags & SW_CLOSING) {
591                 lwkt_reltoken(&vm_token);
592                 return;
593         }
594
595         blist_free(swapblist, blk, npages);
596         vm_swap_size += npages;
597         swp_sizecheck();
598         lwkt_reltoken(&vm_token);
599 }
600
601 /*
602  * SWAP_PAGER_FREESPACE() -     frees swap blocks associated with a page
603  *                              range within an object.
604  *
605  *      This is a globally accessible routine.
606  *
607  *      This routine removes swapblk assignments from swap metadata.
608  *
609  *      The external callers of this routine typically have already destroyed 
610  *      or renamed vm_page_t's associated with this range in the object so 
611  *      we should be ok.
612  *
613  * No requirements.
614  */
615 void
616 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
617 {
        /*
         * Locked wrapper: hold the object around swp_pager_meta_free(),
         * passing start and size through unchanged.
         */
618         vm_object_hold(object);
619         swp_pager_meta_free(object, start, size);
620         vm_object_drop(object);
621 }
622
623 /*
624  * No requirements.
625  */
626 void
627 swap_pager_freespace_all(vm_object_t object)
628 {
        /*
         * Locked wrapper: hold the object around
         * swp_pager_meta_free_all().
         */
629         vm_object_hold(object);
630         swp_pager_meta_free_all(object);
631         vm_object_drop(object);
632 }
633
634 /*
635  * This function conditionally frees swap cache swap starting at
636  * (*basei) in the object.  (count) swap blocks will be nominally freed.
637  * The actual number of blocks freed can be more or less than the
638  * requested number.
639  *
640  * This function nominally returns the number of blocks freed.  However,
641  * the actual number of blocks freed may be less then the returned value.
642  * If the function is unable to exhaust the object or if it is able to
643  * free (approximately) the requested number of blocks it returns
644  * a value n > count.
645  *
646  * If we exhaust the object we will return a value n <= count.
647  *
648  * The caller must hold the object.
649  *
650  * WARNING!  If count == 0 then -1 can be returned as a degenerate case,
651  *           callers should always pass a count value > 0.
652  */
653 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
654
655 int
656 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
657 {
658         struct swfreeinfo info;
659         int n;
660         int t;
661
662         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
663
664         info.object = object;
665         info.basei = *basei;    /* skip up to this page index */
666         info.begi = count;      /* max swap pages to destroy */
667         info.endi = count * 8;  /* max swblocks to scan */
668
669         swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
670                                 swap_pager_condfree_callback, &info);
671         *basei = info.basei;
672
673         /*
674          * Take the higher difference swblocks vs pages
675          */
676         n = count - (int)info.begi;
677         t = count * 8 - (int)info.endi;
678         if (n < t)
679                 n = t;
680         if (n < 1)
681                 n = 1;
682         return(n);
683 }
684
685 /*
686  * The idea is to free whole meta-block to avoid fragmenting
687  * the swap space or disk I/O.  We only do this if NO VM pages
688  * are present.
689  *
690  * We do not have to deal with clearing PG_SWAPPED in related VM
691  * pages because there are no related VM pages.
692  *
693  * The caller must hold the object.
694  */
695 static int
696 swap_pager_condfree_callback(struct swblock *swap, void *data)
697 {
698         struct swfreeinfo *info = data;
699         vm_object_t object = info->object;
700         int i;
701
702         for (i = 0; i < SWAP_META_PAGES; ++i) {
703                 if (vm_page_lookup(object, swap->swb_index + i))
704                         break;
705         }
706         info->basei = swap->swb_index + SWAP_META_PAGES;
707         if (i == SWAP_META_PAGES) {
708                 info->begi -= swap->swb_count;
709                 swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
710         }
711         --info->endi;
712         if ((int)info->begi < 0 || (int)info->endi < 0)
713                 return(-1);
714         lwkt_yield();
715         return(0);
716 }
717
718 /*
719  * Called by vm_page_alloc() when a new VM page is inserted
720  * into a VM object.  Checks whether swap has been assigned to
721  * the page and sets PG_SWAPPED as necessary.
722  *
723  * (m) must be busied by caller and remains busied on return.
724  */
725 void
726 swap_pager_page_inserted(vm_page_t m)
727 {
728         if (m->object->swblock_count) {
729                 vm_object_hold(m->object);
730                 if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
731                         vm_page_flag_set(m, PG_SWAPPED);
732                 vm_object_drop(m->object);
733         }
734 }
735
736 /*
737  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
738  *
739  *      Assigns swap blocks to the specified range within the object.  The 
740  *      swap blocks are not zerod.  Any previous swap assignment is destroyed.
741  *
742  *      Returns 0 on success, -1 on failure.
743  *
744  * The caller is responsible for avoiding races in the specified range.
745  * No other requirements.
746  */
747 int
748 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
749 {
750         int n = 0;
751         swblk_t blk = SWAPBLK_NONE;
752         vm_pindex_t beg = start;        /* save start index */
753
754         vm_object_hold(object);
755
756         while (size) {
757                 if (n == 0) {
758                         n = BLIST_MAX_ALLOC;
759                         while ((blk = swp_pager_getswapspace(object, n)) ==
760                                SWAPBLK_NONE)
761                         {
762                                 n >>= 1;
763                                 if (n == 0) {
764                                         swp_pager_meta_free(object, beg,
765                                                             start - beg);
766                                         vm_object_drop(object);
767                                         return(-1);
768                                 }
769                         }
770                 }
771                 swp_pager_meta_build(object, start, blk);
772                 --size;
773                 ++start;
774                 ++blk;
775                 --n;
776         }
777         swp_pager_meta_free(object, start, n);
778         vm_object_drop(object);
779         return(0);
780 }
781
782 /*
783  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
784  *                      and destroy the source.
785  *
786  *      Copy any valid swapblks from the source to the destination.  In
787  *      cases where both the source and destination have a valid swapblk,
788  *      we keep the destination's.
789  *
790  *      This routine is allowed to block.  It may block allocating metadata
791  *      indirectly through swp_pager_meta_build() or if paging is still in
792  *      progress on the source. 
793  *
794  *      XXX vm_page_collapse() kinda expects us not to block because we 
795  *      supposedly do not need to allocate memory, but for the moment we
796  *      *may* have to get a little memory from the zone allocator, but
797  *      it is taken from the interrupt memory.  We should be ok. 
798  *
799  *      The source object contains no vm_page_t's (which is just as well)
800  *      The source object is of type OBJT_SWAP.
801  *
802  *      The source and destination objects must be held by the caller.
803  */
804 void
805 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
806                 vm_pindex_t base_index, int destroysource)
807 {
808         vm_pindex_t i;
809
810         ASSERT_LWKT_TOKEN_HELD(vm_object_token(srcobject));
811         ASSERT_LWKT_TOKEN_HELD(vm_object_token(dstobject));
812
813         /*
814          * transfer source to destination.
815          */
816         for (i = 0; i < dstobject->size; ++i) {
817                 swblk_t dstaddr;
818
819                 /*
820                  * Locate (without changing) the swapblk on the destination,
821                  * unless it is invalid in which case free it silently, or
822                  * if the destination is a resident page, in which case the
823                  * source is thrown away.
824                  */
825                 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
826
827                 if (dstaddr == SWAPBLK_NONE) {
828                         /*
829                          * Destination has no swapblk and is not resident,
830                          * copy source.
831                          */
832                         swblk_t srcaddr;
833
834                         srcaddr = swp_pager_meta_ctl(srcobject,
835                                                      base_index + i, SWM_POP);
836
837                         if (srcaddr != SWAPBLK_NONE)
838                                 swp_pager_meta_build(dstobject, i, srcaddr);
839                 } else {
840                         /*
841                          * Destination has valid swapblk or it is represented
842                          * by a resident page.  We destroy the sourceblock.
843                          */
844                         swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
845                 }
846         }
847
848         /*
849          * Free left over swap blocks in source.
850          *
851          * We have to revert the type to OBJT_DEFAULT so we do not accidently
852          * double-remove the object from the swap queues.
853          */
854         if (destroysource) {
855                 /*
856                  * Reverting the type is not necessary, the caller is going
857                  * to destroy srcobject directly, but I'm doing it here
858                  * for consistency since we've removed the object from its
859                  * queues.
860                  */
861                 swp_pager_meta_free_all(srcobject);
862                 if (srcobject->type == OBJT_SWAP)
863                         srcobject->type = OBJT_DEFAULT;
864         }
865 }
866
867 /*
868  * SWAP_PAGER_HASPAGE() -       determine if we have good backing store for
869  *                              the requested page.
870  *
871  *      We determine whether good backing store exists for the requested
872  *      page and return TRUE if it does, FALSE if it doesn't.
873  *
874  *      If TRUE, we also try to determine how much valid, contiguous backing
875  *      store exists before and after the requested page within a reasonable
876  *      distance.  We do not try to restrict it to the swap device stripe
877  *      (that is handled in getpages/putpages).  It probably isn't worth
878  *      doing here.
879  *
880  * No requirements.
881  */
882 boolean_t
883 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
884 {
885         swblk_t blk0;
886
887         /*
888          * do we have good backing store at the requested index ?
889          */
890         vm_object_hold(object);
891         blk0 = swp_pager_meta_ctl(object, pindex, 0);
892
893         if (blk0 == SWAPBLK_NONE) {
894                 vm_object_drop(object);
895                 return (FALSE);
896         }
897         vm_object_drop(object);
898         return (TRUE);
899 }
900
901 /*
902  * Object must be held exclusive or shared by the caller.
903  */
904 boolean_t
905 swap_pager_haspage_locked(vm_object_t object, vm_pindex_t pindex)
906 {
907         if (swp_pager_meta_ctl(object, pindex, 0) == SWAPBLK_NONE)
908                 return FALSE;
909         return TRUE;
910 }
911
912 /*
913  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
914  *
915  * This removes any associated swap backing store, whether valid or
916  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
917  * objects.
918  *
919  * This routine is typically called when a page is made dirty, at
920  * which point any associated swap can be freed.  MADV_FREE also
921  * calls us in a special-case situation
922  *
923  * NOTE!!!  If the page is clean and the swap was valid, the caller
924  *          should make the page dirty before calling this routine.
925  *          This routine does NOT change the m->dirty status of the page.
926  *          Also: MADV_FREE depends on it.
927  *
928  * The page must be busied.
929  * The caller can hold the object to avoid blocking, else we might block.
930  * No other requirements.
931  */
932 void
933 swap_pager_unswapped(vm_page_t m)
934 {
935         if (m->flags & PG_SWAPPED) {
936                 vm_object_hold(m->object);
937                 KKASSERT(m->flags & PG_SWAPPED);
938                 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
939                 vm_page_flag_clear(m, PG_SWAPPED);
940                 vm_object_drop(m->object);
941         }
942 }
943
944 /*
945  * SWAP_PAGER_STRATEGY() - read, write, free blocks
946  *
947  * This implements a VM OBJECT strategy function using swap backing store.
948  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
949  * types.  Only BUF_CMD_{READ,WRITE,FREEBLKS} is supported, any other
950  * requests will return EINVAL.
951  *
952  * This is intended to be a cacheless interface (i.e. caching occurs at
953  * higher levels), and is also used as a swap-based SSD cache for vnode
954  * and device objects.
955  *
956  * All I/O goes directly to and from the swap device.
957  *      
958  * We currently attempt to run I/O synchronously or asynchronously as
959  * the caller requests.  This isn't perfect because we loose error
960  * sequencing when we run multiple ops in parallel to satisfy a request.
961  * But this is swap, so we let it all hang out.
962  *
963  * NOTE: This function supports the KVABIO API wherein bp->b_data might
964  *       not be synchronized to the current cpu.
965  *
966  * No requirements.
967  */
968 void
969 swap_pager_strategy(vm_object_t object, struct bio *bio)
970 {
971         struct buf *bp = bio->bio_buf;
972         struct bio *nbio;
973         vm_pindex_t start;
974         vm_pindex_t biox_blkno = 0;
975         int count;
976         char *data;
977         struct bio *biox;
978         struct buf *bufx;
979 #if 0
980         struct bio_track *track;
981 #endif
982
983 #if 0
984         /*
985          * tracking for swapdev vnode I/Os
986          */
987         if (bp->b_cmd == BUF_CMD_READ)
988                 track = &swapdev_vp->v_track_read;
989         else
990                 track = &swapdev_vp->v_track_write;
991 #endif
992
993         /*
994          * Only supported commands
995          */
996         if (bp->b_cmd != BUF_CMD_FREEBLKS &&
997             bp->b_cmd != BUF_CMD_READ &&
998             bp->b_cmd != BUF_CMD_WRITE) {
999                 bp->b_error = EINVAL;
1000                 bp->b_flags |= B_ERROR | B_INVAL;
1001                 biodone(bio);
1002                 return;
1003         }
1004
1005         /*
1006          * bcount must be an integral number of pages.
1007          */
1008         if (bp->b_bcount & PAGE_MASK) {
1009                 bp->b_error = EINVAL;
1010                 bp->b_flags |= B_ERROR | B_INVAL;
1011                 biodone(bio);
1012                 kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
1013                         "not page bounded\n",
1014                         bp, (long long)bio->bio_offset, (int)bp->b_bcount);
1015                 return;
1016         }
1017
1018         /*
1019          * Clear error indication, initialize page index, count, data pointer.
1020          */
1021         bp->b_error = 0;
1022         bp->b_flags &= ~B_ERROR;
1023         bp->b_resid = bp->b_bcount;
1024
1025         start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
1026         count = howmany(bp->b_bcount, PAGE_SIZE);
1027
1028         /*
1029          * WARNING!  Do not dereference *data without issuing a bkvasync()
1030          */
1031         data = bp->b_data;
1032
1033         /*
1034          * Deal with BUF_CMD_FREEBLKS
1035          */
1036         if (bp->b_cmd == BUF_CMD_FREEBLKS) {
1037                 /*
1038                  * FREE PAGE(s) - destroy underlying swap that is no longer
1039                  *                needed.
1040                  */
1041                 vm_object_hold(object);
1042                 swp_pager_meta_free(object, start, count);
1043                 vm_object_drop(object);
1044                 bp->b_resid = 0;
1045                 biodone(bio);
1046                 return;
1047         }
1048
1049         /*
1050          * We need to be able to create a new cluster of I/O's.  We cannot
1051          * use the caller fields of the passed bio so push a new one.
1052          *
1053          * Because nbio is just a placeholder for the cluster links,
1054          * we can biodone() the original bio instead of nbio to make
1055          * things a bit more efficient.
1056          */
1057         nbio = push_bio(bio);
1058         nbio->bio_offset = bio->bio_offset;
1059         nbio->bio_caller_info1.cluster_head = NULL;
1060         nbio->bio_caller_info2.cluster_tail = NULL;
1061
1062         biox = NULL;
1063         bufx = NULL;
1064
1065         /*
1066          * Execute read or write
1067          */
1068         vm_object_hold(object);
1069
1070         while (count > 0) {
1071                 swblk_t blk;
1072
1073                 /*
1074                  * Obtain block.  If block not found and writing, allocate a
1075                  * new block and build it into the object.
1076                  */
1077                 blk = swp_pager_meta_ctl(object, start, 0);
1078                 if ((blk == SWAPBLK_NONE) && bp->b_cmd == BUF_CMD_WRITE) {
1079                         blk = swp_pager_getswapspace(object, 1);
1080                         if (blk == SWAPBLK_NONE) {
1081                                 bp->b_error = ENOMEM;
1082                                 bp->b_flags |= B_ERROR;
1083                                 break;
1084                         }
1085                         swp_pager_meta_build(object, start, blk);
1086                 }
1087                         
1088                 /*
1089                  * Do we have to flush our current collection?  Yes if:
1090                  *
1091                  *      - no swap block at this index
1092                  *      - swap block is not contiguous
1093                  *      - we cross a physical disk boundry in the
1094                  *        stripe.
1095                  */
1096                 if (biox &&
1097                     (biox_blkno + btoc(bufx->b_bcount) != blk ||
1098                      ((biox_blkno ^ blk) & ~SWB_DMMASK))) {
1099                         switch(bp->b_cmd) {
1100                         case BUF_CMD_READ:
1101                                 ++mycpu->gd_cnt.v_swapin;
1102                                 mycpu->gd_cnt.v_swappgsin +=
1103                                         btoc(bufx->b_bcount);
1104                                 break;
1105                         case BUF_CMD_WRITE:
1106                                 ++mycpu->gd_cnt.v_swapout;
1107                                 mycpu->gd_cnt.v_swappgsout +=
1108                                         btoc(bufx->b_bcount);
1109                                 bufx->b_dirtyend = bufx->b_bcount;
1110                                 break;
1111                         default:
1112                                 /* NOT REACHED */
1113                                 break;
1114                         }
1115
1116                         /*
1117                          * Finished with this buf.
1118                          */
1119                         KKASSERT(bufx->b_bcount != 0);
1120                         if (bufx->b_cmd != BUF_CMD_READ)
1121                                 bufx->b_dirtyend = bufx->b_bcount;
1122                         biox = NULL;
1123                         bufx = NULL;
1124                 }
1125
1126                 /*
1127                  * Add new swapblk to biox, instantiating biox if necessary.
1128                  * Zero-fill reads are able to take a shortcut.
1129                  */
1130                 if (blk == SWAPBLK_NONE) {
1131                         /*
1132                          * We can only get here if we are reading.
1133                          */
1134                         bkvasync(bp);
1135                         bzero(data, PAGE_SIZE);
1136                         bp->b_resid -= PAGE_SIZE;
1137                 } else {
1138                         if (biox == NULL) {
1139                                 /* XXX chain count > 4, wait to <= 4 */
1140
1141                                 bufx = getpbuf(NULL);
1142                                 bufx->b_flags |= B_KVABIO;
1143                                 biox = &bufx->b_bio1;
1144                                 cluster_append(nbio, bufx);
1145                                 bufx->b_cmd = bp->b_cmd;
1146                                 biox->bio_done = swap_chain_iodone;
1147                                 biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1148                                 biox->bio_caller_info1.cluster_parent = nbio;
1149                                 biox_blkno = blk;
1150                                 bufx->b_bcount = 0;
1151                                 bufx->b_data = data;
1152                         }
1153                         bufx->b_bcount += PAGE_SIZE;
1154                 }
1155                 --count;
1156                 ++start;
1157                 data += PAGE_SIZE;
1158         }
1159
1160         vm_object_drop(object);
1161
1162         /*
1163          *  Flush out last buffer
1164          */
1165         if (biox) {
1166                 if (bufx->b_cmd == BUF_CMD_READ) {
1167                         ++mycpu->gd_cnt.v_swapin;
1168                         mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1169                 } else {
1170                         ++mycpu->gd_cnt.v_swapout;
1171                         mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1172                         bufx->b_dirtyend = bufx->b_bcount;
1173                 }
1174                 KKASSERT(bufx->b_bcount);
1175                 if (bufx->b_cmd != BUF_CMD_READ)
1176                         bufx->b_dirtyend = bufx->b_bcount;
1177                 /* biox, bufx = NULL */
1178         }
1179
1180         /*
1181          * Now initiate all the I/O.  Be careful looping on our chain as
1182          * I/O's may complete while we are still initiating them.
1183          *
1184          * If the request is a 100% sparse read no bios will be present
1185          * and we just biodone() the buffer.
1186          */
1187         nbio->bio_caller_info2.cluster_tail = NULL;
1188         bufx = nbio->bio_caller_info1.cluster_head;
1189
1190         if (bufx) {
1191                 while (bufx) {
1192                         biox = &bufx->b_bio1;
1193                         BUF_KERNPROC(bufx);
1194                         bufx = bufx->b_cluster_next;
1195                         vn_strategy(swapdev_vp, biox);
1196                 }
1197         } else {
1198                 biodone(bio);
1199         }
1200
1201         /*
1202          * Completion of the cluster will also call biodone_chain(nbio).
1203          * We never call biodone(nbio) so we don't have to worry about
1204          * setting up a bio_done callback.  It's handled in the sub-IO.
1205          */
1206         /**/
1207 }
1208
1209 /*
1210  * biodone callback
1211  *
1212  * No requirements.
1213  */
1214 static void
1215 swap_chain_iodone(struct bio *biox)
1216 {
1217         struct buf **nextp;
1218         struct buf *bufx;       /* chained sub-buffer */
1219         struct bio *nbio;       /* parent nbio with chain glue */
1220         struct buf *bp;         /* original bp associated with nbio */
1221         int chain_empty;
1222
1223         bufx = biox->bio_buf;
1224         nbio = biox->bio_caller_info1.cluster_parent;
1225         bp = nbio->bio_buf;
1226
1227         /*
1228          * Update the original buffer
1229          */
1230         KKASSERT(bp != NULL);
1231         if (bufx->b_flags & B_ERROR) {
1232                 atomic_set_int(&bufx->b_flags, B_ERROR);
1233                 bp->b_error = bufx->b_error;    /* race ok */
1234         } else if (bufx->b_resid != 0) {
1235                 atomic_set_int(&bufx->b_flags, B_ERROR);
1236                 bp->b_error = EINVAL;           /* race ok */
1237         } else {
1238                 atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1239         }
1240
1241         /*
1242          * Remove us from the chain.
1243          */
1244         spin_lock(&swapbp_spin);
1245         nextp = &nbio->bio_caller_info1.cluster_head;
1246         while (*nextp != bufx) {
1247                 KKASSERT(*nextp != NULL);
1248                 nextp = &(*nextp)->b_cluster_next;
1249         }
1250         *nextp = bufx->b_cluster_next;
1251         chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1252         spin_unlock(&swapbp_spin);
1253
1254         /*
1255          * Clean up bufx.  If the chain is now empty we finish out
1256          * the parent.  Note that we may be racing other completions
1257          * so we must use the chain_empty status from above.
1258          */
1259         if (chain_empty) {
1260                 if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1261                         atomic_set_int(&bp->b_flags, B_ERROR);
1262                         bp->b_error = EINVAL;
1263                 }
1264                 biodone_chain(nbio);
1265         }
1266         relpbuf(bufx, NULL);
1267 }
1268
1269 /*
1270  * SWAP_PAGER_GETPAGES() - bring page in from swap
1271  *
1272  * The requested page may have to be brought in from swap.  Calculate the
1273  * swap block and bring in additional pages if possible.  All pages must
1274  * have contiguous swap block assignments and reside in the same object.
1275  *
1276  * The caller has a single vm_object_pip_add() reference prior to
1277  * calling us and we should return with the same.
1278  *
1279  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1280  * and any additinal pages unbusied.
1281  *
1282  * If the caller encounters a PG_RAM page it will pass it to us even though
1283  * it may be valid and dirty.  We cannot overwrite the page in this case!
1284  * The case is used to allow us to issue pure read-aheads.
1285  *
1286  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1287  *       the PG_RAM page is validated at the same time as mreq.  What we
1288  *       really need to do is issue a separate read-ahead pbuf.
1289  *
1290  * No requirements.
1291  */
1292 static int
1293 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1294 {
1295         struct buf *bp;
1296         struct bio *bio;
1297         vm_page_t mreq;
1298         vm_page_t m;
1299         vm_offset_t kva;
1300         swblk_t blk;
1301         int i;
1302         int j;
1303         int raonly;
1304         int error;
1305         u_int32_t busy_count;
1306         vm_page_t marray[XIO_INTERNAL_PAGES];
1307
1308         mreq = *mpp;
1309
1310         vm_object_hold(object);
1311         if (mreq->object != object) {
1312                 panic("swap_pager_getpages: object mismatch %p/%p", 
1313                     object, 
1314                     mreq->object
1315                 );
1316         }
1317
1318         /*
1319          * We don't want to overwrite a fully valid page as it might be
1320          * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1321          * valid page with PG_RAM set.
1322          *
1323          * In this case we see if the next page is a suitable page-in
1324          * candidate and if it is we issue read-ahead.  PG_RAM will be
1325          * set on the last page of the read-ahead to continue the pipeline.
1326          */
1327         if (mreq->valid == VM_PAGE_BITS_ALL) {
1328                 if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size) {
1329                         vm_object_drop(object);
1330                         return(VM_PAGER_OK);
1331                 }
1332                 blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1333                 if (blk == SWAPBLK_NONE) {
1334                         vm_object_drop(object);
1335                         return(VM_PAGER_OK);
1336                 }
1337                 m = vm_page_lookup_busy_try(object, mreq->pindex + 1,
1338                                             TRUE, &error);
1339                 if (error) {
1340                         vm_object_drop(object);
1341                         return(VM_PAGER_OK);
1342                 } else if (m == NULL) {
1343                         /*
1344                          * Use VM_ALLOC_QUICK to avoid blocking on cache
1345                          * page reuse.
1346                          */
1347                         m = vm_page_alloc(object, mreq->pindex + 1,
1348                                           VM_ALLOC_QUICK);
1349                         if (m == NULL) {
1350                                 vm_object_drop(object);
1351                                 return(VM_PAGER_OK);
1352                         }
1353                 } else {
1354                         if (m->valid) {
1355                                 vm_page_wakeup(m);
1356                                 vm_object_drop(object);
1357                                 return(VM_PAGER_OK);
1358                         }
1359                         vm_page_unqueue_nowakeup(m);
1360                 }
1361                 /* page is busy */
1362                 mreq = m;
1363                 raonly = 1;
1364         } else {
1365                 raonly = 0;
1366         }
1367
1368         /*
1369          * Try to block-read contiguous pages from swap if sequential,
1370          * otherwise just read one page.  Contiguous pages from swap must
1371          * reside within a single device stripe because the I/O cannot be
1372          * broken up across multiple stripes.
1373          *
1374          * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1375          * set up such that the case(s) are handled implicitly.
1376          */
1377         blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1378         marray[0] = mreq;
1379
1380         for (i = 1; i <= swap_burst_read &&
1381                     i < XIO_INTERNAL_PAGES &&
1382                     mreq->pindex + i < object->size; ++i) {
1383                 swblk_t iblk;
1384
1385                 iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1386                 if (iblk != blk + i)
1387                         break;
1388                 if ((blk ^ iblk) & ~SWB_DMMASK)
1389                         break;
1390                 m = vm_page_lookup_busy_try(object, mreq->pindex + i,
1391                                             TRUE, &error);
1392                 if (error) {
1393                         break;
1394                 } else if (m == NULL) {
1395                         /*
1396                          * Use VM_ALLOC_QUICK to avoid blocking on cache
1397                          * page reuse.
1398                          */
1399                         m = vm_page_alloc(object, mreq->pindex + i,
1400                                           VM_ALLOC_QUICK);
1401                         if (m == NULL)
1402                                 break;
1403                 } else {
1404                         if (m->valid) {
1405                                 vm_page_wakeup(m);
1406                                 break;
1407                         }
1408                         vm_page_unqueue_nowakeup(m);
1409                 }
1410                 /* page is busy */
1411                 marray[i] = m;
1412         }
1413         if (i > 1)
1414                 vm_page_flag_set(marray[i - 1], PG_RAM);
1415
1416         /*
1417          * If mreq is the requested page and we have nothing to do return
1418          * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1419          * page and must be cleaned up.
1420          */
1421         if (blk == SWAPBLK_NONE) {
1422                 KKASSERT(i == 1);
1423                 if (raonly) {
1424                         vnode_pager_freepage(mreq);
1425                         vm_object_drop(object);
1426                         return(VM_PAGER_OK);
1427                 } else {
1428                         vm_object_drop(object);
1429                         return(VM_PAGER_FAIL);
1430                 }
1431         }
1432
1433         /*
1434          * Map our page(s) into kva for input
1435          *
1436          * Use the KVABIO API to avoid synchronizing the pmap.
1437          */
1438         bp = getpbuf_kva(&nsw_rcount);
1439         bio = &bp->b_bio1;
1440         kva = (vm_offset_t) bp->b_kvabase;
1441         bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1442         pmap_qenter_noinval(kva, bp->b_xio.xio_pages, i);
1443
1444         bp->b_data = (caddr_t)kva;
1445         bp->b_bcount = PAGE_SIZE * i;
1446         bp->b_xio.xio_npages = i;
1447         bp->b_flags |= B_KVABIO;
1448         bio->bio_done = swp_pager_async_iodone;
1449         bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1450         bio->bio_caller_info1.index = SWBIO_READ;
1451
1452         /*
1453          * Set index.  If raonly set the index beyond the array so all
1454          * the pages are treated the same, otherwise the original mreq is
1455          * at index 0.
1456          */
1457         if (raonly)
1458                 bio->bio_driver_info = (void *)(intptr_t)i;
1459         else
1460                 bio->bio_driver_info = (void *)(intptr_t)0;
1461
1462         for (j = 0; j < i; ++j) {
1463                 atomic_set_int(&bp->b_xio.xio_pages[j]->busy_count,
1464                                PBUSY_SWAPINPROG);
1465         }
1466
1467         mycpu->gd_cnt.v_swapin++;
1468         mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1469
1470         /*
1471          * We still hold the lock on mreq, and our automatic completion routine
1472          * does not remove it.
1473          */
1474         vm_object_pip_add(object, bp->b_xio.xio_npages);
1475
1476         /*
1477          * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1478          * this point because we automatically release it on completion.
1479          * Instead, we look at the one page we are interested in which we
1480          * still hold a lock on even through the I/O completion.
1481          *
1482          * The other pages in our m[] array are also released on completion,
1483          * so we cannot assume they are valid anymore either.
1484          */
1485         bp->b_cmd = BUF_CMD_READ;
1486         BUF_KERNPROC(bp);
1487         vn_strategy(swapdev_vp, bio);
1488
1489         /*
1490          * Wait for the page we want to complete.  PBUSY_SWAPINPROG is always
1491          * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1492          * is set in the meta-data.
1493          *
1494          * If this is a read-ahead only we return immediately without
1495          * waiting for I/O.
1496          */
1497         if (raonly) {
1498                 vm_object_drop(object);
1499                 return(VM_PAGER_OK);
1500         }
1501
1502         /*
1503          * Read-ahead includes originally requested page case.
1504          */
1505         for (;;) {
1506                 busy_count = mreq->busy_count;
1507                 cpu_ccfence();
1508                 if ((busy_count & PBUSY_SWAPINPROG) == 0)
1509                         break;
1510                 tsleep_interlock(mreq, 0);
1511                 if (!atomic_cmpset_int(&mreq->busy_count, busy_count,
1512                                        busy_count |
1513                                         PBUSY_SWAPINPROG | PBUSY_WANTED)) {
1514                         continue;
1515                 }
1516                 atomic_set_int(&mreq->flags, PG_REFERENCED);
1517                 mycpu->gd_cnt.v_intrans++;
1518                 if (tsleep(mreq, PINTERLOCKED, "swread", hz*20)) {
1519                         kprintf(
1520                             "swap_pager: indefinite wait buffer: "
1521                                 " bp %p offset: %lld, size: %ld\n",
1522                             bp,
1523                             (long long)bio->bio_offset,
1524                             (long)bp->b_bcount
1525                         );
1526                 }
1527         }
1528
1529         /*
1530          * Disallow speculative reads prior to the SWAPINPROG test.
1531          */
1532         cpu_lfence();
1533
1534         /*
1535          * mreq is left busied after completion, but all the other pages
1536          * are freed.  If we had an unrecoverable read error the page will
1537          * not be valid.
1538          */
1539         vm_object_drop(object);
1540         if (mreq->valid != VM_PAGE_BITS_ALL)
1541                 return(VM_PAGER_ERROR);
1542         else
1543                 return(VM_PAGER_OK);
1544
1545         /*
1546          * A final note: in a low swap situation, we cannot deallocate swap
1547          * and mark a page dirty here because the caller is likely to mark
1548          * the page clean when we return, causing the page to possibly revert 
1549          * to all-zero's later.
1550          */
1551 }
1552
1553 /*
1554  *      swap_pager_putpages: 
1555  *
1556  *      Assign swap (if necessary) and initiate I/O on the specified pages.
1557  *
1558  *      We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1559  *      are automatically converted to SWAP objects.
1560  *
1561  *      In a low memory situation we may block in vn_strategy(), but the new 
1562  *      vm_page reservation system coupled with properly written VFS devices 
1563  *      should ensure that no low-memory deadlock occurs.  This is an area
1564  *      which needs work.
1565  *
1566  *      The parent has N vm_object_pip_add() references prior to
1567  *      calling us and will remove references for rtvals[] that are
1568  *      not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1569  *      completion.
1570  *
1571  *      The parent has soft-busy'd the pages it passes us and will unbusy
1572  *      those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
1573  *      We need to unbusy the rest on I/O completion.
1574  *
1575  * No requirements.
1576  */
1577 void
1578 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1579                     int flags, int *rtvals)
1580 {
1581         int i;
1582         int n = 0;
1583
1584         vm_object_hold(object);
1585
1586         if (count && m[0]->object != object) {
1587                 panic("swap_pager_getpages: object mismatch %p/%p", 
1588                     object, 
1589                     m[0]->object
1590                 );
1591         }
1592
1593         /*
1594          * Step 1
1595          *
1596          * Turn object into OBJT_SWAP
1597          * Check for bogus sysops
1598          *
1599          * Force sync if not pageout process, we don't want any single
1600          * non-pageout process to be able to hog the I/O subsystem!  This
1601          * can be overridden by setting.
1602          */
1603         if (object->type == OBJT_DEFAULT) {
1604                 if (object->type == OBJT_DEFAULT)
1605                         swp_pager_meta_convert(object);
1606         }
1607
1608         /*
1609          * Normally we force synchronous swap I/O if this is not the
1610          * pageout daemon to prevent any single user process limited
1611          * via RLIMIT_RSS from hogging swap write bandwidth.
1612          */
1613         if (curthread != pagethread &&
1614             curthread != emergpager &&
1615             swap_user_async == 0) {
1616                 flags |= VM_PAGER_PUT_SYNC;
1617         }
1618
1619         /*
1620          * Step 2
1621          *
1622          * Update nsw parameters from swap_async_max sysctl values.  
1623          * Do not let the sysop crash the machine with bogus numbers.
1624          */
1625         if (swap_async_max != nsw_wcount_async_max) {
1626                 int n;
1627
1628                 /*
1629                  * limit range
1630                  */
1631                 if ((n = swap_async_max) > nswbuf_kva / 2)
1632                         n = nswbuf_kva / 2;
1633                 if (n < 1)
1634                         n = 1;
1635                 swap_async_max = n;
1636
1637                 /*
1638                  * Adjust difference ( if possible ).  If the current async
1639                  * count is too low, we may not be able to make the adjustment
1640                  * at this time.
1641                  *
1642                  * vm_token needed for nsw_wcount sleep interlock
1643                  */
1644                 lwkt_gettoken(&vm_token);
1645                 n -= nsw_wcount_async_max;
1646                 if (nsw_wcount_async + n >= 0) {
1647                         nsw_wcount_async_max += n;
1648                         pbuf_adjcount(&nsw_wcount_async, n);
1649                 }
1650                 lwkt_reltoken(&vm_token);
1651         }
1652
1653         /*
1654          * Step 3
1655          *
1656          * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1657          * The page is left dirty until the pageout operation completes
1658          * successfully.
1659          */
1660
1661         for (i = 0; i < count; i += n) {
1662                 struct buf *bp;
1663                 struct bio *bio;
1664                 swblk_t blk;
1665                 int j;
1666
1667                 /*
1668                  * Maximum I/O size is limited by a number of factors.
1669                  */
1670
1671                 n = min(BLIST_MAX_ALLOC, count - i);
1672                 n = min(n, nsw_cluster_max);
1673
1674                 lwkt_gettoken(&vm_token);
1675
1676                 /*
1677                  * Get biggest block of swap we can.  If we fail, fall
1678                  * back and try to allocate a smaller block.  Don't go
1679                  * overboard trying to allocate space if it would overly
1680                  * fragment swap.
1681                  */
1682                 while (
1683                     (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1684                     n > 4
1685                 ) {
1686                         n >>= 1;
1687                 }
1688                 if (blk == SWAPBLK_NONE) {
1689                         for (j = 0; j < n; ++j)
1690                                 rtvals[i+j] = VM_PAGER_FAIL;
1691                         lwkt_reltoken(&vm_token);
1692                         continue;
1693                 }
1694                 if (vm_report_swap_allocs > 0) {
1695                         kprintf("swap_alloc %08jx,%d\n", (intmax_t)blk, n);
1696                         --vm_report_swap_allocs;
1697                 }
1698
1699                 /*
1700                  * The I/O we are constructing cannot cross a physical
1701                  * disk boundry in the swap stripe.
1702                  */
1703                 if ((blk ^ (blk + n)) & ~SWB_DMMASK) {
1704                         j = ((blk + SWB_DMMAX) & ~SWB_DMMASK) - blk;
1705                         swp_pager_freeswapspace(object, blk + j, n - j);
1706                         n = j;
1707                 }
1708
1709                 /*
1710                  * All I/O parameters have been satisfied, build the I/O
1711                  * request and assign the swap space.
1712                  *
1713                  * Use the KVABIO API to avoid synchronizing the pmap.
1714                  */
1715                 if ((flags & VM_PAGER_PUT_SYNC))
1716                         bp = getpbuf_kva(&nsw_wcount_sync);
1717                 else
1718                         bp = getpbuf_kva(&nsw_wcount_async);
1719                 bio = &bp->b_bio1;
1720
1721                 lwkt_reltoken(&vm_token);
1722
1723                 pmap_qenter_noinval((vm_offset_t)bp->b_data, &m[i], n);
1724
1725                 bp->b_flags |= B_KVABIO;
1726                 bp->b_bcount = PAGE_SIZE * n;
1727                 bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1728
1729                 for (j = 0; j < n; ++j) {
1730                         vm_page_t mreq = m[i+j];
1731
1732                         swp_pager_meta_build(mreq->object, mreq->pindex,
1733                                              blk + j);
1734                         if (object->type == OBJT_SWAP)
1735                                 vm_page_dirty(mreq);
1736                         rtvals[i+j] = VM_PAGER_OK;
1737
1738                         atomic_set_int(&mreq->busy_count, PBUSY_SWAPINPROG);
1739                         bp->b_xio.xio_pages[j] = mreq;
1740                 }
1741                 bp->b_xio.xio_npages = n;
1742
1743                 mycpu->gd_cnt.v_swapout++;
1744                 mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1745
1746                 bp->b_dirtyoff = 0;             /* req'd for NFS */
1747                 bp->b_dirtyend = bp->b_bcount;  /* req'd for NFS */
1748                 bp->b_cmd = BUF_CMD_WRITE;
1749                 bio->bio_caller_info1.index = SWBIO_WRITE;
1750
1751                 /*
1752                  * asynchronous
1753                  */
1754                 if ((flags & VM_PAGER_PUT_SYNC) == 0) {
1755                         bio->bio_done = swp_pager_async_iodone;
1756                         BUF_KERNPROC(bp);
1757                         vn_strategy(swapdev_vp, bio);
1758
1759                         for (j = 0; j < n; ++j)
1760                                 rtvals[i+j] = VM_PAGER_PEND;
1761                         continue;
1762                 }
1763
1764                 /*
1765                  * Issue synchrnously.
1766                  *
1767                  * Wait for the sync I/O to complete, then update rtvals.
1768                  * We just set the rtvals[] to VM_PAGER_PEND so we can call
1769                  * our async completion routine at the end, thus avoiding a
1770                  * double-free.
1771                  */
1772                 bio->bio_caller_info1.index |= SWBIO_SYNC;
1773                 if (flags & VM_PAGER_TRY_TO_CACHE)
1774                         bio->bio_caller_info1.index |= SWBIO_TTC;
1775                 bio->bio_done = biodone_sync;
1776                 bio->bio_flags |= BIO_SYNC;
1777                 vn_strategy(swapdev_vp, bio);
1778                 biowait(bio, "swwrt");
1779
1780                 for (j = 0; j < n; ++j)
1781                         rtvals[i+j] = VM_PAGER_PEND;
1782
1783                 /*
1784                  * Now that we are through with the bp, we can call the
1785                  * normal async completion, which frees everything up.
1786                  */
1787                 swp_pager_async_iodone(bio);
1788         }
1789         vm_object_drop(object);
1790 }
1791
1792 /*
1793  * No requirements.
1794  *
1795  * Recalculate the low and high-water marks.
1796  */
1797 void
1798 swap_pager_newswap(void)
1799 {
1800         /*
1801          * NOTE: vm_swap_max cannot exceed 1 billion blocks, which is the
1802          *       limitation imposed by the blist code.  Remember that this
1803          *       will be divided by NSWAP_MAX (4), so each swap device is
1804          *       limited to around a terrabyte.
1805          */
1806         if (vm_swap_max) {
1807                 nswap_lowat = (int64_t)vm_swap_max * 4 / 100;   /* 4% left */
1808                 nswap_hiwat = (int64_t)vm_swap_max * 6 / 100;   /* 6% left */
1809                 kprintf("swap low/high-water marks set to %d/%d\n",
1810                         nswap_lowat, nswap_hiwat);
1811         } else {
1812                 nswap_lowat = 128;
1813                 nswap_hiwat = 512;
1814         }
1815         swp_sizecheck();
1816 }
1817
1818 /*
1819  *      swp_pager_async_iodone:
1820  *
1821  *      Completion routine for asynchronous reads and writes from/to swap.
1822  *      Also called manually by synchronous code to finish up a bp.
1823  *
1824  *      For READ operations, the pages are BUSY'd.  For WRITE operations,
1825  *      the pages are vm_page_t->busy'd.  For READ operations, we BUSY
1826  *      unbusy all pages except the 'main' request page.  For WRITE 
1827  *      operations, we vm_page_t->busy'd unbusy all pages ( we can do this 
1828  *      because we marked them all VM_PAGER_PEND on return from putpages ).
1829  *
1830  *      This routine may not block.
1831  *
1832  * No requirements.
1833  */
1834 static void
1835 swp_pager_async_iodone(struct bio *bio)
1836 {
1837         struct buf *bp = bio->bio_buf;
1838         vm_object_t object = NULL;
1839         int i;
1840         int *nswptr;
1841
1842         /*
1843          * report error
1844          */
1845         if (bp->b_flags & B_ERROR) {
1846                 kprintf(
1847                     "swap_pager: I/O error - %s failed; offset %lld,"
1848                         "size %ld, error %d\n",
1849                     ((bio->bio_caller_info1.index & SWBIO_READ) ?
1850                         "pagein" : "pageout"),
1851                     (long long)bio->bio_offset,
1852                     (long)bp->b_bcount,
1853                     bp->b_error
1854                 );
1855         }
1856
1857         /*
1858          * set object.
1859          */
1860         if (bp->b_xio.xio_npages)
1861                 object = bp->b_xio.xio_pages[0]->object;
1862
1863 #if 0
1864         /* PMAP TESTING CODE (useful, keep it in but #if 0'd) */
1865         if (bio->bio_caller_info1.index & SWBIO_WRITE) {
1866                 if (bio->bio_crc != iscsi_crc32(bp->b_data, bp->b_bcount)) {
1867                         kprintf("SWAPOUT: BADCRC %08x %08x\n",
1868                                 bio->bio_crc,
1869                                 iscsi_crc32(bp->b_data, bp->b_bcount));
1870                         for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1871                                 vm_page_t m = bp->b_xio.xio_pages[i];
1872                                 if (m->flags & PG_WRITEABLE)
1873                                         kprintf("SWAPOUT: "
1874                                                 "%d/%d %p writable\n",
1875                                                 i, bp->b_xio.xio_npages, m);
1876                         }
1877                 }
1878         }
1879 #endif
1880
1881         /*
1882          * remove the mapping for kernel virtual
1883          */
1884         pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
1885
1886         /*
1887          * cleanup pages.  If an error occurs writing to swap, we are in
1888          * very serious trouble.  If it happens to be a disk error, though,
1889          * we may be able to recover by reassigning the swap later on.  So
1890          * in this case we remove the m->swapblk assignment for the page 
1891          * but do not free it in the rlist.  The errornous block(s) are thus
1892          * never reallocated as swap.  Redirty the page and continue.
1893          */
1894         for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1895                 vm_page_t m = bp->b_xio.xio_pages[i];
1896
1897                 if (bp->b_flags & B_ERROR) {
1898                         /*
1899                          * If an error occurs I'd love to throw the swapblk
1900                          * away without freeing it back to swapspace, so it
1901                          * can never be used again.  But I can't from an 
1902                          * interrupt.
1903                          */
1904
1905                         if (bio->bio_caller_info1.index & SWBIO_READ) {
1906                                 /*
1907                                  * When reading, reqpage needs to stay
1908                                  * locked for the parent, but all other
1909                                  * pages can be freed.  We still want to
1910                                  * wakeup the parent waiting on the page,
1911                                  * though.  ( also: pg_reqpage can be -1 and 
1912                                  * not match anything ).
1913                                  *
1914                                  * We have to wake specifically requested pages
1915                                  * up too because we cleared SWAPINPROG and
1916                                  * someone may be waiting for that.
1917                                  *
1918                                  * NOTE: For reads, m->dirty will probably
1919                                  *       be overridden by the original caller
1920                                  *       of getpages so don't play cute tricks
1921                                  *       here.
1922                                  *
1923                                  * NOTE: We can't actually free the page from
1924                                  *       here, because this is an interrupt.
1925                                  *       It is not legal to mess with
1926                                  *       object->memq from an interrupt.
1927                                  *       Deactivate the page instead.
1928                                  *
1929                                  * WARNING! The instant SWAPINPROG is
1930                                  *          cleared another cpu may start
1931                                  *          using the mreq page (it will
1932                                  *          check m->valid immediately).
1933                                  */
1934
1935                                 m->valid = 0;
1936                                 atomic_clear_int(&m->busy_count,
1937                                                  PBUSY_SWAPINPROG);
1938
1939                                 /*
1940                                  * bio_driver_info holds the requested page
1941                                  * index.
1942                                  */
1943                                 if (i != (int)(intptr_t)bio->bio_driver_info) {
1944                                         vm_page_deactivate(m);
1945                                         vm_page_wakeup(m);
1946                                 } else {
1947                                         vm_page_flash(m);
1948                                 }
1949                                 /*
1950                                  * If i == bp->b_pager.pg_reqpage, do not wake 
1951                                  * the page up.  The caller needs to.
1952                                  */
1953                         } else {
1954                                 /*
1955                                  * If a write error occurs remove the swap
1956                                  * assignment (note that PG_SWAPPED may or
1957                                  * may not be set depending on prior activity).
1958                                  *
1959                                  * Re-dirty OBJT_SWAP pages as there is no
1960                                  * other backing store, we can't throw the
1961                                  * page away.
1962                                  *
1963                                  * Non-OBJT_SWAP pages (aka swapcache) must
1964                                  * not be dirtied since they may not have
1965                                  * been dirty in the first place, and they
1966                                  * do have backing store (the vnode).
1967                                  */
1968                                 vm_page_busy_wait(m, FALSE, "swadpg");
1969                                 vm_object_hold(m->object);
1970                                 swp_pager_meta_ctl(m->object, m->pindex,
1971                                                    SWM_FREE);
1972                                 vm_page_flag_clear(m, PG_SWAPPED);
1973                                 vm_object_drop(m->object);
1974                                 if (m->object->type == OBJT_SWAP) {
1975                                         vm_page_dirty(m);
1976                                         vm_page_activate(m);
1977                                 }
1978                                 vm_page_io_finish(m);
1979                                 atomic_clear_int(&m->busy_count,
1980                                                  PBUSY_SWAPINPROG);
1981                                 vm_page_wakeup(m);
1982                         }
1983                 } else if (bio->bio_caller_info1.index & SWBIO_READ) {
1984                         /*
1985                          * NOTE: for reads, m->dirty will probably be 
1986                          * overridden by the original caller of getpages so
1987                          * we cannot set them in order to free the underlying
1988                          * swap in a low-swap situation.  I don't think we'd
1989                          * want to do that anyway, but it was an optimization
1990                          * that existed in the old swapper for a time before
1991                          * it got ripped out due to precisely this problem.
1992                          *
1993                          * If not the requested page then deactivate it.
1994                          *
1995                          * Note that the requested page, reqpage, is left
1996                          * busied, but we still have to wake it up.  The
1997                          * other pages are released (unbusied) by 
1998                          * vm_page_wakeup().  We do not set reqpage's
1999                          * valid bits here, it is up to the caller.
2000                          */
2001
2002                         /* 
2003                          * NOTE: Can't call pmap_clear_modify(m) from an
2004                          *       interrupt thread, the pmap code may have to
2005                          *       map non-kernel pmaps and currently asserts
2006                          *       the case.
2007                          *
2008                          * WARNING! The instant SWAPINPROG is
2009                          *          cleared another cpu may start
2010                          *          using the mreq page (it will
2011                          *          check m->valid immediately).
2012                          */
2013                         /*pmap_clear_modify(m);*/
2014                         m->valid = VM_PAGE_BITS_ALL;
2015                         vm_page_undirty(m);
2016                         vm_page_flag_set(m, PG_SWAPPED);
2017                         atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
2018
2019                         /*
2020                          * We have to wake specifically requested pages
2021                          * up too because we cleared SWAPINPROG and
2022                          * could be waiting for it in getpages.  However,
2023                          * be sure to not unbusy getpages specifically
2024                          * requested page - getpages expects it to be 
2025                          * left busy.
2026                          *
2027                          * bio_driver_info holds the requested page
2028                          */
2029                         if (i != (int)(intptr_t)bio->bio_driver_info) {
2030                                 vm_page_deactivate(m);
2031                                 vm_page_wakeup(m);
2032                         } else {
2033                                 vm_page_flash(m);
2034                         }
2035                 } else {
2036                         /*
2037                          * Mark the page clean but do not mess with the
2038                          * pmap-layer's modified state.  That state should
2039                          * also be clear since the caller protected the
2040                          * page VM_PROT_READ, but allow the case.
2041                          *
2042                          * We are in an interrupt, avoid pmap operations.
2043                          *
2044                          * If we have a severe page deficit, deactivate the
2045                          * page.  Do not try to cache it (which would also
2046                          * involve a pmap op), because the page might still
2047                          * be read-heavy.
2048                          *
2049                          * When using the swap to cache clean vnode pages
2050                          * we do not mess with the page dirty bits.
2051                          *
2052                          * NOTE! Nobody is waiting for the key mreq page
2053                          *       on write completion.
2054                          */
2055                         vm_page_busy_wait(m, FALSE, "swadpg");
2056                         if (m->object->type == OBJT_SWAP)
2057                                 vm_page_undirty(m);
2058                         vm_page_flag_set(m, PG_SWAPPED);
2059                         atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
2060                         if (vm_page_count_severe())
2061                                 vm_page_deactivate(m);
2062                         vm_page_io_finish(m);
2063                         if (bio->bio_caller_info1.index & SWBIO_TTC)
2064                                 vm_page_try_to_cache(m);
2065                         else
2066                                 vm_page_wakeup(m);
2067                 }
2068         }
2069
2070         /*
2071          * adjust pip.  NOTE: the original parent may still have its own
2072          * pip refs on the object.
2073          */
2074
2075         if (object)
2076                 vm_object_pip_wakeup_n(object, bp->b_xio.xio_npages);
2077
2078         /*
2079          * Release the physical I/O buffer.
2080          *
2081          * NOTE: Due to synchronous operations in the write case b_cmd may
2082          *       already be set to BUF_CMD_DONE and BIO_SYNC may have already
2083          *       been cleared.
2084          *
2085          * Use vm_token to interlock nsw_rcount/wcount wakeup?
2086          */
2087         lwkt_gettoken(&vm_token);
2088         if (bio->bio_caller_info1.index & SWBIO_READ)
2089                 nswptr = &nsw_rcount;
2090         else if (bio->bio_caller_info1.index & SWBIO_SYNC)
2091                 nswptr = &nsw_wcount_sync;
2092         else
2093                 nswptr = &nsw_wcount_async;
2094         bp->b_cmd = BUF_CMD_DONE;
2095         relpbuf(bp, nswptr);
2096         lwkt_reltoken(&vm_token);
2097 }
2098
2099 /*
2100  * Fault-in a potentially swapped page and remove the swap reference.
2101  * (used by swapoff code)
2102  *
2103  * object must be held.
2104  */
2105 static __inline void
2106 swp_pager_fault_page(vm_object_t object, int *sharedp, vm_pindex_t pindex)
2107 {
2108         struct vnode *vp;
2109         vm_page_t m;
2110         int error;
2111
2112         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2113
2114         if (object->type == OBJT_VNODE) {
2115                 /*
2116                  * Any swap related to a vnode is due to swapcache.  We must
2117                  * vget() the vnode in case it is not active (otherwise
2118                  * vref() will panic).  Calling vm_object_page_remove() will
2119                  * ensure that any swap ref is removed interlocked with the
2120                  * page.  clean_only is set to TRUE so we don't throw away
2121                  * dirty pages.
2122                  */
2123                 vp = object->handle;
2124                 error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
2125                 if (error == 0) {
2126                         vm_object_page_remove(object, pindex, pindex + 1, TRUE);
2127                         vput(vp);
2128                 }
2129         } else {
2130                 /*
2131                  * Otherwise it is a normal OBJT_SWAP object and we can
2132                  * fault the page in and remove the swap.
2133                  */
2134                 m = vm_fault_object_page(object, IDX_TO_OFF(pindex),
2135                                          VM_PROT_NONE,
2136                                          VM_FAULT_DIRTY | VM_FAULT_UNSWAP,
2137                                          sharedp, &error);
2138                 if (m)
2139                         vm_page_unhold(m);
2140         }
2141 }
2142
2143 /*
2144  * This removes all swap blocks related to a particular device.  We have
2145  * to be careful of ripups during the scan.
2146  */
2147 static int swp_pager_swapoff_callback(struct swblock *swap, void *data);
2148
2149 int
2150 swap_pager_swapoff(int devidx)
2151 {
2152         struct vm_object_hash *hash;
2153         struct swswapoffinfo info;
2154         struct vm_object marker;
2155         vm_object_t object;
2156         int n;
2157
2158         bzero(&marker, sizeof(marker));
2159         marker.type = OBJT_MARKER;
2160
2161         for (n = 0; n < VMOBJ_HSIZE; ++n) {
2162                 hash = &vm_object_hash[n];
2163
2164                 lwkt_gettoken(&hash->token);
2165                 TAILQ_INSERT_HEAD(&hash->list, &marker, object_entry);
2166
2167                 while ((object = TAILQ_NEXT(&marker, object_entry)) != NULL) {
2168                         if (object->type == OBJT_MARKER)
2169                                 goto skip;
2170                         if (object->type != OBJT_SWAP &&
2171                             object->type != OBJT_VNODE)
2172                                 goto skip;
2173                         vm_object_hold(object);
2174                         if (object->type != OBJT_SWAP &&
2175                             object->type != OBJT_VNODE) {
2176                                 vm_object_drop(object);
2177                                 goto skip;
2178                         }
2179
2180                         /*
2181                          * Object is special in that we can't just pagein
2182                          * into vm_page's in it (tmpfs, vn).
2183                          */
2184                         if ((object->flags & OBJ_NOPAGEIN) &&
2185                             RB_ROOT(&object->swblock_root)) {
2186                                 vm_object_drop(object);
2187                                 goto skip;
2188                         }
2189
2190                         info.object = object;
2191                         info.shared = 0;
2192                         info.devidx = devidx;
2193                         swblock_rb_tree_RB_SCAN(&object->swblock_root,
2194                                             NULL, swp_pager_swapoff_callback,
2195                                             &info);
2196                         vm_object_drop(object);
2197 skip:
2198                         if (object == TAILQ_NEXT(&marker, object_entry)) {
2199                                 TAILQ_REMOVE(&hash->list, &marker,
2200                                              object_entry);
2201                                 TAILQ_INSERT_AFTER(&hash->list, object,
2202                                                    &marker, object_entry);
2203                         }
2204                 }
2205                 TAILQ_REMOVE(&hash->list, &marker, object_entry);
2206                 lwkt_reltoken(&hash->token);
2207         }
2208
2209         /*
2210          * If we fail to locate all swblocks we just fail gracefully and
2211          * do not bother to restore paging on the swap device.  If the
2212          * user wants to retry the user can retry.
2213          */
2214         if (swdevt[devidx].sw_nused)
2215                 return (1);
2216         else
2217                 return (0);
2218 }
2219
2220 static
2221 int
2222 swp_pager_swapoff_callback(struct swblock *swap, void *data)
2223 {
2224         struct swswapoffinfo *info = data;
2225         vm_object_t object = info->object;
2226         vm_pindex_t index;
2227         swblk_t v;
2228         int i;
2229
2230         index = swap->swb_index;
2231         for (i = 0; i < SWAP_META_PAGES; ++i) {
2232                 /*
2233                  * Make sure we don't race a dying object.  This will
2234                  * kill the scan of the object's swap blocks entirely.
2235                  */
2236                 if (object->flags & OBJ_DEAD)
2237                         return(-1);
2238
2239                 /*
2240                  * Fault the page, which can obviously block.  If the swap
2241                  * structure disappears break out.
2242                  */
2243                 v = swap->swb_pages[i];
2244                 if (v != SWAPBLK_NONE && BLK2DEVIDX(v) == info->devidx) {
2245                         swp_pager_fault_page(object, &info->shared,
2246                                              swap->swb_index + i);
2247                         /* swap ptr might go away */
2248                         if (RB_LOOKUP(swblock_rb_tree,
2249                                       &object->swblock_root, index) != swap) {
2250                                 break;
2251                         }
2252                 }
2253         }
2254         return(0);
2255 }
2256
2257 /************************************************************************
2258  *                              SWAP META DATA                          *
2259  ************************************************************************
2260  *
2261  *      These routines manipulate the swap metadata stored in the 
2262  *      OBJT_SWAP object.
2263  *
2264  *      Swap metadata is implemented with a global hash and not directly
2265  *      linked into the object.  Instead the object simply contains
2266  *      appropriate tracking counters.
2267  */
2268
2269 /*
2270  * Lookup the swblock containing the specified swap block index.
2271  *
2272  * The caller must hold the object.
2273  */
2274 static __inline
2275 struct swblock *
2276 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
2277 {
2278         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2279         index &= ~(vm_pindex_t)SWAP_META_MASK;
2280         return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
2281 }
2282
2283 /*
2284  * Remove a swblock from the RB tree.
2285  *
2286  * The caller must hold the object.
2287  */
2288 static __inline
2289 void
2290 swp_pager_remove(vm_object_t object, struct swblock *swap)
2291 {
2292         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2293         RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
2294 }
2295
2296 /*
2297  * Convert default object to swap object if necessary
2298  *
2299  * The caller must hold the object.
2300  */
2301 static void
2302 swp_pager_meta_convert(vm_object_t object)
2303 {
2304         if (object->type == OBJT_DEFAULT) {
2305                 object->type = OBJT_SWAP;
2306                 KKASSERT(object->swblock_count == 0);
2307         }
2308 }
2309
2310 /*
2311  * SWP_PAGER_META_BUILD() -     add swap block to swap meta data for object
2312  *
2313  *      We first convert the object to a swap object if it is a default
2314  *      object.  Vnode objects do not need to be converted.
2315  *
2316  *      The specified swapblk is added to the object's swap metadata.  If
2317  *      the swapblk is not valid, it is freed instead.  Any previously
2318  *      assigned swapblk is freed.
2319  *
2320  * The caller must hold the object.
2321  */
2322 static void
2323 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, swblk_t swapblk)
2324 {
2325         struct swblock *swap;
2326         struct swblock *oswap;
2327         vm_pindex_t v;
2328
2329         KKASSERT(swapblk != SWAPBLK_NONE);
2330         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2331
2332         /*
2333          * Convert object if necessary
2334          */
2335         if (object->type == OBJT_DEFAULT)
2336                 swp_pager_meta_convert(object);
2337         
2338         /*
2339          * Locate swblock.  If not found create, but if we aren't adding
2340          * anything just return.  If we run out of space in the map we wait
2341          * and, since the hash table may have changed, retry.
2342          */
2343 retry:
2344         swap = swp_pager_lookup(object, index);
2345
2346         if (swap == NULL) {
2347                 int i;
2348
2349                 swap = zalloc(swap_zone);
2350                 if (swap == NULL) {
2351                         vm_wait(0);
2352                         goto retry;
2353                 }
2354                 swap->swb_index = index & ~(vm_pindex_t)SWAP_META_MASK;
2355                 swap->swb_count = 0;
2356
2357                 ++object->swblock_count;
2358
2359                 for (i = 0; i < SWAP_META_PAGES; ++i)
2360                         swap->swb_pages[i] = SWAPBLK_NONE;
2361                 oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
2362                 KKASSERT(oswap == NULL);
2363         }
2364
2365         /*
2366          * Delete prior contents of metadata.
2367          *
2368          * NOTE: Decrement swb_count after the freeing operation (which
2369          *       might block) to prevent racing destruction of the swblock.
2370          */
2371         index &= SWAP_META_MASK;
2372
2373         while ((v = swap->swb_pages[index]) != SWAPBLK_NONE) {
2374                 swap->swb_pages[index] = SWAPBLK_NONE;
2375                 /* can block */
2376                 swp_pager_freeswapspace(object, v, 1);
2377                 --swap->swb_count;
2378                 --mycpu->gd_vmtotal.t_vm;
2379         }
2380
2381         /*
2382          * Enter block into metadata
2383          */
2384         swap->swb_pages[index] = swapblk;
2385         if (swapblk != SWAPBLK_NONE) {
2386                 ++swap->swb_count;
2387                 ++mycpu->gd_vmtotal.t_vm;
2388         }
2389 }
2390
2391 /*
2392  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2393  *
2394  *      The requested range of blocks is freed, with any associated swap 
2395  *      returned to the swap bitmap.
2396  *
2397  *      This routine will free swap metadata structures as they are cleaned 
2398  *      out.  This routine does *NOT* operate on swap metadata associated
2399  *      with resident pages.
2400  *
2401  * The caller must hold the object.
2402  */
2403 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2404
2405 static void
2406 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2407 {
2408         struct swfreeinfo info;
2409
2410         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2411
2412         /*
2413          * Nothing to do
2414          */
2415         if (object->swblock_count == 0) {
2416                 KKASSERT(RB_EMPTY(&object->swblock_root));
2417                 return;
2418         }
2419         if (count == 0)
2420                 return;
2421
2422         /*
2423          * Setup for RB tree scan.  Note that the pindex range can be huge
2424          * due to the 64 bit page index space so we cannot safely iterate.
2425          */
2426         info.object = object;
2427         info.basei = index & ~(vm_pindex_t)SWAP_META_MASK;
2428         info.begi = index;
2429         info.endi = index + count - 1;
2430         swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2431                                 swp_pager_meta_free_callback, &info);
2432 }
2433
2434 /*
2435  * The caller must hold the object.
2436  */
2437 static
2438 int
2439 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2440 {
2441         struct swfreeinfo *info = data;
2442         vm_object_t object = info->object;
2443         int index;
2444         int eindex;
2445
2446         /*
2447          * Figure out the range within the swblock.  The wider scan may
2448          * return edge-case swap blocks when the start and/or end points
2449          * are in the middle of a block.
2450          */
2451         if (swap->swb_index < info->begi)
2452                 index = (int)info->begi & SWAP_META_MASK;
2453         else
2454                 index = 0;
2455
2456         if (swap->swb_index + SWAP_META_PAGES > info->endi)
2457                 eindex = (int)info->endi & SWAP_META_MASK;
2458         else
2459                 eindex = SWAP_META_MASK;
2460
2461         /*
2462          * Scan and free the blocks.  The loop terminates early
2463          * if (swap) runs out of blocks and could be freed.
2464          *
2465          * NOTE: Decrement swb_count after swp_pager_freeswapspace()
2466          *       to deal with a zfree race.
2467          */
2468         while (index <= eindex) {
2469                 swblk_t v = swap->swb_pages[index];
2470
2471                 if (v != SWAPBLK_NONE) {
2472                         swap->swb_pages[index] = SWAPBLK_NONE;
2473                         /* can block */
2474                         swp_pager_freeswapspace(object, v, 1);
2475                         --mycpu->gd_vmtotal.t_vm;
2476                         if (--swap->swb_count == 0) {
2477                                 swp_pager_remove(object, swap);
2478                                 zfree(swap_zone, swap);
2479                                 --object->swblock_count;
2480                                 break;
2481                         }
2482                 }
2483                 ++index;
2484         }
2485
2486         /* swap may be invalid here due to zfree above */
2487         lwkt_yield();
2488
2489         return(0);
2490 }
2491
2492 /*
2493  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2494  *
2495  *      This routine locates and destroys all swap metadata associated with
2496  *      an object.
2497  *
2498  * NOTE: Decrement swb_count after the freeing operation (which
2499  *       might block) to prevent racing destruction of the swblock.
2500  *
2501  * The caller must hold the object.
2502  */
2503 static void
2504 swp_pager_meta_free_all(vm_object_t object)
2505 {
2506         struct swblock *swap;
2507         int i;
2508
2509         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2510
2511         while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2512                 swp_pager_remove(object, swap);
2513                 for (i = 0; i < SWAP_META_PAGES; ++i) {
2514                         swblk_t v = swap->swb_pages[i];
2515                         if (v != SWAPBLK_NONE) {
2516                                 /* can block */
2517                                 swp_pager_freeswapspace(object, v, 1);
2518                                 --swap->swb_count;
2519                                 --mycpu->gd_vmtotal.t_vm;
2520                         }
2521                 }
2522                 if (swap->swb_count != 0)
2523                         panic("swap_pager_meta_free_all: swb_count != 0");
2524                 zfree(swap_zone, swap);
2525                 --object->swblock_count;
2526                 lwkt_yield();
2527         }
2528         KKASSERT(object->swblock_count == 0);
2529 }
2530
2531 /*
2532  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
2533  *
2534  *      This routine is capable of looking up, popping, or freeing
2535  *      swapblk assignments in the swap meta data or in the vm_page_t.
2536  *      The routine typically returns the swapblk being looked-up, or popped,
2537  *      or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
2538  *      was invalid.  This routine will automatically free any invalid 
2539  *      meta-data swapblks.
2540  *
2541  *      It is not possible to store invalid swapblks in the swap meta data
2542  *      (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
2543  *
2544  *      When acting on a busy resident page and paging is in progress, we 
2545  *      have to wait until paging is complete but otherwise can act on the 
2546  *      busy page.
2547  *
2548  *      SWM_FREE        remove and free swap block from metadata
2549  *      SWM_POP         remove from meta data but do not free.. pop it out
2550  *
2551  * The caller must hold the object.
2552  */
2553 static swblk_t
2554 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2555 {
2556         struct swblock *swap;
2557         swblk_t r1;
2558
2559         if (object->swblock_count == 0)
2560                 return(SWAPBLK_NONE);
2561
2562         r1 = SWAPBLK_NONE;
2563         swap = swp_pager_lookup(object, index);
2564
2565         if (swap != NULL) {
2566                 index &= SWAP_META_MASK;
2567                 r1 = swap->swb_pages[index];
2568
2569                 if (r1 != SWAPBLK_NONE) {
2570                         if (flags & (SWM_FREE|SWM_POP)) {
2571                                 swap->swb_pages[index] = SWAPBLK_NONE;
2572                                 --mycpu->gd_vmtotal.t_vm;
2573                                 if (--swap->swb_count == 0) {
2574                                         swp_pager_remove(object, swap);
2575                                         zfree(swap_zone, swap);
2576                                         --object->swblock_count;
2577                                 }
2578                         } 
2579                         /* swap ptr may be invalid */
2580                         if (flags & SWM_FREE) {
2581                                 swp_pager_freeswapspace(object, r1, 1);
2582                                 r1 = SWAPBLK_NONE;
2583                         }
2584                 }
2585                 /* swap ptr may be invalid */
2586         }
2587         return(r1);
2588 }