gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1998,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* Copyright (c) 1994 John S. Dyson
	35	* Copyright (c) 1990 University of Utah.
	36	* Copyright (c) 1991, 1993
	37	* The Regents of the University of California. All rights reserved.
	38	*
	39	* This code is derived from software contributed to Berkeley by
	40	* the Systems Programming Group of the University of Utah Computer
	41	* Science Department.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* New Swap System
	72	* Matthew Dillon
	73	*
	74	* Radix Bitmap 'blists'.
	75	*
	76	* - The new swapper uses the new radix bitmap code. This should scale
	77	* to arbitrarily small or arbitrarily large swap spaces and an almost
	78	* arbitrary degree of fragmentation.
	79	*
	80	* Features:
	81	*
	82	* - on the fly reallocation of swap during putpages. The new system
	83	* does not try to keep previously allocated swap blocks for dirty
	84	* pages.
	85	*
	86	* - on the fly deallocation of swap
	87	*
	88	* - No more garbage collection required. Unnecessarily allocated swap
	89	* blocks only exist for dirty vm_page_t's now and these are already
	90	* cycled (in a high-load system) by the pager. We also do on-the-fly
	91	* removal of invalidated swap blocks when a page is destroyed
	92	* or renamed.
	93	*
	94	* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
	95	*
	96	* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
	97	*
	98	* $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
	99	* $DragonFly: src/sys/vm/swap_pager.c,v 1.32 2008/07/01 02:02:56 dillon Exp $
	100	*/
	101
	102	#include <sys/param.h>
	103	#include <sys/systm.h>
	104	#include <sys/conf.h>
	105	#include <sys/kernel.h>
	106	#include <sys/proc.h>
	107	#include <sys/buf.h>
	108	#include <sys/vnode.h>
	109	#include <sys/malloc.h>
	110	#include <sys/vmmeter.h>
	111	#include <sys/sysctl.h>
	112	#include <sys/blist.h>
	113	#include <sys/lock.h>
	114	#include <sys/thread2.h>
	115
	116	#ifndef MAX_PAGEOUT_CLUSTER
	117	#define MAX_PAGEOUT_CLUSTER 16
	118	#endif
	119
	120	#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
	121
	122	#include "opt_swap.h"
	123	#include <vm/vm.h>
	124	#include <vm/vm_object.h>
	125	#include <vm/vm_page.h>
	126	#include <vm/vm_pager.h>
	127	#include <vm/vm_pageout.h>
	128	#include <vm/swap_pager.h>
	129	#include <vm/vm_extern.h>
	130	#include <vm/vm_zone.h>
	131	#include <vm/vnode_pager.h>
	132
	133	#include <sys/buf2.h>
	134	#include <vm/vm_page2.h>
	135
	136	#define SWM_FREE 0x02 /* free, period */
	137	#define SWM_POP 0x04 /* pop out */
	138
	139	#define SWBIO_READ 0x01
	140	#define SWBIO_WRITE 0x02
	141	#define SWBIO_SYNC 0x04
	142
	143	struct swfreeinfo {
	144	vm_object_t object;
	145	vm_pindex_t basei;
	146	vm_pindex_t begi;
	147	vm_pindex_t endi; /* inclusive */
	148	};
	149
	150	/*
	151	* vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks
	152	* in the old system.
	153	*/
	154
	155	int swap_pager_full; /* swap space exhaustion (task killing) */
	156	int vm_swap_cache_use;
	157	int vm_swap_anon_use;
	158
	159	static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
	160	static int nsw_rcount; /* free read buffers */
	161	static int nsw_wcount_sync; /* limit write buffers / synchronous */
	162	static int nsw_wcount_async; /* limit write buffers / asynchronous */
	163	static int nsw_wcount_async_max;/* assigned maximum */
	164	static int nsw_cluster_max; /* maximum VOP I/O allowed */
	165
	166	struct blist *swapblist;
	167	static int swap_async_max = 4; /* maximum in-progress async I/O's */
	168	static int swap_burst_read = 0; /* allow burst reading */
	169
	170	extern struct vnode swapdev_vp; / from vm_swap.c */
	171
	172	SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
	173	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
	174	SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
	175	CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
	176
	177	SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
	178	CTLFLAG_RD, &vm_swap_cache_use, 0, "");
	179	SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
	180	CTLFLAG_RD, &vm_swap_anon_use, 0, "");
	181
	182	vm_zone_t swap_zone;
	183
	184	/*
	185	* Red-Black tree for swblock entries
	186	*/
	187	RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
	188	vm_pindex_t, swb_index);
	189
	190	int
	191	rb_swblock_compare(struct swblock swb1, struct swblock swb2)
	192	{
	193	if (swb1->swb_index < swb2->swb_index)
	194	return(-1);
	195	if (swb1->swb_index > swb2->swb_index)
	196	return(1);
	197	return(0);
	198	}
	199
	200	static
	201	int
	202	rb_swblock_scancmp(struct swblock swb, void data)
	203	{
	204	struct swfreeinfo *info = data;
	205
	206	if (swb->swb_index < info->basei)
	207	return(-1);
	208	if (swb->swb_index > info->endi)
	209	return(1);
	210	return(0);
	211	}
	212
	213	/*
	214	* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
	215	* calls hooked from other parts of the VM system and do not appear here.
	216	* (see vm/swap_pager.h).
	217	*/
	218
	219	static vm_object_t
	220	swap_pager_alloc (void *handle, off_t size,
	221	vm_prot_t prot, off_t offset);
	222	static void swap_pager_dealloc (vm_object_t object);
	223	static int swap_pager_getpage (vm_object_t, vm_page_t *, int);
	224	static void swap_chain_iodone(struct bio *biox);
	225
	226	struct pagerops swappagerops = {
	227	swap_pager_alloc, /* allocate an OBJT_SWAP object */
	228	swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
	229	swap_pager_getpage, /* pagein */
	230	swap_pager_putpages, /* pageout */
	231	swap_pager_haspage /* get backing store status for page */
	232	};
	233
	234	/*
	235	* dmmax is in page-sized chunks with the new swap system. It was
	236	* dev-bsized chunks in the old. dmmax is always a power of 2.
	237	*
	238	* swap_() routines are externally accessible. swp_() routines are
	239	* internal.
	240	*/
	241
	242	int dmmax;
	243	static int dmmax_mask;
	244	int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
	245	int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
	246
	247	static __inline void swp_sizecheck (void);
	248	static void swp_pager_async_iodone (struct bio *bio);
	249
	250	/*
	251	* Swap bitmap functions
	252	*/
	253
	254	static __inline void swp_pager_freeswapspace (vm_object_t object, daddr_t blk, int npages);
	255	static __inline daddr_t swp_pager_getswapspace (vm_object_t object, int npages);
	256
	257	/*
	258	* Metadata functions
	259	*/
	260
	261	static void swp_pager_meta_convert (vm_object_t);
	262	static void swp_pager_meta_build (vm_object_t, vm_pindex_t, daddr_t);
	263	static void swp_pager_meta_free (vm_object_t, vm_pindex_t, vm_pindex_t);
	264	static void swp_pager_meta_free_all (vm_object_t);
	265	static daddr_t swp_pager_meta_ctl (vm_object_t, vm_pindex_t, int);
	266
	267	/*
	268	* SWP_SIZECHECK() - update swap_pager_full indication
	269	*
	270	* update the swap_pager_almost_full indication and warn when we are
	271	* about to run out of swap space, using lowat/hiwat hysteresis.
	272	*
	273	* Clear swap_pager_full ( task killing ) indication when lowat is met.
	274	*
	275	* No restrictions on call
	276	* This routine may not block.
	277	* This routine must be called at splvm()
	278	*/
	279
	280	static __inline void
	281	swp_sizecheck(void)
	282	{
	283	if (vm_swap_size < nswap_lowat) {
	284	if (swap_pager_almost_full == 0) {
	285	kprintf("swap_pager: out of swap space\n");
	286	swap_pager_almost_full = 1;
	287	}
	288	} else {
	289	swap_pager_full = 0;
	290	if (vm_swap_size > nswap_hiwat)
	291	swap_pager_almost_full = 0;
	292	}
	293	}
	294
	295	/*
	296	* SWAP_PAGER_INIT() - initialize the swap pager!
	297	*
	298	* Expected to be started from system init. NOTE: This code is run
	299	* before much else so be careful what you depend on. Most of the VM
	300	* system has yet to be initialized at this point.
	301	*/
	302	static void
	303	swap_pager_init(void *arg __unused)
	304	{
	305	/*
	306	* Device Stripe, in PAGE_SIZE'd blocks
	307	*/
	308	dmmax = SWB_NPAGES * 2;
	309	dmmax_mask = ~(dmmax - 1);
	310	}
	311	SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL)
	312
	313	/*
	314	* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
	315	*
	316	* Expected to be started from pageout process once, prior to entering
	317	* its main loop.
	318	*/
	319
	320	void
	321	swap_pager_swap_init(void)
	322	{
	323	int n, n2;
	324
	325	/*
	326	* Number of in-transit swap bp operations. Don't
	327	* exhaust the pbufs completely. Make sure we
	328	* initialize workable values (0 will work for hysteresis
	329	* but it isn't very efficient).
	330	*
	331	* The nsw_cluster_max is constrained by the number of pages an XIO
	332	* holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
	333	* MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
	334	* constrained by the swap device interleave stripe size.
	335	*
	336	* Currently we hardwire nsw_wcount_async to 4. This limit is
	337	* designed to prevent other I/O from having high latencies due to
	338	* our pageout I/O. The value 4 works well for one or two active swap
	339	* devices but is probably a little low if you have more. Even so,
	340	* a higher value would probably generate only a limited improvement
	341	* with three or four active swap devices since the system does not
	342	* typically have to pageout at extreme bandwidths. We will want
	343	* at least 2 per swap devices, and 4 is a pretty good value if you
	344	* have one NFS swap device due to the command/ack latency over NFS.
	345	* So it all works out pretty well.
	346	*/
	347
	348	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
	349
	350	nsw_rcount = (nswbuf + 1) / 2;
	351	nsw_wcount_sync = (nswbuf + 3) / 4;
	352	nsw_wcount_async = 4;
	353	nsw_wcount_async_max = nsw_wcount_async;
	354
	355	/*
	356	* The zone is dynamically allocated so generally size it to
	357	* maxswzone (32MB to 512MB of KVM). Set a minimum size based
	358	* on physical memory of around 8x (each swblock can hold 16 pages).
	359	*
	360	* With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
	361	* has increased dramatically.
	362	*/
	363	n = vmstats.v_page_count / 2;
	364	if (maxswzone && n < maxswzone / sizeof(struct swblock))
	365	n = maxswzone / sizeof(struct swblock);
	366	n2 = n;
	367
	368	do {
	369	swap_zone = zinit(
	370	"SWAPMETA",
	371	sizeof(struct swblock),
	372	n,
	373	ZONE_INTERRUPT,
	374	1);
	375	if (swap_zone != NULL)
	376	break;
	377	/*
	378	* if the allocation failed, try a zone two thirds the
	379	* size of the previous attempt.
	380	*/
	381	n -= ((n + 2) / 3);
	382	} while (n > 0);
	383
	384	if (swap_zone == NULL)
	385	panic("swap_pager_swap_init: swap_zone == NULL");
	386	if (n2 != n)
	387	kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
	388	}
	389
	390	/*
	391	* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
	392	* its metadata structures.
	393	*
	394	* This routine is called from the mmap and fork code to create a new
	395	* OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
	396	* and then converting it with swp_pager_meta_convert().
	397	*
	398	* This routine may block in vm_object_allocate() and create a named
	399	* object lookup race, so we must interlock. We must also run at
	400	* splvm() for the object lookup to handle races with interrupts, but
	401	* we do not have to maintain splvm() in between the lookup and the
	402	* add because (I believe) it is not possible to attempt to create
	403	* a new swap object w/handle when a default object with that handle
	404	* already exists.
	405	*/
	406
	407	static vm_object_t
	408	swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
	409	{
	410	vm_object_t object;
	411
	412	KKASSERT(handle == NULL);
	413	#if 0
	414	if (handle) {
	415	/*
	416	* Reference existing named region or allocate new one. There
	417	* should not be a race here against swp_pager_meta_build()
	418	* as called from vm_page_remove() in regards to the lookup
	419	* of the handle.
	420	*/
	421	while (sw_alloc_interlock) {
	422	sw_alloc_interlock = -1;
	423	tsleep(&sw_alloc_interlock, 0, "swpalc", 0);
	424	}
	425	sw_alloc_interlock = 1;
	426
	427	object = vm_pager_object_lookup(NOBJLIST(handle), handle);
	428
	429	if (object != NULL) {
	430	vm_object_reference(object);
	431	} else {
	432	object = vm_object_allocate(OBJT_DEFAULT,
	433	OFF_TO_IDX(offset + PAGE_MASK + size));
	434	object->handle = handle;
	435	swp_pager_meta_convert(object);
	436	}
	437
	438	if (sw_alloc_interlock < 0)
	439	wakeup(&sw_alloc_interlock);
	440	sw_alloc_interlock = 0;
	441	} else { ... }
	442	#endif
	443	object = vm_object_allocate(OBJT_DEFAULT,
	444	OFF_TO_IDX(offset + PAGE_MASK + size));
	445	swp_pager_meta_convert(object);
	446
	447	return (object);
	448	}
	449
	450	/*
	451	* SWAP_PAGER_DEALLOC() - remove swap metadata from object
	452	*
	453	* The swap backing for the object is destroyed. The code is
	454	* designed such that we can reinstantiate it later, but this
	455	* routine is typically called only when the entire object is
	456	* about to be destroyed.
	457	*
	458	* This routine may block, but no longer does.
	459	*
	460	* The object must be locked or unreferenceable.
	461	*/
	462
	463	static void
	464	swap_pager_dealloc(vm_object_t object)
	465	{
	466	vm_object_pip_wait(object, "swpdea");
	467
	468	/*
	469	* Free all remaining metadata. We only bother to free it from
	470	* the swap meta data. We do not attempt to free swapblk's still
	471	* associated with vm_page_t's for this object. We do not care
	472	* if paging is still in progress on some objects.
	473	*/
	474	crit_enter();
	475	swp_pager_meta_free_all(object);
	476	crit_exit();
	477	}
	478
	479	/************************************************************************
	480	* SWAP PAGER BITMAP ROUTINES *
	481	************************************************************************/
	482
	483	/*
	484	* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
	485	*
	486	* Allocate swap for the requested number of pages. The starting
	487	* swap block number (a page index) is returned or SWAPBLK_NONE
	488	* if the allocation failed.
	489	*
	490	* Also has the side effect of advising that somebody made a mistake
	491	* when they configured swap and didn't configure enough.
	492	*
	493	* Must be called at splvm() to avoid races with bitmap frees from
	494	* vm_page_remove() aka swap_pager_page_removed().
	495	*
	496	* This routine may not block
	497	* This routine must be called at splvm().
	498	*/
	499	static __inline daddr_t
	500	swp_pager_getswapspace(vm_object_t object, int npages)
	501	{
	502	daddr_t blk;
	503
	504	if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
	505	if (swap_pager_full != 2) {
	506	kprintf("swap_pager_getswapspace: failed\n");
	507	swap_pager_full = 2;
	508	swap_pager_almost_full = 1;
	509	}
	510	} else {
	511	vm_swap_size -= npages;
	512	if (object->type == OBJT_SWAP)
	513	vm_swap_anon_use += npages;
	514	else
	515	vm_swap_cache_use += npages;
	516	swp_sizecheck();
	517	}
	518	return(blk);
	519	}
	520
	521	/*
	522	* SWP_PAGER_FREESWAPSPACE() - free raw swap space
	523	*
	524	* This routine returns the specified swap blocks back to the bitmap.
	525	*
	526	* Note: This routine may not block (it could in the old swap code),
	527	* and through the use of the new blist routines it does not block.
	528	*
	529	* We must be called at splvm() to avoid races with bitmap frees from
	530	* vm_page_remove() aka swap_pager_page_removed().
	531	*
	532	* This routine may not block
	533	* This routine must be called at splvm().
	534	*/
	535
	536	static __inline void
	537	swp_pager_freeswapspace(vm_object_t object, daddr_t blk, int npages)
	538	{
	539	blist_free(swapblist, blk, npages);
	540	vm_swap_size += npages;
	541	if (object->type == OBJT_SWAP)
	542	vm_swap_anon_use -= npages;
	543	else
	544	vm_swap_cache_use -= npages;
	545	swp_sizecheck();
	546	}
	547
	548	/*
	549	* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
	550	* range within an object.
	551	*
	552	* This is a globally accessible routine.
	553	*
	554	* This routine removes swapblk assignments from swap metadata.
	555	*
	556	* The external callers of this routine typically have already destroyed
	557	* or renamed vm_page_t's associated with this range in the object so
	558	* we should be ok.
	559	*
	560	* This routine may be called at any spl. We up our spl to splvm
	561	* temporarily in order to perform the metadata removal.
	562	*/
	563	void
	564	swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
	565	{
	566	crit_enter();
	567	swp_pager_meta_free(object, start, size);
	568	crit_exit();
	569	}
	570
	571	void
	572	swap_pager_freespace_all(vm_object_t object)
	573	{
	574	crit_enter();
	575	swp_pager_meta_free_all(object);
	576	crit_exit();
	577	}
	578
	579	/*
	580	* Called by vm_page_alloc() when a new VM page is inserted
	581	* into a VM object. Checks whether swap has been assigned to
	582	* the page and sets PG_SWAPPED as necessary.
	583	*/
	584	void
	585	swap_pager_page_inserted(vm_page_t m)
	586	{
	587	if (m->object->swblock_count) {
	588	crit_enter();
	589	if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
	590	vm_page_flag_set(m, PG_SWAPPED);
	591	crit_exit();
	592	}
	593	}
	594
	595	/*
	596	* SWAP_PAGER_RESERVE() - reserve swap blocks in object
	597	*
	598	* Assigns swap blocks to the specified range within the object. The
	599	* swap blocks are not zerod. Any previous swap assignment is destroyed.
	600	*
	601	* Returns 0 on success, -1 on failure.
	602	*/
	603	int
	604	swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
	605	{
	606	int n = 0;
	607	daddr_t blk = SWAPBLK_NONE;
	608	vm_pindex_t beg = start; /* save start index */
	609
	610	crit_enter();
	611	while (size) {
	612	if (n == 0) {
	613	n = BLIST_MAX_ALLOC;
	614	while ((blk = swp_pager_getswapspace(object, n)) ==
	615	SWAPBLK_NONE)
	616	{
	617	n >>= 1;
	618	if (n == 0) {
	619	swp_pager_meta_free(object, beg,
	620	start - beg);
	621	crit_exit();
	622	return(-1);
	623	}
	624	}
	625	}
	626	swp_pager_meta_build(object, start, blk);
	627	--size;
	628	++start;
	629	++blk;
	630	--n;
	631	}
	632	swp_pager_meta_free(object, start, n);
	633	crit_exit();
	634	return(0);
	635	}
	636
	637	/*
	638	* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
	639	* and destroy the source.
	640	*
	641	* Copy any valid swapblks from the source to the destination. In
	642	* cases where both the source and destination have a valid swapblk,
	643	* we keep the destination's.
	644	*
	645	* This routine is allowed to block. It may block allocating metadata
	646	* indirectly through swp_pager_meta_build() or if paging is still in
	647	* progress on the source.
	648	*
	649	* This routine can be called at any spl
	650	*
	651	* XXX vm_page_collapse() kinda expects us not to block because we
	652	* supposedly do not need to allocate memory, but for the moment we
	653	* may have to get a little memory from the zone allocator, but
	654	* it is taken from the interrupt memory. We should be ok.
	655	*
	656	* The source object contains no vm_page_t's (which is just as well)
	657	*
	658	* The source object is of type OBJT_SWAP.
	659	*
	660	* The source and destination objects must be locked or
	661	* inaccessible (XXX are they ?)
	662	*/
	663
	664	void
	665	swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
	666	vm_pindex_t base_index, int destroysource)
	667	{
	668	vm_pindex_t i;
	669
	670	crit_enter();
	671
	672	/*
	673	* transfer source to destination.
	674	*/
	675	for (i = 0; i < dstobject->size; ++i) {
	676	daddr_t dstaddr;
	677
	678	/*
	679	* Locate (without changing) the swapblk on the destination,
	680	* unless it is invalid in which case free it silently, or
	681	* if the destination is a resident page, in which case the
	682	* source is thrown away.
	683	*/
	684	dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
	685
	686	if (dstaddr == SWAPBLK_NONE) {
	687	/*
	688	* Destination has no swapblk and is not resident,
	689	* copy source.
	690	*/
	691	daddr_t srcaddr;
	692
	693	srcaddr = swp_pager_meta_ctl(srcobject,
	694	base_index + i, SWM_POP);
	695
	696	if (srcaddr != SWAPBLK_NONE)
	697	swp_pager_meta_build(dstobject, i, srcaddr);
	698	} else {
	699	/*
	700	* Destination has valid swapblk or it is represented
	701	* by a resident page. We destroy the sourceblock.
	702	*/
	703	swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
	704	}
	705	}
	706
	707	/*
	708	* Free left over swap blocks in source.
	709	*
	710	* We have to revert the type to OBJT_DEFAULT so we do not accidently
	711	* double-remove the object from the swap queues.
	712	*/
	713	if (destroysource) {
	714	/*
	715	* Reverting the type is not necessary, the caller is going
	716	* to destroy srcobject directly, but I'm doing it here
	717	* for consistency since we've removed the object from its
	718	* queues.
	719	*/
	720	swp_pager_meta_free_all(srcobject);
	721	if (srcobject->type == OBJT_SWAP)
	722	srcobject->type = OBJT_DEFAULT;
	723	}
	724	crit_exit();
	725	}
	726
	727	/*
	728	* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
	729	* the requested page.
	730	*
	731	* We determine whether good backing store exists for the requested
	732	* page and return TRUE if it does, FALSE if it doesn't.
	733	*
	734	* If TRUE, we also try to determine how much valid, contiguous backing
	735	* store exists before and after the requested page within a reasonable
	736	* distance. We do not try to restrict it to the swap device stripe
	737	* (that is handled in getpages/putpages). It probably isn't worth
	738	* doing here.
	739	*/
	740
	741	boolean_t
	742	swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
	743	{
	744	daddr_t blk0;
	745
	746	/*
	747	* do we have good backing store at the requested index ?
	748	*/
	749
	750	crit_enter();
	751	blk0 = swp_pager_meta_ctl(object, pindex, 0);
	752
	753	if (blk0 == SWAPBLK_NONE) {
	754	crit_exit();
	755	return (FALSE);
	756	}
	757
	758	#if 0
	759	/*
	760	* find backwards-looking contiguous good backing store
	761	*/
	762	if (before != NULL) {
	763	int i;
	764
	765	for (i = 1; i < (SWB_NPAGES/2); ++i) {
	766	daddr_t blk;
	767
	768	if (i > pindex)
	769	break;
	770	blk = swp_pager_meta_ctl(object, pindex - i, 0);
	771	if (blk != blk0 - i)
	772	break;
	773	}
	774	*before = (i - 1);
	775	}
	776
	777	/*
	778	* find forward-looking contiguous good backing store
	779	*/
	780
	781	if (after != NULL) {
	782	int i;
	783
	784	for (i = 1; i < (SWB_NPAGES/2); ++i) {
	785	daddr_t blk;
	786
	787	blk = swp_pager_meta_ctl(object, pindex + i, 0);
	788	if (blk != blk0 + i)
	789	break;
	790	}
	791	*after = (i - 1);
	792	}
	793	#endif
	794	crit_exit();
	795	return (TRUE);
	796	}
	797
	798	/*
	799	* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
	800	*
	801	* This removes any associated swap backing store, whether valid or
	802	* not, from the page. This operates on any VM object, not just OBJT_SWAP
	803	* objects.
	804	*
	805	* This routine is typically called when a page is made dirty, at
	806	* which point any associated swap can be freed. MADV_FREE also
	807	* calls us in a special-case situation
	808	*
	809	* NOTE!!! If the page is clean and the swap was valid, the caller
	810	* should make the page dirty before calling this routine. This routine
	811	* does NOT change the m->dirty status of the page. Also: MADV_FREE
	812	* depends on it.
	813	*
	814	* This routine may not block
	815	* This routine must be called at splvm()
	816	*/
	817	void
	818	swap_pager_unswapped(vm_page_t m)
	819	{
	820	if (m->flags & PG_SWAPPED) {
	821	swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
	822	vm_page_flag_clear(m, PG_SWAPPED);
	823	}
	824	}
	825
	826	/*
	827	* SWAP_PAGER_STRATEGY() - read, write, free blocks
	828	*
	829	* This implements a VM OBJECT strategy function using swap backing store.
	830	* This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
	831	* types.
	832	*
	833	* This is intended to be a cacheless interface (i.e. caching occurs at
	834	* higher levels), and is also used as a swap-based SSD cache for vnode
	835	* and device objects.
	836	*
	837	* All I/O goes directly to and from the swap device.
	838	*
	839	* We currently attempt to run I/O synchronously or asynchronously as
	840	* the caller requests. This isn't perfect because we loose error
	841	* sequencing when we run multiple ops in parallel to satisfy a request.
	842	* But this is swap, so we let it all hang out.
	843	*/
	844	void
	845	swap_pager_strategy(vm_object_t object, struct bio *bio)
	846	{
	847	struct buf *bp = bio->bio_buf;
	848	struct bio *nbio;
	849	vm_pindex_t start;
	850	vm_pindex_t biox_blkno = 0;
	851	int count;
	852	char *data;
	853	struct bio *biox;
	854	struct buf *bufx;
	855	struct bio_track *track;
	856
	857	/*
	858	* tracking for swapdev vnode I/Os
	859	*/
	860	if (bp->b_cmd == BUF_CMD_READ)
	861	track = &swapdev_vp->v_track_read;
	862	else
	863	track = &swapdev_vp->v_track_write;
	864
	865	if (bp->b_bcount & PAGE_MASK) {
	866	bp->b_error = EINVAL;
	867	bp->b_flags \|= B_ERROR \| B_INVAL;
	868	biodone(bio);
	869	kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
	870	"not page bounded\n",
	871	bp, (long long)bio->bio_offset, (int)bp->b_bcount);
	872	return;
	873	}
	874
	875	/*
	876	* Clear error indication, initialize page index, count, data pointer.
	877	*/
	878	bp->b_error = 0;
	879	bp->b_flags &= ~B_ERROR;
	880	bp->b_resid = bp->b_bcount;
	881
	882	start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
	883	count = howmany(bp->b_bcount, PAGE_SIZE);
	884	data = bp->b_data;
	885
	886	/*
	887	* Deal with BUF_CMD_FREEBLKS
	888	*/
	889	if (bp->b_cmd == BUF_CMD_FREEBLKS) {
	890	/*
	891	* FREE PAGE(s) - destroy underlying swap that is no longer
	892	* needed.
	893	*/
	894	swp_pager_meta_free(object, start, count);
	895	bp->b_resid = 0;
	896	biodone(bio);
	897	return;
	898	}
	899
	900	/*
	901	* We need to be able to create a new cluster of I/O's. We cannot
	902	* use the caller fields of the passed bio so push a new one.
	903	*
	904	* Because nbio is just a placeholder for the cluster links,
	905	* we can biodone() the original bio instead of nbio to make
	906	* things a bit more efficient.
	907	*/
	908	nbio = push_bio(bio);
	909	nbio->bio_offset = bio->bio_offset;
	910	nbio->bio_caller_info1.cluster_head = NULL;
	911	nbio->bio_caller_info2.cluster_tail = NULL;
	912
	913	biox = NULL;
	914	bufx = NULL;
	915
	916	/*
	917	* Execute read or write
	918	*/
	919	while (count > 0) {
	920	daddr_t blk;
	921
	922	/*
	923	* Obtain block. If block not found and writing, allocate a
	924	* new block and build it into the object.
	925	*/
	926	blk = swp_pager_meta_ctl(object, start, 0);
	927	if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
	928	blk = swp_pager_getswapspace(object, 1);
	929	if (blk == SWAPBLK_NONE) {
	930	bp->b_error = ENOMEM;
	931	bp->b_flags \|= B_ERROR;
	932	break;
	933	}
	934	swp_pager_meta_build(object, start, blk);
	935	}
	936
	937	/*
	938	* Do we have to flush our current collection? Yes if:
	939	*
	940	* - no swap block at this index
	941	* - swap block is not contiguous
	942	* - we cross a physical disk boundry in the
	943	* stripe.
	944	*/
	945	if (
	946	biox && (biox_blkno + btoc(bufx->b_bcount) != blk \|\|
	947	((biox_blkno ^ blk) & dmmax_mask)
	948	)
	949	) {
	950	if (bp->b_cmd == BUF_CMD_READ) {
	951	++mycpu->gd_cnt.v_swapin;
	952	mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
	953	} else {
	954	++mycpu->gd_cnt.v_swapout;
	955	mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
	956	bufx->b_dirtyend = bufx->b_bcount;
	957	}
	958
	959	/*
	960	* Finished with this buf.
	961	*/
	962	KKASSERT(bufx->b_bcount != 0);
	963	if (bufx->b_cmd != BUF_CMD_READ)
	964	bufx->b_dirtyend = bufx->b_bcount;
	965	biox = NULL;
	966	bufx = NULL;
	967	}
	968
	969	/*
	970	* Add new swapblk to biox, instantiating biox if necessary.
	971	* Zero-fill reads are able to take a shortcut.
	972	*/
	973	if (blk == SWAPBLK_NONE) {
	974	/*
	975	* We can only get here if we are reading. Since
	976	* we are at splvm() we can safely modify b_resid,
	977	* even if chain ops are in progress.
	978	*/
	979	bzero(data, PAGE_SIZE);
	980	bp->b_resid -= PAGE_SIZE;
	981	} else {
	982	if (biox == NULL) {
	983	/* XXX chain count > 4, wait to <= 4 */
	984
	985	bufx = getpbuf(NULL);
	986	biox = &bufx->b_bio1;
	987	cluster_append(nbio, bufx);
	988	bufx->b_flags \|= (bufx->b_flags & B_ORDERED);
	989	bufx->b_cmd = bp->b_cmd;
	990	biox->bio_done = swap_chain_iodone;
	991	biox->bio_offset = (off_t)blk << PAGE_SHIFT;
	992	biox->bio_caller_info1.cluster_parent = nbio;
	993	biox_blkno = blk;
	994	bufx->b_bcount = 0;
	995	bufx->b_data = data;
	996	}
	997	bufx->b_bcount += PAGE_SIZE;
	998	}
	999	--count;
	1000	++start;
	1001	data += PAGE_SIZE;
	1002	}
	1003
	1004	/*
	1005	* Flush out last buffer
	1006	*/
	1007	if (biox) {
	1008	if (bufx->b_cmd == BUF_CMD_READ) {
	1009	++mycpu->gd_cnt.v_swapin;
	1010	mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
	1011	} else {
	1012	++mycpu->gd_cnt.v_swapout;
	1013	mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
	1014	bufx->b_dirtyend = bufx->b_bcount;
	1015	}
	1016	KKASSERT(bufx->b_bcount);
	1017	if (bufx->b_cmd != BUF_CMD_READ)
	1018	bufx->b_dirtyend = bufx->b_bcount;
	1019	/* biox, bufx = NULL */
	1020	}
	1021
	1022	/*
	1023	* Now initiate all the I/O. Be careful looping on our chain as
	1024	* I/O's may complete while we are still initiating them.
	1025	*/
	1026	nbio->bio_caller_info2.cluster_tail = NULL;
	1027	bufx = nbio->bio_caller_info1.cluster_head;
	1028
	1029	while (bufx) {
	1030	biox = &bufx->b_bio1;
	1031	BUF_KERNPROC(bufx);
	1032	bufx = bufx->b_cluster_next;
	1033	vn_strategy(swapdev_vp, biox);
	1034	}
	1035
	1036	/*
	1037	* Completion of the cluster will also call biodone_chain(nbio).
	1038	* We never call biodone(nbio) so we don't have to worry about
	1039	* setting up a bio_done callback. It's handled in the sub-IO.
	1040	*/
	1041	/**/
	1042	}
	1043
	1044	static void
	1045	swap_chain_iodone(struct bio *biox)
	1046	{
	1047	struct buf **nextp;
	1048	struct buf bufx; / chained sub-buffer */
	1049	struct bio nbio; / parent nbio with chain glue */
	1050	struct buf bp; / original bp associated with nbio */
	1051	int chain_empty;
	1052
	1053	bufx = biox->bio_buf;
	1054	nbio = biox->bio_caller_info1.cluster_parent;
	1055	bp = nbio->bio_buf;
	1056
	1057	/*
	1058	* Update the original buffer
	1059	*/
	1060	KKASSERT(bp != NULL);
	1061	if (bufx->b_flags & B_ERROR) {
	1062	atomic_set_int(&bufx->b_flags, B_ERROR);
	1063	bp->b_error = bufx->b_error;
	1064	} else if (bufx->b_resid != 0) {
	1065	atomic_set_int(&bufx->b_flags, B_ERROR);
	1066	bp->b_error = EINVAL;
	1067	} else {
	1068	atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
	1069	}
	1070
	1071	/*
	1072	* Remove us from the chain.
	1073	*/
	1074	spin_lock_wr(&bp->b_lock.lk_spinlock);
	1075	nextp = &nbio->bio_caller_info1.cluster_head;
	1076	while (*nextp != bufx) {
	1077	KKASSERT(*nextp != NULL);
	1078	nextp = &(*nextp)->b_cluster_next;
	1079	}
	1080	*nextp = bufx->b_cluster_next;
	1081	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
	1082	spin_unlock_wr(&bp->b_lock.lk_spinlock);
	1083
	1084	/*
	1085	* Clean up bufx. If the chain is now empty we finish out
	1086	* the parent. Note that we may be racing other completions
	1087	* so we must use the chain_empty status from above.
	1088	*/
	1089	if (chain_empty) {
	1090	if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
	1091	atomic_set_int(&bp->b_flags, B_ERROR);
	1092	bp->b_error = EINVAL;
	1093	}
	1094	biodone_chain(nbio);
	1095	}
	1096	relpbuf(bufx, NULL);
	1097	}
	1098
	1099	/*
	1100	* SWAP_PAGER_GETPAGES() - bring page in from swap
	1101	*
	1102	* The requested page may have to be brought in from swap. Calculate the
	1103	* swap block and bring in additional pages if possible. All pages must
	1104	* have contiguous swap block assignments and reside in the same object.
	1105	*
	1106	* The caller has a single vm_object_pip_add() reference prior to
	1107	* calling us and we should return with the same.
	1108	*
	1109	* The caller has BUSY'd the page. We should return with (*mpp) left busy,
	1110	* and any additinal pages unbusied.
	1111	*
	1112	* If the caller encounters a PG_RAM page it will pass it to us even though
	1113	* it may be valid and dirty. We cannot overwrite the page in this case!
	1114	* The case is used to allow us to issue pure read-aheads.
	1115	*
	1116	* NOTE! XXX This code does not entirely pipeline yet due to the fact that
	1117	* the PG_RAM page is validated at the same time as mreq. What we
	1118	* really need to do is issue a separate read-ahead pbuf.
	1119	*/
	1120	static int
	1121	swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
	1122	{
	1123	struct buf *bp;
	1124	struct bio *bio;
	1125	vm_page_t mreq;
	1126	vm_page_t m;
	1127	vm_offset_t kva;
	1128	daddr_t blk;
	1129	int i;
	1130	int j;
	1131	int raonly;
	1132	vm_page_t marray[XIO_INTERNAL_PAGES];
	1133
	1134	mreq = *mpp;
	1135
	1136	if (mreq->object != object) {
	1137	panic("swap_pager_getpages: object mismatch %p/%p",
	1138	object,
	1139	mreq->object
	1140	);
	1141	}
	1142
	1143	/*
	1144	* We don't want to overwrite a fully valid page as it might be
	1145	* dirty. This case can occur when e.g. vm_fault hits a perfectly
	1146	* valid page with PG_RAM set.
	1147	*
	1148	* In this case we see if the next page is a suitable page-in
	1149	* candidate and if it is we issue read-ahead. PG_RAM will be
	1150	* set on the last page of the read-ahead to continue the pipeline.
	1151	*/
	1152	if (mreq->valid == VM_PAGE_BITS_ALL) {
	1153	if (swap_burst_read == 0 \|\| mreq->pindex + 1 >= object->size)
	1154	return(VM_PAGER_OK);
	1155	crit_enter();
	1156	blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
	1157	if (blk == SWAPBLK_NONE) {
	1158	crit_exit();
	1159	return(VM_PAGER_OK);
	1160	}
	1161	m = vm_page_lookup(object, mreq->pindex + 1);
	1162	if (m == NULL) {
	1163	m = vm_page_alloc(object, mreq->pindex + 1,
	1164	VM_ALLOC_QUICK);
	1165	if (m == NULL) {
	1166	crit_exit();
	1167	return(VM_PAGER_OK);
	1168	}
	1169	} else {
	1170	if ((m->flags & PG_BUSY) \|\| m->busy \|\| m->valid) {
	1171	crit_exit();
	1172	return(VM_PAGER_OK);
	1173	}
	1174	vm_page_unqueue_nowakeup(m);
	1175	vm_page_busy(m);
	1176	}
	1177	mreq = m;
	1178	raonly = 1;
	1179	crit_exit();
	1180	} else {
	1181	raonly = 0;
	1182	}
	1183
	1184	/*
	1185	* Try to block-read contiguous pages from swap if sequential,
	1186	* otherwise just read one page. Contiguous pages from swap must
	1187	* reside within a single device stripe because the I/O cannot be
	1188	* broken up across multiple stripes.
	1189	*
	1190	* Note that blk and iblk can be SWAPBLK_NONE but the loop is
	1191	* set up such that the case(s) are handled implicitly.
	1192	*/
	1193	crit_enter();
	1194	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
	1195	marray[0] = mreq;
	1196
	1197	for (i = 1; swap_burst_read &&
	1198	i < XIO_INTERNAL_PAGES &&
	1199	mreq->pindex + i < object->size; ++i) {
	1200	daddr_t iblk;
	1201
	1202	iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
	1203	if (iblk != blk + i)
	1204	break;
	1205	if ((blk ^ iblk) & dmmax_mask)
	1206	break;
	1207	m = vm_page_lookup(object, mreq->pindex + i);
	1208	if (m == NULL) {
	1209	m = vm_page_alloc(object, mreq->pindex + i,
	1210	VM_ALLOC_QUICK);
	1211	if (m == NULL)
	1212	break;
	1213	} else {
	1214	if ((m->flags & PG_BUSY) \|\| m->busy \|\| m->valid)
	1215	break;
	1216	vm_page_unqueue_nowakeup(m);
	1217	vm_page_busy(m);
	1218	}
	1219	marray[i] = m;
	1220	}
	1221	if (i > 1)
	1222	vm_page_flag_set(marray[i - 1], PG_RAM);
	1223
	1224	crit_exit();
	1225
	1226	/*
	1227	* If mreq is the requested page and we have nothing to do return
	1228	* VM_PAGER_FAIL. If raonly is set mreq is just another read-ahead
	1229	* page and must be cleaned up.
	1230	*/
	1231	if (blk == SWAPBLK_NONE) {
	1232	KKASSERT(i == 1);
	1233	if (raonly) {
	1234	vnode_pager_freepage(mreq);
	1235	return(VM_PAGER_OK);
	1236	} else {
	1237	return(VM_PAGER_FAIL);
	1238	}
	1239	}
	1240
	1241	/*
	1242	* map our page(s) into kva for input
	1243	*/
	1244	bp = getpbuf(&nsw_rcount);
	1245	bio = &bp->b_bio1;
	1246	kva = (vm_offset_t) bp->b_kvabase;
	1247	bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
	1248	pmap_qenter(kva, bp->b_xio.xio_pages, i);
	1249
	1250	bp->b_data = (caddr_t)kva;
	1251	bp->b_bcount = PAGE_SIZE * i;
	1252	bp->b_xio.xio_npages = i;
	1253	bio->bio_done = swp_pager_async_iodone;
	1254	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
	1255	bio->bio_caller_info1.index = SWBIO_READ;
	1256
	1257	/*
	1258	* Set index. If raonly set the index beyond the array so all
	1259	* the pages are treated the same, otherwise the original mreq is
	1260	* at index 0.
	1261	*/
	1262	if (raonly)
	1263	bio->bio_driver_info = (void *)(intptr_t)i;
	1264	else
	1265	bio->bio_driver_info = (void *)(intptr_t)0;
	1266
	1267	for (j = 0; j < i; ++j)
	1268	vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG);
	1269
	1270	mycpu->gd_cnt.v_swapin++;
	1271	mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
	1272
	1273	/*
	1274	* We still hold the lock on mreq, and our automatic completion routine
	1275	* does not remove it.
	1276	*/
	1277	vm_object_pip_add(object, bp->b_xio.xio_npages);
	1278
	1279	/*
	1280	* perform the I/O. NOTE!!! bp cannot be considered valid after
	1281	* this point because we automatically release it on completion.
	1282	* Instead, we look at the one page we are interested in which we
	1283	* still hold a lock on even through the I/O completion.
	1284	*
	1285	* The other pages in our m[] array are also released on completion,
	1286	* so we cannot assume they are valid anymore either.
	1287	*/
	1288	bp->b_cmd = BUF_CMD_READ;
	1289	BUF_KERNPROC(bp);
	1290	vn_strategy(swapdev_vp, bio);
	1291
	1292	/*
	1293	* Wait for the page we want to complete. PG_SWAPINPROG is always
	1294	* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
	1295	* is set in the meta-data.
	1296	*
	1297	* If this is a read-ahead only we return immediately without
	1298	* waiting for I/O.
	1299	*/
	1300	if (raonly)
	1301	return(VM_PAGER_OK);
	1302
	1303	/*
	1304	* Read-ahead includes originally requested page case.
	1305	*/
	1306	crit_enter();
	1307	while ((mreq->flags & PG_SWAPINPROG) != 0) {
	1308	vm_page_flag_set(mreq, PG_WANTED \| PG_REFERENCED);
	1309	mycpu->gd_cnt.v_intrans++;
	1310	if (tsleep(mreq, 0, "swread", hz*20)) {
	1311	kprintf(
	1312	"swap_pager: indefinite wait buffer: "
	1313	" offset: %lld, size: %ld\n",
	1314	(long long)bio->bio_offset,
	1315	(long)bp->b_bcount
	1316	);
	1317	}
	1318	}
	1319	crit_exit();
	1320
	1321	/*
	1322	* mreq is left bussied after completion, but all the other pages
	1323	* are freed. If we had an unrecoverable read error the page will
	1324	* not be valid.
	1325	*/
	1326	if (mreq->valid != VM_PAGE_BITS_ALL)
	1327	return(VM_PAGER_ERROR);
	1328	else
	1329	return(VM_PAGER_OK);
	1330
	1331	/*
	1332	* A final note: in a low swap situation, we cannot deallocate swap
	1333	* and mark a page dirty here because the caller is likely to mark
	1334	* the page clean when we return, causing the page to possibly revert
	1335	* to all-zero's later.
	1336	*/
	1337	}
	1338
	1339	/*
	1340	* swap_pager_putpages:
	1341	*
	1342	* Assign swap (if necessary) and initiate I/O on the specified pages.
	1343	*
	1344	* We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
	1345	* are automatically converted to SWAP objects.
	1346	*
	1347	* In a low memory situation we may block in vn_strategy(), but the new
	1348	* vm_page reservation system coupled with properly written VFS devices
	1349	* should ensure that no low-memory deadlock occurs. This is an area
	1350	* which needs work.
	1351	*
	1352	* The parent has N vm_object_pip_add() references prior to
	1353	* calling us and will remove references for rtvals[] that are
	1354	* not set to VM_PAGER_PEND. We need to remove the rest on I/O
	1355	* completion.
	1356	*
	1357	* The parent has soft-busy'd the pages it passes us and will unbusy
	1358	* those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
	1359	* We need to unbusy the rest on I/O completion.
	1360	*/
	1361	void
	1362	swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
	1363	boolean_t sync, int *rtvals)
	1364	{
	1365	int i;
	1366	int n = 0;
	1367
	1368	if (count && m[0]->object != object) {
	1369	panic("swap_pager_getpages: object mismatch %p/%p",
	1370	object,
	1371	m[0]->object
	1372	);
	1373	}
	1374
	1375	/*
	1376	* Step 1
	1377	*
	1378	* Turn object into OBJT_SWAP
	1379	* check for bogus sysops
	1380	* force sync if not pageout process
	1381	*/
	1382	if (object->type == OBJT_DEFAULT)
	1383	swp_pager_meta_convert(object);
	1384
	1385	if (curthread != pagethread)
	1386	sync = TRUE;
	1387
	1388	/*
	1389	* Step 2
	1390	*
	1391	* Update nsw parameters from swap_async_max sysctl values.
	1392	* Do not let the sysop crash the machine with bogus numbers.
	1393	*/
	1394
	1395	if (swap_async_max != nsw_wcount_async_max) {
	1396	int n;
	1397
	1398	/*
	1399	* limit range
	1400	*/
	1401	if ((n = swap_async_max) > nswbuf / 2)
	1402	n = nswbuf / 2;
	1403	if (n < 1)
	1404	n = 1;
	1405	swap_async_max = n;
	1406
	1407	/*
	1408	* Adjust difference ( if possible ). If the current async
	1409	* count is too low, we may not be able to make the adjustment
	1410	* at this time.
	1411	*/
	1412	crit_enter();
	1413	n -= nsw_wcount_async_max;
	1414	if (nsw_wcount_async + n >= 0) {
	1415	nsw_wcount_async += n;
	1416	nsw_wcount_async_max += n;
	1417	wakeup(&nsw_wcount_async);
	1418	}
	1419	crit_exit();
	1420	}
	1421
	1422	/*
	1423	* Step 3
	1424	*
	1425	* Assign swap blocks and issue I/O. We reallocate swap on the fly.
	1426	* The page is left dirty until the pageout operation completes
	1427	* successfully.
	1428	*/
	1429
	1430	for (i = 0; i < count; i += n) {
	1431	struct buf *bp;
	1432	struct bio *bio;
	1433	daddr_t blk;
	1434	int j;
	1435
	1436	/*
	1437	* Maximum I/O size is limited by a number of factors.
	1438	*/
	1439
	1440	n = min(BLIST_MAX_ALLOC, count - i);
	1441	n = min(n, nsw_cluster_max);
	1442
	1443	crit_enter();
	1444
	1445	/*
	1446	* Get biggest block of swap we can. If we fail, fall
	1447	* back and try to allocate a smaller block. Don't go
	1448	* overboard trying to allocate space if it would overly
	1449	* fragment swap.
	1450	*/
	1451	while (
	1452	(blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
	1453	n > 4
	1454	) {
	1455	n >>= 1;
	1456	}
	1457	if (blk == SWAPBLK_NONE) {
	1458	for (j = 0; j < n; ++j)
	1459	rtvals[i+j] = VM_PAGER_FAIL;
	1460	crit_exit();
	1461	continue;
	1462	}
	1463
	1464	/*
	1465	* The I/O we are constructing cannot cross a physical
	1466	* disk boundry in the swap stripe. Note: we are still
	1467	* at splvm().
	1468	*/
	1469	if ((blk ^ (blk + n)) & dmmax_mask) {
	1470	j = ((blk + dmmax) & dmmax_mask) - blk;
	1471	swp_pager_freeswapspace(object, blk + j, n - j);
	1472	n = j;
	1473	}
	1474
	1475	/*
	1476	* All I/O parameters have been satisfied, build the I/O
	1477	* request and assign the swap space.
	1478	*/
	1479	if (sync == TRUE)
	1480	bp = getpbuf(&nsw_wcount_sync);
	1481	else
	1482	bp = getpbuf(&nsw_wcount_async);
	1483	bio = &bp->b_bio1;
	1484
	1485	pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
	1486
	1487	bp->b_bcount = PAGE_SIZE * n;
	1488	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
	1489
	1490	for (j = 0; j < n; ++j) {
	1491	vm_page_t mreq = m[i+j];
	1492
	1493	swp_pager_meta_build(mreq->object, mreq->pindex,
	1494	blk + j);
	1495	if (object->type == OBJT_SWAP)
	1496	vm_page_dirty(mreq);
	1497	rtvals[i+j] = VM_PAGER_OK;
	1498
	1499	vm_page_flag_set(mreq, PG_SWAPINPROG);
	1500	bp->b_xio.xio_pages[j] = mreq;
	1501	}
	1502	bp->b_xio.xio_npages = n;
	1503
	1504	mycpu->gd_cnt.v_swapout++;
	1505	mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
	1506
	1507	crit_exit();
	1508
	1509	bp->b_dirtyoff = 0; /* req'd for NFS */
	1510	bp->b_dirtyend = bp->b_bcount; /* req'd for NFS */
	1511	bp->b_cmd = BUF_CMD_WRITE;
	1512	bio->bio_caller_info1.index = SWBIO_WRITE;
	1513
	1514	/*
	1515	* asynchronous
	1516	*/
	1517	if (sync == FALSE) {
	1518	bio->bio_done = swp_pager_async_iodone;
	1519	BUF_KERNPROC(bp);
	1520	vn_strategy(swapdev_vp, bio);
	1521
	1522	for (j = 0; j < n; ++j)
	1523	rtvals[i+j] = VM_PAGER_PEND;
	1524	continue;
	1525	}
	1526
	1527	/*
	1528	* Issue synchrnously.
	1529	*
	1530	* Wait for the sync I/O to complete, then update rtvals.
	1531	* We just set the rtvals[] to VM_PAGER_PEND so we can call
	1532	* our async completion routine at the end, thus avoiding a
	1533	* double-free.
	1534	*/
	1535	bio->bio_caller_info1.index \|= SWBIO_SYNC;
	1536	bio->bio_done = biodone_sync;
	1537	bio->bio_flags \|= BIO_SYNC;
	1538	vn_strategy(swapdev_vp, bio);
	1539	biowait(bio, "swwrt");
	1540
	1541	for (j = 0; j < n; ++j)
	1542	rtvals[i+j] = VM_PAGER_PEND;
	1543
	1544	/*
	1545	* Now that we are through with the bp, we can call the
	1546	* normal async completion, which frees everything up.
	1547	*/
	1548	swp_pager_async_iodone(bio);
	1549	}
	1550	}
	1551
	1552	void
	1553	swap_pager_newswap(void)
	1554	{
	1555	swp_sizecheck();
	1556	}
	1557
	1558	/*
	1559	* swp_pager_async_iodone:
	1560	*
	1561	* Completion routine for asynchronous reads and writes from/to swap.
	1562	* Also called manually by synchronous code to finish up a bp.
	1563	*
	1564	* For READ operations, the pages are PG_BUSY'd. For WRITE operations,
	1565	* the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY
	1566	* unbusy all pages except the 'main' request page. For WRITE
	1567	* operations, we vm_page_t->busy'd unbusy all pages ( we can do this
	1568	* because we marked them all VM_PAGER_PEND on return from putpages ).
	1569	*
	1570	* This routine may not block.
	1571	*/
	1572	static void
	1573	swp_pager_async_iodone(struct bio *bio)
	1574	{
	1575	struct buf *bp = bio->bio_buf;
	1576	vm_object_t object = NULL;
	1577	int i;
	1578	int *nswptr;
	1579
	1580	/*
	1581	* report error
	1582	*/
	1583	if (bp->b_flags & B_ERROR) {
	1584	kprintf(
	1585	"swap_pager: I/O error - %s failed; offset %lld,"
	1586	"size %ld, error %d\n",
	1587	((bio->bio_caller_info1.index & SWBIO_READ) ?
	1588	"pagein" : "pageout"),
	1589	(long long)bio->bio_offset,
	1590	(long)bp->b_bcount,
	1591	bp->b_error
	1592	);
	1593	}
	1594
	1595	/*
	1596	* set object, raise to splvm().
	1597	*/
	1598	if (bp->b_xio.xio_npages)
	1599	object = bp->b_xio.xio_pages[0]->object;
	1600	crit_enter();
	1601
	1602	/*
	1603	* remove the mapping for kernel virtual
	1604	*/
	1605	pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
	1606
	1607	/*
	1608	* cleanup pages. If an error occurs writing to swap, we are in
	1609	* very serious trouble. If it happens to be a disk error, though,
	1610	* we may be able to recover by reassigning the swap later on. So
	1611	* in this case we remove the m->swapblk assignment for the page
	1612	* but do not free it in the rlist. The errornous block(s) are thus
	1613	* never reallocated as swap. Redirty the page and continue.
	1614	*/
	1615	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
	1616	vm_page_t m = bp->b_xio.xio_pages[i];
	1617
	1618	if (bp->b_flags & B_ERROR) {
	1619	/*
	1620	* If an error occurs I'd love to throw the swapblk
	1621	* away without freeing it back to swapspace, so it
	1622	* can never be used again. But I can't from an
	1623	* interrupt.
	1624	*/
	1625
	1626	if (bio->bio_caller_info1.index & SWBIO_READ) {
	1627	/*
	1628	* When reading, reqpage needs to stay
	1629	* locked for the parent, but all other
	1630	* pages can be freed. We still want to
	1631	* wakeup the parent waiting on the page,
	1632	* though. ( also: pg_reqpage can be -1 and
	1633	* not match anything ).
	1634	*
	1635	* We have to wake specifically requested pages
	1636	* up too because we cleared PG_SWAPINPROG and
	1637	* someone may be waiting for that.
	1638	*
	1639	* NOTE: for reads, m->dirty will probably
	1640	* be overridden by the original caller of
	1641	* getpages so don't play cute tricks here.
	1642	*
	1643	* NOTE: We can't actually free the page from
	1644	* here, because this is an interrupt. It
	1645	* is not legal to mess with object->memq
	1646	* from an interrupt. Deactivate the page
	1647	* instead.
	1648	*/
	1649
	1650	m->valid = 0;
	1651	vm_page_flag_clear(m, PG_ZERO);
	1652	vm_page_flag_clear(m, PG_SWAPINPROG);
	1653
	1654	/*
	1655	* bio_driver_info holds the requested page
	1656	* index.
	1657	*/
	1658	if (i != (int)(intptr_t)bio->bio_driver_info) {
	1659	vm_page_deactivate(m);
	1660	vm_page_wakeup(m);
	1661	} else {
	1662	vm_page_flash(m);
	1663	}
	1664	/*
	1665	* If i == bp->b_pager.pg_reqpage, do not wake
	1666	* the page up. The caller needs to.
	1667	*/
	1668	} else {
	1669	/*
	1670	* If a write error occurs, reactivate page
	1671	* so it doesn't clog the inactive list,
	1672	* then finish the I/O.
	1673	*
	1674	* Only for OBJT_SWAP. When using the swap
	1675	* as a cache for clean vnode-backed pages
	1676	* we don't mess with the page dirty state.
	1677	*/
	1678	vm_page_flag_clear(m, PG_SWAPINPROG);
	1679	if (m->object->type == OBJT_SWAP) {
	1680	vm_page_dirty(m);
	1681	vm_page_activate(m);
	1682	}
	1683	vm_page_io_finish(m);
	1684	}
	1685	} else if (bio->bio_caller_info1.index & SWBIO_READ) {
	1686	/*
	1687	* NOTE: for reads, m->dirty will probably be
	1688	* overridden by the original caller of getpages so
	1689	* we cannot set them in order to free the underlying
	1690	* swap in a low-swap situation. I don't think we'd
	1691	* want to do that anyway, but it was an optimization
	1692	* that existed in the old swapper for a time before
	1693	* it got ripped out due to precisely this problem.
	1694	*
	1695	* clear PG_ZERO in page.
	1696	*
	1697	* If not the requested page then deactivate it.
	1698	*
	1699	* Note that the requested page, reqpage, is left
	1700	* busied, but we still have to wake it up. The
	1701	* other pages are released (unbusied) by
	1702	* vm_page_wakeup(). We do not set reqpage's
	1703	* valid bits here, it is up to the caller.
	1704	*/
	1705
	1706	/*
	1707	* NOTE: can't call pmap_clear_modify(m) from an
	1708	* interrupt thread, the pmap code may have to map
	1709	* non-kernel pmaps and currently asserts the case.
	1710	*/
	1711	/pmap_clear_modify(m);/
	1712	m->valid = VM_PAGE_BITS_ALL;
	1713	vm_page_undirty(m);
	1714	vm_page_flag_clear(m, PG_ZERO \| PG_SWAPINPROG);
	1715	vm_page_flag_set(m, PG_SWAPPED);
	1716
	1717	/*
	1718	* We have to wake specifically requested pages
	1719	* up too because we cleared PG_SWAPINPROG and
	1720	* could be waiting for it in getpages. However,
	1721	* be sure to not unbusy getpages specifically
	1722	* requested page - getpages expects it to be
	1723	* left busy.
	1724	*
	1725	* bio_driver_info holds the requested page
	1726	*/
	1727	if (i != (int)(intptr_t)bio->bio_driver_info) {
	1728	vm_page_deactivate(m);
	1729	vm_page_wakeup(m);
	1730	} else {
	1731	vm_page_flash(m);
	1732	}
	1733	} else {
	1734	/*
	1735	* Mark the page clean but do not mess with the
	1736	* pmap-layer's modified state. That state should
	1737	* also be clear since the caller protected the
	1738	* page VM_PROT_READ, but allow the case.
	1739	*
	1740	* We are in an interrupt, avoid pmap operations.
	1741	*
	1742	* If we have a severe page deficit, deactivate the
	1743	* page. Do not try to cache it (which would also
	1744	* involve a pmap op), because the page might still
	1745	* be read-heavy.
	1746	*
	1747	* When using the swap to cache clean vnode pages
	1748	* we do not mess with the page dirty bits.
	1749	*/
	1750	if (m->object->type == OBJT_SWAP)
	1751	vm_page_undirty(m);
	1752	vm_page_flag_clear(m, PG_SWAPINPROG);
	1753	vm_page_flag_set(m, PG_SWAPPED);
	1754	vm_page_io_finish(m);
	1755	if (vm_page_count_severe())
	1756	vm_page_deactivate(m);
	1757	#if 0
	1758	if (!vm_page_count_severe() \|\| !vm_page_try_to_cache(m))
	1759	vm_page_protect(m, VM_PROT_READ);
	1760	#endif
	1761	}
	1762	}
	1763
	1764	/*
	1765	* adjust pip. NOTE: the original parent may still have its own
	1766	* pip refs on the object.
	1767	*/
	1768
	1769	if (object)
	1770	vm_object_pip_wakeupn(object, bp->b_xio.xio_npages);
	1771
	1772	/*
	1773	* Release the physical I/O buffer.
	1774	*
	1775	* NOTE: Due to synchronous operations in the write case b_cmd may
	1776	* already be set to BUF_CMD_DONE and BIO_SYNC may have already
	1777	* been cleared.
	1778	*/
	1779	if (bio->bio_caller_info1.index & SWBIO_READ)
	1780	nswptr = &nsw_rcount;
	1781	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
	1782	nswptr = &nsw_wcount_sync;
	1783	else
	1784	nswptr = &nsw_wcount_async;
	1785	bp->b_cmd = BUF_CMD_DONE;
	1786	relpbuf(bp, nswptr);
	1787	crit_exit();
	1788	}
	1789
	1790	/************************************************************************
	1791	* SWAP META DATA *
	1792	************************************************************************
	1793	*
	1794	* These routines manipulate the swap metadata stored in the
	1795	* OBJT_SWAP object. All swp_*() routines must be called at
	1796	* splvm() because swap can be freed up by the low level vm_page
	1797	* code which might be called from interrupts beyond what splbio() covers.
	1798	*
	1799	* Swap metadata is implemented with a global hash and not directly
	1800	* linked into the object. Instead the object simply contains
	1801	* appropriate tracking counters.
	1802	*/
	1803
	1804	/*
	1805	* Lookup the swblock containing the specified swap block index.
	1806	*/
	1807	static __inline
	1808	struct swblock *
	1809	swp_pager_lookup(vm_object_t object, vm_pindex_t index)
	1810	{
	1811	index &= ~SWAP_META_MASK;
	1812	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
	1813	}
	1814
	1815	/*
	1816	* Remove a swblock from the RB tree.
	1817	*/
	1818	static __inline
	1819	void
	1820	swp_pager_remove(vm_object_t object, struct swblock *swap)
	1821	{
	1822	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
	1823	}
	1824
	1825	/*
	1826	* Convert default object to swap object if necessary
	1827	*/
	1828	static void
	1829	swp_pager_meta_convert(vm_object_t object)
	1830	{
	1831	if (object->type == OBJT_DEFAULT) {
	1832	object->type = OBJT_SWAP;
	1833	KKASSERT(object->swblock_count == 0);
	1834	}
	1835	}
	1836
	1837	/*
	1838	* SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
	1839	*
	1840	* We first convert the object to a swap object if it is a default
	1841	* object. Vnode objects do not need to be converted.
	1842	*
	1843	* The specified swapblk is added to the object's swap metadata. If
	1844	* the swapblk is not valid, it is freed instead. Any previously
	1845	* assigned swapblk is freed.
	1846	*/
	1847	static void
	1848	swp_pager_meta_build(vm_object_t object, vm_pindex_t index, daddr_t swapblk)
	1849	{
	1850	struct swblock *swap;
	1851	struct swblock *oswap;
	1852
	1853	KKASSERT(swapblk != SWAPBLK_NONE);
	1854
	1855	/*
	1856	* Convert object if necessary
	1857	*/
	1858	if (object->type == OBJT_DEFAULT)
	1859	swp_pager_meta_convert(object);
	1860
	1861	/*
	1862	* Locate swblock. If not found create, but if we aren't adding
	1863	* anything just return. If we run out of space in the map we wait
	1864	* and, since the hash table may have changed, retry.
	1865	*/
	1866	retry:
	1867	swap = swp_pager_lookup(object, index);
	1868
	1869	if (swap == NULL) {
	1870	int i;
	1871
	1872	swap = zalloc(swap_zone);
	1873	if (swap == NULL) {
	1874	vm_wait(0);
	1875	goto retry;
	1876	}
	1877	swap->swb_index = index & ~SWAP_META_MASK;
	1878	swap->swb_count = 0;
	1879
	1880	++object->swblock_count;
	1881
	1882	for (i = 0; i < SWAP_META_PAGES; ++i)
	1883	swap->swb_pages[i] = SWAPBLK_NONE;
	1884	oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
	1885	KKASSERT(oswap == NULL);
	1886	}
	1887
	1888	/*
	1889	* Delete prior contents of metadata
	1890	*/
	1891
	1892	index &= SWAP_META_MASK;
	1893
	1894	if (swap->swb_pages[index] != SWAPBLK_NONE) {
	1895	swp_pager_freeswapspace(object, swap->swb_pages[index], 1);
	1896	--swap->swb_count;
	1897	}
	1898
	1899	/*
	1900	* Enter block into metadata
	1901	*/
	1902	swap->swb_pages[index] = swapblk;
	1903	if (swapblk != SWAPBLK_NONE)
	1904	++swap->swb_count;
	1905	}
	1906
	1907	/*
	1908	* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
	1909	*
	1910	* The requested range of blocks is freed, with any associated swap
	1911	* returned to the swap bitmap.
	1912	*
	1913	* This routine will free swap metadata structures as they are cleaned
	1914	* out. This routine does NOT operate on swap metadata associated
	1915	* with resident pages.
	1916	*
	1917	* This routine must be called at splvm()
	1918	*/
	1919	static int swp_pager_meta_free_callback(struct swblock swb, void data);
	1920
	1921	static void
	1922	swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
	1923	{
	1924	struct swfreeinfo info;
	1925
	1926	/*
	1927	* Nothing to do
	1928	*/
	1929	if (object->swblock_count == 0) {
	1930	KKASSERT(RB_EMPTY(&object->swblock_root));
	1931	return;
	1932	}
	1933	if (count == 0)
	1934	return;
	1935
	1936	/*
	1937	* Setup for RB tree scan. Note that the pindex range can be huge
	1938	* due to the 64 bit page index space so we cannot safely iterate.
	1939	*/
	1940	info.object = object;
	1941	info.basei = index & ~SWAP_META_MASK;
	1942	info.begi = index;
	1943	info.endi = index + count - 1;
	1944	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
	1945	swp_pager_meta_free_callback, &info);
	1946	}
	1947
	1948	static
	1949	int
	1950	swp_pager_meta_free_callback(struct swblock swap, void data)
	1951	{
	1952	struct swfreeinfo *info = data;
	1953	vm_object_t object = info->object;
	1954	int index;
	1955	int eindex;
	1956
	1957	/*
	1958	* Figure out the range within the swblock. The wider scan may
	1959	* return edge-case swap blocks when the start and/or end points
	1960	* are in the middle of a block.
	1961	*/
	1962	if (swap->swb_index < info->begi)
	1963	index = (int)info->begi & SWAP_META_MASK;
	1964	else
	1965	index = 0;
	1966
	1967	if (swap->swb_index + SWAP_META_PAGES > info->endi)
	1968	eindex = (int)info->endi & SWAP_META_MASK;
	1969	else
	1970	eindex = SWAP_META_MASK;
	1971
	1972	/*
	1973	* Scan and free the blocks. The loop terminates early
	1974	* if (swap) runs out of blocks and could be freed.
	1975	*/
	1976	while (index <= eindex) {
	1977	daddr_t v = swap->swb_pages[index];
	1978
	1979	if (v != SWAPBLK_NONE) {
	1980	swp_pager_freeswapspace(object, v, 1);
	1981	swap->swb_pages[index] = SWAPBLK_NONE;
	1982	if (--swap->swb_count == 0) {
	1983	swp_pager_remove(object, swap);
	1984	zfree(swap_zone, swap);
	1985	--object->swblock_count;
	1986	break;
	1987	}
	1988	}
	1989	++index;
	1990	}
	1991	/* swap may be invalid here due to zfree above */
	1992	return(0);
	1993	}
	1994
	1995	/*
	1996	* SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
	1997	*
	1998	* This routine locates and destroys all swap metadata associated with
	1999	* an object.
	2000	*
	2001	* This routine must be called at splvm()
	2002	*/
	2003	static void
	2004	swp_pager_meta_free_all(vm_object_t object)
	2005	{
	2006	struct swblock *swap;
	2007	int i;
	2008
	2009	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
	2010	swp_pager_remove(object, swap);
	2011	for (i = 0; i < SWAP_META_PAGES; ++i) {
	2012	daddr_t v = swap->swb_pages[i];
	2013	if (v != SWAPBLK_NONE) {
	2014	--swap->swb_count;
	2015	swp_pager_freeswapspace(object, v, 1);
	2016	}
	2017	}
	2018	if (swap->swb_count != 0)
	2019	panic("swap_pager_meta_free_all: swb_count != 0");
	2020	zfree(swap_zone, swap);
	2021	--object->swblock_count;
	2022	}
	2023	KKASSERT(object->swblock_count == 0);
	2024	}
	2025
	2026	/*
	2027	* SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
	2028	*
	2029	* This routine is capable of looking up, popping, or freeing
	2030	* swapblk assignments in the swap meta data or in the vm_page_t.
	2031	* The routine typically returns the swapblk being looked-up, or popped,
	2032	* or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
	2033	* was invalid. This routine will automatically free any invalid
	2034	* meta-data swapblks.
	2035	*
	2036	* It is not possible to store invalid swapblks in the swap meta data
	2037	* (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
	2038	*
	2039	* When acting on a busy resident page and paging is in progress, we
	2040	* have to wait until paging is complete but otherwise can act on the
	2041	* busy page.
	2042	*
	2043	* This routine must be called at splvm().
	2044	*
	2045	* SWM_FREE remove and free swap block from metadata
	2046	* SWM_POP remove from meta data but do not free.. pop it out
	2047	*/
	2048	static daddr_t
	2049	swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
	2050	{
	2051	struct swblock *swap;
	2052	daddr_t r1;
	2053
	2054	if (object->swblock_count == 0)
	2055	return(SWAPBLK_NONE);
	2056
	2057	r1 = SWAPBLK_NONE;
	2058	swap = swp_pager_lookup(object, index);
	2059
	2060	if (swap != NULL) {
	2061	index &= SWAP_META_MASK;
	2062	r1 = swap->swb_pages[index];
	2063
	2064	if (r1 != SWAPBLK_NONE) {
	2065	if (flags & SWM_FREE) {
	2066	swp_pager_freeswapspace(object, r1, 1);
	2067	r1 = SWAPBLK_NONE;
	2068	}
	2069	if (flags & (SWM_FREE\|SWM_POP)) {
	2070	swap->swb_pages[index] = SWAPBLK_NONE;
	2071	if (--swap->swb_count == 0) {
	2072	swp_pager_remove(object, swap);
	2073	zfree(swap_zone, swap);
	2074	--object->swblock_count;
	2075	}
	2076	}
	2077	}
	2078	}
	2079	return(r1);
	2080	}