gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* This code is derived from software contributed to Berkeley by
	6	* The Mach Operating System project at Carnegie-Mellon University.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. All advertising materials mentioning features or use of this software
	17	* must display the following acknowledgement:
	18	* This product includes software developed by the University of
	19	* California, Berkeley and its contributors.
	20	* 4. Neither the name of the University nor the names of its contributors
	21	* may be used to endorse or promote products derived from this software
	22	* without specific prior written permission.
	23	*
	24	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	25	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	26	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	27	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	28	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	29	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	30	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	31	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	32	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	33	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94
	37	*
	38	*
	39	* Copyright (c) 1987, 1990 Carnegie-Mellon University.
	40	* All rights reserved.
	41	*
	42	* Authors: Avadis Tevanian, Jr., Michael Wayne Young
	43	*
	44	* Permission to use, copy, modify and distribute this software and
	45	* its documentation is hereby granted, provided that both the copyright
	46	* notice and this permission notice appear in all copies of the
	47	* software, derivative works or modified versions, and any portions
	48	* thereof, and that both notices appear in supporting documentation.
	49	*
	50	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	51	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
	52	* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	53	*
	54	* Carnegie Mellon requests users of this software to return to
	55	*
	56	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	57	* School of Computer Science
	58	* Carnegie Mellon University
	59	* Pittsburgh PA 15213-3890
	60	*
	61	* any improvements or extensions that they make and grant Carnegie the
	62	* rights to redistribute these changes.
	63	*
	64	* $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
	65	* $DragonFly: src/sys/vm/vm_object.c,v 1.33 2008/05/09 07:24:48 dillon Exp $
	66	*/
	67
	68	/*
	69	* Virtual memory object module.
	70	*/
	71
	72	#include <sys/param.h>
	73	#include <sys/systm.h>
	74	#include <sys/proc.h> /* for curproc, pageproc */
	75	#include <sys/vnode.h>
	76	#include <sys/vmmeter.h>
	77	#include <sys/mman.h>
	78	#include <sys/mount.h>
	79	#include <sys/kernel.h>
	80	#include <sys/sysctl.h>
	81
	82	#include <vm/vm.h>
	83	#include <vm/vm_param.h>
	84	#include <vm/pmap.h>
	85	#include <vm/vm_map.h>
	86	#include <vm/vm_object.h>
	87	#include <vm/vm_page.h>
	88	#include <vm/vm_pageout.h>
	89	#include <vm/vm_pager.h>
	90	#include <vm/swap_pager.h>
	91	#include <vm/vm_kern.h>
	92	#include <vm/vm_extern.h>
	93	#include <vm/vm_zone.h>
	94
	95	#define EASY_SCAN_FACTOR 8
	96
	97	static void vm_object_qcollapse(vm_object_t object);
	98	static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
	99	int pagerflags);
	100
	101	/*
	102	* Virtual memory objects maintain the actual data
	103	* associated with allocated virtual memory. A given
	104	* page of memory exists within exactly one object.
	105	*
	106	* An object is only deallocated when all "references"
	107	* are given up. Only one "reference" to a given
	108	* region of an object should be writeable.
	109	*
	110	* Associated with each object is a list of all resident
	111	* memory pages belonging to that object; this list is
	112	* maintained by the "vm_page" module, and locked by the object's
	113	* lock.
	114	*
	115	* Each object also records a "pager" routine which is
	116	* used to retrieve (and store) pages to the proper backing
	117	* storage. In addition, objects may be backed by other
	118	* objects from which they were virtual-copied.
	119	*
	120	* The only items within the object structure which are
	121	* modified after time of creation are:
	122	* reference count locked by object's lock
	123	* pager routine locked by object's lock
	124	*
	125	*/
	126
	127	struct object_q vm_object_list;
	128	struct vm_object kernel_object;
	129
	130	static long vm_object_count; /* count of all objects */
	131	extern int vm_pageout_page_count;
	132
	133	static long object_collapses;
	134	static long object_bypasses;
	135	static int next_index;
	136	static vm_zone_t obj_zone;
	137	static struct vm_zone obj_zone_store;
	138	static int object_hash_rand;
	139	#define VM_OBJECTS_INIT 256
	140	static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
	141
	142	void
	143	_vm_object_allocate(objtype_t type, vm_size_t size, vm_object_t object)
	144	{
	145	int incr;
	146	RB_INIT(&object->rb_memq);
	147	LIST_INIT(&object->shadow_head);
	148
	149	object->type = type;
	150	object->size = size;
	151	object->ref_count = 1;
	152	object->flags = 0;
	153	if ((object->type == OBJT_DEFAULT) \|\| (object->type == OBJT_SWAP))
	154	vm_object_set_flag(object, OBJ_ONEMAPPING);
	155	object->paging_in_progress = 0;
	156	object->resident_page_count = 0;
	157	object->shadow_count = 0;
	158	object->pg_color = next_index;
	159	if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
	160	incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
	161	else
	162	incr = size;
	163	next_index = (next_index + incr) & PQ_L2_MASK;
	164	object->handle = NULL;
	165	object->backing_object = NULL;
	166	object->backing_object_offset = (vm_ooffset_t) 0;
	167	/*
	168	* Try to generate a number that will spread objects out in the
	169	* hash table. We 'wipe' new objects across the hash in 128 page
	170	* increments plus 1 more to offset it a little more by the time
	171	* it wraps around.
	172	*/
	173	object->hash_rand = object_hash_rand - 129;
	174
	175	object->generation++;
	176	object->swblock_count = 0;
	177	RB_INIT(&object->swblock_root);
	178
	179	crit_enter();
	180	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
	181	vm_object_count++;
	182	object_hash_rand = object->hash_rand;
	183	crit_exit();
	184	}
	185
	186	/*
	187	* vm_object_init:
	188	*
	189	* Initialize the VM objects module.
	190	*/
	191	void
	192	vm_object_init(void)
	193	{
	194	TAILQ_INIT(&vm_object_list);
	195
	196	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
	197	&kernel_object);
	198
	199	obj_zone = &obj_zone_store;
	200	zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
	201	vm_objects_init, VM_OBJECTS_INIT);
	202	}
	203
	204	void
	205	vm_object_init2(void)
	206	{
	207	zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
	208	}
	209
	210	/*
	211	* vm_object_allocate:
	212	*
	213	* Returns a new object with the given size.
	214	*/
	215
	216	vm_object_t
	217	vm_object_allocate(objtype_t type, vm_size_t size)
	218	{
	219	vm_object_t result;
	220
	221	result = (vm_object_t) zalloc(obj_zone);
	222
	223	_vm_object_allocate(type, size, result);
	224
	225	return (result);
	226	}
	227
	228
	229	/*
	230	* vm_object_reference:
	231	*
	232	* Gets another reference to the given object.
	233	*/
	234	void
	235	vm_object_reference(vm_object_t object)
	236	{
	237	if (object == NULL)
	238	return;
	239
	240	object->ref_count++;
	241	if (object->type == OBJT_VNODE) {
	242	vref(object->handle);
	243	/* XXX what if the vnode is being destroyed? */
	244	}
	245	}
	246
	247	static void
	248	vm_object_vndeallocate(vm_object_t object)
	249	{
	250	struct vnode vp = (struct vnode ) object->handle;
	251
	252	KASSERT(object->type == OBJT_VNODE,
	253	("vm_object_vndeallocate: not a vnode object"));
	254	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	255	#ifdef INVARIANTS
	256	if (object->ref_count == 0) {
	257	vprint("vm_object_vndeallocate", vp);
	258	panic("vm_object_vndeallocate: bad object reference count");
	259	}
	260	#endif
	261
	262	object->ref_count--;
	263	if (object->ref_count == 0)
	264	vclrflags(vp, VTEXT);
	265	vrele(vp);
	266	}
	267
	268	/*
	269	* vm_object_deallocate:
	270	*
	271	* Release a reference to the specified object,
	272	* gained either through a vm_object_allocate
	273	* or a vm_object_reference call. When all references
	274	* are gone, storage associated with this object
	275	* may be relinquished.
	276	*
	277	* No object may be locked.
	278	*/
	279	void
	280	vm_object_deallocate(vm_object_t object)
	281	{
	282	vm_object_t temp;
	283
	284	while (object != NULL) {
	285	if (object->type == OBJT_VNODE) {
	286	vm_object_vndeallocate(object);
	287	return;
	288	}
	289
	290	if (object->ref_count == 0) {
	291	panic("vm_object_deallocate: object deallocated too many times: %d", object->type);
	292	} else if (object->ref_count > 2) {
	293	object->ref_count--;
	294	return;
	295	}
	296
	297	/*
	298	* Here on ref_count of one or two, which are special cases for
	299	* objects.
	300	*/
	301	if ((object->ref_count == 2) && (object->shadow_count == 0)) {
	302	vm_object_set_flag(object, OBJ_ONEMAPPING);
	303	object->ref_count--;
	304	return;
	305	} else if ((object->ref_count == 2) && (object->shadow_count == 1)) {
	306	object->ref_count--;
	307	if ((object->handle == NULL) &&
	308	(object->type == OBJT_DEFAULT \|\|
	309	object->type == OBJT_SWAP)) {
	310	vm_object_t robject;
	311
	312	robject = LIST_FIRST(&object->shadow_head);
	313	KASSERT(robject != NULL,
	314	("vm_object_deallocate: ref_count: %d, shadow_count: %d",
	315	object->ref_count,
	316	object->shadow_count));
	317	if ((robject->handle == NULL) &&
	318	(robject->type == OBJT_DEFAULT \|\|
	319	robject->type == OBJT_SWAP)) {
	320
	321	robject->ref_count++;
	322
	323	while (
	324	robject->paging_in_progress \|\|
	325	object->paging_in_progress
	326	) {
	327	vm_object_pip_sleep(robject, "objde1");
	328	vm_object_pip_sleep(object, "objde2");
	329	}
	330
	331	if (robject->ref_count == 1) {
	332	robject->ref_count--;
	333	object = robject;
	334	goto doterm;
	335	}
	336
	337	object = robject;
	338	vm_object_collapse(object);
	339	continue;
	340	}
	341	}
	342
	343	return;
	344
	345	} else {
	346	object->ref_count--;
	347	if (object->ref_count != 0)
	348	return;
	349	}
	350
	351	doterm:
	352
	353	temp = object->backing_object;
	354	if (temp) {
	355	LIST_REMOVE(object, shadow_list);
	356	temp->shadow_count--;
	357	temp->generation++;
	358	object->backing_object = NULL;
	359	}
	360
	361	/*
	362	* Don't double-terminate, we could be in a termination
	363	* recursion due to the terminate having to sync data
	364	* to disk.
	365	*/
	366	if ((object->flags & OBJ_DEAD) == 0)
	367	vm_object_terminate(object);
	368	object = temp;
	369	}
	370	}
	371
	372	/*
	373	* vm_object_terminate actually destroys the specified object, freeing
	374	* up all previously used resources.
	375	*
	376	* The object must be locked.
	377	* This routine may block.
	378	*/
	379	static int vm_object_terminate_callback(vm_page_t p, void *data);
	380
	381	void
	382	vm_object_terminate(vm_object_t object)
	383	{
	384	/*
	385	* Make sure no one uses us.
	386	*/
	387	vm_object_set_flag(object, OBJ_DEAD);
	388
	389	/*
	390	* wait for the pageout daemon to be done with the object
	391	*/
	392	vm_object_pip_wait(object, "objtrm");
	393
	394	KASSERT(!object->paging_in_progress,
	395	("vm_object_terminate: pageout in progress"));
	396
	397	/*
	398	* Clean and free the pages, as appropriate. All references to the
	399	* object are gone, so we don't need to lock it.
	400	*/
	401	if (object->type == OBJT_VNODE) {
	402	struct vnode *vp;
	403
	404	/*
	405	* Clean pages and flush buffers.
	406	*/
	407	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
	408
	409	vp = (struct vnode *) object->handle;
	410	vinvalbuf(vp, V_SAVE, 0, 0);
	411	}
	412
	413	/*
	414	* Wait for any I/O to complete, after which there had better not
	415	* be any references left on the object.
	416	*/
	417	vm_object_pip_wait(object, "objtrm");
	418
	419	if (object->ref_count != 0)
	420	panic("vm_object_terminate: object with references, ref_count=%d", object->ref_count);
	421
	422	/*
	423	* Now free any remaining pages. For internal objects, this also
	424	* removes them from paging queues. Don't free wired pages, just
	425	* remove them from the object.
	426	*/
	427	crit_enter();
	428	vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
	429	vm_object_terminate_callback, NULL);
	430	crit_exit();
	431
	432	/*
	433	* Let the pager know object is dead.
	434	*/
	435	vm_pager_deallocate(object);
	436
	437	/*
	438	* Remove the object from the global object list.
	439	*/
	440	crit_enter();
	441	TAILQ_REMOVE(&vm_object_list, object, object_list);
	442	vm_object_count--;
	443	crit_exit();
	444
	445	vm_object_dead_wakeup(object);
	446	if (object->ref_count != 0)
	447	panic("vm_object_terminate2: object with references, ref_count=%d", object->ref_count);
	448
	449	/*
	450	* Free the space for the object.
	451	*/
	452	zfree(obj_zone, object);
	453	}
	454
	455	static int
	456	vm_object_terminate_callback(vm_page_t p, void *data __unused)
	457	{
	458	if (p->busy \|\| (p->flags & PG_BUSY))
	459	panic("vm_object_terminate: freeing busy page %p", p);
	460	if (p->wire_count == 0) {
	461	vm_page_busy(p);
	462	vm_page_free(p);
	463	mycpu->gd_cnt.v_pfree++;
	464	} else {
	465	if (p->queue != PQ_NONE)
	466	kprintf("vm_object_terminate: Warning: Encountered wired page %p on queue %d\n", p, p->queue);
	467	vm_page_busy(p);
	468	vm_page_remove(p);
	469	vm_page_wakeup(p);
	470	}
	471	return(0);
	472	}
	473
	474	/*
	475	* The object is dead but still has an object<->pager association. Sleep
	476	* and return. The caller typically retests the association in a loop.
	477	*/
	478	void
	479	vm_object_dead_sleep(vm_object_t object, const char *wmesg)
	480	{
	481	crit_enter();
	482	if (object->handle) {
	483	vm_object_set_flag(object, OBJ_DEADWNT);
	484	tsleep(object, 0, wmesg, 0);
	485	}
	486	crit_exit();
	487	}
	488
	489	/*
	490	* Wakeup anyone waiting for the object<->pager disassociation on
	491	* a dead object.
	492	*/
	493	void
	494	vm_object_dead_wakeup(vm_object_t object)
	495	{
	496	if (object->flags & OBJ_DEADWNT) {
	497	vm_object_clear_flag(object, OBJ_DEADWNT);
	498	wakeup(object);
	499	}
	500	}
	501
	502	/*
	503	* vm_object_page_clean
	504	*
	505	* Clean all dirty pages in the specified range of object. Leaves page
	506	* on whatever queue it is currently on. If NOSYNC is set then do not
	507	* write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
	508	* leaving the object dirty.
	509	*
	510	* When stuffing pages asynchronously, allow clustering. XXX we need a
	511	* synchronous clustering mode implementation.
	512	*
	513	* Odd semantics: if end == 0, we clean through the end of the object.
	514	*/
	515	static int vm_object_page_clean_pass1(struct vm_page p, void data);
	516	static int vm_object_page_clean_pass2(struct vm_page p, void data);
	517
	518	void
	519	vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
	520	int flags)
	521	{
	522	struct rb_vm_page_scan_info info;
	523	struct vnode *vp;
	524	int wholescan;
	525	int pagerflags;
	526	int curgeneration;
	527
	528	if (object->type != OBJT_VNODE \|\|
	529	(object->flags & OBJ_MIGHTBEDIRTY) == 0)
	530	return;
	531
	532	pagerflags = (flags & (OBJPC_SYNC \| OBJPC_INVAL)) ?
	533	VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
	534	pagerflags \|= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
	535
	536	vp = object->handle;
	537
	538	/*
	539	* Interlock other major object operations. This allows us to
	540	* temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
	541	*/
	542	crit_enter();
	543	vm_object_set_flag(object, OBJ_CLEANING);
	544
	545	/*
	546	* Handle 'entire object' case
	547	*/
	548	info.start_pindex = start;
	549	if (end == 0) {
	550	info.end_pindex = object->size - 1;
	551	} else {
	552	info.end_pindex = end - 1;
	553	}
	554	wholescan = (start == 0 && info.end_pindex == object->size - 1);
	555	info.limit = flags;
	556	info.pagerflags = pagerflags;
	557	info.object = object;
	558
	559	/*
	560	* If cleaning the entire object do a pass to mark the pages read-only.
	561	* If everything worked out ok, clear OBJ_WRITEABLE and
	562	* OBJ_MIGHTBEDIRTY.
	563	*/
	564	if (wholescan) {
	565	info.error = 0;
	566	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	567	vm_object_page_clean_pass1, &info);
	568	if (info.error == 0) {
	569	vm_object_clear_flag(object,
	570	OBJ_WRITEABLE\|OBJ_MIGHTBEDIRTY);
	571	if (object->type == OBJT_VNODE &&
	572	(vp = (struct vnode *)object->handle) != NULL) {
	573	if (vp->v_flag & VOBJDIRTY)
	574	vclrflags(vp, VOBJDIRTY);
	575	}
	576	}
	577	}
	578
	579	/*
	580	* Do a pass to clean all the dirty pages we find.
	581	*/
	582	do {
	583	info.error = 0;
	584	curgeneration = object->generation;
	585	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	586	vm_object_page_clean_pass2, &info);
	587	} while (info.error \|\| curgeneration != object->generation);
	588
	589	vm_object_clear_flag(object, OBJ_CLEANING);
	590	crit_exit();
	591	}
	592
	593	static
	594	int
	595	vm_object_page_clean_pass1(struct vm_page p, void data)
	596	{
	597	struct rb_vm_page_scan_info *info = data;
	598
	599	vm_page_flag_set(p, PG_CLEANCHK);
	600	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
	601	info->error = 1;
	602	else
	603	vm_page_protect(p, VM_PROT_READ); /* must not block */
	604	return(0);
	605	}
	606
	607	static
	608	int
	609	vm_object_page_clean_pass2(struct vm_page p, void data)
	610	{
	611	struct rb_vm_page_scan_info *info = data;
	612	int n;
	613
	614	/*
	615	* Do not mess with pages that were inserted after we started
	616	* the cleaning pass.
	617	*/
	618	if ((p->flags & PG_CLEANCHK) == 0)
	619	return(0);
	620
	621	/*
	622	* Before wasting time traversing the pmaps, check for trivial
	623	* cases where the page cannot be dirty.
	624	*/
	625	if (p->valid == 0 \|\| (p->queue - p->pc) == PQ_CACHE) {
	626	KKASSERT((p->dirty & p->valid) == 0);
	627	return(0);
	628	}
	629
	630	/*
	631	* Check whether the page is dirty or not. The page has been set
	632	* to be read-only so the check will not race a user dirtying the
	633	* page.
	634	*/
	635	vm_page_test_dirty(p);
	636	if ((p->dirty & p->valid) == 0) {
	637	vm_page_flag_clear(p, PG_CLEANCHK);
	638	return(0);
	639	}
	640
	641	/*
	642	* If we have been asked to skip nosync pages and this is a
	643	* nosync page, skip it. Note that the object flags were
	644	* not cleared in this case (because pass1 will have returned an
	645	* error), so we do not have to set them.
	646	*/
	647	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
	648	vm_page_flag_clear(p, PG_CLEANCHK);
	649	return(0);
	650	}
	651
	652	/*
	653	* Flush as many pages as we can. PG_CLEANCHK will be cleared on
	654	* the pages that get successfully flushed. Set info->error if
	655	* we raced an object modification.
	656	*/
	657	n = vm_object_page_collect_flush(info->object, p, info->pagerflags);
	658	if (n == 0)
	659	info->error = 1;
	660	return(0);
	661	}
	662
	663	/*
	664	* This routine must be called within a critical section to properly avoid
	665	* an interrupt unbusy/free race that can occur prior to the busy check.
	666	*
	667	* Using the object generation number here to detect page ripout is not
	668	* the best idea in the world. XXX
	669	*
	670	* NOTE: we operate under the assumption that a page found to not be busy
	671	* will not be ripped out from under us by an interrupt. XXX we should
	672	* recode this to explicitly busy the pages.
	673	*/
	674	static int
	675	vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
	676	{
	677	int runlen;
	678	int maxf;
	679	int chkb;
	680	int maxb;
	681	int i;
	682	int curgeneration;
	683	vm_pindex_t pi;
	684	vm_page_t maf[vm_pageout_page_count];
	685	vm_page_t mab[vm_pageout_page_count];
	686	vm_page_t ma[vm_pageout_page_count];
	687
	688	curgeneration = object->generation;
	689
	690	pi = p->pindex;
	691	while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
	692	if (object->generation != curgeneration) {
	693	return(0);
	694	}
	695	}
	696	KKASSERT(p->object == object && p->pindex == pi);
	697
	698	maxf = 0;
	699	for(i = 1; i < vm_pageout_page_count; i++) {
	700	vm_page_t tp;
	701
	702	if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
	703	if ((tp->flags & PG_BUSY) \|\|
	704	((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
	705	(tp->flags & PG_CLEANCHK) == 0) \|\|
	706	(tp->busy != 0))
	707	break;
	708	if((tp->queue - tp->pc) == PQ_CACHE) {
	709	vm_page_flag_clear(tp, PG_CLEANCHK);
	710	break;
	711	}
	712	vm_page_test_dirty(tp);
	713	if ((tp->dirty & tp->valid) == 0) {
	714	vm_page_flag_clear(tp, PG_CLEANCHK);
	715	break;
	716	}
	717	maf[ i - 1 ] = tp;
	718	maxf++;
	719	continue;
	720	}
	721	break;
	722	}
	723
	724	maxb = 0;
	725	chkb = vm_pageout_page_count - maxf;
	726	if (chkb) {
	727	for(i = 1; i < chkb;i++) {
	728	vm_page_t tp;
	729
	730	if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
	731	if ((tp->flags & PG_BUSY) \|\|
	732	((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
	733	(tp->flags & PG_CLEANCHK) == 0) \|\|
	734	(tp->busy != 0))
	735	break;
	736	if((tp->queue - tp->pc) == PQ_CACHE) {
	737	vm_page_flag_clear(tp, PG_CLEANCHK);
	738	break;
	739	}
	740	vm_page_test_dirty(tp);
	741	if ((tp->dirty & tp->valid) == 0) {
	742	vm_page_flag_clear(tp, PG_CLEANCHK);
	743	break;
	744	}
	745	mab[ i - 1 ] = tp;
	746	maxb++;
	747	continue;
	748	}
	749	break;
	750	}
	751	}
	752
	753	for(i = 0; i < maxb; i++) {
	754	int index = (maxb - i) - 1;
	755	ma[index] = mab[i];
	756	vm_page_flag_clear(ma[index], PG_CLEANCHK);
	757	}
	758	vm_page_flag_clear(p, PG_CLEANCHK);
	759	ma[maxb] = p;
	760	for(i = 0; i < maxf; i++) {
	761	int index = (maxb + i) + 1;
	762	ma[index] = maf[i];
	763	vm_page_flag_clear(ma[index], PG_CLEANCHK);
	764	}
	765	runlen = maxb + maxf + 1;
	766
	767	vm_pageout_flush(ma, runlen, pagerflags);
	768	for (i = 0; i < runlen; i++) {
	769	if (ma[i]->valid & ma[i]->dirty) {
	770	vm_page_protect(ma[i], VM_PROT_READ);
	771	vm_page_flag_set(ma[i], PG_CLEANCHK);
	772
	773	/*
	774	* maxf will end up being the actual number of pages
	775	* we wrote out contiguously, non-inclusive of the
	776	* first page. We do not count look-behind pages.
	777	*/
	778	if (i >= maxb + 1 && (maxf > i - maxb - 1))
	779	maxf = i - maxb - 1;
	780	}
	781	}
	782	return(maxf + 1);
	783	}
	784
	785	#ifdef not_used
	786	/* XXX I cannot tell if this should be an exported symbol */
	787	/*
	788	* vm_object_deactivate_pages
	789	*
	790	* Deactivate all pages in the specified object. (Keep its pages
	791	* in memory even though it is no longer referenced.)
	792	*
	793	* The object must be locked.
	794	*/
	795	static int vm_object_deactivate_pages_callback(vm_page_t p, void *data);
	796
	797	static void
	798	vm_object_deactivate_pages(vm_object_t object)
	799	{
	800	crit_enter();
	801	vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
	802	vm_object_deactivate_pages_callback, NULL);
	803	crit_exit();
	804	}
	805
	806	static int
	807	vm_object_deactivate_pages_callback(vm_page_t p, void *data __unused)
	808	{
	809	vm_page_deactivate(p);
	810	return(0);
	811	}
	812
	813	#endif
	814
	815	/*
	816	* Same as vm_object_pmap_copy, except range checking really
	817	* works, and is meant for small sections of an object.
	818	*
	819	* This code protects resident pages by making them read-only
	820	* and is typically called on a fork or split when a page
	821	* is converted to copy-on-write.
	822	*
	823	* NOTE: If the page is already at VM_PROT_NONE, calling
	824	* vm_page_protect will have no effect.
	825	*/
	826	void
	827	vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
	828	{
	829	vm_pindex_t idx;
	830	vm_page_t p;
	831
	832	if (object == NULL \|\| (object->flags & OBJ_WRITEABLE) == 0)
	833	return;
	834
	835	/*
	836	* spl protection needed to prevent races between the lookup,
	837	* an interrupt unbusy/free, and our protect call.
	838	*/
	839	crit_enter();
	840	for (idx = start; idx < end; idx++) {
	841	p = vm_page_lookup(object, idx);
	842	if (p == NULL)
	843	continue;
	844	vm_page_protect(p, VM_PROT_READ);
	845	}
	846	crit_exit();
	847	}
	848
	849	/*
	850	* vm_object_pmap_remove:
	851	*
	852	* Removes all physical pages in the specified
	853	* object range from all physical maps.
	854	*
	855	* The object must not be locked.
	856	*/
	857
	858	static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
	859
	860	void
	861	vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
	862	{
	863	struct rb_vm_page_scan_info info;
	864
	865	if (object == NULL)
	866	return;
	867	info.start_pindex = start;
	868	info.end_pindex = end - 1;
	869	crit_enter();
	870	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	871	vm_object_pmap_remove_callback, &info);
	872	if (start == 0 && end == object->size)
	873	vm_object_clear_flag(object, OBJ_WRITEABLE);
	874	crit_exit();
	875	}
	876
	877	static int
	878	vm_object_pmap_remove_callback(vm_page_t p, void *data __unused)
	879	{
	880	vm_page_protect(p, VM_PROT_NONE);
	881	return(0);
	882	}
	883
	884	/*
	885	* vm_object_madvise:
	886	*
	887	* Implements the madvise function at the object/page level.
	888	*
	889	* MADV_WILLNEED (any object)
	890	*
	891	* Activate the specified pages if they are resident.
	892	*
	893	* MADV_DONTNEED (any object)
	894	*
	895	* Deactivate the specified pages if they are resident.
	896	*
	897	* MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects,
	898	* OBJ_ONEMAPPING only)
	899	*
	900	* Deactivate and clean the specified pages if they are
	901	* resident. This permits the process to reuse the pages
	902	* without faulting or the kernel to reclaim the pages
	903	* without I/O.
	904	*/
	905	void
	906	vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
	907	{
	908	vm_pindex_t end, tpindex;
	909	vm_object_t tobject;
	910	vm_page_t m;
	911
	912	if (object == NULL)
	913	return;
	914
	915	end = pindex + count;
	916
	917	/*
	918	* Locate and adjust resident pages
	919	*/
	920
	921	for (; pindex < end; pindex += 1) {
	922	relookup:
	923	tobject = object;
	924	tpindex = pindex;
	925	shadowlookup:
	926	/*
	927	* MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
	928	* and those pages must be OBJ_ONEMAPPING.
	929	*/
	930	if (advise == MADV_FREE) {
	931	if ((tobject->type != OBJT_DEFAULT &&
	932	tobject->type != OBJT_SWAP) \|\|
	933	(tobject->flags & OBJ_ONEMAPPING) == 0) {
	934	continue;
	935	}
	936	}
	937
	938	/*
	939	* spl protection is required to avoid a race between the
	940	* lookup, an interrupt unbusy/free, and our busy check.
	941	*/
	942
	943	crit_enter();
	944	m = vm_page_lookup(tobject, tpindex);
	945
	946	if (m == NULL) {
	947	/*
	948	* There may be swap even if there is no backing page
	949	*/
	950	if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
	951	swap_pager_freespace(tobject, tpindex, 1);
	952
	953	/*
	954	* next object
	955	*/
	956	crit_exit();
	957	if (tobject->backing_object == NULL)
	958	continue;
	959	tpindex += OFF_TO_IDX(tobject->backing_object_offset);
	960	tobject = tobject->backing_object;
	961	goto shadowlookup;
	962	}
	963
	964	/*
	965	* If the page is busy or not in a normal active state,
	966	* we skip it. If the page is not managed there are no
	967	* page queues to mess with. Things can break if we mess
	968	* with pages in any of the below states.
	969	*/
	970	if (
	971	m->hold_count \|\|
	972	m->wire_count \|\|
	973	(m->flags & PG_UNMANAGED) \|\|
	974	m->valid != VM_PAGE_BITS_ALL
	975	) {
	976	crit_exit();
	977	continue;
	978	}
	979
	980	if (vm_page_sleep_busy(m, TRUE, "madvpo")) {
	981	crit_exit();
	982	goto relookup;
	983	}
	984	crit_exit();
	985
	986	/*
	987	* Theoretically once a page is known not to be busy, an
	988	* interrupt cannot come along and rip it out from under us.
	989	*/
	990
	991	if (advise == MADV_WILLNEED) {
	992	vm_page_activate(m);
	993	} else if (advise == MADV_DONTNEED) {
	994	vm_page_dontneed(m);
	995	} else if (advise == MADV_FREE) {
	996	/*
	997	* Mark the page clean. This will allow the page
	998	* to be freed up by the system. However, such pages
	999	* are often reused quickly by malloc()/free()
	1000	* so we do not do anything that would cause
	1001	* a page fault if we can help it.
	1002	*
	1003	* Specifically, we do not try to actually free
	1004	* the page now nor do we try to put it in the
	1005	* cache (which would cause a page fault on reuse).
	1006	*
	1007	* But we do make the page is freeable as we
	1008	* can without actually taking the step of unmapping
	1009	* it.
	1010	*/
	1011	pmap_clear_modify(m);
	1012	m->dirty = 0;
	1013	m->act_count = 0;
	1014	vm_page_dontneed(m);
	1015	if (tobject->type == OBJT_SWAP)
	1016	swap_pager_freespace(tobject, tpindex, 1);
	1017	}
	1018	}
	1019	}
	1020
	1021	/*
	1022	* vm_object_shadow:
	1023	*
	1024	* Create a new object which is backed by the
	1025	* specified existing object range. The source
	1026	* object reference is deallocated.
	1027	*
	1028	* The new object and offset into that object
	1029	* are returned in the source parameters.
	1030	*/
	1031
	1032	void
	1033	vm_object_shadow(vm_object_t object, / IN/OUT */
	1034	vm_ooffset_t offset, / IN/OUT */
	1035	vm_size_t length)
	1036	{
	1037	vm_object_t source;
	1038	vm_object_t result;
	1039
	1040	source = *object;
	1041
	1042	/*
	1043	* Don't create the new object if the old object isn't shared.
	1044	*/
	1045
	1046	if (source != NULL &&
	1047	source->ref_count == 1 &&
	1048	source->handle == NULL &&
	1049	(source->type == OBJT_DEFAULT \|\|
	1050	source->type == OBJT_SWAP))
	1051	return;
	1052
	1053	/*
	1054	* Allocate a new object with the given length
	1055	*/
	1056
	1057	if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
	1058	panic("vm_object_shadow: no object for shadowing");
	1059
	1060	/*
	1061	* The new object shadows the source object, adding a reference to it.
	1062	* Our caller changes his reference to point to the new object,
	1063	* removing a reference to the source object. Net result: no change
	1064	* of reference count.
	1065	*
	1066	* Try to optimize the result object's page color when shadowing
	1067	* in order to maintain page coloring consistency in the combined
	1068	* shadowed object.
	1069	*/
	1070	result->backing_object = source;
	1071	if (source) {
	1072	LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
	1073	source->shadow_count++;
	1074	source->generation++;
	1075	result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & PQ_L2_MASK;
	1076	}
	1077
	1078	/*
	1079	* Store the offset into the source object, and fix up the offset into
	1080	* the new object.
	1081	*/
	1082
	1083	result->backing_object_offset = *offset;
	1084
	1085	/*
	1086	* Return the new things
	1087	*/
	1088
	1089	*offset = 0;
	1090	*object = result;
	1091	}
	1092
	1093	#define OBSC_TEST_ALL_SHADOWED 0x0001
	1094	#define OBSC_COLLAPSE_NOWAIT 0x0002
	1095	#define OBSC_COLLAPSE_WAIT 0x0004
	1096
	1097	static int vm_object_backing_scan_callback(vm_page_t p, void *data);
	1098
	1099	static __inline int
	1100	vm_object_backing_scan(vm_object_t object, int op)
	1101	{
	1102	struct rb_vm_page_scan_info info;
	1103	vm_object_t backing_object;
	1104
	1105	/*
	1106	* spl protection is required to avoid races between the memq/lookup,
	1107	* an interrupt doing an unbusy/free, and our busy check. Amoung
	1108	* other things.
	1109	*/
	1110	crit_enter();
	1111
	1112	backing_object = object->backing_object;
	1113	info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
	1114
	1115	/*
	1116	* Initial conditions
	1117	*/
	1118
	1119	if (op & OBSC_TEST_ALL_SHADOWED) {
	1120	/*
	1121	* We do not want to have to test for the existence of
	1122	* swap pages in the backing object. XXX but with the
	1123	* new swapper this would be pretty easy to do.
	1124	*
	1125	* XXX what about anonymous MAP_SHARED memory that hasn't
	1126	* been ZFOD faulted yet? If we do not test for this, the
	1127	* shadow test may succeed! XXX
	1128	*/
	1129	if (backing_object->type != OBJT_DEFAULT) {
	1130	crit_exit();
	1131	return(0);
	1132	}
	1133	}
	1134	if (op & OBSC_COLLAPSE_WAIT) {
	1135	KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
	1136	vm_object_set_flag(backing_object, OBJ_DEAD);
	1137	}
	1138
	1139	/*
	1140	* Our scan. We have to retry if a negative error code is returned,
	1141	* otherwise 0 or 1 will be returned in info.error. 0 Indicates that
	1142	* the scan had to be stopped because the parent does not completely
	1143	* shadow the child.
	1144	*/
	1145	info.object = object;
	1146	info.backing_object = backing_object;
	1147	info.limit = op;
	1148	do {
	1149	info.error = 1;
	1150	vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
	1151	vm_object_backing_scan_callback,
	1152	&info);
	1153	} while (info.error < 0);
	1154	crit_exit();
	1155	return(info.error);
	1156	}
	1157
	1158	static int
	1159	vm_object_backing_scan_callback(vm_page_t p, void *data)
	1160	{
	1161	struct rb_vm_page_scan_info *info = data;
	1162	vm_object_t backing_object;
	1163	vm_object_t object;
	1164	vm_pindex_t new_pindex;
	1165	vm_pindex_t backing_offset_index;
	1166	int op;
	1167
	1168	new_pindex = p->pindex - info->backing_offset_index;
	1169	op = info->limit;
	1170	object = info->object;
	1171	backing_object = info->backing_object;
	1172	backing_offset_index = info->backing_offset_index;
	1173
	1174	if (op & OBSC_TEST_ALL_SHADOWED) {
	1175	vm_page_t pp;
	1176
	1177	/*
	1178	* Ignore pages outside the parent object's range
	1179	* and outside the parent object's mapping of the
	1180	* backing object.
	1181	*
	1182	* note that we do not busy the backing object's
	1183	* page.
	1184	*/
	1185	if (
	1186	p->pindex < backing_offset_index \|\|
	1187	new_pindex >= object->size
	1188	) {
	1189	return(0);
	1190	}
	1191
	1192	/*
	1193	* See if the parent has the page or if the parent's
	1194	* object pager has the page. If the parent has the
	1195	* page but the page is not valid, the parent's
	1196	* object pager must have the page.
	1197	*
	1198	* If this fails, the parent does not completely shadow
	1199	* the object and we might as well give up now.
	1200	*/
	1201
	1202	pp = vm_page_lookup(object, new_pindex);
	1203	if ((pp == NULL \|\| pp->valid == 0) &&
	1204	!vm_pager_has_page(object, new_pindex)
	1205	) {
	1206	info->error = 0; /* problemo */
	1207	return(-1); /* stop the scan */
	1208	}
	1209	}
	1210
	1211	/*
	1212	* Check for busy page
	1213	*/
	1214
	1215	if (op & (OBSC_COLLAPSE_WAIT \| OBSC_COLLAPSE_NOWAIT)) {
	1216	vm_page_t pp;
	1217
	1218	if (op & OBSC_COLLAPSE_NOWAIT) {
	1219	if (
	1220	(p->flags & PG_BUSY) \|\|
	1221	!p->valid \|\|
	1222	p->hold_count \|\|
	1223	p->wire_count \|\|
	1224	p->busy
	1225	) {
	1226	return(0);
	1227	}
	1228	} else if (op & OBSC_COLLAPSE_WAIT) {
	1229	if (vm_page_sleep_busy(p, TRUE, "vmocol")) {
	1230	/*
	1231	* If we slept, anything could have
	1232	* happened. Ask that the scan be restarted.
	1233	*
	1234	* Since the object is marked dead, the
	1235	* backing offset should not have changed.
	1236	*/
	1237	info->error = -1;
	1238	return(-1);
	1239	}
	1240	}
	1241
	1242	/*
	1243	* Busy the page
	1244	*/
	1245	vm_page_busy(p);
	1246
	1247	KASSERT(
	1248	p->object == backing_object,
	1249	("vm_object_qcollapse(): object mismatch")
	1250	);
	1251
	1252	/*
	1253	* Destroy any associated swap
	1254	*/
	1255	if (backing_object->type == OBJT_SWAP)
	1256	swap_pager_freespace(backing_object, p->pindex, 1);
	1257
	1258	if (
	1259	p->pindex < backing_offset_index \|\|
	1260	new_pindex >= object->size
	1261	) {
	1262	/*
	1263	* Page is out of the parent object's range, we
	1264	* can simply destroy it.
	1265	*/
	1266	vm_page_protect(p, VM_PROT_NONE);
	1267	vm_page_free(p);
	1268	return(0);
	1269	}
	1270
	1271	pp = vm_page_lookup(object, new_pindex);
	1272	if (pp != NULL \|\| vm_pager_has_page(object, new_pindex)) {
	1273	/*
	1274	* page already exists in parent OR swap exists
	1275	* for this location in the parent. Destroy
	1276	* the original page from the backing object.
	1277	*
	1278	* Leave the parent's page alone
	1279	*/
	1280	vm_page_protect(p, VM_PROT_NONE);
	1281	vm_page_free(p);
	1282	return(0);
	1283	}
	1284
	1285	/*
	1286	* Page does not exist in parent, rename the
	1287	* page from the backing object to the main object.
	1288	*
	1289	* If the page was mapped to a process, it can remain
	1290	* mapped through the rename.
	1291	*/
	1292	if ((p->queue - p->pc) == PQ_CACHE)
	1293	vm_page_deactivate(p);
	1294
	1295	vm_page_rename(p, object, new_pindex);
	1296	/* page automatically made dirty by rename */
	1297	}
	1298	return(0);
	1299	}
	1300
	1301	/*
	1302	* this version of collapse allows the operation to occur earlier and
	1303	* when paging_in_progress is true for an object... This is not a complete
	1304	* operation, but should plug 99.9% of the rest of the leaks.
	1305	*/
	1306	static void
	1307	vm_object_qcollapse(vm_object_t object)
	1308	{
	1309	vm_object_t backing_object = object->backing_object;
	1310
	1311	if (backing_object->ref_count != 1)
	1312	return;
	1313
	1314	backing_object->ref_count += 2;
	1315
	1316	vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
	1317
	1318	backing_object->ref_count -= 2;
	1319	}
	1320
	1321	/*
	1322	* vm_object_collapse:
	1323	*
	1324	* Collapse an object with the object backing it.
	1325	* Pages in the backing object are moved into the
	1326	* parent, and the backing object is deallocated.
	1327	*/
	1328	void
	1329	vm_object_collapse(vm_object_t object)
	1330	{
	1331	while (TRUE) {
	1332	vm_object_t backing_object;
	1333
	1334	/*
	1335	* Verify that the conditions are right for collapse:
	1336	*
	1337	* The object exists and the backing object exists.
	1338	*/
	1339	if (object == NULL)
	1340	break;
	1341
	1342	if ((backing_object = object->backing_object) == NULL)
	1343	break;
	1344
	1345	/*
	1346	* we check the backing object first, because it is most likely
	1347	* not collapsable.
	1348	*/
	1349	if (backing_object->handle != NULL \|\|
	1350	(backing_object->type != OBJT_DEFAULT &&
	1351	backing_object->type != OBJT_SWAP) \|\|
	1352	(backing_object->flags & OBJ_DEAD) \|\|
	1353	object->handle != NULL \|\|
	1354	(object->type != OBJT_DEFAULT &&
	1355	object->type != OBJT_SWAP) \|\|
	1356	(object->flags & OBJ_DEAD)) {
	1357	break;
	1358	}
	1359
	1360	if (
	1361	object->paging_in_progress != 0 \|\|
	1362	backing_object->paging_in_progress != 0
	1363	) {
	1364	vm_object_qcollapse(object);
	1365	break;
	1366	}
	1367
	1368	/*
	1369	* We know that we can either collapse the backing object (if
	1370	* the parent is the only reference to it) or (perhaps) have
	1371	* the parent bypass the object if the parent happens to shadow
	1372	* all the resident pages in the entire backing object.
	1373	*
	1374	* This is ignoring pager-backed pages such as swap pages.
	1375	* vm_object_backing_scan fails the shadowing test in this
	1376	* case.
	1377	*/
	1378
	1379	if (backing_object->ref_count == 1) {
	1380	/*
	1381	* If there is exactly one reference to the backing
	1382	* object, we can collapse it into the parent.
	1383	*/
	1384	vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
	1385
	1386	/*
	1387	* Move the pager from backing_object to object.
	1388	*/
	1389
	1390	if (backing_object->type == OBJT_SWAP) {
	1391	vm_object_pip_add(backing_object, 1);
	1392
	1393	/*
	1394	* scrap the paging_offset junk and do a
	1395	* discrete copy. This also removes major
	1396	* assumptions about how the swap-pager
	1397	* works from where it doesn't belong. The
	1398	* new swapper is able to optimize the
	1399	* destroy-source case.
	1400	*/
	1401
	1402	vm_object_pip_add(object, 1);
	1403	swap_pager_copy(
	1404	backing_object,
	1405	object,
	1406	OFF_TO_IDX(object->backing_object_offset), TRUE);
	1407	vm_object_pip_wakeup(object);
	1408
	1409	vm_object_pip_wakeup(backing_object);
	1410	}
	1411	/*
	1412	* Object now shadows whatever backing_object did.
	1413	* Note that the reference to
	1414	* backing_object->backing_object moves from within
	1415	* backing_object to within object.
	1416	*/
	1417
	1418	LIST_REMOVE(object, shadow_list);
	1419	object->backing_object->shadow_count--;
	1420	object->backing_object->generation++;
	1421	if (backing_object->backing_object) {
	1422	LIST_REMOVE(backing_object, shadow_list);
	1423	backing_object->backing_object->shadow_count--;
	1424	backing_object->backing_object->generation++;
	1425	}
	1426	object->backing_object = backing_object->backing_object;
	1427	if (object->backing_object) {
	1428	LIST_INSERT_HEAD(
	1429	&object->backing_object->shadow_head,
	1430	object,
	1431	shadow_list
	1432	);
	1433	object->backing_object->shadow_count++;
	1434	object->backing_object->generation++;
	1435	}
	1436
	1437	object->backing_object_offset +=
	1438	backing_object->backing_object_offset;
	1439
	1440	/*
	1441	* Discard backing_object.
	1442	*
	1443	* Since the backing object has no pages, no pager left,
	1444	* and no object references within it, all that is
	1445	* necessary is to dispose of it.
	1446	*/
	1447
	1448	KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
	1449	KASSERT(RB_EMPTY(&backing_object->rb_memq), ("backing_object %p somehow has left over pages during collapse!", backing_object));
	1450	crit_enter();
	1451	TAILQ_REMOVE(
	1452	&vm_object_list,
	1453	backing_object,
	1454	object_list
	1455	);
	1456	vm_object_count--;
	1457	crit_exit();
	1458
	1459	zfree(obj_zone, backing_object);
	1460
	1461	object_collapses++;
	1462	} else {
	1463	vm_object_t new_backing_object;
	1464
	1465	/*
	1466	* If we do not entirely shadow the backing object,
	1467	* there is nothing we can do so we give up.
	1468	*/
	1469
	1470	if (vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) {
	1471	break;
	1472	}
	1473
	1474	/*
	1475	* Make the parent shadow the next object in the
	1476	* chain. Deallocating backing_object will not remove
	1477	* it, since its reference count is at least 2.
	1478	*/
	1479
	1480	LIST_REMOVE(object, shadow_list);
	1481	backing_object->shadow_count--;
	1482	backing_object->generation++;
	1483
	1484	new_backing_object = backing_object->backing_object;
	1485	if ((object->backing_object = new_backing_object) != NULL) {
	1486	vm_object_reference(new_backing_object);
	1487	LIST_INSERT_HEAD(
	1488	&new_backing_object->shadow_head,
	1489	object,
	1490	shadow_list
	1491	);
	1492	new_backing_object->shadow_count++;
	1493	new_backing_object->generation++;
	1494	object->backing_object_offset +=
	1495	backing_object->backing_object_offset;
	1496	}
	1497
	1498	/*
	1499	* Drop the reference count on backing_object. Since
	1500	* its ref_count was at least 2, it will not vanish;
	1501	* so we don't need to call vm_object_deallocate, but
	1502	* we do anyway.
	1503	*/
	1504	vm_object_deallocate(backing_object);
	1505	object_bypasses++;
	1506	}
	1507
	1508	/*
	1509	* Try again with this object's new backing object.
	1510	*/
	1511	}
	1512	}
	1513
	1514	/*
	1515	* vm_object_page_remove: [internal]
	1516	*
	1517	* Removes all physical pages in the specified
	1518	* object range from the object's list of pages.
	1519	*/
	1520	static int vm_object_page_remove_callback(vm_page_t p, void *data);
	1521
	1522	void
	1523	vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
	1524	boolean_t clean_only)
	1525	{
	1526	struct rb_vm_page_scan_info info;
	1527	int all;
	1528
	1529	/*
	1530	* Degenerate cases and assertions
	1531	*/
	1532	if (object == NULL \|\| object->resident_page_count == 0)
	1533	return;
	1534	KASSERT(object->type != OBJT_PHYS,
	1535	("attempt to remove pages from a physical object"));
	1536
	1537	/*
	1538	* Indicate that paging is occuring on the object
	1539	*/
	1540	crit_enter();
	1541	vm_object_pip_add(object, 1);
	1542
	1543	/*
	1544	* Figure out the actual removal range and whether we are removing
	1545	* the entire contents of the object or not. If removing the entire
	1546	* contents, be sure to get all pages, even those that might be
	1547	* beyond the end of the object.
	1548	*/
	1549	info.start_pindex = start;
	1550	if (end == 0)
	1551	info.end_pindex = (vm_pindex_t)-1;
	1552	else
	1553	info.end_pindex = end - 1;
	1554	info.limit = clean_only;
	1555	all = (start == 0 && info.end_pindex >= object->size - 1);
	1556
	1557	/*
	1558	* Loop until we are sure we have gotten them all.
	1559	*/
	1560	do {
	1561	info.error = 0;
	1562	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	1563	vm_object_page_remove_callback, &info);
	1564	} while (info.error);
	1565
	1566	/*
	1567	* Remove any related swap if throwing away pages, or for
	1568	* non-swap objects (the swap is a clean copy in that case).
	1569	*/
	1570	if (object->type != OBJT_SWAP \|\| clean_only == FALSE) {
	1571	if (all)
	1572	swap_pager_freespace_all(object);
	1573	else
	1574	swap_pager_freespace(object, info.start_pindex,
	1575	info.end_pindex - info.start_pindex + 1);
	1576	}
	1577
	1578	/*
	1579	* Cleanup
	1580	*/
	1581	vm_object_pip_wakeup(object);
	1582	crit_exit();
	1583	}
	1584
	1585	static int
	1586	vm_object_page_remove_callback(vm_page_t p, void *data)
	1587	{
	1588	struct rb_vm_page_scan_info *info = data;
	1589
	1590	/*
	1591	* Wired pages cannot be destroyed, but they can be invalidated
	1592	* and we do so if clean_only (limit) is not set.
	1593	*
	1594	* WARNING! The page may be wired due to being part of a buffer
	1595	* cache buffer, and the buffer might be marked B_CACHE.
	1596	* This is fine as part of a truncation but VFSs must be
	1597	* sure to fix the buffer up when re-extending the file.
	1598	*/
	1599	if (p->wire_count != 0) {
	1600	vm_page_protect(p, VM_PROT_NONE);
	1601	if (info->limit == 0)
	1602	p->valid = 0;
	1603	return(0);
	1604	}
	1605
	1606	/*
	1607	* The busy flags are only cleared at
	1608	* interrupt -- minimize the spl transitions
	1609	*/
	1610
	1611	if (vm_page_sleep_busy(p, TRUE, "vmopar")) {
	1612	info->error = 1;
	1613	return(0);
	1614	}
	1615
	1616	/*
	1617	* limit is our clean_only flag. If set and the page is dirty, do
	1618	* not free it. If set and the page is being held by someone, do
	1619	* not free it.
	1620	*/
	1621	if (info->limit && p->valid) {
	1622	vm_page_test_dirty(p);
	1623	if (p->valid & p->dirty)
	1624	return(0);
	1625	if (p->hold_count)
	1626	return(0);
	1627	}
	1628
	1629	/*
	1630	* Destroy the page
	1631	*/
	1632	vm_page_busy(p);
	1633	vm_page_protect(p, VM_PROT_NONE);
	1634	vm_page_free(p);
	1635	return(0);
	1636	}
	1637
	1638	/*
	1639	* Routine: vm_object_coalesce
	1640	* Function: Coalesces two objects backing up adjoining
	1641	* regions of memory into a single object.
	1642	*
	1643	* returns TRUE if objects were combined.
	1644	*
	1645	* NOTE: Only works at the moment if the second object is NULL -
	1646	* if it's not, which object do we lock first?
	1647	*
	1648	* Parameters:
	1649	* prev_object First object to coalesce
	1650	* prev_offset Offset into prev_object
	1651	* next_object Second object into coalesce
	1652	* next_offset Offset into next_object
	1653	*
	1654	* prev_size Size of reference to prev_object
	1655	* next_size Size of reference to next_object
	1656	*
	1657	* Conditions:
	1658	* The object must not be locked.
	1659	*/
	1660	boolean_t
	1661	vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
	1662	vm_size_t prev_size, vm_size_t next_size)
	1663	{
	1664	vm_pindex_t next_pindex;
	1665
	1666	if (prev_object == NULL) {
	1667	return (TRUE);
	1668	}
	1669
	1670	if (prev_object->type != OBJT_DEFAULT &&
	1671	prev_object->type != OBJT_SWAP) {
	1672	return (FALSE);
	1673	}
	1674
	1675	/*
	1676	* Try to collapse the object first
	1677	*/
	1678	vm_object_collapse(prev_object);
	1679
	1680	/*
	1681	* Can't coalesce if: . more than one reference . paged out . shadows
	1682	* another object . has a copy elsewhere (any of which mean that the
	1683	* pages not mapped to prev_entry may be in use anyway)
	1684	*/
	1685
	1686	if (prev_object->backing_object != NULL) {
	1687	return (FALSE);
	1688	}
	1689
	1690	prev_size >>= PAGE_SHIFT;
	1691	next_size >>= PAGE_SHIFT;
	1692	next_pindex = prev_pindex + prev_size;
	1693
	1694	if ((prev_object->ref_count > 1) &&
	1695	(prev_object->size != next_pindex)) {
	1696	return (FALSE);
	1697	}
	1698
	1699	/*
	1700	* Remove any pages that may still be in the object from a previous
	1701	* deallocation.
	1702	*/
	1703	if (next_pindex < prev_object->size) {
	1704	vm_object_page_remove(prev_object,
	1705	next_pindex,
	1706	next_pindex + next_size, FALSE);
	1707	if (prev_object->type == OBJT_SWAP)
	1708	swap_pager_freespace(prev_object,
	1709	next_pindex, next_size);
	1710	}
	1711
	1712	/*
	1713	* Extend the object if necessary.
	1714	*/
	1715	if (next_pindex + next_size > prev_object->size)
	1716	prev_object->size = next_pindex + next_size;
	1717
	1718	return (TRUE);
	1719	}
	1720
	1721	void
	1722	vm_object_set_writeable_dirty(vm_object_t object)
	1723	{
	1724	struct vnode *vp;
	1725
	1726	vm_object_set_flag(object, OBJ_WRITEABLE\|OBJ_MIGHTBEDIRTY);
	1727	if (object->type == OBJT_VNODE &&
	1728	(vp = (struct vnode *)object->handle) != NULL) {
	1729	if ((vp->v_flag & VOBJDIRTY) == 0) {
	1730	vsetflags(vp, VOBJDIRTY);
	1731	}
	1732	}
	1733	}
	1734
	1735
	1736
	1737	#include "opt_ddb.h"
	1738	#ifdef DDB
	1739	#include <sys/kernel.h>
	1740
	1741	#include <sys/cons.h>
	1742
	1743	#include <ddb/ddb.h>
	1744
	1745	static int _vm_object_in_map (vm_map_t map, vm_object_t object,
	1746	vm_map_entry_t entry);
	1747	static int vm_object_in_map (vm_object_t object);
	1748
	1749	static int
	1750	_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
	1751	{
	1752	vm_map_t tmpm;
	1753	vm_map_entry_t tmpe;
	1754	vm_object_t obj;
	1755	int entcount;
	1756
	1757	if (map == 0)
	1758	return 0;
	1759	if (entry == 0) {
	1760	tmpe = map->header.next;
	1761	entcount = map->nentries;
	1762	while (entcount-- && (tmpe != &map->header)) {
	1763	if( _vm_object_in_map(map, object, tmpe)) {
	1764	return 1;
	1765	}
	1766	tmpe = tmpe->next;
	1767	}
	1768	return (0);
	1769	}
	1770	switch(entry->maptype) {
	1771	case VM_MAPTYPE_SUBMAP:
	1772	tmpm = entry->object.sub_map;
	1773	tmpe = tmpm->header.next;
	1774	entcount = tmpm->nentries;
	1775	while (entcount-- && tmpe != &tmpm->header) {
	1776	if( _vm_object_in_map(tmpm, object, tmpe)) {
	1777	return 1;
	1778	}
	1779	tmpe = tmpe->next;
	1780	}
	1781	break;
	1782	case VM_MAPTYPE_NORMAL:
	1783	case VM_MAPTYPE_VPAGETABLE:
	1784	obj = entry->object.vm_object;
	1785	while (obj) {
	1786	if (obj == object)
	1787	return 1;
	1788	obj = obj->backing_object;
	1789	}
	1790	break;
	1791	default:
	1792	break;
	1793	}
	1794	return 0;
	1795	}
	1796
	1797	static int vm_object_in_map_callback(struct proc p, void data);
	1798
	1799	struct vm_object_in_map_info {
	1800	vm_object_t object;
	1801	int rv;
	1802	};
	1803
	1804	static int
	1805	vm_object_in_map(vm_object_t object)
	1806	{
	1807	struct vm_object_in_map_info info;
	1808
	1809	info.rv = 0;
	1810	info.object = object;
	1811
	1812	allproc_scan(vm_object_in_map_callback, &info);
	1813	if (info.rv)
	1814	return 1;
	1815	if( _vm_object_in_map(&kernel_map, object, 0))
	1816	return 1;
	1817	if( _vm_object_in_map(&pager_map, object, 0))
	1818	return 1;
	1819	if( _vm_object_in_map(&buffer_map, object, 0))
	1820	return 1;
	1821	return 0;
	1822	}
	1823
	1824	static int
	1825	vm_object_in_map_callback(struct proc p, void data)
	1826	{
	1827	struct vm_object_in_map_info *info = data;
	1828
	1829	if (p->p_vmspace) {
	1830	if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
	1831	info->rv = 1;
	1832	return -1;
	1833	}
	1834	}
	1835	return (0);
	1836	}
	1837
	1838	DB_SHOW_COMMAND(vmochk, vm_object_check)
	1839	{
	1840	vm_object_t object;
	1841
	1842	/*
	1843	* make sure that internal objs are in a map somewhere
	1844	* and none have zero ref counts.
	1845	*/
	1846	for (object = TAILQ_FIRST(&vm_object_list);
	1847	object != NULL;
	1848	object = TAILQ_NEXT(object, object_list)) {
	1849	if (object->handle == NULL &&
	1850	(object->type == OBJT_DEFAULT \|\| object->type == OBJT_SWAP)) {
	1851	if (object->ref_count == 0) {
	1852	db_printf("vmochk: internal obj has zero ref count: %ld\n",
	1853	(long)object->size);
	1854	}
	1855	if (!vm_object_in_map(object)) {
	1856	db_printf(
	1857	"vmochk: internal obj is not in a map: "
	1858	"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
	1859	object->ref_count, (u_long)object->size,
	1860	(u_long)object->size,
	1861	(void *)object->backing_object);
	1862	}
	1863	}
	1864	}
	1865	}
	1866
	1867	/*
	1868	* vm_object_print: [ debug ]
	1869	*/
	1870	DB_SHOW_COMMAND(object, vm_object_print_static)
	1871	{
	1872	/* XXX convert args. */
	1873	vm_object_t object = (vm_object_t)addr;
	1874	boolean_t full = have_addr;
	1875
	1876	vm_page_t p;
	1877
	1878	/* XXX count is an (unused) arg. Avoid shadowing it. */
	1879	#define count was_count
	1880
	1881	int count;
	1882
	1883	if (object == NULL)
	1884	return;
	1885
	1886	db_iprintf(
	1887	"Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
	1888	object, (int)object->type, (u_long)object->size,
	1889	object->resident_page_count, object->ref_count, object->flags);
	1890	/*
	1891	* XXX no %qd in kernel. Truncate object->backing_object_offset.
	1892	*/
	1893	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
	1894	object->shadow_count,
	1895	object->backing_object ? object->backing_object->ref_count : 0,
	1896	object->backing_object, (long)object->backing_object_offset);
	1897
	1898	if (!full)
	1899	return;
	1900
	1901	db_indent += 2;
	1902	count = 0;
	1903	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
	1904	if (count == 0)
	1905	db_iprintf("memory:=");
	1906	else if (count == 6) {
	1907	db_printf("\n");
	1908	db_iprintf(" ...");
	1909	count = 0;
	1910	} else
	1911	db_printf(",");
	1912	count++;
	1913
	1914	db_printf("(off=0x%lx,page=0x%lx)",
	1915	(u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	1916	}
	1917	if (count != 0)
	1918	db_printf("\n");
	1919	db_indent -= 2;
	1920	}
	1921
	1922	/* XXX. */
	1923	#undef count
	1924
	1925	/* XXX need this non-static entry for calling from vm_map_print. */
	1926	void
	1927	vm_object_print(/* db_expr_t */ long addr,
	1928	boolean_t have_addr,
	1929	/* db_expr_t */ long count,
	1930	char *modif)
	1931	{
	1932	vm_object_print_static(addr, have_addr, count, modif);
	1933	}
	1934
	1935	DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
	1936	{
	1937	vm_object_t object;
	1938	int nl = 0;
	1939	int c;
	1940	for (object = TAILQ_FIRST(&vm_object_list);
	1941	object != NULL;
	1942	object = TAILQ_NEXT(object, object_list)) {
	1943	vm_pindex_t idx, fidx;
	1944	vm_pindex_t osize;
	1945	vm_paddr_t pa = -1, padiff;
	1946	int rcount;
	1947	vm_page_t m;
	1948
	1949	db_printf("new object: %p\n", (void *)object);
	1950	if ( nl > 18) {
	1951	c = cngetc();
	1952	if (c != ' ')
	1953	return;
	1954	nl = 0;
	1955	}
	1956	nl++;
	1957	rcount = 0;
	1958	fidx = 0;
	1959	osize = object->size;
	1960	if (osize > 128)
	1961	osize = 128;
	1962	for (idx = 0; idx < osize; idx++) {
	1963	m = vm_page_lookup(object, idx);
	1964	if (m == NULL) {
	1965	if (rcount) {
	1966	db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
	1967	(long)fidx, rcount, (long)pa);
	1968	if ( nl > 18) {
	1969	c = cngetc();
	1970	if (c != ' ')
	1971	return;
	1972	nl = 0;
	1973	}
	1974	nl++;
	1975	rcount = 0;
	1976	}
	1977	continue;
	1978	}
	1979
	1980
	1981	if (rcount &&
	1982	(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
	1983	++rcount;
	1984	continue;
	1985	}
	1986	if (rcount) {
	1987	padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
	1988	padiff >>= PAGE_SHIFT;
	1989	padiff &= PQ_L2_MASK;
	1990	if (padiff == 0) {
	1991	pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
	1992	++rcount;
	1993	continue;
	1994	}
	1995	db_printf(" index(%ld)run(%d)pa(0x%lx)",
	1996	(long)fidx, rcount, (long)pa);
	1997	db_printf("pd(%ld)\n", (long)padiff);
	1998	if ( nl > 18) {
	1999	c = cngetc();
	2000	if (c != ' ')
	2001	return;
	2002	nl = 0;
	2003	}
	2004	nl++;
	2005	}
	2006	fidx = idx;
	2007	pa = VM_PAGE_TO_PHYS(m);
	2008	rcount = 1;
	2009	}
	2010	if (rcount) {
	2011	db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
	2012	(long)fidx, rcount, (long)pa);
	2013	if ( nl > 18) {
	2014	c = cngetc();
	2015	if (c != ' ')
	2016	return;
	2017	nl = 0;
	2018	}
	2019	nl++;
	2020	}
	2021	}
	2022	}
	2023	#endif /* DDB */