kernel - more SMP optimizations in the VM system
[dragonfly.git] / sys / vm / vm_object.c
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62
63 /*
64  *      Virtual memory object module.
65  */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>           /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91
92 #include <vm/vm_page2.h>
93
94 #include <machine/specialreg.h>
95
96 #define EASY_SCAN_FACTOR        8
97
98 static void     vm_object_qcollapse(vm_object_t object,
99                                     vm_object_t backing_object);
100 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
101                                              int pagerflags);
102 static void     vm_object_lock_init(vm_object_t);
103
104
105 /*
106  *      Virtual memory objects maintain the actual data
107  *      associated with allocated virtual memory.  A given
108  *      page of memory exists within exactly one object.
109  *
110  *      An object is only deallocated when all "references"
111  *      are given up.  Only one "reference" to a given
112  *      region of an object should be writeable.
113  *
114  *      Associated with each object is a list of all resident
115  *      memory pages belonging to that object; this list is
116  *      maintained by the "vm_page" module, and locked by the object's
117  *      lock.
118  *
119  *      Each object also records a "pager" routine which is
120  *      used to retrieve (and store) pages to the proper backing
121  *      storage.  In addition, objects may be backed by other
122  *      objects from which they were virtual-copied.
123  *
124  *      The only items within the object structure which are
125  *      modified after time of creation are:
126  *              reference count         locked by object's lock
127  *              pager routine           locked by object's lock
128  *
129  */
130
131 struct object_q vm_object_list;         /* locked by vmobj_token */
132 struct vm_object kernel_object;
133
134 static long vm_object_count;            /* locked by vmobj_token */
135
136 static long object_collapses;
137 static long object_bypasses;
138 static int next_index;
139 static vm_zone_t obj_zone;
140 static struct vm_zone obj_zone_store;
141 #define VM_OBJECTS_INIT 256
142 static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
143
144 /*
145  * Misc low level routines
146  */
147 static void
148 vm_object_lock_init(vm_object_t obj)
149 {
150 #if defined(DEBUG_LOCKS)
151         int i;
152
153         obj->debug_hold_bitmap = 0;
154         obj->debug_hold_ovfl = 0;
155         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
156                 obj->debug_hold_thrs[i] = NULL;
157                 obj->debug_hold_file[i] = NULL;
158                 obj->debug_hold_line[i] = 0;
159         }
160 #endif
161 }
162
163 void
164 vm_object_lock_swap(void)
165 {
166         lwkt_token_swap();
167 }
168
169 void
170 vm_object_lock(vm_object_t obj)
171 {
172         lwkt_gettoken(&obj->token);
173 }
174
175 /*
176  * Returns TRUE on sucesss
177  */
178 static int
179 vm_object_lock_try(vm_object_t obj)
180 {
181         return(lwkt_trytoken(&obj->token));
182 }
183
184 void
185 vm_object_lock_shared(vm_object_t obj)
186 {
187         lwkt_gettoken_shared(&obj->token);
188 }
189
190 void
191 vm_object_unlock(vm_object_t obj)
192 {
193         lwkt_reltoken(&obj->token);
194 }
195
196 void
197 vm_object_upgrade(vm_object_t obj)
198 {
199         lwkt_reltoken(&obj->token);
200         lwkt_gettoken(&obj->token);
201 }
202
203 void
204 vm_object_downgrade(vm_object_t obj)
205 {
206         lwkt_reltoken(&obj->token);
207         lwkt_gettoken_shared(&obj->token);
208 }
209
210 static __inline void
211 vm_object_assert_held(vm_object_t obj)
212 {
213         ASSERT_LWKT_TOKEN_HELD(&obj->token);
214 }
215
216 void
217 #ifndef DEBUG_LOCKS
218 vm_object_hold(vm_object_t obj)
219 #else
220 debugvm_object_hold(vm_object_t obj, char *file, int line)
221 #endif
222 {
223         KKASSERT(obj != NULL);
224
225         /*
226          * Object must be held (object allocation is stable due to callers
227          * context, typically already holding the token on a parent object)
228          * prior to potentially blocking on the lock, otherwise the object
229          * can get ripped away from us.
230          */
231         refcount_acquire(&obj->hold_count);
232         vm_object_lock(obj);
233
234 #if defined(DEBUG_LOCKS)
235         int i;
236         u_int mask;
237
238         for (;;) {
239                 mask = ~obj->debug_hold_bitmap;
240                 cpu_ccfence();
241                 if (mask == 0xFFFFFFFFU) {
242                         if (obj->debug_hold_ovfl == 0)
243                                 obj->debug_hold_ovfl = 1;
244                         break;
245                 }
246                 i = ffs(mask) - 1;
247                 if (atomic_cmpset_int(&obj->debug_hold_bitmap, ~mask,
248                                       ~mask | (1 << i))) {
249                         obj->debug_hold_bitmap |= (1 << i);
250                         obj->debug_hold_thrs[i] = curthread;
251                         obj->debug_hold_file[i] = file;
252                         obj->debug_hold_line[i] = line;
253                         break;
254                 }
255         }
256 #endif
257 }
258
259 int
260 #ifndef DEBUG_LOCKS
261 vm_object_hold_try(vm_object_t obj)
262 #else
263 debugvm_object_hold_try(vm_object_t obj, char *file, int line)
264 #endif
265 {
266         KKASSERT(obj != NULL);
267
268         /*
269          * Object must be held (object allocation is stable due to callers
270          * context, typically already holding the token on a parent object)
271          * prior to potentially blocking on the lock, otherwise the object
272          * can get ripped away from us.
273          */
274         refcount_acquire(&obj->hold_count);
275         if (vm_object_lock_try(obj) == 0) {
276                 if (refcount_release(&obj->hold_count)) {
277                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
278                                 zfree(obj_zone, obj);
279                 }
280                 return(0);
281         }
282
283 #if defined(DEBUG_LOCKS)
284         int i;
285         u_int mask;
286
287         for (;;) {
288                 mask = ~obj->debug_hold_bitmap;
289                 cpu_ccfence();
290                 if (mask == 0xFFFFFFFFU) {
291                         if (obj->debug_hold_ovfl == 0)
292                                 obj->debug_hold_ovfl = 1;
293                         break;
294                 }
295                 i = ffs(mask) - 1;
296                 if (atomic_cmpset_int(&obj->debug_hold_bitmap, ~mask,
297                                       ~mask | (1 << i))) {
298                         obj->debug_hold_bitmap |= (1 << i);
299                         obj->debug_hold_thrs[i] = curthread;
300                         obj->debug_hold_file[i] = file;
301                         obj->debug_hold_line[i] = line;
302                         break;
303                 }
304         }
305 #endif
306         return(1);
307 }
308
309 void
310 #ifndef DEBUG_LOCKS
311 vm_object_hold_shared(vm_object_t obj)
312 #else
313 debugvm_object_hold_shared(vm_object_t obj, char *file, int line)
314 #endif
315 {
316         KKASSERT(obj != NULL);
317
318         /*
319          * Object must be held (object allocation is stable due to callers
320          * context, typically already holding the token on a parent object)
321          * prior to potentially blocking on the lock, otherwise the object
322          * can get ripped away from us.
323          */
324         refcount_acquire(&obj->hold_count);
325         vm_object_lock_shared(obj);
326
327 #if defined(DEBUG_LOCKS)
328         int i;
329         u_int mask;
330
331         for (;;) {
332                 mask = ~obj->debug_hold_bitmap;
333                 cpu_ccfence();
334                 if (mask == 0xFFFFFFFFU) {
335                         if (obj->debug_hold_ovfl == 0)
336                                 obj->debug_hold_ovfl = 1;
337                         break;
338                 }
339                 i = ffs(mask) - 1;
340                 if (atomic_cmpset_int(&obj->debug_hold_bitmap, ~mask,
341                                       ~mask | (1 << i))) {
342                         obj->debug_hold_bitmap |= (1 << i);
343                         obj->debug_hold_thrs[i] = curthread;
344                         obj->debug_hold_file[i] = file;
345                         obj->debug_hold_line[i] = line;
346                         break;
347                 }
348         }
349 #endif
350 }
351
352 #if 0
353
354 /*
355  * Obtain either a shared or exclusive lock on VM object
356  * based on whether this is a terminal vnode object or not.
357  */
358 int
359 #ifndef DEBUG_LOCKS
360 vm_object_hold_maybe_shared(vm_object_t obj)
361 #else
362 debugvm_object_hold_maybe_shared(vm_object_t obj, char *file, int line)
363 #endif
364 {
365         if (vm_shared_fault &&
366             obj->type == OBJT_VNODE &&
367             obj->backing_object == NULL) {
368                 vm_object_hold_shared(obj);
369                 return(1);
370         } else {
371                 vm_object_hold(obj);
372                 return(0);
373         }
374 }
375
376 #endif
377
378 /*
379  * Drop the token and hold_count on the object.
380  *
381  * WARNING! Token might be shared.
382  */
383 void
384 vm_object_drop(vm_object_t obj)
385 {
386         if (obj == NULL)
387                 return;
388
389 #if defined(DEBUG_LOCKS)
390         int found = 0;
391         int i;
392
393         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
394                 if ((obj->debug_hold_bitmap & (1 << i)) &&
395                     (obj->debug_hold_thrs[i] == curthread)) {
396                         obj->debug_hold_bitmap &= ~(1 << i);
397                         obj->debug_hold_thrs[i] = NULL;
398                         obj->debug_hold_file[i] = NULL;
399                         obj->debug_hold_line[i] = 0;
400                         found = 1;
401                         break;
402                 }
403         }
404
405         if (found == 0 && obj->debug_hold_ovfl == 0)
406                 panic("vm_object: attempt to drop hold on non-self-held obj");
407 #endif
408
409         /*
410          * No new holders should be possible once we drop hold_count 1->0 as
411          * there is no longer any way to reference the object.
412          */
413         KKASSERT(obj->hold_count > 0);
414         if (refcount_release(&obj->hold_count)) {
415                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
416                         vm_object_unlock(obj);
417                         zfree(obj_zone, obj);
418                 } else {
419                         vm_object_unlock(obj);
420                 }
421         } else {
422                 vm_object_unlock(obj);
423         }
424 }
425
426 /*
427  * Initialize a freshly allocated object, returning a held object.
428  *
429  * Used only by vm_object_allocate() and zinitna().
430  *
431  * No requirements.
432  */
433 void
434 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
435 {
436         int incr;
437
438         RB_INIT(&object->rb_memq);
439         LIST_INIT(&object->shadow_head);
440         lwkt_token_init(&object->token, "vmobj");
441
442         object->type = type;
443         object->size = size;
444         object->ref_count = 1;
445         object->memattr = VM_MEMATTR_DEFAULT;
446         object->hold_count = 0;
447         object->flags = 0;
448         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
449                 vm_object_set_flag(object, OBJ_ONEMAPPING);
450         object->paging_in_progress = 0;
451         object->resident_page_count = 0;
452         object->agg_pv_list_count = 0;
453         object->shadow_count = 0;
454         /* cpu localization twist */
455         object->pg_color = (int)(intptr_t)curthread;
456         if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
457                 incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
458         else
459                 incr = size;
460         next_index = (next_index + incr) & PQ_L2_MASK;
461         object->handle = NULL;
462         object->backing_object = NULL;
463         object->backing_object_offset = (vm_ooffset_t)0;
464
465         object->generation++;
466         object->swblock_count = 0;
467         RB_INIT(&object->swblock_root);
468         vm_object_lock_init(object);
469         pmap_object_init(object);
470
471         vm_object_hold(object);
472         lwkt_gettoken(&vmobj_token);
473         TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
474         vm_object_count++;
475         lwkt_reltoken(&vmobj_token);
476 }
477
478 /*
479  * Initialize the VM objects module.
480  *
481  * Called from the low level boot code only.
482  */
483 void
484 vm_object_init(void)
485 {
486         TAILQ_INIT(&vm_object_list);
487         
488         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
489                             &kernel_object);
490         vm_object_drop(&kernel_object);
491
492         obj_zone = &obj_zone_store;
493         zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
494                 vm_objects_init, VM_OBJECTS_INIT);
495 }
496
497 void
498 vm_object_init2(void)
499 {
500         zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
501 }
502
503 /*
504  * Allocate and return a new object of the specified type and size.
505  *
506  * No requirements.
507  */
508 vm_object_t
509 vm_object_allocate(objtype_t type, vm_pindex_t size)
510 {
511         vm_object_t result;
512
513         result = (vm_object_t) zalloc(obj_zone);
514
515         _vm_object_allocate(type, size, result);
516         vm_object_drop(result);
517
518         return (result);
519 }
520
521 /*
522  * This version returns a held object, allowing further atomic initialization
523  * of the object.
524  */
525 vm_object_t
526 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
527 {
528         vm_object_t result;
529
530         result = (vm_object_t) zalloc(obj_zone);
531
532         _vm_object_allocate(type, size, result);
533
534         return (result);
535 }
536
537 /*
538  * Add an additional reference to a vm_object.  The object must already be
539  * held.  The original non-lock version is no longer supported.  The object
540  * must NOT be chain locked by anyone at the time the reference is added.
541  *
542  * Referencing a chain-locked object can blow up the fairly sensitive
543  * ref_count and shadow_count tests in the deallocator.  Most callers
544  * will call vm_object_chain_wait() prior to calling
545  * vm_object_reference_locked() to avoid the case.
546  *
547  * The object must be held, but may be held shared if desired (hence why
548  * we use an atomic op).
549  */
550 void
551 vm_object_reference_locked(vm_object_t object)
552 {
553         KKASSERT(object != NULL);
554         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
555         KKASSERT((object->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) == 0);
556         atomic_add_int(&object->ref_count, 1);
557         if (object->type == OBJT_VNODE) {
558                 vref(object->handle);
559                 /* XXX what if the vnode is being destroyed? */
560         }
561 }
562
563 /*
564  * This version is only allowed for vnode objects.
565  */
566 void
567 vm_object_reference_quick(vm_object_t object)
568 {
569         KKASSERT(object->type == OBJT_VNODE);
570         atomic_add_int(&object->ref_count, 1);
571         vref(object->handle);
572 }
573
574 /*
575  * Object OBJ_CHAINLOCK lock handling.
576  *
577  * The caller can chain-lock backing objects recursively and then
578  * use vm_object_chain_release_all() to undo the whole chain.
579  *
580  * Chain locks are used to prevent collapses and are only applicable
581  * to OBJT_DEFAULT and OBJT_SWAP objects.  Chain locking operations
582  * on other object types are ignored.  This is also important because
583  * it allows e.g. the vnode underlying a memory mapping to take concurrent
584  * faults.
585  *
586  * The object must usually be held on entry, though intermediate
587  * objects need not be held on release.  The object must be held exclusively,
588  * NOT shared.  Note that the prefault path checks the shared state and
589  * avoids using the chain functions.
590  */
591 void
592 vm_object_chain_wait(vm_object_t object, int shared)
593 {
594         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
595         for (;;) {
596                 uint32_t chainlk = object->chainlk;
597
598                 cpu_ccfence();
599                 if (shared) {
600                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
601                                 tsleep_interlock(object, 0);
602                                 if (atomic_cmpset_int(&object->chainlk,
603                                                       chainlk,
604                                                       chainlk | CHAINLK_WAIT)) {
605                                         tsleep(object, PINTERLOCKED,
606                                                "objchns", 0);
607                                 }
608                                 /* retry */
609                         } else {
610                                 break;
611                         }
612                         /* retry */
613                 } else {
614                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
615                                 tsleep_interlock(object, 0);
616                                 if (atomic_cmpset_int(&object->chainlk,
617                                                       chainlk,
618                                                       chainlk | CHAINLK_WAIT))
619                                 {
620                                         tsleep(object, PINTERLOCKED,
621                                                "objchnx", 0);
622                                 }
623                                 /* retry */
624                         } else {
625                                 if (atomic_cmpset_int(&object->chainlk,
626                                                       chainlk,
627                                                       chainlk & ~CHAINLK_WAIT))
628                                 {
629                                         if (chainlk & CHAINLK_WAIT)
630                                                 wakeup(object);
631                                         break;
632                                 }
633                                 /* retry */
634                         }
635                 }
636                 /* retry */
637         }
638 }
639
640 void
641 vm_object_chain_acquire(vm_object_t object, int shared)
642 {
643         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
644                 return;
645         if (vm_shared_fault == 0)
646                 shared = 0;
647
648         for (;;) {
649                 uint32_t chainlk = object->chainlk;
650
651                 cpu_ccfence();
652                 if (shared) {
653                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
654                                 tsleep_interlock(object, 0);
655                                 if (atomic_cmpset_int(&object->chainlk,
656                                                       chainlk,
657                                                       chainlk | CHAINLK_WAIT)) {
658                                         tsleep(object, PINTERLOCKED,
659                                                "objchns", 0);
660                                 }
661                                 /* retry */
662                         } else if (atomic_cmpset_int(&object->chainlk,
663                                               chainlk, chainlk + 1)) {
664                                 break;
665                         }
666                         /* retry */
667                 } else {
668                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
669                                 tsleep_interlock(object, 0);
670                                 if (atomic_cmpset_int(&object->chainlk,
671                                                       chainlk,
672                                                       chainlk |
673                                                        CHAINLK_WAIT |
674                                                        CHAINLK_EXCLREQ)) {
675                                         tsleep(object, PINTERLOCKED,
676                                                "objchnx", 0);
677                                 }
678                                 /* retry */
679                         } else {
680                                 if (atomic_cmpset_int(&object->chainlk,
681                                                       chainlk,
682                                                       (chainlk | CHAINLK_EXCL) &
683                                                       ~(CHAINLK_EXCLREQ |
684                                                         CHAINLK_WAIT))) {
685                                         if (chainlk & CHAINLK_WAIT)
686                                                 wakeup(object);
687                                         break;
688                                 }
689                                 /* retry */
690                         }
691                 }
692                 /* retry */
693         }
694 }
695
696 void
697 vm_object_chain_release(vm_object_t object)
698 {
699         /*ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));*/
700         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
701                 return;
702         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
703         for (;;) {
704                 uint32_t chainlk = object->chainlk;
705
706                 cpu_ccfence();
707                 if (chainlk & CHAINLK_MASK) {
708                         if ((chainlk & CHAINLK_MASK) == 1 &&
709                             atomic_cmpset_int(&object->chainlk,
710                                               chainlk,
711                                               (chainlk - 1) & ~CHAINLK_WAIT)) {
712                                 if (chainlk & CHAINLK_WAIT)
713                                         wakeup(object);
714                                 break;
715                         }
716                         if ((chainlk & CHAINLK_MASK) > 1 &&
717                             atomic_cmpset_int(&object->chainlk,
718                                               chainlk, chainlk - 1)) {
719                                 break;
720                         }
721                         /* retry */
722                 } else {
723                         KKASSERT(chainlk & CHAINLK_EXCL);
724                         if (atomic_cmpset_int(&object->chainlk,
725                                               chainlk,
726                                               chainlk & ~(CHAINLK_EXCL |
727                                                           CHAINLK_WAIT))) {
728                                 if (chainlk & CHAINLK_WAIT)
729                                         wakeup(object);
730                                 break;
731                         }
732                 }
733         }
734 }
735
736 /*
737  * Release the chain from first_object through and including stopobj.
738  * The caller is typically holding the first and last object locked
739  * (shared or exclusive) to prevent destruction races.
740  *
741  * We release stopobj first as an optimization as this object is most
742  * likely to be shared across multiple processes.
743  */
744 void
745 vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
746 {
747         vm_object_t backing_object;
748         vm_object_t object;
749
750         vm_object_chain_release(stopobj);
751         object = first_object;
752
753         while (object != stopobj) {
754                 KKASSERT(object);
755 #if 0
756                 /* shouldn't need this since chain is held */
757                 if (object != first_object)
758                         vm_object_hold(object);
759 #endif
760                 backing_object = object->backing_object;
761                 vm_object_chain_release(object);
762 #if 0
763                 if (object != first_object)
764                         vm_object_drop(object);
765 #endif
766                 object = backing_object;
767         }
768 }
769
770 /*
771  * Dereference an object and its underlying vnode.
772  *
773  * The object must be held exclusively and will remain held on return.
774  * (We don't need an atomic op due to the exclusivity).
775  */
776 static void
777 vm_object_vndeallocate(vm_object_t object)
778 {
779         struct vnode *vp = (struct vnode *) object->handle;
780
781         KASSERT(object->type == OBJT_VNODE,
782             ("vm_object_vndeallocate: not a vnode object"));
783         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
784         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
785 #ifdef INVARIANTS
786         if (object->ref_count == 0) {
787                 vprint("vm_object_vndeallocate", vp);
788                 panic("vm_object_vndeallocate: bad object reference count");
789         }
790 #endif
791         atomic_add_int(&object->ref_count, -1);
792         if (object->ref_count == 0)
793                 vclrflags(vp, VTEXT);
794         vrele(vp);
795 }
796
797 /*
798  * Release a reference to the specified object, gained either through a
799  * vm_object_allocate or a vm_object_reference call.  When all references
800  * are gone, storage associated with this object may be relinquished.
801  *
802  * The caller does not have to hold the object locked but must have control
803  * over the reference in question in order to guarantee that the object
804  * does not get ripped out from under us.
805  *
806  * XXX Currently all deallocations require an exclusive lock.
807  */
808 void
809 vm_object_deallocate(vm_object_t object)
810 {
811         struct vnode *vp;
812         int count;
813
814         if (object == NULL)
815                 return;
816         for (;;) {
817                 count = object->ref_count;
818                 cpu_ccfence();
819
820                 /*
821                  * If decrementing the count enters into special handling
822                  * territory (0, 1, or 2) we have to do it the hard way.
823                  * Fortunate though, objects with only a few refs like this
824                  * are not likely to be heavily contended anyway.
825                  *
826                  * For vnode objects we only care about 1->0 transitions.
827                  */
828                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
829                         vm_object_hold(object);
830                         vm_object_deallocate_locked(object);
831                         vm_object_drop(object);
832                         break;
833                 }
834
835                 /*
836                  * Try to decrement ref_count without acquiring a hold on
837                  * the object.  This is particularly important for the exec*()
838                  * and exit*() code paths because the program binary may
839                  * have a great deal of sharing and an exclusive lock will
840                  * crowbar performance in those circumstances.
841                  */
842                 if (object->type == OBJT_VNODE) {
843                         vp = (struct vnode *)object->handle;
844                         if (atomic_cmpset_int(&object->ref_count,
845                                               count, count - 1)) {
846                                 vrele(vp);
847                                 break;
848                         }
849                         /* retry */
850                 } else {
851                         if (atomic_cmpset_int(&object->ref_count,
852                                               count, count - 1)) {
853                                 break;
854                         }
855                         /* retry */
856                 }
857                 /* retry */
858         }
859 }
860
861 void
862 vm_object_deallocate_locked(vm_object_t object)
863 {
864         struct vm_object_dealloc_list *dlist = NULL;
865         struct vm_object_dealloc_list *dtmp;
866         vm_object_t temp;
867         int must_drop = 0;
868
869         /*
870          * We may chain deallocate object, but additional objects may
871          * collect on the dlist which also have to be deallocated.  We
872          * must avoid a recursion, vm_object chains can get deep.
873          */
874 again:
875         while (object != NULL) {
876                 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
877 #if 0
878                 /*
879                  * Don't rip a ref_count out from under an object undergoing
880                  * collapse, it will confuse the collapse code.
881                  */
882                 vm_object_chain_wait(object);
883 #endif
884                 if (object->type == OBJT_VNODE) {
885                         vm_object_vndeallocate(object);
886                         break;
887                 }
888
889                 if (object->ref_count == 0) {
890                         panic("vm_object_deallocate: object deallocated "
891                               "too many times: %d", object->type);
892                 }
893                 if (object->ref_count > 2) {
894                         atomic_add_int(&object->ref_count, -1);
895                         break;
896                 }
897
898                 /*
899                  * Here on ref_count of one or two, which are special cases for
900                  * objects.
901                  *
902                  * Nominal ref_count > 1 case if the second ref is not from
903                  * a shadow.
904                  *
905                  * (ONEMAPPING only applies to DEFAULT AND SWAP objects)
906                  */
907                 if (object->ref_count == 2 && object->shadow_count == 0) {
908                         if (object->type == OBJT_DEFAULT ||
909                             object->type == OBJT_SWAP) {
910                                 vm_object_set_flag(object, OBJ_ONEMAPPING);
911                         }
912                         atomic_add_int(&object->ref_count, -1);
913                         break;
914                 }
915
916                 /*
917                  * If the second ref is from a shadow we chain along it
918                  * upwards if object's handle is exhausted.
919                  *
920                  * We have to decrement object->ref_count before potentially
921                  * collapsing the first shadow object or the collapse code
922                  * will not be able to handle the degenerate case to remove
923                  * object.  However, if we do it too early the object can
924                  * get ripped out from under us.
925                  */
926                 if (object->ref_count == 2 && object->shadow_count == 1 &&
927                     object->handle == NULL && (object->type == OBJT_DEFAULT ||
928                                                object->type == OBJT_SWAP)) {
929                         temp = LIST_FIRST(&object->shadow_head);
930                         KKASSERT(temp != NULL);
931                         vm_object_hold(temp);
932
933                         /*
934                          * Wait for any paging to complete so the collapse
935                          * doesn't (or isn't likely to) qcollapse.  pip
936                          * waiting must occur before we acquire the
937                          * chainlock.
938                          */
939                         while (
940                                 temp->paging_in_progress ||
941                                 object->paging_in_progress
942                         ) {
943                                 vm_object_pip_wait(temp, "objde1");
944                                 vm_object_pip_wait(object, "objde2");
945                         }
946
947                         /*
948                          * If the parent is locked we have to give up, as
949                          * otherwise we would be acquiring locks in the
950                          * wrong order and potentially deadlock.
951                          */
952                         if (temp->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) {
953                                 vm_object_drop(temp);
954                                 goto skip;
955                         }
956                         vm_object_chain_acquire(temp, 0);
957
958                         /*
959                          * Recheck/retry after the hold and the paging
960                          * wait, both of which can block us.
961                          */
962                         if (object->ref_count != 2 ||
963                             object->shadow_count != 1 ||
964                             object->handle ||
965                             LIST_FIRST(&object->shadow_head) != temp ||
966                             (object->type != OBJT_DEFAULT &&
967                              object->type != OBJT_SWAP)) {
968                                 vm_object_chain_release(temp);
969                                 vm_object_drop(temp);
970                                 continue;
971                         }
972
973                         /*
974                          * We can safely drop object's ref_count now.
975                          */
976                         KKASSERT(object->ref_count == 2);
977                         atomic_add_int(&object->ref_count, -1);
978
979                         /*
980                          * If our single parent is not collapseable just
981                          * decrement ref_count (2->1) and stop.
982                          */
983                         if (temp->handle || (temp->type != OBJT_DEFAULT &&
984                                              temp->type != OBJT_SWAP)) {
985                                 vm_object_chain_release(temp);
986                                 vm_object_drop(temp);
987                                 break;
988                         }
989
990                         /*
991                          * At this point we have already dropped object's
992                          * ref_count so it is possible for a race to
993                          * deallocate obj out from under us.  Any collapse
994                          * will re-check the situation.  We must not block
995                          * until we are able to collapse.
996                          *
997                          * Bump temp's ref_count to avoid an unwanted
998                          * degenerate recursion (can't call
999                          * vm_object_reference_locked() because it asserts
1000                          * that CHAINLOCK is not set).
1001                          */
1002                         atomic_add_int(&temp->ref_count, 1);
1003                         KKASSERT(temp->ref_count > 1);
1004
1005                         /*
1006                          * Collapse temp, then deallocate the extra ref
1007                          * formally.
1008                          */
1009                         vm_object_collapse(temp, &dlist);
1010                         vm_object_chain_release(temp);
1011                         if (must_drop) {
1012                                 vm_object_lock_swap();
1013                                 vm_object_drop(object);
1014                         }
1015                         object = temp;
1016                         must_drop = 1;
1017                         continue;
1018                 }
1019
1020                 /*
1021                  * Drop the ref and handle termination on the 1->0 transition.
1022                  * We may have blocked above so we have to recheck.
1023                  */
1024 skip:
1025                 KKASSERT(object->ref_count != 0);
1026                 if (object->ref_count >= 2) {
1027                         atomic_add_int(&object->ref_count, -1);
1028                         break;
1029                 }
1030                 KKASSERT(object->ref_count == 1);
1031
1032                 /*
1033                  * 1->0 transition.  Chain through the backing_object.
1034                  * Maintain the ref until we've located the backing object,
1035                  * then re-check.
1036                  */
1037                 while ((temp = object->backing_object) != NULL) {
1038                         vm_object_hold(temp);
1039                         if (temp == object->backing_object)
1040                                 break;
1041                         vm_object_drop(temp);
1042                 }
1043
1044                 /*
1045                  * 1->0 transition verified, retry if ref_count is no longer
1046                  * 1.  Otherwise disconnect the backing_object (temp) and
1047                  * clean up.
1048                  */
1049                 if (object->ref_count != 1) {
1050                         vm_object_drop(temp);
1051                         continue;
1052                 }
1053
1054                 /*
1055                  * It shouldn't be possible for the object to be chain locked
1056                  * if we're removing the last ref on it.
1057                  */
1058                 KKASSERT((object->chainlk & (CHAINLK_EXCL|CHAINLK_MASK)) == 0);
1059
1060                 if (temp) {
1061                         if (object->flags & OBJ_ONSHADOW) {
1062                                 LIST_REMOVE(object, shadow_list);
1063                                 temp->shadow_count--;
1064                                 temp->generation++;
1065                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
1066                         }
1067                         object->backing_object = NULL;
1068                 }
1069
1070                 atomic_add_int(&object->ref_count, -1);
1071                 if ((object->flags & OBJ_DEAD) == 0)
1072                         vm_object_terminate(object);
1073                 if (must_drop && temp)
1074                         vm_object_lock_swap();
1075                 if (must_drop)
1076                         vm_object_drop(object);
1077                 object = temp;
1078                 must_drop = 1;
1079         }
1080         if (must_drop && object)
1081                 vm_object_drop(object);
1082
1083         /*
1084          * Additional tail recursion on dlist.  Avoid a recursion.  Objects
1085          * on the dlist have a hold count but are not locked.
1086          */
1087         if ((dtmp = dlist) != NULL) {
1088                 dlist = dtmp->next;
1089                 object = dtmp->object;
1090                 kfree(dtmp, M_TEMP);
1091
1092                 vm_object_lock(object); /* already held, add lock */
1093                 must_drop = 1;          /* and we're responsible for it */
1094                 goto again;
1095         }
1096 }
1097
1098 /*
1099  * Destroy the specified object, freeing up related resources.
1100  *
1101  * The object must have zero references.
1102  *
1103  * The object must held.  The caller is responsible for dropping the object
1104  * after terminate returns.  Terminate does NOT drop the object.
1105  */
1106 static int vm_object_terminate_callback(vm_page_t p, void *data);
1107
1108 void
1109 vm_object_terminate(vm_object_t object)
1110 {
1111         /*
1112          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
1113          * able to safely block.
1114          */
1115         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1116         KKASSERT((object->flags & OBJ_DEAD) == 0);
1117         vm_object_set_flag(object, OBJ_DEAD);
1118
1119         /*
1120          * Wait for the pageout daemon to be done with the object
1121          */
1122         vm_object_pip_wait(object, "objtrm1");
1123
1124         KASSERT(!object->paging_in_progress,
1125                 ("vm_object_terminate: pageout in progress"));
1126
1127         /*
1128          * Clean and free the pages, as appropriate. All references to the
1129          * object are gone, so we don't need to lock it.
1130          */
1131         if (object->type == OBJT_VNODE) {
1132                 struct vnode *vp;
1133
1134                 /*
1135                  * Clean pages and flush buffers.
1136                  *
1137                  * NOTE!  TMPFS buffer flushes do not typically flush the
1138                  *        actual page to swap as this would be highly
1139                  *        inefficient, and normal filesystems usually wrap
1140                  *        page flushes with buffer cache buffers.
1141                  *
1142                  *        To deal with this we have to call vinvalbuf() both
1143                  *        before and after the vm_object_page_clean().
1144                  */
1145                 vp = (struct vnode *) object->handle;
1146                 vinvalbuf(vp, V_SAVE, 0, 0);
1147                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
1148                 vinvalbuf(vp, V_SAVE, 0, 0);
1149         }
1150
1151         /*
1152          * Wait for any I/O to complete, after which there had better not
1153          * be any references left on the object.
1154          */
1155         vm_object_pip_wait(object, "objtrm2");
1156
1157         if (object->ref_count != 0) {
1158                 panic("vm_object_terminate: object with references, "
1159                       "ref_count=%d", object->ref_count);
1160         }
1161
1162         /*
1163          * Cleanup any shared pmaps associated with this object.
1164          */
1165         pmap_object_free(object);
1166
1167         /*
1168          * Now free any remaining pages. For internal objects, this also
1169          * removes them from paging queues. Don't free wired pages, just
1170          * remove them from the object. 
1171          */
1172         vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1173                                 vm_object_terminate_callback, NULL);
1174
1175         /*
1176          * Let the pager know object is dead.
1177          */
1178         vm_pager_deallocate(object);
1179
1180         /*
1181          * Wait for the object hold count to hit 1, clean out pages as
1182          * we go.  vmobj_token interlocks any race conditions that might
1183          * pick the object up from the vm_object_list after we have cleared
1184          * rb_memq.
1185          */
1186         for (;;) {
1187                 if (RB_ROOT(&object->rb_memq) == NULL)
1188                         break;
1189                 kprintf("vm_object_terminate: Warning, object %p "
1190                         "still has %d pages\n",
1191                         object, object->resident_page_count);
1192                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1193                                         vm_object_terminate_callback, NULL);
1194         }
1195
1196         /*
1197          * There had better not be any pages left
1198          */
1199         KKASSERT(object->resident_page_count == 0);
1200
1201         /*
1202          * Remove the object from the global object list.
1203          */
1204         lwkt_gettoken(&vmobj_token);
1205         TAILQ_REMOVE(&vm_object_list, object, object_list);
1206         vm_object_count--;
1207         lwkt_reltoken(&vmobj_token);
1208
1209         if (object->ref_count != 0) {
1210                 panic("vm_object_terminate2: object with references, "
1211                       "ref_count=%d", object->ref_count);
1212         }
1213
1214         /*
1215          * NOTE: The object hold_count is at least 1, so we cannot zfree()
1216          *       the object here.  See vm_object_drop().
1217          */
1218 }
1219
1220 /*
1221  * The caller must hold the object.
1222  */
1223 static int
1224 vm_object_terminate_callback(vm_page_t p, void *data __unused)
1225 {
1226         vm_object_t object;
1227
1228         object = p->object;
1229         vm_page_busy_wait(p, TRUE, "vmpgtrm");
1230         if (object != p->object) {
1231                 kprintf("vm_object_terminate: Warning: Encountered "
1232                         "busied page %p on queue %d\n", p, p->queue);
1233                 vm_page_wakeup(p);
1234         } else if (p->wire_count == 0) {
1235                 /*
1236                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
1237                  */
1238                 vm_page_free(p);
1239                 mycpu->gd_cnt.v_pfree++;
1240         } else {
1241                 if (p->queue != PQ_NONE)
1242                         kprintf("vm_object_terminate: Warning: Encountered "
1243                                 "wired page %p on queue %d\n", p, p->queue);
1244                 vm_page_remove(p);
1245                 vm_page_wakeup(p);
1246         }
1247         lwkt_yield();
1248         return(0);
1249 }
1250
1251 /*
1252  * Clean all dirty pages in the specified range of object.  Leaves page
1253  * on whatever queue it is currently on.   If NOSYNC is set then do not
1254  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
1255  * leaving the object dirty.
1256  *
1257  * When stuffing pages asynchronously, allow clustering.  XXX we need a
1258  * synchronous clustering mode implementation.
1259  *
1260  * Odd semantics: if start == end, we clean everything.
1261  *
1262  * The object must be locked? XXX
1263  */
1264 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
1265 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
1266
1267 void
1268 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1269                      int flags)
1270 {
1271         struct rb_vm_page_scan_info info;
1272         struct vnode *vp;
1273         int wholescan;
1274         int pagerflags;
1275         int generation;
1276
1277         vm_object_hold(object);
1278         if (object->type != OBJT_VNODE ||
1279             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
1280                 vm_object_drop(object);
1281                 return;
1282         }
1283
1284         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 
1285                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
1286         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
1287
1288         vp = object->handle;
1289
1290         /*
1291          * Interlock other major object operations.  This allows us to 
1292          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
1293          */
1294         vm_object_set_flag(object, OBJ_CLEANING);
1295
1296         /*
1297          * Handle 'entire object' case
1298          */
1299         info.start_pindex = start;
1300         if (end == 0) {
1301                 info.end_pindex = object->size - 1;
1302         } else {
1303                 info.end_pindex = end - 1;
1304         }
1305         wholescan = (start == 0 && info.end_pindex == object->size - 1);
1306         info.limit = flags;
1307         info.pagerflags = pagerflags;
1308         info.object = object;
1309
1310         /*
1311          * If cleaning the entire object do a pass to mark the pages read-only.
1312          * If everything worked out ok, clear OBJ_WRITEABLE and
1313          * OBJ_MIGHTBEDIRTY.
1314          */
1315         if (wholescan) {
1316                 info.error = 0;
1317                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1318                                         vm_object_page_clean_pass1, &info);
1319                 if (info.error == 0) {
1320                         vm_object_clear_flag(object,
1321                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1322                         if (object->type == OBJT_VNODE &&
1323                             (vp = (struct vnode *)object->handle) != NULL) {
1324                                 if (vp->v_mount &&
1325                                     (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1326                                         vclrobjdirty(vp);
1327                                 } else {
1328                                         vclrflags(vp, VOBJDIRTY);
1329                                 }
1330                         }
1331                 }
1332         }
1333
1334         /*
1335          * Do a pass to clean all the dirty pages we find.
1336          */
1337         do {
1338                 info.error = 0;
1339                 generation = object->generation;
1340                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1341                                         vm_object_page_clean_pass2, &info);
1342         } while (info.error || generation != object->generation);
1343
1344         vm_object_clear_flag(object, OBJ_CLEANING);
1345         vm_object_drop(object);
1346 }
1347
1348 /*
1349  * The caller must hold the object.
1350  */
1351 static 
1352 int
1353 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1354 {
1355         struct rb_vm_page_scan_info *info = data;
1356
1357         vm_page_flag_set(p, PG_CLEANCHK);
1358         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1359                 info->error = 1;
1360         } else if (vm_page_busy_try(p, FALSE) == 0) {
1361                 vm_page_protect(p, VM_PROT_READ);       /* must not block */
1362                 vm_page_wakeup(p);
1363         } else {
1364                 info->error = 1;
1365         }
1366         lwkt_yield();
1367         return(0);
1368 }
1369
1370 /*
1371  * The caller must hold the object
1372  */
1373 static 
1374 int
1375 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1376 {
1377         struct rb_vm_page_scan_info *info = data;
1378         int generation;
1379
1380         /*
1381          * Do not mess with pages that were inserted after we started
1382          * the cleaning pass.
1383          */
1384         if ((p->flags & PG_CLEANCHK) == 0)
1385                 goto done;
1386
1387         generation = info->object->generation;
1388         vm_page_busy_wait(p, TRUE, "vpcwai");
1389         if (p->object != info->object ||
1390             info->object->generation != generation) {
1391                 info->error = 1;
1392                 vm_page_wakeup(p);
1393                 goto done;
1394         }
1395
1396         /*
1397          * Before wasting time traversing the pmaps, check for trivial
1398          * cases where the page cannot be dirty.
1399          */
1400         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1401                 KKASSERT((p->dirty & p->valid) == 0 &&
1402                          (p->flags & PG_NEED_COMMIT) == 0);
1403                 vm_page_wakeup(p);
1404                 goto done;
1405         }
1406
1407         /*
1408          * Check whether the page is dirty or not.  The page has been set
1409          * to be read-only so the check will not race a user dirtying the
1410          * page.
1411          */
1412         vm_page_test_dirty(p);
1413         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1414                 vm_page_flag_clear(p, PG_CLEANCHK);
1415                 vm_page_wakeup(p);
1416                 goto done;
1417         }
1418
1419         /*
1420          * If we have been asked to skip nosync pages and this is a
1421          * nosync page, skip it.  Note that the object flags were
1422          * not cleared in this case (because pass1 will have returned an
1423          * error), so we do not have to set them.
1424          */
1425         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1426                 vm_page_flag_clear(p, PG_CLEANCHK);
1427                 vm_page_wakeup(p);
1428                 goto done;
1429         }
1430
1431         /*
1432          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1433          * the pages that get successfully flushed.  Set info->error if
1434          * we raced an object modification.
1435          */
1436         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1437         vm_wait_nominal();
1438 done:
1439         lwkt_yield();
1440         return(0);
1441 }
1442
1443 /*
1444  * Collect the specified page and nearby pages and flush them out.
1445  * The number of pages flushed is returned.  The passed page is busied
1446  * by the caller and we are responsible for its disposition.
1447  *
1448  * The caller must hold the object.
1449  */
1450 static void
1451 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1452 {
1453         int error;
1454         int is;
1455         int ib;
1456         int i;
1457         int page_base;
1458         vm_pindex_t pi;
1459         vm_page_t ma[BLIST_MAX_ALLOC];
1460
1461         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1462
1463         pi = p->pindex;
1464         page_base = pi % BLIST_MAX_ALLOC;
1465         ma[page_base] = p;
1466         ib = page_base - 1;
1467         is = page_base + 1;
1468
1469         while (ib >= 0) {
1470                 vm_page_t tp;
1471
1472                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1473                                              TRUE, &error);
1474                 if (error)
1475                         break;
1476                 if (tp == NULL)
1477                         break;
1478                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1479                     (tp->flags & PG_CLEANCHK) == 0) {
1480                         vm_page_wakeup(tp);
1481                         break;
1482                 }
1483                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1484                         vm_page_flag_clear(tp, PG_CLEANCHK);
1485                         vm_page_wakeup(tp);
1486                         break;
1487                 }
1488                 vm_page_test_dirty(tp);
1489                 if ((tp->dirty & tp->valid) == 0 &&
1490                     (tp->flags & PG_NEED_COMMIT) == 0) {
1491                         vm_page_flag_clear(tp, PG_CLEANCHK);
1492                         vm_page_wakeup(tp);
1493                         break;
1494                 }
1495                 ma[ib] = tp;
1496                 --ib;
1497         }
1498         ++ib;   /* fixup */
1499
1500         while (is < BLIST_MAX_ALLOC &&
1501                pi - page_base + is < object->size) {
1502                 vm_page_t tp;
1503
1504                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1505                                              TRUE, &error);
1506                 if (error)
1507                         break;
1508                 if (tp == NULL)
1509                         break;
1510                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1511                     (tp->flags & PG_CLEANCHK) == 0) {
1512                         vm_page_wakeup(tp);
1513                         break;
1514                 }
1515                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1516                         vm_page_flag_clear(tp, PG_CLEANCHK);
1517                         vm_page_wakeup(tp);
1518                         break;
1519                 }
1520                 vm_page_test_dirty(tp);
1521                 if ((tp->dirty & tp->valid) == 0 &&
1522                     (tp->flags & PG_NEED_COMMIT) == 0) {
1523                         vm_page_flag_clear(tp, PG_CLEANCHK);
1524                         vm_page_wakeup(tp);
1525                         break;
1526                 }
1527                 ma[is] = tp;
1528                 ++is;
1529         }
1530
1531         /*
1532          * All pages in the ma[] array are busied now
1533          */
1534         for (i = ib; i < is; ++i) {
1535                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1536                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1537         }
1538         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1539         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1540                 vm_page_unhold(ma[i]);
1541 }
1542
1543 /*
1544  * Same as vm_object_pmap_copy, except range checking really
1545  * works, and is meant for small sections of an object.
1546  *
1547  * This code protects resident pages by making them read-only
1548  * and is typically called on a fork or split when a page
1549  * is converted to copy-on-write.  
1550  *
1551  * NOTE: If the page is already at VM_PROT_NONE, calling
1552  * vm_page_protect will have no effect.
1553  */
1554 void
1555 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1556 {
1557         vm_pindex_t idx;
1558         vm_page_t p;
1559
1560         if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1561                 return;
1562
1563         vm_object_hold(object);
1564         for (idx = start; idx < end; idx++) {
1565                 p = vm_page_lookup(object, idx);
1566                 if (p == NULL)
1567                         continue;
1568                 vm_page_protect(p, VM_PROT_READ);
1569         }
1570         vm_object_drop(object);
1571 }
1572
1573 /*
1574  * Removes all physical pages in the specified object range from all
1575  * physical maps.
1576  *
1577  * The object must *not* be locked.
1578  */
1579
1580 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1581
1582 void
1583 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1584 {
1585         struct rb_vm_page_scan_info info;
1586
1587         if (object == NULL)
1588                 return;
1589         info.start_pindex = start;
1590         info.end_pindex = end - 1;
1591
1592         vm_object_hold(object);
1593         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1594                                 vm_object_pmap_remove_callback, &info);
1595         if (start == 0 && end == object->size)
1596                 vm_object_clear_flag(object, OBJ_WRITEABLE);
1597         vm_object_drop(object);
1598 }
1599
1600 /*
1601  * The caller must hold the object
1602  */
1603 static int
1604 vm_object_pmap_remove_callback(vm_page_t p, void *data __unused)
1605 {
1606         vm_page_protect(p, VM_PROT_NONE);
1607         return(0);
1608 }
1609
1610 /*
1611  * Implements the madvise function at the object/page level.
1612  *
1613  * MADV_WILLNEED        (any object)
1614  *
1615  *      Activate the specified pages if they are resident.
1616  *
1617  * MADV_DONTNEED        (any object)
1618  *
1619  *      Deactivate the specified pages if they are resident.
1620  *
1621  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1622  *
1623  *      Deactivate and clean the specified pages if they are
1624  *      resident.  This permits the process to reuse the pages
1625  *      without faulting or the kernel to reclaim the pages
1626  *      without I/O.
1627  *
1628  * No requirements.
1629  */
1630 void
1631 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1632 {
1633         vm_pindex_t end, tpindex;
1634         vm_object_t tobject;
1635         vm_object_t xobj;
1636         vm_page_t m;
1637         int error;
1638
1639         if (object == NULL)
1640                 return;
1641
1642         end = pindex + count;
1643
1644         vm_object_hold(object);
1645         tobject = object;
1646
1647         /*
1648          * Locate and adjust resident pages
1649          */
1650         for (; pindex < end; pindex += 1) {
1651 relookup:
1652                 if (tobject != object)
1653                         vm_object_drop(tobject);
1654                 tobject = object;
1655                 tpindex = pindex;
1656 shadowlookup:
1657                 /*
1658                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1659                  * and those pages must be OBJ_ONEMAPPING.
1660                  */
1661                 if (advise == MADV_FREE) {
1662                         if ((tobject->type != OBJT_DEFAULT &&
1663                              tobject->type != OBJT_SWAP) ||
1664                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
1665                                 continue;
1666                         }
1667                 }
1668
1669                 m = vm_page_lookup_busy_try(tobject, tpindex, TRUE, &error);
1670
1671                 if (error) {
1672                         vm_page_sleep_busy(m, TRUE, "madvpo");
1673                         goto relookup;
1674                 }
1675                 if (m == NULL) {
1676                         /*
1677                          * There may be swap even if there is no backing page
1678                          */
1679                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1680                                 swap_pager_freespace(tobject, tpindex, 1);
1681
1682                         /*
1683                          * next object
1684                          */
1685                         while ((xobj = tobject->backing_object) != NULL) {
1686                                 KKASSERT(xobj != object);
1687                                 vm_object_hold(xobj);
1688                                 if (xobj == tobject->backing_object)
1689                                         break;
1690                                 vm_object_drop(xobj);
1691                         }
1692                         if (xobj == NULL)
1693                                 continue;
1694                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1695                         if (tobject != object) {
1696                                 vm_object_lock_swap();
1697                                 vm_object_drop(tobject);
1698                         }
1699                         tobject = xobj;
1700                         goto shadowlookup;
1701                 }
1702
1703                 /*
1704                  * If the page is not in a normal active state, we skip it.
1705                  * If the page is not managed there are no page queues to
1706                  * mess with.  Things can break if we mess with pages in
1707                  * any of the below states.
1708                  */
1709                 if (m->wire_count ||
1710                     (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1711                     m->valid != VM_PAGE_BITS_ALL
1712                 ) {
1713                         vm_page_wakeup(m);
1714                         continue;
1715                 }
1716
1717                 /*
1718                  * Theoretically once a page is known not to be busy, an
1719                  * interrupt cannot come along and rip it out from under us.
1720                  */
1721
1722                 if (advise == MADV_WILLNEED) {
1723                         vm_page_activate(m);
1724                 } else if (advise == MADV_DONTNEED) {
1725                         vm_page_dontneed(m);
1726                 } else if (advise == MADV_FREE) {
1727                         /*
1728                          * Mark the page clean.  This will allow the page
1729                          * to be freed up by the system.  However, such pages
1730                          * are often reused quickly by malloc()/free()
1731                          * so we do not do anything that would cause
1732                          * a page fault if we can help it.
1733                          *
1734                          * Specifically, we do not try to actually free
1735                          * the page now nor do we try to put it in the
1736                          * cache (which would cause a page fault on reuse).
1737                          *
1738                          * But we do make the page is freeable as we
1739                          * can without actually taking the step of unmapping
1740                          * it.
1741                          */
1742                         pmap_clear_modify(m);
1743                         m->dirty = 0;
1744                         m->act_count = 0;
1745                         vm_page_dontneed(m);
1746                         if (tobject->type == OBJT_SWAP)
1747                                 swap_pager_freespace(tobject, tpindex, 1);
1748                 }
1749                 vm_page_wakeup(m);
1750         }       
1751         if (tobject != object)
1752                 vm_object_drop(tobject);
1753         vm_object_drop(object);
1754 }
1755
1756 /*
1757  * Create a new object which is backed by the specified existing object
1758  * range.  Replace the pointer and offset that was pointing at the existing
1759  * object with the pointer/offset for the new object.
1760  *
1761  * No other requirements.
1762  */
1763 void
1764 vm_object_shadow(vm_object_t *objectp, vm_ooffset_t *offset, vm_size_t length,
1765                  int addref)
1766 {
1767         vm_object_t source;
1768         vm_object_t result;
1769         int useshadowlist;
1770
1771         source = *objectp;
1772
1773         /*
1774          * Don't create the new object if the old object isn't shared.
1775          * We have to chain wait before adding the reference to avoid
1776          * racing a collapse or deallocation.
1777          *
1778          * Add the additional ref to source here to avoid racing a later
1779          * collapse or deallocation. Clear the ONEMAPPING flag whether
1780          * addref is TRUE or not in this case because the original object
1781          * will be shadowed.
1782          */
1783         useshadowlist = 0;
1784         if (source) {
1785                 if (source->type != OBJT_VNODE) {
1786                         useshadowlist = 1;
1787                         vm_object_hold(source);
1788                         vm_object_chain_wait(source, 0);
1789                         if (source->ref_count == 1 &&
1790                             source->handle == NULL &&
1791                             (source->type == OBJT_DEFAULT ||
1792                              source->type == OBJT_SWAP)) {
1793                                 if (addref) {
1794                                         vm_object_reference_locked(source);
1795                                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1796                                 }
1797                                 vm_object_drop(source);
1798                                 return;
1799                         }
1800                         vm_object_reference_locked(source);
1801                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1802                 } else {
1803                         vm_object_reference_quick(source);
1804                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1805                 }
1806         }
1807
1808         /*
1809          * Allocate a new object with the given length.  The new object
1810          * is returned referenced but we may have to add another one.
1811          * If we are adding a second reference we must clear OBJ_ONEMAPPING.
1812          * (typically because the caller is about to clone a vm_map_entry).
1813          *
1814          * The source object currently has an extra reference to prevent
1815          * collapses into it while we mess with its shadow list, which
1816          * we will remove later in this routine.
1817          */
1818         if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
1819                 panic("vm_object_shadow: no object for shadowing");
1820         vm_object_hold(result);
1821         if (addref) {
1822                 vm_object_reference_locked(result);
1823                 vm_object_clear_flag(result, OBJ_ONEMAPPING);
1824         }
1825
1826         /*
1827          * The new object shadows the source object.  Chain wait before
1828          * adjusting shadow_count or the shadow list to avoid races.
1829          *
1830          * Try to optimize the result object's page color when shadowing
1831          * in order to maintain page coloring consistency in the combined 
1832          * shadowed object.
1833          *
1834          * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
1835          */
1836         KKASSERT(result->backing_object == NULL);
1837         result->backing_object = source;
1838         if (source) {
1839                 if (useshadowlist) {
1840                         vm_object_chain_wait(source, 0);
1841                         LIST_INSERT_HEAD(&source->shadow_head,
1842                                          result, shadow_list);
1843                         source->shadow_count++;
1844                         source->generation++;
1845                         vm_object_set_flag(result, OBJ_ONSHADOW);
1846                 }
1847                 /* cpu localization twist */
1848                 result->pg_color = (int)(intptr_t)curthread;
1849         }
1850
1851         /*
1852          * Adjust the return storage.  Drop the ref on source before
1853          * returning.
1854          */
1855         result->backing_object_offset = *offset;
1856         vm_object_drop(result);
1857         *offset = 0;
1858         if (source) {
1859                 if (useshadowlist) {
1860                         vm_object_deallocate_locked(source);
1861                         vm_object_drop(source);
1862                 } else {
1863                         vm_object_deallocate(source);
1864                 }
1865         }
1866
1867         /*
1868          * Return the new things
1869          */
1870         *objectp = result;
1871 }
1872
1873 #define OBSC_TEST_ALL_SHADOWED  0x0001
1874 #define OBSC_COLLAPSE_NOWAIT    0x0002
1875 #define OBSC_COLLAPSE_WAIT      0x0004
1876
1877 static int vm_object_backing_scan_callback(vm_page_t p, void *data);
1878
1879 /*
1880  * The caller must hold the object.
1881  */
1882 static __inline int
1883 vm_object_backing_scan(vm_object_t object, vm_object_t backing_object, int op)
1884 {
1885         struct rb_vm_page_scan_info info;
1886
1887         vm_object_assert_held(object);
1888         vm_object_assert_held(backing_object);
1889
1890         KKASSERT(backing_object == object->backing_object);
1891         info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1892
1893         /*
1894          * Initial conditions
1895          */
1896         if (op & OBSC_TEST_ALL_SHADOWED) {
1897                 /*
1898                  * We do not want to have to test for the existence of
1899                  * swap pages in the backing object.  XXX but with the
1900                  * new swapper this would be pretty easy to do.
1901                  *
1902                  * XXX what about anonymous MAP_SHARED memory that hasn't
1903                  * been ZFOD faulted yet?  If we do not test for this, the
1904                  * shadow test may succeed! XXX
1905                  */
1906                 if (backing_object->type != OBJT_DEFAULT)
1907                         return(0);
1908         }
1909         if (op & OBSC_COLLAPSE_WAIT) {
1910                 KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
1911                 vm_object_set_flag(backing_object, OBJ_DEAD);
1912                 lwkt_gettoken(&vmobj_token);
1913                 TAILQ_REMOVE(&vm_object_list, backing_object, object_list);
1914                 vm_object_count--;
1915                 lwkt_reltoken(&vmobj_token);
1916         }
1917
1918         /*
1919          * Our scan.   We have to retry if a negative error code is returned,
1920          * otherwise 0 or 1 will be returned in info.error.  0 Indicates that
1921          * the scan had to be stopped because the parent does not completely
1922          * shadow the child.
1923          */
1924         info.object = object;
1925         info.backing_object = backing_object;
1926         info.limit = op;
1927         do {
1928                 info.error = 1;
1929                 vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
1930                                         vm_object_backing_scan_callback,
1931                                         &info);
1932         } while (info.error < 0);
1933
1934         return(info.error);
1935 }
1936
1937 /*
1938  * The caller must hold the object.
1939  */
1940 static int
1941 vm_object_backing_scan_callback(vm_page_t p, void *data)
1942 {
1943         struct rb_vm_page_scan_info *info = data;
1944         vm_object_t backing_object;
1945         vm_object_t object;
1946         vm_pindex_t pindex;
1947         vm_pindex_t new_pindex;
1948         vm_pindex_t backing_offset_index;
1949         int op;
1950
1951         pindex = p->pindex;
1952         new_pindex = pindex - info->backing_offset_index;
1953         op = info->limit;
1954         object = info->object;
1955         backing_object = info->backing_object;
1956         backing_offset_index = info->backing_offset_index;
1957
1958         if (op & OBSC_TEST_ALL_SHADOWED) {
1959                 vm_page_t pp;
1960
1961                 /*
1962                  * Ignore pages outside the parent object's range
1963                  * and outside the parent object's mapping of the 
1964                  * backing object.
1965                  *
1966                  * note that we do not busy the backing object's
1967                  * page.
1968                  */
1969                 if (pindex < backing_offset_index ||
1970                     new_pindex >= object->size
1971                 ) {
1972                         return(0);
1973                 }
1974
1975                 /*
1976                  * See if the parent has the page or if the parent's
1977                  * object pager has the page.  If the parent has the
1978                  * page but the page is not valid, the parent's
1979                  * object pager must have the page.
1980                  *
1981                  * If this fails, the parent does not completely shadow
1982                  * the object and we might as well give up now.
1983                  */
1984                 pp = vm_page_lookup(object, new_pindex);
1985                 if ((pp == NULL || pp->valid == 0) &&
1986                     !vm_pager_has_page(object, new_pindex)
1987                 ) {
1988                         info->error = 0;        /* problemo */
1989                         return(-1);             /* stop the scan */
1990                 }
1991         }
1992
1993         /*
1994          * Check for busy page.  Note that we may have lost (p) when we
1995          * possibly blocked above.
1996          */
1997         if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1998                 vm_page_t pp;
1999
2000                 if (vm_page_busy_try(p, TRUE)) {
2001                         if (op & OBSC_COLLAPSE_NOWAIT) {
2002                                 return(0);
2003                         } else {
2004                                 /*
2005                                  * If we slept, anything could have
2006                                  * happened.   Ask that the scan be restarted.
2007                                  *
2008                                  * Since the object is marked dead, the
2009                                  * backing offset should not have changed.  
2010                                  */
2011                                 vm_page_sleep_busy(p, TRUE, "vmocol");
2012                                 info->error = -1;
2013                                 return(-1);
2014                         }
2015                 }
2016
2017                 /*
2018                  * If (p) is no longer valid restart the scan.
2019                  */
2020                 if (p->object != backing_object || p->pindex != pindex) {
2021                         kprintf("vm_object_backing_scan: Warning: page "
2022                                 "%p ripped out from under us\n", p);
2023                         vm_page_wakeup(p);
2024                         info->error = -1;
2025                         return(-1);
2026                 }
2027
2028                 if (op & OBSC_COLLAPSE_NOWAIT) {
2029                         if (p->valid == 0 ||
2030                             p->wire_count ||
2031                             (p->flags & PG_NEED_COMMIT)) {
2032                                 vm_page_wakeup(p);
2033                                 return(0);
2034                         }
2035                 } else {
2036                         /* XXX what if p->valid == 0 , hold_count, etc? */
2037                 }
2038
2039                 KASSERT(
2040                     p->object == backing_object,
2041                     ("vm_object_qcollapse(): object mismatch")
2042                 );
2043
2044                 /*
2045                  * Destroy any associated swap
2046                  */
2047                 if (backing_object->type == OBJT_SWAP)
2048                         swap_pager_freespace(backing_object, p->pindex, 1);
2049
2050                 if (
2051                     p->pindex < backing_offset_index ||
2052                     new_pindex >= object->size
2053                 ) {
2054                         /*
2055                          * Page is out of the parent object's range, we 
2056                          * can simply destroy it. 
2057                          */
2058                         vm_page_protect(p, VM_PROT_NONE);
2059                         vm_page_free(p);
2060                         return(0);
2061                 }
2062
2063                 pp = vm_page_lookup(object, new_pindex);
2064                 if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
2065                         /*
2066                          * page already exists in parent OR swap exists
2067                          * for this location in the parent.  Destroy 
2068                          * the original page from the backing object.
2069                          *
2070                          * Leave the parent's page alone
2071                          */
2072                         vm_page_protect(p, VM_PROT_NONE);
2073                         vm_page_free(p);
2074                         return(0);
2075                 }
2076
2077                 /*
2078                  * Page does not exist in parent, rename the
2079                  * page from the backing object to the main object. 
2080                  *
2081                  * If the page was mapped to a process, it can remain 
2082                  * mapped through the rename.
2083                  */
2084                 if ((p->queue - p->pc) == PQ_CACHE)
2085                         vm_page_deactivate(p);
2086
2087                 vm_page_rename(p, object, new_pindex);
2088                 vm_page_wakeup(p);
2089                 /* page automatically made dirty by rename */
2090         }
2091         return(0);
2092 }
2093
2094 /*
2095  * This version of collapse allows the operation to occur earlier and
2096  * when paging_in_progress is true for an object...  This is not a complete
2097  * operation, but should plug 99.9% of the rest of the leaks.
2098  *
2099  * The caller must hold the object and backing_object and both must be
2100  * chainlocked.
2101  *
2102  * (only called from vm_object_collapse)
2103  */
2104 static void
2105 vm_object_qcollapse(vm_object_t object, vm_object_t backing_object)
2106 {
2107         if (backing_object->ref_count == 1) {
2108                 atomic_add_int(&backing_object->ref_count, 2);
2109                 vm_object_backing_scan(object, backing_object,
2110                                        OBSC_COLLAPSE_NOWAIT);
2111                 atomic_add_int(&backing_object->ref_count, -2);
2112         }
2113 }
2114
2115 /*
2116  * Collapse an object with the object backing it.  Pages in the backing
2117  * object are moved into the parent, and the backing object is deallocated.
2118  * Any conflict is resolved in favor of the parent's existing pages.
2119  *
2120  * object must be held and chain-locked on call.
2121  *
2122  * The caller must have an extra ref on object to prevent a race from
2123  * destroying it during the collapse.
2124  */
2125 void
2126 vm_object_collapse(vm_object_t object, struct vm_object_dealloc_list **dlistp)
2127 {
2128         struct vm_object_dealloc_list *dlist = NULL;
2129         vm_object_t backing_object;
2130
2131         /*
2132          * Only one thread is attempting a collapse at any given moment.
2133          * There are few restrictions for (object) that callers of this
2134          * function check so reentrancy is likely.
2135          */
2136         KKASSERT(object != NULL);
2137         vm_object_assert_held(object);
2138         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
2139
2140         for (;;) {
2141                 vm_object_t bbobj;
2142                 int dodealloc;
2143
2144                 /*
2145                  * We can only collapse a DEFAULT/SWAP object with a
2146                  * DEFAULT/SWAP object.
2147                  */
2148                 if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) {
2149                         backing_object = NULL;
2150                         break;
2151                 }
2152
2153                 backing_object = object->backing_object;
2154                 if (backing_object == NULL)
2155                         break;
2156                 if (backing_object->type != OBJT_DEFAULT &&
2157                     backing_object->type != OBJT_SWAP) {
2158                         backing_object = NULL;
2159                         break;
2160                 }
2161
2162                 /*
2163                  * Hold the backing_object and check for races
2164                  */
2165                 vm_object_hold(backing_object);
2166                 if (backing_object != object->backing_object ||
2167                     (backing_object->type != OBJT_DEFAULT &&
2168                      backing_object->type != OBJT_SWAP)) {
2169                         vm_object_drop(backing_object);
2170                         continue;
2171                 }
2172
2173                 /*
2174                  * Chain-lock the backing object too because if we
2175                  * successfully merge its pages into the top object we
2176                  * will collapse backing_object->backing_object as the
2177                  * new backing_object.  Re-check that it is still our
2178                  * backing object.
2179                  */
2180                 vm_object_chain_acquire(backing_object, 0);
2181                 if (backing_object != object->backing_object) {
2182                         vm_object_chain_release(backing_object);
2183                         vm_object_drop(backing_object);
2184                         continue;
2185                 }
2186
2187                 /*
2188                  * we check the backing object first, because it is most likely
2189                  * not collapsable.
2190                  */
2191                 if (backing_object->handle != NULL ||
2192                     (backing_object->type != OBJT_DEFAULT &&
2193                      backing_object->type != OBJT_SWAP) ||
2194                     (backing_object->flags & OBJ_DEAD) ||
2195                     object->handle != NULL ||
2196                     (object->type != OBJT_DEFAULT &&
2197                      object->type != OBJT_SWAP) ||
2198                     (object->flags & OBJ_DEAD)) {
2199                         break;
2200                 }
2201
2202                 /*
2203                  * If paging is in progress we can't do a normal collapse.
2204                  */
2205                 if (
2206                     object->paging_in_progress != 0 ||
2207                     backing_object->paging_in_progress != 0
2208                 ) {
2209                         vm_object_qcollapse(object, backing_object);
2210                         break;
2211                 }
2212
2213                 /*
2214                  * We know that we can either collapse the backing object (if
2215                  * the parent is the only reference to it) or (perhaps) have
2216                  * the parent bypass the object if the parent happens to shadow
2217                  * all the resident pages in the entire backing object.
2218                  *
2219                  * This is ignoring pager-backed pages such as swap pages.
2220                  * vm_object_backing_scan fails the shadowing test in this
2221                  * case.
2222                  */
2223                 if (backing_object->ref_count == 1) {
2224                         /*
2225                          * If there is exactly one reference to the backing
2226                          * object, we can collapse it into the parent.  
2227                          */
2228                         KKASSERT(object->backing_object == backing_object);
2229                         vm_object_backing_scan(object, backing_object,
2230                                                OBSC_COLLAPSE_WAIT);
2231
2232                         /*
2233                          * Move the pager from backing_object to object.
2234                          */
2235                         if (backing_object->type == OBJT_SWAP) {
2236                                 vm_object_pip_add(backing_object, 1);
2237
2238                                 /*
2239                                  * scrap the paging_offset junk and do a 
2240                                  * discrete copy.  This also removes major 
2241                                  * assumptions about how the swap-pager 
2242                                  * works from where it doesn't belong.  The
2243                                  * new swapper is able to optimize the
2244                                  * destroy-source case.
2245                                  */
2246                                 vm_object_pip_add(object, 1);
2247                                 swap_pager_copy(backing_object, object,
2248                                     OFF_TO_IDX(object->backing_object_offset),
2249                                     TRUE);
2250                                 vm_object_pip_wakeup(object);
2251                                 vm_object_pip_wakeup(backing_object);
2252                         }
2253
2254                         /*
2255                          * Object now shadows whatever backing_object did.
2256                          * Remove object from backing_object's shadow_list.
2257                          */
2258                         KKASSERT(object->backing_object == backing_object);
2259                         if (object->flags & OBJ_ONSHADOW) {
2260                                 LIST_REMOVE(object, shadow_list);
2261                                 backing_object->shadow_count--;
2262                                 backing_object->generation++;
2263                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2264                         }
2265
2266                         /*
2267                          * backing_object->backing_object moves from within
2268                          * backing_object to within object.
2269                          *
2270                          * OBJT_VNODE bbobj's should have empty shadow lists.
2271                          */
2272                         while ((bbobj = backing_object->backing_object) != NULL) {
2273                                 if (bbobj->type == OBJT_VNODE)
2274                                         vm_object_hold_shared(bbobj);
2275                                 else
2276                                         vm_object_hold(bbobj);
2277                                 if (bbobj == backing_object->backing_object)
2278                                         break;
2279                                 vm_object_drop(bbobj);
2280                         }
2281                         if (bbobj) {
2282                                 if (backing_object->flags & OBJ_ONSHADOW) {
2283                                         /* not locked exclusively if vnode */
2284                                         KKASSERT(bbobj->type != OBJT_VNODE);
2285                                         LIST_REMOVE(backing_object,
2286                                                     shadow_list);
2287                                         bbobj->shadow_count--;
2288                                         bbobj->generation++;
2289                                         vm_object_clear_flag(backing_object,
2290                                                              OBJ_ONSHADOW);
2291                                 }
2292                                 backing_object->backing_object = NULL;
2293                         }
2294                         object->backing_object = bbobj;
2295                         if (bbobj) {
2296                                 if (bbobj->type != OBJT_VNODE) {
2297                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2298                                                          object, shadow_list);
2299                                         bbobj->shadow_count++;
2300                                         bbobj->generation++;
2301                                         vm_object_set_flag(object,
2302                                                            OBJ_ONSHADOW);
2303                                 }
2304                         }
2305
2306                         object->backing_object_offset +=
2307                                 backing_object->backing_object_offset;
2308
2309                         vm_object_drop(bbobj);
2310
2311                         /*
2312                          * Discard the old backing_object.  Nothing should be
2313                          * able to ref it, other than a vm_map_split(),
2314                          * and vm_map_split() will stall on our chain lock.
2315                          * And we control the parent so it shouldn't be
2316                          * possible for it to go away either.
2317                          *
2318                          * Since the backing object has no pages, no pager
2319                          * left, and no object references within it, all
2320                          * that is necessary is to dispose of it.
2321                          */
2322                         KASSERT(backing_object->ref_count == 1,
2323                                 ("backing_object %p was somehow "
2324                                  "re-referenced during collapse!",
2325                                  backing_object));
2326                         KASSERT(RB_EMPTY(&backing_object->rb_memq),
2327                                 ("backing_object %p somehow has left "
2328                                  "over pages during collapse!",
2329                                  backing_object));
2330
2331                         /*
2332                          * The object can be destroyed.
2333                          *
2334                          * XXX just fall through and dodealloc instead
2335                          *     of forcing destruction?
2336                          */
2337                         atomic_add_int(&backing_object->ref_count, -1);
2338                         if ((backing_object->flags & OBJ_DEAD) == 0)
2339                                 vm_object_terminate(backing_object);
2340                         object_collapses++;
2341                         dodealloc = 0;
2342                 } else {
2343                         /*
2344                          * If we do not entirely shadow the backing object,
2345                          * there is nothing we can do so we give up.
2346                          */
2347                         if (vm_object_backing_scan(object, backing_object,
2348                                                 OBSC_TEST_ALL_SHADOWED) == 0) {
2349                                 break;
2350                         }
2351
2352                         /*
2353                          * bbobj is backing_object->backing_object.  Since
2354                          * object completely shadows backing_object we can
2355                          * bypass it and become backed by bbobj instead.
2356                          *
2357                          * The shadow list for vnode backing objects is not
2358                          * used and a shared hold is allowed.
2359                          */
2360                         while ((bbobj = backing_object->backing_object) != NULL) {
2361                                 if (bbobj->type == OBJT_VNODE)
2362                                         vm_object_hold_shared(bbobj);
2363                                 else
2364                                         vm_object_hold(bbobj);
2365                                 if (bbobj == backing_object->backing_object)
2366                                         break;
2367                                 vm_object_drop(bbobj);
2368                         }
2369
2370                         /*
2371                          * Make object shadow bbobj instead of backing_object.
2372                          * Remove object from backing_object's shadow list.
2373                          *
2374                          * Deallocating backing_object will not remove
2375                          * it, since its reference count is at least 2.
2376                          */
2377                         KKASSERT(object->backing_object == backing_object);
2378                         if (object->flags & OBJ_ONSHADOW) {
2379                                 LIST_REMOVE(object, shadow_list);
2380                                 backing_object->shadow_count--;
2381                                 backing_object->generation++;
2382                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2383                         }
2384
2385                         /*
2386                          * Add a ref to bbobj, bbobj now shadows object.
2387                          *
2388                          * NOTE: backing_object->backing_object still points
2389                          *       to bbobj.  That relationship remains intact
2390                          *       because backing_object has > 1 ref, so
2391                          *       someone else is pointing to it (hence why
2392                          *       we can't collapse it into object and can
2393                          *       only handle the all-shadowed bypass case).
2394                          */
2395                         if (bbobj) {
2396                                 if (bbobj->type != OBJT_VNODE) {
2397                                         vm_object_chain_wait(bbobj, 0);
2398                                         vm_object_reference_locked(bbobj);
2399                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2400                                                          object, shadow_list);
2401                                         bbobj->shadow_count++;
2402                                         bbobj->generation++;
2403                                         vm_object_set_flag(object,
2404                                                            OBJ_ONSHADOW);
2405                                 } else {
2406                                         vm_object_reference_quick(bbobj);
2407                                 }
2408                                 object->backing_object_offset +=
2409                                         backing_object->backing_object_offset;
2410                                 object->backing_object = bbobj;
2411                                 vm_object_drop(bbobj);
2412                         } else {
2413                                 object->backing_object = NULL;
2414                         }
2415
2416                         /*
2417                          * Drop the reference count on backing_object.  To
2418                          * handle ref_count races properly we can't assume
2419                          * that the ref_count is still at least 2 so we
2420                          * have to actually call vm_object_deallocate()
2421                          * (after clearing the chainlock).
2422                          */
2423                         object_bypasses++;
2424                         dodealloc = 1;
2425                 }
2426
2427                 /*
2428                  * Ok, we want to loop on the new object->bbobj association,
2429                  * possibly collapsing it further.  However if dodealloc is
2430                  * non-zero we have to deallocate the backing_object which
2431                  * itself can potentially undergo a collapse, creating a
2432                  * recursion depth issue with the LWKT token subsystem.
2433                  *
2434                  * In the case where we must deallocate the backing_object
2435                  * it is possible now that the backing_object has a single
2436                  * shadow count on some other object (not represented here
2437                  * as yet), since it no longer shadows us.  Thus when we
2438                  * call vm_object_deallocate() it may attempt to collapse
2439                  * itself into its remaining parent.
2440                  */
2441                 if (dodealloc) {
2442                         struct vm_object_dealloc_list *dtmp;
2443
2444                         vm_object_chain_release(backing_object);
2445                         vm_object_unlock(backing_object);
2446                         /* backing_object remains held */
2447
2448                         /*
2449                          * Auto-deallocation list for caller convenience.
2450                          */
2451                         if (dlistp == NULL)
2452                                 dlistp = &dlist;
2453
2454                         dtmp = kmalloc(sizeof(*dtmp), M_TEMP, M_WAITOK);
2455                         dtmp->object = backing_object;
2456                         dtmp->next = *dlistp;
2457                         *dlistp = dtmp;
2458                 } else {
2459                         vm_object_chain_release(backing_object);
2460                         vm_object_drop(backing_object);
2461                 }
2462                 /* backing_object = NULL; not needed */
2463                 /* loop */
2464         }
2465
2466         /*
2467          * Clean up any left over backing_object
2468          */
2469         if (backing_object) {
2470                 vm_object_chain_release(backing_object);
2471                 vm_object_drop(backing_object);
2472         }
2473
2474         /*
2475          * Clean up any auto-deallocation list.  This is a convenience
2476          * for top-level callers so they don't have to pass &dlist.
2477          * Do not clean up any caller-passed dlistp, the caller will
2478          * do that.
2479          */
2480         if (dlist)
2481                 vm_object_deallocate_list(&dlist);
2482
2483 }
2484
2485 /*
2486  * vm_object_collapse() may collect additional objects in need of
2487  * deallocation.  This routine deallocates these objects.  The
2488  * deallocation itself can trigger additional collapses (which the
2489  * deallocate function takes care of).  This procedure is used to
2490  * reduce procedural recursion since these vm_object shadow chains
2491  * can become quite long.
2492  */
2493 void
2494 vm_object_deallocate_list(struct vm_object_dealloc_list **dlistp)
2495 {
2496         struct vm_object_dealloc_list *dlist;
2497
2498         while ((dlist = *dlistp) != NULL) {
2499                 *dlistp = dlist->next;
2500                 vm_object_lock(dlist->object);
2501                 vm_object_deallocate_locked(dlist->object);
2502                 vm_object_drop(dlist->object);
2503                 kfree(dlist, M_TEMP);
2504         }
2505 }
2506
2507 /*
2508  * Removes all physical pages in the specified object range from the
2509  * object's list of pages.
2510  *
2511  * No requirements.
2512  */
2513 static int vm_object_page_remove_callback(vm_page_t p, void *data);
2514
2515 void
2516 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
2517                       boolean_t clean_only)
2518 {
2519         struct rb_vm_page_scan_info info;
2520         int all;
2521
2522         /*
2523          * Degenerate cases and assertions
2524          */
2525         vm_object_hold(object);
2526         if (object == NULL ||
2527             (object->resident_page_count == 0 && object->swblock_count == 0)) {
2528                 vm_object_drop(object);
2529                 return;
2530         }
2531         KASSERT(object->type != OBJT_PHYS, 
2532                 ("attempt to remove pages from a physical object"));
2533
2534         /*
2535          * Indicate that paging is occuring on the object
2536          */
2537         vm_object_pip_add(object, 1);
2538
2539         /*
2540          * Figure out the actual removal range and whether we are removing
2541          * the entire contents of the object or not.  If removing the entire
2542          * contents, be sure to get all pages, even those that might be 
2543          * beyond the end of the object.
2544          */
2545         info.start_pindex = start;
2546         if (end == 0)
2547                 info.end_pindex = (vm_pindex_t)-1;
2548         else
2549                 info.end_pindex = end - 1;
2550         info.limit = clean_only;
2551         all = (start == 0 && info.end_pindex >= object->size - 1);
2552
2553         /*
2554          * Loop until we are sure we have gotten them all.
2555          */
2556         do {
2557                 info.error = 0;
2558                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2559                                         vm_object_page_remove_callback, &info);
2560         } while (info.error);
2561
2562         /*
2563          * Remove any related swap if throwing away pages, or for
2564          * non-swap objects (the swap is a clean copy in that case).
2565          */
2566         if (object->type != OBJT_SWAP || clean_only == FALSE) {
2567                 if (all)
2568                         swap_pager_freespace_all(object);
2569                 else
2570                         swap_pager_freespace(object, info.start_pindex,
2571                              info.end_pindex - info.start_pindex + 1);
2572         }
2573
2574         /*
2575          * Cleanup
2576          */
2577         vm_object_pip_wakeup(object);
2578         vm_object_drop(object);
2579 }
2580
2581 /*
2582  * The caller must hold the object
2583  */
2584 static int
2585 vm_object_page_remove_callback(vm_page_t p, void *data)
2586 {
2587         struct rb_vm_page_scan_info *info = data;
2588
2589         if (vm_page_busy_try(p, TRUE)) {
2590                 vm_page_sleep_busy(p, TRUE, "vmopar");
2591                 info->error = 1;
2592                 return(0);
2593         }
2594
2595         /*
2596          * Wired pages cannot be destroyed, but they can be invalidated
2597          * and we do so if clean_only (limit) is not set.
2598          *
2599          * WARNING!  The page may be wired due to being part of a buffer
2600          *           cache buffer, and the buffer might be marked B_CACHE.
2601          *           This is fine as part of a truncation but VFSs must be
2602          *           sure to fix the buffer up when re-extending the file.
2603          *
2604          * NOTE!     PG_NEED_COMMIT is ignored.
2605          */
2606         if (p->wire_count != 0) {
2607                 vm_page_protect(p, VM_PROT_NONE);
2608                 if (info->limit == 0)
2609                         p->valid = 0;
2610                 vm_page_wakeup(p);
2611                 return(0);
2612         }
2613
2614         /*
2615          * limit is our clean_only flag.  If set and the page is dirty or
2616          * requires a commit, do not free it.  If set and the page is being
2617          * held by someone, do not free it.
2618          */
2619         if (info->limit && p->valid) {
2620                 vm_page_test_dirty(p);
2621                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
2622                         vm_page_wakeup(p);
2623                         return(0);
2624                 }
2625 #if 0
2626                 if (p->hold_count) {
2627                         vm_page_wakeup(p);
2628                         return(0);
2629                 }
2630 #endif
2631         }
2632
2633         /*
2634          * Destroy the page
2635          */
2636         vm_page_protect(p, VM_PROT_NONE);
2637         vm_page_free(p);
2638         return(0);
2639 }
2640
2641 /*
2642  * Coalesces two objects backing up adjoining regions of memory into a
2643  * single object.
2644  *
2645  * returns TRUE if objects were combined.
2646  *
2647  * NOTE: Only works at the moment if the second object is NULL -
2648  *       if it's not, which object do we lock first?
2649  *
2650  * Parameters:
2651  *      prev_object     First object to coalesce
2652  *      prev_offset     Offset into prev_object
2653  *      next_object     Second object into coalesce
2654  *      next_offset     Offset into next_object
2655  *
2656  *      prev_size       Size of reference to prev_object
2657  *      next_size       Size of reference to next_object
2658  *
2659  * The caller does not need to hold (prev_object) but must have a stable
2660  * pointer to it (typically by holding the vm_map locked).
2661  */
2662 boolean_t
2663 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
2664                    vm_size_t prev_size, vm_size_t next_size)
2665 {
2666         vm_pindex_t next_pindex;
2667
2668         if (prev_object == NULL)
2669                 return (TRUE);
2670
2671         vm_object_hold(prev_object);
2672
2673         if (prev_object->type != OBJT_DEFAULT &&
2674             prev_object->type != OBJT_SWAP) {
2675                 vm_object_drop(prev_object);
2676                 return (FALSE);
2677         }
2678
2679         /*
2680          * Try to collapse the object first
2681          */
2682         vm_object_chain_acquire(prev_object, 0);
2683         vm_object_collapse(prev_object, NULL);
2684
2685         /*
2686          * Can't coalesce if: . more than one reference . paged out . shadows
2687          * another object . has a copy elsewhere (any of which mean that the
2688          * pages not mapped to prev_entry may be in use anyway)
2689          */
2690
2691         if (prev_object->backing_object != NULL) {
2692                 vm_object_chain_release(prev_object);
2693                 vm_object_drop(prev_object);
2694                 return (FALSE);
2695         }
2696
2697         prev_size >>= PAGE_SHIFT;
2698         next_size >>= PAGE_SHIFT;
2699         next_pindex = prev_pindex + prev_size;
2700
2701         if ((prev_object->ref_count > 1) &&
2702             (prev_object->size != next_pindex)) {
2703                 vm_object_chain_release(prev_object);
2704                 vm_object_drop(prev_object);
2705                 return (FALSE);
2706         }
2707
2708         /*
2709          * Remove any pages that may still be in the object from a previous
2710          * deallocation.
2711          */
2712         if (next_pindex < prev_object->size) {
2713                 vm_object_page_remove(prev_object,
2714                                       next_pindex,
2715                                       next_pindex + next_size, FALSE);
2716                 if (prev_object->type == OBJT_SWAP)
2717                         swap_pager_freespace(prev_object,
2718                                              next_pindex, next_size);
2719         }
2720
2721         /*
2722          * Extend the object if necessary.
2723          */
2724         if (next_pindex + next_size > prev_object->size)
2725                 prev_object->size = next_pindex + next_size;
2726
2727         vm_object_chain_release(prev_object);
2728         vm_object_drop(prev_object);
2729         return (TRUE);
2730 }
2731
2732 /*
2733  * Make the object writable and flag is being possibly dirty.
2734  *
2735  * The object might not be held (or might be held but held shared),
2736  * the related vnode is probably not held either.  Object and vnode are
2737  * stable by virtue of the vm_page busied by the caller preventing
2738  * destruction.
2739  *
2740  * If the related mount is flagged MNTK_THR_SYNC we need to call
2741  * vsetobjdirty().  Filesystems using this option usually shortcut
2742  * synchronization by only scanning the syncer list.
2743  */
2744 void
2745 vm_object_set_writeable_dirty(vm_object_t object)
2746 {
2747         struct vnode *vp;
2748
2749         /*vm_object_assert_held(object);*/
2750         /*
2751          * Avoid contention in vm fault path by checking the state before
2752          * issuing an atomic op on it.
2753          */
2754         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
2755             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
2756                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
2757         }
2758         if (object->type == OBJT_VNODE &&
2759             (vp = (struct vnode *)object->handle) != NULL) {
2760                 if ((vp->v_flag & VOBJDIRTY) == 0) {
2761                         if (vp->v_mount &&
2762                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
2763                                 vsetobjdirty(vp);
2764                         } else {
2765                                 vsetflags(vp, VOBJDIRTY);
2766                         }
2767                 }
2768         }
2769 }
2770
2771 #include "opt_ddb.h"
2772 #ifdef DDB
2773 #include <sys/kernel.h>
2774
2775 #include <sys/cons.h>
2776
2777 #include <ddb/ddb.h>
2778
2779 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
2780                                        vm_map_entry_t entry);
2781 static int      vm_object_in_map (vm_object_t object);
2782
2783 /*
2784  * The caller must hold the object.
2785  */
2786 static int
2787 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2788 {
2789         vm_map_t tmpm;
2790         vm_map_entry_t tmpe;
2791         vm_object_t obj, nobj;
2792         int entcount;
2793
2794         if (map == 0)
2795                 return 0;
2796         if (entry == 0) {
2797                 tmpe = map->header.next;
2798                 entcount = map->nentries;
2799                 while (entcount-- && (tmpe != &map->header)) {
2800                         if( _vm_object_in_map(map, object, tmpe)) {
2801                                 return 1;
2802                         }
2803                         tmpe = tmpe->next;
2804                 }
2805                 return (0);
2806         }
2807         switch(entry->maptype) {
2808         case VM_MAPTYPE_SUBMAP:
2809                 tmpm = entry->object.sub_map;
2810                 tmpe = tmpm->header.next;
2811                 entcount = tmpm->nentries;
2812                 while (entcount-- && tmpe != &tmpm->header) {
2813                         if( _vm_object_in_map(tmpm, object, tmpe)) {
2814                                 return 1;
2815                         }
2816                         tmpe = tmpe->next;
2817                 }
2818                 break;
2819         case VM_MAPTYPE_NORMAL:
2820         case VM_MAPTYPE_VPAGETABLE:
2821                 obj = entry->object.vm_object;
2822                 while (obj) {
2823                         if (obj == object) {
2824                                 if (obj != entry->object.vm_object)
2825                                         vm_object_drop(obj);
2826                                 return 1;
2827                         }
2828                         while ((nobj = obj->backing_object) != NULL) {
2829                                 vm_object_hold(nobj);
2830                                 if (nobj == obj->backing_object)
2831                                         break;
2832                                 vm_object_drop(nobj);
2833                         }
2834                         if (obj != entry->object.vm_object) {
2835                                 if (nobj)
2836                                         vm_object_lock_swap();
2837                                 vm_object_drop(obj);
2838                         }
2839                         obj = nobj;
2840                 }
2841                 break;
2842         default:
2843                 break;
2844         }
2845         return 0;
2846 }
2847
2848 static int vm_object_in_map_callback(struct proc *p, void *data);
2849
2850 struct vm_object_in_map_info {
2851         vm_object_t object;
2852         int rv;
2853 };
2854
2855 /*
2856  * Debugging only
2857  */
2858 static int
2859 vm_object_in_map(vm_object_t object)
2860 {
2861         struct vm_object_in_map_info info;
2862
2863         info.rv = 0;
2864         info.object = object;
2865
2866         allproc_scan(vm_object_in_map_callback, &info);
2867         if (info.rv)
2868                 return 1;
2869         if( _vm_object_in_map(&kernel_map, object, 0))
2870                 return 1;
2871         if( _vm_object_in_map(&pager_map, object, 0))
2872                 return 1;
2873         if( _vm_object_in_map(&buffer_map, object, 0))
2874                 return 1;
2875         return 0;
2876 }
2877
2878 /*
2879  * Debugging only
2880  */
2881 static int
2882 vm_object_in_map_callback(struct proc *p, void *data)
2883 {
2884         struct vm_object_in_map_info *info = data;
2885
2886         if (p->p_vmspace) {
2887                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
2888                         info->rv = 1;
2889                         return -1;
2890                 }
2891         }
2892         return (0);
2893 }
2894
2895 DB_SHOW_COMMAND(vmochk, vm_object_check)
2896 {
2897         vm_object_t object;
2898
2899         /*
2900          * make sure that internal objs are in a map somewhere
2901          * and none have zero ref counts.
2902          */
2903         for (object = TAILQ_FIRST(&vm_object_list);
2904                         object != NULL;
2905                         object = TAILQ_NEXT(object, object_list)) {
2906                 if (object->type == OBJT_MARKER)
2907                         continue;
2908                 if (object->handle == NULL &&
2909                     (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2910                         if (object->ref_count == 0) {
2911                                 db_printf("vmochk: internal obj has zero ref count: %ld\n",
2912                                         (long)object->size);
2913                         }
2914                         if (!vm_object_in_map(object)) {
2915                                 db_printf(
2916                         "vmochk: internal obj is not in a map: "
2917                         "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
2918                                     object->ref_count, (u_long)object->size, 
2919                                     (u_long)object->size,
2920                                     (void *)object->backing_object);
2921                         }
2922                 }
2923         }
2924 }
2925
2926 /*
2927  * Debugging only
2928  */
2929 DB_SHOW_COMMAND(object, vm_object_print_static)
2930 {
2931         /* XXX convert args. */
2932         vm_object_t object = (vm_object_t)addr;
2933         boolean_t full = have_addr;
2934
2935         vm_page_t p;
2936
2937         /* XXX count is an (unused) arg.  Avoid shadowing it. */
2938 #define count   was_count
2939
2940         int count;
2941
2942         if (object == NULL)
2943                 return;
2944
2945         db_iprintf(
2946             "Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
2947             object, (int)object->type, (u_long)object->size,
2948             object->resident_page_count, object->ref_count, object->flags);
2949         /*
2950          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
2951          */
2952         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
2953             object->shadow_count, 
2954             object->backing_object ? object->backing_object->ref_count : 0,
2955             object->backing_object, (long)object->backing_object_offset);
2956
2957         if (!full)
2958                 return;
2959
2960         db_indent += 2;
2961         count = 0;
2962         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
2963                 if (count == 0)
2964                         db_iprintf("memory:=");
2965                 else if (count == 6) {
2966                         db_printf("\n");
2967                         db_iprintf(" ...");
2968                         count = 0;
2969                 } else
2970                         db_printf(",");
2971                 count++;
2972
2973                 db_printf("(off=0x%lx,page=0x%lx)",
2974                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
2975         }
2976         if (count != 0)
2977                 db_printf("\n");
2978         db_indent -= 2;
2979 }
2980
2981 /* XXX. */
2982 #undef count
2983
2984 /*
2985  * XXX need this non-static entry for calling from vm_map_print.
2986  *
2987  * Debugging only
2988  */
2989 void
2990 vm_object_print(/* db_expr_t */ long addr,
2991                 boolean_t have_addr,
2992                 /* db_expr_t */ long count,
2993                 char *modif)
2994 {
2995         vm_object_print_static(addr, have_addr, count, modif);
2996 }
2997
2998 /*
2999  * Debugging only
3000  */
3001 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
3002 {
3003         vm_object_t object;
3004         int nl = 0;
3005         int c;
3006         for (object = TAILQ_FIRST(&vm_object_list);
3007                         object != NULL;
3008                         object = TAILQ_NEXT(object, object_list)) {
3009                 vm_pindex_t idx, fidx;
3010                 vm_pindex_t osize;
3011                 vm_paddr_t pa = -1, padiff;
3012                 int rcount;
3013                 vm_page_t m;
3014
3015                 if (object->type == OBJT_MARKER)
3016                         continue;
3017                 db_printf("new object: %p\n", (void *)object);
3018                 if ( nl > 18) {
3019                         c = cngetc();
3020                         if (c != ' ')
3021                                 return;
3022                         nl = 0;
3023                 }
3024                 nl++;
3025                 rcount = 0;
3026                 fidx = 0;
3027                 osize = object->size;
3028                 if (osize > 128)
3029                         osize = 128;
3030                 for (idx = 0; idx < osize; idx++) {
3031                         m = vm_page_lookup(object, idx);
3032                         if (m == NULL) {
3033                                 if (rcount) {
3034                                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3035                                                 (long)fidx, rcount, (long)pa);
3036                                         if ( nl > 18) {
3037                                                 c = cngetc();
3038                                                 if (c != ' ')
3039                                                         return;
3040                                                 nl = 0;
3041                                         }
3042                                         nl++;
3043                                         rcount = 0;
3044                                 }
3045                                 continue;
3046                         }
3047
3048                                 
3049                         if (rcount &&
3050                                 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
3051                                 ++rcount;
3052                                 continue;
3053                         }
3054                         if (rcount) {
3055                                 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
3056                                 padiff >>= PAGE_SHIFT;
3057                                 padiff &= PQ_L2_MASK;
3058                                 if (padiff == 0) {
3059                                         pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
3060                                         ++rcount;
3061                                         continue;
3062                                 }
3063                                 db_printf(" index(%ld)run(%d)pa(0x%lx)",
3064                                         (long)fidx, rcount, (long)pa);
3065                                 db_printf("pd(%ld)\n", (long)padiff);
3066                                 if ( nl > 18) {
3067                                         c = cngetc();
3068                                         if (c != ' ')
3069                                                 return;
3070                                         nl = 0;
3071                                 }
3072                                 nl++;
3073                         }
3074                         fidx = idx;
3075                         pa = VM_PAGE_TO_PHYS(m);
3076                         rcount = 1;
3077                 }
3078                 if (rcount) {
3079                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3080                                 (long)fidx, rcount, (long)pa);
3081                         if ( nl > 18) {
3082                                 c = cngetc();
3083                                 if (c != ' ')
3084                                         return;
3085                                 nl = 0;
3086                         }
3087                         nl++;
3088                 }
3089         }
3090 }
3091 #endif /* DDB */