kernel - Fix VM bug introduced earlier this month
[dragonfly.git] / sys / vm / vm_object.c
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62
63 /*
64  *      Virtual memory object module.
65  */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>           /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91
92 #include <vm/vm_page2.h>
93
94 #include <machine/specialreg.h>
95
96 #define EASY_SCAN_FACTOR        8
97
98 static void     vm_object_qcollapse(vm_object_t object,
99                                     vm_object_t backing_object);
100 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
101                                              int pagerflags);
102 static void     vm_object_lock_init(vm_object_t);
103
104
105 /*
106  *      Virtual memory objects maintain the actual data
107  *      associated with allocated virtual memory.  A given
108  *      page of memory exists within exactly one object.
109  *
110  *      An object is only deallocated when all "references"
111  *      are given up.  Only one "reference" to a given
112  *      region of an object should be writeable.
113  *
114  *      Associated with each object is a list of all resident
115  *      memory pages belonging to that object; this list is
116  *      maintained by the "vm_page" module, and locked by the object's
117  *      lock.
118  *
119  *      Each object also records a "pager" routine which is
120  *      used to retrieve (and store) pages to the proper backing
121  *      storage.  In addition, objects may be backed by other
122  *      objects from which they were virtual-copied.
123  *
124  *      The only items within the object structure which are
125  *      modified after time of creation are:
126  *              reference count         locked by object's lock
127  *              pager routine           locked by object's lock
128  *
129  */
130
131 struct vm_object kernel_object;
132
133 static long vm_object_count;
134
135 static long object_collapses;
136 static long object_bypasses;
137 static int next_index;
138 static vm_zone_t obj_zone;
139 static struct vm_zone obj_zone_store;
140 #define VM_OBJECTS_INIT 256
141 static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
142
143 struct object_q vm_object_lists[VMOBJ_HSIZE];
144 struct lwkt_token vmobj_tokens[VMOBJ_HSIZE];
145
146 #if defined(DEBUG_LOCKS)
147
148 #define vm_object_vndeallocate(obj, vpp)        \
149                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
150
151 /*
152  * Debug helper to track hold/drop/ref/deallocate calls.
153  */
154 static void
155 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
156 {
157         int i;
158
159         i = atomic_fetchadd_int(&obj->debug_index, 1);
160         i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
161         ksnprintf(obj->debug_hold_thrs[i],
162                   sizeof(obj->debug_hold_thrs[i]),
163                   "%c%d:(%d):%s",
164                   (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
165                   (curthread->td_proc ? curthread->td_proc->p_pid : -1),
166                   obj->ref_count,
167                   curthread->td_comm);
168         obj->debug_hold_file[i] = file;
169         obj->debug_hold_line[i] = line;
170 #if 0
171         /* Uncomment for debugging obj refs/derefs in reproducible cases */
172         if (strcmp(curthread->td_comm, "sshd") == 0) {
173                 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
174                         (curthread->td_proc ? curthread->td_proc->p_pid : -1),
175                         obj, obj->ref_count, addrem, file, line);
176         }
177 #endif
178 }
179
180 #endif
181
182 /*
183  * Misc low level routines
184  */
185 static void
186 vm_object_lock_init(vm_object_t obj)
187 {
188 #if defined(DEBUG_LOCKS)
189         int i;
190
191         obj->debug_index = 0;
192         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
193                 obj->debug_hold_thrs[i][0] = 0;
194                 obj->debug_hold_file[i] = NULL;
195                 obj->debug_hold_line[i] = 0;
196         }
197 #endif
198 }
199
200 void
201 vm_object_lock_swap(void)
202 {
203         lwkt_token_swap();
204 }
205
206 void
207 vm_object_lock(vm_object_t obj)
208 {
209         lwkt_gettoken(&obj->token);
210 }
211
212 /*
213  * Returns TRUE on success
214  */
215 static int
216 vm_object_lock_try(vm_object_t obj)
217 {
218         return(lwkt_trytoken(&obj->token));
219 }
220
221 void
222 vm_object_lock_shared(vm_object_t obj)
223 {
224         lwkt_gettoken_shared(&obj->token);
225 }
226
227 void
228 vm_object_unlock(vm_object_t obj)
229 {
230         lwkt_reltoken(&obj->token);
231 }
232
233 void
234 vm_object_upgrade(vm_object_t obj)
235 {
236         lwkt_reltoken(&obj->token);
237         lwkt_gettoken(&obj->token);
238 }
239
240 void
241 vm_object_downgrade(vm_object_t obj)
242 {
243         lwkt_reltoken(&obj->token);
244         lwkt_gettoken_shared(&obj->token);
245 }
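
/*
 * Illustrative sketch, not part of the build: vm_object_upgrade() above
 * releases the shared token before re-acquiring it exclusively, so the
 * lock is not atomically upgraded and any state checked beforehand must
 * be revalidated afterwards.  The ref_count test is just an example
 * condition.
 */
#if 0
static void
example_upgrade_recheck(vm_object_t obj)
{
        vm_object_lock_shared(obj);
        if (obj->ref_count == 1) {              /* checked under shared lock */
                vm_object_upgrade(obj);         /* drops + re-acquires token */
                if (obj->ref_count == 1) {      /* must revalidate */
                        /* ... exclusive-only work ... */
                }
        }
        vm_object_unlock(obj);
}
#endif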
246
247 static __inline void
248 vm_object_assert_held(vm_object_t obj)
249 {
250         ASSERT_LWKT_TOKEN_HELD(&obj->token);
251 }
252
253 void
254 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
255 {
256         KKASSERT(obj != NULL);
257
258         /*
259          * Object must be held (object allocation is stable due to caller's
260          * context, typically already holding the token on a parent object)
261          * prior to potentially blocking on the lock, otherwise the object
262          * can get ripped away from us.
263          */
264         refcount_acquire(&obj->hold_count);
265         vm_object_lock(obj);
266
267 #if defined(DEBUG_LOCKS)
268         debugvm_object_add(obj, file, line, 1);
269 #endif
270 }
271
272 int
273 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
274 {
275         KKASSERT(obj != NULL);
276
277         /*
278          * Object must be held (object allocation is stable due to caller's
279          * context, typically already holding the token on a parent object)
280          * prior to potentially blocking on the lock, otherwise the object
281          * can get ripped away from us.
282          */
283         refcount_acquire(&obj->hold_count);
284         if (vm_object_lock_try(obj) == 0) {
285                 if (refcount_release(&obj->hold_count)) {
286                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
287                                 zfree(obj_zone, obj);
288                 }
289                 return(0);
290         }
291
292 #if defined(DEBUG_LOCKS)
293         debugvm_object_add(obj, file, line, 1);
294 #endif
295         return(1);
296 }
297
298 void
299 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
300 {
301         KKASSERT(obj != NULL);
302
303         /*
304          * Object must be held (object allocation is stable due to caller's
305          * context, typically already holding the token on a parent object)
306          * prior to potentially blocking on the lock, otherwise the object
307          * can get ripped away from us.
308          */
309         refcount_acquire(&obj->hold_count);
310         vm_object_lock_shared(obj);
311
312 #if defined(DEBUG_LOCKS)
313         debugvm_object_add(obj, file, line, 1);
314 #endif
315 }
316
317 /*
318  * Drop the token and hold_count on the object.
319  *
320  * WARNING! Token might be shared.
321  */
322 void
323 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
324 {
325         if (obj == NULL)
326                 return;
327
328         /*
329          * No new holders should be possible once we drop hold_count 1->0 as
330          * there is no longer any way to reference the object.
331          */
332         KKASSERT(obj->hold_count > 0);
333         if (refcount_release(&obj->hold_count)) {
334 #if defined(DEBUG_LOCKS)
335                 debugvm_object_add(obj, file, line, -1);
336 #endif
337
338                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
339                         vm_object_unlock(obj);
340                         zfree(obj_zone, obj);
341                 } else {
342                         vm_object_unlock(obj);
343                 }
344         } else {
345 #if defined(DEBUG_LOCKS)
346                 debugvm_object_add(obj, file, line, -1);
347 #endif
348                 vm_object_unlock(obj);
349         }
350 }
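
/*
 * Illustrative sketch, not part of the build: the hold/drop pairing
 * expected by the routines above.  The object pointer must be stable
 * (e.g. located via a held parent object) before vm_object_hold() is
 * called; 'obj' below is a hypothetical, already-stable pointer.
 */
#if 0
static void
example_hold_drop(vm_object_t obj)
{
        vm_object_hold(obj);    /* bumps hold_count, then locks the token */
        /* ... operate on obj, calls here may block ... */
        vm_object_drop(obj);    /* unlocks, may zfree() a dead object */
}
#endif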
351
352 /*
353  * Initialize a freshly allocated object, returning a held object.
354  *
355  * Used only by vm_object_allocate() and zinitna().
356  *
357  * No requirements.
358  */
359 void
360 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
361 {
362         int incr;
363         int n;
364
365         RB_INIT(&object->rb_memq);
366         LIST_INIT(&object->shadow_head);
367         lwkt_token_init(&object->token, "vmobj");
368
369         object->type = type;
370         object->size = size;
371         object->ref_count = 1;
372         object->memattr = VM_MEMATTR_DEFAULT;
373         object->hold_count = 0;
374         object->flags = 0;
375         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
376                 vm_object_set_flag(object, OBJ_ONEMAPPING);
377         object->paging_in_progress = 0;
378         object->resident_page_count = 0;
379         object->agg_pv_list_count = 0;
380         object->shadow_count = 0;
381         /* cpu localization twist */
382         object->pg_color = (int)(intptr_t)curthread;
383         if (size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
384                 incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
385         else
386                 incr = size;
387         next_index = (next_index + incr) & PQ_L2_MASK;
388         object->handle = NULL;
389         object->backing_object = NULL;
390         object->backing_object_offset = (vm_ooffset_t)0;
391
392         object->generation++;
393         object->swblock_count = 0;
394         RB_INIT(&object->swblock_root);
395         vm_object_lock_init(object);
396         pmap_object_init(object);
397
398         vm_object_hold(object);
399
400         n = VMOBJ_HASH(object);
401         atomic_add_long(&vm_object_count, 1);
402         lwkt_gettoken(&vmobj_tokens[n]);
403         TAILQ_INSERT_TAIL(&vm_object_lists[n], object, object_list);
404         lwkt_reltoken(&vmobj_tokens[n]);
405 }
406
407 /*
408  * Initialize the VM objects module.
409  *
410  * Called from the low level boot code only.
411  */
412 void
413 vm_object_init(void)
414 {
415         int i;
416
417         for (i = 0; i < VMOBJ_HSIZE; ++i) {
418                 TAILQ_INIT(&vm_object_lists[i]);
419                 lwkt_token_init(&vmobj_tokens[i], "vmobjlst");
420         }
421         
422         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
423                             &kernel_object);
424         vm_object_drop(&kernel_object);
425
426         obj_zone = &obj_zone_store;
427         zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
428                 vm_objects_init, VM_OBJECTS_INIT);
429 }
430
431 void
432 vm_object_init2(void)
433 {
434         zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
435 }
436
437 /*
438  * Allocate and return a new object of the specified type and size.
439  *
440  * No requirements.
441  */
442 vm_object_t
443 vm_object_allocate(objtype_t type, vm_pindex_t size)
444 {
445         vm_object_t result;
446
447         result = (vm_object_t) zalloc(obj_zone);
448
449         _vm_object_allocate(type, size, result);
450         vm_object_drop(result);
451
452         return (result);
453 }
454
455 /*
456  * This version returns a held object, allowing further atomic initialization
457  * of the object.
458  */
459 vm_object_t
460 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
461 {
462         vm_object_t result;
463
464         result = (vm_object_t) zalloc(obj_zone);
465
466         _vm_object_allocate(type, size, result);
467
468         return (result);
469 }
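
/*
 * Illustrative sketch, not part of the build: contrasting the two
 * allocators.  vm_object_allocate() returns an unheld object, while
 * vm_object_allocate_hold() leaves it held so additional initialization
 * can complete before other threads can access it.  OBJ_ONEMAPPING is
 * just an example flag already used by this file.
 */
#if 0
static vm_object_t
example_allocate_init(vm_pindex_t size)
{
        vm_object_t obj;

        obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
        vm_object_set_flag(obj, OBJ_ONEMAPPING);  /* init while still held */
        vm_object_drop(obj);
        return (obj);
}
#endif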
470
471 /*
472  * Add an additional reference to a vm_object.  The object must already be
473  * held.  The original non-lock version is no longer supported.  The object
474  * must NOT be chain locked by anyone at the time the reference is added.
475  *
476  * Referencing a chain-locked object can blow up the fairly sensitive
477  * ref_count and shadow_count tests in the deallocator.  Most callers
478  * will call vm_object_chain_wait() prior to calling
479  * vm_object_reference_locked() to avoid the case.
480  *
481  * The object must be held, but may be held shared if desired (hence why
482  * we use an atomic op).
483  */
484 void
485 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
486 {
487         KKASSERT(object != NULL);
488         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
489         KKASSERT((object->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) == 0);
490         atomic_add_int(&object->ref_count, 1);
491         if (object->type == OBJT_VNODE) {
492                 vref(object->handle);
493                 /* XXX what if the vnode is being destroyed? */
494         }
495 #if defined(DEBUG_LOCKS)
496         debugvm_object_add(object, file, line, 1);
497 #endif
498 }
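
/*
 * Illustrative sketch, not part of the build: adding a reference per the
 * rules above.  The object is held shared (sufficient because the ref
 * bump is atomic) and vm_object_chain_wait() is called first so we never
 * reference a chain-locked object.
 */
#if 0
static void
example_add_ref(vm_object_t obj)
{
        vm_object_hold_shared(obj);
        vm_object_chain_wait(obj, 1);           /* avoid chain-locked window */
        vm_object_reference_locked(obj);
        vm_object_drop(obj);
}
#endif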
499
500 /*
501  * This version is only allowed for vnode objects.
502  */
503 void
504 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
505 {
506         KKASSERT(object->type == OBJT_VNODE);
507         atomic_add_int(&object->ref_count, 1);
508         vref(object->handle);
509 #if defined(DEBUG_LOCKS)
510         debugvm_object_add(object, file, line, 1);
511 #endif
512 }
513
514 /*
515  * Object OBJ_CHAINLOCK lock handling.
516  *
517  * The caller can chain-lock backing objects recursively and then
518  * use vm_object_chain_release_all() to undo the whole chain.
519  *
520  * Chain locks are used to prevent collapses and are only applicable
521  * to OBJT_DEFAULT and OBJT_SWAP objects.  Chain locking operations
522  * on other object types are ignored.  This is also important because
523  * it allows e.g. the vnode underlying a memory mapping to take concurrent
524  * faults.
525  *
526  * The object must usually be held on entry, though intermediate
527  * objects need not be held on release.  The object must be held exclusively,
528  * NOT shared.  Note that the prefault path checks the shared state and
529  * avoids using the chain functions.
530  */
531 void
532 vm_object_chain_wait(vm_object_t object, int shared)
533 {
534         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
535         for (;;) {
536                 uint32_t chainlk = object->chainlk;
537
538                 cpu_ccfence();
539                 if (shared) {
540                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
541                                 tsleep_interlock(object, 0);
542                                 if (atomic_cmpset_int(&object->chainlk,
543                                                       chainlk,
544                                                       chainlk | CHAINLK_WAIT)) {
545                                         tsleep(object, PINTERLOCKED,
546                                                "objchns", 0);
547                                 }
548                                 /* retry */
549                         } else {
550                                 break;
551                         }
552                         /* retry */
553                 } else {
554                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
555                                 tsleep_interlock(object, 0);
556                                 if (atomic_cmpset_int(&object->chainlk,
557                                                       chainlk,
558                                                       chainlk | CHAINLK_WAIT))
559                                 {
560                                         tsleep(object, PINTERLOCKED,
561                                                "objchnx", 0);
562                                 }
563                                 /* retry */
564                         } else {
565                                 if (atomic_cmpset_int(&object->chainlk,
566                                                       chainlk,
567                                                       chainlk & ~CHAINLK_WAIT))
568                                 {
569                                         if (chainlk & CHAINLK_WAIT)
570                                                 wakeup(object);
571                                         break;
572                                 }
573                                 /* retry */
574                         }
575                 }
576                 /* retry */
577         }
578 }
579
580 void
581 vm_object_chain_acquire(vm_object_t object, int shared)
582 {
583         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
584                 return;
585         if (vm_shared_fault == 0)
586                 shared = 0;
587
588         for (;;) {
589                 uint32_t chainlk = object->chainlk;
590
591                 cpu_ccfence();
592                 if (shared) {
593                         if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
594                                 tsleep_interlock(object, 0);
595                                 if (atomic_cmpset_int(&object->chainlk,
596                                                       chainlk,
597                                                       chainlk | CHAINLK_WAIT)) {
598                                         tsleep(object, PINTERLOCKED,
599                                                "objchns", 0);
600                                 }
601                                 /* retry */
602                         } else if (atomic_cmpset_int(&object->chainlk,
603                                               chainlk, chainlk + 1)) {
604                                 break;
605                         }
606                         /* retry */
607                 } else {
608                         if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
609                                 tsleep_interlock(object, 0);
610                                 if (atomic_cmpset_int(&object->chainlk,
611                                                       chainlk,
612                                                       chainlk |
613                                                        CHAINLK_WAIT |
614                                                        CHAINLK_EXCLREQ)) {
615                                         tsleep(object, PINTERLOCKED,
616                                                "objchnx", 0);
617                                 }
618                                 /* retry */
619                         } else {
620                                 if (atomic_cmpset_int(&object->chainlk,
621                                                       chainlk,
622                                                       (chainlk | CHAINLK_EXCL) &
623                                                       ~(CHAINLK_EXCLREQ |
624                                                         CHAINLK_WAIT))) {
625                                         if (chainlk & CHAINLK_WAIT)
626                                                 wakeup(object);
627                                         break;
628                                 }
629                                 /* retry */
630                         }
631                 }
632                 /* retry */
633         }
634 }
635
636 void
637 vm_object_chain_release(vm_object_t object)
638 {
639         /*ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));*/
640         if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
641                 return;
642         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
643         for (;;) {
644                 uint32_t chainlk = object->chainlk;
645
646                 cpu_ccfence();
647                 if (chainlk & CHAINLK_MASK) {
648                         if ((chainlk & CHAINLK_MASK) == 1 &&
649                             atomic_cmpset_int(&object->chainlk,
650                                               chainlk,
651                                               (chainlk - 1) & ~CHAINLK_WAIT)) {
652                                 if (chainlk & CHAINLK_WAIT)
653                                         wakeup(object);
654                                 break;
655                         }
656                         if ((chainlk & CHAINLK_MASK) > 1 &&
657                             atomic_cmpset_int(&object->chainlk,
658                                               chainlk, chainlk - 1)) {
659                                 break;
660                         }
661                         /* retry */
662                 } else {
663                         KKASSERT(chainlk & CHAINLK_EXCL);
664                         if (atomic_cmpset_int(&object->chainlk,
665                                               chainlk,
666                                               chainlk & ~(CHAINLK_EXCL |
667                                                           CHAINLK_WAIT))) {
668                                 if (chainlk & CHAINLK_WAIT)
669                                         wakeup(object);
670                                 break;
671                         }
672                 }
673         }
674 }
675
676 /*
677  * Release the chain from first_object through and including stopobj.
678  * The caller is typically holding the first and last object locked
679  * (shared or exclusive) to prevent destruction races.
680  *
681  * We release stopobj first as an optimization, since this object is the
682  * most likely to be shared across multiple processes.
683  */
684 void
685 vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
686 {
687         vm_object_t backing_object;
688         vm_object_t object;
689
690         vm_object_chain_release(stopobj);
691         object = first_object;
692
693         while (object != stopobj) {
694                 KKASSERT(object);
695                 backing_object = object->backing_object;
696                 vm_object_chain_release(object);
697                 object = backing_object;
698         }
699 }
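
/*
 * Illustrative sketch, not part of the build: chain-locking a shadow
 * chain from first_object down to stopobj to prevent collapses, then
 * undoing the whole chain with vm_object_chain_release_all().  Holding
 * of the endpoint objects and failure handling are omitted.
 */
#if 0
static void
example_chain_lock(vm_object_t first_object, vm_object_t stopobj)
{
        vm_object_t obj;

        for (obj = first_object; obj != stopobj; obj = obj->backing_object)
                vm_object_chain_acquire(obj, 0);
        vm_object_chain_acquire(stopobj, 0);
        /* ... the chain cannot be collapsed out from under us here ... */
        vm_object_chain_release_all(first_object, stopobj);
}
#endif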
700
701 /*
702  * Dereference an object and its underlying vnode.  The object may be
703  * held shared.  On return the object will remain held.
704  *
705  * This function may return a vnode in *vpp which the caller must release
706  * after the caller drops its own lock.  If vpp is NULL, we assume that
707  * the caller was holding an exclusive lock on the object and we vrele()
708  * the vp ourselves.
709  */
710 static void
711 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
712                                    VMOBJDBARGS)
713 {
714         struct vnode *vp = (struct vnode *) object->handle;
715
716         KASSERT(object->type == OBJT_VNODE,
717             ("vm_object_vndeallocate: not a vnode object"));
718         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
719         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
720 #ifdef INVARIANTS
721         if (object->ref_count == 0) {
722                 vprint("vm_object_vndeallocate", vp);
723                 panic("vm_object_vndeallocate: bad object reference count");
724         }
725 #endif
726         for (;;) {
727                 int count = object->ref_count;
728                 cpu_ccfence();
729                 if (count == 1) {
730                         vm_object_upgrade(object);
731                         if (atomic_cmpset_int(&object->ref_count, count, 0)) {
732                                 vclrflags(vp, VTEXT);
733                                 break;
734                         }
735                 } else {
736                         if (atomic_cmpset_int(&object->ref_count,
737                                               count, count - 1)) {
738                                 break;
739                         }
740                 }
741                 /* retry */
742         }
743 #if defined(DEBUG_LOCKS)
744         debugvm_object_add(object, file, line, -1);
745 #endif
746
747         /*
748          * vrele the vp here, or return it for the caller to vrele.  We
749          * can only safely vrele(vp) if the object was locked exclusively.
750          * But there are two races here.
751          *
752          * We had to upgrade the object above to safely clear VTEXT
753          * but the alternative path where the shared lock is retained
754          * can STILL race to 0 in other paths and cause our own vrele()
755          * to terminate the vnode.  We can't allow that if the VM object
756          * is still locked shared.
757          */
758         if (vpp)
759                 *vpp = vp;
760         else
761                 vrele(vp);
762 }
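
/*
 * Illustrative sketch, not part of the build: the *vpp convention
 * described above.  A caller holding the object shared receives the
 * vnode back and issues the vrele() only after dropping its own object
 * lock, avoiding the shared/exclusive race on the final vnode ref.
 */
#if 0
static void
example_vndealloc_shared(vm_object_t obj)
{
        struct vnode *vp;

        vm_object_vndeallocate(obj, &vp);
        vm_object_drop(obj);            /* object remains held until here */
        vrele(vp);
}
#endif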
763
764 /*
765  * Release a reference to the specified object, gained either through a
766  * vm_object_allocate or a vm_object_reference call.  When all references
767  * are gone, storage associated with this object may be relinquished.
768  *
769  * The caller does not have to hold the object locked but must have control
770  * over the reference in question in order to guarantee that the object
771  * does not get ripped out from under us.
772  *
773  * XXX Currently all deallocations require an exclusive lock.
774  */
775 void
776 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
777 {
778         struct vnode *vp;
779         int count;
780
781         if (object == NULL)
782                 return;
783
784         for (;;) {
785                 count = object->ref_count;
786                 cpu_ccfence();
787
788                 /*
789                  * If decrementing the count enters into special handling
790                  * territory (0, 1, or 2) we have to do it the hard way.
791                  * Fortunately though, objects with only a few refs like this
792                  * are not likely to be heavily contended anyway.
793                  *
794                  * For vnode objects we only care about 1->0 transitions.
795                  */
796                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
797 #if defined(DEBUG_LOCKS)
798                         debugvm_object_add(object, file, line, 0);
799 #endif
800                         vm_object_hold(object);
801                         vm_object_deallocate_locked(object);
802                         vm_object_drop(object);
803                         break;
804                 }
805
806                 /*
807                  * Try to decrement ref_count without acquiring a hold on
808                  * the object.  This is particularly important for the exec*()
809                  * and exit*() code paths because the program binary may
810                  * have a great deal of sharing and an exclusive lock will
811                  * crowbar performance in those circumstances.
812                  */
813                 if (object->type == OBJT_VNODE) {
814                         vp = (struct vnode *)object->handle;
815                         if (atomic_cmpset_int(&object->ref_count,
816                                               count, count - 1)) {
817 #if defined(DEBUG_LOCKS)
818                                 debugvm_object_add(object, file, line, -1);
819 #endif
820
821                                 vrele(vp);
822                                 break;
823                         }
824                         /* retry */
825                 } else {
826                         if (atomic_cmpset_int(&object->ref_count,
827                                               count, count - 1)) {
828 #if defined(DEBUG_LOCKS)
829                                 debugvm_object_add(object, file, line, -1);
830 #endif
831                                 break;
832                         }
833                         /* retry */
834                 }
835                 /* retry */
836         }
837 }
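
/*
 * Illustrative sketch, not part of the build: a complete reference
 * lifecycle against the allocator above.  The caller owns the initial
 * ref from vm_object_allocate() and therefore controls it, satisfying
 * the requirement documented above.
 */
#if 0
static void
example_lifecycle(vm_pindex_t size)
{
        vm_object_t obj;

        obj = vm_object_allocate(OBJT_DEFAULT, size);
        /* ... use the object ... */
        vm_object_deallocate(obj);      /* 1->0, terminates the object */
}
#endif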
838
839 void
840 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
841 {
842         struct vm_object_dealloc_list *dlist = NULL;
843         struct vm_object_dealloc_list *dtmp;
844         vm_object_t temp;
845         int must_drop = 0;
846
847         /*
848          * We may chain-deallocate the object, but additional objects may
849          * collect on the dlist which also have to be deallocated.  We
850          * must avoid actual recursion; vm_object chains can get deep.
851          */
852
853 again:
854         while (object != NULL) {
855                 /*
856                  * vnode case, caller either locked the object exclusively
857                  * or this is a recursion with must_drop != 0 and the vnode
858                  * object will be locked shared.
859                  *
860                  * If locked shared we have to drop the object before we can
861                  * call vrele() or risk a shared/exclusive livelock.
862                  */
863                 if (object->type == OBJT_VNODE) {
864                         ASSERT_LWKT_TOKEN_HELD(&object->token);
865                         if (must_drop) {
866                                 struct vnode *tmp_vp;
867
868                                 vm_object_vndeallocate(object, &tmp_vp);
869                                 vm_object_drop(object);
870                                 must_drop = 0;
871                                 object = NULL;
872                                 vrele(tmp_vp);
873                         } else {
874                                 vm_object_vndeallocate(object, NULL);
875                         }
876                         break;
877                 }
878                 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
879
880                 /*
881                  * Normal case (object is locked exclusively)
882                  */
883                 if (object->ref_count == 0) {
884                         panic("vm_object_deallocate: object deallocated "
885                               "too many times: %d", object->type);
886                 }
887                 if (object->ref_count > 2) {
888                         atomic_add_int(&object->ref_count, -1);
889 #if defined(DEBUG_LOCKS)
890                         debugvm_object_add(object, file, line, -1);
891 #endif
892                         break;
893                 }
894
895                 /*
896                  * Here on ref_count of one or two, which are special cases for
897                  * objects.
898                  *
899                  * Nominal ref_count > 1 case if the second ref is not from
900                  * a shadow.
901                  *
902                  * (ONEMAPPING only applies to DEFAULT and SWAP objects)
903                  */
904                 if (object->ref_count == 2 && object->shadow_count == 0) {
905                         if (object->type == OBJT_DEFAULT ||
906                             object->type == OBJT_SWAP) {
907                                 vm_object_set_flag(object, OBJ_ONEMAPPING);
908                         }
909                         atomic_add_int(&object->ref_count, -1);
910 #if defined(DEBUG_LOCKS)
911                         debugvm_object_add(object, file, line, -1);
912 #endif
913                         break;
914                 }
915
916                 /*
917                  * If the second ref is from a shadow we chain along it
918                  * upwards if the object's handle is exhausted.
919                  *
920                  * We have to decrement object->ref_count before potentially
921                  * collapsing the first shadow object or the collapse code
922                  * will not be able to handle the degenerate case to remove
923                  * object.  However, if we do it too early the object can
924                  * get ripped out from under us.
925                  */
926                 if (object->ref_count == 2 && object->shadow_count == 1 &&
927                     object->handle == NULL && (object->type == OBJT_DEFAULT ||
928                                                object->type == OBJT_SWAP)) {
929                         temp = LIST_FIRST(&object->shadow_head);
930                         KKASSERT(temp != NULL);
931                         vm_object_hold(temp);
932
933                         /*
934                          * Wait for any paging to complete so the collapse
935                          * doesn't (or isn't likely to) qcollapse.  pip
936                          * waiting must occur before we acquire the
937                          * chainlock.
938                          */
939                         while (
940                                 temp->paging_in_progress ||
941                                 object->paging_in_progress
942                         ) {
943                                 vm_object_pip_wait(temp, "objde1");
944                                 vm_object_pip_wait(object, "objde2");
945                         }
946
947                         /*
948                          * If the parent is locked we have to give up, as
949                          * otherwise we would be acquiring locks in the
950                          * wrong order and potentially deadlock.
951                          */
952                         if (temp->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) {
953                                 vm_object_drop(temp);
954                                 goto skip;
955                         }
956                         vm_object_chain_acquire(temp, 0);
957
958                         /*
959                          * Recheck/retry after the hold and the paging
960                          * wait, both of which can block us.
961                          */
962                         if (object->ref_count != 2 ||
963                             object->shadow_count != 1 ||
964                             object->handle ||
965                             LIST_FIRST(&object->shadow_head) != temp ||
966                             (object->type != OBJT_DEFAULT &&
967                              object->type != OBJT_SWAP)) {
968                                 vm_object_chain_release(temp);
969                                 vm_object_drop(temp);
970                                 continue;
971                         }
972
973                         /*
974                          * We can safely drop object's ref_count now.
975                          */
976                         KKASSERT(object->ref_count == 2);
977                         atomic_add_int(&object->ref_count, -1);
978 #if defined(DEBUG_LOCKS)
979                         debugvm_object_add(object, file, line, -1);
980 #endif
981
982                         /*
983                          * If our single parent is not collapsible just
984                          * decrement ref_count (2->1) and stop.
985                          */
986                         if (temp->handle || (temp->type != OBJT_DEFAULT &&
987                                              temp->type != OBJT_SWAP)) {
988                                 vm_object_chain_release(temp);
989                                 vm_object_drop(temp);
990                                 break;
991                         }
992
993                         /*
994                          * At this point we have already dropped object's
995                          * ref_count so it is possible for a race to
996                          * deallocate obj out from under us.  Any collapse
997                          * will re-check the situation.  We must not block
998                          * until we are able to collapse.
999                          *
1000                          * Bump temp's ref_count to avoid an unwanted
1001                          * degenerate recursion (can't call
1002                          * vm_object_reference_locked() because it asserts
1003                          * that CHAINLOCK is not set).
1004                          */
1005                         atomic_add_int(&temp->ref_count, 1);
1006                         KKASSERT(temp->ref_count > 1);
1007
1008                         /*
1009                          * Collapse temp, then deallocate the extra ref
1010                          * formally.
1011                          */
1012                         vm_object_collapse(temp, &dlist);
1013                         vm_object_chain_release(temp);
1014                         if (must_drop) {
1015                                 vm_object_lock_swap();
1016                                 vm_object_drop(object);
1017                         }
1018                         object = temp;
1019                         must_drop = 1;
1020                         continue;
1021                 }
1022
1023                 /*
1024                  * Drop the ref and handle termination on the 1->0 transition.
1025                  * We may have blocked above so we have to recheck.
1026                  */
1027 skip:
1028                 KKASSERT(object->ref_count != 0);
1029                 if (object->ref_count >= 2) {
1030                         atomic_add_int(&object->ref_count, -1);
1031 #if defined(DEBUG_LOCKS)
1032                         debugvm_object_add(object, file, line, -1);
1033 #endif
1034                         break;
1035                 }
1036                 KKASSERT(object->ref_count == 1);
1037
1038                 /*
1039                  * 1->0 transition.  Chain through the backing_object.
1040                  * Maintain the ref until we've located the backing object,
1041                  * then re-check.
1042                  */
1043                 while ((temp = object->backing_object) != NULL) {
1044                         if (temp->type == OBJT_VNODE)
1045                                 vm_object_hold_shared(temp);
1046                         else
1047                                 vm_object_hold(temp);
1048                         if (temp == object->backing_object)
1049                                 break;
1050                         vm_object_drop(temp);
1051                 }
1052
1053                 /*
1054                  * 1->0 transition verified, retry if ref_count is no longer
1055                  * 1.  Otherwise disconnect the backing_object (temp) and
1056                  * clean up.
1057                  */
1058                 if (object->ref_count != 1) {
1059                         vm_object_drop(temp);
1060                         continue;
1061                 }
1062
1063                 /*
1064                  * It shouldn't be possible for the object to be chain locked
1065                  * if we're removing the last ref on it.
1066                  *
1067                  * Removing object from temp's shadow list requires dropping
1068                  * temp, which we will do on loop.
1069                  *
1070                  * NOTE! vnodes do not use the shadow list, but still have
1071                  *       the backing_object reference.
1072                  */
1073                 KKASSERT((object->chainlk & (CHAINLK_EXCL|CHAINLK_MASK)) == 0);
1074
1075                 if (temp) {
1076                         if (object->flags & OBJ_ONSHADOW) {
1077                                 LIST_REMOVE(object, shadow_list);
1078                                 temp->shadow_count--;
1079                                 temp->generation++;
1080                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
1081                         }
1082                         object->backing_object = NULL;
1083                 }
1084
1085                 atomic_add_int(&object->ref_count, -1);
1086                 if ((object->flags & OBJ_DEAD) == 0)
1087                         vm_object_terminate(object);
1088                 if (must_drop && temp)
1089                         vm_object_lock_swap();
1090                 if (must_drop)
1091                         vm_object_drop(object);
1092                 object = temp;
1093                 must_drop = 1;
1094         }
1095
1096         if (must_drop && object)
1097                 vm_object_drop(object);
1098
1099         /*
1100          * Additional tail recursion on dlist, iterated here to avoid deep
1101          * recursion.  Objects on the dlist are held but not locked.
1102          */
1103         if ((dtmp = dlist) != NULL) {
1104                 dlist = dtmp->next;
1105                 object = dtmp->object;
1106                 kfree(dtmp, M_TEMP);
1107
1108                 vm_object_lock(object); /* already held, add lock */
1109                 must_drop = 1;          /* and we're responsible for it */
1110                 goto again;
1111         }
1112 }
1113
1114 /*
1115  * Destroy the specified object, freeing up related resources.
1116  *
1117  * The object must have zero references.
1118  *
1119  * The object must be held.  The caller is responsible for dropping the
1120  * object after terminate returns.  Terminate does NOT drop the object.
1121  */
1122 static int vm_object_terminate_callback(vm_page_t p, void *data);
1123
1124 void
1125 vm_object_terminate(vm_object_t object)
1126 {
1127         struct rb_vm_page_scan_info info;
1128         int n;
1129
1130         /*
1131          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
1132          * able to safely block.
1133          */
1134         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1135         KKASSERT((object->flags & OBJ_DEAD) == 0);
1136         vm_object_set_flag(object, OBJ_DEAD);
1137
1138         /*
1139          * Wait for the pageout daemon to be done with the object
1140          */
1141         vm_object_pip_wait(object, "objtrm1");
1142
1143         KASSERT(!object->paging_in_progress,
1144                 ("vm_object_terminate: pageout in progress"));
1145
1146         /*
1147          * Clean and free the pages, as appropriate. All references to the
1148          * object are gone, so we don't need to lock it.
1149          */
1150         if (object->type == OBJT_VNODE) {
1151                 struct vnode *vp;
1152
1153                 /*
1154                  * Clean pages and flush buffers.
1155                  *
1156                  * NOTE!  TMPFS buffer flushes do not typically flush the
1157                  *        actual page to swap as this would be highly
1158                  *        inefficient, and normal filesystems usually wrap
1159                  *        page flushes with buffer cache buffers.
1160                  *
1161                  *        To deal with this we have to call vinvalbuf() both
1162                  *        before and after the vm_object_page_clean().
1163                  */
1164                 vp = (struct vnode *) object->handle;
1165                 vinvalbuf(vp, V_SAVE, 0, 0);
1166                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
1167                 vinvalbuf(vp, V_SAVE, 0, 0);
1168         }
1169
1170         /*
1171          * Wait for any I/O to complete, after which there had better not
1172          * be any references left on the object.
1173          */
1174         vm_object_pip_wait(object, "objtrm2");
1175
1176         if (object->ref_count != 0) {
1177                 panic("vm_object_terminate: object with references, "
1178                       "ref_count=%d", object->ref_count);
1179         }
1180
1181         /*
1182          * Cleanup any shared pmaps associated with this object.
1183          */
1184         pmap_object_free(object);
1185
1186         /*
1187          * Now free any remaining pages. For internal objects, this also
1188          * removes them from paging queues. Don't free wired pages, just
1189          * remove them from the object. 
1190          */
1191         info.count = 0;
1192         info.object = object;
1193         vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1194                                 vm_object_terminate_callback, &info);
1195
1196         /*
1197          * Let the pager know object is dead.
1198          */
1199         vm_pager_deallocate(object);
1200
1201         /*
1202          * Wait for the object hold count to hit 1, clean out pages as
1203          * we go.  The hashed vmobj_tokens[] interlock any race conditions
1204          * that might pick the object up from the vm_object_lists[] after
1205          * we have cleared rb_memq.
1206          */
1207         for (;;) {
1208                 if (RB_ROOT(&object->rb_memq) == NULL)
1209                         break;
1210                 kprintf("vm_object_terminate: Warning, object %p "
1211                         "still has %d pages\n",
1212                         object, object->resident_page_count);
1213                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1214                                         vm_object_terminate_callback, &info);
1215         }
1216
1217         /*
1218          * There had better not be any pages left
1219          */
1220         KKASSERT(object->resident_page_count == 0);
1221
1222         /*
1223          * Remove the object from the global object list.
1224          */
1225         n = VMOBJ_HASH(object);
1226         lwkt_gettoken(&vmobj_tokens[n]);
1227         TAILQ_REMOVE(&vm_object_lists[n], object, object_list);
1228         lwkt_reltoken(&vmobj_tokens[n]);
1229         atomic_add_long(&vm_object_count, -1);
1230
1231         if (object->ref_count != 0) {
1232                 panic("vm_object_terminate2: object with references, "
1233                       "ref_count=%d", object->ref_count);
1234         }
1235
1236         /*
1237          * NOTE: The object hold_count is at least 1, so we cannot zfree()
1238          *       the object here.  See vm_object_drop().
1239          */
1240 }
1241
1242 /*
1243  * The caller must hold the object.
1244  */
1245 static int
1246 vm_object_terminate_callback(vm_page_t p, void *data)
1247 {
1248         struct rb_vm_page_scan_info *info = data;
1249         vm_object_t object;
1250
1251         if ((++info->count & 63) == 0)
1252                 lwkt_user_yield();
1253         object = p->object;
1254         if (object != info->object) {
1255                 kprintf("vm_object_terminate_callback: obj/pg race %p/%p\n",
1256                         info->object, p);
1257                 return(0);
1258         }
1259         vm_page_busy_wait(p, TRUE, "vmpgtrm");
1260         if (object != p->object) {
1261                 kprintf("vm_object_terminate: Warning: Encountered "
1262                         "busied page %p on queue %d\n", p, p->queue);
1263                 vm_page_wakeup(p);
1264         } else if (p->wire_count == 0) {
1265                 /*
1266                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
1267                  */
1268                 vm_page_free(p);
1269                 mycpu->gd_cnt.v_pfree++;
1270         } else {
1271                 if (p->queue != PQ_NONE)
1272                         kprintf("vm_object_terminate: Warning: Encountered "
1273                                 "wired page %p on queue %d\n", p, p->queue);
1274                 vm_page_remove(p);
1275                 vm_page_wakeup(p);
1276         }
1277         return(0);
1278 }
1279
1280 /*
1281  * Clean all dirty pages in the specified range of object.  Leaves page
1282  * on whatever queue it is currently on.   If NOSYNC is set then do not
1283  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
1284  * leaving the object dirty.
1285  *
1286  * When stuffing pages asynchronously, allow clustering.  XXX we need a
1287  * synchronous clustering mode implementation.
1288  *
1289  * Odd semantics: if start == end, we clean everything.
1290  *
1291  * The object must be locked? XXX
1292  */
1293 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
1294 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
1295
1296 void
1297 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1298                      int flags)
1299 {
1300         struct rb_vm_page_scan_info info;
1301         struct vnode *vp;
1302         int wholescan;
1303         int pagerflags;
1304         int generation;
1305
1306         vm_object_hold(object);
1307         if (object->type != OBJT_VNODE ||
1308             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
1309                 vm_object_drop(object);
1310                 return;
1311         }
1312
1313         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 
1314                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
1315         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
1316
1317         vp = object->handle;
1318
1319         /*
1320          * Interlock other major object operations.  This allows us to 
1321          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
1322          */
1323         vm_object_set_flag(object, OBJ_CLEANING);
1324
1325         /*
1326          * Handle 'entire object' case
1327          */
1328         info.start_pindex = start;
1329         if (end == 0) {
1330                 info.end_pindex = object->size - 1;
1331         } else {
1332                 info.end_pindex = end - 1;
1333         }
1334         wholescan = (start == 0 && info.end_pindex == object->size - 1);
1335         info.limit = flags;
1336         info.pagerflags = pagerflags;
1337         info.object = object;
1338         info.count = 0;
1339
1340         /*
1341          * If cleaning the entire object do a pass to mark the pages read-only.
1342          * If everything worked out ok, clear OBJ_WRITEABLE and
1343          * OBJ_MIGHTBEDIRTY.
1344          */
1345         if (wholescan) {
1346                 info.error = 0;
1347                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1348                                         vm_object_page_clean_pass1, &info);
1349                 if (info.error == 0) {
1350                         vm_object_clear_flag(object,
1351                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1352                         if (object->type == OBJT_VNODE &&
1353                             (vp = (struct vnode *)object->handle) != NULL) {
1354                                 /*
1355                                  * Use new-style interface to clear VISDIRTY
1356                                  * because the vnode is not necessarily removed
1357                                  * from the syncer list(s) as often as it was
1358                                  * under the old interface, which can leave
1359                                  * the vnode on the syncer list after reclaim.
1360                                  */
1361                                 vclrobjdirty(vp);
1362                         }
1363                 }
1364         }
1365
1366         /*
1367          * Do a pass to clean all the dirty pages we find.
1368          */
1369         do {
1370                 info.error = 0;
1371                 generation = object->generation;
1372                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1373                                         vm_object_page_clean_pass2, &info);
1374         } while (info.error || generation != object->generation);
1375
1376         vm_object_clear_flag(object, OBJ_CLEANING);
1377         vm_object_drop(object);
1378 }
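
/*
 * Illustrative sketch, not part of the build: synchronously flushing all
 * dirty pages of a vnode object using the start == end == 0 "clean
 * everything" convention noted above.  vm_object_page_clean() acquires
 * its own hold on the object.
 */
#if 0
static void
example_sync_all(vm_object_t obj)
{
        vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
}
#endif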
1379
1380 /*
1381  * The caller must hold the object.
1382  */
1383 static 
1384 int
1385 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1386 {
1387         struct rb_vm_page_scan_info *info = data;
1388
1389         if ((++info->count & 63) == 0)
1390                 lwkt_user_yield();
1391         if (p->object != info->object ||
1392             p->pindex < info->start_pindex ||
1393             p->pindex > info->end_pindex) {
1394                 kprintf("vm_object_page_clean_pass1: obj/pg race %p/%p\n",
1395                         info->object, p);
1396                 return(0);
1397         }
1398         vm_page_flag_set(p, PG_CLEANCHK);
1399         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1400                 info->error = 1;
1401         } else if (vm_page_busy_try(p, FALSE) == 0) {
1402                 if (p->object == info->object)
1403                         vm_page_protect(p, VM_PROT_READ);
1404                 vm_page_wakeup(p);
1405         } else {
1406                 info->error = 1;
1407         }
1408         return(0);
1409 }
1410
1411 /*
1412  * The caller must hold the object
1413  */
1414 static 
1415 int
1416 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1417 {
1418         struct rb_vm_page_scan_info *info = data;
1419         int generation;
1420
1421         if (p->object != info->object ||
1422             p->pindex < info->start_pindex ||
1423             p->pindex > info->end_pindex) {
1424                 kprintf("vm_object_page_clean_pass2: obj/pg race %p/%p\n",
1425                         info->object, p);
1426                 return(0);
1427         }
1428
1429         /*
1430          * Do not mess with pages that were inserted after we started
1431          * the cleaning pass.
1432          */
1433         if ((p->flags & PG_CLEANCHK) == 0)
1434                 goto done;
1435
1436         generation = info->object->generation;
1437         vm_page_busy_wait(p, TRUE, "vpcwai");
1438
1439         if (p->object != info->object ||
1440             p->pindex < info->start_pindex ||
1441             p->pindex > info->end_pindex ||
1442             info->object->generation != generation) {
1443                 info->error = 1;
1444                 vm_page_wakeup(p);
1445                 goto done;
1446         }
1447
1448         /*
1449          * Before wasting time traversing the pmaps, check for trivial
1450          * cases where the page cannot be dirty.
1451          */
1452         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1453                 KKASSERT((p->dirty & p->valid) == 0 &&
1454                          (p->flags & PG_NEED_COMMIT) == 0);
1455                 vm_page_wakeup(p);
1456                 goto done;
1457         }
1458
1459         /*
1460          * Check whether the page is dirty or not.  The page has been set
1461          * to be read-only so the check will not race a user dirtying the
1462          * page.
1463          */
1464         vm_page_test_dirty(p);
1465         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1466                 vm_page_flag_clear(p, PG_CLEANCHK);
1467                 vm_page_wakeup(p);
1468                 goto done;
1469         }
1470
1471         /*
1472          * If we have been asked to skip nosync pages and this is a
1473          * nosync page, skip it.  Note that the object flags were
1474          * not cleared in this case (because pass1 will have returned an
1475          * error), so we do not have to set them.
1476          */
1477         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1478                 vm_page_flag_clear(p, PG_CLEANCHK);
1479                 vm_page_wakeup(p);
1480                 goto done;
1481         }
1482
1483         /*
1484          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1485          * the pages that get successfully flushed.  Set info->error if
1486          * we raced an object modification.
1487          */
1488         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1489         /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1490 done:
1491         if ((++info->count & 63) == 0)
1492                 lwkt_user_yield();
1493
1494         return(0);
1495 }
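
/*
 * Both cleaning passes above use the RB-tree scan helper.  A minimal
 * sketch of the callback pattern (my_scan_cb is a hypothetical name):
 *
 *	static int
 *	my_scan_cb(struct vm_page *p, void *data)
 *	{
 *		struct rb_vm_page_scan_info *info = data;
 *
 *		// inspect or busy p; set info->error on races
 *		return(0);	// a negative return stops the scan
 *	}
 *
 *	info.error = 0;
 *	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
 *				my_scan_cb, &info);
 *
 * Callers re-run the scan whenever info.error is set or the object
 * generation changes underneath them.
 */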
1496
1497 /*
1498  * Collect the specified page and nearby pages and flush them out.
1499  * The number of pages flushed is returned.  The passed page is busied
1500  * by the caller and we are responsible for its disposition.
1501  *
1502  * The caller must hold the object.
1503  */
1504 static void
1505 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1506 {
1507         int error;
1508         int is;
1509         int ib;
1510         int i;
1511         int page_base;
1512         vm_pindex_t pi;
1513         vm_page_t ma[BLIST_MAX_ALLOC];
1514
1515         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1516
1517         pi = p->pindex;
1518         page_base = pi % BLIST_MAX_ALLOC;
1519         ma[page_base] = p;
1520         ib = page_base - 1;
1521         is = page_base + 1;
1522
1523         while (ib >= 0) {
1524                 vm_page_t tp;
1525
1526                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1527                                              TRUE, &error);
1528                 if (error)
1529                         break;
1530                 if (tp == NULL)
1531                         break;
1532                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1533                     (tp->flags & PG_CLEANCHK) == 0) {
1534                         vm_page_wakeup(tp);
1535                         break;
1536                 }
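                /*
                 * (queue - pc) recovers the base queue from the
                 * per-color queue index; a PQ_CACHE page is clean,
                 * so it ends the backward run.
                 */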
1537                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1538                         vm_page_flag_clear(tp, PG_CLEANCHK);
1539                         vm_page_wakeup(tp);
1540                         break;
1541                 }
1542                 vm_page_test_dirty(tp);
1543                 if ((tp->dirty & tp->valid) == 0 &&
1544                     (tp->flags & PG_NEED_COMMIT) == 0) {
1545                         vm_page_flag_clear(tp, PG_CLEANCHK);
1546                         vm_page_wakeup(tp);
1547                         break;
1548                 }
1549                 ma[ib] = tp;
1550                 --ib;
1551         }
1552         ++ib;   /* fixup: ib now indexes the lowest busied page in ma[] */
1553
1554         while (is < BLIST_MAX_ALLOC &&
1555                pi - page_base + is < object->size) {
1556                 vm_page_t tp;
1557
1558                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1559                                              TRUE, &error);
1560                 if (error)
1561                         break;
1562                 if (tp == NULL)
1563                         break;
1564                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1565                     (tp->flags & PG_CLEANCHK) == 0) {
1566                         vm_page_wakeup(tp);
1567                         break;
1568                 }
1569                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1570                         vm_page_flag_clear(tp, PG_CLEANCHK);
1571                         vm_page_wakeup(tp);
1572                         break;
1573                 }
1574                 vm_page_test_dirty(tp);
1575                 if ((tp->dirty & tp->valid) == 0 &&
1576                     (tp->flags & PG_NEED_COMMIT) == 0) {
1577                         vm_page_flag_clear(tp, PG_CLEANCHK);
1578                         vm_page_wakeup(tp);
1579                         break;
1580                 }
1581                 ma[is] = tp;
1582                 ++is;
1583         }
1584
1585         /*
1586          * All pages in the ma[] array are busied now
1587          */
1588         for (i = ib; i < is; ++i) {
1589                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1590                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1591         }
1592         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1593         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1594                 vm_page_unhold(ma[i]);
1595 }
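
/*
 * Worked example of the clustering window above.  BLIST_MAX_ALLOC is
 * the window size; the value 16 below is purely illustrative:
 *
 *	pi = 37, BLIST_MAX_ALLOC = 16  =>  page_base = 37 % 16 = 5
 *
 * ma[] then maps pindexes 32..47, the passed page sits at ma[5], and
 * the two loops extend the run downward from ma[4] and upward from
 * ma[6] until they hit a missing, busy, clean, or cached page.
 */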
1596
1597 /*
1598  * Same as vm_object_pmap_copy, except range checking really
1599  * works, and is meant for small sections of an object.
1600  *
1601  * This code protects resident pages by making them read-only
1602  * and is typically called on a fork or split when a page
1603  * is converted to copy-on-write.  
1604  *
1605  * NOTE: If the page is already at VM_PROT_NONE, calling
1606  * vm_page_protect will have no effect.
1607  */
1608 void
1609 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1610 {
1611         vm_pindex_t idx;
1612         vm_page_t p;
1613
1614         if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1615                 return;
1616
1617         vm_object_hold(object);
1618         for (idx = start; idx < end; idx++) {
1619                 p = vm_page_lookup(object, idx);
1620                 if (p == NULL)
1621                         continue;
1622                 vm_page_protect(p, VM_PROT_READ);
1623         }
1624         vm_object_drop(object);
1625 }
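
/*
 * Example (hypothetical caller sketch): when converting a range to
 * copy-on-write, the resident pages covering pindexes [16, 32) can be
 * write-protected in a single call:
 *
 *	vm_object_pmap_copy_1(object, 16, 32);
 */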
1626
1627 /*
1628  * Removes all physical pages in the specified object range from all
1629  * physical maps.
1630  *
1631  * The object must *not* be locked.
1632  */
1633
1634 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1635
1636 void
1637 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1638 {
1639         struct rb_vm_page_scan_info info;
1640
1641         if (object == NULL)
1642                 return;
1643         info.start_pindex = start;
1644         info.end_pindex = end - 1;
1645         info.count = 0;
1646         info.object = object;
1647
1648         vm_object_hold(object);
1649         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1650                                 vm_object_pmap_remove_callback, &info);
1651         if (start == 0 && end == object->size)
1652                 vm_object_clear_flag(object, OBJ_WRITEABLE);
1653         vm_object_drop(object);
1654 }
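
/*
 * Usage sketch (hypothetical): unmapping an entire object also clears
 * OBJ_WRITEABLE, per the range test above:
 *
 *	vm_object_pmap_remove(object, 0, object->size);
 */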
1655
1656 /*
1657  * The caller must hold the object.
1658  */
1659 static int
1660 vm_object_pmap_remove_callback(vm_page_t p, void *data)
1661 {
1662         struct rb_vm_page_scan_info *info = data;
1663
1664         if ((++info->count & 63) == 0)
1665                 lwkt_user_yield();
1666
1667         if (info->object != p->object ||
1668             p->pindex < info->start_pindex ||
1669             p->pindex > info->end_pindex) {
1670                 kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
1671                         info->object, p);
1672                 return(0);
1673         }
1674
1675         vm_page_protect(p, VM_PROT_NONE);
1676
1677         return(0);
1678 }
1679
1680 /*
1681  * Implements the madvise function at the object/page level.
1682  *
1683  * MADV_WILLNEED        (any object)
1684  *
1685  *      Activate the specified pages if they are resident.
1686  *
1687  * MADV_DONTNEED        (any object)
1688  *
1689  *      Deactivate the specified pages if they are resident.
1690  *
1691  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1692  *
1693  *      Deactivate and clean the specified pages if they are
1694  *      resident.  This permits the process to reuse the pages
1695  *      without faulting or the kernel to reclaim the pages
1696  *      without I/O.
1697  *
1698  * No requirements.
1699  */
1700 void
1701 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1702 {
1703         vm_pindex_t end, tpindex;
1704         vm_object_t tobject;
1705         vm_object_t xobj;
1706         vm_page_t m;
1707         int error;
1708
1709         if (object == NULL)
1710                 return;
1711
1712         end = pindex + count;
1713
1714         vm_object_hold(object);
1715         tobject = object;
1716
1717         /*
1718          * Locate and adjust resident pages
1719          */
1720         for (; pindex < end; pindex += 1) {
1721 relookup:
1722                 if (tobject != object)
1723                         vm_object_drop(tobject);
1724                 tobject = object;
1725                 tpindex = pindex;
1726 shadowlookup:
1727                 /*
1728                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1729                  * and those pages must be OBJ_ONEMAPPING.
1730                  */
1731                 if (advise == MADV_FREE) {
1732                         if ((tobject->type != OBJT_DEFAULT &&
1733                              tobject->type != OBJT_SWAP) ||
1734                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
1735                                 continue;
1736                         }
1737                 }
1738
1739                 m = vm_page_lookup_busy_try(tobject, tpindex, TRUE, &error);
1740
1741                 if (error) {
1742                         vm_page_sleep_busy(m, TRUE, "madvpo");
1743                         goto relookup;
1744                 }
1745                 if (m == NULL) {
1746                         /*
1747                          * There may be swap even if there is no backing page
1748                          */
1749                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1750                                 swap_pager_freespace(tobject, tpindex, 1);
1751
1752                         /*
1753                          * No page resident here; descend to the backing object.
1754                          */
1755                         while ((xobj = tobject->backing_object) != NULL) {
1756                                 KKASSERT(xobj != object);
1757                                 vm_object_hold(xobj);
1758                                 if (xobj == tobject->backing_object)
1759                                         break;
1760                                 vm_object_drop(xobj);
1761                         }
1762                         if (xobj == NULL)
1763                                 continue;
1764                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1765                         if (tobject != object) {
1766                                 vm_object_lock_swap();
1767                                 vm_object_drop(tobject);
1768                         }
1769                         tobject = xobj;
1770                         goto shadowlookup;
1771                 }
1772
1773                 /*
1774                  * If the page is not in a normal active state, we skip it.
1775                  * If the page is not managed there are no page queues to
1776                  * mess with.  Things can break if we mess with pages in
1777                  * any of the below states.
1778                  */
1779                 if (m->wire_count ||
1780                     (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1781                     m->valid != VM_PAGE_BITS_ALL
1782                 ) {
1783                         vm_page_wakeup(m);
1784                         continue;
1785                 }
1786
1787                 /*
1788                  * Theoretically once a page is known not to be busy, an
1789                  * interrupt cannot come along and rip it out from under us.
1790                  */
1791
1792                 if (advise == MADV_WILLNEED) {
1793                         vm_page_activate(m);
1794                 } else if (advise == MADV_DONTNEED) {
1795                         vm_page_dontneed(m);
1796                 } else if (advise == MADV_FREE) {
1797                         /*
1798                          * Mark the page clean.  This will allow the page
1799                          * to be freed up by the system.  However, such pages
1800                          * are often reused quickly by malloc()/free()
1801                          * so we do not do anything that would cause
1802                          * a page fault if we can help it.
1803                          *
1804                          * Specifically, we do not try to actually free
1805                          * the page now nor do we try to put it in the
1806                          * cache (which would cause a page fault on reuse).
1807                          *
1808                          * But we do make the page as freeable as we
1809                          * can without actually taking the step of
1810                          * unmapping it.
1811                          */
1812                         pmap_clear_modify(m);
1813                         m->dirty = 0;
1814                         m->act_count = 0;
1815                         vm_page_dontneed(m);
1816                         if (tobject->type == OBJT_SWAP)
1817                                 swap_pager_freespace(tobject, tpindex, 1);
1818                 }
1819                 vm_page_wakeup(m);
1820         }       
1821         if (tobject != object)
1822                 vm_object_drop(tobject);
1823         vm_object_drop(object);
1824 }
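
/*
 * Illustrative sketch (hypothetical; not the actual sys_madvise path):
 * apply MADV_FREE to the single page at byte offset 'off' within an
 * anonymous object:
 *
 *	vm_object_madvise(obj, OFF_TO_IDX(off), 1, MADV_FREE);
 */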
1825
1826 /*
1827  * Create a new object which is backed by the specified existing object
1828  * range.  Replace the pointer and offset that was pointing at the existing
1829  * object with the pointer/offset for the new object.
1830  *
1831  * If addref is non-zero the returned object is given an additional reference.
1832  * This mechanic exists to avoid the situation where refs might be 1 and
1833  * race against a collapse when the caller intends to bump it.  So the
1834  * caller cannot add the ref after the fact.  Used when the caller is
1835  * duplicating a vm_map_entry.
1836  *
1837  * No other requirements.
1838  */
1839 void
1840 vm_object_shadow(vm_object_t *objectp, vm_ooffset_t *offset, vm_size_t length,
1841                  int addref)
1842 {
1843         vm_object_t source;
1844         vm_object_t result;
1845         int useshadowlist;
1846
1847         source = *objectp;
1848
1849         /*
1850          * Don't create the new object if the old object isn't shared.
1851          * We have to chain wait before adding the reference to avoid
1852          * racing a collapse or deallocation.
1853          *
1854          * Clear OBJ_ONEMAPPING flag when shadowing.
1855          *
1856          * The caller owns a ref on source via *objectp which we are going
1857          * to replace.  This ref is inherited by the backing_object
1858          * assignment and does not need to be incremented here.
1859          *
1860          * However, we add a temporary extra reference to the original source
1861          * prior to holding nobject in case we block, to avoid races where
1862          * someone else might believe that the source can be collapsed.
1863          */
1864         useshadowlist = 0;
1865         if (source) {
1866                 if (source->type != OBJT_VNODE) {
1867                         useshadowlist = 1;
1868                         vm_object_hold(source);
1869                         vm_object_chain_wait(source, 0);
1870                         if (source->ref_count == 1 &&
1871                             source->handle == NULL &&
1872                             (source->type == OBJT_DEFAULT ||
1873                              source->type == OBJT_SWAP)) {
1874                                 if (addref) {
1875                                         vm_object_reference_locked(source);
1876                                         vm_object_clear_flag(source,
1877                                                              OBJ_ONEMAPPING);
1878                                 }
1879                                 vm_object_drop(source);
1880                                 return;
1881                         }
1882                         vm_object_reference_locked(source);
1883                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1884                 } else {
1885                         vm_object_reference_quick(source);
1886                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
1887                 }
1888         }
1889
1890         /*
1891          * Allocate a new object with the given length.  The new object
1892          * is returned referenced but we may have to add another one.
1893          * If we are adding a second reference we must clear OBJ_ONEMAPPING
1894          * (typically because the caller is about to clone a vm_map_entry).
1895          *
1896          * The source object currently has an extra reference to prevent
1897          * collapses into it while we mess with its shadow list; that
1898          * extra ref is dropped later in this routine.
1899          *
1900          * The target object may require a second reference if asked for one
1901          * by the caller.
1902          */
1903         result = vm_object_allocate(OBJT_DEFAULT, length);
1904         if (result == NULL)
1905                 panic("vm_object_shadow: no object for shadowing");
1906         vm_object_hold(result);
1907         if (addref) {
1908                 vm_object_reference_locked(result);
1909                 vm_object_clear_flag(result, OBJ_ONEMAPPING);
1910         }
1911
1912         /*
1913          * The new object shadows the source object.  Chain wait before
1914          * adjusting shadow_count or the shadow list to avoid races.
1915          *
1916          * Try to optimize the result object's page color when shadowing
1917          * in order to maintain page coloring consistency in the combined 
1918          * shadowed object.
1919          *
1920          * The backing_object reference to source requires adding a ref to
1921          * source.  We simply inherit the ref from the original *objectp
1922          * (which we are replacing) so no additional refs need to be added.
1923          * (we must still clean up the extra ref we had to prevent collapse
1924          * races).
1925          *
1926          * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
1927          */
1928         KKASSERT(result->backing_object == NULL);
1929         result->backing_object = source;
1930         if (source) {
1931                 if (useshadowlist) {
1932                         vm_object_chain_wait(source, 0);
1933                         LIST_INSERT_HEAD(&source->shadow_head,
1934                                          result, shadow_list);
1935                         source->shadow_count++;
1936                         source->generation++;
1937                         vm_object_set_flag(result, OBJ_ONSHADOW);
1938                 }
1939                 /* cpu localization twist */
1940                 result->pg_color = (int)(intptr_t)curthread;
1941         }
1942
1943         /*
1944          * Adjust the return storage.  Drop the ref on source before
1945          * returning.
1946          */
1947         result->backing_object_offset = *offset;
1948         vm_object_drop(result);
1949         *offset = 0;
1950         if (source) {
1951                 if (useshadowlist) {
1952                         vm_object_deallocate_locked(source);
1953                         vm_object_drop(source);
1954                 } else {
1955                         vm_object_deallocate(source);
1956                 }
1957         }
1958
1959         /*
1960          * Return the new object to the caller.
1961          */
1962         *objectp = result;
1963 }
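
/*
 * Typical usage sketch (hypothetical; loosely modeled on vm_map_entry
 * duplication, with atop() assumed for the byte-to-page conversion):
 * the caller passes the storage holding its object pointer and offset
 * and requests the extra ref up front so it cannot race a collapse:
 *
 *	vm_object_shadow(&entry->object.vm_object, &entry->offset,
 *			 atop(entry->end - entry->start), 1);
 */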
1964
1965 #define OBSC_TEST_ALL_SHADOWED  0x0001
1966 #define OBSC_COLLAPSE_NOWAIT    0x0002
1967 #define OBSC_COLLAPSE_WAIT      0x0004
1968
1969 static int vm_object_backing_scan_callback(vm_page_t p, void *data);
1970
1971 /*
1972  * The caller must hold the object.
1973  */
1974 static __inline int
1975 vm_object_backing_scan(vm_object_t object, vm_object_t backing_object, int op)
1976 {
1977         struct rb_vm_page_scan_info info;
1978         int n;
1979
1980         vm_object_assert_held(object);
1981         vm_object_assert_held(backing_object);
1982
1983         KKASSERT(backing_object == object->backing_object);
1984         info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1985
1986         /*
1987          * Initial conditions
1988          */
1989         if (op & OBSC_TEST_ALL_SHADOWED) {
1990                 /*
1991                  * We do not want to have to test for the existence of
1992                  * swap pages in the backing object.  XXX but with the
1993                  * new swapper this would be pretty easy to do.
1994                  *
1995                  * XXX what about anonymous MAP_SHARED memory that hasn't
1996                  * been ZFOD faulted yet?  If we do not test for this, the
1997                  * shadow test may succeed! XXX
1998                  */
1999                 if (backing_object->type != OBJT_DEFAULT)
2000                         return(0);
2001         }
2002         if (op & OBSC_COLLAPSE_WAIT) {
2003                 KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
2004                 vm_object_set_flag(backing_object, OBJ_DEAD);
2005
2006                 n = VMOBJ_HASH(backing_object);
2007                 lwkt_gettoken(&vmobj_tokens[n]);
2008                 TAILQ_REMOVE(&vm_object_lists[n], backing_object, object_list);
2009                 lwkt_reltoken(&vmobj_tokens[n]);
2010                 atomic_add_long(&vm_object_count, -1);
2011         }
2012
2013         /*
2014          * Our scan.   We have to retry if a negative error code is returned,
2015          * otherwise 0 or 1 will be returned in info.error.  0 indicates that
2016          * the scan had to be stopped because the parent does not completely
2017          * shadow the child.
2018          */
2019         info.object = object;
2020         info.backing_object = backing_object;
2021         info.limit = op;
2022         do {
2023                 info.error = 1;
2024                 vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
2025                                         vm_object_backing_scan_callback,
2026                                         &info);
2027         } while (info.error < 0);
2028
2029         return(info.error);
2030 }
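
/*
 * The op argument selects the scan mode.  For example, the bypass path
 * in vm_object_collapse() below first probes for complete shadowing:
 *
 *	if (vm_object_backing_scan(object, backing_object,
 *				   OBSC_TEST_ALL_SHADOWED) == 0)
 *		break;	// parent does not fully shadow the child
 */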
2031
2032 /*
2033  * The caller must hold the object.
2034  */
2035 static int
2036 vm_object_backing_scan_callback(vm_page_t p, void *data)
2037 {
2038         struct rb_vm_page_scan_info *info = data;
2039         vm_object_t backing_object;
2040         vm_object_t object;
2041         vm_pindex_t pindex;
2042         vm_pindex_t new_pindex;
2043         vm_pindex_t backing_offset_index;
2044         int op;
2045
2046         pindex = p->pindex;
2047         new_pindex = pindex - info->backing_offset_index;
2048         op = info->limit;
2049         object = info->object;
2050         backing_object = info->backing_object;
2051         backing_offset_index = info->backing_offset_index;
2052
2053         if (op & OBSC_TEST_ALL_SHADOWED) {
2054                 vm_page_t pp;
2055
2056                 /*
2057                  * Ignore pages outside the parent object's range
2058                  * and outside the parent object's mapping of the 
2059                  * backing object.
2060                  *
2061                  * note that we do not busy the backing object's
2062                  * page.
2063                  */
2064                 if (pindex < backing_offset_index ||
2065                     new_pindex >= object->size
2066                 ) {
2067                         return(0);
2068                 }
2069
2070                 /*
2071                  * See if the parent has the page or if the parent's
2072                  * object pager has the page.  If the parent has the
2073                  * page but the page is not valid, the parent's
2074                  * object pager must have the page.
2075                  *
2076                  * If this fails, the parent does not completely shadow
2077                  * the object and we might as well give up now.
2078                  */
2079                 pp = vm_page_lookup(object, new_pindex);
2080                 if ((pp == NULL || pp->valid == 0) &&
2081                     !vm_pager_has_page(object, new_pindex)
2082                 ) {
2083                         info->error = 0;        /* problemo */
2084                         return(-1);             /* stop the scan */
2085                 }
2086         }
2087
2088         /*
2089          * Check for busy page.  Note that we may have lost (p) when we
2090          * possibly blocked above.
2091          */
2092         if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
2093                 vm_page_t pp;
2094
2095                 if (vm_page_busy_try(p, TRUE)) {
2096                         if (op & OBSC_COLLAPSE_NOWAIT) {
2097                                 return(0);
2098                         } else {
2099                                 /*
2100                                  * If we slept, anything could have
2101                                  * happened.   Ask that the scan be restarted.
2102                                  *
2103                                  * Since the object is marked dead, the
2104                                  * backing offset should not have changed.  
2105                                  */
2106                                 vm_page_sleep_busy(p, TRUE, "vmocol");
2107                                 info->error = -1;
2108                                 return(-1);
2109                         }
2110                 }
2111
2112                 /*
2113                  * If (p) is no longer valid restart the scan.
2114                  */
2115                 if (p->object != backing_object || p->pindex != pindex) {
2116                         kprintf("vm_object_backing_scan: Warning: page "
2117                                 "%p ripped out from under us\n", p);
2118                         vm_page_wakeup(p);
2119                         info->error = -1;
2120                         return(-1);
2121                 }
2122
2123                 if (op & OBSC_COLLAPSE_NOWAIT) {
2124                         if (p->valid == 0 ||
2125                             p->wire_count ||
2126                             (p->flags & PG_NEED_COMMIT)) {
2127                                 vm_page_wakeup(p);
2128                                 return(0);
2129                         }
2130                 } else {
2131                         /* XXX what if p->valid == 0, hold_count, etc.? */
2132                 }
2133
2134                 KASSERT(
2135                     p->object == backing_object,
2136                     ("vm_object_qcollapse(): object mismatch")
2137                 );
2138
2139                 /*
2140                  * Destroy any associated swap
2141                  */
2142                 if (backing_object->type == OBJT_SWAP)
2143                         swap_pager_freespace(backing_object, p->pindex, 1);
2144
2145                 if (
2146                     p->pindex < backing_offset_index ||
2147                     new_pindex >= object->size
2148                 ) {
2149                         /*
2150                          * Page is out of the parent object's range, we 
2151                          * can simply destroy it. 
2152                          */
2153                         vm_page_protect(p, VM_PROT_NONE);
2154                         vm_page_free(p);
2155                         return(0);
2156                 }
2157
2158                 pp = vm_page_lookup(object, new_pindex);
2159                 if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
2160                         /*
2161                          * page already exists in parent OR swap exists
2162                          * for this location in the parent.  Destroy 
2163                          * the original page from the backing object.
2164                          *
2165                          * Leave the parent's page alone
2166                          */
2167                         vm_page_protect(p, VM_PROT_NONE);
2168                         vm_page_free(p);
2169                         return(0);
2170                 }
2171
2172                 /*
2173                  * Page does not exist in parent, rename the
2174                  * page from the backing object to the main object. 
2175                  *
2176                  * If the page was mapped to a process, it can remain 
2177                  * mapped through the rename.
2178                  */
2179                 if ((p->queue - p->pc) == PQ_CACHE)
2180                         vm_page_deactivate(p);
2181
2182                 vm_page_rename(p, object, new_pindex);
2183                 vm_page_wakeup(p);
2184                 /* page automatically made dirty by rename */
2185         }
2186         return(0);
2187 }
2188
2189 /*
2190  * This version of collapse allows the operation to occur earlier and
2191  * when paging_in_progress is true for an object...  This is not a complete
2192  * operation, but should plug 99.9% of the rest of the leaks.
2193  *
2194  * The caller must hold the object and backing_object and both must be
2195  * chainlocked.
2196  *
2197  * (only called from vm_object_collapse)
2198  */
2199 static void
2200 vm_object_qcollapse(vm_object_t object, vm_object_t backing_object)
2201 {
2202         if (backing_object->ref_count == 1) {
2203                 atomic_add_int(&backing_object->ref_count, 2);
2204 #if defined(DEBUG_LOCKS)
2205                 debugvm_object_add(backing_object, "qcollapse", 1, 2);
2206 #endif
2207                 vm_object_backing_scan(object, backing_object,
2208                                        OBSC_COLLAPSE_NOWAIT);
2209                 atomic_add_int(&backing_object->ref_count, -2);
2210 #if defined(DEBUG_LOCKS)
2211                 debugvm_object_add(backing_object, "qcollapse", 2, -2);
2212 #endif
2213         }
2214 }
2215
2216 /*
2217  * Collapse an object with the object backing it.  Pages in the backing
2218  * object are moved into the parent, and the backing object is deallocated.
2219  * Any conflict is resolved in favor of the parent's existing pages.
2220  *
2221  * object must be held and chain-locked on call.
2222  *
2223  * The caller must have an extra ref on object to prevent a race from
2224  * destroying it during the collapse.
2225  */
2226 void
2227 vm_object_collapse(vm_object_t object, struct vm_object_dealloc_list **dlistp)
2228 {
2229         struct vm_object_dealloc_list *dlist = NULL;
2230         vm_object_t backing_object;
2231
2232         /*
2233          * Only one thread is attempting a collapse at any given moment.
2234          * Callers check few restrictions on (object) before calling, so
2235          * reentrant attempts are likely.
2236          */
2237         KKASSERT(object != NULL);
2238         vm_object_assert_held(object);
2239         KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
2240
2241         for (;;) {
2242                 vm_object_t bbobj;
2243                 int dodealloc;
2244
2245                 /*
2246                  * We can only collapse a DEFAULT/SWAP object with a
2247                  * DEFAULT/SWAP object.
2248                  */
2249                 if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) {
2250                         backing_object = NULL;
2251                         break;
2252                 }
2253
2254                 backing_object = object->backing_object;
2255                 if (backing_object == NULL)
2256                         break;
2257                 if (backing_object->type != OBJT_DEFAULT &&
2258                     backing_object->type != OBJT_SWAP) {
2259                         backing_object = NULL;
2260                         break;
2261                 }
2262
2263                 /*
2264                  * Hold the backing_object and check for races
2265                  */
2266                 vm_object_hold(backing_object);
2267                 if (backing_object != object->backing_object ||
2268                     (backing_object->type != OBJT_DEFAULT &&
2269                      backing_object->type != OBJT_SWAP)) {
2270                         vm_object_drop(backing_object);
2271                         continue;
2272                 }
2273
2274                 /*
2275                  * Chain-lock the backing object too because if we
2276                  * successfully merge its pages into the top object we
2277                  * will collapse backing_object->backing_object as the
2278                  * new backing_object.  Re-check that it is still our
2279                  * backing object.
2280                  */
2281                 vm_object_chain_acquire(backing_object, 0);
2282                 if (backing_object != object->backing_object) {
2283                         vm_object_chain_release(backing_object);
2284                         vm_object_drop(backing_object);
2285                         continue;
2286                 }
2287
2288                 /*
2289                  * We check the backing object first because it is most
2290                  * likely not collapsible.
2291                  */
2292                 if (backing_object->handle != NULL ||
2293                     (backing_object->type != OBJT_DEFAULT &&
2294                      backing_object->type != OBJT_SWAP) ||
2295                     (backing_object->flags & OBJ_DEAD) ||
2296                     object->handle != NULL ||
2297                     (object->type != OBJT_DEFAULT &&
2298                      object->type != OBJT_SWAP) ||
2299                     (object->flags & OBJ_DEAD)) {
2300                         break;
2301                 }
2302
2303                 /*
2304                  * If paging is in progress we can't do a normal collapse.
2305                  */
2306                 if (
2307                     object->paging_in_progress != 0 ||
2308                     backing_object->paging_in_progress != 0
2309                 ) {
2310                         vm_object_qcollapse(object, backing_object);
2311                         break;
2312                 }
2313
2314                 /*
2315                  * We know that we can either collapse the backing object (if
2316                  * the parent is the only reference to it) or (perhaps) have
2317                  * the parent bypass the object if the parent happens to shadow
2318                  * all the resident pages in the entire backing object.
2319                  *
2320                  * This is ignoring pager-backed pages such as swap pages.
2321                  * vm_object_backing_scan fails the shadowing test in this
2322                  * case.
2323                  */
2324                 if (backing_object->ref_count == 1) {
2325                         /*
2326                          * If there is exactly one reference to the backing
2327                          * object, we can collapse it into the parent.  
2328                          */
2329                         KKASSERT(object->backing_object == backing_object);
2330                         vm_object_backing_scan(object, backing_object,
2331                                                OBSC_COLLAPSE_WAIT);
2332
2333                         /*
2334                          * Move the pager from backing_object to object.
2335                          */
2336                         if (backing_object->type == OBJT_SWAP) {
2337                                 vm_object_pip_add(backing_object, 1);
2338
2339                                 /*
2340                                  * scrap the paging_offset junk and do a 
2341                                  * discrete copy.  This also removes major 
2342                                  * assumptions about how the swap-pager 
2343                                  * works from where it doesn't belong.  The
2344                                  * new swapper is able to optimize the
2345                                  * destroy-source case.
2346                                  */
2347                                 vm_object_pip_add(object, 1);
2348                                 swap_pager_copy(backing_object, object,
2349                                     OFF_TO_IDX(object->backing_object_offset),
2350                                     TRUE);
2351                                 vm_object_pip_wakeup(object);
2352                                 vm_object_pip_wakeup(backing_object);
2353                         }
2354
2355                         /*
2356                          * Object now shadows whatever backing_object did.
2357                          * Remove object from backing_object's shadow_list.
2358                          *
2359                          * Removing object from backing_object's shadow list
2360                          * requires releasing a ref, which we will do below.
2361                          */
2362                         KKASSERT(object->backing_object == backing_object);
2363                         if (object->flags & OBJ_ONSHADOW) {
2364                                 LIST_REMOVE(object, shadow_list);
2365                                 backing_object->shadow_count--;
2366                                 backing_object->generation++;
2367                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2368                         }
2369
2370                         /*
2371                          * backing_object->backing_object moves from within
2372                          * backing_object to within object.
2373                          *
2374                          * OBJT_VNODE bbobj's should have empty shadow lists.
2375                          */
2376                         while ((bbobj = backing_object->backing_object) != NULL) {
2377                                 if (bbobj->type == OBJT_VNODE)
2378                                         vm_object_hold_shared(bbobj);
2379                                 else
2380                                         vm_object_hold(bbobj);
2381                                 if (bbobj == backing_object->backing_object)
2382                                         break;
2383                                 vm_object_drop(bbobj);
2384                         }
2385
2386                         /*
2387                          * We are removing backing_object from bbobj's
2388                          * shadow list and adding object to bbobj's shadow
2389                          * list, so the ref_count on bbobj is unchanged.
2390                          */
2391                         if (bbobj) {
2392                                 if (backing_object->flags & OBJ_ONSHADOW) {
2393                                         /* not locked exclusively if vnode */
2394                                         KKASSERT(bbobj->type != OBJT_VNODE);
2395                                         LIST_REMOVE(backing_object,
2396                                                     shadow_list);
2397                                         bbobj->shadow_count--;
2398                                         bbobj->generation++;
2399                                         vm_object_clear_flag(backing_object,
2400                                                              OBJ_ONSHADOW);
2401                                 }
2402                                 backing_object->backing_object = NULL;
2403                         }
2404                         object->backing_object = bbobj;
2405                         if (bbobj) {
2406                                 if (bbobj->type != OBJT_VNODE) {
2407                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2408                                                          object, shadow_list);
2409                                         bbobj->shadow_count++;
2410                                         bbobj->generation++;
2411                                         vm_object_set_flag(object,
2412                                                            OBJ_ONSHADOW);
2413                                 }
2414                         }
2415
2416                         object->backing_object_offset +=
2417                                 backing_object->backing_object_offset;
2418
2419                         vm_object_drop(bbobj);
2420
2421                         /*
2422                          * Discard the old backing_object.  Nothing should be
2423                          * able to ref it, other than a vm_map_split(),
2424                          * and vm_map_split() will stall on our chain lock.
2425                          * And we control the parent so it shouldn't be
2426                          * possible for it to go away either.
2427                          *
2428                          * Since the backing object has no pages, no pager
2429                          * left, and no object references within it, all
2430                          * that is necessary is to dispose of it.
2431                          */
2432                         KASSERT(backing_object->ref_count == 1,
2433                                 ("backing_object %p was somehow "
2434                                  "re-referenced during collapse!",
2435                                  backing_object));
2436                         KASSERT(RB_EMPTY(&backing_object->rb_memq),
2437                                 ("backing_object %p somehow has left "
2438                                  "over pages during collapse!",
2439                                  backing_object));
2440
2441                         /*
2442                          * The object can be destroyed.
2443                          *
2444                          * XXX just fall through and dodealloc instead
2445                          *     of forcing destruction?
2446                          */
2447                         atomic_add_int(&backing_object->ref_count, -1);
2448 #if defined(DEBUG_LOCKS)
2449                         debugvm_object_add(backing_object, "collapse", 1, -1);
2450 #endif
2451                         if ((backing_object->flags & OBJ_DEAD) == 0)
2452                                 vm_object_terminate(backing_object);
2453                         object_collapses++;
2454                         dodealloc = 0;
2455                 } else {
2456                         /*
2457                          * If we do not entirely shadow the backing object,
2458                          * there is nothing we can do so we give up.
2459                          */
2460                         if (vm_object_backing_scan(object, backing_object,
2461                                                 OBSC_TEST_ALL_SHADOWED) == 0) {
2462                                 break;
2463                         }
2464
2465                         /*
2466                          * bbobj is backing_object->backing_object.  Since
2467                          * object completely shadows backing_object we can
2468                          * bypass it and become backed by bbobj instead.
2469                          *
2470                          * The shadow list for vnode backing objects is not
2471                          * used and a shared hold is allowed.
2472                          */
2473                         while ((bbobj = backing_object->backing_object) != NULL) {
2474                                 if (bbobj->type == OBJT_VNODE)
2475                                         vm_object_hold_shared(bbobj);
2476                                 else
2477                                         vm_object_hold(bbobj);
2478                                 if (bbobj == backing_object->backing_object)
2479                                         break;
2480                                 vm_object_drop(bbobj);
2481                         }
2482
2483                         /*
2484                          * Make object shadow bbobj instead of backing_object.
2485                          * Remove object from backing_object's shadow list.
2486                          *
2487                          * Deallocating backing_object will not remove
2488                          * it, since its reference count is at least 2.
2489                          *
2490                          * Removing object from backing_object's shadow
2491                          * list requires releasing a ref, which we do
2492                          * below by setting dodealloc to 1.
2493                          */
2494                         KKASSERT(object->backing_object == backing_object);
2495                         if (object->flags & OBJ_ONSHADOW) {
2496                                 LIST_REMOVE(object, shadow_list);
2497                                 backing_object->shadow_count--;
2498                                 backing_object->generation++;
2499                                 vm_object_clear_flag(object, OBJ_ONSHADOW);
2500                         }
2501
2502                         /*
2503                          * Add a ref to bbobj, bbobj now shadows object.
2504                          *
2505                          * NOTE: backing_object->backing_object still points
2506                          *       to bbobj.  That relationship remains intact
2507                          *       because backing_object has > 1 ref, so
2508                          *       someone else is pointing to it (hence why
2509                          *       we can't collapse it into object and can
2510                          *       only handle the all-shadowed bypass case).
2511                          */
2512                         if (bbobj) {
2513                                 if (bbobj->type != OBJT_VNODE) {
2514                                         vm_object_chain_wait(bbobj, 0);
2515                                         vm_object_reference_locked(bbobj);
2516                                         LIST_INSERT_HEAD(&bbobj->shadow_head,
2517                                                          object, shadow_list);
2518                                         bbobj->shadow_count++;
2519                                         bbobj->generation++;
2520                                         vm_object_set_flag(object,
2521                                                            OBJ_ONSHADOW);
2522                                 } else {
2523                                         vm_object_reference_quick(bbobj);
2524                                 }
2525                                 object->backing_object_offset +=
2526                                         backing_object->backing_object_offset;
2527                                 object->backing_object = bbobj;
2528                                 vm_object_drop(bbobj);
2529                         } else {
2530                                 object->backing_object = NULL;
2531                         }
2532
2533                         /*
2534                          * Drop the reference count on backing_object.  To
2535                          * handle ref_count races properly we can't assume
2536                          * that the ref_count is still at least 2 so we
2537                          * have to actually call vm_object_deallocate()
2538                          * (after clearing the chainlock).
2539                          */
2540                         object_bypasses++;
2541                         dodealloc = 1;
2542                 }
2543
2544                 /*
2545                  * Ok, we want to loop on the new object->bbobj association,
2546                  * possibly collapsing it further.  However if dodealloc is
2547                  * non-zero we have to deallocate the backing_object which
2548                  * itself can potentially undergo a collapse, creating a
2549                  * recursion depth issue with the LWKT token subsystem.
2550                  *
2551                  * In the case where we must deallocate the backing_object
2552                  * it is possible now that the backing_object has a single
2553                  * shadow count on some other object (not represented here
2554                  * as yet), since it no longer shadows us.  Thus when we
2555                  * call vm_object_deallocate() it may attempt to collapse
2556                  * itself into its remaining parent.
2557                  */
2558                 if (dodealloc) {
2559                         struct vm_object_dealloc_list *dtmp;
2560
2561                         vm_object_chain_release(backing_object);
2562                         vm_object_unlock(backing_object);
2563                         /* backing_object remains held */
2564
2565                         /*
2566                          * Auto-deallocation list for caller convenience.
2567                          */
2568                         if (dlistp == NULL)
2569                                 dlistp = &dlist;
2570
2571                         dtmp = kmalloc(sizeof(*dtmp), M_TEMP, M_WAITOK);
2572                         dtmp->object = backing_object;
2573                         dtmp->next = *dlistp;
2574                         *dlistp = dtmp;
2575                 } else {
2576                         vm_object_chain_release(backing_object);
2577                         vm_object_drop(backing_object);
2578                 }
2579                 /* backing_object = NULL; not needed */
2580                 /* loop */
2581         }
2582
2583         /*
2584          * Clean up any left over backing_object
2585          */
2586         if (backing_object) {
2587                 vm_object_chain_release(backing_object);
2588                 vm_object_drop(backing_object);
2589         }
2590
2591         /*
2592          * Clean up any auto-deallocation list.  This is a convenience
2593          * for top-level callers so they don't have to pass &dlist.
2594          * Do not clean up any caller-passed dlistp, the caller will
2595          * do that.
2596          */
2597         if (dlist)
2598                 vm_object_deallocate_list(&dlist);
2600 }
2601
2602 /*
2603  * vm_object_collapse() may collect additional objects in need of
2604  * deallocation.  This routine deallocates these objects.  The
2605  * deallocation itself can trigger additional collapses (which the
2606  * deallocate function takes care of).  This procedure is used to
2607  * reduce procedural recursion since these vm_object shadow chains
2608  * can become quite long.
2609  */
2610 void
2611 vm_object_deallocate_list(struct vm_object_dealloc_list **dlistp)
2612 {
2613         struct vm_object_dealloc_list *dlist;
2614
2615         while ((dlist = *dlistp) != NULL) {
2616                 *dlistp = dlist->next;
2617                 vm_object_lock(dlist->object);
2618                 vm_object_deallocate_locked(dlist->object);
2619                 vm_object_drop(dlist->object);
2620                 kfree(dlist, M_TEMP);
2621         }
2622 }
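
/*
 * Usage sketch (hypothetical caller): let vm_object_collapse() queue
 * deferred deallocations and then retire them iteratively instead of
 * recursing down a long shadow chain:
 *
 *	struct vm_object_dealloc_list *dlist = NULL;
 *
 *	vm_object_collapse(object, &dlist);
 *	vm_object_deallocate_list(&dlist);
 */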
2623
2624 /*
2625  * Removes all physical pages in the specified object range from the
2626  * object's list of pages.
2627  *
2628  * No requirements.
2629  */
2630 static int vm_object_page_remove_callback(vm_page_t p, void *data);
2631
2632 void
2633 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
2634                       boolean_t clean_only)
2635 {
2636         struct rb_vm_page_scan_info info;
2637         int all;
2638
2639         /*
2640          * Degenerate cases and assertions
2641          */
2642         vm_object_hold(object);
2643         if (object == NULL ||
2644             (object->resident_page_count == 0 && object->swblock_count == 0)) {
2645                 vm_object_drop(object);
2646                 return;
2647         }
2648         KASSERT(object->type != OBJT_PHYS, 
2649                 ("attempt to remove pages from a physical object"));
2650
2651         /*
2652          * Indicate that paging is occurring on the object
2653          */
2654         vm_object_pip_add(object, 1);
2655
2656         /*
2657          * Figure out the actual removal range and whether we are removing
2658          * the entire contents of the object or not.  If removing the entire
2659          * contents, be sure to get all pages, even those that might be 
2660          * beyond the end of the object.
2661          */
2662         info.object = object;
2663         info.start_pindex = start;
2664         if (end == 0)
2665                 info.end_pindex = (vm_pindex_t)-1;
2666         else
2667                 info.end_pindex = end - 1;
2668         info.limit = clean_only;
2669         all = (start == 0 && info.end_pindex >= object->size - 1);
2670
2671         /*
2672          * Loop until we are sure we have gotten them all.
2673          */
2674         do {
2675                 info.error = 0;
2676                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2677                                         vm_object_page_remove_callback, &info);
2678         } while (info.error);
2679
2680         /*
2681          * Remove any related swap if throwing away pages, or for
2682          * non-swap objects (the swap is a clean copy in that case).
2683          */
2684         if (object->type != OBJT_SWAP || clean_only == FALSE) {
2685                 if (all)
2686                         swap_pager_freespace_all(object);
2687                 else
2688                         swap_pager_freespace(object, info.start_pindex,
2689                              info.end_pindex - info.start_pindex + 1);
2690         }
2691
2692         /*
2693          * Cleanup
2694          */
2695         vm_object_pip_wakeup(object);
2696         vm_object_drop(object);
2697 }
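
/*
 * Example (hypothetical truncation sketch): discard everything from
 * page index 'newpages' onward, dirty pages included.  end == 0
 * extends the removal to the end of the object:
 *
 *	vm_object_page_remove(object, newpages, 0, FALSE);
 */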
2698
2699 /*
2700  * The caller must hold the object.
2701  */
2702 static int
2703 vm_object_page_remove_callback(vm_page_t p, void *data)
2704 {
2705         struct rb_vm_page_scan_info *info = data;
2706
2707         if ((++info->count & 63) == 0)
2708                 lwkt_user_yield();
2709
2710         if (info->object != p->object ||
2711             p->pindex < info->start_pindex ||
2712             p->pindex > info->end_pindex) {
2713                 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
2714                         info->object, p);
2715                 return(0);
2716         }
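        /*
         * Editor's note: if the page cannot be busied we sleep on it and
         * flag info->error, which causes vm_object_page_remove() to
         * restart its RB-tree scan from the top.
         */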
2717         if (vm_page_busy_try(p, TRUE)) {
2718                 vm_page_sleep_busy(p, TRUE, "vmopar");
2719                 info->error = 1;
2720                 return(0);
2721         }
2722         if (info->object != p->object) {
2723                 /* this should never happen */
2724                 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
2725                         info->object, p);
2726                 vm_page_wakeup(p);
2727                 return(0);
2728         }
2729
2730         /*
2731          * Wired pages cannot be destroyed, but they can be invalidated
2732          * and we do so if clean_only (limit) is not set.
2733          *
2734          * WARNING!  The page may be wired due to being part of a buffer
2735          *           cache buffer, and the buffer might be marked B_CACHE.
2736          *           This is fine as part of a truncation but VFSs must be
2737          *           sure to fix the buffer up when re-extending the file.
2738          *
2739          * NOTE!     PG_NEED_COMMIT is ignored.
2740          */
2741         if (p->wire_count != 0) {
2742                 vm_page_protect(p, VM_PROT_NONE);
2743                 if (info->limit == 0)
2744                         p->valid = 0;
2745                 vm_page_wakeup(p);
2746                 return(0);
2747         }
2748
2749         /*
2750          * limit is our clean_only flag.  If set and the page is dirty or
2751          * requires a commit, do not free it.  If set and the page is being
2752          * held by someone, do not free it.
2753          */
2754         if (info->limit && p->valid) {
2755                 vm_page_test_dirty(p);
2756                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
2757                         vm_page_wakeup(p);
2758                         return(0);
2759                 }
2760         }
2761
2762         /*
2763          * Destroy the page
2764          */
2765         vm_page_protect(p, VM_PROT_NONE);
2766         vm_page_free(p);
2767
2768         return(0);
2769 }
2770
2771 /*
2772  * Coalesces two objects backing up adjoining regions of memory into a
2773  * single object.
2774  *
2775  * Returns TRUE if the objects were combined.
2776  *
2777  * NOTE: Only works at the moment if the second object is NULL -
2778  *       if it's not, which object do we lock first?
2779  *
2780  * Parameters:
2781  *      prev_object     First object to coalesce
2782  *      prev_pindex     Page index into prev_object at which the
2783  *                      existing reference begins
2784  *
2785  *      prev_size       Size (in bytes) of the existing reference to
2786  *                      prev_object
2787  *      next_size       Size (in bytes) of the adjoining region to absorb
2788  *
2789  * The caller does not need to hold (prev_object) but must have a stable
2790  * pointer to it (typically by holding the vm_map locked).
2791  */
2792 boolean_t
2793 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
2794                    vm_size_t prev_size, vm_size_t next_size)
2795 {
2796         vm_pindex_t next_pindex;
2797
2798         if (prev_object == NULL)
2799                 return (TRUE);
2800
2801         vm_object_hold(prev_object);
2802
2803         if (prev_object->type != OBJT_DEFAULT &&
2804             prev_object->type != OBJT_SWAP) {
2805                 vm_object_drop(prev_object);
2806                 return (FALSE);
2807         }
2808
2809         /*
2810          * Try to collapse the object first
2811          */
2812         vm_object_chain_acquire(prev_object, 0);
2813         vm_object_collapse(prev_object, NULL);
2814
2815         /*
2816          * Can't coalesce if the object has more than one reference, is paged
2817          * out, shadows another object, or has a copy elsewhere (any of which
2818          * mean that the pages not mapped to prev_entry may be in use anyway).
2819          */
2820 
2821         if (prev_object->backing_object != NULL) {
2822                 vm_object_chain_release(prev_object);
2823                 vm_object_drop(prev_object);
2824                 return (FALSE);
2825         }
2826
2827         prev_size >>= PAGE_SHIFT;
2828         next_size >>= PAGE_SHIFT;
2829         next_pindex = prev_pindex + prev_size;
2830
2831         if ((prev_object->ref_count > 1) &&
2832             (prev_object->size != next_pindex)) {
2833                 vm_object_chain_release(prev_object);
2834                 vm_object_drop(prev_object);
2835                 return (FALSE);
2836         }
2837
2838         /*
2839          * Remove any pages that may still be in the object from a previous
2840          * deallocation.
2841          */
2842         if (next_pindex < prev_object->size) {
2843                 vm_object_page_remove(prev_object,
2844                                       next_pindex,
2845                                       next_pindex + next_size, FALSE);
2846                 if (prev_object->type == OBJT_SWAP)
2847                         swap_pager_freespace(prev_object,
2848                                              next_pindex, next_size);
2849         }
2850
2851         /*
2852          * Extend the object if necessary.
2853          */
2854         if (next_pindex + next_size > prev_object->size)
2855                 prev_object->size = next_pindex + next_size;
2856
2857         vm_object_chain_release(prev_object);
2858         vm_object_drop(prev_object);
2859         return (TRUE);
2860 }
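
/*
 * Editor's example (a sketch; 'grow' is a hypothetical byte count): a
 * caller extending a mapping can try to absorb the new range into the
 * existing object before falling back to allocating a new one:
 *
 *      if (vm_object_coalesce(prev_object, prev_pindex, prev_size, grow))
 *              ... prev_object->size now covers the appended range ...
 */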
2861
2862 /*
2863  * Make the object writable and flag it as being possibly dirty.
2864  *
2865  * The object might not be held (or might be held but held shared),
2866  * the related vnode is probably not held either.  Object and vnode are
2867  * stable by virtue of the vm_page busied by the caller preventing
2868  * destruction.
2869  *
2870  * If the related mount is flagged MNTK_THR_SYNC we need to call
2871  * vsetobjdirty().  Filesystems using this option usually shortcut
2872  * synchronization by only scanning the syncer list.
2873  */
2874 void
2875 vm_object_set_writeable_dirty(vm_object_t object)
2876 {
2877         struct vnode *vp;
2878
2879         /*vm_object_assert_held(object);*/
2880         /*
2881          * Avoid contention in vm fault path by checking the state before
2882          * issuing an atomic op on it.
2883          */
2884         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
2885             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
2886                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
2887         }
2888         if (object->type == OBJT_VNODE &&
2889             (vp = (struct vnode *)object->handle) != NULL) {
2890                 if ((vp->v_flag & VOBJDIRTY) == 0) {
2891                         if (vp->v_mount &&
2892                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
2893                                 /*
2894                                  * New style THR_SYNC places vnodes on the
2895                                  * syncer list more deterministically.
2896                                  */
2897                                 vsetobjdirty(vp);
2898                         } else {
2899                                 /*
2900                                  * Old style scan would not necessarily place
2901                                  * a vnode on the syncer list when possibly
2902                                  * modified via mmap.
2903                                  */
2904                                 vsetflags(vp, VOBJDIRTY);
2905                         }
2906                 }
2907         }
2908 }
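
/*
 * Editor's example (a sketch of a plausible call site; the exact
 * sequence varies by caller): the vm_page is busied first, which is
 * what keeps the object and vnode stable across the call:
 *
 *      vm_page_busy_wait(m, FALSE, "vmsdrt");
 *      vm_object_set_writeable_dirty(m->object);
 *      vm_page_dirty(m);
 *      vm_page_wakeup(m);
 */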
2909
2910 #include "opt_ddb.h"
2911 #ifdef DDB
2912 #include <sys/kernel.h>
2913
2914 #include <sys/cons.h>
2915
2916 #include <ddb/ddb.h>
2917
2918 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
2919                                        vm_map_entry_t entry);
2920 static int      vm_object_in_map (vm_object_t object);
2921
2922 /*
2923  * The caller must hold the object.
2924  */
2925 static int
2926 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2927 {
2928         vm_map_t tmpm;
2929         vm_map_entry_t tmpe;
2930         vm_object_t obj, nobj;
2931         int entcount;
2932
2933         if (map == 0)
2934                 return 0;
2935         if (entry == 0) {
2936                 tmpe = map->header.next;
2937                 entcount = map->nentries;
2938                 while (entcount-- && (tmpe != &map->header)) {
2939                         if (_vm_object_in_map(map, object, tmpe)) {
2940                                 return 1;
2941                         }
2942                         tmpe = tmpe->next;
2943                 }
2944                 return (0);
2945         }
2946         switch (entry->maptype) {
2947         case VM_MAPTYPE_SUBMAP:
2948                 tmpm = entry->object.sub_map;
2949                 tmpe = tmpm->header.next;
2950                 entcount = tmpm->nentries;
2951                 while (entcount-- && tmpe != &tmpm->header) {
2952                         if (_vm_object_in_map(tmpm, object, tmpe)) {
2953                                 return 1;
2954                         }
2955                         tmpe = tmpe->next;
2956                 }
2957                 break;
2958         case VM_MAPTYPE_NORMAL:
2959         case VM_MAPTYPE_VPAGETABLE:
2960                 obj = entry->object.vm_object;
2961                 while (obj) {
2962                         if (obj == object) {
2963                                 if (obj != entry->object.vm_object)
2964                                         vm_object_drop(obj);
2965                                 return 1;
2966                         }
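                        /*
                         * Editor's note: hold-then-recheck.  Taking the
                         * hold may block, during which backing_object can
                         * change, so retry until the pointer reads back
                         * stable under the hold.
                         */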
2967                         while ((nobj = obj->backing_object) != NULL) {
2968                                 vm_object_hold(nobj);
2969                                 if (nobj == obj->backing_object)
2970                                         break;
2971                                 vm_object_drop(nobj);
2972                         }
2973                         if (obj != entry->object.vm_object) {
2974                                 if (nobj)
2975                                         vm_object_lock_swap();
2976                                 vm_object_drop(obj);
2977                         }
2978                         obj = nobj;
2979                 }
2980                 break;
2981         default:
2982                 break;
2983         }
2984         return 0;
2985 }
2986
2987 static int vm_object_in_map_callback(struct proc *p, void *data);
2988
2989 struct vm_object_in_map_info {
2990         vm_object_t object;
2991         int rv;
2992 };
2993
2994 /*
2995  * Debugging only
2996  */
2997 static int
2998 vm_object_in_map(vm_object_t object)
2999 {
3000         struct vm_object_in_map_info info;
3001
3002         info.rv = 0;
3003         info.object = object;
3004
3005         allproc_scan(vm_object_in_map_callback, &info);
3006         if (info.rv)
3007                 return 1;
3008         if (_vm_object_in_map(&kernel_map, object, 0))
3009                 return 1;
3010         if (_vm_object_in_map(&pager_map, object, 0))
3011                 return 1;
3012         if (_vm_object_in_map(&buffer_map, object, 0))
3013                 return 1;
3014         return 0;
3015 }
3016
3017 /*
3018  * Debugging only
3019  */
3020 static int
3021 vm_object_in_map_callback(struct proc *p, void *data)
3022 {
3023         struct vm_object_in_map_info *info = data;
3024
3025         if (p->p_vmspace) {
3026                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
3027                         info->rv = 1;
3028                         return -1;
3029                 }
3030         }
3031         return (0);
3032 }
3033
3034 DB_SHOW_COMMAND(vmochk, vm_object_check)
3035 {
3036         vm_object_t object;
3037         int n;
3038
3039         /*
3040          * make sure that internal objs are in a map somewhere
3041          * and none have zero ref counts.
3042          */
3043         for (n = 0; n < VMOBJ_HSIZE; ++n) {
3044                 for (object = TAILQ_FIRST(&vm_object_lists[n]);
3045                                 object != NULL;
3046                                 object = TAILQ_NEXT(object, object_list)) {
3047                         if (object->type == OBJT_MARKER)
3048                                 continue;
3049                         if (object->handle != NULL ||
3050                             (object->type != OBJT_DEFAULT &&
3051                              object->type != OBJT_SWAP)) {
3052                                 continue;
3053                         }
3054                         if (object->ref_count == 0) {
3055                                 db_printf("vmochk: internal obj has "
3056                                           "zero ref count, size: %ld\n",
3057                                           (long)object->size);
3058                         }
3059                         if (vm_object_in_map(object))
3060                                 continue;
3061                         db_printf("vmochk: internal obj is not in a map: "
3062                                   "ref: %d, size: %lu (0x%lx), "
3063                                   "backing_object: %p\n",
3064                                   object->ref_count, (u_long)object->size,
3065                                   (u_long)object->size,
3066                                   (void *)object->backing_object);
3067                 }
3068         }
3069 }
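
/*
 * Editor's note: from DDB this is invoked as 'show vmochk'; it prints a
 * line for each anonymous (OBJT_DEFAULT/OBJT_SWAP) object it cannot
 * find in any map.
 */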
3070
3071 /*
3072  * Debugging only
3073  */
3074 DB_SHOW_COMMAND(object, vm_object_print_static)
3075 {
3076         /* XXX convert args. */
3077         vm_object_t object = (vm_object_t)addr;
3078         boolean_t full = have_addr;
3079
3080         vm_page_t p;
3081
3082         /* XXX count is an (unused) arg.  Avoid shadowing it. */
3083 #define count   was_count
3084
3085         int count;
3086
3087         if (object == NULL)
3088                 return;
3089
3090         db_iprintf(
3091             "Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
3092             object, (int)object->type, (u_long)object->size,
3093             object->resident_page_count, object->ref_count, object->flags);
3094         /*
3095          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
3096          */
3097         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
3098             object->shadow_count, 
3099             object->backing_object ? object->backing_object->ref_count : 0,
3100             object->backing_object, (long)object->backing_object_offset);
3101
3102         if (!full)
3103                 return;
3104
3105         db_indent += 2;
3106         count = 0;
3107         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
3108                 if (count == 0)
3109                         db_iprintf("memory:=");
3110                 else if (count == 6) {
3111                         db_printf("\n");
3112                         db_iprintf(" ...");
3113                         count = 0;
3114                 } else
3115                         db_printf(",");
3116                 count++;
3117
3118                 db_printf("(off=0x%lx,page=0x%lx)",
3119                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
3120         }
3121         if (count != 0)
3122                 db_printf("\n");
3123         db_indent -= 2;
3124 }
3125
3126 /* XXX. */
3127 #undef count
3128
3129 /*
3130  * XXX need this non-static entry for calling from vm_map_print.
3131  *
3132  * Debugging only
3133  */
3134 void
3135 vm_object_print(/* db_expr_t */ long addr,
3136                 boolean_t have_addr,
3137                 /* db_expr_t */ long count,
3138                 char *modif)
3139 {
3140         vm_object_print_static(addr, have_addr, count, modif);
3141 }
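
/*
 * Editor's note: the DB_SHOW_COMMAND above is reached as 'show object
 * <addr>' in DDB; since 'full' is taken from have_addr, supplying an
 * address also dumps the object's resident page list.
 */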
3142
3143 /*
3144  * Debugging only
3145  */
3146 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
3147 {
3148         vm_object_t object;
3149         int nl = 0;
3150         int c;
3151         int n;
3152
3153         for (n = 0; n < VMOBJ_HSIZE; ++n) {
3154                 for (object = TAILQ_FIRST(&vm_object_lists[n]);
3155                                 object != NULL;
3156                                 object = TAILQ_NEXT(object, object_list)) {
3157                         vm_pindex_t idx, fidx;
3158                         vm_pindex_t osize;
3159                         vm_paddr_t pa = -1, padiff;
3160                         int rcount;
3161                         vm_page_t m;
3162
3163                         if (object->type == OBJT_MARKER)
3164                                 continue;
3165                         db_printf("new object: %p\n", (void *)object);
3166                         if (nl > 18) {
3167                                 c = cngetc();
3168                                 if (c != ' ')
3169                                         return;
3170                                 nl = 0;
3171                         }
3172                         nl++;
3173                         rcount = 0;
3174                         fidx = 0;
3175                         osize = object->size;
3176                         if (osize > 128)
3177                                 osize = 128;
3178                         for (idx = 0; idx < osize; idx++) {
3179                                 m = vm_page_lookup(object, idx);
3180                                 if (m == NULL) {
3181                                         if (rcount) {
3182                                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3183                                                         (long)fidx, rcount, (long)pa);
3184                                                 if (nl > 18) {
3185                                                         c = cngetc();
3186                                                         if (c != ' ')
3187                                                                 return;
3188                                                         nl = 0;
3189                                                 }
3190                                                 nl++;
3191                                                 rcount = 0;
3192                                         }
3193                                         continue;
3194                                 }
3195
3196                                 if (rcount &&
3197                                         (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
3198                                         ++rcount;
3199                                         continue;
3200                                 }
3201                                 if (rcount) {
3202                                         padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
3203                                         padiff >>= PAGE_SHIFT;
3204                                         padiff &= PQ_L2_MASK;
3205                                         if (padiff == 0) {
3206                                                 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
3207                                                 ++rcount;
3208                                                 continue;
3209                                         }
3210                                         db_printf(" index(%ld)run(%d)pa(0x%lx)",
3211                                                 (long)fidx, rcount, (long)pa);
3212                                         db_printf("pd(%ld)\n", (long)padiff);
3213                                         if (nl > 18) {
3214                                                 c = cngetc();
3215                                                 if (c != ' ')
3216                                                         return;
3217                                                 nl = 0;
3218                                         }
3219                                         nl++;
3220                                 }
3221                                 fidx = idx;
3222                                 pa = VM_PAGE_TO_PHYS(m);
3223                                 rcount = 1;
3224                         }
3225                         if (rcount) {
3226                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3227                                         (long)fidx, rcount, (long)pa);
3228                                 if (nl > 18) {
3229                                         c = cngetc();
3230                                         if (c != ' ')
3231                                                 return;
3232                                         nl = 0;
3233                                 }
3234                                 nl++;
3235                         }
3236                 }
3237         }
3238 }
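
/*
 * Editor's note: 'show vmopag' in DDB prints runs of physically
 * contiguous pages for each object; at the pager pause (every ~18
 * lines) any key other than space aborts the listing.
 */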
3239 #endif /* DDB */