1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62
63 /*
64  *      Virtual memory object module.
65  */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>           /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91
92 #include <vm/vm_page2.h>
93
94 #include <machine/specialreg.h>
95
96 #define EASY_SCAN_FACTOR        8
97
98 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
99                                              int pagerflags);
100 static void     vm_object_lock_init(vm_object_t);
101
102 /*
103  *      Virtual memory objects maintain the actual data
104  *      associated with allocated virtual memory.  A given
105  *      page of memory exists within exactly one object.
106  *
107  *      An object is only deallocated when all "references"
108  *      are given up.  Only one "reference" to a given
109  *      region of an object should be writeable.
110  *
111  *      Associated with each object is a list of all resident
112  *      memory pages belonging to that object; this list is
113  *      maintained by the "vm_page" module, and locked by the object's
114  *      lock.
115  *
116  *      Each object also records a "pager" routine which is
117  *      used to retrieve (and store) pages to the proper backing
118  *      storage.  In addition, objects may be backed by other
119  *      objects from which they were virtual-copied.
120  *
121  *      The only items within the object structure which are
122  *      modified after time of creation are:
123  *              reference count         locked by object's lock
124  *              pager routine           locked by object's lock
125  *
126  */
127
128 struct vm_object kernel_object;
129
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
131
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
133
134 #define VMOBJ_HASH_PRIME1       66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2       989042931893ULL
136
137 int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
139
140 static __inline
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
143 {
144         uintptr_t hash1;
145         uintptr_t hash2;
146
147         hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148         hash1 %= VMOBJ_HASH_PRIME1;
149         hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150         hash2 %= VMOBJ_HASH_PRIME2;
151         return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
152 }
153
154 #if defined(DEBUG_LOCKS)
155
156 #define vm_object_vndeallocate(obj, vpp)        \
157                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
158
159 /*
160  * Debug helper to track hold/drop/ref/deallocate calls.
161  */
162 static void
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
164 {
165         int i;
166
167         i = atomic_fetchadd_int(&obj->debug_index, 1);
168         i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169         ksnprintf(obj->debug_hold_thrs[i],
170                   sizeof(obj->debug_hold_thrs[i]),
171                   "%c%d:(%d):%s",
172                   (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173                   (curthread->td_proc ? curthread->td_proc->p_pid : -1),
174                   obj->ref_count,
175                   curthread->td_comm);
176         obj->debug_hold_file[i] = file;
177         obj->debug_hold_line[i] = line;
178 #if 0
179         /* Uncomment for debugging obj refs/derefs in reproducible cases */
180         if (strcmp(curthread->td_comm, "sshd") == 0) {
181                 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182                         (curthread->td_proc ? curthread->td_proc->p_pid : -1),
183                         obj, obj->ref_count, addrem, file, line);
184         }
185 #endif
186 }
187
188 #endif
189
190 /*
191  * Misc low level routines
192  */
193 static void
194 vm_object_lock_init(vm_object_t obj)
195 {
196 #if defined(DEBUG_LOCKS)
197         int i;
198
199         obj->debug_index = 0;
200         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201                 obj->debug_hold_thrs[i][0] = 0;
202                 obj->debug_hold_file[i] = NULL;
203                 obj->debug_hold_line[i] = 0;
204         }
205 #endif
206 }
207
208 void
209 vm_object_lock_swap(void)
210 {
211         lwkt_token_swap();
212 }
213
214 void
215 vm_object_lock(vm_object_t obj)
216 {
217         lwkt_gettoken(&obj->token);
218 }
219
220 /*
221  * Returns TRUE on success
222  */
223 static int
224 vm_object_lock_try(vm_object_t obj)
225 {
226         return(lwkt_trytoken(&obj->token));
227 }
228
229 void
230 vm_object_lock_shared(vm_object_t obj)
231 {
232         lwkt_gettoken_shared(&obj->token);
233 }
234
235 void
236 vm_object_unlock(vm_object_t obj)
237 {
238         lwkt_reltoken(&obj->token);
239 }
240
241 void
242 vm_object_upgrade(vm_object_t obj)
243 {
244         lwkt_reltoken(&obj->token);
245         lwkt_gettoken(&obj->token);
246 }
247
248 void
249 vm_object_downgrade(vm_object_t obj)
250 {
251         lwkt_reltoken(&obj->token);
252         lwkt_gettoken_shared(&obj->token);
253 }
254
255 static __inline void
256 vm_object_assert_held(vm_object_t obj)
257 {
258         ASSERT_LWKT_TOKEN_HELD(&obj->token);
259 }
260
261 int
262 vm_quickcolor(void)
263 {
264         globaldata_t gd = mycpu;
265         int pg_color;
266
267         pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268         pg_color += gd->gd_quick_color;
269         gd->gd_quick_color += PQ_PRIME2;
270
271         return pg_color;
272 }
273
274 void
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
276 {
277         KKASSERT(obj != NULL);
278
279         /*
280          * Object must be held (object allocation is stable due to the caller's
281          * context, typically already holding the token on a parent object)
282          * prior to potentially blocking on the lock, otherwise the object
283          * can get ripped away from us.
284          */
285         refcount_acquire(&obj->hold_count);
286         vm_object_lock(obj);
287
288 #if defined(DEBUG_LOCKS)
289         debugvm_object_add(obj, file, line, 1);
290 #endif
291 }
292
293 int
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
295 {
296         KKASSERT(obj != NULL);
297
298         /*
299          * Object must be held (object allocation is stable due to the caller's
300          * context, typically already holding the token on a parent object)
301          * prior to potentially blocking on the lock, otherwise the object
302          * can get ripped away from us.
303          */
304         refcount_acquire(&obj->hold_count);
305         if (vm_object_lock_try(obj) == 0) {
306                 if (refcount_release(&obj->hold_count)) {
307                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308                                 kfree(obj, M_VM_OBJECT);
309                 }
310                 return(0);
311         }
312
313 #if defined(DEBUG_LOCKS)
314         debugvm_object_add(obj, file, line, 1);
315 #endif
316         return(1);
317 }
318
319 void
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
321 {
322         KKASSERT(obj != NULL);
323
324         /*
325          * Object must be held (object allocation is stable due to the caller's
326          * context, typically already holding the token on a parent object)
327          * prior to potentially blocking on the lock, otherwise the object
328          * can get ripped away from us.
329          */
330         refcount_acquire(&obj->hold_count);
331         vm_object_lock_shared(obj);
332
333 #if defined(DEBUG_LOCKS)
334         debugvm_object_add(obj, file, line, 1);
335 #endif
336 }
337
338 /*
339  * Drop the token and hold_count on the object.
340  *
341  * WARNING! Token might be shared.
342  */
343 void
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
345 {
346         if (obj == NULL)
347                 return;
348
349         /*
350          * No new holders should be possible once we drop hold_count 1->0 as
351          * there is no longer any way to reference the object.
352          */
353         KKASSERT(obj->hold_count > 0);
354         if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356                 debugvm_object_add(obj, file, line, -1);
357 #endif
358
359                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360                         vm_object_unlock(obj);
361                         kfree(obj, M_VM_OBJECT);
362                 } else {
363                         vm_object_unlock(obj);
364                 }
365         } else {
366 #if defined(DEBUG_LOCKS)
367                 debugvm_object_add(obj, file, line, -1);
368 #endif
369                 vm_object_unlock(obj);
370         }
371 }
372
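/*
 * Illustrative sketch (exposition only, compiled out): the typical
 * pairing of the hold/lock primitives above.  A caller with a stable
 * pointer to an object (e.g. via a held parent object or a locked map
 * entry) holds the object around any field access, then drops it.  The
 * helper name below is hypothetical.
 */
#if 0
static void
example_object_access(vm_object_t obj)
{
	vm_object_hold(obj);		/* hold_count + exclusive token */
	kprintf("object %p: %ld resident pages, ref_count %d\n",
		obj, obj->resident_page_count, obj->ref_count);
	vm_object_drop(obj);		/* release token, drop hold_count */
}
#endif
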
373 /*
374  * Initialize a freshly allocated object, returning a held object.
375  *
376  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
377  *
378  * No requirements.
379  */
380 void
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
382 {
383         struct vm_object_hash *hash;
384
385         RB_INIT(&object->rb_memq);
386         lwkt_token_init(&object->token, "vmobj");
387
388         TAILQ_INIT(&object->backing_list);
389         lockinit(&object->backing_lk, "baclk", 0, 0);
390
391         object->type = type;
392         object->size = size;
393         object->ref_count = 1;
394         object->memattr = VM_MEMATTR_DEFAULT;
395         object->hold_count = 0;
396         object->flags = 0;
397         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
398                 vm_object_set_flag(object, OBJ_ONEMAPPING);
399         object->paging_in_progress = 0;
400         object->resident_page_count = 0;
401         /* cpu localization twist */
402         object->pg_color = vm_quickcolor();
403         object->handle = NULL;
404
405         atomic_add_int(&object->generation, 1);
406         object->swblock_count = 0;
407         RB_INIT(&object->swblock_root);
408         vm_object_lock_init(object);
409         pmap_object_init(object);
410
411         vm_object_hold(object);
412
413         hash = vmobj_hash(object);
414         lwkt_gettoken(&hash->token);
415         TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
416         lwkt_reltoken(&hash->token);
417 }
418
419 /*
420  * Initialize a VM object.
421  */
422 void
423 vm_object_init(vm_object_t object, vm_pindex_t size)
424 {
425         _vm_object_allocate(OBJT_DEFAULT, size, object);
426         vm_object_drop(object);
427 }
428
429 /*
430  * Initialize the VM objects module.
431  *
432  * Called from the low level boot code only.  Note that this occurs before
433  * kmalloc is initialized so we cannot allocate any VM objects.
434  */
435 void
436 vm_object_init1(void)
437 {
438         int i;
439
440         for (i = 0; i < VMOBJ_HSIZE; ++i) {
441                 TAILQ_INIT(&vm_object_hash[i].list);
442                 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
443         }
444
445         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
446                             &kernel_object);
447         vm_object_drop(&kernel_object);
448 }
449
450 void
451 vm_object_init2(void)
452 {
453         kmalloc_set_unlimited(M_VM_OBJECT);
454 }
455
456 /*
457  * Allocate and return a new object of the specified type and size.
458  *
459  * No requirements.
460  */
461 vm_object_t
462 vm_object_allocate(objtype_t type, vm_pindex_t size)
463 {
464         vm_object_t obj;
465
466         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
467         _vm_object_allocate(type, size, obj);
468         vm_object_drop(obj);
469
470         return (obj);
471 }
472
473 /*
474  * This version returns a held object, allowing further atomic initialization
475  * of the object.
476  */
477 vm_object_t
478 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
479 {
480         vm_object_t obj;
481
482         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
483         _vm_object_allocate(type, size, obj);
484
485         return (obj);
486 }
487
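/*
 * Illustrative sketch (exposition only, compiled out): allocating an
 * anonymous object.  vm_object_allocate() returns the object unheld,
 * while the _hold variant keeps it held so additional setup can be
 * completed before other threads can find it.  The helper name and the
 * 'bytes' parameter are hypothetical.
 */
#if 0
static vm_object_t
example_object_create(vm_ooffset_t bytes)
{
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, OFF_TO_IDX(bytes));
	/* ... further atomic initialization while held ... */
	vm_object_drop(obj);

	return (obj);
}
#endif
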
488 /*
489  * Add an additional reference to a vm_object.  The object must already be
490  * held.  The original non-lock version is no longer supported.  The object
491  * must NOT be chain locked by anyone at the time the reference is added.
492  *
493  * The object must be held, but may be held shared if desired (hence why
494  * we use an atomic op).
495  */
496 void
497 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
498 {
499         KKASSERT(object != NULL);
500         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
501         atomic_add_int(&object->ref_count, 1);
502         if (object->type == OBJT_VNODE) {
503                 vref(object->handle);
504                 /* XXX what if the vnode is being destroyed? */
505         }
506 #if defined(DEBUG_LOCKS)
507         debugvm_object_add(object, file, line, 1);
508 #endif
509 }
510
511 /*
512  * This version is only allowed in situations where the caller
513  * already knows that the object is deterministically referenced
514  * (usually because its taken from a ref'd vnode, or during a map_entry
515  * (usually because it's taken from a ref'd vnode, or during a map_entry
516  */
517 void
518 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
519 {
520         KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
521         atomic_add_int(&object->ref_count, 1);
522         if (object->type == OBJT_VNODE)
523                 vref(object->handle);
524 #if defined(DEBUG_LOCKS)
525         debugvm_object_add(object, file, line, 1);
526 #endif
527 }
528
529 /*
530  * Dereference an object and its underlying vnode.  The object may be
531  * held shared.  On return the object will remain held.
532  *
533  * This function may return a vnode in *vpp which the caller must release
534  * after the caller drops its own lock.  If vpp is NULL, we assume that
535  * the caller was holding an exclusive lock on the object and we vrele()
536  * the vp ourselves.
537  */
538 static void
539 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
540                                    VMOBJDBARGS)
541 {
542         struct vnode *vp = (struct vnode *) object->handle;
543
544         KASSERT(object->type == OBJT_VNODE,
545             ("vm_object_vndeallocate: not a vnode object"));
546         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
547         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
548 #ifdef INVARIANTS
549         if (object->ref_count == 0) {
550                 vprint("vm_object_vndeallocate", vp);
551                 panic("vm_object_vndeallocate: bad object reference count");
552         }
553 #endif
554         for (;;) {
555                 int count = object->ref_count;
556                 cpu_ccfence();
557                 if (count == 1) {
558                         vm_object_upgrade(object);
559                         if (atomic_cmpset_int(&object->ref_count, count, 0)) {
560                                 vclrflags(vp, VTEXT);
561                                 break;
562                         }
563                 } else {
564                         if (atomic_cmpset_int(&object->ref_count,
565                                               count, count - 1)) {
566                                 break;
567                         }
568                 }
569                 /* retry */
570         }
571 #if defined(DEBUG_LOCKS)
572         debugvm_object_add(object, file, line, -1);
573 #endif
574
575         /*
576          * vrele or return the vp to vrele.  We can only safely vrele(vp)
577          * if the object was locked exclusively.  But there are two races
578          * here.
579          *
580          * We had to upgrade the object above to safely clear VTEXT
581          * but the alternative path where the shared lock is retained
582          * can STILL race to 0 in other paths and cause our own vrele()
583          * to terminate the vnode.  We can't allow that if the VM object
584          * is still locked shared.
585          */
586         if (vpp)
587                 *vpp = vp;
588         else
589                 vrele(vp);
590 }
591
592 /*
593  * Release a reference to the specified object, gained either through a
594  * vm_object_allocate or a vm_object_reference call.  When all references
595  * are gone, storage associated with this object may be relinquished.
596  *
597  * The caller does not have to hold the object locked but must have control
598  * over the reference in question in order to guarantee that the object
599  * does not get ripped out from under us.
600  *
601  * XXX Currently all deallocations require an exclusive lock.
602  */
603 void
604 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
605 {
606         struct vnode *vp;
607         int count;
608
609         if (object == NULL)
610                 return;
611
612         for (;;) {
613                 count = object->ref_count;
614                 cpu_ccfence();
615
616                 /*
617                  * If decrementing the count enters into special handling
618                  * territory (0, 1, or 2) we have to do it the hard way.
619                  * Fortunate though, objects with only a few refs like this
620                  * are not likely to be heavily contended anyway.
621                  *
622                  * For vnode objects we only care about 1->0 transitions.
623                  */
624                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
625 #if defined(DEBUG_LOCKS)
626                         debugvm_object_add(object, file, line, 0);
627 #endif
628                         vm_object_hold(object);
629                         vm_object_deallocate_locked(object);
630                         vm_object_drop(object);
631                         break;
632                 }
633
634                 /*
635                  * Try to decrement ref_count without acquiring a hold on
636                  * the object.  This is particularly important for the exec*()
637                  * and exit*() code paths because the program binary may
638                  * have a great deal of sharing and an exclusive lock will
639                  * crowbar performance in those circumstances.
640                  */
641                 if (object->type == OBJT_VNODE) {
642                         vp = (struct vnode *)object->handle;
643                         if (atomic_cmpset_int(&object->ref_count,
644                                               count, count - 1)) {
645 #if defined(DEBUG_LOCKS)
646                                 debugvm_object_add(object, file, line, -1);
647 #endif
648
649                                 vrele(vp);
650                                 break;
651                         }
652                         /* retry */
653                 } else {
654                         if (atomic_cmpset_int(&object->ref_count,
655                                               count, count - 1)) {
656 #if defined(DEBUG_LOCKS)
657                                 debugvm_object_add(object, file, line, -1);
658 #endif
659                                 break;
660                         }
661                         /* retry */
662                 }
663                 /* retry */
664         }
665 }
666
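/*
 * Illustrative sketch (exposition only, compiled out): taking and later
 * releasing an extra reference.  vm_object_reference_locked() requires
 * the object to be held (a shared hold suffices, per its comment above),
 * while vm_object_deallocate() only requires that the caller control the
 * reference being released.  The helper name is hypothetical.
 */
#if 0
static void
example_object_ref_cycle(vm_object_t obj)
{
	vm_object_hold_shared(obj);
	vm_object_reference_locked(obj);
	vm_object_drop(obj);

	/* ... use the reference ... */

	vm_object_deallocate(obj);	/* may terminate on the 1->0 transition */
}
#endif
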
667 void
668 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
669 {
670         /*
671          * Degenerate case
672          */
673         if (object == NULL)
674                 return;
675
676         /*
677          * vnode case, caller either locked the object exclusively
678          * or this is a recursion with must_drop != 0 and the vnode
679          * object will be locked shared.
680          *
681          * If locked shared we have to drop the object before we can
682          * call vrele() or risk a shared/exclusive livelock.
683          */
684         if (object->type == OBJT_VNODE) {
685                 ASSERT_LWKT_TOKEN_HELD(&object->token);
686                 vm_object_vndeallocate(object, NULL);
687                 return;
688         }
689         ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
690
691         /*
692          * Normal case (object is locked exclusively)
693          */
694         if (object->ref_count == 0) {
695                 panic("vm_object_deallocate: object deallocated "
696                       "too many times: %d", object->type);
697         }
698         if (object->ref_count > 2) {
699                 atomic_add_int(&object->ref_count, -1);
700 #if defined(DEBUG_LOCKS)
701                 debugvm_object_add(object, file, line, -1);
702 #endif
703                 return;
704         }
705
706         /*
707          * Drop the ref and handle termination on the 1->0 transition.
708          * We may have blocked above so we have to recheck.
709          */
710         KKASSERT(object->ref_count != 0);
711         if (object->ref_count >= 2) {
712                 atomic_add_int(&object->ref_count, -1);
713 #if defined(DEBUG_LOCKS)
714                 debugvm_object_add(object, file, line, -1);
715 #endif
716                 return;
717         }
718
719         atomic_add_int(&object->ref_count, -1);
720         if ((object->flags & OBJ_DEAD) == 0)
721                 vm_object_terminate(object);
722 }
723
724 /*
725  * Destroy the specified object, freeing up related resources.
726  *
727  * The object must have zero references.
728  *
729  * The object must be held.  The caller is responsible for dropping the object
730  * after terminate returns.  Terminate does NOT drop the object.
731  */
732 static int vm_object_terminate_callback(vm_page_t p, void *data);
733
734 void
735 vm_object_terminate(vm_object_t object)
736 {
737         struct rb_vm_page_scan_info info;
738         struct vm_object_hash *hash;
739
740         /*
741          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
742          * able to safely block.
743          */
744         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
745         KKASSERT((object->flags & OBJ_DEAD) == 0);
746         vm_object_set_flag(object, OBJ_DEAD);
747
748         /*
749          * Wait for the pageout daemon to be done with the object
750          */
751         vm_object_pip_wait(object, "objtrm1");
752
753         KASSERT(!object->paging_in_progress,
754                 ("vm_object_terminate: pageout in progress"));
755
756         /*
757          * Clean and free the pages, as appropriate. All references to the
758          * object are gone, so we don't need to lock it.
759          */
760         if (object->type == OBJT_VNODE) {
761                 struct vnode *vp;
762
763                 /*
764                  * Clean pages and flush buffers.
765                  *
766                  * NOTE!  TMPFS buffer flushes do not typically flush the
767                  *        actual page to swap as this would be highly
768                  *        inefficient, and normal filesystems usually wrap
769                  *        page flushes with buffer cache buffers.
770                  *
771                  *        To deal with this we have to call vinvalbuf() both
772                  *        before and after the vm_object_page_clean().
773                  */
774                 vp = (struct vnode *) object->handle;
775                 vinvalbuf(vp, V_SAVE, 0, 0);
776                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
777                 vinvalbuf(vp, V_SAVE, 0, 0);
778         }
779
780         /*
781          * Wait for any I/O to complete, after which there had better not
782          * be any references left on the object.
783          */
784         vm_object_pip_wait(object, "objtrm2");
785
786         if (object->ref_count != 0) {
787                 panic("vm_object_terminate: object with references, "
788                       "ref_count=%d", object->ref_count);
789         }
790
791         /*
792          * Cleanup any shared pmaps associated with this object.
793          */
794         pmap_object_free(object);
795
796         /*
797          * Now free any remaining pages. For internal objects, this also
798          * removes them from paging queues. Don't free wired pages, just
799          * remove them from the object. 
800          */
801         info.count = 0;
802         info.object = object;
803         do {
804                 info.error = 0;
805                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
806                                         vm_object_terminate_callback, &info);
807         } while (info.error);
808
809         /*
810          * Let the pager know object is dead.
811          */
812         vm_pager_deallocate(object);
813
814         /*
815          * Wait for the object hold count to hit 1, clean out pages as
816          * we go.  vmobj_token interlocks any race conditions that might
817          * pick the object up from the vm_object_list after we have cleared
818          * rb_memq.
819          */
820         for (;;) {
821                 if (RB_ROOT(&object->rb_memq) == NULL)
822                         break;
823                 kprintf("vm_object_terminate: Warning, object %p "
824                         "still has %ld pages\n",
825                         object, object->resident_page_count);
826                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
827                                         vm_object_terminate_callback, &info);
828         }
829
830         /*
831          * There had better not be any pages left
832          */
833         KKASSERT(object->resident_page_count == 0);
834
835         /*
836          * Remove the object from the global object list.
837          */
838         hash = vmobj_hash(object);
839         lwkt_gettoken(&hash->token);
840         TAILQ_REMOVE(&hash->list, object, object_entry);
841         lwkt_reltoken(&hash->token);
842
843         if (object->ref_count != 0) {
844                 panic("vm_object_terminate2: object with references, "
845                       "ref_count=%d", object->ref_count);
846         }
847
848         /*
849          * NOTE: The object hold_count is at least 1, so we cannot kfree()
850          *       the object here.  See vm_object_drop().
851          */
852 }
853
854 /*
855  * The caller must hold the object.
856  */
857 static int
858 vm_object_terminate_callback(vm_page_t p, void *data)
859 {
860         struct rb_vm_page_scan_info *info = data;
861         vm_object_t object;
862
863         object = p->object;
864         KKASSERT(object == info->object);
865         if (vm_page_busy_try(p, TRUE)) {
866                 vm_page_sleep_busy(p, TRUE, "vmotrm");
867                 info->error = 1;
868                 return 0;
869         }
870         if (object != p->object) {
871                 /* XXX remove once we determine it can't happen */
872                 kprintf("vm_object_terminate: Warning: Encountered "
873                         "busied page %p on queue %d\n", p, p->queue);
874                 vm_page_wakeup(p);
875                 info->error = 1;
876         } else if (p->wire_count == 0) {
877                 /*
878                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
879                  */
880                 vm_page_free(p);
881                 mycpu->gd_cnt.v_pfree++;
882         } else {
883                 if (p->queue != PQ_NONE) {
884                         kprintf("vm_object_terminate: Warning: Encountered "
885                                 "wired page %p on queue %d\n", p, p->queue);
886                         if (vm_object_debug > 0) {
887                                 --vm_object_debug;
888                                 print_backtrace(10);
889                         }
890                 }
891                 vm_page_remove(p);
892                 vm_page_wakeup(p);
893         }
894
895         /*
896          * Must be at end to avoid SMP races, caller holds object token
897          */
898         if ((++info->count & 63) == 0)
899                 lwkt_user_yield();
900         return(0);
901 }
902
903 /*
904  * Clean all dirty pages in the specified range of object.  Leaves page
905  * on whatever queue it is currently on.   If NOSYNC is set then do not
906  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
907  * leaving the object dirty.
908  *
909  * When stuffing pages asynchronously, allow clustering.  XXX we need a
910  * synchronous clustering mode implementation.
911  *
912  * Odd semantics: if start == end, we clean everything.
913  *
914  * The object must be locked? XXX
915  */
916 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
917 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
918
919 void
920 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
921                      int flags)
922 {
923         struct rb_vm_page_scan_info info;
924         struct vnode *vp;
925         int wholescan;
926         int pagerflags;
927         int generation;
928
929         vm_object_hold(object);
930         if (object->type != OBJT_VNODE ||
931             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
932                 vm_object_drop(object);
933                 return;
934         }
935
936         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 
937                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
938         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
939
940         vp = object->handle;
941
942         /*
943          * Interlock other major object operations.  This allows us to 
944          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
945          */
946         vm_object_set_flag(object, OBJ_CLEANING);
947
948         /*
949          * Handle 'entire object' case
950          */
951         info.start_pindex = start;
952         if (end == 0) {
953                 info.end_pindex = object->size - 1;
954         } else {
955                 info.end_pindex = end - 1;
956         }
957         wholescan = (start == 0 && info.end_pindex == object->size - 1);
958         info.limit = flags;
959         info.pagerflags = pagerflags;
960         info.object = object;
961
962         /*
963          * If cleaning the entire object do a pass to mark the pages read-only.
964          * If everything worked out ok, clear OBJ_WRITEABLE and
965          * OBJ_MIGHTBEDIRTY.
966          */
967         if (wholescan) {
968                 info.error = 0;
969                 info.count = 0;
970                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
971                                         vm_object_page_clean_pass1, &info);
972                 if (info.error == 0) {
973                         vm_object_clear_flag(object,
974                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
975                         if (object->type == OBJT_VNODE &&
976                             (vp = (struct vnode *)object->handle) != NULL) {
977                                 /*
978                                  * Use new-style interface to clear VISDIRTY
979                                  * because the vnode is not necessarily removed
980                                  * from the syncer list(s) as often as it was
981                                  * under the old interface, which can leave
982                                  * the vnode on the syncer list after reclaim.
983                                  */
984                                 vclrobjdirty(vp);
985                         }
986                 }
987         }
988
989         /*
990          * Do a pass to clean all the dirty pages we find.
991          */
992         do {
993                 info.error = 0;
994                 info.count = 0;
995                 generation = object->generation;
996                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
997                                         vm_object_page_clean_pass2, &info);
998         } while (info.error || generation != object->generation);
999
1000         vm_object_clear_flag(object, OBJ_CLEANING);
1001         vm_object_drop(object);
1002 }
1003
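/*
 * Illustrative sketch (exposition only, compiled out): synchronously
 * flushing every dirty page of a vnode object, mirroring the call made
 * from vm_object_terminate() above.  Per the start == end convention
 * noted above, (0, 0) covers the whole object.  The vp->v_object
 * back-pointer and the helper name are assumptions for illustration.
 */
#if 0
static void
example_flush_vnode_object(struct vnode *vp)
{
	vm_object_t obj = vp->v_object;

	if (obj != NULL)
		vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
}
#endif
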
1004 /*
1005  * The caller must hold the object.
1006  */
1007 static 
1008 int
1009 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1010 {
1011         struct rb_vm_page_scan_info *info = data;
1012
1013         KKASSERT(p->object == info->object);
1014
1015         vm_page_flag_set(p, PG_CLEANCHK);
1016         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1017                 info->error = 1;
1018         } else if (vm_page_busy_try(p, FALSE)) {
1019                 info->error = 1;
1020         } else {
1021                 KKASSERT(p->object == info->object);
1022                 vm_page_protect(p, VM_PROT_READ);
1023                 vm_page_wakeup(p);
1024         }
1025
1026         /*
1027          * Must be at end to avoid SMP races, caller holds object token
1028          */
1029         if ((++info->count & 63) == 0)
1030                 lwkt_user_yield();
1031         return(0);
1032 }
1033
1034 /*
1035  * The caller must hold the object
1036  */
1037 static 
1038 int
1039 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1040 {
1041         struct rb_vm_page_scan_info *info = data;
1042         int generation;
1043
1044         KKASSERT(p->object == info->object);
1045
1046         /*
1047          * Do not mess with pages that were inserted after we started
1048          * the cleaning pass.
1049          */
1050         if ((p->flags & PG_CLEANCHK) == 0)
1051                 goto done;
1052
1053         generation = info->object->generation;
1054
1055         if (vm_page_busy_try(p, TRUE)) {
1056                 vm_page_sleep_busy(p, TRUE, "vpcwai");
1057                 info->error = 1;
1058                 goto done;
1059         }
1060
1061         KKASSERT(p->object == info->object &&
1062                  info->object->generation == generation);
1063
1064         /*
1065          * Before wasting time traversing the pmaps, check for trivial
1066          * cases where the page cannot be dirty.
1067          */
1068         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1069                 KKASSERT((p->dirty & p->valid) == 0 &&
1070                          (p->flags & PG_NEED_COMMIT) == 0);
1071                 vm_page_wakeup(p);
1072                 goto done;
1073         }
1074
1075         /*
1076          * Check whether the page is dirty or not.  The page has been set
1077          * to be read-only so the check will not race a user dirtying the
1078          * page.
1079          */
1080         vm_page_test_dirty(p);
1081         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1082                 vm_page_flag_clear(p, PG_CLEANCHK);
1083                 vm_page_wakeup(p);
1084                 goto done;
1085         }
1086
1087         /*
1088          * If we have been asked to skip nosync pages and this is a
1089          * nosync page, skip it.  Note that the object flags were
1090          * not cleared in this case (because pass1 will have returned an
1091          * error), so we do not have to set them.
1092          */
1093         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1094                 vm_page_flag_clear(p, PG_CLEANCHK);
1095                 vm_page_wakeup(p);
1096                 goto done;
1097         }
1098
1099         /*
1100          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1101          * the pages that get successfully flushed.  Set info->error if
1102          * we raced an object modification.
1103          */
1104         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1105         /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1106
1107         /*
1108          * Must be at end to avoid SMP races, caller holds object token
1109          */
1110 done:
1111         if ((++info->count & 63) == 0)
1112                 lwkt_user_yield();
1113         return(0);
1114 }
1115
1116 /*
1117  * Collect the specified page and nearby pages and flush them out.
1118  * The number of pages flushed is returned.  The passed page is busied
1119  * by the caller and we are responsible for its disposition.
1120  *
1121  * The caller must hold the object.
1122  */
1123 static void
1124 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1125 {
1126         int error;
1127         int is;
1128         int ib;
1129         int i;
1130         int page_base;
1131         vm_pindex_t pi;
1132         vm_page_t ma[BLIST_MAX_ALLOC];
1133
1134         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1135
1136         pi = p->pindex;
1137         page_base = pi % BLIST_MAX_ALLOC;
1138         ma[page_base] = p;
1139         ib = page_base - 1;
1140         is = page_base + 1;
1141
1142         while (ib >= 0) {
1143                 vm_page_t tp;
1144
1145                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1146                                              TRUE, &error);
1147                 if (error)
1148                         break;
1149                 if (tp == NULL)
1150                         break;
1151                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1152                     (tp->flags & PG_CLEANCHK) == 0) {
1153                         vm_page_wakeup(tp);
1154                         break;
1155                 }
1156                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1157                         vm_page_flag_clear(tp, PG_CLEANCHK);
1158                         vm_page_wakeup(tp);
1159                         break;
1160                 }
1161                 vm_page_test_dirty(tp);
1162                 if ((tp->dirty & tp->valid) == 0 &&
1163                     (tp->flags & PG_NEED_COMMIT) == 0) {
1164                         vm_page_flag_clear(tp, PG_CLEANCHK);
1165                         vm_page_wakeup(tp);
1166                         break;
1167                 }
1168                 ma[ib] = tp;
1169                 --ib;
1170         }
1171         ++ib;   /* fixup */
1172
1173         while (is < BLIST_MAX_ALLOC &&
1174                pi - page_base + is < object->size) {
1175                 vm_page_t tp;
1176
1177                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1178                                              TRUE, &error);
1179                 if (error)
1180                         break;
1181                 if (tp == NULL)
1182                         break;
1183                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1184                     (tp->flags & PG_CLEANCHK) == 0) {
1185                         vm_page_wakeup(tp);
1186                         break;
1187                 }
1188                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1189                         vm_page_flag_clear(tp, PG_CLEANCHK);
1190                         vm_page_wakeup(tp);
1191                         break;
1192                 }
1193                 vm_page_test_dirty(tp);
1194                 if ((tp->dirty & tp->valid) == 0 &&
1195                     (tp->flags & PG_NEED_COMMIT) == 0) {
1196                         vm_page_flag_clear(tp, PG_CLEANCHK);
1197                         vm_page_wakeup(tp);
1198                         break;
1199                 }
1200                 ma[is] = tp;
1201                 ++is;
1202         }
1203
1204         /*
1205          * All pages in the ma[] array are busied now
1206          */
1207         for (i = ib; i < is; ++i) {
1208                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1209                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1210         }
1211         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1212         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1213                 vm_page_unhold(ma[i]);
1214 }
1215
1216 /*
1217  * Implements the madvise function at the object/page level.
1218  *
1219  * MADV_WILLNEED        (any object)
1220  *
1221  *      Activate the specified pages if they are resident.
1222  *
1223  * MADV_DONTNEED        (any object)
1224  *
1225  *      Deactivate the specified pages if they are resident.
1226  *
1227  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1228  *
1229  *      Deactivate and clean the specified pages if they are
1230  *      resident.  This permits the process to reuse the pages
1231  *      without faulting or the kernel to reclaim the pages
1232  *      without I/O.
1233  *
1234  * No requirements.
1235  */
1236 void
1237 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1238                   vm_pindex_t count, int advise)
1239 {
1240         vm_pindex_t end;
1241         vm_page_t m;
1242         int error;
1243
1244         if (object == NULL)
1245                 return;
1246
1247         end = pindex + count;
1248
1249         vm_object_hold(object);
1250
1251         /*
1252          * Locate and adjust resident pages.  This only applies to the
1253          * primary object in the mapping.
1254          */
1255         for (; pindex < end; pindex += 1) {
1256 relookup:
1257                 /*
1258                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1259                  * and those pages must be OBJ_ONEMAPPING.
1260                  */
1261                 if (advise == MADV_FREE) {
1262                         if ((object->type != OBJT_DEFAULT &&
1263                              object->type != OBJT_SWAP) ||
1264                             (object->flags & OBJ_ONEMAPPING) == 0) {
1265                                 continue;
1266                         }
1267                 }
1268
1269                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1270
1271                 if (error) {
1272                         vm_page_sleep_busy(m, TRUE, "madvpo");
1273                         goto relookup;
1274                 }
1275                 if (m == NULL) {
1276                         /*
1277                          * There may be swap even if there is no backing page
1278                          */
1279                         if (advise == MADV_FREE && object->type == OBJT_SWAP)
1280                                 swap_pager_freespace(object, pindex, 1);
1281                         continue;
1282                 }
1283
1284                 /*
1285                  * If the page is not in a normal active state, we skip it.
1286                  * If the page is not managed there are no page queues to
1287                  * mess with.  Things can break if we mess with pages in
1288                  * any of the below states.
1289                  */
1290                 if (m->wire_count ||
1291                     (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1292                                  PG_NEED_COMMIT)) ||
1293                     m->valid != VM_PAGE_BITS_ALL
1294                 ) {
1295                         vm_page_wakeup(m);
1296                         continue;
1297                 }
1298
1299                 /*
1300                  * Theoretically once a page is known not to be busy, an
1301                  * interrupt cannot come along and rip it out from under us.
1302                  */
1303                 if (advise == MADV_WILLNEED) {
1304                         vm_page_activate(m);
1305                 } else if (advise == MADV_DONTNEED) {
1306                         vm_page_dontneed(m);
1307                 } else if (advise == MADV_FREE) {
1308                         /*
1309                          * Mark the page clean.  This will allow the page
1310                          * to be freed up by the system.  However, such pages
1311                          * are often reused quickly by malloc()/free()
1312                          * so we do not do anything that would cause
1313                          * a page fault if we can help it.
1314                          *
1315                          * Specifically, we do not try to actually free
1316                          * the page now nor do we try to put it in the
1317                          * cache (which would cause a page fault on reuse).
1318                          *
1319                          * But we do make the page as freeable as we
1320                          * can without actually taking the step of unmapping
1321                          * it.
1322                          */
1323                         pmap_clear_modify(m);
1324                         m->dirty = 0;
1325                         m->act_count = 0;
1326                         vm_page_dontneed(m);
1327                         if (object->type == OBJT_SWAP)
1328                                 swap_pager_freespace(object, pindex, 1);
1329                 }
1330                 vm_page_wakeup(m);
1331         }       
1332         vm_object_drop(object);
1333 }
1334
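/*
 * Illustrative sketch (exposition only, compiled out): applying the
 * three advice types documented above to a single page of an object.
 * The object, page index and helper name are hypothetical.
 */
#if 0
static void
example_madvise_one_page(vm_object_t obj, vm_pindex_t pindex)
{
	vm_object_madvise(obj, pindex, 1, MADV_WILLNEED); /* activate if resident */
	vm_object_madvise(obj, pindex, 1, MADV_DONTNEED); /* deactivate */
	vm_object_madvise(obj, pindex, 1, MADV_FREE);	  /* mark clean and freeable */
}
#endif
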
1335 /*
1336  * Removes all physical pages in the specified object range from the
1337  * object's list of pages.
1338  *
1339  * No requirements.
1340  */
1341 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1342
1343 void
1344 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1345                       boolean_t clean_only)
1346 {
1347         struct rb_vm_page_scan_info info;
1348         int all;
1349
1350         /*
1351          * Degenerate cases and assertions
1352          */
1353         vm_object_hold(object);
1354         if (object == NULL ||
1355             (object->resident_page_count == 0 && object->swblock_count == 0)) {
1356                 vm_object_drop(object);
1357                 return;
1358         }
1359         KASSERT(object->type != OBJT_PHYS,
1360                 ("attempt to remove pages from a physical object"));
1361
1362         /*
1363          * Indicate that paging is occurring on the object
1364          */
1365         vm_object_pip_add(object, 1);
1366
1367         /*
1368          * Figure out the actual removal range and whether we are removing
1369          * the entire contents of the object or not.  If removing the entire
1370          * contents, be sure to get all pages, even those that might be 
1371          * beyond the end of the object.
1372          */
1373         info.object = object;
1374         info.start_pindex = start;
1375         if (end == 0)
1376                 info.end_pindex = (vm_pindex_t)-1;
1377         else
1378                 info.end_pindex = end - 1;
1379         info.limit = clean_only;
1380         info.count = 0;
1381         all = (start == 0 && info.end_pindex >= object->size - 1);
1382
1383         /*
1384          * Loop until we are sure we have gotten them all.
1385          */
1386         do {
1387                 info.error = 0;
1388                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1389                                         vm_object_page_remove_callback, &info);
1390         } while (info.error);
1391
1392         /*
1393          * Remove any related swap if throwing away pages, or for
1394          * non-swap objects (the swap is a clean copy in that case).
1395          */
1396         if (object->type != OBJT_SWAP || clean_only == FALSE) {
1397                 if (all)
1398                         swap_pager_freespace_all(object);
1399                 else
1400                         swap_pager_freespace(object, info.start_pindex,
1401                              info.end_pindex - info.start_pindex + 1);
1402         }
1403
1404         /*
1405          * Cleanup
1406          */
1407         vm_object_pip_wakeup(object);
1408         vm_object_drop(object);
1409 }
1410
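/*
 * Illustrative sketch (exposition only, compiled out): discarding all
 * resident pages at or beyond a new object size, as a truncation path
 * might.  end == 0 means "to the end of the object", matching the
 * convention handled above; clean_only == FALSE throws the pages away
 * even if dirty.  The helper name and 'newpages' are hypothetical.
 */
#if 0
static void
example_truncate_pages(vm_object_t obj, vm_pindex_t newpages)
{
	vm_object_page_remove(obj, newpages, 0, FALSE);
}
#endif
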
1411 /*
1412  * The caller must hold the object.
1413  *
1414  * NOTE: User yields are allowed when removing more than one page, but not
1415  *       allowed if only removing one page (the path for single page removals
1416  *       might hold a spinlock).
1417  */
1418 static int
1419 vm_object_page_remove_callback(vm_page_t p, void *data)
1420 {
1421         struct rb_vm_page_scan_info *info = data;
1422
1423         if (info->object != p->object ||
1424             p->pindex < info->start_pindex ||
1425             p->pindex > info->end_pindex) {
1426                 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1427                         info->object, p);
1428                 return(0);
1429         }
1430         if (vm_page_busy_try(p, TRUE)) {
1431                 vm_page_sleep_busy(p, TRUE, "vmopar");
1432                 info->error = 1;
1433                 return(0);
1434         }
1435         if (info->object != p->object) {
1436                 /* this should never happen */
1437                 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1438                         info->object, p);
1439                 vm_page_wakeup(p);
1440                 return(0);
1441         }
1442
1443         /*
1444          * Wired pages cannot be destroyed, but they can be invalidated
1445          * and we do so if clean_only (limit) is not set.
1446          *
1447          * WARNING!  The page may be wired due to being part of a buffer
1448          *           cache buffer, and the buffer might be marked B_CACHE.
1449          *           This is fine as part of a truncation but VFSs must be
1450          *           sure to fix the buffer up when re-extending the file.
1451          *
1452          * NOTE!     PG_NEED_COMMIT is ignored.
1453          */
1454         if (p->wire_count != 0) {
1455                 vm_page_protect(p, VM_PROT_NONE);
1456                 if (info->limit == 0)
1457                         p->valid = 0;
1458                 vm_page_wakeup(p);
1459                 goto done;
1460         }
1461
1462         /*
1463          * limit is our clean_only flag.  If set and the page is dirty or
1464          * requires a commit, do not free it.  If set and the page is being
1465          * held by someone, do not free it.
1466          */
1467         if (info->limit && p->valid) {
1468                 vm_page_test_dirty(p);
1469                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1470                         vm_page_wakeup(p);
1471                         goto done;
1472                 }
1473         }
1474
1475         /*
1476          * Destroy the page.  But we have to re-test whether it's dirty after
1477          * removing it from its pmaps.
1478          */
1479         vm_page_protect(p, VM_PROT_NONE);
1480         if (info->limit && p->valid) {
1481                 vm_page_test_dirty(p);
1482                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1483                         vm_page_wakeup(p);
1484                         goto done;
1485                 }
1486         }
1487         vm_page_free(p);
1488
1489         /*
1490          * Must be at end to avoid SMP races, caller holds object token
1491          */
1492 done:
1493         if ((++info->count & 63) == 0)
1494                 lwkt_user_yield();
1495
1496         return(0);
1497 }
1498
1499 /*
1500  * Try to extend prev_object into an adjoining region of virtual
1501  * memory, return TRUE on success.
1502  *
1503  * The caller does not need to hold (prev_object) but must have a stable
1504  * pointer to it (typically by holding the vm_map locked).
1505  *
1506  * This function only works for anonymous memory objects which either
1507  * have (a) one reference or (b) we are extending the object's size.
1508  * Otherwise the related VM pages we want to use for the object might
1509  * be in use by another mapping.
1510  */
1511 boolean_t
1512 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1513                    vm_size_t prev_size, vm_size_t next_size)
1514 {
1515         vm_pindex_t next_pindex;
1516
1517         if (prev_object == NULL)
1518                 return (TRUE);
1519
1520         vm_object_hold(prev_object);
1521
1522         if (prev_object->type != OBJT_DEFAULT &&
1523             prev_object->type != OBJT_SWAP) {
1524                 vm_object_drop(prev_object);
1525                 return (FALSE);
1526         }
1527
1528 #if 0
1529         /* caller now checks this */
1530         /*
1531          * Try to collapse the object first
1532          */
1533         vm_object_collapse(prev_object, NULL);
1534 #endif
1535
1536 #if 0
1537         /* caller now checks this */
1538         /*
1539          * We can't coalesce if we shadow another object (figuring out the
1540  * relationships becomes too complex).
1541          */
1542         if (prev_object->backing_object != NULL) {
1543                 vm_object_chain_release(prev_object);
1544                 vm_object_drop(prev_object);
1545                 return (FALSE);
1546         }
1547 #endif
1548
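        /*
         * Convert the byte sizes to page counts; next_pindex becomes the
         * first page index of the region being appended.
         */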
1549         prev_size >>= PAGE_SHIFT;
1550         next_size >>= PAGE_SHIFT;
1551         next_pindex = prev_pindex + prev_size;
1552
1553         /*
1554          * We can't coalesce if the object has more than one reference unless we
1555          * are extending it into newly minted space.
1556          */
1557         if (prev_object->ref_count > 1 &&
1558             prev_object->size != next_pindex) {
1559                 vm_object_drop(prev_object);
1560                 return (FALSE);
1561         }
1562
1563         /*
1564          * Remove any pages that may still be in the object from a previous
1565          * deallocation.
1566          */
1567         if (next_pindex < prev_object->size) {
1568                 vm_object_page_remove(prev_object,
1569                                       next_pindex,
1570                                       next_pindex + next_size, FALSE);
1571                 if (prev_object->type == OBJT_SWAP)
1572                         swap_pager_freespace(prev_object,
1573                                              next_pindex, next_size);
1574         }
1575
1576         /*
1577          * Extend the object if necessary.
1578          */
1579         if (next_pindex + next_size > prev_object->size)
1580                 prev_object->size = next_pindex + next_size;
1581         vm_object_drop(prev_object);
1582
1583         return (TRUE);
1584 }
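
/*
 * Illustrative sketch only, kept under "#if 0": roughly how a map-growth
 * path could use vm_object_coalesce().  The helper name
 * example_extend_or_allocate and its parameters are assumptions for
 * illustration, not an existing API; sizes are page-aligned byte counts
 * as in the function above.
 */
#if 0
static vm_object_t
example_extend_or_allocate(vm_object_t prev_object, vm_pindex_t prev_pindex,
                           vm_size_t prev_bytes, vm_size_t grow_bytes)
{
        /*
         * Reuse the previous anonymous object when the new range can be
         * folded into it, otherwise allocate a fresh OBJT_DEFAULT object
         * sized for just the new range.
         */
        if (vm_object_coalesce(prev_object, prev_pindex,
                               prev_bytes, grow_bytes))
                return (prev_object);
        return (vm_object_allocate(OBJT_DEFAULT, grow_bytes >> PAGE_SHIFT));
}
#endif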
1585
1586 /*
1587  * Make the object writable and flag it as being possibly dirty.
1588  *
1589  * The object might not be held (or might be held but held shared),
1590  * the related vnode is probably not held either.  Object and vnode are
1591  * stable by virtue of the vm_page busied by the caller preventing
1592  * destruction.
1593  *
1594  * If the related mount is flagged MNTK_THR_SYNC we need to call
1595  * vsetobjdirty().  Filesystems using this option usually shortcut
1596  * synchronization by only scanning the syncer list.
1597  */
1598 void
1599 vm_object_set_writeable_dirty(vm_object_t object)
1600 {
1601         struct vnode *vp;
1602
1603         /*vm_object_assert_held(object);*/
1604         /*
1605          * Avoid contention in vm fault path by checking the state before
1606          * issuing an atomic op on it.
1607          */
1608         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1609             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1610                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1611         }
1612         if (object->type == OBJT_VNODE &&
1613             (vp = (struct vnode *)object->handle) != NULL) {
1614                 if ((vp->v_flag & VOBJDIRTY) == 0) {
1615                         if (vp->v_mount &&
1616                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1617                                 /*
1618                                  * New style THR_SYNC places vnodes on the
1619                                  * syncer list more deterministically.
1620                                  */
1621                                 vsetobjdirty(vp);
1622                         } else {
1623                                 /*
1624                                  * Old style scan would not necessarily place
1625                                  * a vnode on the syncer list when possibly
1626                                  * modified via mmap.
1627                                  */
1628                                 vsetflags(vp, VOBJDIRTY);
1629                         }
1630                 }
1631         }
1632 }
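
/*
 * Illustrative sketch only, kept under "#if 0": the typical shape of a
 * caller, e.g. a write-fault style path that has the page busied, which
 * is what keeps the object and vnode stable for the function above.
 * The helper name example_note_page_written is made up.
 */
#if 0
static void
example_note_page_written(vm_page_t m)
{
        vm_page_dirty(m);
        if (m->object != NULL)
                vm_object_set_writeable_dirty(m->object);
}
#endif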
1633
1634 #include "opt_ddb.h"
1635 #ifdef DDB
1636 #include <sys/cons.h>
1637
1638 #include <ddb/ddb.h>
1639
1640 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
1641                                        vm_map_entry_t entry);
1642 static int      vm_object_in_map (vm_object_t object);
1643
1644 /*
1645  * The caller must hold the object.
1646  */
1647 static int
1648 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1649 {
1650         vm_map_backing_t ba;
1651         vm_map_t tmpm;
1652         vm_map_entry_t tmpe;
1653         int entcount;
1654
1655         if (map == NULL)
1656                 return 0;
1657         if (entry == NULL) {
1658                 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1659                 entcount = map->nentries;
1660                 while (entcount-- && tmpe) {
1661                         if (_vm_object_in_map(map, object, tmpe)) {
1662                                 return 1;
1663                         }
1664                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1665                 }
1666                 return (0);
1667         }
1668         switch (entry->maptype) {
1669         case VM_MAPTYPE_SUBMAP:
1670                 tmpm = entry->ba.sub_map;
1671                 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1672                 entcount = tmpm->nentries;
1673                 while (entcount-- && tmpe) {
1674                         if (_vm_object_in_map(tmpm, object, tmpe)) {
1675                                 return 1;
1676                         }
1677                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1678                 }
1679                 break;
1680         case VM_MAPTYPE_NORMAL:
1681         case VM_MAPTYPE_VPAGETABLE:
1682                 ba = &entry->ba;
1683                 while (ba) {
1684                         if (ba->object == object)
1685                                 return TRUE;
1686                         ba = ba->backing_ba;
1687                 }
1688                 break;
1689         default:
1690                 break;
1691         }
1692         return 0;
1693 }
1694
1695 static int vm_object_in_map_callback(struct proc *p, void *data);
1696
1697 struct vm_object_in_map_info {
1698         vm_object_t object;
1699         int rv;
1700 };
1701
1702 /*
1703  * Debugging only
1704  */
1705 static int
1706 vm_object_in_map(vm_object_t object)
1707 {
1708         struct vm_object_in_map_info info;
1709
1710         info.rv = 0;
1711         info.object = object;
1712
1713         allproc_scan(vm_object_in_map_callback, &info, 0);
1714         if (info.rv)
1715                 return 1;
1716         if (_vm_object_in_map(&kernel_map, object, 0))
1717                 return 1;
1718         if (_vm_object_in_map(&pager_map, object, 0))
1719                 return 1;
1720         if (_vm_object_in_map(&buffer_map, object, 0))
1721                 return 1;
1722         return 0;
1723 }
1724
1725 /*
1726  * Debugging only
1727  */
1728 static int
1729 vm_object_in_map_callback(struct proc *p, void *data)
1730 {
1731         struct vm_object_in_map_info *info = data;
1732
1733         if (p->p_vmspace) {
1734                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1735                         info->rv = 1;
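                        /*
                         * A negative return stops the allproc_scan early;
                         * one match is enough.
                         */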
1736                         return -1;
1737                 }
1738         }
1739         return (0);
1740 }
1741
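/*
 * Debugging only.  Registered as the ddb command "show vmochk": verifies
 * that handle-less anonymous (OBJT_DEFAULT/OBJT_SWAP) objects are present
 * in some map and do not have a zero ref count.
 */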
1742 DB_SHOW_COMMAND(vmochk, vm_object_check)
1743 {
1744         struct vm_object_hash *hash;
1745         vm_object_t object;
1746         int n;
1747
1748         /*
1749          * make sure that internal objs are in a map somewhere
1750          * and none have zero ref counts.
1751          */
1752         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1753                 hash = &vm_object_hash[n];
1754                 for (object = TAILQ_FIRST(&hash->list);
1755                                 object != NULL;
1756                                 object = TAILQ_NEXT(object, object_entry)) {
1757                         if (object->type == OBJT_MARKER)
1758                                 continue;
1759                         if (object->handle != NULL ||
1760                             (object->type != OBJT_DEFAULT &&
1761                              object->type != OBJT_SWAP)) {
1762                                 continue;
1763                         }
1764                         if (object->ref_count == 0) {
1765                                 db_printf("vmochk: internal obj has "
1766                                           "zero ref count, size %ld\n",
1767                                           (long)object->size);
1768                         }
1769                         if (vm_object_in_map(object))
1770                                 continue;
1771                         db_printf("vmochk: internal obj is not in a map: "
1772                                   "ref: %d, size: %lu (0x%lx)\n",
1773                                   object->ref_count, (u_long)object->size,
1774                                   (u_long)object->size);
1775                 }
1776         }
1777 }
1778
1779 /*
1780  * Debugging only
1781  */
1782 DB_SHOW_COMMAND(object, vm_object_print_static)
1783 {
1784         /* XXX convert args. */
1785         vm_object_t object = (vm_object_t)addr;
1786         boolean_t full = have_addr;
1787
1788         vm_page_t p;
1789
1790         /* XXX count is an (unused) arg.  Avoid shadowing it. */
1791 #define count   was_count
1792
1793         int count;
1794
1795         if (object == NULL)
1796                 return;
1797
1798         db_iprintf(
1799             "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1800             object, (int)object->type, (u_long)object->size,
1801             object->resident_page_count, object->ref_count, object->flags);
1802         /*
1803          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1804          */
1805         db_iprintf("\n");
1806
1807         if (!full)
1808                 return;
1809
1810         db_indent += 2;
1811         count = 0;
1812         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1813                 if (count == 0)
1814                         db_iprintf("memory:=");
1815                 else if (count == 6) {
1816                         db_printf("\n");
1817                         db_iprintf(" ...");
1818                         count = 0;
1819                 } else
1820                         db_printf(",");
1821                 count++;
1822
1823                 db_printf("(off=0x%lx,page=0x%lx)",
1824                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1825         }
1826         if (count != 0)
1827                 db_printf("\n");
1828         db_indent -= 2;
1829 }
1830
1831 /* XXX. */
1832 #undef count
1833
1834 /*
1835  * XXX need this non-static entry for calling from vm_map_print.
1836  *
1837  * Debugging only
1838  */
1839 void
1840 vm_object_print(/* db_expr_t */ long addr,
1841                 boolean_t have_addr,
1842                 /* db_expr_t */ long count,
1843                 char *modif)
1844 {
1845         vm_object_print_static(addr, have_addr, count, modif);
1846 }
1847
1848 /*
1849  * Debugging only
1850  */
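/* Registered as the ddb command "show vmopag"; dumps resident page runs. */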
1851 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1852 {
1853         struct vm_object_hash *hash;
1854         vm_object_t object;
1855         int nl = 0;
1856         int c;
1857         int n;
1858
1859         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1860                 hash = &vm_object_hash[n];
1861                 for (object = TAILQ_FIRST(&hash->list);
1862                                 object != NULL;
1863                                 object = TAILQ_NEXT(object, object_entry)) {
1864                         vm_pindex_t idx, fidx;
1865                         vm_pindex_t osize;
1866                         vm_paddr_t pa = -1, padiff;
1867                         int rcount;
1868                         vm_page_t m;
1869
1870                         if (object->type == OBJT_MARKER)
1871                                 continue;
1872                         db_printf("new object: %p\n", (void *)object);
1873                         if (nl > 18) {
1874                                 c = cngetc();
1875                                 if (c != ' ')
1876                                         return;
1877                                 nl = 0;
1878                         }
1879                         nl++;
1880                         rcount = 0;
1881                         fidx = 0;
1882                         osize = object->size;
1883                         if (osize > 128)
1884                                 osize = 128;
1885                         for (idx = 0; idx < osize; idx++) {
1886                                 m = vm_page_lookup(object, idx);
1887                                 if (m == NULL) {
1888                                         if (rcount) {
1889                                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1890                                                         (long)fidx, rcount, (long)pa);
1891                                                 if (nl > 18) {
1892                                                         c = cngetc();
1893                                                         if (c != ' ')
1894                                                                 return;
1895                                                         nl = 0;
1896                                                 }
1897                                                 nl++;
1898                                                 rcount = 0;
1899                                         }
1900                                         continue;
1901                                 }
1902
1903                                 if (rcount &&
1904                                         (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1905                                         ++rcount;
1906                                         continue;
1907                                 }
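                                /*
                                 * The physically contiguous run just broke.
                                 * If the break is an exact multiple of the
                                 * page coloring modulus (PQ_L2_MASK + 1
                                 * pages), fold the page into the run with a
                                 * recomputed base; otherwise print the run
                                 * and the color delta.
                                 */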
1908                                 if (rcount) {
1909                                         padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1910                                         padiff >>= PAGE_SHIFT;
1911                                         padiff &= PQ_L2_MASK;
1912                                         if (padiff == 0) {
1913                                                 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1914                                                 ++rcount;
1915                                                 continue;
1916                                         }
1917                                         db_printf(" index(%ld)run(%d)pa(0x%lx)",
1918                                                 (long)fidx, rcount, (long)pa);
1919                                         db_printf("pd(%ld)\n", (long)padiff);
1920                                         if (nl > 18) {
1921                                                 c = cngetc();
1922                                                 if (c != ' ')
1923                                                         return;
1924                                                 nl = 0;
1925                                         }
1926                                         nl++;
1927                                 }
1928                                 fidx = idx;
1929                                 pa = VM_PAGE_TO_PHYS(m);
1930                                 rcount = 1;
1931                         }
1932                         if (rcount) {
1933                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1934                                         (long)fidx, rcount, (long)pa);
1935                                 if (nl > 18) {
1936                                         c = cngetc();
1937                                         if (c != ' ')
1938                                                 return;
1939                                         nl = 0;
1940                                 }
1941                                 nl++;
1942                         }
1943                 }
1944         }
1945 }
1946 #endif /* DDB */