kernel - VM rework part 3 - Cleanup pass
[dragonfly.git] / sys / vm / vm_object.c
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62
63 /*
64  *      Virtual memory object module.
65  */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>           /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91
92 #include <vm/vm_page2.h>
93
94 #include <machine/specialreg.h>
95
96 #define EASY_SCAN_FACTOR        8
97
98 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
99                                              int pagerflags);
100 static void     vm_object_lock_init(vm_object_t);
101
102 /*
103  *      Virtual memory objects maintain the actual data
104  *      associated with allocated virtual memory.  A given
105  *      page of memory exists within exactly one object.
106  *
107  *      An object is only deallocated when all "references"
108  *      are given up.  Only one "reference" to a given
109  *      region of an object should be writeable.
110  *
111  *      Associated with each object is a list of all resident
112  *      memory pages belonging to that object; this list is
113  *      maintained by the "vm_page" module, and locked by the object's
114  *      lock.
115  *
116  *      Each object also records a "pager" routine which is
117  *      used to retrieve (and store) pages to the proper backing
118  *      storage.  In addition, objects may be backed by other
119  *      objects from which they were virtual-copied.
120  *
121  *      The only items within the object structure which are
122  *      modified after time of creation are:
123  *              reference count         locked by object's lock
124  *              pager routine           locked by object's lock
125  *
126  */
127
128 struct vm_object kernel_object;
129
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
131
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
133
134 #define VMOBJ_HASH_PRIME1       66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2       989042931893ULL
136
137 int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
139
140 static __inline
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
143 {
144         uintptr_t hash1;
145         uintptr_t hash2;
146
147         hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148         hash1 %= VMOBJ_HASH_PRIME1;
149         hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150         hash2 %= VMOBJ_HASH_PRIME2;
151         return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
152 }
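/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * every object lives on exactly one hash bucket chosen by vmobj_hash().
 * Insertions and removals bracket the list operation with that bucket's
 * token, exactly as _vm_object_allocate() and vm_object_terminate() do
 * further below.
 */
#if 0
static void
example_vmobj_hash_insert(vm_object_t obj)
{
        struct vm_object_hash *hash;

        hash = vmobj_hash(obj);
        lwkt_gettoken(&hash->token);
        TAILQ_INSERT_TAIL(&hash->list, obj, object_list);
        lwkt_reltoken(&hash->token);
}
#endif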
153
154 #if defined(DEBUG_LOCKS)
155
156 #define vm_object_vndeallocate(obj, vpp)        \
157                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
158
159 /*
160  * Debug helper to track hold/drop/ref/deallocate calls.
161  */
162 static void
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
164 {
165         int i;
166
167         i = atomic_fetchadd_int(&obj->debug_index, 1);
168         i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169         ksnprintf(obj->debug_hold_thrs[i],
170                   sizeof(obj->debug_hold_thrs[i]),
171                   "%c%d:(%d):%s",
172                   (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173                   (curthread->td_proc ? curthread->td_proc->p_pid : -1),
174                   obj->ref_count,
175                   curthread->td_comm);
176         obj->debug_hold_file[i] = file;
177         obj->debug_hold_line[i] = line;
178 #if 0
179         /* Uncomment for debugging obj refs/derefs in reproducible cases */
180         if (strcmp(curthread->td_comm, "sshd") == 0) {
181                 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182                         (curthread->td_proc ? curthread->td_proc->p_pid : -1),
183                         obj, obj->ref_count, addrem, file, line);
184         }
185 #endif
186 }
187
188 #endif
189
190 /*
191  * Misc low level routines
192  */
193 static void
194 vm_object_lock_init(vm_object_t obj)
195 {
196 #if defined(DEBUG_LOCKS)
197         int i;
198
199         obj->debug_index = 0;
200         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201                 obj->debug_hold_thrs[i][0] = 0;
202                 obj->debug_hold_file[i] = NULL;
203                 obj->debug_hold_line[i] = 0;
204         }
205 #endif
206 }
207
208 void
209 vm_object_lock_swap(void)
210 {
211         lwkt_token_swap();
212 }
213
214 void
215 vm_object_lock(vm_object_t obj)
216 {
217         lwkt_gettoken(&obj->token);
218 }
219
220 /*
221  * Returns TRUE on success
222  */
223 static int
224 vm_object_lock_try(vm_object_t obj)
225 {
226         return(lwkt_trytoken(&obj->token));
227 }
228
229 void
230 vm_object_lock_shared(vm_object_t obj)
231 {
232         lwkt_gettoken_shared(&obj->token);
233 }
234
235 void
236 vm_object_unlock(vm_object_t obj)
237 {
238         lwkt_reltoken(&obj->token);
239 }
240
241 void
242 vm_object_upgrade(vm_object_t obj)
243 {
244         lwkt_reltoken(&obj->token);
245         lwkt_gettoken(&obj->token);
246 }
247
248 void
249 vm_object_downgrade(vm_object_t obj)
250 {
251         lwkt_reltoken(&obj->token);
252         lwkt_gettoken_shared(&obj->token);
253 }
254
255 static __inline void
256 vm_object_assert_held(vm_object_t obj)
257 {
258         ASSERT_LWKT_TOKEN_HELD(&obj->token);
259 }
260
261 int
262 vm_quickcolor(void)
263 {
264         globaldata_t gd = mycpu;
265         int pg_color;
266
267         pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268         pg_color += gd->gd_quick_color;
269         gd->gd_quick_color += PQ_PRIME2;
270
271         return pg_color;
272 }
273
274 void
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
276 {
277         KKASSERT(obj != NULL);
278
279         /*
280          * Object must be held (object allocation is stable due to the caller's
281          * context, typically already holding the token on a parent object)
282          * prior to potentially blocking on the lock, otherwise the object
283          * can get ripped away from us.
284          */
285         refcount_acquire(&obj->hold_count);
286         vm_object_lock(obj);
287
288 #if defined(DEBUG_LOCKS)
289         debugvm_object_add(obj, file, line, 1);
290 #endif
291 }
292
293 int
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
295 {
296         KKASSERT(obj != NULL);
297
298         /*
299          * Object must be held (object allocation is stable due to the caller's
300          * context, typically already holding the token on a parent object)
301          * prior to potentially blocking on the lock, otherwise the object
302          * can get ripped away from us.
303          */
304         refcount_acquire(&obj->hold_count);
305         if (vm_object_lock_try(obj) == 0) {
306                 if (refcount_release(&obj->hold_count)) {
307                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308                                 kfree(obj, M_VM_OBJECT);
309                 }
310                 return(0);
311         }
312
313 #if defined(DEBUG_LOCKS)
314         debugvm_object_add(obj, file, line, 1);
315 #endif
316         return(1);
317 }
318
319 void
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
321 {
322         KKASSERT(obj != NULL);
323
324         /*
325          * Object must be held (object allocation is stable due to the caller's
326          * context, typically already holding the token on a parent object)
327          * prior to potentially blocking on the lock, otherwise the object
328          * can get ripped away from us.
329          */
330         refcount_acquire(&obj->hold_count);
331         vm_object_lock_shared(obj);
332
333 #if defined(DEBUG_LOCKS)
334         debugvm_object_add(obj, file, line, 1);
335 #endif
336 }
337
338 /*
339  * Drop the token and hold_count on the object.
340  *
341  * WARNING! Token might be shared.
342  */
343 void
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
345 {
346         if (obj == NULL)
347                 return;
348
349         /*
350          * No new holders should be possible once we drop hold_count 1->0 as
351          * there is no longer any way to reference the object.
352          */
353         KKASSERT(obj->hold_count > 0);
354         if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356                 debugvm_object_add(obj, file, line, -1);
357 #endif
358
359                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360                         vm_object_unlock(obj);
361                         kfree(obj, M_VM_OBJECT);
362                 } else {
363                         vm_object_unlock(obj);
364                 }
365         } else {
366 #if defined(DEBUG_LOCKS)
367                 debugvm_object_add(obj, file, line, -1);
368 #endif
369                 vm_object_unlock(obj);
370         }
371 }
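/*
 * Illustrative sketch (editor's addition): the canonical hold/drop bracket.
 * vm_object_hold() bumps hold_count and acquires the token, so even if the
 * last reference disappears while we block the object cannot be kfree()d
 * until the matching vm_object_drop().  Use vm_object_hold_shared() when
 * only reading object state.
 */
#if 0
static void
example_object_access(vm_object_t obj)
{
        vm_object_hold(obj);
        /* ... examine or modify the object while the token is held ... */
        vm_object_drop(obj);
}
#endif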
372
373 /*
374  * Initialize a freshly allocated object, returning a held object.
375  *
376  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
377  *
378  * No requirements.
379  */
380 void
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
382 {
383         struct vm_object_hash *hash;
384
385         RB_INIT(&object->rb_memq);
386         lwkt_token_init(&object->token, "vmobj");
387
388         object->type = type;
389         object->size = size;
390         object->ref_count = 1;
391         object->memattr = VM_MEMATTR_DEFAULT;
392         object->hold_count = 0;
393         object->flags = 0;
394         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
395                 vm_object_set_flag(object, OBJ_ONEMAPPING);
396         object->paging_in_progress = 0;
397         object->resident_page_count = 0;
398         /* cpu localization twist */
399         object->pg_color = vm_quickcolor();
400         object->handle = NULL;
401
402         atomic_add_int(&object->generation, 1);
403         object->swblock_count = 0;
404         RB_INIT(&object->swblock_root);
405         vm_object_lock_init(object);
406         pmap_object_init(object);
407
408         vm_object_hold(object);
409
410         hash = vmobj_hash(object);
411         lwkt_gettoken(&hash->token);
412         TAILQ_INSERT_TAIL(&hash->list, object, object_list);
413         lwkt_reltoken(&hash->token);
414 }
415
416 /*
417  * Initialize a VM object.
418  */
419 void
420 vm_object_init(vm_object_t object, vm_pindex_t size)
421 {
422         _vm_object_allocate(OBJT_DEFAULT, size, object);
423         vm_object_drop(object);
424 }
425
426 /*
427  * Initialize the VM objects module.
428  *
429  * Called from the low level boot code only.  Note that this occurs before
430  * kmalloc is initialized so we cannot allocate any VM objects.
431  */
432 void
433 vm_object_init1(void)
434 {
435         int i;
436
437         for (i = 0; i < VMOBJ_HSIZE; ++i) {
438                 TAILQ_INIT(&vm_object_hash[i].list);
439                 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
440         }
441
442         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
443                             &kernel_object);
444         vm_object_drop(&kernel_object);
445 }
446
447 void
448 vm_object_init2(void)
449 {
450         kmalloc_set_unlimited(M_VM_OBJECT);
451 }
452
453 /*
454  * Allocate and return a new object of the specified type and size.
455  *
456  * No requirements.
457  */
458 vm_object_t
459 vm_object_allocate(objtype_t type, vm_pindex_t size)
460 {
461         vm_object_t obj;
462
463         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
464         _vm_object_allocate(type, size, obj);
465         vm_object_drop(obj);
466
467         return (obj);
468 }
469
470 /*
471  * This version returns a held object, allowing further atomic initialization
472  * of the object.
473  */
474 vm_object_t
475 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
476 {
477         vm_object_t obj;
478
479         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
480         _vm_object_allocate(type, size, obj);
481
482         return (obj);
483 }
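/*
 * Illustrative sketch (editor's addition): vm_object_allocate() hands back
 * an unheld object, while vm_object_allocate_hold() keeps the new object
 * held so the caller can finish initializing it before anything else can
 * look it up, dropping it only once it is fully set up.
 */
#if 0
static vm_object_t
example_allocate_and_init(vm_pindex_t size)
{
        vm_object_t obj;

        obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
        /* ... further atomic initialization of the held object ... */
        vm_object_drop(obj);

        return (obj);
}
#endif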
484
485 /*
486  * Add an additional reference to a vm_object.  The object must already be
487  * held.  The original non-lock version is no longer supported.  The object
488  * must NOT be chain locked by anyone at the time the reference is added.
489  *
490  * The object must be held, but may be held shared if desired (hence why
491  * we use an atomic op).
492  */
493 void
494 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
495 {
496         KKASSERT(object != NULL);
497         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
498         atomic_add_int(&object->ref_count, 1);
499         if (object->type == OBJT_VNODE) {
500                 vref(object->handle);
501                 /* XXX what if the vnode is being destroyed? */
502         }
503 #if defined(DEBUG_LOCKS)
504         debugvm_object_add(object, file, line, 1);
505 #endif
506 }
507
508 /*
509  * This version is only allowed for vnode objects.
510  */
511 void
512 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
513 {
514         KKASSERT(object->type == OBJT_VNODE);
515         atomic_add_int(&object->ref_count, 1);
516         vref(object->handle);
517 #if defined(DEBUG_LOCKS)
518         debugvm_object_add(object, file, line, 1);
519 #endif
520 }
521
522 /*
523  * Dereference an object and its underlying vnode.  The object may be
524  * held shared.  On return the object will remain held.
525  *
526  * This function may return a vnode in *vpp which the caller must release
527  * after the caller drops its own lock.  If vpp is NULL, we assume that
528  * the caller was holding an exclusive lock on the object and we vrele()
529  * the vp ourselves.
530  */
531 static void
532 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
533                                    VMOBJDBARGS)
534 {
535         struct vnode *vp = (struct vnode *) object->handle;
536
537         KASSERT(object->type == OBJT_VNODE,
538             ("vm_object_vndeallocate: not a vnode object"));
539         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
540         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
541 #ifdef INVARIANTS
542         if (object->ref_count == 0) {
543                 vprint("vm_object_vndeallocate", vp);
544                 panic("vm_object_vndeallocate: bad object reference count");
545         }
546 #endif
547         for (;;) {
548                 int count = object->ref_count;
549                 cpu_ccfence();
550                 if (count == 1) {
551                         vm_object_upgrade(object);
552                         if (atomic_cmpset_int(&object->ref_count, count, 0)) {
553                                 vclrflags(vp, VTEXT);
554                                 break;
555                         }
556                 } else {
557                         if (atomic_cmpset_int(&object->ref_count,
558                                               count, count - 1)) {
559                                 break;
560                         }
561                 }
562                 /* retry */
563         }
564 #if defined(DEBUG_LOCKS)
565         debugvm_object_add(object, file, line, -1);
566 #endif
567
568         /*
569          * vrele or return the vp to vrele.  We can only safely vrele(vp)
570          * if the object was locked exclusively.  But there are two races
571          * here.
572          *
573          * We had to upgrade the object above to safely clear VTEXT
574          * but the alternative path where the shared lock is retained
575          * can STILL race to 0 in other paths and cause our own vrele()
576          * to terminate the vnode.  We can't allow that if the VM object
577          * is still locked shared.
578          */
579         if (vpp)
580                 *vpp = vp;
581         else
582                 vrele(vp);
583 }
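/*
 * Illustrative sketch (editor's addition): a caller holding the object
 * shared passes a vpp so the final vrele() is issued only after its own
 * object lock has been dropped, sidestepping the shared/exclusive race
 * described above.
 */
#if 0
static void
example_vnode_deref(vm_object_t object)
{
        struct vnode *vp = NULL;

        vm_object_hold_shared(object);
        vm_object_vndeallocate(object, &vp);
        vm_object_drop(object);
        if (vp)
                vrele(vp);
}
#endif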
584
585 /*
586  * Release a reference to the specified object, gained either through a
587  * vm_object_allocate or a vm_object_reference call.  When all references
588  * are gone, storage associated with this object may be relinquished.
589  *
590  * The caller does not have to hold the object locked but must have control
591  * over the reference in question in order to guarantee that the object
592  * does not get ripped out from under us.
593  *
594  * XXX Currently all deallocations require an exclusive lock.
595  */
596 void
597 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
598 {
599         struct vnode *vp;
600         int count;
601
602         if (object == NULL)
603                 return;
604
605         for (;;) {
606                 count = object->ref_count;
607                 cpu_ccfence();
608
609                 /*
610                  * If decrementing the count enters into special handling
611                  * territory (0, 1, or 2) we have to do it the hard way.
612                  * Fortunately, objects with only a few refs like this
613                  * are not likely to be heavily contended anyway.
614                  *
615                  * For vnode objects we only care about 1->0 transitions.
616                  */
617                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
618 #if defined(DEBUG_LOCKS)
619                         debugvm_object_add(object, file, line, 0);
620 #endif
621                         vm_object_hold(object);
622                         vm_object_deallocate_locked(object);
623                         vm_object_drop(object);
624                         break;
625                 }
626
627                 /*
628                  * Try to decrement ref_count without acquiring a hold on
629                  * the object.  This is particularly important for the exec*()
630                  * and exit*() code paths because the program binary may
631                  * have a great deal of sharing and an exclusive lock will
632                  * crowbar performance in those circumstances.
633                  */
634                 if (object->type == OBJT_VNODE) {
635                         vp = (struct vnode *)object->handle;
636                         if (atomic_cmpset_int(&object->ref_count,
637                                               count, count - 1)) {
638 #if defined(DEBUG_LOCKS)
639                                 debugvm_object_add(object, file, line, -1);
640 #endif
641
642                                 vrele(vp);
643                                 break;
644                         }
645                         /* retry */
646                 } else {
647                         if (atomic_cmpset_int(&object->ref_count,
648                                               count, count - 1)) {
649 #if defined(DEBUG_LOCKS)
650                                 debugvm_object_add(object, file, line, -1);
651 #endif
652                                 break;
653                         }
654                         /* retry */
655                 }
656                 /* retry */
657         }
658 }
659
660 void
661 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
662 {
663         /*
664          * Degenerate case
665          */
666         if (object == NULL)
667                 return;
668
669         /*
670          * vnode case, caller either locked the object exclusively
671          * or this is a recursion with must_drop != 0 and the vnode
672          * object will be locked shared.
673          *
674          * If locked shared we have to drop the object before we can
675          * call vrele() or risk a shared/exclusive livelock.
676          */
677         if (object->type == OBJT_VNODE) {
678                 ASSERT_LWKT_TOKEN_HELD(&object->token);
679                 vm_object_vndeallocate(object, NULL);
680                 return;
681         }
682         ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
683
684         /*
685          * Normal case (object is locked exclusively)
686          */
687         if (object->ref_count == 0) {
688                 panic("vm_object_deallocate: object deallocated "
689                       "too many times: %d", object->type);
690         }
691         if (object->ref_count > 2) {
692                 atomic_add_int(&object->ref_count, -1);
693 #if defined(DEBUG_LOCKS)
694                 debugvm_object_add(object, file, line, -1);
695 #endif
696                 return;
697         }
698
699         /*
700          * Drop the ref and handle termination on the 1->0 transition.
701          * We may have blocked above so we have to recheck.
702          */
703         KKASSERT(object->ref_count != 0);
704         if (object->ref_count >= 2) {
705                 atomic_add_int(&object->ref_count, -1);
706 #if defined(DEBUG_LOCKS)
707                 debugvm_object_add(object, file, line, -1);
708 #endif
709                 return;
710         }
711
712         atomic_add_int(&object->ref_count, -1);
713         if ((object->flags & OBJ_DEAD) == 0)
714                 vm_object_terminate(object);
715 }
716
717 /*
718  * Destroy the specified object, freeing up related resources.
719  *
720  * The object must have zero references.
721  *
722  * The object must be held.  The caller is responsible for dropping the object
723  * after terminate returns.  Terminate does NOT drop the object.
724  */
725 static int vm_object_terminate_callback(vm_page_t p, void *data);
726
727 void
728 vm_object_terminate(vm_object_t object)
729 {
730         struct rb_vm_page_scan_info info;
731         struct vm_object_hash *hash;
732
733         /*
734          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
735          * able to safely block.
736          */
737         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
738         KKASSERT((object->flags & OBJ_DEAD) == 0);
739         vm_object_set_flag(object, OBJ_DEAD);
740
741         /*
742          * Wait for the pageout daemon to be done with the object
743          */
744         vm_object_pip_wait(object, "objtrm1");
745
746         KASSERT(!object->paging_in_progress,
747                 ("vm_object_terminate: pageout in progress"));
748
749         /*
750          * Clean and free the pages, as appropriate. All references to the
751          * object are gone, so we don't need to lock it.
752          */
753         if (object->type == OBJT_VNODE) {
754                 struct vnode *vp;
755
756                 /*
757                  * Clean pages and flush buffers.
758                  *
759                  * NOTE!  TMPFS buffer flushes do not typically flush the
760                  *        actual page to swap as this would be highly
761                  *        inefficient, and normal filesystems usually wrap
762                  *        page flushes with buffer cache buffers.
763                  *
764                  *        To deal with this we have to call vinvalbuf() both
765                  *        before and after the vm_object_page_clean().
766                  */
767                 vp = (struct vnode *) object->handle;
768                 vinvalbuf(vp, V_SAVE, 0, 0);
769                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
770                 vinvalbuf(vp, V_SAVE, 0, 0);
771         }
772
773         /*
774          * Wait for any I/O to complete, after which there had better not
775          * be any references left on the object.
776          */
777         vm_object_pip_wait(object, "objtrm2");
778
779         if (object->ref_count != 0) {
780                 panic("vm_object_terminate: object with references, "
781                       "ref_count=%d", object->ref_count);
782         }
783
784         /*
785          * Cleanup any shared pmaps associated with this object.
786          */
787         pmap_object_free(object);
788
789         /*
790          * Now free any remaining pages. For internal objects, this also
791          * removes them from paging queues. Don't free wired pages, just
792          * remove them from the object. 
793          */
794         info.count = 0;
795         info.object = object;
796         do {
797                 info.error = 0;
798                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
799                                         vm_object_terminate_callback, &info);
800         } while (info.error);
801
802         /*
803          * Let the pager know object is dead.
804          */
805         vm_pager_deallocate(object);
806
807         /*
808          * Wait for the object hold count to hit 1, clean out pages as
809          * we go.  vmobj_token interlocks any race conditions that might
810          * pick the object up from the vm_object_list after we have cleared
811          * rb_memq.
812          */
813         for (;;) {
814                 if (RB_ROOT(&object->rb_memq) == NULL)
815                         break;
816                 kprintf("vm_object_terminate: Warning, object %p "
817                         "still has %ld pages\n",
818                         object, object->resident_page_count);
819                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
820                                         vm_object_terminate_callback, &info);
821         }
822
823         /*
824          * There had better not be any pages left
825          */
826         KKASSERT(object->resident_page_count == 0);
827
828         /*
829          * Remove the object from the global object list.
830          */
831         hash = vmobj_hash(object);
832         lwkt_gettoken(&hash->token);
833         TAILQ_REMOVE(&hash->list, object, object_list);
834         lwkt_reltoken(&hash->token);
835
836         if (object->ref_count != 0) {
837                 panic("vm_object_terminate2: object with references, "
838                       "ref_count=%d", object->ref_count);
839         }
840
841         /*
842          * NOTE: The object hold_count is at least 1, so we cannot kfree()
843          *       the object here.  See vm_object_drop().
844          */
845 }
846
847 /*
848  * The caller must hold the object.
849  */
850 static int
851 vm_object_terminate_callback(vm_page_t p, void *data)
852 {
853         struct rb_vm_page_scan_info *info = data;
854         vm_object_t object;
855
856         object = p->object;
857         KKASSERT(object == info->object);
858         if (vm_page_busy_try(p, TRUE)) {
859                 vm_page_sleep_busy(p, TRUE, "vmotrm");
860                 info->error = 1;
861                 return 0;
862         }
863         if (object != p->object) {
864                 /* XXX remove once we determine it can't happen */
865                 kprintf("vm_object_terminate: Warning: Encountered "
866                         "busied page %p on queue %d\n", p, p->queue);
867                 vm_page_wakeup(p);
868                 info->error = 1;
869         } else if (p->wire_count == 0) {
870                 /*
871                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
872                  */
873                 vm_page_free(p);
874                 mycpu->gd_cnt.v_pfree++;
875         } else {
876                 if (p->queue != PQ_NONE) {
877                         kprintf("vm_object_terminate: Warning: Encountered "
878                                 "wired page %p on queue %d\n", p, p->queue);
879                         if (vm_object_debug > 0) {
880                                 --vm_object_debug;
881                                 print_backtrace(10);
882                         }
883                 }
884                 vm_page_remove(p);
885                 vm_page_wakeup(p);
886         }
887
888         /*
889          * Must be at end to avoid SMP races, caller holds object token
890          */
891         if ((++info->count & 63) == 0)
892                 lwkt_user_yield();
893         return(0);
894 }
895
896 /*
897  * Clean all dirty pages in the specified range of object.  Leaves page
898  * on whatever queue it is currently on.   If NOSYNC is set then do not
899  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
900  * leaving the object dirty.
901  *
902  * When stuffing pages asynchronously, allow clustering.  XXX we need a
903  * synchronous clustering mode implementation.
904  *
905  * Odd semantics: if start == end, we clean everything.
906  *
907  * The object must be locked? XXX
908  */
909 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
910 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
911
912 void
913 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
914                      int flags)
915 {
916         struct rb_vm_page_scan_info info;
917         struct vnode *vp;
918         int wholescan;
919         int pagerflags;
920         int generation;
921
922         vm_object_hold(object);
923         if (object->type != OBJT_VNODE ||
924             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
925                 vm_object_drop(object);
926                 return;
927         }
928
929         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 
930                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
931         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
932
933         vp = object->handle;
934
935         /*
936          * Interlock other major object operations.  This allows us to 
937          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
938          */
939         vm_object_set_flag(object, OBJ_CLEANING);
940
941         /*
942          * Handle 'entire object' case
943          */
944         info.start_pindex = start;
945         if (end == 0) {
946                 info.end_pindex = object->size - 1;
947         } else {
948                 info.end_pindex = end - 1;
949         }
950         wholescan = (start == 0 && info.end_pindex == object->size - 1);
951         info.limit = flags;
952         info.pagerflags = pagerflags;
953         info.object = object;
954
955         /*
956          * If cleaning the entire object do a pass to mark the pages read-only.
957          * If everything worked out ok, clear OBJ_WRITEABLE and
958          * OBJ_MIGHTBEDIRTY.
959          */
960         if (wholescan) {
961                 info.error = 0;
962                 info.count = 0;
963                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
964                                         vm_object_page_clean_pass1, &info);
965                 if (info.error == 0) {
966                         vm_object_clear_flag(object,
967                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
968                         if (object->type == OBJT_VNODE &&
969                             (vp = (struct vnode *)object->handle) != NULL) {
970                                 /*
971                                  * Use new-style interface to clear VISDIRTY
972                                  * because the vnode is not necessarily removed
973                                  * from the syncer list(s) as often as it was
974                                  * under the old interface, which can leave
975                                  * the vnode on the syncer list after reclaim.
976                                  */
977                                 vclrobjdirty(vp);
978                         }
979                 }
980         }
981
982         /*
983          * Do a pass to clean all the dirty pages we find.
984          */
985         do {
986                 info.error = 0;
987                 info.count = 0;
988                 generation = object->generation;
989                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
990                                         vm_object_page_clean_pass2, &info);
991         } while (info.error || generation != object->generation);
992
993         vm_object_clear_flag(object, OBJ_CLEANING);
994         vm_object_drop(object);
995 }
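/*
 * Illustrative sketch (editor's addition): vm_object_terminate() above shows
 * the canonical synchronous flush of a vnode object, wrapping the page clean
 * with vinvalbuf() so both the buffer cache and the VM pages reach backing
 * store.  Passing start == end == 0 cleans the entire object per the odd
 * semantics noted above.
 */
#if 0
static void
example_flush_vnode_object(struct vnode *vp, vm_object_t object)
{
        vinvalbuf(vp, V_SAVE, 0, 0);
        vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
        vinvalbuf(vp, V_SAVE, 0, 0);
}
#endif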
996
997 /*
998  * The caller must hold the object.
999  */
1000 static 
1001 int
1002 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1003 {
1004         struct rb_vm_page_scan_info *info = data;
1005
1006         KKASSERT(p->object == info->object);
1007
1008         vm_page_flag_set(p, PG_CLEANCHK);
1009         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1010                 info->error = 1;
1011         } else if (vm_page_busy_try(p, FALSE)) {
1012                 info->error = 1;
1013         } else {
1014                 KKASSERT(p->object == info->object);
1015                 vm_page_protect(p, VM_PROT_READ);
1016                 vm_page_wakeup(p);
1017         }
1018
1019         /*
1020          * Must be at end to avoid SMP races, caller holds object token
1021          */
1022         if ((++info->count & 63) == 0)
1023                 lwkt_user_yield();
1024         return(0);
1025 }
1026
1027 /*
1028  * The caller must hold the object
1029  */
1030 static 
1031 int
1032 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1033 {
1034         struct rb_vm_page_scan_info *info = data;
1035         int generation;
1036
1037         KKASSERT(p->object == info->object);
1038
1039         /*
1040          * Do not mess with pages that were inserted after we started
1041          * the cleaning pass.
1042          */
1043         if ((p->flags & PG_CLEANCHK) == 0)
1044                 goto done;
1045
1046         generation = info->object->generation;
1047
1048         if (vm_page_busy_try(p, TRUE)) {
1049                 vm_page_sleep_busy(p, TRUE, "vpcwai");
1050                 info->error = 1;
1051                 goto done;
1052         }
1053
1054         KKASSERT(p->object == info->object &&
1055                  info->object->generation == generation);
1056
1057         /*
1058          * Before wasting time traversing the pmaps, check for trivial
1059          * cases where the page cannot be dirty.
1060          */
1061         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1062                 KKASSERT((p->dirty & p->valid) == 0 &&
1063                          (p->flags & PG_NEED_COMMIT) == 0);
1064                 vm_page_wakeup(p);
1065                 goto done;
1066         }
1067
1068         /*
1069          * Check whether the page is dirty or not.  The page has been set
1070          * to be read-only so the check will not race a user dirtying the
1071          * page.
1072          */
1073         vm_page_test_dirty(p);
1074         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1075                 vm_page_flag_clear(p, PG_CLEANCHK);
1076                 vm_page_wakeup(p);
1077                 goto done;
1078         }
1079
1080         /*
1081          * If we have been asked to skip nosync pages and this is a
1082          * nosync page, skip it.  Note that the object flags were
1083          * not cleared in this case (because pass1 will have returned an
1084          * error), so we do not have to set them.
1085          */
1086         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1087                 vm_page_flag_clear(p, PG_CLEANCHK);
1088                 vm_page_wakeup(p);
1089                 goto done;
1090         }
1091
1092         /*
1093          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1094          * the pages that get successfully flushed.  Set info->error if
1095          * we raced an object modification.
1096          */
1097         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1098         /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1099
1100         /*
1101          * Must be at end to avoid SMP races, caller holds object token
1102          */
1103 done:
1104         if ((++info->count & 63) == 0)
1105                 lwkt_user_yield();
1106         return(0);
1107 }
1108
1109 /*
1110  * Collect the specified page and nearby pages and flush them out.
1111  * The number of pages flushed is returned.  The passed page is busied
1112  * by the caller and we are responsible for its disposition.
1113  *
1114  * The caller must hold the object.
1115  */
1116 static void
1117 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1118 {
1119         int error;
1120         int is;
1121         int ib;
1122         int i;
1123         int page_base;
1124         vm_pindex_t pi;
1125         vm_page_t ma[BLIST_MAX_ALLOC];
1126
1127         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1128
1129         pi = p->pindex;
1130         page_base = pi % BLIST_MAX_ALLOC;
1131         ma[page_base] = p;
1132         ib = page_base - 1;
1133         is = page_base + 1;
1134
1135         while (ib >= 0) {
1136                 vm_page_t tp;
1137
1138                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1139                                              TRUE, &error);
1140                 if (error)
1141                         break;
1142                 if (tp == NULL)
1143                         break;
1144                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1145                     (tp->flags & PG_CLEANCHK) == 0) {
1146                         vm_page_wakeup(tp);
1147                         break;
1148                 }
1149                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1150                         vm_page_flag_clear(tp, PG_CLEANCHK);
1151                         vm_page_wakeup(tp);
1152                         break;
1153                 }
1154                 vm_page_test_dirty(tp);
1155                 if ((tp->dirty & tp->valid) == 0 &&
1156                     (tp->flags & PG_NEED_COMMIT) == 0) {
1157                         vm_page_flag_clear(tp, PG_CLEANCHK);
1158                         vm_page_wakeup(tp);
1159                         break;
1160                 }
1161                 ma[ib] = tp;
1162                 --ib;
1163         }
1164         ++ib;   /* fixup */
1165
1166         while (is < BLIST_MAX_ALLOC &&
1167                pi - page_base + is < object->size) {
1168                 vm_page_t tp;
1169
1170                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1171                                              TRUE, &error);
1172                 if (error)
1173                         break;
1174                 if (tp == NULL)
1175                         break;
1176                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1177                     (tp->flags & PG_CLEANCHK) == 0) {
1178                         vm_page_wakeup(tp);
1179                         break;
1180                 }
1181                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1182                         vm_page_flag_clear(tp, PG_CLEANCHK);
1183                         vm_page_wakeup(tp);
1184                         break;
1185                 }
1186                 vm_page_test_dirty(tp);
1187                 if ((tp->dirty & tp->valid) == 0 &&
1188                     (tp->flags & PG_NEED_COMMIT) == 0) {
1189                         vm_page_flag_clear(tp, PG_CLEANCHK);
1190                         vm_page_wakeup(tp);
1191                         break;
1192                 }
1193                 ma[is] = tp;
1194                 ++is;
1195         }
1196
1197         /*
1198          * All pages in the ma[] array are busied now
1199          */
1200         for (i = ib; i < is; ++i) {
1201                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1202                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1203         }
1204         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1205         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1206                 vm_page_unhold(ma[i]);
1207 }
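/*
 * Worked example (editor's addition, illustrative values only): the ma[]
 * window above is aligned to a BLIST_MAX_ALLOC-sized block.  Were
 * BLIST_MAX_ALLOC 16 and p->pindex 37, then page_base = 37 % 16 = 5, the
 * window covers pindex 32..47, ib walks ma[4..0] (pindex 36..32) and is
 * walks ma[6..15] (pindex 38..47), each stopping early at the first page
 * that cannot be flushed.
 */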
1208
1209 /*
1210  * Same as vm_object_pmap_copy, except range checking really
1211  * works, and is meant for small sections of an object.
1212  *
1213  * This code protects resident pages by making them read-only
1214  * and is typically called on a fork or split when a page
1215  * is converted to copy-on-write.  
1216  *
1217  * NOTE: If the page is already at VM_PROT_NONE, calling
1218  * vm_page_protect will have no effect.
1219  */
1220 void
1221 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1222 {
1223         vm_pindex_t idx;
1224         vm_page_t p;
1225
1226         if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1227                 return;
1228
1229         vm_object_hold(object);
1230         for (idx = start; idx < end; idx++) {
1231                 p = vm_page_lookup(object, idx);
1232                 if (p == NULL)
1233                         continue;
1234                 vm_page_protect(p, VM_PROT_READ);
1235         }
1236         vm_object_drop(object);
1237 }
1238
1239 /*
1240  * Removes all physical pages in the specified object range from all
1241  * physical maps.
1242  *
1243  * The object must *not* be locked.
1244  */
1245
1246 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1247
1248 void
1249 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1250 {
1251         struct rb_vm_page_scan_info info;
1252
1253         if (object == NULL)
1254                 return;
1255         if (start == end)
1256                 return;
1257         info.start_pindex = start;
1258         info.end_pindex = end - 1;
1259         info.count = 0;
1260         info.object = object;
1261
1262         vm_object_hold(object);
1263         do {
1264                 info.error = 0;
1265                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1266                                         vm_object_pmap_remove_callback, &info);
1267         } while (info.error);
1268         if (start == 0 && end == object->size)
1269                 vm_object_clear_flag(object, OBJ_WRITEABLE);
1270         vm_object_drop(object);
1271 }
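/*
 * Illustrative sketch (editor's addition): a caller about to invalidate or
 * repurpose the backing pages for a range removes every pmap entry first so
 * no mapping can keep touching the soon-to-be-stale pages.  Covering the
 * whole object also clears OBJ_WRITEABLE (see above).
 */
#if 0
static void
example_invalidate_mappings(vm_object_t object)
{
        vm_object_pmap_remove(object, 0, object->size);
}
#endif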
1272
1273 /*
1274  * The caller must hold the object
1275  */
1276 static int
1277 vm_object_pmap_remove_callback(vm_page_t p, void *data)
1278 {
1279         struct rb_vm_page_scan_info *info = data;
1280
1281         if (info->object != p->object ||
1282             p->pindex < info->start_pindex ||
1283             p->pindex > info->end_pindex) {
1284                 kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
1285                         info->object, p);
1286                 info->error = 1;
1287                 return(0);
1288         }
1289
1290         vm_page_protect(p, VM_PROT_NONE);
1291
1292         /*
1293          * Must be at end to avoid SMP races, caller holds object token
1294          */
1295         if ((++info->count & 63) == 0)
1296                 lwkt_user_yield();
1297         return(0);
1298 }
1299
1300 /*
1301  * Implements the madvise function at the object/page level.
1302  *
1303  * MADV_WILLNEED        (any object)
1304  *
1305  *      Activate the specified pages if they are resident.
1306  *
1307  * MADV_DONTNEED        (any object)
1308  *
1309  *      Deactivate the specified pages if they are resident.
1310  *
1311  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1312  *
1313  *      Deactivate and clean the specified pages if they are
1314  *      resident.  This permits the process to reuse the pages
1315  *      without faulting or the kernel to reclaim the pages
1316  *      without I/O.
1317  *
1318  * No requirements.
1319  */
1320 void
1321 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1322                   vm_pindex_t count, int advise)
1323 {
1324         vm_pindex_t end;
1325         vm_page_t m;
1326         int error;
1327
1328         if (object == NULL)
1329                 return;
1330
1331         end = pindex + count;
1332
1333         vm_object_hold(object);
1334
1335         /*
1336          * Locate and adjust resident pages.  This only applies to the
1337          * primary object in the mapping.
1338          */
1339         for (; pindex < end; pindex += 1) {
1340 relookup:
1341                 /*
1342                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1343                  * and those pages must be OBJ_ONEMAPPING.
1344                  */
1345                 if (advise == MADV_FREE) {
1346                         if ((object->type != OBJT_DEFAULT &&
1347                              object->type != OBJT_SWAP) ||
1348                             (object->flags & OBJ_ONEMAPPING) == 0) {
1349                                 continue;
1350                         }
1351                 }
1352
1353                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1354
1355                 if (error) {
1356                         vm_page_sleep_busy(m, TRUE, "madvpo");
1357                         goto relookup;
1358                 }
1359                 if (m == NULL) {
1360                         /*
1361                          * There may be swap even if there is no backing page
1362                          */
1363                         if (advise == MADV_FREE &&
1364                             object->type == OBJT_SWAP) {
1365                                 /* m is NULL here; only release the swap */
1366                                 swap_pager_freespace(object, pindex, 1);
1367                         }
1368                         continue;
1369                 }
1370
1371                 /*
1372                  * If the page is not in a normal active state, we skip it.
1373                  * If the page is not managed there are no page queues to
1374                  * mess with.  Things can break if we mess with pages in
1375                  * any of the below states.
1376                  */
1377                 if (m->wire_count ||
1378                     (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1379                     m->valid != VM_PAGE_BITS_ALL
1380                 ) {
1381                         vm_page_wakeup(m);
1382                         continue;
1383                 }
1384
1385                 /*
1386                  * Theoretically once a page is known not to be busy, an
1387                  * interrupt cannot come along and rip it out from under us.
1388                  */
1389                 if (advise == MADV_WILLNEED) {
1390                         vm_page_activate(m);
1391                 } else if (advise == MADV_DONTNEED) {
1392                         vm_page_dontneed(m);
1393                 } else if (advise == MADV_FREE) {
1394                         /*
1395                          * Mark the page clean.  This will allow the page
1396                          * to be freed up by the system.  However, such pages
1397                          * are often reused quickly by malloc()/free()
1398                          * so we do not do anything that would cause
1399                          * a page fault if we can help it.
1400                          *
1401                          * Specifically, we do not try to actually free
1402                          * the page now nor do we try to put it in the
1403                          * cache (which would cause a page fault on reuse).
1404                          *
1405                          * But we do make the page as freeable as we
1406                          * can without actually taking the step of unmapping
1407                          * it.
1408                          */
1409                         pmap_clear_modify(m);
1410                         m->dirty = 0;
1411                         m->act_count = 0;
1412                         vm_page_dontneed(m);
1413                         if (object->type == OBJT_SWAP)
1414                                 swap_pager_freespace(object, pindex, 1);
1415                 }
1416                 vm_page_wakeup(m);
1417         }       
1418         vm_object_drop(object);
1419 }
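/*
 * Illustrative sketch (editor's addition): the madvise(2) path reduces to a
 * call of this form for each VM map entry covering the user range; the
 * object/pindex pair comes from the map entry, not from the user virtual
 * address directly.
 */
#if 0
static void
example_madv_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t npages)
{
        /* only effective on OBJT_DEFAULT/OBJT_SWAP, OBJ_ONEMAPPING objects */
        vm_object_madvise(object, pindex, npages, MADV_FREE);
}
#endif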
1420
1421 /*
1422  * Removes all physical pages in the specified object range from the
1423  * object's list of pages.
1424  *
1425  * No requirements.
1426  */
1427 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1428
1429 void
1430 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1431                       boolean_t clean_only)
1432 {
1433         struct rb_vm_page_scan_info info;
1434         int all;
1435
1436         /*
1437          * Degenerate cases and assertions
1438          */
1439         vm_object_hold(object);
1440         if (object == NULL ||
1441             (object->resident_page_count == 0 && object->swblock_count == 0)) {
1442                 vm_object_drop(object);
1443                 return;
1444         }
1445         KASSERT(object->type != OBJT_PHYS,
1446                 ("attempt to remove pages from a physical object"));
1447
1448         /*
1449          * Indicate that paging is occurring on the object
1450          */
1451         vm_object_pip_add(object, 1);
1452
1453         /*
1454          * Figure out the actual removal range and whether we are removing
1455          * the entire contents of the object or not.  If removing the entire
1456          * contents, be sure to get all pages, even those that might be 
1457          * beyond the end of the object.
1458          */
1459         info.object = object;
1460         info.start_pindex = start;
1461         if (end == 0)
1462                 info.end_pindex = (vm_pindex_t)-1;
1463         else
1464                 info.end_pindex = end - 1;
1465         info.limit = clean_only;
1466         info.count = 0;
1467         all = (start == 0 && info.end_pindex >= object->size - 1);
1468
1469         /*
1470          * Loop until we are sure we have gotten them all.
1471          */
1472         do {
1473                 info.error = 0;
1474                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1475                                         vm_object_page_remove_callback, &info);
1476         } while (info.error);
1477
1478         /*
1479          * Remove any related swap if throwing away pages, or for
1480          * non-swap objects (the swap is a clean copy in that case).
1481          */
1482         if (object->type != OBJT_SWAP || clean_only == FALSE) {
1483                 if (all)
1484                         swap_pager_freespace_all(object);
1485                 else
1486                         swap_pager_freespace(object, info.start_pindex,
1487                              info.end_pindex - info.start_pindex + 1);
1488         }
1489
1490         /*
1491          * Cleanup
1492          */
1493         vm_object_pip_wakeup(object);
1494         vm_object_drop(object);
1495 }
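/*
 * Illustrative sketch (editor's addition): file truncation is a typical
 * consumer.  Every page at or beyond the new end-of-file is destroyed
 * (clean_only == FALSE), and the related swap is released as described
 * above; end == 0 extends the removal past the nominal object size.
 */
#if 0
static void
example_truncate_object(vm_object_t object, vm_pindex_t newsize)
{
        if (newsize < object->size)
                vm_object_page_remove(object, newsize, 0, FALSE);
}
#endif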
1496
1497 /*
1498  * The caller must hold the object.
1499  *
1500  * NOTE: User yields are allowed when removing more than one page, but not
1501  *       allowed if only removing one page (the path for single page removals
1502  *       might hold a spinlock).
1503  */
1504 static int
1505 vm_object_page_remove_callback(vm_page_t p, void *data)
1506 {
1507         struct rb_vm_page_scan_info *info = data;
1508
1509         if (info->object != p->object ||
1510             p->pindex < info->start_pindex ||
1511             p->pindex > info->end_pindex) {
1512                 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1513                         info->object, p);
1514                 return(0);
1515         }
1516         if (vm_page_busy_try(p, TRUE)) {
1517                 vm_page_sleep_busy(p, TRUE, "vmopar");
1518                 info->error = 1;
1519                 return(0);
1520         }
1521         if (info->object != p->object) {
1522                 /* this should never happen */
1523                 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1524                         info->object, p);
1525                 vm_page_wakeup(p);
1526                 return(0);
1527         }
1528
1529         /*
1530          * Wired pages cannot be destroyed, but they can be invalidated
1531          * and we do so if clean_only (limit) is not set.
1532          *
1533          * WARNING!  The page may be wired due to being part of a buffer
1534          *           cache buffer, and the buffer might be marked B_CACHE.
1535          *           This is fine as part of a truncation but VFSs must be
1536          *           sure to fix the buffer up when re-extending the file.
1537          *
1538          * NOTE!     PG_NEED_COMMIT is ignored.
1539          */
1540         if (p->wire_count != 0) {
1541                 vm_page_protect(p, VM_PROT_NONE);
1542                 if (info->limit == 0)
1543                         p->valid = 0;
1544                 vm_page_wakeup(p);
1545                 goto done;
1546         }
1547
1548         /*
1549          * limit is our clean_only flag.  If set and the page is dirty or
1550          * requires a commit, do not free it.  If set and the page is being
1551          * held by someone, do not free it.
1552          */
1553         if (info->limit && p->valid) {
1554                 vm_page_test_dirty(p);
1555                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1556                         vm_page_wakeup(p);
1557                         goto done;
1558                 }
1559         }
1560
1561         /*
1562          * Destroy the page
1563          */
1564         vm_page_protect(p, VM_PROT_NONE);
1565         vm_page_free(p);
1566
1567         /*
1568          * Must be at end to avoid SMP races, caller holds object token
1569          */
1570 done:
1571         if ((++info->count & 63) == 0)
1572                 lwkt_user_yield();
1573
1574         return(0);
1575 }
1576
1577 /*
1578  * Try to extend prev_object into an adjoining region of virtual
1579  * memory, return TRUE on success.
1580  *
1581  * The caller does not need to hold (prev_object) but must have a stable
1582  * pointer to it (typically by holding the vm_map locked).
1583  *
1584  * This function only works for anonymous memory objects which either
1585  * have (a) one reference or (b) we are extending the object's size.
1586  * Otherwise the related VM pages we want to use for the object might
1587  * be in use by another mapping.
1588  */
1589 boolean_t
1590 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1591                    vm_size_t prev_size, vm_size_t next_size)
1592 {
1593         vm_pindex_t next_pindex;
1594
1595         if (prev_object == NULL)
1596                 return (TRUE);
1597
1598         vm_object_hold(prev_object);
1599
1600         if (prev_object->type != OBJT_DEFAULT &&
1601             prev_object->type != OBJT_SWAP) {
1602                 vm_object_drop(prev_object);
1603                 return (FALSE);
1604         }
1605
1606 #if 0
1607         /* caller now checks this */
1608         /*
1609          * Try to collapse the object first
1610          */
1611         vm_object_collapse(prev_object, NULL);
1612 #endif
1613
1614 #if 0
1615         /* caller now checks this */
1616         /*
1617          * We can't coalesce if we shadow another object (figuring out the
1618          * relationships becomes too complex).
1619          */
1620         if (prev_object->backing_object != NULL) {
1621                 vm_object_chain_release(prev_object);
1622                 vm_object_drop(prev_object);
1623                 return (FALSE);
1624         }
1625 #endif
1626
1627         prev_size >>= PAGE_SHIFT;
1628         next_size >>= PAGE_SHIFT;
1629         next_pindex = prev_pindex + prev_size;
1630
1631         /*
1632          * We can't if the object has more than one ref count unless we
1633          * are extending it into newly minted space.
1634          */
1635         if (prev_object->ref_count > 1 &&
1636             prev_object->size != next_pindex) {
1637                 vm_object_drop(prev_object);
1638                 return (FALSE);
1639         }
1640
1641         /*
1642          * Remove any pages that may still be in the object from a previous
1643          * deallocation.
1644          */
1645         if (next_pindex < prev_object->size) {
1646                 vm_object_page_remove(prev_object,
1647                                       next_pindex,
1648                                       next_pindex + next_size, FALSE);
1649                 if (prev_object->type == OBJT_SWAP)
1650                         swap_pager_freespace(prev_object,
1651                                              next_pindex, next_size);
1652         }
1653
1654         /*
1655          * Extend the object if necessary.
1656          */
1657         if (next_pindex + next_size > prev_object->size)
1658                 prev_object->size = next_pindex + next_size;
1659         vm_object_drop(prev_object);
1660
1661         return (TRUE);
1662 }
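
/*
 * Illustrative sketch only (not part of the build): how a hypothetical
 * map-growth path might test whether an existing anonymous object can
 * absorb an adjoining range instead of allocating a new object.  The
 * locals prev_object, prev_pindex, prev_bytes and grow_bytes are
 * assumptions; both sizes are byte counts because vm_object_coalesce()
 * shifts them down by PAGE_SHIFT itself.
 */
#if 0
	if (vm_object_coalesce(prev_object, prev_pindex,
			       prev_bytes, grow_bytes)) {
		/* Reuse prev_object and simply extend the map entry. */
	} else {
		/* Allocate a fresh anonymous object for the new range. */
	}
#endif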
1663
1664 /*
1665  * Make the object writable and flag it as possibly being dirty.
1666  *
1667  * The object might not be held (or might only be held shared), and the
1668  * related vnode is probably not held either.  Both the object and the
1669  * vnode are kept stable by the vm_page the caller has busied, which
1670  * prevents their destruction.
1671  *
1672  * If the related mount is flagged MNTK_THR_SYNC we need to call
1673  * vsetobjdirty().  Filesystems using this option usually shortcut
1674  * synchronization by only scanning the syncer list.
1675  */
1676 void
1677 vm_object_set_writeable_dirty(vm_object_t object)
1678 {
1679         struct vnode *vp;
1680
1681         /*vm_object_assert_held(object);*/
1682         /*
1683          * Avoid contention in vm fault path by checking the state before
1684          * issuing an atomic op on it.
1685          */
1686         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1687             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1688                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1689         }
1690         if (object->type == OBJT_VNODE &&
1691             (vp = (struct vnode *)object->handle) != NULL) {
1692                 if ((vp->v_flag & VOBJDIRTY) == 0) {
1693                         if (vp->v_mount &&
1694                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1695                                 /*
1696                                  * New style THR_SYNC places vnodes on the
1697                                  * syncer list more deterministically.
1698                                  */
1699                                 vsetobjdirty(vp);
1700                         } else {
1701                                 /*
1702                                  * Old style scan would not necessarily place
1703                                  * a vnode on the syncer list when possibly
1704                                  * modified via mmap.
1705                                  */
1706                                 vsetflags(vp, VOBJDIRTY);
1707                         }
1708                 }
1709         }
1710 }
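
/*
 * Illustrative sketch only (not part of the build): a hypothetical
 * write-enable path.  Once a page of the object is mapped writable the
 * object must be flagged so the syncer knows it may become dirty.  The
 * locals 'writeable' and 'm' are assumptions; 'm' is an already-busied
 * vm_page_t, which is what keeps the object and vnode stable here, as
 * noted above.
 */
#if 0
	if (writeable && m->object != NULL)
		vm_object_set_writeable_dirty(m->object);
#endif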
1711
1712 #include "opt_ddb.h"
1713 #ifdef DDB
1714 #include <sys/cons.h>
1715
1716 #include <ddb/ddb.h>
1717
1718 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
1719                                        vm_map_entry_t entry);
1720 static int      vm_object_in_map (vm_object_t object);
1721
1722 /*
1723  * The caller must hold the object.
1724  */
1725 static int
1726 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1727 {
1728         vm_map_backing_t ba;
1729         vm_map_t tmpm;
1730         vm_map_entry_t tmpe;
1731         int entcount;
1732
1733         if (map == NULL)
1734                 return 0;
1735         if (entry == NULL) {
1736                 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1737                 entcount = map->nentries;
1738                 while (entcount-- && tmpe) {
1739                         if (_vm_object_in_map(map, object, tmpe)) {
1740                                 return 1;
1741                         }
1742                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1743                 }
1744                 return (0);
1745         }
1746         switch (entry->maptype) {
1747         case VM_MAPTYPE_SUBMAP:
1748                 tmpm = entry->ba.sub_map;
1749                 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1750                 entcount = tmpm->nentries;
1751                 while (entcount-- && tmpe) {
1752                         if (_vm_object_in_map(tmpm, object, tmpe)) {
1753                                 return 1;
1754                         }
1755                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1756                 }
1757                 break;
1758         case VM_MAPTYPE_NORMAL:
1759         case VM_MAPTYPE_VPAGETABLE:
1760                 ba = &entry->ba;
1761                 while (ba) {
1762                         if (ba->object == object)
1763                                 return 1;
1764                         ba = ba->backing_ba;
1765                 }
1766                 break;
1767         default:
1768                 break;
1769         }
1770         return 0;
1771 }
1772
1773 static int vm_object_in_map_callback(struct proc *p, void *data);
1774
1775 struct vm_object_in_map_info {
1776         vm_object_t object;
1777         int rv;
1778 };
1779
1780 /*
1781  * Debugging only
1782  */
1783 static int
1784 vm_object_in_map(vm_object_t object)
1785 {
1786         struct vm_object_in_map_info info;
1787
1788         info.rv = 0;
1789         info.object = object;
1790
1791         allproc_scan(vm_object_in_map_callback, &info, 0);
1792         if (info.rv)
1793                 return 1;
1794         if (_vm_object_in_map(&kernel_map, object, 0))
1795                 return 1;
1796         if (_vm_object_in_map(&pager_map, object, 0))
1797                 return 1;
1798         if (_vm_object_in_map(&buffer_map, object, 0))
1799                 return 1;
1800         return 0;
1801 }
1802
1803 /*
1804  * Debugging only
1805  */
1806 static int
1807 vm_object_in_map_callback(struct proc *p, void *data)
1808 {
1809         struct vm_object_in_map_info *info = data;
1810
1811         if (p->p_vmspace) {
1812                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1813                         info->rv = 1;
1814                         return -1;
1815                 }
1816         }
1817         return (0);
1818 }
1819
1820 DB_SHOW_COMMAND(vmochk, vm_object_check)
1821 {
1822         struct vm_object_hash *hash;
1823         vm_object_t object;
1824         int n;
1825
1826         /*
1827          * Make sure that internal objects are in a map somewhere
1828          * and that none have a zero ref count.
1829          */
1830         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1831                 hash = &vm_object_hash[n];
1832                 for (object = TAILQ_FIRST(&hash->list);
1833                                 object != NULL;
1834                                 object = TAILQ_NEXT(object, object_list)) {
1835                         if (object->type == OBJT_MARKER)
1836                                 continue;
1837                         if (object->handle != NULL ||
1838                             (object->type != OBJT_DEFAULT &&
1839                              object->type != OBJT_SWAP)) {
1840                                 continue;
1841                         }
1842                         if (object->ref_count == 0) {
1843                                 db_printf("vmochk: internal obj has "
1844                                           "zero ref count, size: %ld\n",
1845                                           (long)object->size);
1846                         }
1847                         if (vm_object_in_map(object))
1848                                 continue;
1849                         db_printf("vmochk: internal obj is not in a map: "
1850                                   "ref: %d, size: %lu: 0x%lx\n",
1851                                   object->ref_count, (u_long)object->size,
1852                                   (u_long)object->size);
1853                 }
1854         }
1855 }
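
/*
 * Example usage (assumes a DDB-enabled kernel): the check above is run
 * from the kernel debugger prompt, typically as
 *
 *	db> show vmochk
 *
 * and reports internal (anonymous/swap) objects that have a zero ref
 * count or are not reachable from any map.
 */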
1856
1857 /*
1858  * Debugging only
1859  */
1860 DB_SHOW_COMMAND(object, vm_object_print_static)
1861 {
1862         /* XXX convert args. */
1863         vm_object_t object = (vm_object_t)addr;
1864         boolean_t full = have_addr;
1865
1866         vm_page_t p;
1867
1868         /* XXX count is an (unused) arg.  Avoid shadowing it. */
1869 #define count   was_count
1870
1871         int count;
1872
1873         if (object == NULL)
1874                 return;
1875
1876         db_iprintf(
1877             "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1878             object, (int)object->type, (u_long)object->size,
1879             object->resident_page_count, object->ref_count, object->flags);
1880         /*
1881          * XXX backing_object_offset is no longer printed here.
1882          */
1883         db_iprintf("\n");
1884
1885         if (!full)
1886                 return;
1887
1888         db_indent += 2;
1889         count = 0;
1890         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1891                 if (count == 0)
1892                         db_iprintf("memory:=");
1893                 else if (count == 6) {
1894                         db_printf("\n");
1895                         db_iprintf(" ...");
1896                         count = 0;
1897                 } else
1898                         db_printf(",");
1899                 count++;
1900
1901                 db_printf("(off=0x%lx,page=0x%lx)",
1902                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1903         }
1904         if (count != 0)
1905                 db_printf("\n");
1906         db_indent -= 2;
1907 }
1908
1909 /* XXX. */
1910 #undef count
1911
1912 /*
1913  * XXX need this non-static entry for calling from vm_map_print.
1914  *
1915  * Debugging only
1916  */
1917 void
1918 vm_object_print(/* db_expr_t */ long addr,
1919                 boolean_t have_addr,
1920                 /* db_expr_t */ long count,
1921                 char *modif)
1922 {
1923         vm_object_print_static(addr, have_addr, count, modif);
1924 }
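
/*
 * Example usage (assumes a DDB-enabled kernel): "show object <addr>"
 * prints the header of the given vm_object; since 'full' is taken from
 * have_addr above, supplying an address also dumps the resident page
 * runs.
 */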
1925
1926 /*
1927  * Debugging only
1928  */
1929 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1930 {
1931         struct vm_object_hash *hash;
1932         vm_object_t object;
1933         int nl = 0;
1934         int c;
1935         int n;
1936
1937         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1938                 hash = &vm_object_hash[n];
1939                 for (object = TAILQ_FIRST(&hash->list);
1940                                 object != NULL;
1941                                 object = TAILQ_NEXT(object, object_list)) {
1942                         vm_pindex_t idx, fidx;
1943                         vm_pindex_t osize;
1944                         vm_paddr_t pa = -1, padiff;
1945                         int rcount;
1946                         vm_page_t m;
1947
1948                         if (object->type == OBJT_MARKER)
1949                                 continue;
1950                         db_printf("new object: %p\n", (void *)object);
1951                         if (nl > 18) {
1952                                 c = cngetc();
1953                                 if (c != ' ')
1954                                         return;
1955                                 nl = 0;
1956                         }
1957                         nl++;
1958                         rcount = 0;
1959                         fidx = 0;
1960                         osize = object->size;
1961                         if (osize > 128)
1962                                 osize = 128;
1963                         for (idx = 0; idx < osize; idx++) {
1964                                 m = vm_page_lookup(object, idx);
1965                                 if (m == NULL) {
1966                                         if (rcount) {
1967                                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1968                                                         (long)fidx, rcount, (long)pa);
1969                                                 if (nl > 18) {
1970                                                         c = cngetc();
1971                                                         if (c != ' ')
1972                                                                 return;
1973                                                         nl = 0;
1974                                                 }
1975                                                 nl++;
1976                                                 rcount = 0;
1977                                         }
1978                                         continue;
1979                                 }
1980
1981                                 if (rcount &&
1982                                         (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1983                                         ++rcount;
1984                                         continue;
1985                                 }
1986                                 if (rcount) {
1987                                         padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1988                                         padiff >>= PAGE_SHIFT;
1989                                         padiff &= PQ_L2_MASK;
1990                                         if (padiff == 0) {
1991                                                 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1992                                                 ++rcount;
1993                                                 continue;
1994                                         }
1995                                         db_printf(" index(%ld)run(%d)pa(0x%lx)",
1996                                                 (long)fidx, rcount, (long)pa);
1997                                         db_printf("pd(%ld)\n", (long)padiff);
1998                                         if (nl > 18) {
1999                                                 c = cngetc();
2000                                                 if (c != ' ')
2001                                                         return;
2002                                                 nl = 0;
2003                                         }
2004                                         nl++;
2005                                 }
2006                                 fidx = idx;
2007                                 pa = VM_PAGE_TO_PHYS(m);
2008                                 rcount = 1;
2009                         }
2010                         if (rcount) {
2011                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2012                                         (long)fidx, rcount, (long)pa);
2013                                 if (nl > 18) {
2014                                         c = cngetc();
2015                                         if (c != ' ')
2016                                                 return;
2017                                         nl = 0;
2018                                 }
2019                                 nl++;
2020                         }
2021                 }
2022         }
2023 }
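
/*
 * Example usage (assumes a DDB-enabled kernel): "show vmopag" walks the
 * objects on the vm_object hash lists (scanning at most the first 128
 * page indices of each) and prints contiguous physical runs of resident
 * pages as index(start)run(count)pa(physaddr) records.  Output pauses
 * roughly every 18 lines; press space to continue, any other key aborts.
 */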
2024 #endif /* DDB */