sys/vm/vm_object.c
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62
63 /*
64  *      Virtual memory object module.
65  */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>           /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92
93 #include <vm/vm_page2.h>
94
95 #include <machine/specialreg.h>
96
97 #define EASY_SCAN_FACTOR        8
98
99 static void     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100                                              int pagerflags);
101 static void     vm_object_lock_init(vm_object_t);
102
103 /*
104  *      Virtual memory objects maintain the actual data
105  *      associated with allocated virtual memory.  A given
106  *      page of memory exists within exactly one object.
107  *
108  *      An object is only deallocated when all "references"
109  *      are given up.  Only one "reference" to a given
110  *      region of an object should be writeable.
111  *
112  *      Associated with each object is a list of all resident
113  *      memory pages belonging to that object; this list is
114  *      maintained by the "vm_page" module, and locked by the object's
115  *      lock.
116  *
117  *      Each object also records a "pager" routine which is
118  *      used to retrieve (and store) pages to the proper backing
119  *      storage.  In addition, objects may be backed by other
120  *      objects from which they were virtual-copied.
121  *
122  *      The only items within the object structure which are
123  *      modified after time of creation are:
124  *              reference count         locked by object's lock
125  *              pager routine           locked by object's lock
126  *
127  */
128
129 struct vm_object kernel_object;
130
131 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
132
133 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
134
135 #define VMOBJ_HASH_PRIME1       66555444443333333ULL
136 #define VMOBJ_HASH_PRIME2       989042931893ULL
137
138 int vm_object_debug;
139 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
140
141 static __inline
142 struct vm_object_hash *
143 vmobj_hash(vm_object_t obj)
144 {
145         uintptr_t hash1;
146         uintptr_t hash2;
147
148         hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
149         hash1 %= VMOBJ_HASH_PRIME1;
150         hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
151         hash2 %= VMOBJ_HASH_PRIME2;
152         return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
153 }
154
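/*
 * Usage sketch (hypothetical, illustrative only): walking one bucket of
 * the global object hash under its token, the same way
 * _vm_object_allocate() and vm_object_terminate() manipulate the bucket
 * lists below.
 */
#if 0
static void
example_scan_bucket(vm_object_t obj)
{
        struct vm_object_hash *hash;
        vm_object_t scan;

        hash = vmobj_hash(obj);
        lwkt_gettoken(&hash->token);
        TAILQ_FOREACH(scan, &hash->list, object_entry) {
                /* inspect scan while the bucket is interlocked */
        }
        lwkt_reltoken(&hash->token);
}
#endif
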
155 #if defined(DEBUG_LOCKS)
156
157 #define vm_object_vndeallocate(obj, vpp)        \
158                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
159
160 /*
161  * Debug helper to track hold/drop/ref/deallocate calls.
162  */
163 static void
164 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
165 {
166         int i;
167
168         i = atomic_fetchadd_int(&obj->debug_index, 1);
169         i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
170         ksnprintf(obj->debug_hold_thrs[i],
171                   sizeof(obj->debug_hold_thrs[i]),
172                   "%c%d:(%d):%s",
173                   (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
174                   (curthread->td_proc ? curthread->td_proc->p_pid : -1),
175                   obj->ref_count,
176                   curthread->td_comm);
177         obj->debug_hold_file[i] = file;
178         obj->debug_hold_line[i] = line;
179 #if 0
180         /* Uncomment for debugging obj refs/derefs in reproducible cases */
181         if (strcmp(curthread->td_comm, "sshd") == 0) {
182                 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
183                         (curthread->td_proc ? curthread->td_proc->p_pid : -1),
184                         obj, obj->ref_count, addrem, file, line);
185         }
186 #endif
187 }
188
189 #endif
190
191 /*
192  * Misc low level routines
193  */
194 static void
195 vm_object_lock_init(vm_object_t obj)
196 {
197 #if defined(DEBUG_LOCKS)
198         int i;
199
200         obj->debug_index = 0;
201         for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
202                 obj->debug_hold_thrs[i][0] = 0;
203                 obj->debug_hold_file[i] = NULL;
204                 obj->debug_hold_line[i] = 0;
205         }
206 #endif
207 }
208
209 void
210 vm_object_lock_swap(void)
211 {
212         lwkt_token_swap();
213 }
214
215 void
216 vm_object_lock(vm_object_t obj)
217 {
218         lwkt_gettoken(&obj->token);
219 }
220
221 /*
222  * Returns TRUE on success
223  */
224 static int
225 vm_object_lock_try(vm_object_t obj)
226 {
227         return(lwkt_trytoken(&obj->token));
228 }
229
230 void
231 vm_object_lock_shared(vm_object_t obj)
232 {
233         lwkt_gettoken_shared(&obj->token);
234 }
235
236 void
237 vm_object_unlock(vm_object_t obj)
238 {
239         lwkt_reltoken(&obj->token);
240 }
241
242 void
243 vm_object_upgrade(vm_object_t obj)
244 {
245         lwkt_reltoken(&obj->token);
246         lwkt_gettoken(&obj->token);
247 }
248
249 void
250 vm_object_downgrade(vm_object_t obj)
251 {
252         lwkt_reltoken(&obj->token);
253         lwkt_gettoken_shared(&obj->token);
254 }
255
256 static __inline void
257 vm_object_assert_held(vm_object_t obj)
258 {
259         ASSERT_LWKT_TOKEN_HELD(&obj->token);
260 }
261
262 int
263 vm_quickcolor(void)
264 {
265         globaldata_t gd = mycpu;
266         int pg_color;
267
268         pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
269         pg_color += gd->gd_quick_color;
270         gd->gd_quick_color += PQ_PRIME2;
271
272         return pg_color;
273 }
274
275 void
276 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
277 {
278         KKASSERT(obj != NULL);
279
280         /*
281          * Object must be held (object allocation is stable due to the caller's
282          * context, typically already holding the token on a parent object)
283          * prior to potentially blocking on the lock, otherwise the object
284          * can get ripped away from us.
285          */
286         refcount_acquire(&obj->hold_count);
287         vm_object_lock(obj);
288
289 #if defined(DEBUG_LOCKS)
290         debugvm_object_add(obj, file, line, 1);
291 #endif
292 }
293
294 int
295 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
296 {
297         KKASSERT(obj != NULL);
298
299         /*
300          * Object must be held (object allocation is stable due to the caller's
301          * context, typically already holding the token on a parent object)
302          * prior to potentially blocking on the lock, otherwise the object
303          * can get ripped away from us.
304          */
305         refcount_acquire(&obj->hold_count);
306         if (vm_object_lock_try(obj) == 0) {
307                 if (refcount_release(&obj->hold_count)) {
308                         if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
309                                 kfree(obj, M_VM_OBJECT);
310                 }
311                 return(0);
312         }
313
314 #if defined(DEBUG_LOCKS)
315         debugvm_object_add(obj, file, line, 1);
316 #endif
317         return(1);
318 }
319
320 void
321 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
322 {
323         KKASSERT(obj != NULL);
324
325         /*
326          * Object must be held (object allocation is stable due to the caller's
327          * context, typically already holding the token on a parent object)
328          * prior to potentially blocking on the lock, otherwise the object
329          * can get ripped away from us.
330          */
331         refcount_acquire(&obj->hold_count);
332         vm_object_lock_shared(obj);
333
334 #if defined(DEBUG_LOCKS)
335         debugvm_object_add(obj, file, line, 1);
336 #endif
337 }
338
339 /*
340  * Drop the token and hold_count on the object.
341  *
342  * WARNING! Token might be shared.
343  */
344 void
345 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
346 {
347         if (obj == NULL)
348                 return;
349
350         /*
351          * No new holders should be possible once we drop hold_count 1->0 as
352          * there is no longer any way to reference the object.
353          */
354         KKASSERT(obj->hold_count > 0);
355         if (refcount_release(&obj->hold_count)) {
356 #if defined(DEBUG_LOCKS)
357                 debugvm_object_add(obj, file, line, -1);
358 #endif
359
360                 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
361                         vm_object_unlock(obj);
362                         kfree(obj, M_VM_OBJECT);
363                 } else {
364                         vm_object_unlock(obj);
365                 }
366         } else {
367 #if defined(DEBUG_LOCKS)
368                 debugvm_object_add(obj, file, line, -1);
369 #endif
370                 vm_object_unlock(obj);
371         }
372 }
373
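/*
 * Usage sketch (hypothetical, illustrative only): the canonical
 * hold/drop pattern described above.  A caller with a stable pointer to
 * the object (e.g. via a held parent object) holds it before potentially
 * blocking on the token and drops it when done.
 */
#if 0
static void
example_hold_drop(vm_object_t obj)
{
        vm_object_hold(obj);            /* hold_count + exclusive token */
        /* ... operate on the object ... */
        vm_object_drop(obj);            /* release token, drop hold_count */
}
#endif
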
374 /*
375  * Initialize a freshly allocated object, returning a held object.
376  *
377  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
378  *
379  * No requirements.
380  */
381 void
382 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
383                     const char *ident)
384 {
385         struct vm_object_hash *hash;
386
387         RB_INIT(&object->rb_memq);
388         lwkt_token_init(&object->token, ident);
389
390         TAILQ_INIT(&object->backing_list);
391         lockinit(&object->backing_lk, "baclk", 0, 0);
392
393         object->type = type;
394         object->size = size;
395         object->ref_count = 1;
396         object->memattr = VM_MEMATTR_DEFAULT;
397         object->hold_count = 0;
398         object->flags = 0;
399         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
400                 vm_object_set_flag(object, OBJ_ONEMAPPING);
401         object->paging_in_progress = 0;
402         object->resident_page_count = 0;
403         /* cpu localization twist */
404         object->pg_color = vm_quickcolor();
405         object->handle = NULL;
406
407         atomic_add_int(&object->generation, 1);
408         object->swblock_count = 0;
409         RB_INIT(&object->swblock_root);
410         vm_object_lock_init(object);
411         pmap_object_init(object);
412
413         vm_object_hold(object);
414
415         hash = vmobj_hash(object);
416         lwkt_gettoken(&hash->token);
417         TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
418         lwkt_reltoken(&hash->token);
419 }
420
421 /*
422  * Initialize a VM object.
423  */
424 void
425 vm_object_init(vm_object_t object, vm_pindex_t size)
426 {
427         _vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
428         vm_object_drop(object);
429 }
430
431 /*
432  * Initialize the VM objects module.
433  *
434  * Called from the low level boot code only.  Note that this occurs before
435  * kmalloc is initialized so we cannot allocate any VM objects.
436  */
437 void
438 vm_object_init1(void)
439 {
440         int i;
441
442         for (i = 0; i < VMOBJ_HSIZE; ++i) {
443                 TAILQ_INIT(&vm_object_hash[i].list);
444                 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
445         }
446
447         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
448                             &kernel_object, "kobj");
449         vm_object_drop(&kernel_object);
450 }
451
452 void
453 vm_object_init2(void)
454 {
455         kmalloc_set_unlimited(M_VM_OBJECT);
456 }
457
458 /*
459  * Allocate and return a new object of the specified type and size.
460  *
461  * No requirements.
462  */
463 vm_object_t
464 vm_object_allocate(objtype_t type, vm_pindex_t size)
465 {
466         vm_object_t obj;
467
468         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
469         _vm_object_allocate(type, size, obj, "vmobj");
470         vm_object_drop(obj);
471
472         return (obj);
473 }
474
475 /*
476  * This version returns a held object, allowing further atomic initialization
477  * of the object.
478  */
479 vm_object_t
480 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
481 {
482         vm_object_t obj;
483
484         obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
485         _vm_object_allocate(type, size, obj, "vmobj");
486
487         return (obj);
488 }
489
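/*
 * Usage sketch (hypothetical, illustrative only): allocating an
 * anonymous object.  vm_object_allocate() returns the object unheld,
 * while vm_object_allocate_hold() keeps it held so the caller can
 * complete initialization atomically before dropping it.
 */
#if 0
static vm_object_t
example_alloc_anon(vm_pindex_t npages)
{
        vm_object_t obj;

        obj = vm_object_allocate_hold(OBJT_DEFAULT, npages);
        /* ... finish setup while still held ... */
        vm_object_drop(obj);

        return (obj);
}
#endif
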
490 /*
491  * Add an additional reference to a vm_object.  The object must already be
492  * held.  The original non-lock version is no longer supported.  The object
493  * must NOT be chain locked by anyone at the time the reference is added.
494  *
495  * The object must be held, but may be held shared if desired (hence why
496  * we use an atomic op).
497  */
498 void
499 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
500 {
501         KKASSERT(object != NULL);
502         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
503         atomic_add_int(&object->ref_count, 1);
504         if (object->type == OBJT_VNODE) {
505                 vref(object->handle);
506                 /* XXX what if the vnode is being destroyed? */
507         }
508 #if defined(DEBUG_LOCKS)
509         debugvm_object_add(object, file, line, 1);
510 #endif
511 }
512
513 /*
514  * This version is only allowed in situations where the caller
515  * already knows that the object is deterministically referenced
516  * (usually because it's taken from a ref'd vnode, or during a map_entry
517  * replication).
518  */
519 void
520 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
521 {
522         KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
523         atomic_add_int(&object->ref_count, 1);
524         if (object->type == OBJT_VNODE)
525                 vref(object->handle);
526 #if defined(DEBUG_LOCKS)
527         debugvm_object_add(object, file, line, 1);
528 #endif
529 }
530
531 /*
532  * Dereference an object and its underlying vnode.  The object may be
533  * held shared.  On return the object will remain held.
534  *
535  * This function may return a vnode in *vpp which the caller must release
536  * after the caller drops its own lock.  If vpp is NULL, we assume that
537  * the caller was holding an exclusive lock on the object and we vrele()
538  * the vp ourselves.
539  */
540 static void
541 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
542                                    VMOBJDBARGS)
543 {
544         struct vnode *vp = (struct vnode *) object->handle;
545         int count;
546
547         KASSERT(object->type == OBJT_VNODE,
548             ("vm_object_vndeallocate: not a vnode object"));
549         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
550         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
551 #ifdef INVARIANTS
552         if (object->ref_count == 0) {
553                 vprint("vm_object_vndeallocate", vp);
554                 panic("vm_object_vndeallocate: bad object reference count");
555         }
556 #endif
557         count = object->ref_count;
558         cpu_ccfence();
559         for (;;) {
560                 if (count == 1) {
561                         vm_object_upgrade(object);
562                         if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
563                                 vclrflags(vp, VTEXT);
564                                 break;
565                         }
566                 } else {
567                         if (atomic_fcmpset_int(&object->ref_count,
568                                                &count, count - 1)) {
569                                 break;
570                         }
571                 }
572                 cpu_pause();
573                 /* retry */
574         }
575 #if defined(DEBUG_LOCKS)
576         debugvm_object_add(object, file, line, -1);
577 #endif
578
579         /*
580          * vrele or return the vp to vrele.  We can only safely vrele(vp)
581          * if the object was locked exclusively.  But there are two races
582          * here.
583          *
584          * We had to upgrade the object above to safely clear VTEXT
585          * but the alternative path where the shared lock is retained
586          * can STILL race to 0 in other paths and cause our own vrele()
587          * to terminate the vnode.  We can't allow that if the VM object
588          * is still locked shared.
589          */
590         if (vpp)
591                 *vpp = vp;
592         else
593                 vrele(vp);
594 }
595
596 /*
597  * Release a reference to the specified object, gained either through a
598  * vm_object_allocate or a vm_object_reference call.  When all references
599  * are gone, storage associated with this object may be relinquished.
600  *
601  * The caller does not have to hold the object locked but must have control
602  * over the reference in question in order to guarantee that the object
603  * does not get ripped out from under us.
604  *
605  * XXX Currently all deallocations require an exclusive lock.
606  */
607 void
608 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
609 {
610         struct vnode *vp;
611         int count;
612
613         if (object == NULL)
614                 return;
615
616         count = object->ref_count;
617         cpu_ccfence();
618         for (;;) {
619                 /*
620                  * If decrementing the count enters into special handling
621                  * territory (0, 1, or 2) we have to do it the hard way.
622                  * Fortunate though, objects with only a few refs like this
623                  * Fortunately, objects with only a few refs like this
624                  *
625                  * For vnode objects we only care about 1->0 transitions.
626                  */
627                 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
628 #if defined(DEBUG_LOCKS)
629                         debugvm_object_add(object, file, line, 0);
630 #endif
631                         vm_object_hold(object);
632                         vm_object_deallocate_locked(object);
633                         vm_object_drop(object);
634                         break;
635                 }
636
637                 /*
638                  * Try to decrement ref_count without acquiring a hold on
639                  * the object.  This is particularly important for the exec*()
640                  * and exit*() code paths because the program binary may
641                  * have a great deal of sharing and an exclusive lock will
642                  * crowbar performance in those circumstances.
643                  */
644                 if (object->type == OBJT_VNODE) {
645                         vp = (struct vnode *)object->handle;
646                         if (atomic_fcmpset_int(&object->ref_count,
647                                                &count, count - 1)) {
648 #if defined(DEBUG_LOCKS)
649                                 debugvm_object_add(object, file, line, -1);
650 #endif
651
652                                 vrele(vp);
653                                 break;
654                         }
655                         /* retry */
656                 } else {
657                         if (atomic_fcmpset_int(&object->ref_count,
658                                                &count, count - 1)) {
659 #if defined(DEBUG_LOCKS)
660                                 debugvm_object_add(object, file, line, -1);
661 #endif
662                                 break;
663                         }
664                         /* retry */
665                 }
666                 cpu_pause();
667                 /* retry */
668         }
669 }
670
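/*
 * Usage sketch (hypothetical, illustrative only): pairing an added
 * reference with a later deallocation.  The reference is added while
 * the object is held (a shared hold suffices), and the matching
 * vm_object_deallocate() can be issued later without the object held
 * as long as the caller still controls that reference.
 */
#if 0
static void
example_ref_unref(vm_object_t obj)
{
        vm_object_hold_shared(obj);
        vm_object_reference_locked(obj);
        vm_object_drop(obj);

        /* ... use the reference ... */

        vm_object_deallocate(obj);
}
#endif
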
671 void
672 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
673 {
674         /*
675          * Degenerate case
676          */
677         if (object == NULL)
678                 return;
679
680         /*
681          * vnode case, caller either locked the object exclusively
682          * or this is a recursion with must_drop != 0 and the vnode
683          * object will be locked shared.
684          *
685          * If locked shared we have to drop the object before we can
686          * call vrele() or risk a shared/exclusive livelock.
687          */
688         if (object->type == OBJT_VNODE) {
689                 ASSERT_LWKT_TOKEN_HELD(&object->token);
690                 vm_object_vndeallocate(object, NULL);
691                 return;
692         }
693         ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
694
695         /*
696          * Normal case (object is locked exclusively)
697          */
698         if (object->ref_count == 0) {
699                 panic("vm_object_deallocate: object deallocated "
700                       "too many times: %d", object->type);
701         }
702         if (object->ref_count > 2) {
703                 atomic_add_int(&object->ref_count, -1);
704 #if defined(DEBUG_LOCKS)
705                 debugvm_object_add(object, file, line, -1);
706 #endif
707                 return;
708         }
709
710         /*
711          * Drop the ref and handle termination on the 1->0 transition.
712          * We may have blocked above so we have to recheck.
713          */
714         KKASSERT(object->ref_count != 0);
715         if (object->ref_count >= 2) {
716                 atomic_add_int(&object->ref_count, -1);
717 #if defined(DEBUG_LOCKS)
718                 debugvm_object_add(object, file, line, -1);
719 #endif
720                 return;
721         }
722
723         atomic_add_int(&object->ref_count, -1);
724         if ((object->flags & OBJ_DEAD) == 0)
725                 vm_object_terminate(object);
726 }
727
728 /*
729  * Destroy the specified object, freeing up related resources.
730  *
731  * The object must have zero references.
732  *
733  * The object must be held.  The caller is responsible for dropping the object
734  * after terminate returns.  Terminate does NOT drop the object.
735  */
736 static int vm_object_terminate_callback(vm_page_t p, void *data);
737
738 void
739 vm_object_terminate(vm_object_t object)
740 {
741         struct rb_vm_page_scan_info info;
742         struct vm_object_hash *hash;
743
744         /*
745          * Make sure no one uses us.  Once we set OBJ_DEAD we should be
746          * able to safely block.
747          */
748         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
749         KKASSERT((object->flags & OBJ_DEAD) == 0);
750         vm_object_set_flag(object, OBJ_DEAD);
751
752         /*
753          * Wait for the pageout daemon to be done with the object
754          */
755         vm_object_pip_wait(object, "objtrm1");
756
757         KASSERT(!object->paging_in_progress,
758                 ("vm_object_terminate: pageout in progress"));
759
760         /*
761          * Clean and free the pages, as appropriate. All references to the
762          * object are gone, so we don't need to lock it.
763          */
764         if (object->type == OBJT_VNODE) {
765                 struct vnode *vp;
766
767                 /*
768                  * Clean pages and flush buffers.
769                  *
770                  * NOTE!  TMPFS buffer flushes do not typically flush the
771                  *        actual page to swap as this would be highly
772                  *        inefficient, and normal filesystems usually wrap
773                  *        page flushes with buffer cache buffers.
774                  *
775                  *        To deal with this we have to call vinvalbuf() both
776                  *        before and after the vm_object_page_clean().
777                  */
778                 vp = (struct vnode *) object->handle;
779                 vinvalbuf(vp, V_SAVE, 0, 0);
780                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
781                 vinvalbuf(vp, V_SAVE, 0, 0);
782         }
783
784         /*
785          * Wait for any I/O to complete, after which there had better not
786          * be any references left on the object.
787          */
788         vm_object_pip_wait(object, "objtrm2");
789
790         if (object->ref_count != 0) {
791                 panic("vm_object_terminate: object with references, "
792                       "ref_count=%d", object->ref_count);
793         }
794
795         /*
796          * Cleanup any shared pmaps associated with this object.
797          */
798         pmap_object_free(object);
799
800         /*
801          * Now free any remaining pages. For internal objects, this also
802          * removes them from paging queues. Don't free wired pages, just
803          * remove them from the object.
804          */
805         info.count = 0;
806         info.object = object;
807         do {
808                 info.error = 0;
809                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
810                                         vm_object_terminate_callback, &info);
811         } while (info.error);
812
813         /*
814          * Let the pager know object is dead.
815          */
816         vm_pager_deallocate(object);
817
818         /*
819          * Wait for the object hold count to hit 1, clean out pages as
820          * we go.  vmobj_token interlocks any race conditions that might
821          * pick the object up from the vm_object_list after we have cleared
822          * rb_memq.
823          */
824         for (;;) {
825                 if (RB_ROOT(&object->rb_memq) == NULL)
826                         break;
827                 kprintf("vm_object_terminate: Warning, object %p "
828                         "still has %ld pages\n",
829                         object, object->resident_page_count);
830                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
831                                         vm_object_terminate_callback, &info);
832         }
833
834         /*
835          * There had better not be any pages left
836          */
837         KKASSERT(object->resident_page_count == 0);
838
839         /*
840          * Remove the object from the global object list.
841          */
842         hash = vmobj_hash(object);
843         lwkt_gettoken(&hash->token);
844         TAILQ_REMOVE(&hash->list, object, object_entry);
845         lwkt_reltoken(&hash->token);
846
847         if (object->ref_count != 0) {
848                 panic("vm_object_terminate2: object with references, "
849                       "ref_count=%d", object->ref_count);
850         }
851
852         /*
853          * NOTE: The object hold_count is at least 1, so we cannot kfree()
854          *       the object here.  See vm_object_drop().
855          */
856 }
857
858 /*
859  * The caller must hold the object.
860  *
861  * NOTE: In PMAP_ADVANCED mode it is possible for vm_page's to remain flagged
862  *       PG_MAPPED or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
863  *       is called, due to normal pmap operations.  This is because only
864  *       global pmap operations on the vm_page can clear the bits and not
865  *       just local operations on individual pmaps.
866  *
867  *       Most interactions that necessitate the clearing of these bits
868  *       proactively call vm_page_protect(), and we must do so here as well.
869  */
870 static int
871 vm_object_terminate_callback(vm_page_t p, void *data)
872 {
873         struct rb_vm_page_scan_info *info = data;
874         vm_object_t object;
875
876         object = p->object;
877         KKASSERT(object == info->object);
878         if (vm_page_busy_try(p, TRUE)) {
879                 vm_page_sleep_busy(p, TRUE, "vmotrm");
880                 info->error = 1;
881                 return 0;
882         }
883         if (object != p->object) {
884                 /* XXX remove once we determine it can't happen */
885                 kprintf("vm_object_terminate: Warning: Encountered "
886                         "busied page %p on queue %d\n", p, p->queue);
887                 vm_page_wakeup(p);
888                 info->error = 1;
889         } else if (p->wire_count == 0) {
890                 /*
891                  * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
892                  */
893                 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
894                         vm_page_protect(p, VM_PROT_NONE);
895                 vm_page_free(p);
896                 mycpu->gd_cnt.v_pfree++;
897         } else {
898                 if (p->queue != PQ_NONE) {
899                         kprintf("vm_object_terminate: Warning: Encountered "
900                                 "wired page %p on queue %d\n", p, p->queue);
901                         if (vm_object_debug > 0) {
902                                 --vm_object_debug;
903                                 print_backtrace(10);
904                         }
905                 }
906                 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
907                         vm_page_protect(p, VM_PROT_NONE);
908                 vm_page_remove(p);
909                 vm_page_wakeup(p);
910         }
911
912         /*
913          * Must be at end to avoid SMP races, caller holds object token
914          */
915         if ((++info->count & 63) == 0)
916                 lwkt_user_yield();
917         return(0);
918 }
919
920 /*
921  * Clean all dirty pages in the specified range of object.  Leaves page
922  * on whatever queue it is currently on.   If NOSYNC is set then do not
923  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
924  * leaving the object dirty.
925  *
926  * When stuffing pages asynchronously, allow clustering.  XXX we need a
927  * synchronous clustering mode implementation.
928  *
929  * Odd semantics: if start == end, we clean everything.
930  *
931  * The object must be locked? XXX
932  */
933 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
934 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
935
936 void
937 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
938                      int flags)
939 {
940         struct rb_vm_page_scan_info info;
941         struct vnode *vp;
942         int wholescan;
943         int pagerflags;
944         int generation;
945
946         vm_object_hold(object);
947         if (object->type != OBJT_VNODE ||
948             (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
949                 vm_object_drop(object);
950                 return;
951         }
952
953         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
954                         VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
955         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
956
957         vp = object->handle;
958
959         /*
960          * Interlock other major object operations.  This allows us to
961          * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
962          */
963         vm_object_set_flag(object, OBJ_CLEANING);
964
965         /*
966          * Handle 'entire object' case
967          */
968         info.start_pindex = start;
969         if (end == 0) {
970                 info.end_pindex = object->size - 1;
971         } else {
972                 info.end_pindex = end - 1;
973         }
974         wholescan = (start == 0 && info.end_pindex == object->size - 1);
975         info.limit = flags;
976         info.pagerflags = pagerflags;
977         info.object = object;
978
979         /*
980          * If cleaning the entire object do a pass to mark the pages read-only.
981          * If everything worked out ok, clear OBJ_WRITEABLE and
982          * OBJ_MIGHTBEDIRTY.
983          */
984         if (wholescan) {
985                 info.error = 0;
986                 info.count = 0;
987                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
988                                         vm_object_page_clean_pass1, &info);
989                 if (info.error == 0) {
990                         vm_object_clear_flag(object,
991                                              OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
992                         if (object->type == OBJT_VNODE &&
993                             (vp = (struct vnode *)object->handle) != NULL) {
994                                 /*
995                                  * Use new-style interface to clear VISDIRTY
996                                  * because the vnode is not necessarily removed
997                                  * from the syncer list(s) as often as it was
998                                  * under the old interface, which can leave
999                                  * the vnode on the syncer list after reclaim.
1000                                  */
1001                                 vclrobjdirty(vp);
1002                         }
1003                 }
1004         }
1005
1006         /*
1007          * Do a pass to clean all the dirty pages we find.
1008          */
1009         do {
1010                 info.error = 0;
1011                 info.count = 0;
1012                 generation = object->generation;
1013                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1014                                         vm_object_page_clean_pass2, &info);
1015         } while (info.error || generation != object->generation);
1016
1017         vm_object_clear_flag(object, OBJ_CLEANING);
1018         vm_object_drop(object);
1019 }
1020
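/*
 * Usage sketch (hypothetical, mirrors the call in vm_object_terminate()
 * above): synchronously flushing all dirty pages of a vnode object.
 * Passing start == end == 0 selects the entire object.
 */
#if 0
static void
example_flush_vnode_object(vm_object_t obj)
{
        if (obj->type == OBJT_VNODE)
                vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
}
#endif
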
1021 /*
1022  * The caller must hold the object.
1023  */
1024 static
1025 int
1026 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1027 {
1028         struct rb_vm_page_scan_info *info = data;
1029
1030         KKASSERT(p->object == info->object);
1031
1032         vm_page_flag_set(p, PG_CLEANCHK);
1033         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1034                 info->error = 1;
1035         } else if (vm_page_busy_try(p, FALSE)) {
1036                 info->error = 1;
1037         } else {
1038                 KKASSERT(p->object == info->object);
1039                 vm_page_protect(p, VM_PROT_READ);
1040                 vm_page_wakeup(p);
1041         }
1042
1043         /*
1044          * Must be at end to avoid SMP races, caller holds object token
1045          */
1046         if ((++info->count & 63) == 0)
1047                 lwkt_user_yield();
1048         return(0);
1049 }
1050
1051 /*
1052  * The caller must hold the object
1053  */
1054 static
1055 int
1056 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1057 {
1058         struct rb_vm_page_scan_info *info = data;
1059         int generation;
1060
1061         KKASSERT(p->object == info->object);
1062
1063         /*
1064          * Do not mess with pages that were inserted after we started
1065          * the cleaning pass.
1066          */
1067         if ((p->flags & PG_CLEANCHK) == 0)
1068                 goto done;
1069
1070         generation = info->object->generation;
1071
1072         if (vm_page_busy_try(p, TRUE)) {
1073                 vm_page_sleep_busy(p, TRUE, "vpcwai");
1074                 info->error = 1;
1075                 goto done;
1076         }
1077
1078         KKASSERT(p->object == info->object &&
1079                  info->object->generation == generation);
1080
1081         /*
1082          * Before wasting time traversing the pmaps, check for trivial
1083          * cases where the page cannot be dirty.
1084          */
1085         if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1086                 KKASSERT((p->dirty & p->valid) == 0 &&
1087                          (p->flags & PG_NEED_COMMIT) == 0);
1088                 vm_page_wakeup(p);
1089                 goto done;
1090         }
1091
1092         /*
1093          * Check whether the page is dirty or not.  The page has been set
1094          * to be read-only so the check will not race a user dirtying the
1095          * page.
1096          */
1097         vm_page_test_dirty(p);
1098         if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1099                 vm_page_flag_clear(p, PG_CLEANCHK);
1100                 vm_page_wakeup(p);
1101                 goto done;
1102         }
1103
1104         /*
1105          * If we have been asked to skip nosync pages and this is a
1106          * nosync page, skip it.  Note that the object flags were
1107          * not cleared in this case (because pass1 will have returned an
1108          * error), so we do not have to set them.
1109          */
1110         if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1111                 vm_page_flag_clear(p, PG_CLEANCHK);
1112                 vm_page_wakeup(p);
1113                 goto done;
1114         }
1115
1116         /*
1117          * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1118          * the pages that get successfully flushed.  Set info->error if
1119          * we raced an object modification.
1120          */
1121         vm_object_page_collect_flush(info->object, p, info->pagerflags);
1122         /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1123
1124         /*
1125          * Must be at end to avoid SMP races, caller holds object token
1126          */
1127 done:
1128         if ((++info->count & 63) == 0)
1129                 lwkt_user_yield();
1130         return(0);
1131 }
1132
1133 /*
1134  * Collect the specified page and nearby pages and flush them out.
1135  * The number of pages flushed is returned.  The passed page is busied
1136  * by the caller and we are responsible for its disposition.
1137  *
1138  * The caller must hold the object.
1139  */
1140 static void
1141 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1142 {
1143         int error;
1144         int is;
1145         int ib;
1146         int i;
1147         int page_base;
1148         vm_pindex_t pi;
1149         vm_page_t ma[BLIST_MAX_ALLOC];
1150
1151         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1152
1153         pi = p->pindex;
1154         page_base = pi % BLIST_MAX_ALLOC;
1155         ma[page_base] = p;
1156         ib = page_base - 1;
1157         is = page_base + 1;
1158
1159         while (ib >= 0) {
1160                 vm_page_t tp;
1161
1162                 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1163                                              TRUE, &error);
1164                 if (error)
1165                         break;
1166                 if (tp == NULL)
1167                         break;
1168                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1169                     (tp->flags & PG_CLEANCHK) == 0) {
1170                         vm_page_wakeup(tp);
1171                         break;
1172                 }
1173                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1174                         vm_page_flag_clear(tp, PG_CLEANCHK);
1175                         vm_page_wakeup(tp);
1176                         break;
1177                 }
1178                 vm_page_test_dirty(tp);
1179                 if ((tp->dirty & tp->valid) == 0 &&
1180                     (tp->flags & PG_NEED_COMMIT) == 0) {
1181                         vm_page_flag_clear(tp, PG_CLEANCHK);
1182                         vm_page_wakeup(tp);
1183                         break;
1184                 }
1185                 ma[ib] = tp;
1186                 --ib;
1187         }
1188         ++ib;   /* fixup */
1189
1190         while (is < BLIST_MAX_ALLOC &&
1191                pi - page_base + is < object->size) {
1192                 vm_page_t tp;
1193
1194                 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1195                                              TRUE, &error);
1196                 if (error)
1197                         break;
1198                 if (tp == NULL)
1199                         break;
1200                 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1201                     (tp->flags & PG_CLEANCHK) == 0) {
1202                         vm_page_wakeup(tp);
1203                         break;
1204                 }
1205                 if ((tp->queue - tp->pc) == PQ_CACHE) {
1206                         vm_page_flag_clear(tp, PG_CLEANCHK);
1207                         vm_page_wakeup(tp);
1208                         break;
1209                 }
1210                 vm_page_test_dirty(tp);
1211                 if ((tp->dirty & tp->valid) == 0 &&
1212                     (tp->flags & PG_NEED_COMMIT) == 0) {
1213                         vm_page_flag_clear(tp, PG_CLEANCHK);
1214                         vm_page_wakeup(tp);
1215                         break;
1216                 }
1217                 ma[is] = tp;
1218                 ++is;
1219         }
1220
1221         /*
1222          * All pages in the ma[] array are busied now
1223          */
1224         for (i = ib; i < is; ++i) {
1225                 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1226                 vm_page_hold(ma[i]);    /* XXX need this any more? */
1227         }
1228         vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1229         for (i = ib; i < is; ++i)       /* XXX need this any more? */
1230                 vm_page_unhold(ma[i]);
1231 }
1232
1233 /*
1234  * Implements the madvise function at the object/page level.
1235  *
1236  * MADV_WILLNEED        (any object)
1237  *
1238  *      Activate the specified pages if they are resident.
1239  *
1240  * MADV_DONTNEED        (any object)
1241  *
1242  *      Deactivate the specified pages if they are resident.
1243  *
1244  * MADV_FREE    (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1245  *
1246  *      Deactivate and clean the specified pages if they are
1247  *      resident.  This permits the process to reuse the pages
1248  *      without faulting or the kernel to reclaim the pages
1249  *      without I/O.
1250  *
1251  * No requirements.
1252  */
1253 void
1254 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1255                   vm_pindex_t count, int advise)
1256 {
1257         vm_pindex_t end;
1258         vm_page_t m;
1259         int error;
1260
1261         if (object == NULL)
1262                 return;
1263
1264         end = pindex + count;
1265
1266         vm_object_hold(object);
1267
1268         /*
1269          * Locate and adjust resident pages.  This only applies to the
1270          * primary object in the mapping.
1271          */
1272         for (; pindex < end; pindex += 1) {
1273 relookup:
1274                 /*
1275                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1276                  * and those pages must be OBJ_ONEMAPPING.
1277                  */
1278                 if (advise == MADV_FREE) {
1279                         if ((object->type != OBJT_DEFAULT &&
1280                              object->type != OBJT_SWAP) ||
1281                             (object->flags & OBJ_ONEMAPPING) == 0) {
1282                                 continue;
1283                         }
1284                 }
1285
1286                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1287
1288                 if (error) {
1289                         vm_page_sleep_busy(m, TRUE, "madvpo");
1290                         goto relookup;
1291                 }
1292                 if (m == NULL) {
1293                         /*
1294                          * There may be swap even if there is no backing page
1295                          */
1296                         if (advise == MADV_FREE && object->type == OBJT_SWAP)
1297                                 swap_pager_freespace(object, pindex, 1);
1298                         continue;
1299                 }
1300
1301                 /*
1302                  * If the page is not in a normal active state, we skip it.
1303                  * If the page is not managed there are no page queues to
1304                  * mess with.  Things can break if we mess with pages in
1305                  * any of the below states.
1306                  */
1307                 if (m->wire_count ||
1308                     (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1309                                  PG_NEED_COMMIT)) ||
1310                     m->valid != VM_PAGE_BITS_ALL
1311                 ) {
1312                         vm_page_wakeup(m);
1313                         continue;
1314                 }
1315
1316                 /*
1317                  * Theoretically once a page is known not to be busy, an
1318                  * interrupt cannot come along and rip it out from under us.
1319                  */
1320                 if (advise == MADV_WILLNEED) {
1321                         vm_page_activate(m);
1322                 } else if (advise == MADV_DONTNEED) {
1323                         vm_page_dontneed(m);
1324                 } else if (advise == MADV_FREE) {
1325                         /*
1326                          * Mark the page clean.  This will allow the page
1327                          * to be freed up by the system.  However, such pages
1328                          * are often reused quickly by malloc()/free()
1329                          * so we do not do anything that would cause
1330                          * a page fault if we can help it.
1331                          *
1332                          * Specifically, we do not try to actually free
1333                          * the page now nor do we try to put it in the
1334                          * cache (which would cause a page fault on reuse).
1335                          *
1336                          * But we do make the page as freeable as we
1337                          * can without actually taking the step of unmapping
1338                          * it.
1339                          */
1340                         pmap_clear_modify(m);
1341                         m->dirty = 0;
1342                         m->act_count = 0;
1343                         vm_page_dontneed(m);
1344                         if (object->type == OBJT_SWAP)
1345                                 swap_pager_freespace(object, pindex, 1);
1346                 }
1347                 vm_page_wakeup(m);
1348         }
1349         vm_object_drop(object);
1350 }
1351
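/*
 * Usage sketch (hypothetical, illustrative only): applying the advice
 * semantics documented above to a pindex range.  MADV_FREE only has an
 * effect on OBJT_DEFAULT/OBJT_SWAP objects flagged OBJ_ONEMAPPING.
 */
#if 0
static void
example_madvise_range(vm_object_t obj, vm_pindex_t pindex, vm_pindex_t npages)
{
        vm_object_madvise(obj, pindex, npages, MADV_WILLNEED);  /* activate */
        vm_object_madvise(obj, pindex, npages, MADV_DONTNEED);  /* deactivate */
        vm_object_madvise(obj, pindex, npages, MADV_FREE);       /* clean+deactivate */
}
#endif
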
1352 /*
1353  * Removes all physical pages in the specified object range from the
1354  * object's list of pages.
1355  *
1356  * No requirements.
1357  */
1358 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1359
1360 void
1361 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1362                       boolean_t clean_only)
1363 {
1364         struct rb_vm_page_scan_info info;
1365         int all;
1366
1367         /*
1368          * Degenerate cases and assertions
1369          */
1370         vm_object_hold(object);
1371         if (object == NULL ||
1372             (object->resident_page_count == 0 && object->swblock_count == 0)) {
1373                 vm_object_drop(object);
1374                 return;
1375         }
1376         KASSERT(object->type != OBJT_PHYS,
1377                 ("attempt to remove pages from a physical object"));
1378
1379         /*
1380          * Indicate that paging is occurring on the object
1381          */
1382         vm_object_pip_add(object, 1);
1383
1384         /*
1385          * Figure out the actual removal range and whether we are removing
1386          * the entire contents of the object or not.  If removing the entire
1387          * contents, be sure to get all pages, even those that might be
1388          * beyond the end of the object.
1389          */
1390         info.object = object;
1391         info.start_pindex = start;
1392         if (end == 0)
1393                 info.end_pindex = (vm_pindex_t)-1;
1394         else
1395                 info.end_pindex = end - 1;
1396         info.limit = clean_only;
1397         info.count = 0;
1398         all = (start == 0 && info.end_pindex >= object->size - 1);
1399
1400         /*
1401          * Loop until we are sure we have gotten them all.
1402          */
1403         do {
1404                 info.error = 0;
1405                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1406                                         vm_object_page_remove_callback, &info);
1407         } while (info.error);
1408
1409         /*
1410          * Remove any related swap if throwing away pages, or for
1411          * non-swap objects (the swap is a clean copy in that case).
1412          */
1413         if (object->type != OBJT_SWAP || clean_only == FALSE) {
1414                 if (all)
1415                         swap_pager_freespace_all(object);
1416                 else
1417                         swap_pager_freespace(object, info.start_pindex,
1418                              info.end_pindex - info.start_pindex + 1);
1419         }
1420
1421         /*
1422          * Cleanup
1423          */
1424         vm_object_pip_wakeup(object);
1425         vm_object_drop(object);
1426 }
1427
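/*
 * Usage sketch (hypothetical truncation path): removing every page from
 * a new end-of-object index through the end of the object.  end == 0
 * means 'to the end', and clean_only == FALSE discards dirty pages too.
 */
#if 0
static void
example_truncate_object(vm_object_t obj, vm_pindex_t newsize)
{
        vm_object_page_remove(obj, newsize, 0, FALSE);
}
#endif
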
1428 /*
1429  * The caller must hold the object.
1430  *
1431  * NOTE: User yields are allowed when removing more than one page, but not
1432  *       allowed if only removing one page (the path for single page removals
1433  *       might hold a spinlock).
1434  */
1435 static int
1436 vm_object_page_remove_callback(vm_page_t p, void *data)
1437 {
1438         struct rb_vm_page_scan_info *info = data;
1439
1440         if (info->object != p->object ||
1441             p->pindex < info->start_pindex ||
1442             p->pindex > info->end_pindex) {
1443                 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1444                         info->object, p);
1445                 return(0);
1446         }
1447         if (vm_page_busy_try(p, TRUE)) {
1448                 vm_page_sleep_busy(p, TRUE, "vmopar");
1449                 info->error = 1;
1450                 return(0);
1451         }
1452         if (info->object != p->object) {
1453                 /* this should never happen */
1454                 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1455                         info->object, p);
1456                 vm_page_wakeup(p);
1457                 return(0);
1458         }
1459
1460         /*
1461          * Wired pages cannot be destroyed, but they can be invalidated
1462          * and we do so if clean_only (limit) is not set.
1463          *
1464          * WARNING!  The page may be wired due to being part of a buffer
1465          *           cache buffer, and the buffer might be marked B_CACHE.
1466          *           This is fine as part of a truncation but VFSs must be
1467          *           sure to fix the buffer up when re-extending the file.
1468          *
1469          * NOTE!     PG_NEED_COMMIT is ignored.
1470          */
1471         if (p->wire_count != 0) {
1472                 vm_page_protect(p, VM_PROT_NONE);
1473                 if (info->limit == 0)
1474                         p->valid = 0;
1475                 vm_page_wakeup(p);
1476                 goto done;
1477         }
1478
1479         /*
1480          * limit is our clean_only flag.  If set and the page is dirty or
1481          * requires a commit, do not free it.  If set and the page is being
1482          * held by someone, do not free it.
1483          */
1484         if (info->limit && p->valid) {
1485                 vm_page_test_dirty(p);
1486                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1487                         vm_page_wakeup(p);
1488                         goto done;
1489                 }
1490         }
1491
1492         /*
1493          * Destroy the page.  But we have to re-test whether its dirty after
1494          * removing it from its pmaps.
1495          */
1496         vm_page_protect(p, VM_PROT_NONE);
1497         if (info->limit && p->valid) {
1498                 vm_page_test_dirty(p);
1499                 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1500                         vm_page_wakeup(p);
1501                         goto done;
1502                 }
1503         }
1504         vm_page_free(p);
1505
1506         /*
1507          * Must be at end to avoid SMP races, caller holds object token
1508          */
1509 done:
1510         if ((++info->count & 63) == 0)
1511                 lwkt_user_yield();
1512
1513         return(0);
1514 }
1515
1516 /*
1517  * Try to extend prev_object into an adjoining region of virtual
1518  * memory, return TRUE on success.
1519  *
1520  * The caller does not need to hold (prev_object) but must have a stable
1521  * pointer to it (typically by holding the vm_map locked).
1522  *
1523  * This function only works for anonymous memory objects which either
1524  * have (a) one reference or (b) we are extending the object's size.
1525  * Otherwise the related VM pages we want to use for the object might
1526  * be in use by another mapping.
1527  */
1528 boolean_t
1529 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1530                    vm_size_t prev_size, vm_size_t next_size)
1531 {
1532         vm_pindex_t next_pindex;
1533
1534         if (prev_object == NULL)
1535                 return (TRUE);
1536
1537         vm_object_hold(prev_object);
1538
1539         if (prev_object->type != OBJT_DEFAULT &&
1540             prev_object->type != OBJT_SWAP) {
1541                 vm_object_drop(prev_object);
1542                 return (FALSE);
1543         }
1544
1545 #if 0
1546         /* caller now checks this */
1547         /*
1548          * Try to collapse the object first
1549          */
1550         vm_object_collapse(prev_object, NULL);
1551 #endif
1552
1553 #if 0
1554         /* caller now checks this */
1555         /*
1556          * We can't coalesce if we shadow another object (figuring out the
1557          * relationships becomes too complex).
1558          */
1559         if (prev_object->backing_object != NULL) {
1560                 vm_object_chain_release(prev_object);
1561                 vm_object_drop(prev_object);
1562                 return (FALSE);
1563         }
1564 #endif
1565
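        /*
         * Convert the byte sizes to page counts and compute the pindex at
         * which the new region would begin.
         */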
1566         prev_size >>= PAGE_SHIFT;
1567         next_size >>= PAGE_SHIFT;
1568         next_pindex = prev_pindex + prev_size;
1569
1570         /*
1571          * We can't coalesce if the object has more than one reference
1572          * unless we are extending it into newly minted space.
1573          */
1574         if (prev_object->ref_count > 1 &&
1575             prev_object->size != next_pindex) {
1576                 vm_object_drop(prev_object);
1577                 return (FALSE);
1578         }
1579
1580         /*
1581          * Remove any pages that may still be in the object from a previous
1582          * deallocation.
1583          */
1584         if (next_pindex < prev_object->size) {
1585                 vm_object_page_remove(prev_object,
1586                                       next_pindex,
1587                                       next_pindex + next_size, FALSE);
1588                 if (prev_object->type == OBJT_SWAP)
1589                         swap_pager_freespace(prev_object,
1590                                              next_pindex, next_size);
1591         }
1592
1593         /*
1594          * Extend the object if necessary.
1595          */
1596         if (next_pindex + next_size > prev_object->size)
1597                 prev_object->size = next_pindex + next_size;
1598         vm_object_drop(prev_object);
1599
1600         return (TRUE);
1601 }
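
/*
 * Illustrative sketch only (kept under #if 0, not compiled): how a map
 * growth path might try to reuse an existing anonymous object for a
 * region that immediately follows a previous mapping.  prev_object,
 * prev_offset, prev_bytes and new_bytes are hypothetical placeholders;
 * the real callers live in the vm_map code.
 */
#if 0
        if (vm_object_coalesce(prev_object, OFF_TO_IDX(prev_offset),
                               prev_bytes, new_bytes)) {
                /*
                 * Success: the new region shares prev_object and its pages
                 * start at OFF_TO_IDX(prev_offset) + atop(prev_bytes).
                 */
        } else {
                /* Fall back to allocating a fresh anonymous object. */
        }
#endif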
1602
1603 /*
1604  * Make the object writable and flag it as possibly being dirty.
1605  *
1606  * The object might not be held (or might be held but held shared), and
1607  * the related vnode is probably not held either.  Object and vnode are
1608  * kept stable by virtue of the vm_page busied by the caller, which
1609  * prevents their destruction.
1610  *
1611  * If the related mount is flagged MNTK_THR_SYNC we need to call
1612  * vsetobjdirty().  Filesystems using this option usually shortcut
1613  * synchronization by only scanning the syncer list.
1614  */
1615 void
1616 vm_object_set_writeable_dirty(vm_object_t object)
1617 {
1618         struct vnode *vp;
1619
1620         /*vm_object_assert_held(object);*/
1621         /*
1622          * Avoid contention in vm fault path by checking the state before
1623          * issuing an atomic op on it.
1624          */
1625         if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1626             (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1627                 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1628         }
1629         if (object->type == OBJT_VNODE &&
1630             (vp = (struct vnode *)object->handle) != NULL) {
1631                 if ((vp->v_flag & VOBJDIRTY) == 0) {
1632                         if (vp->v_mount &&
1633                             (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1634                                 /*
1635                                  * New style THR_SYNC places vnodes on the
1636                                  * syncer list more deterministically.
1637                                  */
1638                                 vsetobjdirty(vp);
1639                         } else {
1640                                 /*
1641                                  * Old style scan would not necessarily place
1642                                  * a vnode on the syncer list when possibly
1643                                  * modified via mmap.
1644                                  */
1645                                 vsetflags(vp, VOBJDIRTY);
1646                         }
1647                 }
1648         }
1649 }
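
/*
 * Illustrative sketch only (kept under #if 0, not compiled): a fault or
 * pmap path that is about to let userland write to a busied page would
 * typically mark the owning object as shown.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_dirty_busied_page(vm_page_t m)
{
        /* The page is busied by the caller, keeping m->object stable. */
        if (m->object)
                vm_object_set_writeable_dirty(m->object);
        vm_page_dirty(m);
}
#endif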
1650
1651 #include "opt_ddb.h"
1652 #ifdef DDB
1653 #include <sys/cons.h>
1654
1655 #include <ddb/ddb.h>
1656
1657 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
1658                                        vm_map_entry_t entry);
1659 static int      vm_object_in_map (vm_object_t object);
1660
1661 /*
1662  * The caller must hold the object.
1663  */
1664 static int
1665 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1666 {
1667         vm_map_backing_t ba;
1668         vm_map_t tmpm;
1669         vm_map_entry_t tmpe;
1670         int entcount;
1671
1672         if (map == NULL)
1673                 return 0;
1674         if (entry == NULL) {
1675                 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1676                 entcount = map->nentries;
1677                 while (entcount-- && tmpe) {
1678                         if (_vm_object_in_map(map, object, tmpe)) {
1679                                 return 1;
1680                         }
1681                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1682                 }
1683                 return (0);
1684         }
1685         switch (entry->maptype) {
1686         case VM_MAPTYPE_SUBMAP:
1687                 tmpm = entry->ba.sub_map;
1688                 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1689                 entcount = tmpm->nentries;
1690                 while (entcount-- && tmpe) {
1691                         if (_vm_object_in_map(tmpm, object, tmpe)) {
1692                                 return 1;
1693                         }
1694                         tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1695                 }
1696                 break;
1697         case VM_MAPTYPE_NORMAL:
1698         case VM_MAPTYPE_VPAGETABLE:
1699                 ba = &entry->ba;
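                /*
                 * Walk the backing store chain; a match anywhere in the
                 * chain means the object is mapped via this entry.
                 */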
1700                 while (ba) {
1701                         if (ba->object == object)
1702                                 return TRUE;
1703                         ba = ba->backing_ba;
1704                 }
1705                 break;
1706         default:
1707                 break;
1708         }
1709         return 0;
1710 }
1711
1712 static int vm_object_in_map_callback(struct proc *p, void *data);
1713
1714 struct vm_object_in_map_info {
1715         vm_object_t object;
1716         int rv;
1717 };
1718
1719 /*
1720  * Debugging only
1721  */
1722 static int
1723 vm_object_in_map(vm_object_t object)
1724 {
1725         struct vm_object_in_map_info info;
1726
1727         info.rv = 0;
1728         info.object = object;
1729
1730         allproc_scan(vm_object_in_map_callback, &info, 0);
1731         if (info.rv)
1732                 return 1;
1733         if (_vm_object_in_map(&kernel_map, object, 0))
1734                 return 1;
1735         if (_vm_object_in_map(&pager_map, object, 0))
1736                 return 1;
1737         if (_vm_object_in_map(&buffer_map, object, 0))
1738                 return 1;
1739         return 0;
1740 }
1741
1742 /*
1743  * Debugging only
1744  */
1745 static int
1746 vm_object_in_map_callback(struct proc *p, void *data)
1747 {
1748         struct vm_object_in_map_info *info = data;
1749
1750         if (p->p_vmspace) {
1751                 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1752                         info->rv = 1;
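                        /*
                         * The negative return is intended to terminate the
                         * allproc_scan early once a match has been found.
                         */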
1753                         return -1;
1754                 }
1755         }
1756         return (0);
1757 }
1758
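/*
 * Debugging only.  Normally invoked from the kernel debugger as
 * "show vmochk".
 */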
1759 DB_SHOW_COMMAND(vmochk, vm_object_check)
1760 {
1761         struct vm_object_hash *hash;
1762         vm_object_t object;
1763         int n;
1764
1765         /*
1766          * Make sure that internal objects are in a map somewhere
1767          * and that none have zero ref counts.
1768          */
1769         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1770                 hash = &vm_object_hash[n];
1771                 for (object = TAILQ_FIRST(&hash->list);
1772                                 object != NULL;
1773                                 object = TAILQ_NEXT(object, object_entry)) {
1774                         if (object->type == OBJT_MARKER)
1775                                 continue;
1776                         if (object->handle != NULL ||
1777                             (object->type != OBJT_DEFAULT &&
1778                              object->type != OBJT_SWAP)) {
1779                                 continue;
1780                         }
1781                         if (object->ref_count == 0) {
1782                                 db_printf("vmochk: internal obj has "
1783                                           "zero ref count, size: %ld\n",
1784                                           (long)object->size);
1785                         }
1786                         if (vm_object_in_map(object))
1787                                 continue;
1788                         db_printf("vmochk: internal obj is not in a map: "
1789                                   "ref: %d, size: %lu: 0x%lx\n",
1790                                   object->ref_count, (u_long)object->size,
1791                                   (u_long)object->size);
1792                 }
1793         }
1794 }
1795
1796 /*
1797  * Debugging only
1798  */
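/* Normally invoked from the kernel debugger as "show object <address>". */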
1799 DB_SHOW_COMMAND(object, vm_object_print_static)
1800 {
1801         /* XXX convert args. */
1802         vm_object_t object = (vm_object_t)addr;
1803         boolean_t full = have_addr;
1804
1805         vm_page_t p;
1806
1807         /* XXX count is an (unused) arg.  Avoid shadowing it. */
1808 #define count   was_count
1809
1810         int count;
1811
1812         if (object == NULL)
1813                 return;
1814
1815         db_iprintf(
1816             "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1817             object, (int)object->type, (u_long)object->size,
1818             object->resident_page_count, object->ref_count, object->flags);
1819         /*
1820          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1821          */
1822         db_iprintf("\n");
1823
1824         if (!full)
1825                 return;
1826
1827         db_indent += 2;
1828         count = 0;
1829         RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1830                 if (count == 0)
1831                         db_iprintf("memory:=");
1832                 else if (count == 6) {
1833                         db_printf("\n");
1834                         db_iprintf(" ...");
1835                         count = 0;
1836                 } else
1837                         db_printf(",");
1838                 count++;
1839
1840                 db_printf("(off=0x%lx,page=0x%lx)",
1841                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1842         }
1843         if (count != 0)
1844                 db_printf("\n");
1845         db_indent -= 2;
1846 }
1847
1848 /* XXX. */
1849 #undef count
1850
1851 /*
1852  * XXX need this non-static entry for calling from vm_map_print.
1853  *
1854  * Debugging only
1855  */
1856 void
1857 vm_object_print(/* db_expr_t */ long addr,
1858                 boolean_t have_addr,
1859                 /* db_expr_t */ long count,
1860                 char *modif)
1861 {
1862         vm_object_print_static(addr, have_addr, count, modif);
1863 }
1864
1865 /*
1866  * Debugging only
1867  */
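/* Normally invoked from the kernel debugger as "show vmopag". */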
1868 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1869 {
1870         struct vm_object_hash *hash;
1871         vm_object_t object;
1872         int nl = 0;
1873         int c;
1874         int n;
1875
1876         for (n = 0; n < VMOBJ_HSIZE; ++n) {
1877                 hash = &vm_object_hash[n];
1878                 for (object = TAILQ_FIRST(&hash->list);
1879                                 object != NULL;
1880                                 object = TAILQ_NEXT(object, object_entry)) {
1881                         vm_pindex_t idx, fidx;
1882                         vm_pindex_t osize;
1883                         vm_paddr_t pa = -1, padiff;
1884                         int rcount;
1885                         vm_page_t m;
1886
1887                         if (object->type == OBJT_MARKER)
1888                                 continue;
1889                         db_printf("new object: %p\n", (void *)object);
1890                         if (nl > 18) {
1891                                 c = cngetc();
1892                                 if (c != ' ')
1893                                         return;
1894                                 nl = 0;
1895                         }
1896                         nl++;
1897                         rcount = 0;
1898                         fidx = 0;
1899                         osize = object->size;
1900                         if (osize > 128)
1901                                 osize = 128;
1902                         for (idx = 0; idx < osize; idx++) {
1903                                 m = vm_page_lookup(object, idx);
1904                                 if (m == NULL) {
1905                                         if (rcount) {
1906                                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1907                                                         (long)fidx, rcount, (long)pa);
1908                                                 if (nl > 18) {
1909                                                         c = cngetc();
1910                                                         if (c != ' ')
1911                                                                 return;
1912                                                         nl = 0;
1913                                                 }
1914                                                 nl++;
1915                                                 rcount = 0;
1916                                         }
1917                                         continue;
1918                                 }
1919
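                                /*
                                 * Extend the current run when this page is
                                 * physically contiguous with it.
                                 */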
1920                                 if (rcount &&
1921                                         (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1922                                         ++rcount;
1923                                         continue;
1924                                 }
1925                                 if (rcount) {
1926                                         padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1927                                         padiff >>= PAGE_SHIFT;
1928                                         padiff &= PQ_L2_MASK;
1929                                         if (padiff == 0) {
1930                                                 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1931                                                 ++rcount;
1932                                                 continue;
1933                                         }
1934                                         db_printf(" index(%ld)run(%d)pa(0x%lx)",
1935                                                 (long)fidx, rcount, (long)pa);
1936                                         db_printf("pd(%ld)\n", (long)padiff);
1937                                         if (nl > 18) {
1938                                                 c = cngetc();
1939                                                 if (c != ' ')
1940                                                         return;
1941                                                 nl = 0;
1942                                         }
1943                                         nl++;
1944                                 }
1945                                 fidx = idx;
1946                                 pa = VM_PAGE_TO_PHYS(m);
1947                                 rcount = 1;
1948                         }
1949                         if (rcount) {
1950                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1951                                         (long)fidx, rcount, (long)pa);
1952                                 if (nl > 18) {
1953                                         c = cngetc();
1954                                         if (c != ' ')
1955                                                 return;
1956                                         nl = 0;
1957                                 }
1958                                 nl++;
1959                         }
1960                 }
1961         }
1962 }
1963 #endif /* DDB */