2 * Copyright (c) 1991, 1993, 2013
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
50 * Carnegie Mellon requests users of this software to return to
52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
53 * School of Computer Science
54 * Carnegie Mellon University
55 * Pittsburgh PA 15213-3890
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
64 * Virtual memory object module.
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h> /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
80 #include <vm/vm_param.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
92 #include <vm/vm_page2.h>
94 #include <machine/specialreg.h>
96 #define EASY_SCAN_FACTOR 8
98 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags);
100 static void vm_object_lock_init(vm_object_t);
103 * Virtual memory objects maintain the actual data
104 * associated with allocated virtual memory. A given
105 * page of memory exists within exactly one object.
107 * An object is only deallocated when all "references"
108 * are given up. Only one "reference" to a given
109 * region of an object should be writeable.
111 * Associated with each object is a list of all resident
112 * memory pages belonging to that object; this list is
113 * maintained by the "vm_page" module, and locked by the object's
116 * Each object also records a "pager" routine which is
117 * used to retrieve (and store) pages to the proper backing
118 * storage. In addition, objects may be backed by other
119 * objects from which they were virtual-copied.
121 * The only items within the object structure which are
122 * modified after time of creation are:
123 * reference count locked by object's lock
124 * pager routine locked by object's lock
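/*
 * Illustrative lifecycle sketch (an assumption for exposition, not part of
 * the original sources; npages is a placeholder size): the helpers in this
 * file are typically combined as follows, with the object destroyed only on
 * the final 1->0 reference transition.
 *
 *	vm_object_t obj;
 *
 *	obj = vm_object_allocate(OBJT_DEFAULT, npages);    (ref_count == 1)
 *	vm_object_hold(obj);
 *	vm_object_reference_locked(obj);                   (ref_count == 2)
 *	vm_object_drop(obj);
 *	...
 *	vm_object_deallocate(obj);                         (back to 1)
 *	vm_object_deallocate(obj);                         (1->0, terminates)
 */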
128 struct vm_object kernel_object;
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
134 #define VMOBJ_HASH_PRIME1 66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2 989042931893ULL
static int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
147 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148 hash1 %= VMOBJ_HASH_PRIME1;
149 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150 hash2 %= VMOBJ_HASH_PRIME2;
151 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
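/*
 * Usage sketch (illustrative): the bucket returned by vmobj_hash() supplies
 * the token that serializes insertion into and removal from the global
 * object hash, exactly as done in _vm_object_allocate() and
 * vm_object_terminate() below:
 *
 *	struct vm_object_hash *hash;
 *
 *	hash = vmobj_hash(object);
 *	lwkt_gettoken(&hash->token);
 *	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
 *	lwkt_reltoken(&hash->token);
 */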
154 #if defined(DEBUG_LOCKS)
156 #define vm_object_vndeallocate(obj, vpp) \
157 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
160 * Debug helper to track hold/drop/ref/deallocate calls.
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
167 i = atomic_fetchadd_int(&obj->debug_index, 1);
168 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169 ksnprintf(obj->debug_hold_thrs[i],
170 sizeof(obj->debug_hold_thrs[i]),
172 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
176 obj->debug_hold_file[i] = file;
177 obj->debug_hold_line[i] = line;
179 /* Uncomment for debugging obj refs/derefs in reproducible cases */
180 if (strcmp(curthread->td_comm, "sshd") == 0) {
181 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
183 obj, obj->ref_count, addrem, file, line);
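/*
 * Debugging sketch (illustrative, assumes DEBUG_LOCKS): the per-object ring
 * recorded above can be dumped to inspect the most recent hold/drop/ref
 * events for an object, e.g.:
 *
 *	int i;
 *
 *	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; ++i) {
 *		if (obj->debug_hold_file[i] == NULL)
 *			continue;
 *		kprintf("%s %s:%d\n", obj->debug_hold_thrs[i],
 *			obj->debug_hold_file[i], obj->debug_hold_line[i]);
 *	}
 */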
191 * Misc low level routines
194 vm_object_lock_init(vm_object_t obj)
196 #if defined(DEBUG_LOCKS)
199 obj->debug_index = 0;
200 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201 obj->debug_hold_thrs[i][0] = 0;
202 obj->debug_hold_file[i] = NULL;
203 obj->debug_hold_line[i] = 0;
209 vm_object_lock_swap(void)
215 vm_object_lock(vm_object_t obj)
217 lwkt_gettoken(&obj->token);
221 * Returns TRUE on success.
224 vm_object_lock_try(vm_object_t obj)
226 return(lwkt_trytoken(&obj->token));
230 vm_object_lock_shared(vm_object_t obj)
232 lwkt_gettoken_shared(&obj->token);
236 vm_object_unlock(vm_object_t obj)
238 lwkt_reltoken(&obj->token);
242 vm_object_upgrade(vm_object_t obj)
244 lwkt_reltoken(&obj->token);
245 lwkt_gettoken(&obj->token);
249 vm_object_downgrade(vm_object_t obj)
251 lwkt_reltoken(&obj->token);
252 lwkt_gettoken_shared(&obj->token);
256 vm_object_assert_held(vm_object_t obj)
258 ASSERT_LWKT_TOKEN_HELD(&obj->token);
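/*
 * Note and sketch (illustrative, not original commentary): vm_object_upgrade()
 * and vm_object_downgrade() release the token and re-acquire it in the new
 * mode, so the object can be modified by another thread in the window and
 * callers re-validate state afterwards:
 *
 *	vm_object_lock_shared(obj);
 *	...
 *	vm_object_upgrade(obj);
 *	(re-check ref_count/flags here, they may have changed)
 *	...
 *	vm_object_unlock(obj);
 */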
264 globaldata_t gd = mycpu;
267 pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268 pg_color += gd->gd_quick_color;
269 gd->gd_quick_color += PQ_PRIME2;
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
277 KKASSERT(obj != NULL);
280 * Object must be held (object allocation is stable due to the caller's
281 * context, typically already holding the token on a parent object)
282 * prior to potentially blocking on the lock, otherwise the object
283 * can get ripped away from us.
285 refcount_acquire(&obj->hold_count);
288 #if defined(DEBUG_LOCKS)
289 debugvm_object_add(obj, file, line, 1);
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
296 KKASSERT(obj != NULL);
299 * Object must be held (object allocation is stable due to the caller's
300 * context, typically already holding the token on a parent object)
301 * prior to potentially blocking on the lock, otherwise the object
302 * can get ripped away from us.
304 refcount_acquire(&obj->hold_count);
305 if (vm_object_lock_try(obj) == 0) {
306 if (refcount_release(&obj->hold_count)) {
307 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308 kfree(obj, M_VM_OBJECT);
313 #if defined(DEBUG_LOCKS)
314 debugvm_object_add(obj, file, line, 1);
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
322 KKASSERT(obj != NULL);
325 * Object must be held (object allocation is stable due to the caller's
326 * context, typically already holding the token on a parent object)
327 * prior to potentially blocking on the lock, otherwise the object
328 * can get ripped away from us.
330 refcount_acquire(&obj->hold_count);
331 vm_object_lock_shared(obj);
333 #if defined(DEBUG_LOCKS)
334 debugvm_object_add(obj, file, line, 1);
339 * Drop the token and hold_count on the object.
341 * WARNING! Token might be shared.
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
350 * No new holders should be possible once we drop hold_count 1->0 as
351 * there is no longer any way to reference the object.
353 KKASSERT(obj->hold_count > 0);
354 if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356 debugvm_object_add(obj, file, line, -1);
359 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360 vm_object_unlock(obj);
361 kfree(obj, M_VM_OBJECT);
363 vm_object_unlock(obj);
366 #if defined(DEBUG_LOCKS)
367 debugvm_object_add(obj, file, line, -1);
369 vm_object_unlock(obj);
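/*
 * Typical hold/drop pattern (sketch): a hold acquires the object token and
 * also keeps the structure from being kfree()d while the caller may be
 * blocked, so the pointer stays valid even if the last reference disappears:
 *
 *	vm_object_hold(object);
 *	... operate on the object (pages, flags, ref_count) ...
 *	vm_object_drop(object);
 */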
374 * Initialize a freshly allocated object, returning a held object.
376 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
383 struct vm_object_hash *hash;
385 RB_INIT(&object->rb_memq);
386 lwkt_token_init(&object->token, "vmobj");
388 TAILQ_INIT(&object->backing_list);
389 lockinit(&object->backing_lk, "baclk", 0, 0);
393 object->ref_count = 1;
394 object->memattr = VM_MEMATTR_DEFAULT;
395 object->hold_count = 0;
397 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
398 vm_object_set_flag(object, OBJ_ONEMAPPING);
399 object->paging_in_progress = 0;
400 object->resident_page_count = 0;
401 /* cpu localization twist */
402 object->pg_color = vm_quickcolor();
403 object->handle = NULL;
405 atomic_add_int(&object->generation, 1);
406 object->swblock_count = 0;
407 RB_INIT(&object->swblock_root);
408 vm_object_lock_init(object);
409 pmap_object_init(object);
411 vm_object_hold(object);
413 hash = vmobj_hash(object);
414 lwkt_gettoken(&hash->token);
415 TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
416 lwkt_reltoken(&hash->token);
420 * Initialize a VM object.
423 vm_object_init(vm_object_t object, vm_pindex_t size)
425 _vm_object_allocate(OBJT_DEFAULT, size, object);
426 vm_object_drop(object);
430 * Initialize the VM objects module.
432 * Called from the low level boot code only. Note that this occurs before
433 * kmalloc is initialized so we cannot allocate any VM objects.
436 vm_object_init1(void)
440 for (i = 0; i < VMOBJ_HSIZE; ++i) {
441 TAILQ_INIT(&vm_object_hash[i].list);
442 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
445 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), &kernel_object);
447 vm_object_drop(&kernel_object);
451 vm_object_init2(void)
453 kmalloc_set_unlimited(M_VM_OBJECT);
457 * Allocate and return a new object of the specified type and size.
462 vm_object_allocate(objtype_t type, vm_pindex_t size)
466 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
467 _vm_object_allocate(type, size, obj);
474 * This version returns a held object, allowing further atomic initialization
478 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
482 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
483 _vm_object_allocate(type, size, obj);
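/*
 * Sketch (illustrative): vm_object_allocate_hold() lets the caller finish
 * initializing the object before anyone else can find it; the extra setup
 * shown here is hypothetical:
 *
 *	obj = vm_object_allocate_hold(OBJT_SWAP, size);
 *	obj->handle = handle;           (hypothetical extra initialization)
 *	vm_object_drop(obj);
 */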
489 * Add an additional reference to a vm_object. The object must already be
490 * held. The original non-lock version is no longer supported. The object
491 * must NOT be chain locked by anyone at the time the reference is added.
493 * The object must be held, but may be held shared if desired (hence why
494 * we use an atomic op).
497 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
499 KKASSERT(object != NULL);
500 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
501 atomic_add_int(&object->ref_count, 1);
502 if (object->type == OBJT_VNODE) {
503 vref(object->handle);
504 /* XXX what if the vnode is being destroyed? */
506 #if defined(DEBUG_LOCKS)
507 debugvm_object_add(object, file, line, 1);
512 * This version is only allowed in situations where the caller
513 * already knows that the object is deterministically referenced
514 * (usually because it's taken from a ref'd vnode, or during a map_entry replication).
518 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
520 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
521 atomic_add_int(&object->ref_count, 1);
522 if (object->type == OBJT_VNODE)
523 vref(object->handle);
524 #if defined(DEBUG_LOCKS)
525 debugvm_object_add(object, file, line, 1);
530 * Dereference an object and its underlying vnode. The object may be
531 * held shared. On return the object will remain held.
533 * This function may return a vnode in *vpp which the caller must release
534 * after the caller drops its own lock. If vpp is NULL, we assume that
535 * the caller was holding an exclusive lock on the object and we vrele()
539 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
542 struct vnode *vp = (struct vnode *) object->handle;
544 KASSERT(object->type == OBJT_VNODE,
545 ("vm_object_vndeallocate: not a vnode object"));
546 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
547 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
549 if (object->ref_count == 0) {
550 vprint("vm_object_vndeallocate", vp);
551 panic("vm_object_vndeallocate: bad object reference count");
555 int count = object->ref_count;
558 vm_object_upgrade(object);
559 if (atomic_cmpset_int(&object->ref_count, count, 0)) {
560 vclrflags(vp, VTEXT);
564 if (atomic_cmpset_int(&object->ref_count,
571 #if defined(DEBUG_LOCKS)
572 debugvm_object_add(object, file, line, -1);
576 * vrele or return the vp to vrele. We can only safely vrele(vp)
577 * if the object was locked exclusively. But there are two races
580 * We had to upgrade the object above to safely clear VTEXT
581 * but the alternative path where the shared lock is retained
582 * can STILL race to 0 in other paths and cause our own vrele()
583 * to terminate the vnode. We can't allow that if the VM object
584 * is still locked shared.
593 * Release a reference to the specified object, gained either through a
594 * vm_object_allocate or a vm_object_reference call. When all references
595 * are gone, storage associated with this object may be relinquished.
597 * The caller does not have to hold the object locked but must have control
598 * over the reference in question in order to guarantee that the object
599 * does not get ripped out from under us.
601 * XXX Currently all deallocations require an exclusive lock.
604 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
613 count = object->ref_count;
617 * If decrementing the count enters into special handling
618 * territory (0, 1, or 2) we have to do it the hard way.
619 * Fortunately though, objects with only a few refs like this
620 * are not likely to be heavily contended anyway.
622 * For vnode objects we only care about 1->0 transitions.
624 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
625 #if defined(DEBUG_LOCKS)
626 debugvm_object_add(object, file, line, 0);
628 vm_object_hold(object);
629 vm_object_deallocate_locked(object);
630 vm_object_drop(object);
635 * Try to decrement ref_count without acquiring a hold on
636 * the object. This is particularly important for the exec*()
637 * and exit*() code paths because the program binary may
638 * have a great deal of sharing and an exclusive lock will
639 * crowbar performance in those circumstances.
641 if (object->type == OBJT_VNODE) {
642 vp = (struct vnode *)object->handle;
643 if (atomic_cmpset_int(&object->ref_count,
645 #if defined(DEBUG_LOCKS)
646 debugvm_object_add(object, file, line, -1);
654 if (atomic_cmpset_int(&object->ref_count,
656 #if defined(DEBUG_LOCKS)
657 debugvm_object_add(object, file, line, -1);
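/*
 * Shape of the lockless fast path above (illustrative sketch): sample the
 * count and retry a compare-and-set decrement, falling back to the
 * held/locked path only when the count is in the special-handling range
 * tested above:
 *
 *	for (;;) {
 *		count = object->ref_count;
 *		if (count is in the special-handling range)
 *			break;          (take the vm_object_hold() path)
 *		if (atomic_cmpset_int(&object->ref_count, count, count - 1))
 *			return;         (decremented without the token)
 *	}
 */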
668 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
677 * vnode case, caller either locked the object exclusively
678 * or this is a recursion with must_drop != 0 and the vnode
679 * object will be locked shared.
681 * If locked shared we have to drop the object before we can
682 * call vrele() or risk a shared/exclusive livelock.
684 if (object->type == OBJT_VNODE) {
685 ASSERT_LWKT_TOKEN_HELD(&object->token);
686 vm_object_vndeallocate(object, NULL);
689 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
692 * Normal case (object is locked exclusively)
694 if (object->ref_count == 0) {
695 panic("vm_object_deallocate: object deallocated "
696 "too many times: %d", object->type);
698 if (object->ref_count > 2) {
699 atomic_add_int(&object->ref_count, -1);
700 #if defined(DEBUG_LOCKS)
701 debugvm_object_add(object, file, line, -1);
707 * Drop the ref and handle termination on the 1->0 transition.
708 * We may have blocked above so we have to recheck.
710 KKASSERT(object->ref_count != 0);
711 if (object->ref_count >= 2) {
712 atomic_add_int(&object->ref_count, -1);
713 #if defined(DEBUG_LOCKS)
714 debugvm_object_add(object, file, line, -1);
719 atomic_add_int(&object->ref_count, -1);
720 if ((object->flags & OBJ_DEAD) == 0)
721 vm_object_terminate(object);
725 * Destroy the specified object, freeing up related resources.
727 * The object must have zero references.
729 * The object must be held. The caller is responsible for dropping the object
730 * after terminate returns. Terminate does NOT drop the object.
732 static int vm_object_terminate_callback(vm_page_t p, void *data);
735 vm_object_terminate(vm_object_t object)
737 struct rb_vm_page_scan_info info;
738 struct vm_object_hash *hash;
741 * Make sure no one uses us. Once we set OBJ_DEAD we should be
742 * able to safely block.
744 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
745 KKASSERT((object->flags & OBJ_DEAD) == 0);
746 vm_object_set_flag(object, OBJ_DEAD);
749 * Wait for the pageout daemon to be done with the object
751 vm_object_pip_wait(object, "objtrm1");
753 KASSERT(!object->paging_in_progress,
754 ("vm_object_terminate: pageout in progress"));
757 * Clean and free the pages, as appropriate. All references to the
758 * object are gone, so we don't need to lock it.
760 if (object->type == OBJT_VNODE) {
764 * Clean pages and flush buffers.
766 * NOTE! TMPFS buffer flushes do not typically flush the
767 * actual page to swap as this would be highly
768 * inefficient, and normal filesystems usually wrap
769 * page flushes with buffer cache buffers.
771 * To deal with this we have to call vinvalbuf() both
772 * before and after the vm_object_page_clean().
774 vp = (struct vnode *) object->handle;
775 vinvalbuf(vp, V_SAVE, 0, 0);
776 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
777 vinvalbuf(vp, V_SAVE, 0, 0);
781 * Wait for any I/O to complete, after which there had better not
782 * be any references left on the object.
784 vm_object_pip_wait(object, "objtrm2");
786 if (object->ref_count != 0) {
787 panic("vm_object_terminate: object with references, "
788 "ref_count=%d", object->ref_count);
792 * Cleanup any shared pmaps associated with this object.
794 pmap_object_free(object);
797 * Now free any remaining pages. For internal objects, this also
798 * removes them from paging queues. Don't free wired pages, just
799 * remove them from the object.
802 info.object = object;
805 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
806 vm_object_terminate_callback, &info);
807 } while (info.error);
810 * Let the pager know object is dead.
812 vm_pager_deallocate(object);
815 * Wait for the object hold count to hit 1, clean out pages as
816 * we go. vmobj_token interlocks any race conditions that might
817 * pick the object up from the vm_object_list after we have cleared
821 if (RB_ROOT(&object->rb_memq) == NULL)
823 kprintf("vm_object_terminate: Warning, object %p "
824 "still has %ld pages\n",
825 object, object->resident_page_count);
826 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
827 vm_object_terminate_callback, &info);
831 * There had better not be any pages left
833 KKASSERT(object->resident_page_count == 0);
836 * Remove the object from the global object list.
838 hash = vmobj_hash(object);
839 lwkt_gettoken(&hash->token);
840 TAILQ_REMOVE(&hash->list, object, object_entry);
841 lwkt_reltoken(&hash->token);
843 if (object->ref_count != 0) {
844 panic("vm_object_terminate2: object with references, "
845 "ref_count=%d", object->ref_count);
849 * NOTE: The object hold_count is at least 1, so we cannot kfree()
850 * the object here. See vm_object_drop().
855 * The caller must hold the object.
858 vm_object_terminate_callback(vm_page_t p, void *data)
860 struct rb_vm_page_scan_info *info = data;
864 KKASSERT(object == info->object);
865 if (vm_page_busy_try(p, TRUE)) {
866 vm_page_sleep_busy(p, TRUE, "vmotrm");
870 if (object != p->object) {
871 /* XXX remove once we determine it can't happen */
872 kprintf("vm_object_terminate: Warning: Encountered "
873 "busied page %p on queue %d\n", p, p->queue);
876 } else if (p->wire_count == 0) {
878 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
881 mycpu->gd_cnt.v_pfree++;
883 if (p->queue != PQ_NONE) {
884 kprintf("vm_object_terminate: Warning: Encountered "
885 "wired page %p on queue %d\n", p, p->queue);
886 if (vm_object_debug > 0) {
896 * Must be at end to avoid SMP races, caller holds object token
898 if ((++info->count & 63) == 0)
904 * Clean all dirty pages in the specified range of object. Leaves page
905 * on whatever queue it is currently on. If NOSYNC is set then do not
906 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
907 * leaving the object dirty.
909 * When stuffing pages asynchronously, allow clustering. XXX we need a
910 * synchronous clustering mode implementation.
912 * Odd semantics: if start == end, we clean everything.
914 * The object must be locked? XXX
916 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
917 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
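/*
 * Usage sketch (illustrative): start == end == 0 cleans the entire object,
 * which is how vm_object_terminate() flushes a vnode object's dirty pages
 * before tearing it down:
 *
 *	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 */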
920 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
923 struct rb_vm_page_scan_info info;
929 vm_object_hold(object);
930 if (object->type != OBJT_VNODE ||
931 (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
932 vm_object_drop(object);
936 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
937 VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
938 pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
943 * Interlock other major object operations. This allows us to
944 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
946 vm_object_set_flag(object, OBJ_CLEANING);
949 * Handle 'entire object' case
951 info.start_pindex = start;
953 info.end_pindex = object->size - 1;
955 info.end_pindex = end - 1;
957 wholescan = (start == 0 && info.end_pindex == object->size - 1);
959 info.pagerflags = pagerflags;
960 info.object = object;
963 * If cleaning the entire object do a pass to mark the pages read-only.
964 * If everything worked out ok, clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
970 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
971 vm_object_page_clean_pass1, &info);
972 if (info.error == 0) {
973 vm_object_clear_flag(object,
974 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
975 if (object->type == OBJT_VNODE &&
976 (vp = (struct vnode *)object->handle) != NULL) {
978 * Use new-style interface to clear VISDIRTY
979 * because the vnode is not necessarily removed
980 * from the syncer list(s) as often as it was
981 * under the old interface, which can leave
982 * the vnode on the syncer list after reclaim.
990 * Do a pass to clean all the dirty pages we find.
995 generation = object->generation;
996 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
997 vm_object_page_clean_pass2, &info);
998 } while (info.error || generation != object->generation);
1000 vm_object_clear_flag(object, OBJ_CLEANING);
1001 vm_object_drop(object);
1005 * The caller must hold the object.
1009 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1011 struct rb_vm_page_scan_info *info = data;
1013 KKASSERT(p->object == info->object);
1015 vm_page_flag_set(p, PG_CLEANCHK);
1016 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1018 } else if (vm_page_busy_try(p, FALSE)) {
1021 KKASSERT(p->object == info->object);
1022 vm_page_protect(p, VM_PROT_READ);
1027 * Must be at end to avoid SMP races, caller holds object token
1029 if ((++info->count & 63) == 0)
1035 * The caller must hold the object
1039 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1041 struct rb_vm_page_scan_info *info = data;
1044 KKASSERT(p->object == info->object);
1047 * Do not mess with pages that were inserted after we started
1048 * the cleaning pass.
1050 if ((p->flags & PG_CLEANCHK) == 0)
1053 generation = info->object->generation;
1055 if (vm_page_busy_try(p, TRUE)) {
1056 vm_page_sleep_busy(p, TRUE, "vpcwai");
1061 KKASSERT(p->object == info->object &&
1062 info->object->generation == generation);
1065 * Before wasting time traversing the pmaps, check for trivial
1066 * cases where the page cannot be dirty.
1068 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1069 KKASSERT((p->dirty & p->valid) == 0 &&
1070 (p->flags & PG_NEED_COMMIT) == 0);
1076 * Check whether the page is dirty or not. The page has been set
1077 * to be read-only so the check will not race a user dirtying the page.
1080 vm_page_test_dirty(p);
1081 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1082 vm_page_flag_clear(p, PG_CLEANCHK);
1088 * If we have been asked to skip nosync pages and this is a
1089 * nosync page, skip it. Note that the object flags were
1090 * not cleared in this case (because pass1 will have returned an
1091 * error), so we do not have to set them.
1093 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1094 vm_page_flag_clear(p, PG_CLEANCHK);
1100 * Flush as many pages as we can. PG_CLEANCHK will be cleared on
1101 * the pages that get successfully flushed. Set info->error if
1102 * we raced an object modification.
1104 vm_object_page_collect_flush(info->object, p, info->pagerflags);
1105 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1108 * Must be at end to avoid SMP races, caller holds object token
1111 if ((++info->count & 63) == 0)
1117 * Collect the specified page and nearby pages and flush them out.
1118 * The number of pages flushed is returned. The passed page is busied
1119 * by the caller and we are responsible for its disposition.
1121 * The caller must hold the object.
1124 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1132 vm_page_t ma[BLIST_MAX_ALLOC];
1134 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1137 page_base = pi % BLIST_MAX_ALLOC;
1145 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1151 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1152 (tp->flags & PG_CLEANCHK) == 0) {
1156 if ((tp->queue - tp->pc) == PQ_CACHE) {
1157 vm_page_flag_clear(tp, PG_CLEANCHK);
1161 vm_page_test_dirty(tp);
1162 if ((tp->dirty & tp->valid) == 0 &&
1163 (tp->flags & PG_NEED_COMMIT) == 0) {
1164 vm_page_flag_clear(tp, PG_CLEANCHK);
1173 while (is < BLIST_MAX_ALLOC &&
1174 pi - page_base + is < object->size) {
1177 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1183 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1184 (tp->flags & PG_CLEANCHK) == 0) {
1188 if ((tp->queue - tp->pc) == PQ_CACHE) {
1189 vm_page_flag_clear(tp, PG_CLEANCHK);
1193 vm_page_test_dirty(tp);
1194 if ((tp->dirty & tp->valid) == 0 &&
1195 (tp->flags & PG_NEED_COMMIT) == 0) {
1196 vm_page_flag_clear(tp, PG_CLEANCHK);
1205 * All pages in the ma[] array are busied now
1207 for (i = ib; i < is; ++i) {
1208 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1209 vm_page_hold(ma[i]); /* XXX need this any more? */
1211 vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1212 for (i = ib; i < is; ++i) /* XXX need this any more? */
1213 vm_page_unhold(ma[i]);
1217 * Implements the madvise function at the object/page level.
1219 * MADV_WILLNEED (any object)
1221 * Activate the specified pages if they are resident.
1223 * MADV_DONTNEED (any object)
1225 * Deactivate the specified pages if they are resident.
1227 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1229 * Deactivate and clean the specified pages if they are
1230 * resident. This permits the process to reuse the pages
1231 * without faulting or the kernel to reclaim the pages without I/O.
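/*
 * Userland-side sketch of the MADV_FREE semantics described above
 * (illustrative, not part of this module):
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	(addr/len describe an anonymous, private mapping)
 *	if (madvise(addr, len, MADV_FREE) == -1)
 *		perror("madvise");
 *	(the pages stay mapped; the process can reuse them without faulting)
 */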
1237 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1238 vm_pindex_t count, int advise)
1247 end = pindex + count;
1249 vm_object_hold(object);
1252 * Locate and adjust resident pages. This only applies to the
1253 * primary object in the mapping.
1255 for (; pindex < end; pindex += 1) {
1258 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1259 * and those pages must be OBJ_ONEMAPPING.
1261 if (advise == MADV_FREE) {
1262 if ((object->type != OBJT_DEFAULT &&
1263 object->type != OBJT_SWAP) ||
1264 (object->flags & OBJ_ONEMAPPING) == 0) {
1269 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1272 vm_page_sleep_busy(m, TRUE, "madvpo");
1277 * There may be swap even if there is no backing page
1279 if (advise == MADV_FREE && object->type == OBJT_SWAP)
1280 swap_pager_freespace(object, pindex, 1);
1285 * If the page is not in a normal active state, we skip it.
1286 * If the page is not managed there are no page queues to
1287 * mess with. Things can break if we mess with pages in
1288 * any of the below states.
1290 if (m->wire_count ||
1291 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1293 m->valid != VM_PAGE_BITS_ALL
1300 * Theoretically once a page is known not to be busy, an
1301 * interrupt cannot come along and rip it out from under us.
1303 if (advise == MADV_WILLNEED) {
1304 vm_page_activate(m);
1305 } else if (advise == MADV_DONTNEED) {
1306 vm_page_dontneed(m);
1307 } else if (advise == MADV_FREE) {
1309 * Mark the page clean. This will allow the page
1310 * to be freed up by the system. However, such pages
1311 * are often reused quickly by malloc()/free()
1312 * so we do not do anything that would cause
1313 * a page fault if we can help it.
1315 * Specifically, we do not try to actually free
1316 * the page now nor do we try to put it in the
1317 * cache (which would cause a page fault on reuse).
1319 * But we do make the page as freeable as we
1320 * can without actually taking the step of unmapping it.
1323 pmap_clear_modify(m);
1326 vm_page_dontneed(m);
1327 if (object->type == OBJT_SWAP)
1328 swap_pager_freespace(object, pindex, 1);
1332 vm_object_drop(object);
1336 * Removes all physical pages in the specified object range from the
1337 * object's list of pages.
1341 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1344 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1345 boolean_t clean_only)
1347 struct rb_vm_page_scan_info info;
1351 * Degenerate cases and assertions
1353 vm_object_hold(object);
1354 if (object == NULL ||
1355 (object->resident_page_count == 0 && object->swblock_count == 0)) {
1356 vm_object_drop(object);
1359 KASSERT(object->type != OBJT_PHYS,
1360 ("attempt to remove pages from a physical object"));
1363 * Indicate that paging is occurring on the object
1365 vm_object_pip_add(object, 1);
1368 * Figure out the actual removal range and whether we are removing
1369 * the entire contents of the object or not. If removing the entire
1370 * contents, be sure to get all pages, even those that might be
1371 * beyond the end of the object.
1373 info.object = object;
1374 info.start_pindex = start;
1376 info.end_pindex = (vm_pindex_t)-1;
1378 info.end_pindex = end - 1;
1379 info.limit = clean_only;
1381 all = (start == 0 && info.end_pindex >= object->size - 1);
1384 * Loop until we are sure we have gotten them all.
1388 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1389 vm_object_page_remove_callback, &info);
1390 } while (info.error);
1393 * Remove any related swap if throwing away pages, or for
1394 * non-swap objects (the swap is a clean copy in that case).
1396 if (object->type != OBJT_SWAP || clean_only == FALSE) {
1398 swap_pager_freespace_all(object);
1400 swap_pager_freespace(object, info.start_pindex,
1401 info.end_pindex - info.start_pindex + 1);
1407 vm_object_pip_wakeup(object);
1408 vm_object_drop(object);
1412 * The caller must hold the object.
1414 * NOTE: User yields are allowed when removing more than one page, but not
1415 * allowed if only removing one page (the path for single page removals
1416 * might hold a spinlock).
1419 vm_object_page_remove_callback(vm_page_t p, void *data)
1421 struct rb_vm_page_scan_info *info = data;
1423 if (info->object != p->object ||
1424 p->pindex < info->start_pindex ||
1425 p->pindex > info->end_pindex) {
1426 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1430 if (vm_page_busy_try(p, TRUE)) {
1431 vm_page_sleep_busy(p, TRUE, "vmopar");
1435 if (info->object != p->object) {
1436 /* this should never happen */
1437 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1444 * Wired pages cannot be destroyed, but they can be invalidated
1445 * and we do so if clean_only (limit) is not set.
1447 * WARNING! The page may be wired due to being part of a buffer
1448 * cache buffer, and the buffer might be marked B_CACHE.
1449 * This is fine as part of a truncation but VFSs must be
1450 * sure to fix the buffer up when re-extending the file.
1452 * NOTE! PG_NEED_COMMIT is ignored.
1454 if (p->wire_count != 0) {
1455 vm_page_protect(p, VM_PROT_NONE);
1456 if (info->limit == 0)
1463 * limit is our clean_only flag. If set and the page is dirty or
1464 * requires a commit, do not free it. If set and the page is being
1465 * held by someone, do not free it.
1467 if (info->limit && p->valid) {
1468 vm_page_test_dirty(p);
1469 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1476 * Destroy the page. But we have to re-test whether it's dirty after
1477 * removing it from its pmaps.
1479 vm_page_protect(p, VM_PROT_NONE);
1480 if (info->limit && p->valid) {
1481 vm_page_test_dirty(p);
1482 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1490 * Must be at end to avoid SMP races, caller holds object token
1493 if ((++info->count & 63) == 0)
1500 * Try to extend prev_object into an adjoining region of virtual
1501 * memory, return TRUE on success.
1503 * The caller does not need to hold (prev_object) but must have a stable
1504 * pointer to it (typically by holding the vm_map locked).
1506 * This function only works for anonymous memory objects which either
1507 * have (a) one reference or (b) we are extending the object's size.
1508 * Otherwise the related VM pages we want to use for the object might
1509 * be in use by another mapping.
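/*
 * Illustrative example: if prev_object already backs a mapping starting at
 * page index prev_pindex and the mapping is being extended by next_size
 * bytes immediately after the existing prev_size bytes, the map code can
 * attempt
 *
 *	vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size);
 *
 * and on a TRUE return simply grow the existing entry instead of creating
 * a second object for the new range (sizes are in bytes and converted to
 * pages internally).
 */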
1512 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1513 vm_size_t prev_size, vm_size_t next_size)
1515 vm_pindex_t next_pindex;
1517 if (prev_object == NULL)
1520 vm_object_hold(prev_object);
1522 if (prev_object->type != OBJT_DEFAULT &&
1523 prev_object->type != OBJT_SWAP) {
1524 vm_object_drop(prev_object);
1529 /* caller now checks this */
1531 * Try to collapse the object first
1533 vm_object_collapse(prev_object, NULL);
1537 /* caller now checks this */
1539 * We can't coalesce if we shadow another object (figuring out the
1540 * relationships become too complex).
1542 if (prev_object->backing_object != NULL) {
1543 vm_object_chain_release(prev_object);
1544 vm_object_drop(prev_object);
1549 prev_size >>= PAGE_SHIFT;
1550 next_size >>= PAGE_SHIFT;
1551 next_pindex = prev_pindex + prev_size;
1554 * We can't if the object has more than one ref count unless we
1555 * are extending it into newly minted space.
1557 if (prev_object->ref_count > 1 &&
1558 prev_object->size != next_pindex) {
1559 vm_object_drop(prev_object);
1564 * Remove any pages that may still be in the object from a previous
1567 if (next_pindex < prev_object->size) {
1568 vm_object_page_remove(prev_object,
1570 next_pindex + next_size, FALSE);
1571 if (prev_object->type == OBJT_SWAP)
1572 swap_pager_freespace(prev_object,
1573 next_pindex, next_size);
1577 * Extend the object if necessary.
1579 if (next_pindex + next_size > prev_object->size)
1580 prev_object->size = next_pindex + next_size;
1581 vm_object_drop(prev_object);
1587 * Make the object writable and flag it as being possibly dirty.
1589 * The object might not be held (or might be held but held shared),
1590 * the related vnode is probably not held either. Object and vnode are
1591 * stable by virtue of the vm_page busied by the caller preventing destruction.
1594 * If the related mount is flagged MNTK_THR_SYNC we need to call
1595 * vsetobjdirty(). Filesystems using this option usually shortcut
1596 * synchronization by only scanning the syncer list.
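/*
 * Call-site sketch (illustrative assumption): a write fault busies the page,
 * flags the backing object via this routine, and only then dirties the page:
 *
 *	(page m is busied by the caller)
 *	vm_object_set_writeable_dirty(m->object);
 *	vm_page_dirty(m);               (hypothetical follow-up)
 */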
1599 vm_object_set_writeable_dirty(vm_object_t object)
1603 /*vm_object_assert_held(object);*/
1605 * Avoid contention in vm fault path by checking the state before
1606 * issuing an atomic op on it.
1608 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1609 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1610 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1612 if (object->type == OBJT_VNODE &&
1613 (vp = (struct vnode *)object->handle) != NULL) {
1614 if ((vp->v_flag & VOBJDIRTY) == 0) {
1616 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1618 * New style THR_SYNC places vnodes on the
1619 * syncer list more deterministically.
1624 * Old style scan would not necessarily place
1625 * a vnode on the syncer list when possibly
1626 * modified via mmap.
1628 vsetflags(vp, VOBJDIRTY);
1634 #include "opt_ddb.h"
1636 #include <sys/cons.h>
1638 #include <ddb/ddb.h>
1640 static int _vm_object_in_map (vm_map_t map, vm_object_t object,
1641 vm_map_entry_t entry);
1642 static int vm_object_in_map (vm_object_t object);
1645 * The caller must hold the object.
1648 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1650 vm_map_backing_t ba;
1652 vm_map_entry_t tmpe;
1657 if (entry == NULL) {
1658 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1659 entcount = map->nentries;
1660 while (entcount-- && tmpe) {
1661 if( _vm_object_in_map(map, object, tmpe)) {
1664 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1668 switch(entry->maptype) {
1669 case VM_MAPTYPE_SUBMAP:
1670 tmpm = entry->ba.sub_map;
1671 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1672 entcount = tmpm->nentries;
1673 while (entcount-- && tmpe) {
1674 if( _vm_object_in_map(tmpm, object, tmpe)) {
1677 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1680 case VM_MAPTYPE_NORMAL:
1681 case VM_MAPTYPE_VPAGETABLE:
1684 if (ba->object == object)
1686 ba = ba->backing_ba;
1695 static int vm_object_in_map_callback(struct proc *p, void *data);
1697 struct vm_object_in_map_info {
1706 vm_object_in_map(vm_object_t object)
1708 struct vm_object_in_map_info info;
1711 info.object = object;
1713 allproc_scan(vm_object_in_map_callback, &info, 0);
1716 if( _vm_object_in_map(&kernel_map, object, 0))
1718 if( _vm_object_in_map(&pager_map, object, 0))
1720 if( _vm_object_in_map(&buffer_map, object, 0))
1729 vm_object_in_map_callback(struct proc *p, void *data)
1731 struct vm_object_in_map_info *info = data;
1734 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1742 DB_SHOW_COMMAND(vmochk, vm_object_check)
1744 struct vm_object_hash *hash;
1749 * make sure that internal objs are in a map somewhere
1750 * and none have zero ref counts.
1752 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1753 hash = &vm_object_hash[n];
1754 for (object = TAILQ_FIRST(&hash->list);
1756 object = TAILQ_NEXT(object, object_entry)) {
1757 if (object->type == OBJT_MARKER)
1759 if (object->handle != NULL ||
1760 (object->type != OBJT_DEFAULT &&
1761 object->type != OBJT_SWAP)) {
1764 if (object->ref_count == 0) {
1765 db_printf("vmochk: internal obj has "
1766 "zero ref count: %ld\n",
1767 (long)object->size);
1769 if (vm_object_in_map(object))
1771 db_printf("vmochk: internal obj is not in a map: "
1772 "ref: %d, size: %lu: 0x%lx\n",
1773 object->ref_count, (u_long)object->size,
1774 (u_long)object->size);
1782 DB_SHOW_COMMAND(object, vm_object_print_static)
1784 /* XXX convert args. */
1785 vm_object_t object = (vm_object_t)addr;
1786 boolean_t full = have_addr;
1790 /* XXX count is an (unused) arg. Avoid shadowing it. */
1791 #define count was_count
1799 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1800 object, (int)object->type, (u_long)object->size,
1801 object->resident_page_count, object->ref_count, object->flags);
1803 * XXX no %qd in kernel. Truncate object->backing_object_offset.
1812 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1814 db_iprintf("memory:=");
1815 else if (count == 6) {
1823 db_printf("(off=0x%lx,page=0x%lx)",
1824 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1835 * XXX need this non-static entry for calling from vm_map_print.
1840 vm_object_print(/* db_expr_t */ long addr,
1841 boolean_t have_addr,
1842 /* db_expr_t */ long count,
1845 vm_object_print_static(addr, have_addr, count, modif);
1851 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1853 struct vm_object_hash *hash;
1859 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1860 hash = &vm_object_hash[n];
1861 for (object = TAILQ_FIRST(&hash->list);
1863 object = TAILQ_NEXT(object, object_entry)) {
1864 vm_pindex_t idx, fidx;
1866 vm_paddr_t pa = -1, padiff;
1870 if (object->type == OBJT_MARKER)
1872 db_printf("new object: %p\n", (void *)object);
1882 osize = object->size;
1885 for (idx = 0; idx < osize; idx++) {
1886 m = vm_page_lookup(object, idx);
1889 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1890 (long)fidx, rcount, (long)pa);
1904 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1909 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1910 padiff >>= PAGE_SHIFT;
1911 padiff &= PQ_L2_MASK;
1913 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1917 db_printf(" index(%ld)run(%d)pa(0x%lx)",
1918 (long)fidx, rcount, (long)pa);
1919 db_printf("pd(%ld)\n", (long)padiff);
1929 pa = VM_PAGE_TO_PHYS(m);
1933 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1934 (long)fidx, rcount, (long)pa);