1 /*
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the University of
19  *      California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
65  * $DragonFly: src/sys/vm/vm_object.c,v 1.21 2004/11/12 00:09:56 dillon Exp $
66  */
67
68 /*
69  *      Virtual memory object module.
70  */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>           /* for curproc, pageproc */
75 #include <sys/vnode.h>
76 #include <sys/vmmeter.h>
77 #include <sys/mman.h>
78 #include <sys/mount.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_pageout.h>
89 #include <vm/vm_pager.h>
90 #include <vm/swap_pager.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/vm_zone.h>
94
95 #include <sys/thread2.h>
96
97 #define EASY_SCAN_FACTOR        8
98
99 #define MSYNC_FLUSH_HARDSEQ     0x01
100 #define MSYNC_FLUSH_SOFTSEQ     0x02
101
102 static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
103 SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags,
104         CTLFLAG_RW, &msync_flush_flags, 0, "");
105
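/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the knob above is exported read/write as "vm.msync_flush_flags", so a
 * hypothetical userland program could inspect it with sysctlbyname(3).
 * Everything below is assumption-level example code.
 */
#if 0	/* userland example only, never compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int flags;
	size_t len = sizeof(flags);

	/* read the current flush strategy flags */
	if (sysctlbyname("vm.msync_flush_flags", &flags, &len, NULL, 0) == 0)
		printf("vm.msync_flush_flags = %d\n", flags);
	return (0);
}
#endif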
106 static void     vm_object_qcollapse (vm_object_t object);
107 static int      vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
108
109 /*
110  *      Virtual memory objects maintain the actual data
111  *      associated with allocated virtual memory.  A given
112  *      page of memory exists within exactly one object.
113  *
114  *      An object is only deallocated when all "references"
115  *      are given up.  Only one "reference" to a given
116  *      region of an object should be writeable.
117  *
118  *      Associated with each object is a list of all resident
119  *      memory pages belonging to that object; this list is
120  *      maintained by the "vm_page" module, and locked by the object's
121  *      lock.
122  *
123  *      Each object also records a "pager" routine which is
124  *      used to retrieve (and store) pages to the proper backing
125  *      storage.  In addition, objects may be backed by other
126  *      objects from which they were virtual-copied.
127  *
128  *      The only items within the object structure which are
129  *      modified after time of creation are:
130  *              reference count         locked by object's lock
131  *              pager routine           locked by object's lock
132  *
133  */
134
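/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the reference-count life cycle described above, using only routines
 * defined later in this file.  The helper name and the 32-page size are
 * invented for illustration.
 */
#if 0	/* example only, never compiled */
static void
example_object_lifecycle(void)
{
	vm_object_t obj;

	obj = vm_object_allocate(OBJT_DEFAULT, 32);	/* ref_count == 1 */
	vm_object_reference(obj);			/* ref_count == 2 */

	/* each holder drops its reference; the final drop may free it */
	vm_object_deallocate(obj);
	vm_object_deallocate(obj);
}
#endif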
135 struct object_q vm_object_list;
136 static long vm_object_count;            /* count of all objects */
137 vm_object_t kernel_object;
138 vm_object_t kmem_object;
139 static struct vm_object kernel_object_store;
140 static struct vm_object kmem_object_store;
141 extern int vm_pageout_page_count;
142
143 static long object_collapses;
144 static long object_bypasses;
145 static int next_index;
146 static vm_zone_t obj_zone;
147 static struct vm_zone obj_zone_store;
148 static int object_hash_rand;
149 #define VM_OBJECTS_INIT 256
150 static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
151
152 void
153 _vm_object_allocate(objtype_t type, vm_size_t size, vm_object_t object)
154 {
155         int incr;
156         TAILQ_INIT(&object->memq);
157         LIST_INIT(&object->shadow_head);
158
159         object->type = type;
160         object->size = size;
161         object->ref_count = 1;
162         object->flags = 0;
163         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
164                 vm_object_set_flag(object, OBJ_ONEMAPPING);
165         object->paging_in_progress = 0;
166         object->resident_page_count = 0;
167         object->shadow_count = 0;
168         object->pg_color = next_index;
169         if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
170                 incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
171         else
172                 incr = size;
173         next_index = (next_index + incr) & PQ_L2_MASK;
174         object->handle = NULL;
175         object->backing_object = NULL;
176         object->backing_object_offset = (vm_ooffset_t) 0;
177         /*
178          * Try to generate a number that will spread objects out in the
179          * hash table.  We 'wipe' new objects across the hash in 128 page
180          * increments plus 1 more to offset it a little more by the time
181          * it wraps around.
182          */
183         object->hash_rand = object_hash_rand - 129;
184
185         object->generation++;
186
187         crit_enter();
188         TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
189         vm_object_count++;
190         object_hash_rand = object->hash_rand;
191         crit_exit();
192 }
193
194 /*
195  *      vm_object_init:
196  *
197  *      Initialize the VM objects module.
198  */
199 void
200 vm_object_init(void)
201 {
202         TAILQ_INIT(&vm_object_list);
203         
204         kernel_object = &kernel_object_store;
205         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
206             kernel_object);
207
208         kmem_object = &kmem_object_store;
209         _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
210             kmem_object);
211
212         obj_zone = &obj_zone_store;
213         zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
214                 vm_objects_init, VM_OBJECTS_INIT);
215 }
216
217 void
218 vm_object_init2(void)
219 {
220         zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
221 }
222
223 /*
224  *      vm_object_allocate:
225  *
226  *      Returns a new object with the given size.
227  */
228
229 vm_object_t
230 vm_object_allocate(objtype_t type, vm_size_t size)
231 {
232         vm_object_t result;
233
234         result = (vm_object_t) zalloc(obj_zone);
235
236         _vm_object_allocate(type, size, result);
237
238         return (result);
239 }
240
241
242 /*
243  *      vm_object_reference:
244  *
245  *      Gets another reference to the given object.
246  */
247 void
248 vm_object_reference(vm_object_t object)
249 {
250         if (object == NULL)
251                 return;
252
253 #if 0
254         /* object can be re-referenced during final cleaning */
255         KASSERT(!(object->flags & OBJ_DEAD),
256             ("vm_object_reference: attempting to reference dead obj"));
257 #endif
258
259         object->ref_count++;
260         if (object->type == OBJT_VNODE) {
261                 vref(object->handle);
262                 /* XXX what if the vnode is being destroyed? */
263 #if 0
264                 while (vget((struct vnode *) object->handle, 
265                     LK_RETRY|LK_NOOBJ, curthread)) {
266                         printf("vm_object_reference: delay in getting object\n");
267                 }
268 #endif
269         }
270 }
271
272 void
273 vm_object_vndeallocate(vm_object_t object)
274 {
275         struct vnode *vp = (struct vnode *) object->handle;
276
277         KASSERT(object->type == OBJT_VNODE,
278             ("vm_object_vndeallocate: not a vnode object"));
279         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
280 #ifdef INVARIANTS
281         if (object->ref_count == 0) {
282                 vprint("vm_object_vndeallocate", vp);
283                 panic("vm_object_vndeallocate: bad object reference count");
284         }
285 #endif
286
287         object->ref_count--;
288         if (object->ref_count == 0) {
289                 vp->v_flag &= ~VTEXT;
290                 vm_object_clear_flag(object, OBJ_OPT);
291         }
292         vrele(vp);
293 }
294
295 /*
296  *      vm_object_deallocate:
297  *
298  *      Release a reference to the specified object,
299  *      gained either through a vm_object_allocate
300  *      or a vm_object_reference call.  When all references
301  *      are gone, storage associated with this object
302  *      may be relinquished.
303  *
304  *      No object may be locked.
305  */
306 void
307 vm_object_deallocate(vm_object_t object)
308 {
309         vm_object_t temp;
310
311         while (object != NULL) {
312
313                 if (object->type == OBJT_VNODE) {
314                         vm_object_vndeallocate(object);
315                         return;
316                 }
317
318                 if (object->ref_count == 0) {
319                         panic("vm_object_deallocate: object deallocated too many times: %d", object->type);
320                 } else if (object->ref_count > 2) {
321                         object->ref_count--;
322                         return;
323                 }
324
325                 /*
326                  * Here on ref_count of one or two, which are special cases for
327                  * objects.
328                  */
329                 if ((object->ref_count == 2) && (object->shadow_count == 0)) {
330                         vm_object_set_flag(object, OBJ_ONEMAPPING);
331                         object->ref_count--;
332                         return;
333                 } else if ((object->ref_count == 2) && (object->shadow_count == 1)) {
334                         object->ref_count--;
335                         if ((object->handle == NULL) &&
336                             (object->type == OBJT_DEFAULT ||
337                              object->type == OBJT_SWAP)) {
338                                 vm_object_t robject;
339
340                                 robject = LIST_FIRST(&object->shadow_head);
341                                 KASSERT(robject != NULL,
342                                     ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
343                                          object->ref_count,
344                                          object->shadow_count));
345                                 if ((robject->handle == NULL) &&
346                                     (robject->type == OBJT_DEFAULT ||
347                                      robject->type == OBJT_SWAP)) {
348
349                                         robject->ref_count++;
350
351                                         while (
352                                                 robject->paging_in_progress ||
353                                                 object->paging_in_progress
354                                         ) {
355                                                 vm_object_pip_sleep(robject, "objde1");
356                                                 vm_object_pip_sleep(object, "objde2");
357                                         }
358
359                                         if (robject->ref_count == 1) {
360                                                 robject->ref_count--;
361                                                 object = robject;
362                                                 goto doterm;
363                                         }
364
365                                         object = robject;
366                                         vm_object_collapse(object);
367                                         continue;
368                                 }
369                         }
370
371                         return;
372
373                 } else {
374                         object->ref_count--;
375                         if (object->ref_count != 0)
376                                 return;
377                 }
378
379 doterm:
380
381                 temp = object->backing_object;
382                 if (temp) {
383                         LIST_REMOVE(object, shadow_list);
384                         temp->shadow_count--;
385                         if (temp->ref_count == 0)
386                                 vm_object_clear_flag(temp, OBJ_OPT);
387                         temp->generation++;
388                         object->backing_object = NULL;
389                 }
390
391                 /*
392                  * Don't double-terminate, we could be in a termination
393                  * recursion due to the terminate having to sync data
394                  * to disk.
395                  */
396                 if ((object->flags & OBJ_DEAD) == 0)
397                         vm_object_terminate(object);
398                 object = temp;
399         }
400 }
401
402 /*
403  *      vm_object_terminate actually destroys the specified object, freeing
404  *      up all previously used resources.
405  *
406  *      The object must be locked.
407  *      This routine may block.
408  */
409 void
410 vm_object_terminate(vm_object_t object)
411 {
412         vm_page_t p;
413
414         /*
415          * Make sure no one uses us.
416          */
417         vm_object_set_flag(object, OBJ_DEAD);
418
419         /*
420          * wait for the pageout daemon to be done with the object
421          */
422         vm_object_pip_wait(object, "objtrm");
423
424         KASSERT(!object->paging_in_progress,
425                 ("vm_object_terminate: pageout in progress"));
426
427         /*
428          * Clean and free the pages, as appropriate. All references to the
429          * object are gone, so we don't need to lock it.
430          */
431         if (object->type == OBJT_VNODE) {
432                 struct vnode *vp;
433
434                 /*
435                  * Freeze optimized copies.
436                  */
437                 vm_freeze_copyopts(object, 0, object->size);
438
439                 /*
440                  * Clean pages and flush buffers.
441                  */
442                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
443
444                 vp = (struct vnode *) object->handle;
445                 vinvalbuf(vp, V_SAVE, NULL, 0, 0);
446         }
447
448         /*
449          * Wait for any I/O to complete, after which there had better not
450          * be any references left on the object.
451          */
452         vm_object_pip_wait(object, "objtrm");
453
454         if (object->ref_count != 0)
455                 panic("vm_object_terminate: object with references, ref_count=%d", object->ref_count);
456
457         /*
458          * Now free any remaining pages. For internal objects, this also
459          * removes them from paging queues. Don't free wired pages, just
460          * remove them from the object. 
461          */
462         crit_enter();
463         while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
464                 if (p->busy || (p->flags & PG_BUSY))
465                         panic("vm_object_terminate: freeing busy page %p", p);
466                 if (p->wire_count == 0) {
467                         vm_page_busy(p);
468                         vm_page_free(p);
469                         mycpu->gd_cnt.v_pfree++;
470                 } else {
471                         vm_page_busy(p);
472                         vm_page_remove(p);
473                         vm_page_wakeup(p);
474                 }
475         }
476         crit_exit();
477
478         /*
479          * Let the pager know object is dead.
480          */
481         vm_pager_deallocate(object);
482
483         /*
484          * Remove the object from the global object list.
485          */
486         crit_enter();
487         TAILQ_REMOVE(&vm_object_list, object, object_list);
488         vm_object_count--;
489         crit_exit();
490
491         wakeup(object);
492         if (object->ref_count != 0)
493                 panic("vm_object_terminate2: object with references, ref_count=%d", object->ref_count);
494
495         /*
496          * Free the space for the object.
497          */
498         zfree(obj_zone, object);
499 }
500
501 /*
502  *      vm_object_page_clean
503  *
504  *      Clean all dirty pages in the specified range of object.  Leaves page 
505  *      on whatever queue it is currently on.   If NOSYNC is set then do not
506  *      write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
507  *      leaving the object dirty.
508  *
509  *      When stuffing pages asynchronously, allow clustering.  XXX we need a
510  *      synchronous clustering mode implementation.
511  *
512  *      Odd semantics: if start == end, we clean everything.
513  */
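/*
 * Illustrative sketch (editor's addition): the start == end convention
 * noted above.  vm_object_terminate() below uses exactly this form to
 * synchronously flush an entire vnode-backed object; a bounded range
 * passes explicit page indices instead.
 */
#if 0	/* example only, never compiled */
	/* clean every dirty page in the object, waiting for the writes */
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);

	/* clean only pages [16, 32), allowing asynchronous clustering */
	vm_object_page_clean(object, 16, 32, 0);
#endif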
514
515 void
516 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
517     int flags)
518 {
519         vm_page_t p, np;
520         vm_offset_t tstart, tend;
521         vm_pindex_t pi;
522         struct vnode *vp;
523         int clearobjflags;
524         int pagerflags;
525         int curgeneration;
526
527         if (object->type != OBJT_VNODE ||
528                 (object->flags & OBJ_MIGHTBEDIRTY) == 0)
529                 return;
530
531         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
532         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
533
534         vp = object->handle;
535
536         vm_object_set_flag(object, OBJ_CLEANING);
537
538         /*
539          * Handle 'entire object' case
540          */
541         tstart = start;
542         if (end == 0) {
543                 tend = object->size;
544         } else {
545                 tend = end;
546         }
547
548         /*
549          * If the caller is smart and only msync()s a range he knows is
550          * dirty, we may be able to avoid an object scan.  This results in
551          * a phenomenal improvement in performance.  We cannot do this
552          * as a matter of course because the object may be huge - e.g.
553          * the size might be in the gigabytes or terabytes.
554          */
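        /*
         * Editor's worked example (assumption-level): with
         * EASY_SCAN_FACTOR = 8 and, say, 1000 resident pages, scanreset
         * below is 1000 / 8 = 125, so the sequential pass gives up and
         * falls through to the general memq scan only after 125
         * consecutive clean or missing pages; the counter is reset
         * whenever a dirty page is found and flushed.  Objects with
         * fewer than 128 resident pages use the floor of 16.
         */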
555         if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
556                 vm_offset_t tscan;
557                 int scanlimit;
558                 int scanreset;
559
560                 scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
561                 if (scanreset < 16)
562                         scanreset = 16;
563                 pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
564
565                 scanlimit = scanreset;
566                 tscan = tstart;
567
568                 /*
569                  * spl protection is required despite the obj generation
570                  * tracking because we cannot safely call vm_page_test_dirty()
571                  * or avoid page field tests against an interrupt unbusy/free
572                  * race that might occur prior to the busy check in
573                  * vm_object_page_collect_flush().
574                  */
575                 crit_enter();
576                 while (tscan < tend) {
577                         curgeneration = object->generation;
578                         p = vm_page_lookup(object, tscan);
579                         if (p == NULL || p->valid == 0 ||
580                             (p->queue - p->pc) == PQ_CACHE) {
581                                 if (--scanlimit == 0)
582                                         break;
583                                 ++tscan;
584                                 continue;
585                         }
586                         vm_page_test_dirty(p);
587                         if ((p->dirty & p->valid) == 0) {
588                                 if (--scanlimit == 0)
589                                         break;
590                                 ++tscan;
591                                 continue;
592                         }
593                         /*
594                          * If we have been asked to skip nosync pages and 
595                          * this is a nosync page, we can't continue.
596                          */
597                         if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
598                                 if (--scanlimit == 0)
599                                         break;
600                                 ++tscan;
601                                 continue;
602                         }
603                         scanlimit = scanreset;
604
605                         /*
606                          * This returns 0 if it was unable to busy the first
607                          * page (i.e. had to sleep).
608                          */
609                         tscan += vm_object_page_collect_flush(object, p, 
610                                                 curgeneration, pagerflags);
611                 }
612                 crit_exit();
613
614                 /*
615                  * If everything was dirty and we flushed it successfully,
616                  * and the requested range is not the entire object, we
617                  * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
618                  * return immediately.
619                  */
620                 if (tscan >= tend && (tstart || tend < object->size)) {
621                         vm_object_clear_flag(object, OBJ_CLEANING);
622                         return;
623                 }
624                 pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
625         }
626
627         /*
628          * Generally set CLEANCHK interlock and make the page read-only so
629          * we can then clear the object flags.
630          *
631          * However, if this is a nosync mmap then the object is likely to 
632          * stay dirty so do not mess with the page and do not clear the
633          * object flags.
634          *
635          * spl protection is required because an interrupt can remove page
636          * from the object.
637          */
638         clearobjflags = 1;
639
640         crit_enter();
641         for (p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) {
642                 vm_page_flag_set(p, PG_CLEANCHK);
643                 if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
644                         clearobjflags = 0;
645                 else
646                         vm_page_protect(p, VM_PROT_READ);
647         }
648         crit_exit();
649
650         if (clearobjflags && (tstart == 0) && (tend == object->size)) {
651                 struct vnode *vp;
652
653                 vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
654                 if (object->type == OBJT_VNODE &&
655                     (vp = (struct vnode *)object->handle) != NULL) {
656                         if (vp->v_flag & VOBJDIRTY) 
657                                 vclrflags(vp, VOBJDIRTY);
658                 }
659         }
660
661         /*
662          * spl protection is required both to avoid an interrupt unbusy/free
663          * race against a vm_page_lookup(), and also to ensure that the
664          * memq is consistent.  We do not want a busy page to be ripped out
665          * from under us.
666          */
667         crit_enter();
668 rescan:
669         crit_exit();
670         crit_enter();
671         curgeneration = object->generation;
672
673         for (p = TAILQ_FIRST(&object->memq); p; p = np) {
674                 int n;
675
676                 np = TAILQ_NEXT(p, listq);
677
678 again:
679                 pi = p->pindex;
680                 if (((p->flags & PG_CLEANCHK) == 0) ||
681                         (pi < tstart) || (pi >= tend) ||
682                         (p->valid == 0) ||
683                         ((p->queue - p->pc) == PQ_CACHE)) {
684                         vm_page_flag_clear(p, PG_CLEANCHK);
685                         continue;
686                 }
687
688                 vm_page_test_dirty(p);
689                 if ((p->dirty & p->valid) == 0) {
690                         vm_page_flag_clear(p, PG_CLEANCHK);
691                         continue;
692                 }
693
694                 /*
695                  * If we have been asked to skip nosync pages and this is a
696                  * nosync page, skip it.  Note that the object flags were
697                  * not cleared in this case so we do not have to set them.
698                  */
699                 if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
700                         vm_page_flag_clear(p, PG_CLEANCHK);
701                         continue;
702                 }
703
704                 n = vm_object_page_collect_flush(object, p,
705                         curgeneration, pagerflags);
706                 if (n == 0)
707                         goto rescan;
708                 if (object->generation != curgeneration)
709                         goto rescan;
710
711                 /*
712                  * Try to optimize the next page.  If we can't we pick up
713                  * our (random) scan where we left off.
714                  */
715                 if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
716                         if ((p = vm_page_lookup(object, pi + n)) != NULL)
717                                 goto again;
718                 }
719         }
720         crit_exit();
721
722 #if 0
723         VOP_FSYNC(vp, NULL, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
724 #endif
725
726         vm_object_clear_flag(object, OBJ_CLEANING);
727         return;
728 }
729
730 /*
731  * This routine must be called within a critical section to properly avoid
732  * an interrupt unbusy/free race that can occur prior to the busy check.
733  *
734  * Using the object generation number here to detect page ripout is not
735  * the best idea in the world. XXX
736  *
737  * NOTE: we operate under the assumption that a page found to not be busy
738  * will not be ripped out from under us by an interrupt.  XXX we should
739  * recode this to explicitly busy the pages.
740  */
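/*
 * Editor's note (added for clarity): the routine below assembles the
 * pageout run in ma[] as
 *
 *	ma[0 .. maxb-1]          look-behind pages (mab[] in reverse order)
 *	ma[maxb]                 the page 'p' that triggered the flush
 *	ma[maxb+1 .. maxb+maxf]  look-ahead pages (maf[] in order)
 *
 * so runlen = maxb + maxf + 1.  The return value is 1 plus the number of
 * look-ahead pages that were written out contiguously; look-behind pages
 * are never counted.
 */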
741 static int
742 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
743 {
744         int runlen;
745         int maxf;
746         int chkb;
747         int maxb;
748         int i;
749         vm_pindex_t pi;
750         vm_page_t maf[vm_pageout_page_count];
751         vm_page_t mab[vm_pageout_page_count];
752         vm_page_t ma[vm_pageout_page_count];
753
754         pi = p->pindex;
755         while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
756                 if (object->generation != curgeneration) {
757                         return(0);
758                 }
759         }
760
761         maxf = 0;
762         for(i = 1; i < vm_pageout_page_count; i++) {
763                 vm_page_t tp;
764
765                 if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
766                         if ((tp->flags & PG_BUSY) ||
767                                 ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 
768                                  (tp->flags & PG_CLEANCHK) == 0) ||
769                                 (tp->busy != 0))
770                                 break;
771                         if((tp->queue - tp->pc) == PQ_CACHE) {
772                                 vm_page_flag_clear(tp, PG_CLEANCHK);
773                                 break;
774                         }
775                         vm_page_test_dirty(tp);
776                         if ((tp->dirty & tp->valid) == 0) {
777                                 vm_page_flag_clear(tp, PG_CLEANCHK);
778                                 break;
779                         }
780                         maf[ i - 1 ] = tp;
781                         maxf++;
782                         continue;
783                 }
784                 break;
785         }
786
787         maxb = 0;
788         chkb = vm_pageout_page_count -  maxf;
789         if (chkb) {
790                 for(i = 1; i < chkb;i++) {
791                         vm_page_t tp;
792
793                         if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
794                                 if ((tp->flags & PG_BUSY) ||
795                                         ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 
796                                          (tp->flags & PG_CLEANCHK) == 0) ||
797                                         (tp->busy != 0))
798                                         break;
799                                 if((tp->queue - tp->pc) == PQ_CACHE) {
800                                         vm_page_flag_clear(tp, PG_CLEANCHK);
801                                         break;
802                                 }
803                                 vm_page_test_dirty(tp);
804                                 if ((tp->dirty & tp->valid) == 0) {
805                                         vm_page_flag_clear(tp, PG_CLEANCHK);
806                                         break;
807                                 }
808                                 mab[ i - 1 ] = tp;
809                                 maxb++;
810                                 continue;
811                         }
812                         break;
813                 }
814         }
815
816         for(i = 0; i < maxb; i++) {
817                 int index = (maxb - i) - 1;
818                 ma[index] = mab[i];
819                 vm_page_flag_clear(ma[index], PG_CLEANCHK);
820         }
821         vm_page_flag_clear(p, PG_CLEANCHK);
822         ma[maxb] = p;
823         for(i = 0; i < maxf; i++) {
824                 int index = (maxb + i) + 1;
825                 ma[index] = maf[i];
826                 vm_page_flag_clear(ma[index], PG_CLEANCHK);
827         }
828         runlen = maxb + maxf + 1;
829
830         vm_pageout_flush(ma, runlen, pagerflags);
831         for (i = 0; i < runlen; i++) {
832                 if (ma[i]->valid & ma[i]->dirty) {
833                         vm_page_protect(ma[i], VM_PROT_READ);
834                         vm_page_flag_set(ma[i], PG_CLEANCHK);
835
836                         /*
837                          * maxf will end up being the actual number of pages
838                          * we wrote out contiguously, non-inclusive of the
839                          * first page.  We do not count look-behind pages.
840                          */
841                         if (i >= maxb + 1 && (maxf > i - maxb - 1))
842                                 maxf = i - maxb - 1;
843                 }
844         }
845         return(maxf + 1);
846 }
847
848 #ifdef not_used
849 /* XXX I cannot tell if this should be an exported symbol */
850 /*
851  *      vm_object_deactivate_pages
852  *
853  *      Deactivate all pages in the specified object.  (Keep its pages
854  *      in memory even though it is no longer referenced.)
855  *
856  *      The object must be locked.
857  */
858 static void
859 vm_object_deactivate_pages(vm_object_t object)
860 {
861         vm_page_t p, next;
862         int s;
863
864         crit_enter();
865         for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) {
866                 next = TAILQ_NEXT(p, listq);
867                 vm_page_deactivate(p);
868         }
869         crit_exit();
870 }
871 #endif
872
873 /*
874  * Same as vm_object_pmap_copy, except range checking really
875  * works, and is meant for small sections of an object.
876  *
877  * This code protects resident pages by making them read-only
878  * and is typically called on a fork or split when a page
879  * is converted to copy-on-write.  
880  *
881  * NOTE: If the page is already at VM_PROT_NONE, calling
882  * vm_page_protect will have no effect.
883  */
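/*
 * Illustrative sketch (editor's addition): a hypothetical copy-on-write
 * split might write-protect the resident pages it is about to share.
 * 'first' and 'npages' are invented names.
 */
#if 0	/* example only, never compiled */
	/* make pages [first, first + npages) read-only in all pmaps */
	vm_object_pmap_copy_1(object, first, first + npages);
#endif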
884 void
885 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
886 {
887         vm_pindex_t idx;
888         vm_page_t p;
889
890         if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
891                 return;
892
893         /*
894          * spl protection needed to prevent races between the lookup,
895          * an interrupt unbusy/free, and our protect call.
896          */
897         crit_enter();
898         for (idx = start; idx < end; idx++) {
899                 p = vm_page_lookup(object, idx);
900                 if (p == NULL)
901                         continue;
902                 vm_page_protect(p, VM_PROT_READ);
903         }
904         crit_exit();
905 }
906
907 /*
908  *      vm_object_pmap_remove:
909  *
910  *      Removes all physical pages in the specified
911  *      object range from all physical maps.
912  *
913  *      The object must *not* be locked.
914  */
915 void
916 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
917 {
918         vm_page_t p;
919
920         if (object == NULL)
921                 return;
922
923         /*
924          * spl protection is required because an interrupt can unbusy/free
925          * a page.
926          */
927         crit_enter();
928         for (p = TAILQ_FIRST(&object->memq);
929             p != NULL;
930             p = TAILQ_NEXT(p, listq)
931         ) {
932                 if (p->pindex >= start && p->pindex < end)
933                         vm_page_protect(p, VM_PROT_NONE);
934         }
935         crit_exit();
936         if ((start == 0) && (object->size == end))
937                 vm_object_clear_flag(object, OBJ_WRITEABLE);
938 }
939
940 /*
941  *      vm_object_madvise:
942  *
943  *      Implements the madvise function at the object/page level.
944  *
945  *      MADV_WILLNEED   (any object)
946  *
947  *          Activate the specified pages if they are resident.
948  *
949  *      MADV_DONTNEED   (any object)
950  *
951  *          Deactivate the specified pages if they are resident.
952  *
953  *      MADV_FREE       (OBJT_DEFAULT/OBJT_SWAP objects,
954  *                       OBJ_ONEMAPPING only)
955  *
956  *          Deactivate and clean the specified pages if they are
957  *          resident.  This permits the process to reuse the pages
958  *          without faulting or the kernel to reclaim the pages
959  *          without I/O.
960  */
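/*
 * Illustrative sketch (editor's addition): how the routine below might be
 * driven for a 16-page run, mirroring the advice values described above.
 * 'obj' and 'pindex' are assumed to come from the caller's map lookup.
 */
#if 0	/* example only, never compiled */
	vm_object_madvise(obj, pindex, 16, MADV_WILLNEED); /* activate       */
	vm_object_madvise(obj, pindex, 16, MADV_DONTNEED); /* deactivate     */
	vm_object_madvise(obj, pindex, 16, MADV_FREE);	   /* mark clean     */
#endif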
961 void
962 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
963 {
964         vm_pindex_t end, tpindex;
965         vm_object_t tobject;
966         vm_page_t m;
967
968         if (object == NULL)
969                 return;
970
971         end = pindex + count;
972
973         /*
974          * Locate and adjust resident pages
975          */
976
977         for (; pindex < end; pindex += 1) {
978 relookup:
979                 tobject = object;
980                 tpindex = pindex;
981 shadowlookup:
982                 /*
983                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
984                  * and those pages must be OBJ_ONEMAPPING.
985                  */
986                 if (advise == MADV_FREE) {
987                         if ((tobject->type != OBJT_DEFAULT &&
988                              tobject->type != OBJT_SWAP) ||
989                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
990                                 continue;
991                         }
992                 }
993
994                 /*
995                  * spl protection is required to avoid a race between the
996                  * lookup, an interrupt unbusy/free, and our busy check.
997                  */
998
999                 crit_enter();
1000                 m = vm_page_lookup(tobject, tpindex);
1001
1002                 if (m == NULL) {
1003                         /*
1004                          * There may be swap even if there is no backing page
1005                          */
1006                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1007                                 swap_pager_freespace(tobject, tpindex, 1);
1008
1009                         /*
1010                          * next object
1011                          */
1012                         crit_exit();
1013                         if (tobject->backing_object == NULL)
1014                                 continue;
1015                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1016                         tobject = tobject->backing_object;
1017                         goto shadowlookup;
1018                 }
1019
1020                 /*
1021                  * If the page is busy or not in a normal active state,
1022                  * we skip it.  If the page is not managed there are no
1023                  * page queues to mess with.  Things can break if we mess
1024                  * with pages in any of the below states.
1025                  */
1026                 if (
1027                     m->hold_count ||
1028                     m->wire_count ||
1029                     (m->flags & PG_UNMANAGED) ||
1030                     m->valid != VM_PAGE_BITS_ALL
1031                 ) {
1032                         crit_exit();
1033                         continue;
1034                 }
1035
1036                 if (vm_page_sleep_busy(m, TRUE, "madvpo")) {
1037                         crit_exit();
1038                         goto relookup;
1039                 }
1040                 crit_exit();
1041
1042                 /*
1043                  * Theoretically once a page is known not to be busy, an
1044                  * interrupt cannot come along and rip it out from under us.
1045                  */
1046
1047                 if (advise == MADV_WILLNEED) {
1048                         vm_page_activate(m);
1049                 } else if (advise == MADV_DONTNEED) {
1050                         vm_page_dontneed(m);
1051                 } else if (advise == MADV_FREE) {
1052                         /*
1053                          * Mark the page clean.  This will allow the page
1054                          * to be freed up by the system.  However, such pages
1055                          * are often reused quickly by malloc()/free()
1056                          * so we do not do anything that would cause
1057                          * a page fault if we can help it.
1058                          *
1059                          * Specifically, we do not try to actually free
1060                          * the page now nor do we try to put it in the
1061                          * cache (which would cause a page fault on reuse).
1062                          *
1063                          * But we do make the page as freeable as we
1064                          * can without actually taking the step of unmapping
1065                          * it.
1066                          */
1067                         pmap_clear_modify(m);
1068                         m->dirty = 0;
1069                         m->act_count = 0;
1070                         vm_page_dontneed(m);
1071                         if (tobject->type == OBJT_SWAP)
1072                                 swap_pager_freespace(tobject, tpindex, 1);
1073                 }
1074         }       
1075 }
1076
1077 /*
1078  *      vm_object_shadow:
1079  *
1080  *      Create a new object which is backed by the
1081  *      specified existing object range.  The source
1082  *      object reference is deallocated.
1083  *
1084  *      The new object and offset into that object
1085  *      are returned in the source parameters.
1086  */
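/*
 * Illustrative sketch (editor's addition): the IN/OUT convention above.
 * 'entry_object', 'entry_offset' and 'length' are invented names; when a
 * new shadow is actually created, the caller's pointer is redirected to
 * it and the offset is folded into backing_object_offset and reset to 0.
 */
#if 0	/* example only, never compiled */
	vm_object_t obj = entry_object;		/* caller holds one reference */
	vm_ooffset_t off = entry_offset;

	vm_object_shadow(&obj, &off, length);
	/* obj may now be a new OBJT_DEFAULT object shadowing entry_object */
#endif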
1087
1088 void
1089 vm_object_shadow(vm_object_t *object,   /* IN/OUT */
1090                  vm_ooffset_t *offset,  /* IN/OUT */
1091                  vm_size_t length)
1092 {
1093         vm_object_t source;
1094         vm_object_t result;
1095
1096         source = *object;
1097
1098         /*
1099          * Don't create the new object if the old object isn't shared.
1100          */
1101
1102         if (source != NULL &&
1103             source->ref_count == 1 &&
1104             source->handle == NULL &&
1105             (source->type == OBJT_DEFAULT ||
1106              source->type == OBJT_SWAP))
1107                 return;
1108
1109         /*
1110          * Allocate a new object with the given length
1111          */
1112
1113         if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
1114                 panic("vm_object_shadow: no object for shadowing");
1115
1116         /*
1117          * The new object shadows the source object, adding a reference to it.
1118          * Our caller changes his reference to point to the new object,
1119          * removing a reference to the source object.  Net result: no change
1120          * of reference count.
1121          *
1122          * Try to optimize the result object's page color when shadowing
1123          * in order to maintain page coloring consistency in the combined 
1124          * shadowed object.
1125          */
1126         result->backing_object = source;
1127         if (source) {
1128                 LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
1129                 source->shadow_count++;
1130                 source->generation++;
1131                 result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & PQ_L2_MASK;
1132         }
1133
1134         /*
1135          * Store the offset into the source object, and fix up the offset into
1136          * the new object.
1137          */
1138
1139         result->backing_object_offset = *offset;
1140
1141         /*
1142          * Return the new things
1143          */
1144
1145         *offset = 0;
1146         *object = result;
1147 }
1148
1149 #define OBSC_TEST_ALL_SHADOWED  0x0001
1150 #define OBSC_COLLAPSE_NOWAIT    0x0002
1151 #define OBSC_COLLAPSE_WAIT      0x0004
1152
1153 static __inline int
1154 vm_object_backing_scan(vm_object_t object, int op)
1155 {
1156         int r = 1;
1157         vm_page_t p;
1158         vm_object_t backing_object;
1159         vm_pindex_t backing_offset_index;
1160
1161         /*
1162          * spl protection is required to avoid races between the memq/lookup,
1163          * an interrupt doing an unbusy/free, and our busy check.  Among
1164          * other things.
1165          */
1166         crit_enter();
1167
1168         backing_object = object->backing_object;
1169         backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1170
1171         /*
1172          * Initial conditions
1173          */
1174
1175         if (op & OBSC_TEST_ALL_SHADOWED) {
1176                 /*
1177                  * We do not want to have to test for the existence of
1178                  * swap pages in the backing object.  XXX but with the
1179                  * new swapper this would be pretty easy to do.
1180                  *
1181                  * XXX what about anonymous MAP_SHARED memory that hasn't
1182                  * been ZFOD faulted yet?  If we do not test for this, the
1183                  * shadow test may succeed! XXX
1184                  */
1185                 if (backing_object->type != OBJT_DEFAULT) {
1186                         crit_exit();
1187                         return(0);
1188                 }
1189         }
1190         if (op & OBSC_COLLAPSE_WAIT) {
1191                 KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
1192                 vm_object_set_flag(backing_object, OBJ_DEAD);
1193         }
1194
1195         /*
1196          * Our scan
1197          */
1198
1199         p = TAILQ_FIRST(&backing_object->memq);
1200         while (p) {
1201                 vm_page_t next = TAILQ_NEXT(p, listq);
1202                 vm_pindex_t new_pindex = p->pindex - backing_offset_index;
1203
1204                 if (op & OBSC_TEST_ALL_SHADOWED) {
1205                         vm_page_t pp;
1206
1207                         /*
1208                          * Ignore pages outside the parent object's range
1209                          * and outside the parent object's mapping of the 
1210                          * backing object.
1211                          *
1212                          * note that we do not busy the backing object's
1213                          * page.
1214                          */
1215
1216                         if (
1217                             p->pindex < backing_offset_index ||
1218                             new_pindex >= object->size
1219                         ) {
1220                                 p = next;
1221                                 continue;
1222                         }
1223
1224                         /*
1225                          * See if the parent has the page or if the parent's
1226                          * object pager has the page.  If the parent has the
1227                          * page but the page is not valid, the parent's
1228                          * object pager must have the page.
1229                          *
1230                          * If this fails, the parent does not completely shadow
1231                          * the object and we might as well give up now.
1232                          */
1233
1234                         pp = vm_page_lookup(object, new_pindex);
1235                         if (
1236                             (pp == NULL || pp->valid == 0) &&
1237                             !vm_pager_has_page(object, new_pindex, NULL, NULL)
1238                         ) {
1239                                 r = 0;
1240                                 break;
1241                         }
1242                 }
1243
1244                 /*
1245                  * Check for busy page
1246                  */
1247
1248                 if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1249                         vm_page_t pp;
1250
1251                         if (op & OBSC_COLLAPSE_NOWAIT) {
1252                                 if (
1253                                     (p->flags & PG_BUSY) ||
1254                                     !p->valid || 
1255                                     p->hold_count || 
1256                                     p->wire_count ||
1257                                     p->busy
1258                                 ) {
1259                                         p = next;
1260                                         continue;
1261                                 }
1262                         } else if (op & OBSC_COLLAPSE_WAIT) {
1263                                 if (vm_page_sleep_busy(p, TRUE, "vmocol")) {
1264                                         /*
1265                                          * If we slept, anything could have
1266                                          * happened.  Since the object is
1267                                          * marked dead, the backing offset
1268                                          * should not have changed so we
1269                                          * just restart our scan.
1270                                          */
1271                                         p = TAILQ_FIRST(&backing_object->memq);
1272                                         continue;
1273                                 }
1274                         }
1275
1276                         /* 
1277                          * Busy the page
1278                          */
1279                         vm_page_busy(p);
1280
1281                         KASSERT(
1282                             p->object == backing_object,
1283                             ("vm_object_qcollapse(): object mismatch")
1284                         );
1285
1286                         /*
1287                          * Destroy any associated swap
1288                          */
1289                         if (backing_object->type == OBJT_SWAP) {
1290                                 swap_pager_freespace(
1291                                     backing_object, 
1292                                     p->pindex,
1293                                     1
1294                                 );
1295                         }
1296
1297                         if (
1298                             p->pindex < backing_offset_index ||
1299                             new_pindex >= object->size
1300                         ) {
1301                                 /*
1302                                  * Page is out of the parent object's range, we 
1303                                  * can simply destroy it. 
1304                                  */
1305                                 vm_page_protect(p, VM_PROT_NONE);
1306                                 vm_page_free(p);
1307                                 p = next;
1308                                 continue;
1309                         }
1310
1311                         pp = vm_page_lookup(object, new_pindex);
1312                         if (
1313                             pp != NULL ||
1314                             vm_pager_has_page(object, new_pindex, NULL, NULL)
1315                         ) {
1316                                 /*
1317                                  * page already exists in parent OR swap exists
1318                                  * for this location in the parent.  Destroy 
1319                                  * the original page from the backing object.
1320                                  *
1321                                  * Leave the parent's page alone
1322                                  */
1323                                 vm_page_protect(p, VM_PROT_NONE);
1324                                 vm_page_free(p);
1325                                 p = next;
1326                                 continue;
1327                         }
1328
1329                         /*
1330                          * Page does not exist in parent, rename the
1331                          * page from the backing object to the main object. 
1332                          *
1333                          * If the page was mapped to a process, it can remain 
1334                          * mapped through the rename.
1335                          */
1336                         if ((p->queue - p->pc) == PQ_CACHE)
1337                                 vm_page_deactivate(p);
1338
1339                         vm_page_rename(p, object, new_pindex);
1340                         /* page automatically made dirty by rename */
1341                 }
1342                 p = next;
1343         }
1344         crit_exit();
1345         return(r);
1346 }
1347
1348
1349 /*
1350  * this version of collapse allows the operation to occur earlier and
1351  * when paging_in_progress is true for an object...  This is not a complete
1352  * operation, but should plug 99.9% of the rest of the leaks.
1353  */
1354 static void
1355 vm_object_qcollapse(vm_object_t object)
1356 {
1357         vm_object_t backing_object = object->backing_object;
1358
1359         if (backing_object->ref_count != 1)
1360                 return;
1361
1362         backing_object->ref_count += 2;
1363
1364         vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
1365
1366         backing_object->ref_count -= 2;
1367 }
1368
1369 /*
1370  *      vm_object_collapse:
1371  *
1372  *      Collapse an object with the object backing it.
1373  *      Pages in the backing object are moved into the
1374  *      parent, and the backing object is deallocated.
1375  */
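/*
 * Editor's sketch (added for clarity): a successful full collapse turns
 *
 *	object -> backing_object -> grandparent
 * into
 *	object -> grandparent
 *
 * with backing_object's pages and swap moved into object and the backing
 * object itself destroyed.  When a full collapse is not possible but the
 * parent shadows every resident page, the bypass case simply drops the
 * backing object out of the chain without moving any pages.
 */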
1376 void
1377 vm_object_collapse(vm_object_t object)
1378 {
1379         while (TRUE) {
1380                 vm_object_t backing_object;
1381
1382                 /*
1383                  * Verify that the conditions are right for collapse:
1384                  *
1385                  * The object exists and the backing object exists.
1386                  */
1387                 if (object == NULL)
1388                         break;
1389
1390                 if ((backing_object = object->backing_object) == NULL)
1391                         break;
1392
1393                 /*
1394                  * we check the backing object first, because it is most likely
1395                  * not collapsible.
1396                  */
1397                 if (backing_object->handle != NULL ||
1398                     (backing_object->type != OBJT_DEFAULT &&
1399                      backing_object->type != OBJT_SWAP) ||
1400                     (backing_object->flags & OBJ_DEAD) ||
1401                     object->handle != NULL ||
1402                     (object->type != OBJT_DEFAULT &&
1403                      object->type != OBJT_SWAP) ||
1404                     (object->flags & OBJ_DEAD)) {
1405                         break;
1406                 }
1407
1408                 if (
1409                     object->paging_in_progress != 0 ||
1410                     backing_object->paging_in_progress != 0
1411                 ) {
1412                         vm_object_qcollapse(object);
1413                         break;
1414                 }
1415
1416                 /*
1417                  * We know that we can either collapse the backing object (if
1418                  * the parent is the only reference to it) or (perhaps) have
1419                  * the parent bypass the object if the parent happens to shadow
1420                  * all the resident pages in the entire backing object.
1421                  *
1422                  * This is ignoring pager-backed pages such as swap pages.
1423                  * vm_object_backing_scan fails the shadowing test in this
1424                  * case.
1425                  */
1426
1427                 if (backing_object->ref_count == 1) {
1428                         /*
1429                          * If there is exactly one reference to the backing
1430                          * object, we can collapse it into the parent.  
1431                          */
1432                         vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
1433
1434                         /*
1435                          * Move the pager from backing_object to object.
1436                          */
1437
1438                         if (backing_object->type == OBJT_SWAP) {
1439                                 vm_object_pip_add(backing_object, 1);
1440
1441                                  * Scrap the paging_offset junk and do a
1442                                  * discrete copy.  This also removes major
1443                                  * assumptions about how the swap pager
1444                                  * works from code where they do not belong.
1445                                  * The new swapper is able to optimize the
1446                                  * destroy-source case.
1447                                  * destroy-source case.
1448                                  */
1449
1450                                 vm_object_pip_add(object, 1);
1451                                 swap_pager_copy(
1452                                     backing_object,
1453                                     object,
1454                                     OFF_TO_IDX(object->backing_object_offset), TRUE);
1455                                 vm_object_pip_wakeup(object);
1456
1457                                 vm_object_pip_wakeup(backing_object);
1458                         }
1459                         /*
1460                          * Object now shadows whatever backing_object did.
1461                          * Note that the reference to 
1462                          * backing_object->backing_object moves from within 
1463                          * backing_object to within object.
1464                          */
1465
1466                         LIST_REMOVE(object, shadow_list);
1467                         object->backing_object->shadow_count--;
1468                         object->backing_object->generation++;
1469                         if (backing_object->backing_object) {
1470                                 LIST_REMOVE(backing_object, shadow_list);
1471                                 backing_object->backing_object->shadow_count--;
1472                                 backing_object->backing_object->generation++;
1473                         }
1474                         object->backing_object = backing_object->backing_object;
1475                         if (object->backing_object) {
1476                                 LIST_INSERT_HEAD(
1477                                     &object->backing_object->shadow_head,
1478                                     object, 
1479                                     shadow_list
1480                                 );
1481                                 object->backing_object->shadow_count++;
1482                                 object->backing_object->generation++;
1483                         }
1484
1485                         object->backing_object_offset +=
1486                             backing_object->backing_object_offset;
1487
1488                         /*
1489                          * Discard backing_object.
1490                          *
1491                          * Since the backing object has no pages, no pager left,
1492                          * and no object references within it, all that is
1493                          * necessary is to dispose of it.
1494                          */
1495
1496                         KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
1497                         KASSERT(TAILQ_FIRST(&backing_object->memq) == NULL, ("backing_object %p somehow has left over pages during collapse!", backing_object));
1498                         crit_enter();
1499                         TAILQ_REMOVE(
1500                             &vm_object_list, 
1501                             backing_object,
1502                             object_list
1503                         );
1504                         vm_object_count--;
1505                         crit_exit();
1506
1507                         zfree(obj_zone, backing_object);
1508
1509                         object_collapses++;
1510                 } else {
1511                         vm_object_t new_backing_object;
1512
1513                         /*
1514                          * If we do not entirely shadow the backing object,
1515                          * there is nothing we can do, so we give up.
1516                          */
1517
1518                         if (vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) {
1519                                 break;
1520                         }
1521
1522                         /*
1523                          * Make the parent shadow the next object in the
1524                          * chain.  Deallocating backing_object will not remove
1525                          * it, since its reference count is at least 2.
1526                          */
1527
1528                         LIST_REMOVE(object, shadow_list);
1529                         backing_object->shadow_count--;
1530                         backing_object->generation++;
1531
1532                         new_backing_object = backing_object->backing_object;
1533                         if ((object->backing_object = new_backing_object) != NULL) {
1534                                 vm_object_reference(new_backing_object);
1535                                 LIST_INSERT_HEAD(
1536                                     &new_backing_object->shadow_head,
1537                                     object,
1538                                     shadow_list
1539                                 );
1540                                 new_backing_object->shadow_count++;
1541                                 new_backing_object->generation++;
1542                                 object->backing_object_offset +=
1543                                         backing_object->backing_object_offset;
1544                         }
1545
1546                         /*
1547                          * Drop the reference count on backing_object.  Since
1548                          * its ref_count was at least 2 it will not vanish; a
1549                          * plain decrement would suffice, but we go through
1550                          * vm_object_deallocate() anyway.
1551                          */
1552                         vm_object_deallocate(backing_object);
1553                         object_bypasses++;
1554                 }
1555
1556                 /*
1557                  * Try again with this object's new backing object.
1558                  */
1559         }
1560 }
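/*
 * Usage note (illustrative): callers generally attempt a collapse when a
 * shadow chain may have become collapsible, e.g. right after dropping a
 * reference to an object in the chain; the routine simply returns unless
 * the conditions checked above are met.
 */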
1561
1562 /*
1563  *      vm_object_page_remove: [internal]
1564  *
1565  *      Removes all physical pages in the specified
1566  *      object range from the object's list of pages.
1567  */
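/*
 * Example (illustrative only): free all resident pages backing page
 * indices [32, 64) of an object, regardless of whether they are dirty:
 *
 *      vm_object_page_remove(object, 32, 64, FALSE);
 *
 * Passing start == 0 and end == 0 removes every resident page, and
 * clean_only == TRUE skips pages that are found to be dirty.
 */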
1568 void
1569 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1570     boolean_t clean_only)
1571 {
1572         vm_page_t p, next;
1573         unsigned int size;
1574         int all;
1575
1576         if (object == NULL || object->resident_page_count == 0)
1577                 return;
1578
1579         all = ((end == 0) && (start == 0));
1580
1581         /*
1582          * Since physically-backed objects do not use managed pages, we can't
1583          * remove pages from the object (we must instead remove the page
1584          * references, and then destroy the object).
1585          */
1586         KASSERT(object->type != OBJT_PHYS, 
1587                 ("attempt to remove pages from a physical object"));
1588
1589         /*
1590          * Indicate that the object is undergoing paging.
1591          *
1592          * spl protection is required to avoid a race between the memq scan,
1593          * an interrupt unbusy/free, and the busy check.
1594          */
1595         vm_object_pip_add(object, 1);
1596         crit_enter();
1597 again:
1598         size = end - start;
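        /*
         * Strategy note: when removing everything, or when the range covers
         * more than a quarter of the resident pages, one walk of the
         * object's memq is cheaper than per-index lookups; otherwise each
         * page index in the range is looked up individually.
         */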
1599         if (all || size > object->resident_page_count / 4) {
1600                 for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) {
1601                         next = TAILQ_NEXT(p, listq);
1602                         if (all || ((start <= p->pindex) && (p->pindex < end))) {
1603                                 if (p->wire_count != 0) {
1604                                         vm_page_protect(p, VM_PROT_NONE);
1605                                         if (!clean_only)
1606                                                 p->valid = 0;
1607                                         continue;
1608                                 }
1609
1610                                 /*
1611                                  * The busy flags are only cleared at
1612                                  * interrupt -- minimize the spl transitions
1613                                  */
1614
1615                                 if (vm_page_sleep_busy(p, TRUE, "vmopar"))
1616                                         goto again;
1617
1618                                 if (clean_only && p->valid) {
1619                                         vm_page_test_dirty(p);
1620                                         if (p->valid & p->dirty)
1621                                                 continue;
1622                                 }
1623
1624                                 vm_page_busy(p);
1625                                 vm_page_protect(p, VM_PROT_NONE);
1626                                 vm_page_free(p);
1627                         }
1628                 }
1629         } else {
1630                 while (size > 0) {
1631                         if ((p = vm_page_lookup(object, start)) != 0) {
1632                                 if (p->wire_count != 0) {
1633                                         vm_page_protect(p, VM_PROT_NONE);
1634                                         if (!clean_only)
1635                                                 p->valid = 0;
1636                                         start += 1;
1637                                         size -= 1;
1638                                         continue;
1639                                 }
1640
1641                                 /*
1642                                  * The busy flags are only cleared at
1643                                  * interrupt -- minimize the spl transitions
1644                                  */
1645                                 if (vm_page_sleep_busy(p, TRUE, "vmopar"))
1646                                         goto again;
1647
1648                                 if (clean_only && p->valid) {
1649                                         vm_page_test_dirty(p);
1650                                         if (p->valid & p->dirty) {
1651                                                 start += 1;
1652                                                 size -= 1;
1653                                                 continue;
1654                                         }
1655                                 }
1656
1657                                 vm_page_busy(p);
1658                                 vm_page_protect(p, VM_PROT_NONE);
1659                                 vm_page_free(p);
1660                         }
1661                         start += 1;
1662                         size -= 1;
1663                 }
1664         }
1665         crit_exit();
1666         vm_object_pip_wakeup(object);
1667 }
1668
1669 /*
1670  *      Routine:        vm_object_coalesce
1671  *      Function:       Coalesces two objects backing up adjoining
1672  *                      regions of memory into a single object.
1673  *
1674  *      returns TRUE if objects were combined.
1675  *
1676  *      NOTE:   Only works at the moment if the second object is NULL -
1677  *              if it's not, which object do we lock first?
1678  *
1679  *      Parameters:
1680  *              prev_object     First object to coalesce
1681  *              prev_pindex     Page index into prev_object
1682  *
1683  *              prev_size       Size of reference to prev_object
1684  *              next_size       Size of reference to next_object
1687  *
1688  *      Conditions:
1689  *      The object must *not* be locked.
1690  */
1691 boolean_t
1692 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1693     vm_size_t prev_size, vm_size_t next_size)
1694 {
1695         vm_pindex_t next_pindex;
1696
1697         if (prev_object == NULL) {
1698                 return (TRUE);
1699         }
1700
1701         if (prev_object->type != OBJT_DEFAULT &&
1702             prev_object->type != OBJT_SWAP) {
1703                 return (FALSE);
1704         }
1705
1706         /*
1707          * Try to collapse the object first
1708          */
1709         vm_object_collapse(prev_object);
1710
1711         /*
1712          * Can't coalesce if: more than one reference, paged out, shadows
1713          * another object, or has a copy elsewhere (any of which mean that
1714          * the pages not mapped to prev_entry may be in use anyway).
1715          */
1716
1717         if (prev_object->backing_object != NULL) {
1718                 return (FALSE);
1719         }
1720
1721         prev_size >>= PAGE_SHIFT;
1722         next_size >>= PAGE_SHIFT;
1723         next_pindex = prev_pindex + prev_size;
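        /*
         * Worked example (illustrative): with 4K pages (PAGE_SHIFT == 12) a
         * prev_size of 0x4000 bytes is 4 pages, so for prev_pindex == 10 the
         * coalesced region would have to begin at page index 14.
         */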
1724
1725         if ((prev_object->ref_count > 1) &&
1726             (prev_object->size != next_pindex)) {
1727                 return (FALSE);
1728         }
1729
1730         /*
1731          * Remove any pages that may still be in the object from a previous
1732          * deallocation.
1733          */
1734         if (next_pindex < prev_object->size) {
1735                 vm_object_page_remove(prev_object,
1736                                       next_pindex,
1737                                       next_pindex + next_size, FALSE);
1738                 if (prev_object->type == OBJT_SWAP)
1739                         swap_pager_freespace(prev_object,
1740                                              next_pindex, next_size);
1741         }
1742
1743         /*
1744          * Extend the object if necessary.
1745          */
1746         if (next_pindex + next_size > prev_object->size)
1747                 prev_object->size = next_pindex + next_size;
1748
1749         return (TRUE);
1750 }
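/*
 * Usage sketch (illustrative; the map-entry field names below are for
 * illustration only): a caller extending an anonymous mapping might first
 * try to grow the previous entry's object instead of allocating a new one:
 *
 *      if (vm_object_coalesce(prev_entry->object.vm_object,
 *                             OFF_TO_IDX(prev_entry->offset),
 *                             prev_entry->end - prev_entry->start,
 *                             grow_amount)) {
 *              ... map the new range against the same object ...
 *      }
 */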
1751
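/*
 * Mark the object as writeable and possibly dirty.  For a vnode-backed
 * object the vnode is also flagged VOBJDIRTY so that the vnode sync code
 * knows the object may contain dirty pages.
 */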
1752 void
1753 vm_object_set_writeable_dirty(vm_object_t object)
1754 {
1755         struct vnode *vp;
1756
1757         vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1758         if (object->type == OBJT_VNODE &&
1759             (vp = (struct vnode *)object->handle) != NULL) {
1760                 if ((vp->v_flag & VOBJDIRTY) == 0) {
1761                         vsetflags(vp, VOBJDIRTY);
1762                 }
1763         }
1764 }
1765
1766
1767
1768 #include "opt_ddb.h"
1769 #ifdef DDB
1770 #include <sys/kernel.h>
1771
1772 #include <sys/cons.h>
1773
1774 #include <ddb/ddb.h>
1775
1776 static int      _vm_object_in_map (vm_map_t map, vm_object_t object,
1777                                        vm_map_entry_t entry);
1778 static int      vm_object_in_map (vm_object_t object);
1779
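/*
 * DDB helper: returns non-zero if 'object' is reachable from the given map
 * entry (or, when entry is NULL, from any entry in 'map'), descending into
 * submaps and following backing-object chains.
 */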
1780 static int
1781 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1782 {
1783         vm_map_t tmpm;
1784         vm_map_entry_t tmpe;
1785         vm_object_t obj;
1786         int entcount;
1787
1788         if (map == 0)
1789                 return 0;
1790
1791         if (entry == 0) {
1792                 tmpe = map->header.next;
1793                 entcount = map->nentries;
1794                 while (entcount-- && (tmpe != &map->header)) {
1795                         if( _vm_object_in_map(map, object, tmpe)) {
1796                                 return 1;
1797                         }
1798                         tmpe = tmpe->next;
1799                 }
1800         } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
1801                 tmpm = entry->object.sub_map;
1802                 tmpe = tmpm->header.next;
1803                 entcount = tmpm->nentries;
1804                 while (entcount-- && tmpe != &tmpm->header) {
1805                         if( _vm_object_in_map(tmpm, object, tmpe)) {
1806                                 return 1;
1807                         }
1808                         tmpe = tmpe->next;
1809                 }
1810         } else if ((obj = entry->object.vm_object) != NULL) {
1811                 for(; obj; obj=obj->backing_object)
1812                         if( obj == object) {
1813                                 return 1;
1814                         }
1815         }
1816         return 0;
1817 }
1818
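/*
 * DDB helper: returns non-zero if the object is mapped by any process
 * vmspace or by the kernel, pager, or buffer maps.
 */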
1819 static int
1820 vm_object_in_map(vm_object_t object)
1821 {
1822         struct proc *p;
1823         for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1824                 if( !p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
1825                         continue;
1826                 if( _vm_object_in_map(&p->p_vmspace->vm_map, object, 0))
1827                         return 1;
1828         }
1829         if( _vm_object_in_map( kernel_map, object, 0))
1830                 return 1;
1831         if( _vm_object_in_map( pager_map, object, 0))
1832                 return 1;
1833         if( _vm_object_in_map( buffer_map, object, 0))
1834                 return 1;
1835         return 0;
1836 }
1837
1838 DB_SHOW_COMMAND(vmochk, vm_object_check)
1839 {
1840         vm_object_t object;
1841
1842         /*
1843          * make sure that internal objs are in a map somewhere
1844          * and none have zero ref counts.
1845          */
1846         for (object = TAILQ_FIRST(&vm_object_list);
1847                         object != NULL;
1848                         object = TAILQ_NEXT(object, object_list)) {
1849                 if (object->handle == NULL &&
1850                     (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
1851                         if (object->ref_count == 0) {
1852                                 db_printf("vmochk: internal obj has zero ref count, size: %ld\n",
1853                                         (long)object->size);
1854                         }
1855                         if (!vm_object_in_map(object)) {
1856                                 db_printf(
1857                         "vmochk: internal obj is not in a map: "
1858                         "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
1859                                     object->ref_count, (u_long)object->size, 
1860                                     (u_long)object->size,
1861                                     (void *)object->backing_object);
1862                         }
1863                 }
1864         }
1865 }
1866
1867 /*
1868  *      vm_object_print:        [ debug ]
1869  */
1870 DB_SHOW_COMMAND(object, vm_object_print_static)
1871 {
1872         /* XXX convert args. */
1873         vm_object_t object = (vm_object_t)addr;
1874         boolean_t full = have_addr;
1875
1876         vm_page_t p;
1877
1878         /* XXX count is an (unused) arg.  Avoid shadowing it. */
1879 #define count   was_count
1880
1881         int count;
1882
1883         if (object == NULL)
1884                 return;
1885
1886         db_iprintf(
1887             "Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
1888             object, (int)object->type, (u_long)object->size,
1889             object->resident_page_count, object->ref_count, object->flags);
1890         /*
1891          * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1892          */
1893         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
1894             object->shadow_count, 
1895             object->backing_object ? object->backing_object->ref_count : 0,
1896             object->backing_object, (long)object->backing_object_offset);
1897
1898         if (!full)
1899                 return;
1900
1901         db_indent += 2;
1902         count = 0;
1903         for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) {
1904                 if (count == 0)
1905                         db_iprintf("memory:=");
1906                 else if (count == 6) {
1907                         db_printf("\n");
1908                         db_iprintf(" ...");
1909                         count = 0;
1910                 } else
1911                         db_printf(",");
1912                 count++;
1913
1914                 db_printf("(off=0x%lx,page=0x%lx)",
1915                     (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1916         }
1917         if (count != 0)
1918                 db_printf("\n");
1919         db_indent -= 2;
1920 }
1921
1922 /* XXX. */
1923 #undef count
1924
1925 /* XXX need this non-static entry for calling from vm_map_print. */
1926 void
1927 vm_object_print(/* db_expr_t */ long addr,
1928                 boolean_t have_addr,
1929                 /* db_expr_t */ long count,
1930                 char *modif)
1931 {
1932         vm_object_print_static(addr, have_addr, count, modif);
1933 }
1934
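/*
 * DDB command: for every VM object, print runs of resident pages whose
 * physical addresses are contiguous (a difference that is a multiple of
 * the PQ_L2 page-coloring stride is folded into the run), pausing after
 * roughly 18 lines of output (space continues, any other key aborts).
 */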
1935 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1936 {
1937         vm_object_t object;
1938         int nl = 0;
1939         int c;
1940         for (object = TAILQ_FIRST(&vm_object_list);
1941                         object != NULL;
1942                         object = TAILQ_NEXT(object, object_list)) {
1943                 vm_pindex_t idx, fidx;
1944                 vm_pindex_t osize;
1945                 vm_paddr_t pa = -1, padiff;
1946                 int rcount;
1947                 vm_page_t m;
1948
1949                 db_printf("new object: %p\n", (void *)object);
1950                 if ( nl > 18) {
1951                         c = cngetc();
1952                         if (c != ' ')
1953                                 return;
1954                         nl = 0;
1955                 }
1956                 nl++;
1957                 rcount = 0;
1958                 fidx = 0;
1959                 osize = object->size;
1960                 if (osize > 128)
1961                         osize = 128;
1962                 for (idx = 0; idx < osize; idx++) {
1963                         m = vm_page_lookup(object, idx);
1964                         if (m == NULL) {
1965                                 if (rcount) {
1966                                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1967                                                 (long)fidx, rcount, (long)pa);
1968                                         if ( nl > 18) {
1969                                                 c = cngetc();
1970                                                 if (c != ' ')
1971                                                         return;
1972                                                 nl = 0;
1973                                         }
1974                                         nl++;
1975                                         rcount = 0;
1976                                 }
1977                                 continue;
1978                         }
1979
1980
1981                         if (rcount &&
1982                                 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1983                                 ++rcount;
1984                                 continue;
1985                         }
1986                         if (rcount) {
1987                                 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1988                                 padiff >>= PAGE_SHIFT;
1989                                 padiff &= PQ_L2_MASK;
1990                                 if (padiff == 0) {
1991                                         pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1992                                         ++rcount;
1993                                         continue;
1994                                 }
1995                                 db_printf(" index(%ld)run(%d)pa(0x%lx)",
1996                                         (long)fidx, rcount, (long)pa);
1997                                 db_printf("pd(%ld)\n", (long)padiff);
1998                                 if ( nl > 18) {
1999                                         c = cngetc();
2000                                         if (c != ' ')
2001                                                 return;
2002                                         nl = 0;
2003                                 }
2004                                 nl++;
2005                         }
2006                         fidx = idx;
2007                         pa = VM_PAGE_TO_PHYS(m);
2008                         rcount = 1;
2009                 }
2010                 if (rcount) {
2011                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2012                                 (long)fidx, rcount, (long)pa);
2013                         if ( nl > 18) {
2014                                 c = cngetc();
2015                                 if (c != ' ')
2016                                         return;
2017                                 nl = 0;
2018                         }
2019                         nl++;
2020                 }
2021         }
2022 }
2023 #endif /* DDB */