Replace the global VM page hash table with a per-VM-object RB tree. No
authorMatthew Dillon <dillon@dragonflybsd.org>
Sat, 2 Dec 2006 23:13:46 +0000 (23:13 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Sat, 2 Dec 2006 23:13:46 +0000 (23:13 +0000)
performance degradation was observed (probably due to locality of reference
in the RB tree improving cache characteristics for searches).  This also
significantly reduces the kernel memory footprint (no global VM page hash
table) and reduces the size of the vm_page structure.  Future MP work
should benefit from this change.

Prior work in the VM tree guarenteed that VM pages only existed in the hash
table while also associated with a VM object, this commit uses that guarentee
to make the VM page lookup structures VM-object-centric.

sys/platform/pc32/i386/pmap.c
sys/vm/vm_object.c
sys/vm/vm_object.h
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_pageout.c

index c0389e4..c75c53e 100644 (file)
@@ -40,7 +40,7 @@
  *
  *     from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
- * $DragonFly: src/sys/platform/pc32/i386/pmap.c,v 1.61 2006/11/07 06:43:24 dillon Exp $
+ * $DragonFly: src/sys/platform/pc32/i386/pmap.c,v 1.62 2006/12/02 23:13:45 dillon Exp $
  */
 
 /*
@@ -1279,7 +1279,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va)
 
 
 /***************************************************
-* Pmap allocation/deallocation routines.
+ * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
@@ -1287,43 +1287,59 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va)
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
+static int pmap_release_callback(struct vm_page *p, void *data);
+
 void
 pmap_release(struct pmap *pmap)
 {
-       vm_page_t p,n,ptdpg;
        vm_object_t object = pmap->pm_pteobj;
-       int curgeneration;
+       struct rb_vm_page_scan_info info;
 
 #if defined(DIAGNOSTIC)
        if (object->ref_count != 1)
                panic("pmap_release: pteobj reference count != 1");
 #endif
        
-       ptdpg = NULL;
-retry:
+       info.pmap = pmap;
+       info.object = object;
        crit_enter();
        TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
-       curgeneration = object->generation;
-       for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) {
-               n = TAILQ_NEXT(p, listq);
-               if (p->pindex == PTDPTDI) {
-                       ptdpg = p;
-                       continue;
-               }
-               if (!pmap_release_free_page(pmap, p)) {
-                       crit_exit();
-                       goto retry;
-               }
-               if (object->generation != curgeneration) {
-                       crit_exit();
-                       goto retry;
+       crit_exit();
+
+       do {
+               crit_enter();
+               info.error = 0;
+               info.mpte = NULL;
+               info.limit = object->generation;
+
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 
+                                       pmap_release_callback, &info);
+               if (info.error == 0 && info.mpte) {
+                       if (!pmap_release_free_page(pmap, info.mpte))
+                               info.error = 1;
                }
-       }
-       if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) {
                crit_exit();
-               goto retry;
+       } while (info.error);
+}
+
+static int
+pmap_release_callback(struct vm_page *p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
+
+       if (p->pindex == PTDPTDI) {
+               info->mpte = p;
+               return(0);
        }
-       crit_exit();
+       if (!pmap_release_free_page(info->pmap, p)) {
+               info->error = 1;
+               return(-1);
+       }
+       if (info->object->generation != info->limit) {
+               info->error = 1;
+               return(-1);
+       }
+       return(0);
 }
 \f
 static int
@@ -2168,15 +2184,15 @@ pmap_kenter_temporary(vm_paddr_t pa, int i)
  * This eliminates the blast of soft faults on process startup and
  * immediately after an mmap.
  */
+static int pmap_object_init_pt_callback(vm_page_t p, void *data);
+
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
                    vm_object_t object, vm_pindex_t pindex, 
                    vm_size_t size, int limit)
 {
-       vm_offset_t tmpidx;
+       struct rb_vm_page_scan_info info;
        int psize;
-       vm_page_t p, mpte;
-       int objpgs;
 
        /*
         * We can't preinit if read access isn't set or there is no pmap
@@ -2191,73 +2207,6 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        if (curproc == NULL || pmap != vmspace_pmap(curproc->p_vmspace))
                return;
 
-#if 0
-       /* 
-        * XXX you must be joking, entering PTE's into a user page table
-        * without any accounting?  This could result in the page table
-        * being freed while it still contains mappings (free with PG_ZERO
-        * assumption leading to a non-zero page being marked PG_ZERO).
-        */
-       /*
-        * This code maps large physical mmap regions into the
-        * processor address space.  Note that some shortcuts
-        * are taken, but the code works.
-        */
-       if (pseflag &&
-           (object->type == OBJT_DEVICE) &&
-           ((addr & (NBPDR - 1)) == 0) &&
-           ((size & (NBPDR - 1)) == 0) ) {
-               int i;
-               vm_page_t m[1];
-               unsigned int ptepindex;
-               int npdes;
-               vm_offset_t ptepa;
-
-               if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
-                       return;
-
-retry:
-               p = vm_page_lookup(object, pindex);
-               if (p && vm_page_sleep_busy(p, FALSE, "init4p"))
-                       goto retry;
-
-               if (p == NULL) {
-                       p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
-                       if (p == NULL)
-                               return;
-                       m[0] = p;
-
-                       if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
-                               vm_page_free(p);
-                               return;
-                       }
-
-                       p = vm_page_lookup(object, pindex);
-                       vm_page_wakeup(p);
-               }
-
-               ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p);
-               if (ptepa & (NBPDR - 1)) {
-                       return;
-               }
-
-               p->valid = VM_PAGE_BITS_ALL;
-
-               pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
-               npdes = size >> PDRSHIFT;
-               for (i = 0; i < npdes; i++) {
-                       pmap->pm_pdir[ptepindex] =
-                           (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS);
-                       ptepa += NBPDR;
-                       ptepindex += 1;
-               }
-               vm_page_flag_set(p, PG_MAPPED);
-               cpu_invltlb();
-               smp_invltlb();
-               return;
-       }
-#endif
-
        psize = i386_btop(size);
 
        if ((object->type != OBJT_VNODE) ||
@@ -2272,80 +2221,56 @@ retry:
                psize = object->size - pindex;
        }
 
+       if (psize == 0)
+               return;
 
        /*
-        * If we are processing a major portion of the object, then scan the
-        * entire thing.
+        * Use a red-black scan to traverse the requested range and load
+        * any valid pages found into the pmap.
         *
         * We cannot safely scan the object's memq unless we are in a
         * critical section since interrupts can remove pages from objects.
         */
-       crit_enter();
-       mpte = NULL;
-       if (psize > (object->resident_page_count >> 2)) {
-               objpgs = psize;
+       info.start_pindex = pindex;
+       info.end_pindex = pindex + psize - 1;
+       info.limit = limit;
+       info.mpte = NULL;
+       info.addr = addr;
+       info.pmap = pmap;
 
-               for (p = TAILQ_FIRST(&object->memq);
-                   objpgs > 0 && p != NULL;
-                   p = TAILQ_NEXT(p, listq)
-               ) {
-                       tmpidx = p->pindex;
-                       if (tmpidx < pindex)
-                               continue;
-                       tmpidx -= pindex;
-                       if (tmpidx >= psize)
-                               continue;
+       crit_enter();
+       vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                               pmap_object_init_pt_callback, &info);
+       crit_exit();
+}
 
-                       /*
-                        * don't allow an madvise to blow away our really
-                        * free pages allocating pv entries.
-                        */
-                       if ((limit & MAP_PREFAULT_MADVISE) &&
-                           vmstats.v_free_count < vmstats.v_free_reserved) {
-                               break;
-                       }
-                       if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
-                               (p->busy == 0) &&
-                           (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-                               if ((p->queue - p->pc) == PQ_CACHE)
-                                       vm_page_deactivate(p);
-                               vm_page_busy(p);
-                               mpte = pmap_enter_quick(pmap, 
-                                       addr + i386_ptob(tmpidx), p, mpte);
-                               vm_page_flag_set(p, PG_MAPPED);
-                               vm_page_wakeup(p);
-                       }
-                       objpgs -= 1;
-               }
-       } else {
-               /*
-                * else lookup the pages one-by-one.
-                */
-               for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
-                       /*
-                        * don't allow an madvise to blow away our really
-                        * free pages allocating pv entries.
-                        */
-                       if ((limit & MAP_PREFAULT_MADVISE) &&
-                           vmstats.v_free_count < vmstats.v_free_reserved) {
-                               break;
-                       }
-                       p = vm_page_lookup(object, tmpidx + pindex);
-                       if (p &&
-                           ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
-                               (p->busy == 0) &&
-                           (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-                               if ((p->queue - p->pc) == PQ_CACHE)
-                                       vm_page_deactivate(p);
-                               vm_page_busy(p);
-                               mpte = pmap_enter_quick(pmap, 
-                                       addr + i386_ptob(tmpidx), p, mpte);
-                               vm_page_flag_set(p, PG_MAPPED);
-                               vm_page_wakeup(p);
-                       }
-               }
+static
+int
+pmap_object_init_pt_callback(vm_page_t p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
+       vm_pindex_t rel_index;
+       /*
+        * don't allow an madvise to blow away our really
+        * free pages allocating pv entries.
+        */
+       if ((info->limit & MAP_PREFAULT_MADVISE) &&
+               vmstats.v_free_count < vmstats.v_free_reserved) {
+                   return(-1);
+       }
+       if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
+           (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
+               if ((p->queue - p->pc) == PQ_CACHE)
+                       vm_page_deactivate(p);
+               vm_page_busy(p);
+               rel_index = p->pindex - info->start_pindex;
+               info->mpte = pmap_enter_quick(info->pmap,
+                                             info->addr + i386_ptob(rel_index),
+                                             p, info->mpte);
+               vm_page_flag_set(p, PG_MAPPED);
+               vm_page_wakeup(p);
        }
-       crit_exit();
+       return(0);
 }
 
 /*
index 53f559a..53cc10d 100644 (file)
@@ -62,7 +62,7 @@
  * rights to redistribute these changes.
  *
  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
- * $DragonFly: src/sys/vm/vm_object.c,v 1.26 2006/09/11 20:25:31 dillon Exp $
+ * $DragonFly: src/sys/vm/vm_object.c,v 1.27 2006/12/02 23:13:46 dillon Exp $
  */
 
 /*
 
 #define EASY_SCAN_FACTOR       8
 
-#define MSYNC_FLUSH_HARDSEQ    0x01
-#define MSYNC_FLUSH_SOFTSEQ    0x02
-
-static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
-SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags,
-        CTLFLAG_RW, &msync_flush_flags, 0, "");
-
-static void    vm_object_qcollapse (vm_object_t object);
-static int     vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
+static void    vm_object_qcollapse(vm_object_t object);
+static int     vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
+                                            int pagerflags);
 
 /*
  *     Virtual memory objects maintain the actual data
@@ -151,7 +145,7 @@ void
 _vm_object_allocate(objtype_t type, vm_size_t size, vm_object_t object)
 {
        int incr;
-       TAILQ_INIT(&object->memq);
+       RB_INIT(&object->rb_memq);
        LIST_INIT(&object->shadow_head);
 
        object->type = type;
@@ -387,11 +381,11 @@ doterm:
  *     The object must be locked.
  *     This routine may block.
  */
+static int vm_object_terminate_callback(vm_page_t p, void *data);
+
 void
 vm_object_terminate(vm_object_t object)
 {
-       vm_page_t p;
-
        /*
         * Make sure no one uses us.
         */
@@ -436,19 +430,8 @@ vm_object_terminate(vm_object_t object)
         * remove them from the object. 
         */
        crit_enter();
-       while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
-               if (p->busy || (p->flags & PG_BUSY))
-                       panic("vm_object_terminate: freeing busy page %p", p);
-               if (p->wire_count == 0) {
-                       vm_page_busy(p);
-                       vm_page_free(p);
-                       mycpu->gd_cnt.v_pfree++;
-               } else {
-                       vm_page_busy(p);
-                       vm_page_remove(p);
-                       vm_page_wakeup(p);
-               }
-       }
+       vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
+                               vm_object_terminate_callback, NULL);
        crit_exit();
 
        /*
@@ -474,6 +457,23 @@ vm_object_terminate(vm_object_t object)
        zfree(obj_zone, object);
 }
 
+static int
+vm_object_terminate_callback(vm_page_t p, void *data __unused)
+{
+       if (p->busy || (p->flags & PG_BUSY))
+               panic("vm_object_terminate: freeing busy page %p", p);
+       if (p->wire_count == 0) {
+               vm_page_busy(p);
+               vm_page_free(p);
+               mycpu->gd_cnt.v_pfree++;
+       } else {
+               vm_page_busy(p);
+               vm_page_remove(p);
+               vm_page_wakeup(p);
+       }
+       return(0);
+}
+
 /*
  *     vm_object_page_clean
  *
@@ -487,16 +487,16 @@ vm_object_terminate(vm_object_t object)
  *
  *     Odd semantics: if start == end, we clean everything.
  */
+static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
+static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
 
 void
 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
-    int flags)
+                    int flags)
 {
-       vm_page_t p, np;
-       vm_offset_t tstart, tend;
-       vm_pindex_t pi;
+       struct rb_vm_page_scan_info info;
        struct vnode *vp;
-       int clearobjflags;
+       int wholescan;
        int pagerflags;
        int curgeneration;
 
@@ -504,199 +504,135 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
                (object->flags & OBJ_MIGHTBEDIRTY) == 0)
                return;
 
-       pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
+       pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 
+                       VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
        pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
 
        vp = object->handle;
 
+       /*
+        * Interlock other major object operations.  This allows us to 
+        * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
+        */
+       crit_enter();
        vm_object_set_flag(object, OBJ_CLEANING);
 
        /*
         * Handle 'entire object' case
         */
-       tstart = start;
+       info.start_pindex = start;
        if (end == 0) {
-               tend = object->size;
+               info.end_pindex = object->size - 1;
        } else {
-               tend = end;
+               info.end_pindex = end - 1;
        }
+       wholescan = (start == 0 && info.end_pindex == object->size - 1);
+       info.limit = flags;
+       info.pagerflags = pagerflags;
+       info.object = object;
 
        /*
-        * If the caller is smart and only msync()s a range he knows is
-        * dirty, we may be able to avoid an object scan.  This results in
-        * a phenominal improvement in performance.  We cannot do this
-        * as a matter of course because the object may be huge - e.g.
-        * the size might be in the gigabytes or terrabytes.
+        * If cleaning the entire object do a pass to mark the pages read-only.
+        * If everything worked out ok, clear OBJ_WRITEABLE and
+        * OBJ_MIGHTBEDIRTY.
         */
-       if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
-               vm_offset_t tscan;
-               int scanlimit;
-               int scanreset;
-
-               scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
-               if (scanreset < 16)
-                       scanreset = 16;
-               pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
-
-               scanlimit = scanreset;
-               tscan = tstart;
-
-               /*
-                * spl protection is required despite the obj generation
-                * tracking because we cannot safely call vm_page_test_dirty()
-                * or avoid page field tests against an interrupt unbusy/free
-                * race that might occur prior to the busy check in
-                * vm_object_page_collect_flush().
-                */
-               crit_enter();
-               while (tscan < tend) {
-                       curgeneration = object->generation;
-                       p = vm_page_lookup(object, tscan);
-                       if (p == NULL || p->valid == 0 ||
-                           (p->queue - p->pc) == PQ_CACHE) {
-                               if (--scanlimit == 0)
-                                       break;
-                               ++tscan;
-                               continue;
-                       }
-                       vm_page_test_dirty(p);
-                       if ((p->dirty & p->valid) == 0) {
-                               if (--scanlimit == 0)
-                                       break;
-                               ++tscan;
-                               continue;
-                       }
-                       /*
-                        * If we have been asked to skip nosync pages and 
-                        * this is a nosync page, we can't continue.
-                        */
-                       if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
-                               if (--scanlimit == 0)
-                                       break;
-                               ++tscan;
-                               continue;
+       if (wholescan) {
+               info.error = 0;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                                       vm_object_page_clean_pass1, &info);
+               if (info.error == 0) {
+                       vm_object_clear_flag(object,
+                                            OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+                       if (object->type == OBJT_VNODE &&
+                           (vp = (struct vnode *)object->handle) != NULL) {
+                               if (vp->v_flag & VOBJDIRTY) 
+                                       vclrflags(vp, VOBJDIRTY);
                        }
-                       scanlimit = scanreset;
-
-                       /*
-                        * This returns 0 if it was unable to busy the first
-                        * page (i.e. had to sleep).
-                        */
-                       tscan += vm_object_page_collect_flush(object, p, 
-                                               curgeneration, pagerflags);
                }
-               crit_exit();
-
-               /*
-                * If everything was dirty and we flushed it successfully,
-                * and the requested range is not the entire object, we
-                * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
-                * return immediately.
-                */
-               if (tscan >= tend && (tstart || tend < object->size)) {
-                       vm_object_clear_flag(object, OBJ_CLEANING);
-                       return;
-               }
-               pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
        }
 
        /*
-        * Generally set CLEANCHK interlock and make the page read-only so
-        * we can then clear the object flags.
-        *
-        * However, if this is a nosync mmap then the object is likely to 
-        * stay dirty so do not mess with the page and do not clear the
-        * object flags.
-        *
-        * spl protection is required because an interrupt can remove page
-        * from the object.
+        * Do a pass to clean all the dirty pages we find.
         */
-       clearobjflags = 1;
+       do {
+               info.error = 0;
+               curgeneration = object->generation;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                                       vm_object_page_clean_pass2, &info);
+       } while (info.error || curgeneration != object->generation);
 
-       crit_enter();
-       for (p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) {
-               vm_page_flag_set(p, PG_CLEANCHK);
-               if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
-                       clearobjflags = 0;
-               else
-                       vm_page_protect(p, VM_PROT_READ);
-       }
+       vm_object_clear_flag(object, OBJ_CLEANING);
        crit_exit();
+}
 
-       if (clearobjflags && (tstart == 0) && (tend == object->size)) {
-               struct vnode *vp;
+static 
+int
+vm_object_page_clean_pass1(struct vm_page *p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
 
-               vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
-                if (object->type == OBJT_VNODE &&
-                    (vp = (struct vnode *)object->handle) != NULL) {
-                        if (vp->v_flag & VOBJDIRTY) 
-                               vclrflags(vp, VOBJDIRTY);
-                }
-       }
+       vm_page_flag_set(p, PG_CLEANCHK);
+       if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
+               info->error = 1;
+       else
+               vm_page_protect(p, VM_PROT_READ);
+       return(0);
+}
+
+static 
+int
+vm_object_page_clean_pass2(struct vm_page *p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
+       int n;
 
        /*
-        * spl protection is required both to avoid an interrupt unbusy/free
-        * race against a vm_page_lookup(), and also to ensure that the
-        * memq is consistent.  We do not want a busy page to be ripped out
-        * from under us.
+        * Do not mess with pages that were inserted after we started
+        * the cleaning pass.
         */
-       crit_enter();
-rescan:
-       crit_exit();
-       crit_enter();
-       curgeneration = object->generation;
-
-       for (p = TAILQ_FIRST(&object->memq); p; p = np) {
-               int n;
+       if ((p->flags & PG_CLEANCHK) == 0)
+               return(0);
 
-               np = TAILQ_NEXT(p, listq);
-
-again:
-               pi = p->pindex;
-               if (((p->flags & PG_CLEANCHK) == 0) ||
-                       (pi < tstart) || (pi >= tend) ||
-                       (p->valid == 0) ||
-                       ((p->queue - p->pc) == PQ_CACHE)) {
-                       vm_page_flag_clear(p, PG_CLEANCHK);
-                       continue;
-               }
-
-               vm_page_test_dirty(p);
-               if ((p->dirty & p->valid) == 0) {
-                       vm_page_flag_clear(p, PG_CLEANCHK);
-                       continue;
-               }
-
-               /*
-                * If we have been asked to skip nosync pages and this is a
-                * nosync page, skip it.  Note that the object flags were
-                * not cleared in this case so we do not have to set them.
-                */
-               if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
-                       vm_page_flag_clear(p, PG_CLEANCHK);
-                       continue;
-               }
+       /*
+        * Before wasting time traversing the pmaps, check for trivial
+        * cases where the page cannot be dirty.
+        */
+       if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
+               KKASSERT((p->dirty & p->valid) == 0);
+               return(0);
+       }
 
-               n = vm_object_page_collect_flush(object, p,
-                       curgeneration, pagerflags);
-               if (n == 0)
-                       goto rescan;
-               if (object->generation != curgeneration)
-                       goto rescan;
+       /*
+        * Check whether the page is dirty or not.  The page has been set
+        * to be read-only so the check will not race a user dirtying the
+        * page.
+        */
+       vm_page_test_dirty(p);
+       if ((p->dirty & p->valid) == 0) {
+               vm_page_flag_clear(p, PG_CLEANCHK);
+               return(0);
+       }
 
-               /*
-                * Try to optimize the next page.  If we can't we pick up
-                * our (random) scan where we left off.
-                */
-               if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
-                       if ((p = vm_page_lookup(object, pi + n)) != NULL)
-                               goto again;
-               }
+       /*
+        * If we have been asked to skip nosync pages and this is a
+        * nosync page, skip it.  Note that the object flags were
+        * not cleared in this case (because pass1 will have returned an
+        * error), so we do not have to set them.
+        */
+       if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+               vm_page_flag_clear(p, PG_CLEANCHK);
+               return(0);
        }
-       crit_exit();
 
-       vm_object_clear_flag(object, OBJ_CLEANING);
-       return;
+       /*
+        * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
+        * the pages that get successfully flushed.  Set info->error if
+        * we raced an object modification.
+        */
+       n = vm_object_page_collect_flush(info->object, p, info->pagerflags);
+       if (n == 0)
+               info->error = 1;
+       return(0);
 }
 
 /*
@@ -711,24 +647,28 @@ again:
  * recode this to explicitly busy the pages.
  */
 static int
-vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
+vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
 {
        int runlen;
        int maxf;
        int chkb;
        int maxb;
        int i;
+       int curgeneration;
        vm_pindex_t pi;
        vm_page_t maf[vm_pageout_page_count];
        vm_page_t mab[vm_pageout_page_count];
        vm_page_t ma[vm_pageout_page_count];
 
+       curgeneration = object->generation;
+
        pi = p->pindex;
        while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
                if (object->generation != curgeneration) {
                        return(0);
                }
        }
+       KKASSERT(p->object == object && p->pindex == pi);
 
        maxf = 0;
        for(i = 1; i < vm_pageout_page_count; i++) {
@@ -827,18 +767,24 @@ vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration,
  *
  *     The object must be locked.
  */
+static int vm_object_deactivate_pages_callback(vm_page_t p, void *data);
+
 static void
 vm_object_deactivate_pages(vm_object_t object)
 {
-       vm_page_t p, next;
-
        crit_enter();
-       for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) {
-               next = TAILQ_NEXT(p, listq);
-               vm_page_deactivate(p);
-       }
+       vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
+                               vm_object_deactivate_pages_callback, NULL);
        crit_exit();
 }
+
+static int
+vm_object_deactivate_pages_callback(vm_page_t p, void *data __unused)
+{
+       vm_page_deactivate(p);
+       return(0);
+}
+
 #endif
 
 /*
@@ -883,29 +829,31 @@ vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
  *
  *     The object must *not* be locked.
  */
+
+static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
+
 void
 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-       vm_page_t p;
+       struct rb_vm_page_scan_info info;
 
        if (object == NULL)
                return;
-
-       /*
-        * spl protection is required because an interrupt can unbusy/free
-        * a page.
-        */
+       info.start_pindex = start;
+       info.end_pindex = end - 1;
        crit_enter();
-       for (p = TAILQ_FIRST(&object->memq);
-           p != NULL;
-           p = TAILQ_NEXT(p, listq)
-       ) {
-               if (p->pindex >= start && p->pindex < end)
-                       vm_page_protect(p, VM_PROT_NONE);
-       }
-       crit_exit();
-       if ((start == 0) && (object->size == end))
+       vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                               vm_object_pmap_remove_callback, &info);
+       if (start == 0 && end == object->size)
                vm_object_clear_flag(object, OBJ_WRITEABLE);
+       crit_exit();
+}
+
+static int
+vm_object_pmap_remove_callback(vm_page_t p, void *data __unused)
+{
+       vm_page_protect(p, VM_PROT_NONE);
+       return(0);
 }
 
 /*
@@ -1121,13 +1069,13 @@ vm_object_shadow(vm_object_t *object,   /* IN/OUT */
 #define        OBSC_COLLAPSE_NOWAIT    0x0002
 #define        OBSC_COLLAPSE_WAIT      0x0004
 
+static int vm_object_backing_scan_callback(vm_page_t p, void *data);
+
 static __inline int
 vm_object_backing_scan(vm_object_t object, int op)
 {
-       int r = 1;
-       vm_page_t p;
+       struct rb_vm_page_scan_info info;
        vm_object_t backing_object;
-       vm_pindex_t backing_offset_index;
 
        /*
         * spl protection is required to avoid races between the memq/lookup,
@@ -1137,7 +1085,7 @@ vm_object_backing_scan(vm_object_t object, int op)
        crit_enter();
 
        backing_object = object->backing_object;
-       backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+       info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
        /*
         * Initial conditions
@@ -1164,159 +1112,176 @@ vm_object_backing_scan(vm_object_t object, int op)
        }
 
        /*
-        * Our scan
+        * Our scan.   We have to retry if a negative error code is returned,
+        * otherwise 0 or 1 will be returned in info.error.  0 Indicates that
+        * the scan had to be stopped because the parent does not completely
+        * shadow the child.
         */
+       info.object = object;
+       info.backing_object = backing_object;
+       info.limit = op;
+       do {
+               info.error = 1;
+               vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
+                                       vm_object_backing_scan_callback,
+                                       &info);
+       } while (info.error < 0);
+       crit_exit();
+       return(info.error);
+}
 
-       p = TAILQ_FIRST(&backing_object->memq);
-       while (p) {
-               vm_page_t next = TAILQ_NEXT(p, listq);
-               vm_pindex_t new_pindex = p->pindex - backing_offset_index;
-
-               if (op & OBSC_TEST_ALL_SHADOWED) {
-                       vm_page_t pp;
-
-                       /*
-                        * Ignore pages outside the parent object's range
-                        * and outside the parent object's mapping of the 
-                        * backing object.
-                        *
-                        * note that we do not busy the backing object's
-                        * page.
-                        */
+static int
+vm_object_backing_scan_callback(vm_page_t p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
+       vm_object_t backing_object;
+       vm_object_t object;
+       vm_pindex_t new_pindex;
+       vm_pindex_t backing_offset_index;
+       int op;
 
-                       if (
-                           p->pindex < backing_offset_index ||
-                           new_pindex >= object->size
-                       ) {
-                               p = next;
-                               continue;
-                       }
+       new_pindex = p->pindex - info->backing_offset_index;
+       op = info->limit;
+       object = info->object;
+       backing_object = info->backing_object;
+       backing_offset_index = info->backing_offset_index;
 
-                       /*
-                        * See if the parent has the page or if the parent's
-                        * object pager has the page.  If the parent has the
-                        * page but the page is not valid, the parent's
-                        * object pager must have the page.
-                        *
-                        * If this fails, the parent does not completely shadow
-                        * the object and we might as well give up now.
-                        */
+       if (op & OBSC_TEST_ALL_SHADOWED) {
+               vm_page_t pp;
 
-                       pp = vm_page_lookup(object, new_pindex);
-                       if (
-                           (pp == NULL || pp->valid == 0) &&
-                           !vm_pager_has_page(object, new_pindex, NULL, NULL)
-                       ) {
-                               r = 0;
-                               break;
-                       }
+               /*
+                * Ignore pages outside the parent object's range
+                * and outside the parent object's mapping of the 
+                * backing object.
+                *
+                * note that we do not busy the backing object's
+                * page.
+                */
+               if (
+                   p->pindex < backing_offset_index ||
+                   new_pindex >= object->size
+               ) {
+                       return(0);
                }
 
                /*
-                * Check for busy page
+                * See if the parent has the page or if the parent's
+                * object pager has the page.  If the parent has the
+                * page but the page is not valid, the parent's
+                * object pager must have the page.
+                *
+                * If this fails, the parent does not completely shadow
+                * the object and we might as well give up now.
                 */
 
-               if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
-                       vm_page_t pp;
-
-                       if (op & OBSC_COLLAPSE_NOWAIT) {
-                               if (
-                                   (p->flags & PG_BUSY) ||
-                                   !p->valid || 
-                                   p->hold_count || 
-                                   p->wire_count ||
-                                   p->busy
-                               ) {
-                                       p = next;
-                                       continue;
-                               }
-                       } else if (op & OBSC_COLLAPSE_WAIT) {
-                               if (vm_page_sleep_busy(p, TRUE, "vmocol")) {
-                                       /*
-                                        * If we slept, anything could have
-                                        * happened.  Since the object is
-                                        * marked dead, the backing offset
-                                        * should not have changed so we
-                                        * just restart our scan.
-                                        */
-                                       p = TAILQ_FIRST(&backing_object->memq);
-                                       continue;
-                               }
-                       }
-
-                       /* 
-                        * Busy the page
-                        */
-                       vm_page_busy(p);
+               pp = vm_page_lookup(object, new_pindex);
+               if (
+                   (pp == NULL || pp->valid == 0) &&
+                   !vm_pager_has_page(object, new_pindex, NULL, NULL)
+               ) {
+                       info->error = 0;        /* problemo */
+                       return(-1);             /* stop the scan */
+               }
+       }
 
-                       KASSERT(
-                           p->object == backing_object,
-                           ("vm_object_qcollapse(): object mismatch")
-                       );
+       /*
+        * Check for busy page
+        */
 
-                       /*
-                        * Destroy any associated swap
-                        */
-                       if (backing_object->type == OBJT_SWAP) {
-                               swap_pager_freespace(
-                                   backing_object, 
-                                   p->pindex,
-                                   1
-                               );
-                       }
+       if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
+               vm_page_t pp;
 
+               if (op & OBSC_COLLAPSE_NOWAIT) {
                        if (
-                           p->pindex < backing_offset_index ||
-                           new_pindex >= object->size
+                           (p->flags & PG_BUSY) ||
+                           !p->valid || 
+                           p->hold_count || 
+                           p->wire_count ||
+                           p->busy
                        ) {
-                               /*
-                                * Page is out of the parent object's range, we 
-                                * can simply destroy it. 
-                                */
-                               vm_page_protect(p, VM_PROT_NONE);
-                               vm_page_free(p);
-                               p = next;
-                               continue;
+                               return(0);
                        }
-
-                       pp = vm_page_lookup(object, new_pindex);
-                       if (
-                           pp != NULL ||
-                           vm_pager_has_page(object, new_pindex, NULL, NULL)
-                       ) {
+               } else if (op & OBSC_COLLAPSE_WAIT) {
+                       if (vm_page_sleep_busy(p, TRUE, "vmocol")) {
                                /*
-                                * page already exists in parent OR swap exists
-                                * for this location in the parent.  Destroy 
-                                * the original page from the backing object.
+                                * If we slept, anything could have
+                                * happened.   Ask that the scan be restarted.
                                 *
-                                * Leave the parent's page alone
+                                * Since the object is marked dead, the
+                                * backing offset should not have changed.  
                                 */
-                               vm_page_protect(p, VM_PROT_NONE);
-                               vm_page_free(p);
-                               p = next;
-                               continue;
+                               info->error = -1;
+                               return(-1);
                        }
+               }
+
+               /* 
+                * Busy the page
+                */
+               vm_page_busy(p);
 
+               KASSERT(
+                   p->object == backing_object,
+                   ("vm_object_qcollapse(): object mismatch")
+               );
+
+               /*
+                * Destroy any associated swap
+                */
+               if (backing_object->type == OBJT_SWAP) {
+                       swap_pager_freespace(
+                           backing_object, 
+                           p->pindex,
+                           1
+                       );
+               }
+
+               if (
+                   p->pindex < backing_offset_index ||
+                   new_pindex >= object->size
+               ) {
                        /*
-                        * Page does not exist in parent, rename the
-                        * page from the backing object to the main object. 
-                        *
-                        * If the page was mapped to a process, it can remain 
-                        * mapped through the rename.
+                        * Page is out of the parent object's range, we 
+                        * can simply destroy it. 
                         */
-                       if ((p->queue - p->pc) == PQ_CACHE)
-                               vm_page_deactivate(p);
+                       vm_page_protect(p, VM_PROT_NONE);
+                       vm_page_free(p);
+                       return(0);
+               }
 
-                       vm_page_rename(p, object, new_pindex);
-                       /* page automatically made dirty by rename */
+               pp = vm_page_lookup(object, new_pindex);
+               if (
+                   pp != NULL ||
+                   vm_pager_has_page(object, new_pindex, NULL, NULL)
+               ) {
+                       /*
+                        * page already exists in parent OR swap exists
+                        * for this location in the parent.  Destroy 
+                        * the original page from the backing object.
+                        *
+                        * Leave the parent's page alone
+                        */
+                       vm_page_protect(p, VM_PROT_NONE);
+                       vm_page_free(p);
+                       return(0);
                }
-               p = next;
+
+               /*
+                * Page does not exist in parent, rename the
+                * page from the backing object to the main object. 
+                *
+                * If the page was mapped to a process, it can remain 
+                * mapped through the rename.
+                */
+               if ((p->queue - p->pc) == PQ_CACHE)
+                       vm_page_deactivate(p);
+
+               vm_page_rename(p, object, new_pindex);
+               /* page automatically made dirty by rename */
        }
-       crit_exit();
-       return(r);
+       return(0);
 }
 
-
 /*
  * this version of collapse allows the operation to occur earlier and
  * when paging_in_progress is true for an object...  This is not a complete
@@ -1465,7 +1430,7 @@ vm_object_collapse(vm_object_t object)
                         */
 
                        KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
-                       KASSERT(TAILQ_FIRST(&backing_object->memq) == NULL, ("backing_object %p somehow has left over pages during collapse!", backing_object));
+                       KASSERT(RB_EMPTY(&backing_object->rb_memq), ("backing_object %p somehow has left over pages during collapse!", backing_object));
                        crit_enter();
                        TAILQ_REMOVE(
                            &vm_object_list, 
@@ -1536,105 +1501,102 @@ vm_object_collapse(vm_object_t object)
  *     Removes all physical pages in the specified
  *     object range from the object's list of pages.
  */
+static int vm_object_page_remove_callback(vm_page_t p, void *data);
+
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
-    boolean_t clean_only)
+                     boolean_t clean_only)
 {
-       vm_page_t p, next;
-       unsigned int size;
+       struct rb_vm_page_scan_info info;
        int all;
 
+       /*
+        * Degenerate cases and assertions
+        */
        if (object == NULL || object->resident_page_count == 0)
                return;
+       KASSERT(object->type != OBJT_PHYS, 
+               ("attempt to remove pages from a physical object"));
 
-       all = ((end == 0) && (start == 0));
+       /*
+        * Indicate that paging is occuring on the object
+        */
+       crit_enter();
+       vm_object_pip_add(object, 1);
 
        /*
-        * Since physically-backed objects do not use managed pages, we can't
-        * remove pages from the object (we must instead remove the page
-        * references, and then destroy the object).
+        * Figure out the actual removal range and whether we are removing
+        * the entire contents of the object or not.  If removing the entire
+        * contents, be sure to get all pages, even those that might be 
+        * beyond the end of the object.
         */
-       KASSERT(object->type != OBJT_PHYS, 
-               ("attempt to remove pages from a physical object"));
+       info.start_pindex = start;
+       if (end == 0)
+               info.end_pindex = (vm_pindex_t)-1;
+       else
+               info.end_pindex = end - 1;
+       info.limit = clean_only;
+       all = (start == 0 && info.end_pindex >= object->size - 1);
 
        /*
-        * Indicating that the object is undergoing paging.
-        *
-        * spl protection is required to avoid a race between the memq scan,
-        * an interrupt unbusy/free, and the busy check.
+        * Loop until we are sure we have gotten them all.
         */
-       vm_object_pip_add(object, 1);
-       crit_enter();
-again:
-       size = end - start;
-       if (all || size > object->resident_page_count / 4) {
-               for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) {
-                       next = TAILQ_NEXT(p, listq);
-                       if (all || ((start <= p->pindex) && (p->pindex < end))) {
-                               if (p->wire_count != 0) {
-                                       vm_page_protect(p, VM_PROT_NONE);
-                                       if (!clean_only)
-                                               p->valid = 0;
-                                       continue;
-                               }
+       do {
+               info.error = 0;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                                       vm_object_page_remove_callback, &info);
+       } while (info.error);
 
-                               /*
-                                * The busy flags are only cleared at
-                                * interrupt -- minimize the spl transitions
-                                */
+       /*
+        * Cleanup
+        */
+       vm_object_pip_wakeup(object);
+       crit_exit();
+}
 
-                               if (vm_page_sleep_busy(p, TRUE, "vmopar"))
-                                       goto again;
+static int
+vm_object_page_remove_callback(vm_page_t p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
 
-                               if (clean_only && p->valid) {
-                                       vm_page_test_dirty(p);
-                                       if (p->valid & p->dirty)
-                                               continue;
-                               }
+       /*
+        * Wired pages cannot be destroyed, but they can be invalidated
+        * and we do so if clean_only (limit) is not set.
+        */
+       if (p->wire_count != 0) {
+               vm_page_protect(p, VM_PROT_NONE);
+               if (info->limit == 0)
+                       p->valid = 0;
+               return(0);
+       }
 
-                               vm_page_busy(p);
-                               vm_page_protect(p, VM_PROT_NONE);
-                               vm_page_free(p);
-                       }
-               }
-       } else {
-               while (size > 0) {
-                       if ((p = vm_page_lookup(object, start)) != 0) {
-                               if (p->wire_count != 0) {
-                                       vm_page_protect(p, VM_PROT_NONE);
-                                       if (!clean_only)
-                                               p->valid = 0;
-                                       start += 1;
-                                       size -= 1;
-                                       continue;
-                               }
+       /*
+        * The busy flags are only cleared at
+        * interrupt -- minimize the spl transitions
+        */
 
-                               /*
-                                * The busy flags are only cleared at
-                                * interrupt -- minimize the spl transitions
-                                */
-                               if (vm_page_sleep_busy(p, TRUE, "vmopar"))
-                                       goto again;
-
-                               if (clean_only && p->valid) {
-                                       vm_page_test_dirty(p);
-                                       if (p->valid & p->dirty) {
-                                               start += 1;
-                                               size -= 1;
-                                               continue;
-                                       }
-                               }
+       if (vm_page_sleep_busy(p, TRUE, "vmopar")) {
+               info->error = 1;
+               return(0);
+       }
 
-                               vm_page_busy(p);
-                               vm_page_protect(p, VM_PROT_NONE);
-                               vm_page_free(p);
-                       }
-                       start += 1;
-                       size -= 1;
-               }
+       /*
+        * limit is our clean_only flag.  If set and the page is dirty, do
+        * not free it.
+        */
+       if (info->limit && p->valid) {
+               vm_page_test_dirty(p);
+               if (p->valid & p->dirty)
+                       return(0);
        }
-       crit_exit();
-       vm_object_pip_wakeup(object);
+
+       /*
+        * Destroy the page
+        */
+       vm_page_busy(p);
+       vm_page_protect(p, VM_PROT_NONE);
+       vm_page_free(p);
+       return(0);
 }
 
 /*
@@ -1902,7 +1864,7 @@ DB_SHOW_COMMAND(object, vm_object_print_static)
 
        db_indent += 2;
        count = 0;
-       for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) {
+       RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
                if (count == 0)
                        db_iprintf("memory:=");
                else if (count == 6) {
index 9da954b..e9bae2b 100644 (file)
@@ -62,7 +62,7 @@
  * rights to redistribute these changes.
  *
  * $FreeBSD: src/sys/vm/vm_object.h,v 1.63.2.3 2003/05/26 19:17:56 alc Exp $
- * $DragonFly: src/sys/vm/vm_object.h,v 1.10 2006/05/20 02:42:15 dillon Exp $
+ * $DragonFly: src/sys/vm/vm_object.h,v 1.11 2006/12/02 23:13:46 dillon Exp $
  */
 
 /*
@@ -87,6 +87,9 @@
 #ifndef _VM_VM_H_
 #include <vm/vm.h>
 #endif
+#ifndef _VM_VM_PAGE_H_
+#include <vm/vm_page.h>
+#endif
 
 #ifdef _KERNEL
 
@@ -129,7 +132,7 @@ struct vm_object {
        TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
        LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
        LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
-       TAILQ_HEAD(, vm_page) memq;     /* list of resident pages */
+       RB_HEAD(vm_page_rb_tree, vm_page) rb_memq;      /* resident pages */
        int generation;                 /* generation ID */
        vm_size_t size;                 /* Object size */
        int ref_count;                  /* How many refs?? */
index bdedcc4..33ff59e 100644 (file)
@@ -35,7 +35,7 @@
  *
  *     from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
- * $DragonFly: src/sys/vm/vm_page.c,v 1.32 2005/07/27 07:55:15 dillon Exp $
+ * $DragonFly: src/sys/vm/vm_page.c,v 1.33 2006/12/02 23:13:46 dillon Exp $
  */
 
 /*
@@ -94,14 +94,13 @@ static void vm_page_free_wakeup(void);
 static vm_page_t vm_page_select_cache(vm_object_t, vm_pindex_t);
 static vm_page_t _vm_page_list_find2(int basequeue, int index);
 
-static int vm_page_bucket_count;       /* How big is array? */
-static int vm_page_hash_mask;          /* Mask for hash function */
-static struct vm_page **vm_page_buckets; /* Array of buckets */
-static volatile int vm_page_bucket_generation;
 struct vpgqueues vm_page_queues[PQ_COUNT]; /* Array of tailq lists */
 
 #define ASSERT_IN_CRIT_SECTION()       KKASSERT(crit_test(curthread));
 
+RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
+            vm_pindex_t, pindex);
+
 static void
 vm_page_queue_init(void) 
 {
@@ -198,7 +197,6 @@ vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
        vm_offset_t mapped;
-       struct vm_page **bucket;
        vm_size_t npages;
        vm_paddr_t page_range;
        vm_paddr_t new_end;
@@ -208,7 +206,6 @@ vm_page_startup(vm_offset_t vaddr)
        vm_paddr_t last_pa;
        vm_paddr_t end;
        vm_paddr_t biggestone, biggestsize;
-
        vm_paddr_t total;
 
        total = 0;
@@ -234,6 +231,7 @@ vm_page_startup(vm_offset_t vaddr)
        }
 
        end = phys_avail[biggestone+1];
+       end = trunc_page(end);
 
        /*
         * Initialize the queue headers for the free queue, the active queue
@@ -242,48 +240,6 @@ vm_page_startup(vm_offset_t vaddr)
 
        vm_page_queue_init();
 
-       /*
-        * Allocate (and initialize) the hash table buckets.
-        *
-        * The number of buckets MUST BE a power of 2, and the actual value is
-        * the next power of 2 greater than the number of physical pages in
-        * the system.  
-        *
-        * We make the hash table approximately 2x the number of pages to
-        * reduce the chain length.  This is about the same size using the 
-        * singly-linked list as the 1x hash table we were using before 
-        * using TAILQ but the chain length will be smaller.
-        *
-        * Note: This computation can be tweaked if desired.
-        */
-       vm_page_buckets = (struct vm_page **)vaddr;
-       bucket = vm_page_buckets;
-       if (vm_page_bucket_count == 0) {
-               vm_page_bucket_count = 1;
-               while (vm_page_bucket_count < atop(total))
-                       vm_page_bucket_count <<= 1;
-       }
-       vm_page_bucket_count <<= 1;
-       vm_page_hash_mask = vm_page_bucket_count - 1;
-
-       /*
-        * Cut a chunk out of the largest block of physical memory,
-        * moving its end point down to accomodate the hash table and
-        * vm_page_array.
-        */
-       new_end = end - vm_page_bucket_count * sizeof(struct vm_page *);
-       new_end = trunc_page(new_end);
-       mapped = round_page(vaddr);
-       vaddr = pmap_map(mapped, new_end, end,
-           VM_PROT_READ | VM_PROT_WRITE);
-       vaddr = round_page(vaddr);
-       bzero((caddr_t) mapped, vaddr - mapped);
-
-       for (i = 0; i < vm_page_bucket_count; i++) {
-               *bucket = NULL;
-               bucket++;
-       }
-
        /*
         * Compute the number of pages of memory that will be available for
         * use (taking into account the overhead of a page structure per
@@ -291,10 +247,7 @@ vm_page_startup(vm_offset_t vaddr)
         */
        first_page = phys_avail[0] / PAGE_SIZE;
        page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
-       npages = (total - (page_range * sizeof(struct vm_page)) -
-           (end - new_end)) / PAGE_SIZE;
-
-       end = new_end;
+       npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
 
        /*
         * Initialize the mem entry structures now, and put them in the free
@@ -339,20 +292,29 @@ vm_page_startup(vm_offset_t vaddr)
 }
 
 /*
- * Distributes the object/offset key pair among hash buckets.
- *
- * NOTE:  This macro depends on vm_page_bucket_count being a power of 2.
- * This routine may not block.
- *
- * We try to randomize the hash based on the object to spread the pages
- * out in the hash table without it costing us too much.
+ * Scan comparison function for Red-Black tree scans.  An inclusive
+ * (start,end) is expected.  Other fields are not used.
  */
-static __inline int
-vm_page_hash(vm_object_t object, vm_pindex_t pindex)
+int
+rb_vm_page_scancmp(struct vm_page *p, void *data)
 {
-       int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
+       struct rb_vm_page_scan_info *info = data;
 
-       return(i & vm_page_hash_mask);
+       if (p->pindex < info->start_pindex)
+               return(-1);
+       if (p->pindex > info->end_pindex)
+               return(1);
+       return(0);
+}
+
+int
+rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
+{
+       if (p1->pindex < p2->pindex)
+               return(-1);
+       if (p1->pindex > p2->pindex)
+               return(1);
+       return(0);
 }
 
 /*
@@ -387,8 +349,6 @@ vm_page_unhold(vm_page_t mem)
 void
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
-       struct vm_page **bucket;
-
        ASSERT_IN_CRIT_SECTION();
        if (m->object != NULL)
                panic("vm_page_insert: already inserted");
@@ -400,17 +360,9 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
        m->pindex = pindex;
 
        /*
-        * Insert it into the object_object/offset hash table
-        */
-       bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
-       m->hnext = *bucket;
-       *bucket = m;
-       vm_page_bucket_generation++;
-
-       /*
-        * Now link into the object's list of backed pages.
+        * Insert it into the object.
         */
-       TAILQ_INSERT_TAIL(&object->memq, m, listq);
+       vm_page_rb_tree_RB_INSERT(&object->rb_memq, m);
        object->generation++;
 
        /*
@@ -443,7 +395,6 @@ void
 vm_page_remove(vm_page_t m)
 {
        vm_object_t object;
-       struct vm_page **bucket;
 
        crit_enter();
        if (m->object == NULL) {
@@ -457,34 +408,13 @@ vm_page_remove(vm_page_t m)
        object = m->object;
 
        /*
-        * Remove from the object_object/offset hash table.  The object
-        * must be on the hash queue, we will panic if it isn't
-        *
-        * Note: we must NULL-out m->hnext to prevent loops in detached
-        * buffers with vm_page_lookup().
-        */
-       bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
-       while (*bucket != m) {
-               if (*bucket == NULL)
-                   panic("vm_page_remove(): page not found in hash");
-               bucket = &(*bucket)->hnext;
-       }
-       *bucket = m->hnext;
-       m->hnext = NULL;
-       vm_page_bucket_generation++;
-
-       /*
-        * Now remove from the object's list of backed pages.
-        */
-       TAILQ_REMOVE(&object->memq, m, listq);
-
-       /*
-        * And show that the object has one fewer resident page.
+        * Remove the page from the object and update the object.
         */
+       vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
        object->resident_page_count--;
        object->generation++;
-
        m->object = NULL;
+
        crit_exit();
 }
 
@@ -507,25 +437,15 @@ vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
        vm_page_t m;
-       struct vm_page **bucket;
-       int generation;
 
        /*
         * Search the hash table for this object/offset pair
         */
-retry:
-       generation = vm_page_bucket_generation;
-       bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
-       for (m = *bucket; m != NULL; m = m->hnext) {
-               if ((m->object == object) && (m->pindex == pindex)) {
-                       if (vm_page_bucket_generation != generation)
-                               goto retry;
-                       return (m);
-               }
-       }
-       if (vm_page_bucket_generation != generation)
-               goto retry;
-       return (NULL);
+       crit_enter();
+       m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
+       crit_exit();
+       KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
+       return(m);
 }
 
 /*
index 83cf5f1..d7cad88 100644 (file)
@@ -62,7 +62,7 @@
  * rights to redistribute these changes.
  *
  * $FreeBSD: src/sys/vm/vm_page.h,v 1.75.2.8 2002/03/06 01:07:09 dillon Exp $
- * $DragonFly: src/sys/vm/vm_page.h,v 1.24 2006/05/21 03:43:47 dillon Exp $
+ * $DragonFly: src/sys/vm/vm_page.h,v 1.25 2006/12/02 23:13:46 dillon Exp $
  */
 
 /*
@@ -79,6 +79,9 @@
 #ifndef _SYS_TYPES_H_
 #include <sys/types.h>
 #endif
+#ifndef _SYS_TREE_H_
+#include <sys/tree.h>
+#endif
 #ifndef _MACHINE_PMAP_H_
 #include <machine/pmap.h>
 #endif
@@ -134,10 +137,14 @@ TAILQ_HEAD(pglist, vm_page);
 struct msf_buf;
 struct vm_object;
 
+int rb_vm_page_compare(struct vm_page *, struct vm_page *);
+
+struct vm_page_rb_tree;
+RB_PROTOTYPE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare, vm_pindex_t);
+
 struct vm_page {
        TAILQ_ENTRY(vm_page) pageq;     /* vm_page_queues[] list (P)    */
-       struct vm_page  *hnext;         /* hash table link (O,P)        */
-       TAILQ_ENTRY(vm_page) listq;     /* pages in same object (O)     */
+       RB_ENTRY(vm_page) rb_entry;     /* Red-Black tree based at object */
 
        struct vm_object *object;       /* which object am I in (O,P)*/
        vm_pindex_t pindex;             /* offset into object (O,P) */
@@ -244,6 +251,29 @@ typedef struct vm_page *vm_page_t;
 #define PQ_HOLD                (3 + 2*PQ_MAXL2_SIZE)
 #define PQ_COUNT       (4 + 2*PQ_MAXL2_SIZE)
 
+/*
+ * Scan support
+ */
+struct vm_map;
+
+struct rb_vm_page_scan_info {
+       vm_pindex_t     start_pindex;
+       vm_pindex_t     end_pindex;
+       int             limit;
+       int             desired;
+       int             error;
+       int             pagerflags;
+       vm_offset_t     addr;
+       vm_pindex_t     backing_offset_index;
+       struct vm_object *object;
+       struct vm_object *backing_object;
+       struct vm_page  *mpte;
+       struct pmap     *pmap;
+       struct vm_map   *map;
+};
+
+int rb_vm_page_scancmp(struct vm_page *, void *);
+
 struct vpgqueues {
        struct pglist pl;
        int     *cnt;
index dc39d79..ca92d6e 100644 (file)
@@ -66,7 +66,7 @@
  * rights to redistribute these changes.
  *
  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
- * $DragonFly: src/sys/vm/vm_pageout.c,v 1.26 2006/11/07 17:51:24 dillon Exp $
+ * $DragonFly: src/sys/vm/vm_pageout.c,v 1.27 2006/12/02 23:13:46 dillon Exp $
  */
 
 /*
@@ -471,12 +471,13 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags)
  *
  *     The object and map must be locked.
  */
+static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
+
 static void
 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
        vm_pindex_t desired, int map_remove_only)
 {
-       vm_page_t p, next;
-       int rcount;
+       struct rb_vm_page_scan_info info;
        int remove_mode;
 
        if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
@@ -498,64 +499,68 @@ vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
                 * our busy check.
                 */
                crit_enter();
-               rcount = object->resident_page_count;
-               p = TAILQ_FIRST(&object->memq);
+               info.limit = remove_mode;
+               info.map = map;
+               info.desired = desired;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
+                               vm_pageout_object_deactivate_pages_callback,
+                               &info
+               );
+               crit_exit();
+               object = object->backing_object;
+       }
+}
+                                       
+static int
+vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
+{
+       struct rb_vm_page_scan_info *info = data;
+       int actcount;
 
-               while (p && (rcount-- > 0)) {
-                       int actcount;
-                       if (pmap_resident_count(vm_map_pmap(map)) <= desired) {
-                               crit_exit();
-                               return;
-                       }
-                       next = TAILQ_NEXT(p, listq);
-                       mycpu->gd_cnt.v_pdpages++;
-                       if (p->wire_count != 0 ||
-                           p->hold_count != 0 ||
-                           p->busy != 0 ||
-                           (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
-                           !pmap_page_exists_quick(vm_map_pmap(map), p)) {
-                               p = next;
-                               continue;
-                       }
+       if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
+               return(-1);
+       }
+       mycpu->gd_cnt.v_pdpages++;
+       if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 ||
+           (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
+           !pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
+               return(0);
+       }
 
-                       actcount = pmap_ts_referenced(p);
-                       if (actcount) {
-                               vm_page_flag_set(p, PG_REFERENCED);
-                       } else if (p->flags & PG_REFERENCED) {
-                               actcount = 1;
-                       }
+       actcount = pmap_ts_referenced(p);
+       if (actcount) {
+               vm_page_flag_set(p, PG_REFERENCED);
+       } else if (p->flags & PG_REFERENCED) {
+               actcount = 1;
+       }
 
-                       if ((p->queue != PQ_ACTIVE) &&
-                               (p->flags & PG_REFERENCED)) {
-                               vm_page_activate(p);
-                               p->act_count += actcount;
-                               vm_page_flag_clear(p, PG_REFERENCED);
-                       } else if (p->queue == PQ_ACTIVE) {
-                               if ((p->flags & PG_REFERENCED) == 0) {
-                                       p->act_count -= min(p->act_count, ACT_DECLINE);
-                                       if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
-                                               vm_page_protect(p, VM_PROT_NONE);
-                                               vm_page_deactivate(p);
-                                       } else {
-                                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                                       }
-                               } else {
-                                       vm_page_activate(p);
-                                       vm_page_flag_clear(p, PG_REFERENCED);
-                                       if (p->act_count < (ACT_MAX - ACT_ADVANCE))
-                                               p->act_count += ACT_ADVANCE;
-                                       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                                       TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
-                               }
-                       } else if (p->queue == PQ_INACTIVE) {
+       if ((p->queue != PQ_ACTIVE) &&
+               (p->flags & PG_REFERENCED)) {
+               vm_page_activate(p);
+               p->act_count += actcount;
+               vm_page_flag_clear(p, PG_REFERENCED);
+       } else if (p->queue == PQ_ACTIVE) {
+               if ((p->flags & PG_REFERENCED) == 0) {
+                       p->act_count -= min(p->act_count, ACT_DECLINE);
+                       if (!info->limit && (vm_pageout_algorithm || (p->act_count == 0))) {
                                vm_page_protect(p, VM_PROT_NONE);
+                               vm_page_deactivate(p);
+                       } else {
+                               TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
+                               TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
                        }
-                       p = next;
+               } else {
+                       vm_page_activate(p);
+                       vm_page_flag_clear(p, PG_REFERENCED);
+                       if (p->act_count < (ACT_MAX - ACT_ADVANCE))
+                               p->act_count += ACT_ADVANCE;
+                       TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
+                       TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
                }
-               crit_exit();
-               object = object->backing_object;
+       } else if (p->queue == PQ_INACTIVE) {
+               vm_page_protect(p, VM_PROT_NONE);
        }
+       return(0);
 }
 
 /*