kernel - VM rework part 2 - Replace backing_object with backing_ba
author     Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 9 May 2019 02:39:44 +0000 (19:39 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 10 May 2019 16:24:45 +0000 (09:24 -0700)
* Remove the vm_object based backing_object chains and all related
  chaining code.

  This removes an enormous number of locks from the VM system and
  also removes the object-to-object dependencies that required
  careful traversal code.  A great deal of complex code has been
  removed and replaced with far simpler code.

  Ultimately the intention is to support removal of pv_entry
  tracking from vm_pages to gain lockless shared faults, but that
  is far in the future.  It will require hanging vm_map_backing
  structures off of a list anchored in the vm_object.

* Implement the vm_map_backing structure, which is embedded in the
  vm_map_entry and links to additional dynamically allocated
  vm_map_backing structures via entry->ba.backing_ba.  This structure
  contains the object and offset and essentially takes over the
  functionality that object->backing_object used to have.

  Backing objects are now handled via vm_map_backing.  In this
  commit, fork operations create a fan-in tree to shared subsets
  of backings via vm_map_backing; these subsets are not yet
  collapsed in any way (see the sketch below).
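
  For orientation, the following is a minimal user-space sketch of the
  new arrangement, assuming only the fields visible in this diff
  (object, offset, backing_ba); the real kernel structure carries
  additional fields and locking state, and vm_map_backing_bottom() is
  a hypothetical helper illustrating the walk now used by code such as
  each_segment().

    /*
     * Hypothetical stand-alone model, not the kernel definitions.
     */
    #include <stddef.h>
    #include <sys/types.h>

    struct vm_object;                           /* opaque for this sketch */

    struct vm_map_backing {
            struct vm_map_backing *backing_ba;  /* deeper backing, or NULL */
            struct vm_object      *object;      /* backing store */
            off_t                  offset;      /* offset into the object */
    };

    struct vm_map_entry {
            struct vm_map_backing ba;   /* embedded; replaces object.vm_object */
    };

    /* Walk entry->ba.backing_ba to the terminal (bottom-most) backing. */
    static struct vm_map_backing *
    vm_map_backing_bottom(struct vm_map_entry *entry)
    {
            struct vm_map_backing *ba = &entry->ba;

            while (ba->backing_ba != NULL)
                    ba = ba->backing_ba;
            return (ba);
    }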

* Remove all the vm_map_split and collapse code.  Every last line
  is gone.  It will be reimplemented using vm_map_backing in a
  later commit.

  This means that, as of this commit, both recursive forks and
  parent-to-multiple-children forks cause inefficient lists of
  backing objects to accumulate in the parent and children.  This
  will begin to be addressed in part 3.

* The code no longer releases the vm_map lock (typically shared)
  across getpages I/O.  There are no longer any chaining locks to
  get in the way (hopefully).  This means that the code does not
  have to re-check state as carefully as it did before.  However,
  some complexity will have to be added back in once we begin to
  address the accumulation of vm_map_backing structures.

* Paging performance is improved by 30-40%.

18 files changed:
sys/kern/imgact_elf.c
sys/kern/sys_process.c
sys/kern/sysv_shm.c
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/trap.c
sys/vfs/procfs/procfs_map.c
sys/vm/swap_pager.c
sys/vm/swap_pager.h
sys/vm/vm_fault.c
sys/vm/vm_map.c
sys/vm/vm_map.h
sys/vm/vm_mmap.c
sys/vm/vm_object.c
sys/vm/vm_object.h
sys/vm/vm_swapcache.c
sys/vm/vnode_pager.c
sys/vm/vnode_pager.h
usr.bin/fstat/fstat.c

diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 2632ba6..949c4f9 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1096,8 +1096,8 @@ cb_fpcount_segment(vm_map_entry_t entry, void *closure)
        int *count = closure;
        struct vnode *vp;
 
-       if (entry->object.vm_object->type == OBJT_VNODE) {
-               vp = (struct vnode *)entry->object.vm_object->handle;
+       if (entry->ba.object && entry->ba.object->type == OBJT_VNODE) {
+               vp = (struct vnode *)entry->ba.object->handle;
                if ((vp->v_flag & VCKPT) && curproc->p_textvp == vp)
                        return (0);
                ++*count;
@@ -1132,8 +1132,8 @@ cb_put_fp(vm_map_entry_t entry, void *closure)
         * referencing many prior checkpoint files and that is a bit over
         * the top for the purpose of the checkpoint API.
         */
-       if (entry->object.vm_object->type == OBJT_VNODE) {
-               vp = (struct vnode *)entry->object.vm_object->handle;
+       if (entry->ba.object && entry->ba.object->type == OBJT_VNODE) {
+               vp = (struct vnode *)entry->ba.object->handle;
                if ((vp->v_flag & VCKPT) && curproc->p_textvp == vp)
                        return (0);
                if (vnh == fpc->vnh_max)
@@ -1192,9 +1192,8 @@ each_segment(struct proc *p, segment_callback func, void *closure, int writable)
        vm_map_entry_t entry;
 
        RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
+               vm_map_backing_t *ba;
                vm_object_t obj;
-               vm_object_t lobj;
-               vm_object_t tobj;
 
                /*
                 * Don't dump inaccessible mappings, deal with legacy
@@ -1224,43 +1223,32 @@ each_segment(struct proc *p, segment_callback func, void *closure, int writable)
                        continue;
                if (entry->maptype != VM_MAPTYPE_NORMAL)
                        continue;
-               if ((obj = entry->object.vm_object) == NULL)
-                       continue;
 
                /*
                 * Find the bottom-most object, leaving the base object
                 * and the bottom-most object held (but only one hold
                 * if they happen to be the same).
                 */
-               vm_object_hold_shared(obj);
-
-               lobj = obj;
-               while (lobj && (tobj = lobj->backing_object) != NULL) {
-                       KKASSERT(tobj != obj);
-                       vm_object_hold_shared(tobj);
-                       if (tobj == lobj->backing_object) {
-                               if (lobj != obj) {
-                                       vm_object_lock_swap();
-                                       vm_object_drop(lobj);
-                               }
-                               lobj = tobj;
-                       } else {
-                               vm_object_drop(tobj);
-                       }
-               }
+               ba = &entry->ba;
+               while (ba->backing_ba)
+                       ba = ba->backing_ba;
+               obj = ba->object;
 
                /*
                 * The callback only applies to default, swap, or vnode
                 * objects.  Other types of objects such as memory-mapped
                 * devices are ignored.
                 */
-               if (lobj->type == OBJT_DEFAULT || lobj->type == OBJT_SWAP ||
-                   lobj->type == OBJT_VNODE) {
-                       error = (*func)(entry, closure);
+               if (obj) {
+                       vm_object_hold_shared(obj);
+
+                       if (obj->type == OBJT_DEFAULT ||
+                           obj->type == OBJT_SWAP ||
+                           obj->type == OBJT_VNODE) {
+                               error = (*func)(entry, closure);
+                       }
+                       vm_object_drop(obj);
                }
-               if (lobj != obj)
-                       vm_object_drop(lobj);
-               vm_object_drop(obj);
        }
        return (error);
 }
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index ebde762..4580956 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -60,6 +60,7 @@ pread (struct proc *procp, unsigned int addr, unsigned int *retval) {
        int             rv;
        vm_map_t        map, tmap;
        vm_object_t     object;
+       vm_map_backing_t *ba;
        vm_offset_t     kva = 0;
        int             page_offset;    /* offset into page */
        vm_offset_t     pageno;         /* page number */
@@ -77,7 +78,12 @@ pread (struct proc *procp, unsigned int addr, unsigned int *retval) {
 
        tmap = map;
        rv = vm_map_lookup(&tmap, pageno, VM_PROT_READ, &out_entry,
-                          &object, &pindex, &out_prot, &wflags);
+                          &ba, &pindex, &out_prot, &wflags);
+       if (ba)
+               object = ba->object;
+       else
+               object = NULL;
+
 
        if (rv != KERN_SUCCESS)
                return EINVAL;
@@ -111,6 +117,7 @@ pwrite (struct proc *procp, unsigned int addr, unsigned int datum) {
        int             rv;
        vm_map_t        map, tmap;
        vm_object_t     object;
+       vm_map_backing_t *ba;
        vm_offset_t     kva = 0;
        int             page_offset;    /* offset into page */
        vm_offset_t     pageno;         /* page number */
@@ -154,7 +161,12 @@ pwrite (struct proc *procp, unsigned int addr, unsigned int datum) {
 
        tmap = map;
        rv = vm_map_lookup(&tmap, pageno, VM_PROT_WRITE, &out_entry,
-                          &object, &pindex, &out_prot, &wflags);
+                          &ba, &pindex, &out_prot, &wflags);
+       if (ba)
+               object = ba->object;
+       else
+               object = NULL;
+
        if (rv != KERN_SUCCESS)
                return EINVAL;
 
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index 911b2fe..9c01b77 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -345,7 +345,6 @@ again:
 
        shm_handle = shmseg->shm_internal;
        vm_object_hold(shm_handle->shm_object);
-       vm_object_chain_wait(shm_handle->shm_object, 0);
        vm_object_reference_locked(shm_handle->shm_object);
        rv = vm_map_find(&p->p_vmspace->vm_map, 
                         shm_handle->shm_object, NULL,
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 92d4051..c4d0ee6 100644
--- a/sys/platform/pc64/x86_64/pmap.c
+++ b/sys/platform/pc64/x86_64/pmap.c
@@ -2695,10 +2695,10 @@ pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp,
            ptepindex >= pmap_pd_pindex(0) ||           /* not terminal or pt */
            entry->inheritance != VM_INHERIT_SHARE ||   /* not shared */
            entry->maptype != VM_MAPTYPE_NORMAL ||      /* weird map type */
-           entry->object.vm_object == NULL ||          /* needs VM object */
-           entry->object.vm_object->type == OBJT_DEVICE ||     /* ick */
-           entry->object.vm_object->type == OBJT_MGTDEVICE ||  /* ick */
-           (entry->offset & SEG_MASK) ||               /* must be aligned */
+           entry->ba.object == NULL ||         /* needs VM object */
+           entry->ba.object->type == OBJT_DEVICE ||    /* ick */
+           entry->ba.object->type == OBJT_MGTDEVICE || /* ick */
+           (entry->ba.offset & SEG_MASK) ||            /* must be aligned */
            (entry->start & SEG_MASK)) {
                return(pmap_allocpte(pmap, ptepindex, pvpp));
        }
@@ -2714,7 +2714,7 @@ pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp,
         * If the full segment can be represented dive the VM object's
         * shared pmap, allocating as required.
         */
-       object = entry->object.vm_object;
+       object = entry->ba.object;
 
        if (entry->protection & VM_PROT_WRITE)
                obpmapp = &object->md.pmap_rw;
diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c
index db1cded..a3ed6e8 100644
--- a/sys/platform/pc64/x86_64/trap.c
+++ b/sys/platform/pc64/x86_64/trap.c
@@ -817,9 +817,10 @@ out2:      ;
                ("trap: critical section count mismatch! %d/%d",
                crit_count, td->td_pri));
        KASSERT(curstop == td->td_toks_stop,
-               ("trap: extra tokens held after trap! %ld/%ld",
+               ("trap: extra tokens held after trap! %ld/%ld (%s)",
                curstop - &td->td_toks_base,
-               td->td_toks_stop - &td->td_toks_base));
+               td->td_toks_stop - &td->td_toks_base,
+               td->td_toks_stop[-1].tr_tok->t_desc));
 #endif
 }
 
@@ -910,6 +911,8 @@ trap_pfault(struct trapframe *frame, int usermode)
        else
                ftype = VM_PROT_READ;
 
+       lwkt_tokref_t stop = td->td_toks_stop;
+
        if (map != &kernel_map) {
                /*
                 * Keep swapout from messing with us during this
@@ -928,6 +931,11 @@ trap_pfault(struct trapframe *frame, int usermode)
                else
                        fault_flags |= VM_FAULT_NORMAL;
                rv = vm_fault(map, va, ftype, fault_flags);
+               if (td->td_toks_stop != stop) {
+                       stop = td->td_toks_stop - 1;
+                       kprintf("A-HELD TOKENS DURING PFAULT td=%p(%s) map=%p va=%p ftype=%d fault_flags=%d\n", td, td->td_comm, map, (void *)va, ftype, fault_flags);
+                       panic("held tokens");
+               }
 
                PRELE(lp->lwp_proc);
        } else {
@@ -937,6 +945,11 @@ trap_pfault(struct trapframe *frame, int usermode)
                 */
                fault_flags = VM_FAULT_NORMAL;
                rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
+               if (td->td_toks_stop != stop) {
+                       stop = td->td_toks_stop - 1;
+                       kprintf("B-HELD TOKENS DURING PFAULT td=%p(%s) map=%p va=%p ftype=%d fault_flags=%d\n", td, td->td_comm, map, (void *)va, ftype, VM_FAULT_NORMAL);
+                       panic("held tokens");
+               }
        }
        if (rv == KERN_SUCCESS)
                return (0);
diff --git a/sys/vfs/procfs/procfs_map.c b/sys/vfs/procfs/procfs_map.c
index 4a7e88a..f0307f1 100644
--- a/sys/vfs/procfs/procfs_map.c
+++ b/sys/vfs/procfs/procfs_map.c
@@ -86,7 +86,8 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
        lwkt_reltoken(&p->p_token);
 
        RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
-               vm_object_t obj, tobj, lobj;
+               vm_map_backing_t *ba;
+               vm_object_t obj;
                int ref_count, flags;
                vm_offset_t e_start, e_end;
                vm_eflags_t e_eflags;
@@ -97,13 +98,10 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
                switch(entry->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       obj = entry->object.vm_object;
-                       if (obj != NULL) {
-                               vm_object_hold(obj);
-                       }
+                       ba = &entry->ba;
                        break;
                case VM_MAPTYPE_UKSMAP:
-                       obj = NULL;
+                       ba = NULL;
                        break;
                default:
                        /* ignore entry */
@@ -132,31 +130,25 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
                        addr += PAGE_SIZE;
                }
 #endif
-               if (obj) {
-                       lobj = obj;
-                       while ((tobj = lobj->backing_object) != NULL) {
-                               KKASSERT(tobj != obj);
-                               vm_object_hold(tobj);
-                               if (tobj == lobj->backing_object) {
-                                       if (lobj != obj) {
-                                               vm_object_lock_swap();
-                                               vm_object_drop(lobj);
-                                       }
-                                       lobj = tobj;
-                               } else {
-                                       vm_object_drop(tobj);
-                               }
-                       }
+               if (ba) {
+                       while (ba->backing_ba)
+                               ba = ba->backing_ba;
+                       obj = ba->object;
+                       if (obj)
+                               vm_object_hold(obj);
                } else {
-                       lobj = NULL;
+                       obj = NULL;
                }
                last_timestamp = map->timestamp;
                vm_map_unlock(map);
 
                freepath = NULL;
                fullpath = "-";
-               if (lobj) {
-                       switch(lobj->type) {
+               flags = 0;
+               ref_count = 0;
+
+               if (obj) {
+                       switch(obj->type) {
                        default:
                        case OBJT_DEFAULT:
                                type = "default";
@@ -164,7 +156,7 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
                                break;
                        case OBJT_VNODE:
                                type = "vnode";
-                               vp = lobj->handle;
+                               vp = obj->handle;
                                vref(vp);
                                break;
                        case OBJT_SWAP:
@@ -180,19 +172,16 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
                                vp = NULL;
                                break;
                        }
-                       if (lobj != obj)
-                               vm_object_drop(lobj);
-                       
-                       flags = obj->flags;
-                       ref_count = obj->ref_count;
+                       if (ba->object) {
+                               flags = ba->object->flags;
+                               ref_count = ba->object->ref_count;
+                       }
                        vm_object_drop(obj);
-                       if (vp != NULL) {
+                       if (vp) {
                                vn_fullpath(p, vp, &fullpath, &freepath, 1);
                                vrele(vp);
                        }
                } else {
-                       flags = 0;
-                       ref_count = 0;
                        switch(entry->maptype) {
                        case VM_MAPTYPE_UNSPECIFIED:
                                type = "unspec";
@@ -227,7 +216,7 @@ procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
 #endif
                          "0x%04x %s%s %s %s\n",
                        (u_long)e_start, (u_long)e_end,
-                       resident, -1, obj,
+                       resident, -1, (ba ? ba->object : NULL),
                        (e_prot & VM_PROT_READ) ? "r" : "-",
                        (e_prot & VM_PROT_WRITE) ? "w" : "-",
                        (e_prot & VM_PROT_EXECUTE) ? "x" : "-",
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 98d38ff..0f0de4c 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -898,6 +898,17 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
        return (TRUE);
 }
 
+/*
+ * Object must be held exclusive or shared by the caller.
+ */
+boolean_t
+swap_pager_haspage_locked(vm_object_t object, vm_pindex_t pindex)
+{
+       if (swp_pager_meta_ctl(object, pindex, 0) == SWAPBLK_NONE)
+               return FALSE;
+       return TRUE;
+}
+
 /*
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 63904ca..4e0bc17 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -103,6 +103,7 @@ extern int nswap_lowat, nswap_hiwat;
 
 void swap_pager_putpages (vm_object_t, struct vm_page **, int, int, int *);
 boolean_t swap_pager_haspage (vm_object_t object, vm_pindex_t pindex);
+boolean_t swap_pager_haspage_locked (vm_object_t object, vm_pindex_t pindex);
 int swap_pager_swapoff (int devidx);
 
 int swap_pager_swp_alloc (vm_object_t, int);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d66ea74..59b9e44 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2014 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
 #include <vm/vm_kern.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
+#include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 
 #include <vm/vm_page2.h>
 
 struct faultstate {
        vm_page_t m;
-       vm_object_t object;
+       vm_map_backing_t *ba;
        vm_pindex_t pindex;
        vm_prot_t prot;
        vm_page_t first_m;
-       vm_object_t first_object;
+       vm_map_backing_t *first_ba;
        vm_prot_t first_prot;
        vm_map_t map;
        vm_map_entry_t entry;
@@ -145,6 +146,7 @@ struct faultstate {
        int msoftonly;
        int first_shared;
        int wflags;
+       int first_ba_held;
        struct vnode *vp;
 };
 
@@ -152,9 +154,11 @@ __read_mostly static int debug_fault = 0;
 SYSCTL_INT(_vm, OID_AUTO, debug_fault, CTLFLAG_RW, &debug_fault, 0, "");
 __read_mostly static int debug_cluster = 0;
 SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
-__read_mostly static int virtual_copy_enable = 1;
+#if 0
+static int virtual_copy_enable = 1;
 SYSCTL_INT(_vm, OID_AUTO, virtual_copy_enable, CTLFLAG_RW,
                &virtual_copy_enable, 0, "");
+#endif
 __read_mostly int vm_shared_fault = 1;
 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
@@ -213,6 +217,7 @@ release_page(struct faultstate *fs)
  * NOTE: This function can fail due to a deadlock against the caller's
  *      holding of a vm_page BUSY.
  */
+#if 0
 static __inline int
 relock_map(struct faultstate *fs)
 {
@@ -227,13 +232,23 @@ relock_map(struct faultstate *fs)
        }
        return error;
 }
+#endif
 
 static __inline void
 unlock_map(struct faultstate *fs)
 {
+       if (fs->ba != fs->first_ba)
+               vm_object_drop(fs->ba->object);
+       if (fs->first_ba && fs->first_ba_held) {
+               vm_object_drop(fs->first_ba->object);
+               fs->first_ba_held = 0;
+       }
+       fs->ba = NULL;
+       fs->first_ba = NULL;
        if (fs->lookup_still_valid && fs->map) {
                vm_map_lookup_done(fs->map, fs->entry, 0);
                fs->lookup_still_valid = FALSE;
+               fs->entry = NULL;
        }
 }
 
@@ -242,38 +257,41 @@ unlock_map(struct faultstate *fs)
  * to vm_fault_object() can be made.
  */
 static void
-_cleanup_successful_fault(struct faultstate *fs, int relock)
+cleanup_fault(struct faultstate *fs)
 {
        /*
         * We allocated a junk page for a COW operation that did
         * not occur, the page must be freed.
         */
-       if (fs->object != fs->first_object) {
+       if (fs->ba != fs->first_ba) {
                KKASSERT(fs->first_shared == 0);
-               vm_page_free(fs->first_m);
-               vm_object_pip_wakeup(fs->object);
+
+               /*
+                * first_m could be completely valid and we got here
+                * because of a PG_RAM, don't mistakenly free it!
+                */
+               if ((fs->first_m->valid & VM_PAGE_BITS_ALL) ==
+                   VM_PAGE_BITS_ALL) {
+                       vm_page_wakeup(fs->first_m);
+               } else {
+                       vm_page_free(fs->first_m);
+               }
+               vm_object_pip_wakeup(fs->ba->object);
                fs->first_m = NULL;
-       }
 
-       /*
-        * Reset fs->object.
-        */
-       fs->object = fs->first_object;
-       if (relock && fs->lookup_still_valid == FALSE) {
-               if (fs->map)
-                       vm_map_lock_read(fs->map);
-               fs->lookup_still_valid = TRUE;
+               /*
+                * Reset fs->ba (used by vm_fault_vpagetable() without
+                * calling unlock_map()), so we need a little duplication.
+                */
+               vm_object_drop(fs->ba->object);
+               fs->ba = fs->first_ba;
        }
 }
 
 static void
-_unlock_things(struct faultstate *fs, int dealloc)
+unlock_things(struct faultstate *fs)
 {
-       _cleanup_successful_fault(fs, 0);
-       if (dealloc) {
-               /*vm_object_deallocate(fs->first_object);*/
-               /*fs->first_object = NULL; drop used later on */
-       }
+       cleanup_fault(fs);
        unlock_map(fs); 
        if (fs->vp != NULL) { 
                vput(fs->vp);
@@ -281,10 +299,7 @@ _unlock_things(struct faultstate *fs, int dealloc)
        }
 }
 
-#define unlock_things(fs) _unlock_things(fs, 0)
-#define unlock_and_deallocate(fs) _unlock_things(fs, 1)
-#define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)
-
+#if 0
 /*
  * Virtual copy tests.   Used by the fault code to determine if a
  * page can be moved from an orphan vm_object into its shadow
@@ -308,26 +323,26 @@ virtual_copy_test(struct faultstate *fs)
        /*
         * No refs, except us
         */
-       if (fs->object->ref_count != 1)
+       if (fs->ba->object->ref_count != 1)
                return 0;
 
        /*
         * No one else can look this object up
         */
-       if (fs->object->handle != NULL)
+       if (fs->ba->object->handle != NULL)
                return 0;
 
        /*
         * No other ways to look the object up
         */
-       if (fs->object->type != OBJT_DEFAULT &&
-           fs->object->type != OBJT_SWAP)
+       if (fs->ba->object->type != OBJT_DEFAULT &&
+           fs->ba->object->type != OBJT_SWAP)
                return 0;
 
        /*
         * We don't chase down the shadow chain
         */
-       if (fs->object != fs->first_object->backing_object)
+       if (fs->ba != fs->first_ba->backing_ba)
                return 0;
 
        return 1;
@@ -354,6 +369,7 @@ virtual_copy_ok(struct faultstate *fs)
        }
        return 0;
 }
+#endif
 
 /*
  * TRYPAGER 
@@ -365,7 +381,7 @@ virtual_copy_ok(struct faultstate *fs)
  * a wiring fault or if the FS entry is wired.
  */
 #define TRYPAGER(fs)   \
-               (fs->object->type != OBJT_DEFAULT &&                    \
+               (fs->ba->object->type != OBJT_DEFAULT &&                \
                (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) ||       \
                 (fs->wflags & FW_WIRED)))
 
@@ -396,7 +412,6 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
        thread_t td;
        struct vm_map_ilock ilock;
        int didilock;
-       int didhold;
        int growstack;
        int retry = 0;
        int inherit_prot;
@@ -421,7 +436,7 @@ RetryFault:
         * vm_fault_quick() can shortcut us.
         */
        fs.msoftonly = 0;
-       didhold = 0;
+       fs.first_ba_held = 0;
 
        /*
         * Find the vm_map_entry representing the backing store and resolve
@@ -440,7 +455,7 @@ RetryFault:
         */
        fs.map = map;
        result = vm_map_lookup(&fs.map, vaddr, fault_type,
-                              &fs.entry, &fs.first_object,
+                              &fs.entry, &fs.first_ba,
                               &first_pindex, &fs.first_prot, &fs.wflags);
 
        /*
@@ -486,7 +501,7 @@ RetryFault:
                result = vm_map_lookup(&fs.map, vaddr,
                                       VM_PROT_READ|VM_PROT_WRITE|
                                        VM_PROT_OVERRIDE_WRITE,
-                                      &fs.entry, &fs.first_object,
+                                      &fs.entry, &fs.first_ba,
                                       &first_pindex, &fs.first_prot,
                                       &fs.wflags);
                if (result != KERN_SUCCESS) {
@@ -517,7 +532,7 @@ RetryFault:
        fs.map_generation = fs.map->timestamp;
        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
-       fs.object = fs.first_object;    /* so unlock_and_deallocate works */
+       fs.ba = fs.first_ba;            /* so unlock_things() works */
        fs.prot = fs.first_prot;        /* default (used by uksmap) */
 
        if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
@@ -548,7 +563,7 @@ RetryFault:
                fakem.busy_count = PBUSY_LOCKED;
                fakem.valid = VM_PAGE_BITS_ALL;
                fakem.pat_mode = VM_MEMATTR_DEFAULT;
-               if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
+               if (fs.entry->ba.uksmap(fs.entry->aux.dev, &fakem)) {
                        result = KERN_FAILURE;
                        unlock_things(&fs);
                        goto done2;
@@ -562,7 +577,7 @@ RetryFault:
         * A system map entry may return a NULL object.  No object means
         * no pager means an unrecoverable kernel fault.
         */
-       if (fs.first_object == NULL) {
+       if (fs.first_ba == NULL) {
                panic("vm_fault: unrecoverable fault at %p in entry %p",
                        (void *)vaddr, fs.entry);
        }
@@ -581,9 +596,9 @@ RetryFault:
         */
        if ((td->td_flags & TDF_NOFAULT) &&
            (retry ||
-            fs.first_object->type == OBJT_VNODE ||
-            fs.first_object->type == OBJT_SWAP ||
-            fs.first_object->backing_object)) {
+            fs.first_ba->object->type == OBJT_VNODE ||
+            fs.first_ba->object->type == OBJT_SWAP ||
+            fs.first_ba->backing_ba)) {
                result = KERN_FAILURE;
                unlock_things(&fs);
                goto done2;
@@ -624,9 +639,8 @@ RetryFault:
         * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
         *                   we can try shared first.
         */
-       if (fault_flags & VM_FAULT_UNSWAP) {
+       if (fault_flags & VM_FAULT_UNSWAP)
                fs.first_shared = 0;
-       }
 
        /*
         * Try to shortcut the entire mess and run the fault lockless.
@@ -638,6 +652,12 @@ RetryFault:
                goto success;
        }
 
+       /*
+        * Exclusive heuristic (alloc page vs page exists)
+        */
+       if (fs.first_ba->flags & VM_MAP_BACK_EXCL_HEUR)
+               fs.first_shared = 0;
+
        /*
         * Obtain a top-level object lock, shared or exclusive depending
         * on fs.first_shared.  If a shared lock winds up being insufficient
@@ -646,12 +666,12 @@ RetryFault:
         * The vnode pager lock is always shared.
         */
        if (fs.first_shared)
-               vm_object_hold_shared(fs.first_object);
+               vm_object_hold_shared(fs.first_ba->object);
        else
-               vm_object_hold(fs.first_object);
+               vm_object_hold(fs.first_ba->object);
        if (fs.vp == NULL)
-               fs.vp = vnode_pager_lock(fs.first_object);
-       didhold = 1;
+               fs.vp = vnode_pager_lock(fs.first_ba);
+       fs.first_ba_held = 1;
 
        /*
         * The page we want is at (first_object, first_pindex), but if the
@@ -670,7 +690,6 @@ RetryFault:
                                             fault_type, 1);
                if (result == KERN_TRY_AGAIN) {
                        vm_map_deinterlock(fs.map, &ilock);
-                       vm_object_drop(fs.first_object);
                        ++retry;
                        goto RetryFault;
                }
@@ -683,9 +702,8 @@ RetryFault:
        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
-        * data.   If it succeeds everything remains locked and fs->object
-        * will have an additional PIP count if it is not equal to
-        * fs->first_object
+        * data.   If it succeeds everything remains locked and fs->ba->object
+        * will have an additional PIP count if fs->ba != fs->first_ba.
         *
         * vm_fault_object will set fs->prot for the pmap operation.  It is
         * allowed to set VM_PROT_WRITE if fault_type == VM_PROT_READ if the
@@ -709,7 +727,6 @@ RetryFault:
        if (result == KERN_TRY_AGAIN) {
                if (didilock)
                        vm_map_deinterlock(fs.map, &ilock);
-               vm_object_drop(fs.first_object);
                ++retry;
                goto RetryFault;
        }
@@ -722,7 +739,7 @@ RetryFault:
 success:
        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
-        * will contain a busied page.
+        * will contain a busied page.  It does drop fs->ba if appropriate.
         *
         * Enter the page into the pmap and do pmap-related adjustments.
         *
@@ -804,14 +821,15 @@ done_success:
                }
        }
 
-       /*vm_object_deallocate(fs.first_object);*/
+       /*vm_object_deallocate(fs.first_ba->object);*/
        /*fs.m = NULL; */
-       /*fs.first_object = NULL; must still drop later */
 
        result = KERN_SUCCESS;
 done:
-       if (fs.first_object && didhold)
-               vm_object_drop(fs.first_object);
+       if (fs.first_ba && fs.first_ba->object && fs.first_ba_held) {
+               vm_object_drop(fs.first_ba->object);
+               fs.first_ba_held = 0;
+       }
 done2:
        if (lp)
                lp->lwp_flags &= ~LWP_PAGING;
@@ -840,6 +858,20 @@ done2:
        }
 #endif
 
+       if (result != KERN_SUCCESS && debug_fault < 0) {
+               kprintf("VM_FAULT %d:%d (%s) result %d "
+                       "addr=%jx type=%02x flags=%02x "
+                       "fs.m=%p fs.prot=%02x fs.wflags=%02x fs.entry=%p\n",
+                       (curthread->td_proc ? curthread->td_proc->p_pid : -1),
+                       (curthread->td_lwp ? curthread->td_lwp->lwp_tid : -1),
+                       curthread->td_comm,
+                       result,
+                       (intmax_t)vaddr, fault_type, fault_flags,
+                       fs.m, fs.prot, fs.wflags, fs.entry);
+               while (debug_fault < 0 && (debug_fault & 1))
+                       tsleep(&debug_fault, 0, "DEBUG", hz);
+       }
+
        return (result);
 }
 
@@ -859,7 +891,7 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
        /*
         * Don't waste time if the object is only being used by one vm_map.
         */
-       obj = fs->first_object;
+       obj = fs->first_ba->object;
        if (obj->flags & OBJ_ONEMAPPING)
                return KERN_FAILURE;
 
@@ -1062,8 +1094,9 @@ RetryFault:
         *        entry.
         */
        fs.map = map;
+       fs.first_ba_held = 0;
        result = vm_map_lookup(&fs.map, vaddr, fault_type,
-                              &fs.entry, &fs.first_object,
+                              &fs.entry, &fs.first_ba,
                               &first_pindex, &fs.first_prot, &fs.wflags);
 
        if (result != KERN_SUCCESS) {
@@ -1100,7 +1133,7 @@ RetryFault:
                result = vm_map_lookup(&fs.map, vaddr,
                                       VM_PROT_READ|VM_PROT_WRITE|
                                        VM_PROT_OVERRIDE_WRITE,
-                                      &fs.entry, &fs.first_object,
+                                      &fs.entry, &fs.first_ba,
                                       &first_pindex, &fs.first_prot,
                                       &fs.wflags);
                if (result != KERN_SUCCESS) {
@@ -1132,7 +1165,7 @@ RetryFault:
        fs.map_generation = fs.map->timestamp;
        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
-       fs.object = fs.first_object;    /* so unlock_and_deallocate works */
+       fs.ba = fs.first_ba;
 
        if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
                panic("vm_fault: fault on nofault entry, addr: %lx",
@@ -1154,7 +1187,7 @@ RetryFault:
                fakem.busy_count = PBUSY_LOCKED;
                fakem.valid = VM_PAGE_BITS_ALL;
                fakem.pat_mode = VM_MEMATTR_DEFAULT;
-               if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
+               if (fs.entry->ba.uksmap(fs.entry->aux.dev, &fakem)) {
                        *errorp = KERN_FAILURE;
                        fs.m = NULL;
                        unlock_things(&fs);
@@ -1174,7 +1207,7 @@ RetryFault:
         * A system map entry may return a NULL object.  No object means
         * no pager means an unrecoverable kernel fault.
         */
-       if (fs.first_object == NULL) {
+       if (fs.first_ba == NULL) {
                panic("vm_fault: unrecoverable fault at %p in entry %p",
                        (void *)vaddr, fs.entry);
        }
@@ -1189,9 +1222,9 @@ RetryFault:
         */
        if ((curthread->td_flags & TDF_NOFAULT) &&
            (retry ||
-            fs.first_object->type == OBJT_VNODE ||
-            fs.first_object->type == OBJT_SWAP ||
-            fs.first_object->backing_object)) {
+            fs.first_ba->object->type == OBJT_VNODE ||
+            fs.first_ba->object->type == OBJT_SWAP ||
+            fs.first_ba->backing_ba)) {
                *errorp = KERN_FAILURE;
                unlock_things(&fs);
                fs.m = NULL;
@@ -1218,12 +1251,16 @@ RetryFault:
         * truncation operations) during I/O.  This must be done after
         * obtaining the vnode lock in order to avoid possible deadlocks.
         */
+       if (fs.first_ba->flags & VM_MAP_BACK_EXCL_HEUR)
+               fs.first_shared = 0;
+
        if (fs.first_shared)
-               vm_object_hold_shared(fs.first_object);
+               vm_object_hold_shared(fs.first_ba->object);
        else
-               vm_object_hold(fs.first_object);
+               vm_object_hold(fs.first_ba->object);
+       fs.first_ba_held = 1;
        if (fs.vp == NULL)
-               fs.vp = vnode_pager_lock(fs.first_object);      /* shared */
+               fs.vp = vnode_pager_lock(fs.first_ba);  /* shared */
 
        /*
         * The page we want is at (first_object, first_pindex), but if the
@@ -1238,7 +1275,6 @@ RetryFault:
                                             fs.entry->aux.master_pde,
                                             fault_type, 1);
                if (result == KERN_TRY_AGAIN) {
-                       vm_object_drop(fs.first_object);
                        ++retry;
                        goto RetryFault;
                }
@@ -1252,15 +1288,14 @@ RetryFault:
        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
-        * data.   If it succeeds everything remains locked and fs->object
-        * will have an additinal PIP count if it is not equal to
-        * fs->first_object
+        * data.   If it succeeds everything remains locked and fs->ba->object
+        * will have an additional PIP count if fs->ba != fs->first_ba.
         */
        fs.m = NULL;
        result = vm_fault_object(&fs, first_pindex, fault_type, 1);
 
        if (result == KERN_TRY_AGAIN) {
-               vm_object_drop(fs.first_object);
+               KKASSERT(fs.first_ba_held == 0);
                ++retry;
                didcow |= fs.wflags & FW_DIDCOW;
                goto RetryFault;
@@ -1274,7 +1309,7 @@ RetryFault:
        if ((orig_fault_type & VM_PROT_WRITE) &&
            (fs.prot & VM_PROT_WRITE) == 0) {
                *errorp = KERN_PROTECTION_FAILURE;
-               unlock_and_deallocate(&fs);
+               unlock_things(&fs);
                fs.m = NULL;
                goto done;
        }
@@ -1347,13 +1382,11 @@ RetryFault:
                vm_page_hold(fs.m);
                vm_page_wakeup(fs.m);
        }
-       /*vm_object_deallocate(fs.first_object);*/
-       /*fs.first_object = NULL; */
+       /*vm_object_deallocate(fs.first_ba->object);*/
        *errorp = 0;
 
 done:
-       if (fs.first_object)
-               vm_object_drop(fs.first_object);
+       KKASSERT(fs.first_ba_held == 0);
 done2:
        return(fs.m);
 }
@@ -1380,9 +1413,11 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
 
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
        bzero(&entry, sizeof(entry));
-       entry.object.vm_object = object;
        entry.maptype = VM_MAPTYPE_NORMAL;
        entry.protection = entry.max_protection = fault_type;
+       entry.ba.backing_ba = NULL;
+       entry.ba.object = object;
+       entry.ba.offset = 0;
 
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
@@ -1391,6 +1426,7 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
        fs.first_shared = *sharedp;
        fs.msoftonly = 0;
        fs.vp = NULL;
+       fs.first_ba_held = 0;   /* object held across call, prevent drop */
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
 
        /*
@@ -1409,7 +1445,8 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
 RetryFault:
        *sharedp = fs.first_shared;
        first_pindex = OFF_TO_IDX(offset);
-       fs.first_object = object;
+       fs.first_ba = &entry.ba;
+       fs.ba = fs.first_ba;
        fs.entry = &entry;
        fs.first_prot = fault_type;
        fs.wflags = 0;
@@ -1430,11 +1467,10 @@ RetryFault:
         * obtaining the vnode lock in order to avoid possible deadlocks.
         */
        if (fs.vp == NULL)
-               fs.vp = vnode_pager_lock(fs.first_object);
+               fs.vp = vnode_pager_lock(fs.first_ba);
 
        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
-       fs.object = fs.first_object;    /* so unlock_and_deallocate works */
 
 #if 0
        /* XXX future - ability to operate on VM object using vpagetable */
@@ -1457,11 +1493,10 @@ RetryFault:
        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
-        * data.   If it succeeds everything remains locked and fs->object
-        * will have an additinal PIP count if it is not equal to
-        * fs->first_object
+        * data.   If it succeeds everything remains locked and fs->ba->object
+        * will have an additional PIP count if fs->ba != fs->first_ba.
         *
-        * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_object intact.
+        * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_ba intact.
         * We may have to upgrade its lock to handle the requested fault.
         */
        result = vm_fault_object(&fs, first_pindex, fault_type, 0);
@@ -1478,7 +1513,7 @@ RetryFault:
 
        if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) {
                *errorp = KERN_PROTECTION_FAILURE;
-               unlock_and_deallocate(&fs);
+               unlock_things(&fs);
                return(NULL);
        }
 
@@ -1519,8 +1554,7 @@ RetryFault:
         * Unlock everything, and return the held page.
         */
        vm_page_wakeup(fs.m);
-       /*vm_object_deallocate(fs.first_object);*/
-       /*fs.first_object = NULL; */
+       /*vm_object_deallocate(fs.first_ba->object);*/
 
        *errorp = 0;
        return(fs.m);
@@ -1548,7 +1582,7 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
        int result;
        vpte_t *ptep;
 
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
+       ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_ba->object));
        for (;;) {
                /*
                 * We cannot proceed if the vpte is not valid, not readable
@@ -1556,15 +1590,15 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 * not executable for an instruction execution fault.
                 */
                if ((vpte & VPTE_V) == 0) {
-                       unlock_and_deallocate(fs);
+                       unlock_things(fs);
                        return (KERN_FAILURE);
                }
                if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW) == 0) {
-                       unlock_and_deallocate(fs);
+                       unlock_things(fs);
                        return (KERN_FAILURE);
                }
                if ((fault_type & VM_PROT_EXECUTE) && (vpte & VPTE_NX)) {
-                       unlock_and_deallocate(fs);
+                       unlock_things(fs);
                        return (KERN_FAILURE);
                }
                if ((vpte & VPTE_PS) || vshift == 0)
@@ -1634,7 +1668,7 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                vm_page_flag_set(fs->m, PG_REFERENCED);
                vm_page_wakeup(fs->m);
                fs->m = NULL;
-               cleanup_successful_fault(fs);
+               cleanup_fault(fs);
        }
 
        /*
@@ -1672,16 +1706,17 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
 /*
  * This is the core of the vm_fault code.
  *
- * Do all operations required to fault-in (fs.first_object, pindex).  Run
- * through the shadow chain as necessary and do required COW or virtual
+ * Do all operations required to fault-in (fs.first_ba->object, pindex).
+ * Run through the backing store as necessary and do required COW or virtual
  * copy operations.  The caller has already fully resolved the vm_map_entry
  * and, if appropriate, has created a copy-on-write layer.  All we need to
  * do is iterate the object chain.
  *
  * On failure (fs) is unlocked and deallocated and the caller may return or
  * retry depending on the failure code.  On success (fs) is NOT unlocked or
- * deallocated, fs.m will contained a resolved, busied page, and fs.object
- * will have an additional PIP count if it is not equal to fs.first_object.
+ * deallocated, fs.m will contain a resolved, busied page, and fs.ba's
+ * object will have an additional PIP count if fs.ba is not equal to
+ * fs.first_ba.
  *
  * If locks based on fs->first_shared or fs->shared are insufficient,
  * clear the appropriate field(s) and return RETRY.  COWs require that
@@ -1692,24 +1727,23 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
  *      we will have to retry with it exclusive if the vm_page is
  *      PG_SWAPPED.
  *
- * fs->first_object must be held on call.
+ * fs->first_ba->object must be held on call.
  */
 static
 int
 vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                vm_prot_t fault_type, int allow_nofault)
 {
-       vm_object_t next_object;
+       vm_map_backing_t *next_ba;
        vm_pindex_t pindex;
        int error;
 
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
+       ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_ba->object));
        fs->prot = fs->first_prot;
-       fs->object = fs->first_object;
        pindex = first_pindex;
+       KKASSERT(fs->ba == fs->first_ba);
 
-       vm_object_chain_acquire(fs->first_object, fs->shared);
-       vm_object_pip_add(fs->first_object, 1);
+       vm_object_pip_add(fs->first_ba->object, 1);
 
        /* 
         * If a read fault occurs we try to upgrade the page protection
@@ -1744,22 +1778,15 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                        fs->prot &= ~VM_PROT_WRITE;
        }
 
-       /* vm_object_hold(fs->object); implied b/c object == first_object */
+       /* vm_object_hold(fs->ba->object); implied b/c ba == first_ba */
 
        for (;;) {
                /*
-                * The entire backing chain from first_object to object
-                * inclusive is chainlocked.
-                *
                 * If the object is dead, we stop here
                 */
-               if (fs->object->flags & OBJ_DEAD) {
-                       vm_object_pip_wakeup(fs->first_object);
-                       vm_object_chain_release_all(fs->first_object,
-                                                   fs->object);
-                       if (fs->object != fs->first_object)
-                               vm_object_drop(fs->object);
-                       unlock_and_deallocate(fs);
+               if (fs->ba->object->flags & OBJ_DEAD) {
+                       vm_object_pip_wakeup(fs->first_ba->object);
+                       unlock_things(fs);
                        return (KERN_PROTECTION_FAILURE);
                }
 
@@ -1776,19 +1803,13 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                 * around with a vm_page_t->busy page except, perhaps,
                 * to pmap it.
                 */
-               fs->m = vm_page_lookup_busy_try(fs->object, pindex,
+               fs->m = vm_page_lookup_busy_try(fs->ba->object, pindex,
                                                TRUE, &error);
                if (error) {
-                       vm_object_pip_wakeup(fs->first_object);
-                       vm_object_chain_release_all(fs->first_object,
-                                                   fs->object);
-                       if (fs->object != fs->first_object)
-                               vm_object_drop(fs->object);
+                       vm_object_pip_wakeup(fs->first_ba->object);
                        unlock_things(fs);
                        vm_page_sleep_busy(fs->m, TRUE, "vmpfw");
                        mycpu->gd_cnt.v_intrans++;
-                       /*vm_object_deallocate(fs->first_object);*/
-                       /*fs->first_object = NULL;*/
                        fs->m = NULL;
                        return (KERN_TRY_AGAIN);
                }
@@ -1807,12 +1828,8 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                                vm_page_activate(fs->m);
                                vm_page_wakeup(fs->m);
                                fs->m = NULL;
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                if (allow_nofault == 0 ||
                                    (curthread->td_flags & TDF_NOFAULT) == 0) {
                                        thread_t td;
@@ -1846,6 +1863,7 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                                        goto readrest;
                                }
                        }
+                       fs->first_ba->flags &= ~VM_MAP_BACK_EXCL_HEUR;
                        break; /* break to PAGE HAS BEEN FOUND */
                }
 
@@ -1853,44 +1871,48 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                 * Page is not resident, If this is the search termination
                 * or the pager might contain the page, allocate a new page.
                 */
-               if (TRYPAGER(fs) || fs->object == fs->first_object) {
+               if (TRYPAGER(fs) || fs->ba == fs->first_ba) {
+                       /*
+                        * If this is a SWAP object we can use the shared
+                        * lock to check existence of a swap block.  If
+                        * there isn't one we can skip to the next object.
+                        *
+                        * However, if this is the first object we allocate
+                        * a page now just in case we need to copy to it
+                        * later.
+                        */
+                       if (fs->ba != fs->first_ba &&
+                           fs->ba->object->type == OBJT_SWAP) {
+                               if (swap_pager_haspage_locked(fs->ba->object,
+                                                             pindex) == 0) {
+                                       goto next;
+                               }
+                       }
+
                        /*
                         * Allocating, must be exclusive.
                         */
-                       if (fs->object == fs->first_object &&
-                           fs->first_shared) {
+                       fs->first_ba->flags |= VM_MAP_BACK_EXCL_HEUR;
+                       if (fs->ba == fs->first_ba && fs->first_shared) {
                                fs->first_shared = 0;
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                return (KERN_TRY_AGAIN);
                        }
-                       if (fs->object != fs->first_object &&
-                           fs->shared) {
+                       if (fs->ba != fs->first_ba && fs->shared) {
                                fs->first_shared = 0;
                                fs->shared = 0;
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                return (KERN_TRY_AGAIN);
                        }
 
                        /*
                         * If the page is beyond the object size we fail
                         */
-                       if (pindex >= fs->object->size) {
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                       if (pindex >= fs->ba->object->size) {
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                return (KERN_PROTECTION_FAILURE);
                        }
 
@@ -1902,19 +1924,15 @@ vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
                         */
                        fs->m = NULL;
                        if (!vm_page_count_severe()) {
-                               fs->m = vm_page_alloc(fs->object, pindex,
-                                   ((fs->vp || fs->object->backing_object) ?
+                               fs->m = vm_page_alloc(fs->ba->object, pindex,
+                                   ((fs->vp || fs->ba->backing_ba) ?
                                        VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL :
                                        VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
                                        VM_ALLOC_USE_GD | VM_ALLOC_ZERO));
                        }
                        if (fs->m == NULL) {
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                if (allow_nofault == 0 ||
                                    (curthread->td_flags & TDF_NOFAULT) == 0) {
                                        thread_t td;
@@ -1948,9 +1966,12 @@ readrest:
                 * for us.
                 */
                if (TRYPAGER(fs)) {
-                       int rv;
-                       int seqaccess;
                        u_char behavior = vm_map_entry_behavior(fs->entry);
+                       vm_object_t object;
+                       vm_page_t first_m;
+                       int seqaccess;
+                       int ohold;
+                       int rv;
 
                        if (behavior == MAP_ENTRY_BEHAV_RANDOM)
                                seqaccess = 0;
@@ -1962,62 +1983,79 @@ readrest:
                         * pages so we can't be shared at this point either.
                         *
                         * NOTE: We can't free fs->m here in the allocated
-                        *       case (fs->object != fs->first_object) as
-                        *       this would require an exclusively locked
+                        *       case (fs->ba != fs->first_ba) as this
+                        *       would require an exclusively locked
                         *       VM object.
                         */
-                       if (fs->object == fs->first_object &&
-                           fs->first_shared) {
+                       if (fs->ba == fs->first_ba && fs->first_shared) {
                                vm_page_deactivate(fs->m);
                                vm_page_wakeup(fs->m);
                                fs->m = NULL;
                                fs->first_shared = 0;
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                return (KERN_TRY_AGAIN);
                        }
-                       if (fs->object != fs->first_object &&
-                           fs->shared) {
+                       if (fs->ba != fs->first_ba && fs->shared) {
                                vm_page_deactivate(fs->m);
                                vm_page_wakeup(fs->m);
                                fs->m = NULL;
                                fs->first_shared = 0;
                                fs->shared = 0;
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
                                return (KERN_TRY_AGAIN);
                        }
 
                        /*
-                        * Avoid deadlocking against the map when doing I/O.
-                        * fs.object and the page is BUSY'd.
+                        * Unlock the map, retaining fs->ba->object.  This
+                        * is necessary to avoid a deadlock and it will also
+                        * allow concurrent faults on the same map and ba
+                        * (albeit a bit inefficiently).
                         *
-                        * NOTE: Once unlocked, fs->entry can become stale
-                        *       so this will NULL it out.
+                        * Some fancy footwork is needed due to token
+                        * ordering.
                         *
-                        * NOTE: fs->entry is invalid until we relock the
-                        *       map and verify that the timestamp has not
-                        *       changed.
+                        * Additional footwork is needed because we are
+                        * blowing away ba vs first_ba, so fs->first_m
+                        * will not be cleaned up automatically.  Pull
+                        * it out.
+                        *
+                        * Because we unlocked the map, we will have to
+                        * return a KERN_TRY_AGAIN for any successful I/O.
                         */
+                       object = fs->ba->object;
+#if 1
+                       ohold = 0;
+                       first_m = NULL;
+#else
+                       if (fs->ba != fs->first_ba) {
+                               first_m = fs->first_m;
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               vm_object_lock_swap();
+                               vm_object_drop(fs->first_ba->object);
+                       } else {
+                               first_m = NULL;
+                       }
+                       ohold = fs->first_ba_held;
+                       fs->ba = NULL;
+                       fs->first_ba = NULL;
+                       fs->first_m = NULL;
+                       fs->first_ba_held = 0;
                        unlock_map(fs);
+#endif
+                       /* object is held, no more access to entry or ba's */
 
                        /*
-                        * Acquire the page data.  We still hold a ref on
-                        * fs.object and the page has been BUSY's.
+                        * Acquire the page data.  We still hold object
+                        * and the page has been BUSY'd.
                         *
-                        * The pager may replace the page (for example, in
-                        * order to enter a fictitious page into the
-                        * object).  If it does so it is responsible for
-                        * cleaning up the passed page and properly setting
-                        * the new page BUSY.
+                        * We own the page, but we must re-issue the lookup
+                        * because the pager may have replaced it (for example,
+                        * in order to enter a fictitious page into the
+                        * object).  In this situation the pager will have
+                        * cleaned up the old page and left the new one
+                        * busy for us.
                         *
                         * If we got here through a PG_RAM read-ahead
                         * mark the page may be partially dirty and thus
@@ -2026,37 +2064,85 @@ readrest:
                         * it anyway.  We have to depend on the get_page
                         * operation filling in any gaps whether there is
                         * backing store or not.
+                        *
+                        * We must dispose of the page (fs->m) and also
+                        * possibly first_m (the fronting layer).  If
+                        * this is a write fault leave the page intact
+                        * because we will probably have to copy fs->m
+                        * to fs->first_m on the retry.  If this is a
+                        * read fault we probably won't need the page.
                         */
-                       rv = vm_pager_get_page(fs->object, &fs->m, seqaccess);
+                       rv = vm_pager_get_page(object, &fs->m, seqaccess);
 
                        if (rv == VM_PAGER_OK) {
+                               ++fs->hardfault;
+                               fs->m = vm_page_lookup(object, pindex);
+                               if (fs->m) {
+                                       vm_page_activate(fs->m);
+                                       vm_page_wakeup(fs->m);
+                                       fs->m = NULL;
+                               }
+
                                /*
-                                * Relookup in case pager changed page. Pager
-                                * is responsible for disposition of old page
-                                * if moved.
-                                *
-                                * XXX other code segments do relookups too.
-                                * It's a bad abstraction that needs to be
-                                * fixed/removed.
+                                * first_m could be completely valid and we
+                                * got here because of a PG_RAM, don't
+                                * mistakenly free it!
                                 */
-                               fs->m = vm_page_lookup(fs->object, pindex);
-                               if (fs->m == NULL) {
-                                       vm_object_pip_wakeup(fs->first_object);
-                                       vm_object_chain_release_all(
-                                               fs->first_object, fs->object);
-                                       if (fs->object != fs->first_object)
-                                               vm_object_drop(fs->object);
-                                       unlock_and_deallocate(fs);
-                                       return (KERN_TRY_AGAIN);
+                               if (first_m) {
+                                       if ((first_m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
+                                               vm_page_wakeup(first_m);
+                                       } else if (fault_type & VM_PROT_WRITE) {
+                                               vm_page_deactivate(first_m);
+                                               vm_page_wakeup(first_m);
+                                       } else {
+                                               vm_page_free(first_m);
+                                       }
+                                       first_m = NULL;         /* safety */
                                }
-                               ++fs->hardfault;
-                               break; /* break to PAGE HAS BEEN FOUND */
+
+#if 1
+                               if (fs->m) {
+                                       /* have page */
+                                       break;
+                               }
+                               vm_object_pip_wakeup(fs->first_ba->object);
+                               unlock_things(fs);
+                               return (KERN_TRY_AGAIN);
+#else
+                               vm_object_pip_wakeup(object);
+                               unlock_things(fs);
+
+                               /*
+                                * HACK! The object is always held on call,
+                                *       but vm_fault_object_page() needs
+                                *       to leave the object held across
+                                *       the entire operation and will clear
+                                *       first_ba_held to prevent the object
+                                *       from being dropped.
+                                */
+                               if (ohold)
+                                       vm_object_drop(object);
+                               return (KERN_TRY_AGAIN);
+#endif
+                       }
+
+                       /*
+                        * If the pager doesn't have the page, continue on
+                        * to the next object.  Retain the vm_page if this
+                        * is the first object, as we may need to copy
+                        * into it later.
+                        */
+                       if (rv == VM_PAGER_FAIL) {
+                               if (fs->ba != fs->first_ba) {
+                                       vm_page_free(fs->m);
+                                       fs->m = NULL;
+                               }
+                               goto next;
                        }
 
                        /*
                         * Remove the bogus page (which does not exist at this
-                        * object/offset); before doing so, we must get back
-                        * our object lock to preserve our invariant.
+                        * object/offset).
                         *
                         * Also wake up any other process that may want to bring
                         * in this page.
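
The hunks above and below boil the pager return handling down to a simple
dispatch: VM_PAGER_OK re-looks the page up and retries, VM_PAGER_FAIL falls
through to the next backing layer, and VM_PAGER_ERROR / VM_PAGER_BAD unwind
and fail the fault.  A minimal userland sketch of that dispatch follows; the
enums and names are hypothetical stand-ins, not the kernel's types.

/*
 * Userland model (not kernel code) of the pager return-code dispatch
 * in the reworked fault loop.  The enums are hypothetical stand-ins
 * for the VM_PAGER_* and KERN_* values.
 */
#include <stdio.h>

enum pager_rv { PAGER_OK, PAGER_FAIL, PAGER_ERROR, PAGER_BAD };
enum fault_action { RETRY_FAULT, NEXT_LAYER, FAIL_FAULT, PROT_FAIL };

static enum fault_action
dispatch(enum pager_rv rv)
{
        switch (rv) {
        case PAGER_OK:
                /* I/O succeeded: re-lookup the page and retry the fault */
                return RETRY_FAULT;
        case PAGER_FAIL:
                /*
                 * No backing store at this index: free the speculative
                 * page (unless it is the first layer's COW destination)
                 * and continue the scan at the next vm_map_backing.
                 */
                return NEXT_LAYER;
        case PAGER_ERROR:
                return FAIL_FAULT;      /* maps to KERN_FAILURE */
        case PAGER_BAD:
        default:
                return PROT_FAIL;       /* maps to KERN_PROTECTION_FAILURE */
        }
}

int
main(void)
{
        printf("PAGER_FAIL -> %d (NEXT_LAYER)\n", dispatch(PAGER_FAIL));
        return 0;
}
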
@@ -2080,6 +2166,31 @@ readrest:
                                }
                        }
 
+                       /*
+                        * I/O error or data outside pager's range.
+                        */
+                       if (fs->m) {
+                               vnode_pager_freepage(fs->m);
+                               fs->m = NULL;
+                       }
+                       if (first_m) {
+                               vm_page_free(first_m);
+                               first_m = NULL;         /* safety */
+                       }
+                       vm_object_pip_wakeup(object);
+                       unlock_things(fs);
+                       if (ohold)
+                               vm_object_drop(object);
+                       switch(rv) {
+                       case VM_PAGER_ERROR:
+                               return (KERN_FAILURE);
+                       case VM_PAGER_BAD:
+                               return (KERN_PROTECTION_FAILURE);
+                       default:
+                               return (KERN_PROTECTION_FAILURE);
+                       }
+
+#if 0
                        /*
                         * Data outside the range of the pager or an I/O error
                         *
@@ -2087,26 +2198,17 @@ readrest:
                         * e.g. by the buffer cache, and cannot simply be
                         * freed.  Call vnode_pager_freepage() to deal with it.
                         *
-                        * Also note that we cannot free the page if we are
-                        * holding the related object shared. XXX not sure
-                        * what to do in that case.
+                        * The object is not held shared so we can safely
+                        * free the page.
                         */
-                       if (fs->object != fs->first_object) {
-                               /*
-                                * Scrap the page.  Check to see if the
-                                * vm_pager_get_page() call has already
-                                * dealt with it.
-                                */
-                               if (fs->m) {
-                                       vnode_pager_freepage(fs->m);
-                                       fs->m = NULL;
-                               }
+                       if (fs->ba != fs->first_ba) {
 
                                /*
                                 * XXX - we cannot just fall out at this
                                 * point, m has been freed and is invalid!
                                 */
                        }
+
                        /*
                         * XXX - the check for kernel_map is a kludge to work
                         * around having the machine panic on a kernel space
@@ -2115,28 +2217,17 @@ readrest:
                        if (((fs->map != &kernel_map) &&
                            (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) {
                                if (fs->m) {
-                                       if (fs->first_shared) {
-                                               vm_page_deactivate(fs->m);
-                                               vm_page_wakeup(fs->m);
-                                       } else {
-                                               vnode_pager_freepage(fs->m);
-                                       }
+                                       /* from just above */
+                                       KKASSERT(fs->first_shared == 0);
+                                       vnode_pager_freepage(fs->m);
                                        fs->m = NULL;
                                }
-                               vm_object_pip_wakeup(fs->first_object);
-                               vm_object_chain_release_all(fs->first_object,
-                                                           fs->object);
-                               if (fs->object != fs->first_object)
-                                       vm_object_drop(fs->object);
-                               unlock_and_deallocate(fs);
-                               if (rv == VM_PAGER_ERROR)
-                                       return (KERN_FAILURE);
-                               else
-                                       return (KERN_PROTECTION_FAILURE);
                                /* NOT REACHED */
                        }
+#endif
                }
 
+next:
                /*
                 * We get here if the object has a default pager (or unwiring) 
                 * or the pager doesn't have the page.
@@ -2145,7 +2236,7 @@ readrest:
                 * deeper page to be mapped read-only, in which case the
                 * unlock*(fs) will free first_m.
                 */
-               if (fs->object == fs->first_object)
+               if (fs->ba == fs->first_ba)
                        fs->first_m = fs->m;
 
                /*
@@ -2155,40 +2246,24 @@ readrest:
                 * The object lock for the next object is governed by
                 * fs->shared.
                 */
-               if ((next_object = fs->object->backing_object) != NULL) {
+               if ((next_ba = fs->ba->backing_ba) != NULL) {
                        if (fs->shared)
-                               vm_object_hold_shared(next_object);
+                               vm_object_hold_shared(next_ba->object);
                        else
-                               vm_object_hold(next_object);
-                       vm_object_chain_acquire(next_object, fs->shared);
-                       KKASSERT(next_object == fs->object->backing_object);
-                       pindex += OFF_TO_IDX(fs->object->backing_object_offset);
+                               vm_object_hold(next_ba->object);
+                       KKASSERT(next_ba == fs->ba->backing_ba);
+                       pindex += OFF_TO_IDX(next_ba->offset);
                }
 
-               if (next_object == NULL) {
+               if (next_ba == NULL) {
                        /*
                         * If there's no object left, fill the page in the top
                         * object with zeros.
                         */
-                       if (fs->object != fs->first_object) {
-#if 0
-                               if (fs->first_object->backing_object !=
-                                   fs->object) {
-                                       vm_object_hold(fs->first_object->backing_object);
-                               }
-#endif
-                               vm_object_chain_release_all(
-                                       fs->first_object->backing_object,
-                                       fs->object);
-#if 0
-                               if (fs->first_object->backing_object !=
-                                   fs->object) {
-                                       vm_object_drop(fs->first_object->backing_object);
-                               }
-#endif
-                               vm_object_pip_wakeup(fs->object);
-                               vm_object_drop(fs->object);
-                               fs->object = fs->first_object;
+                       if (fs->ba != fs->first_ba) {
+                               vm_object_pip_wakeup(fs->ba->object);
+                               vm_object_drop(fs->ba->object);
+                               fs->ba = fs->first_ba;
                                pindex = first_pindex;
                                fs->m = fs->first_m;
                        }
@@ -2202,15 +2277,13 @@ readrest:
                        fs->m->valid = VM_PAGE_BITS_ALL;
                        break;  /* break to PAGE HAS BEEN FOUND */
                }
-               if (fs->object != fs->first_object) {
-                       vm_object_pip_wakeup(fs->object);
-                       vm_object_lock_swap();
-                       vm_object_drop(fs->object);
+               if (fs->ba != fs->first_ba) {
+                       vm_object_pip_wakeup(fs->ba->object);
+                       vm_object_lock_swap();  /* flip ba/next_ba */
+                       vm_object_drop(fs->ba->object);
                }
-               KASSERT(fs->object != next_object,
-                       ("object loop %p", next_object));
-               fs->object = next_object;
-               vm_object_pip_add(fs->object, 1);
+               fs->ba = next_ba;
+               vm_object_pip_add(next_ba->object, 1);
        }
 
        /*
@@ -2229,11 +2302,13 @@ readrest:
        KASSERT((fs->m->busy_count & PBUSY_LOCKED) != 0,
                ("vm_fault: not busy after main loop"));
 
-       if (fs->object != fs->first_object) {
+       if (fs->ba != fs->first_ba) {
                /*
                 * We only really need to copy if we want to write it.
                 */
                if (fault_type & VM_PROT_WRITE) {
+#if 0
+                       /* CODE REFACTOR IN PROGRESS, REMOVE OPTIMIZATION */
                        /*
                         * This allows pages to be virtually copied from a 
                         * backing_object into the first_object, where the 
@@ -2258,13 +2333,16 @@ readrest:
                                 */
                                vm_page_protect(fs->first_m, VM_PROT_NONE);
                                vm_page_remove(fs->first_m);
-                               vm_page_rename(fs->m, fs->first_object,
+                               vm_page_rename(fs->m,
+                                              fs->first_ba->object,
                                               first_pindex);
                                vm_page_free(fs->first_m);
                                fs->first_m = fs->m;
                                fs->m = NULL;
                                mycpu->gd_cnt.v_cow_optim++;
-                       } else {
+                       } else
+#endif
+                       {
                                /*
                                 * Oh, well, lets copy it.
                                 *
@@ -2304,39 +2382,24 @@ readrest:
                                release_page(fs);
 
                        /*
-                        * We intend to revert to first_object, undo the
-                        * chain lock through to that.
+                        * fs->ba != fs->first_ba due to above conditional
                         */
-#if 0
-                       if (fs->first_object->backing_object != fs->object)
-                               vm_object_hold(fs->first_object->backing_object);
-#endif
-                       vm_object_chain_release_all(
-                                       fs->first_object->backing_object,
-                                       fs->object);
-#if 0
-                       if (fs->first_object->backing_object != fs->object)
-                               vm_object_drop(fs->first_object->backing_object);
-#endif
-
-                       /*
-                        * fs->object != fs->first_object due to above 
-                        * conditional
-                        */
-                       vm_object_pip_wakeup(fs->object);
-                       vm_object_drop(fs->object);
+                       vm_object_pip_wakeup(fs->ba->object);
+                       vm_object_drop(fs->ba->object);
+                       fs->ba = fs->first_ba;
 
                        /*
                         * Only use the new page below...
                         */
                        mycpu->gd_cnt.v_cow_faults++;
                        fs->m = fs->first_m;
-                       fs->object = fs->first_object;
                        pindex = first_pindex;
                } else {
                        /*
                         * If it wasn't a write fault avoid having to copy
-                        * the page by mapping it read-only.
+                        * the page by mapping it read-only from backing
+                        * store.  The process is not allowed to modify
+                        * backing pages.
                         */
                        fs->prot &= ~VM_PROT_WRITE;
                }
@@ -2353,19 +2416,18 @@ readrest:
         * NOTE: The relock_map() can fail due to a deadlock against
         *       the vm_page we are holding BUSY.
         */
+       KKASSERT(fs->lookup_still_valid == TRUE);
+#if 0
        if (fs->lookup_still_valid == FALSE && fs->map) {
                if (relock_map(fs) ||
                    fs->map->timestamp != fs->map_generation) {
                        release_page(fs);
-                       vm_object_pip_wakeup(fs->first_object);
-                       vm_object_chain_release_all(fs->first_object,
-                                                   fs->object);
-                       if (fs->object != fs->first_object)
-                               vm_object_drop(fs->object);
-                       unlock_and_deallocate(fs);
+                       vm_object_pip_wakeup(fs->first_ba->object);
+                       unlock_things(fs);
                        return (KERN_TRY_AGAIN);
                }
        }
+#endif
 
        /*
         * If the fault is a write, we know that this page is being
@@ -2395,22 +2457,18 @@ readrest:
                                 * exclusive object lock.  If we are shared,
                                 * we must clear the shared flag and retry.
                                 */
-                               if ((fs->object == fs->first_object &&
+                               if ((fs->ba == fs->first_ba &&
                                     fs->first_shared) ||
-                                   (fs->object != fs->first_object &&
-                                    fs->shared)) {
+                                   (fs->ba != fs->first_ba && fs->shared)) {
                                        vm_page_wakeup(fs->m);
                                        fs->m = NULL;
-                                       if (fs->object == fs->first_object)
+                                       if (fs->ba == fs->first_ba)
                                                fs->first_shared = 0;
                                        else
                                                fs->shared = 0;
-                                       vm_object_pip_wakeup(fs->first_object);
-                                       vm_object_chain_release_all(
-                                               fs->first_object, fs->object);
-                                       if (fs->object != fs->first_object)
-                                               vm_object_drop(fs->object);
-                                       unlock_and_deallocate(fs);
+                                       vm_object_pip_wakeup(
+                                                       fs->first_ba->object);
+                                       unlock_things(fs);
                                        return (KERN_TRY_AGAIN);
                                }
                                swap_pager_unswapped(fs->m);
@@ -2418,15 +2476,21 @@ readrest:
                }
        }
 
-       vm_object_pip_wakeup(fs->first_object);
-       vm_object_chain_release_all(fs->first_object, fs->object);
-       if (fs->object != fs->first_object)
-               vm_object_drop(fs->object);
+       /*
+        * We found our page at backing layer ba.  Leave the layer state
+        * intact.
+        */
+
+       vm_object_pip_wakeup(fs->first_ba->object);
+#if 0
+       if (fs->ba != fs->first_ba)
+               vm_object_drop(fs->ba->object);
+#endif
 
        /*
         * Page had better still be busy.  We are still locked up and 
-        * fs->object will have another PIP reference if it is not equal
-        * to fs->first_object.
+        * fs->ba->object will have another PIP reference for the case
+        * where fs->ba != fs->first_ba.
         */
        KASSERT(fs->m->busy_count & PBUSY_LOCKED,
                ("vm_fault: page %p not busy!", fs->m));
@@ -2483,9 +2547,9 @@ vm_fault_wire(vm_map_t map, vm_map_entry_t entry,
        switch(entry->maptype) {
        case VM_MAPTYPE_NORMAL:
        case VM_MAPTYPE_VPAGETABLE:
-               fictitious = entry->object.vm_object &&
-                           ((entry->object.vm_object->type == OBJT_DEVICE) ||
-                            (entry->object.vm_object->type == OBJT_MGTDEVICE));
+               fictitious = entry->ba.object &&
+                           ((entry->ba.object->type == OBJT_DEVICE) ||
+                            (entry->ba.object->type == OBJT_MGTDEVICE));
                break;
        case VM_MAPTYPE_UKSMAP:
                fictitious = TRUE;
@@ -2543,9 +2607,9 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
        pmap = vm_map_pmap(map);
        start = entry->start;
        end = entry->end;
-       fictitious = entry->object.vm_object &&
-                       ((entry->object.vm_object->type == OBJT_DEVICE) ||
-                        (entry->object.vm_object->type == OBJT_MGTDEVICE));
+       fictitious = entry->ba.object &&
+                       ((entry->ba.object->type == OBJT_DEVICE) ||
+                        (entry->ba.object->type == OBJT_MGTDEVICE));
        if (entry->eflags & MAP_ENTRY_KSTACK)
                start += PAGE_SIZE;
 
@@ -2588,15 +2652,15 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
        vm_page_t dst_m;
        vm_page_t src_m;
 
-       src_object = src_entry->object.vm_object;
-       src_offset = src_entry->offset;
+       src_object = src_entry->ba.object;
+       src_offset = src_entry->ba.offset;
 
        /*
         * Create the top-level object for the destination entry. (Doesn't
         * actually shadow anything - we copy the pages directly.)
         */
        vm_map_entry_allocate_object(dst_entry);
-       dst_object = dst_entry->object.vm_object;
+       dst_object = dst_entry->ba.object;
 
        prot = dst_entry->max_protection;
 
@@ -2861,6 +2925,7 @@ static void
 vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
            int fault_flags)
 {
+       vm_map_backing_t *ba;   /* first ba */
        struct lwp *lp;
        vm_page_t m;
        vm_offset_t addr;
@@ -2897,22 +2962,23 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
        if (maxpages > 1024)
                maxpages = 1024;
 
-       object = entry->object.vm_object;
+       ba = &entry->ba;
+       object = entry->ba.object;
        KKASSERT(object != NULL);
-       KKASSERT(object == entry->object.vm_object);
 
        /*
         * NOTE: VM_FAULT_DIRTY allowed later so must hold object exclusively
         *       now (or do something more complex XXX).
         */
        vm_object_hold(object);
-       vm_object_chain_acquire(object, 0);
 
        noneg = 0;
        nopos = 0;
        for (i = 0; i < maxpages; ++i) {
                vm_object_t lobject;
                vm_object_t nobject;
+               vm_map_backing_t *last_ba;      /* last ba */
+               vm_map_backing_t *next_ba;      /* next ba */
                int allocated = 0;
                int error;
 
@@ -2965,7 +3031,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                }
 
                /*
-                * Follow the VM object chain to obtain the page to be mapped
+                * Follow the backing layers to obtain the page to be mapped
                 * into the pmap.
                 *
                 * If we reach the terminal object without finding a page
@@ -2977,19 +3043,20 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                 * we stop if any non-default object is encountered.  e.g.
                 * a vnode or swap object would stop the loop.
                 */
-               index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
+               index = ((addr - entry->start) + entry->ba.offset) >>
+                       PAGE_SHIFT;
+               last_ba = ba;
                lobject = object;
                pindex = index;
                pprot = prot;
 
-               KKASSERT(lobject == entry->object.vm_object);
                /*vm_object_hold(lobject); implied */
 
                while ((m = vm_page_lookup_busy_try(lobject, pindex,
                                                    TRUE, &error)) == NULL) {
                        if (lobject->type != OBJT_DEFAULT)
                                break;
-                       if (lobject->backing_object == NULL) {
+                       if ((next_ba = last_ba->backing_ba) == NULL) {
                                if (vm_fast_fault == 0)
                                        break;
                                if ((prot & VM_PROT_WRITE) == 0 ||
@@ -3012,19 +3079,18 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                                /* lobject = object .. not needed */
                                break;
                        }
-                       if (lobject->backing_object_offset & PAGE_MASK)
+                       if (next_ba->offset & PAGE_MASK)
                                break;
-                       nobject = lobject->backing_object;
+                       nobject = next_ba->object;
                        vm_object_hold(nobject);
-                       KKASSERT(nobject == lobject->backing_object);
-                       pindex += lobject->backing_object_offset >> PAGE_SHIFT;
-                       if (lobject != object) {
+                       pindex += next_ba->offset >> PAGE_SHIFT;
+                       if (last_ba != ba) {
                                vm_object_lock_swap();
                                vm_object_drop(lobject);
                        }
                        lobject = nobject;
+                       last_ba = next_ba;
                        pprot &= ~VM_PROT_WRITE;
-                       vm_object_chain_acquire(lobject, 0);
                }
 
                /*
@@ -3035,19 +3101,8 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                 * Give-up if no page is available.
                 */
                if (m == NULL) {
-                       if (lobject != object) {
-#if 0
-                               if (object->backing_object != lobject)
-                                       vm_object_hold(object->backing_object);
-#endif
-                               vm_object_chain_release_all(
-                                       object->backing_object, lobject);
-#if 0
-                               if (object->backing_object != lobject)
-                                       vm_object_drop(object->backing_object);
-#endif
+                       if (last_ba != ba)
                                vm_object_drop(lobject);
-                       }
                        break;
                }
 
@@ -3070,19 +3125,8 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                 * the gravy train should be low (since accesses will likely
                 * be I/O bound anyway).
                 */
-               if (lobject != object) {
-#if 0
-                       if (object->backing_object != lobject)
-                               vm_object_hold(object->backing_object);
-#endif
-                       vm_object_chain_release_all(object->backing_object,
-                                                   lobject);
-#if 0
-                       if (object->backing_object != lobject)
-                               vm_object_drop(object->backing_object);
-#endif
+               if (last_ba != ba)
                        vm_object_drop(lobject);
-               }
 
                /*
                 * Enter the page into the pmap if appropriate.  If we had
@@ -3149,7 +3193,6 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                        vm_page_wakeup(m);
                }
        }
-       vm_object_chain_release(object);
        vm_object_drop(object);
 }
 
@@ -3187,8 +3230,8 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
        lp = curthread->td_lwp;
        if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
                return;
-       object = entry->object.vm_object;
-       if (object->backing_object != NULL)
+       object = entry->ba.object;
+       if (entry->ba.backing_ba != NULL)
                return;
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 
@@ -3240,7 +3283,8 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
                 * WARNING!  We cannot call swap_pager_unswapped() or insert
                 *           a new vm_page with a shared token.
                 */
-               pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
+               pindex = ((addr - entry->start) + entry->ba.offset) >>
+                        PAGE_SHIFT;
 
                /*
                 * Skip pages already mapped, and stop scanning in that
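
Both the fault and prefault paths above compute the page index the same way:
start from the entry's own ba.offset and then add each deeper
vm_map_backing's offset as the chain is descended.  A small userland model
of that arithmetic follows, assuming page-aligned layer offsets and a
simplified layer struct rather than the kernel's vm_map_backing.

/*
 * Userland model (not kernel code) of the page-index arithmetic the
 * fault and prefault paths perform against the vm_map_backing chain.
 * "struct layer" is a hypothetical stand-in holding only the fields
 * the arithmetic needs.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12

struct layer {
        struct layer    *backing;       /* deeper layer, like backing_ba */
        uint64_t        offset;         /* byte offset into this layer */
};

static void
walk(uint64_t addr, uint64_t start, struct layer *top)
{
        /* index into the top layer, as in the entry->ba.offset case */
        uint64_t pindex = ((addr - start) + top->offset) >> PAGE_SHIFT;
        struct layer *ba;
        int depth = 0;

        for (ba = top; ba != NULL; ba = ba->backing) {
                printf("layer %d: pindex %llu\n", depth++,
                       (unsigned long long)pindex);
                /* descend: add the deeper layer's offset, in pages */
                if (ba->backing)
                        pindex += ba->backing->offset >> PAGE_SHIFT;
        }
}

int
main(void)
{
        struct layer deep = { NULL, 4 * 4096 };         /* 4 pages further in */
        struct layer top  = { &deep, 1 * 4096 };

        walk(0x203000, 0x200000, &top);         /* prints pindex 4, then 8 */
        return 0;
}
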
index 2330f56..b996a51 100644 (file)
@@ -125,6 +125,7 @@ static void vmspace_dtor(void *obj, void *privdata);
 static void vmspace_terminate(struct vmspace *vm, int final);
 
 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
+MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
 static struct objcache *vmspace_cache;
 
 /*
@@ -254,7 +255,7 @@ vmspace_dtor(void *obj, void *privdata)
 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 
-/* a->start is address, and the only field has to be initialized */
+/* a->start is address, and the only field which must be initialized */
 static int
 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
 {
@@ -547,7 +548,7 @@ vmspace_swap_count(struct vmspace *vm)
                switch(cur->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       if ((object = cur->object.vm_object) == NULL)
+                       if ((object = cur->ba.object) == NULL)
                                break;
                        if (object->swblock_count) {
                                n = (cur->end - cur->start) / PAGE_SIZE;
@@ -584,7 +585,7 @@ vmspace_anonymous_count(struct vmspace *vm)
                switch(cur->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       if ((object = cur->object.vm_object) == NULL)
+                       if ((object = cur->ba.object) == NULL)
                                break;
                        if (object->type != OBJT_DEFAULT &&
                            object->type != OBJT_SWAP) {
@@ -692,14 +693,17 @@ vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
 }
 
 /*
- * Shadow the vm_map_entry's object.  This typically needs to be done when
- * a write fault is taken on an entry which had previously been cloned by
- * fork().  The shared object (which might be NULL) must become private so
- * we add a shadow layer above it.
+ * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
+ * object in the entry for COW faults.
  *
- * Object allocation for anonymous mappings is defered as long as possible.
- * When creating a shadow, however, the underlying object must be instantiated
- * so it can be shared.
+ * The entire chain including entry->ba (prior to inserting the fronting
+ * object) essentially becomes set in stone... elements of it can be paged
+ * in or out, but cannot be further modified.
+ *
+ * NOTE: If we do not optimize the backing chain then a unique copy is not
+ *      needed.  Note, however, that because portions of the chain are
+ *      shared across pmaps we cannot make any changes to the vm_map_backing
+ *      elements themselves.
  *
  * If the map segment is governed by a virtual page table then it is
  * possible to address offsets beyond the mapped area.  Just allocate
@@ -708,7 +712,7 @@ vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
  * If addref is non-zero an additional reference is added to the returned
  * entry.  This mechanic exists because the additional reference might have
  * to be added atomically and not after return to prevent a premature
- * collapse.
+ * collapse.  XXX currently there is no collapse code.
  *
  * The vm_map must be exclusively locked.
  * No other requirements.
@@ -717,13 +721,132 @@ static
 void
 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
 {
-       if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
-               vm_object_shadow(&entry->object.vm_object, &entry->offset,
-                                0x7FFFFFFF, addref);   /* XXX */
-       } else {
-               vm_object_shadow(&entry->object.vm_object, &entry->offset,
-                                atop(entry->end - entry->start), addref);
+       vm_map_backing_t *ba;
+       vm_size_t length;
+       vm_object_t source;
+       vm_object_t result;
+       int drop_source;
+
+       if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
+               length = 0x7FFFFFFF;
+       else
+               length = atop(entry->end - entry->start);
+       ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
+
+       /*
+        * The ref on source is inherited when we move it into the ba.
+        */
+       source = entry->ba.object;
+
+       /*
+        * Don't create the new object if the old object isn't shared.
+        * We hold the source while testing whether it is shared and
+        * before adding any reference, to avoid racing a deallocation.
+        *
+        * Clear OBJ_ONEMAPPING flag when shadowing.
+        *
+        * The caller owns a ref on source (via entry->ba.object) which we
+        * are going to push down.  This ref is inherited by the backing_ba
+        * assignment and does not need to be incremented here.
+        *
+        * However, we add a temporary extra reference to the original source
+        * prior to holding nobject in case we block, to avoid races where
+        * someone else might believe that the source can be collapsed.
+        */
+       drop_source = 0;
+       if (source) {
+               if (source->type != OBJT_VNODE) {
+                       vm_object_hold(source);
+                       if (source->ref_count == 1 &&
+                           source->handle == NULL &&
+                           (source->type == OBJT_DEFAULT ||
+                            source->type == OBJT_SWAP)) {
+                               if (addref) {
+                                       vm_object_reference_locked(source);
+                                       vm_object_clear_flag(source,
+                                                            OBJ_ONEMAPPING);
+                               }
+                               vm_object_drop(source);
+                               kfree(ba, M_MAP_BACKING);
+                               goto done;
+                       }
+                       /*vm_object_reference_locked(source);*/
+                       vm_object_clear_flag(source, OBJ_ONEMAPPING);
+                       drop_source = 1;        /* drop source at end */
+               } else {
+                       /*vm_object_reference_quick(source);*/
+                       vm_object_clear_flag(source, OBJ_ONEMAPPING);
+               }
+       }
+
+       /*
+        * Allocate a new object with the given length.  The new object
+        * is returned referenced but we may have to add another one.
+        * If we are adding a second reference we must clear OBJ_ONEMAPPING
+        * (typically because the caller is about to clone a vm_map_entry).
+        *
+        * The source object, if not a vnode, is held (not referenced)
+        * while we set up the new backing chain; the hold is dropped at
+        * the end of this routine.
+        *
+        * The target object may require a second reference if asked for one
+        * by the caller.
+        */
+       result = vm_object_allocate(OBJT_DEFAULT, length);
+       if (result == NULL)
+               panic("vm_map_entry_shadow: no object for shadowing");
+       vm_object_hold(result);
+       if (addref) {
+               vm_object_reference_locked(result);
+               vm_object_clear_flag(result, OBJ_ONEMAPPING);
+       }
+
+       /*
+        * The new object shadows the source object.
+        *
+        * Try to optimize the result object's page color when shadowing
+        * in order to maintain page coloring consistency in the combined
+        * shadowed object.
+        *
+        * The source object is moved to ba, retaining its existing ref-count.
+        * No additional ref is needed.
+        *
+        * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
+        */
+       *ba = entry->ba;                /* previous ba */
+       ba->refs = 1;                   /* initialize ref count */
+       entry->ba.object = result;      /* new ba (at head of entry) */
+       entry->ba.backing_ba = ba;
+       entry->ba.offset = 0;
+       entry->ba.refs = 0;
+
+       if (source) {
+#if 0
+               /* shadowing no longer messes with generation count */
+               if (drop_source) {
+                       atomic_add_int(&source->generation, 1);
+                       vm_object_set_flag(result, OBJ_ONSHADOW);
+               }
+#endif
+               /* cpu localization twist */
+               result->pg_color = vm_quickcolor();
+       }
+
+       /*
+        * Drop our holds on result and source before returning.
+        */
+       vm_object_drop(result);
+       if (source) {
+               if (drop_source) {
+                       /*vm_object_deallocate_locked(source);*/
+                       vm_object_drop(source);
+               } else {
+                       /*vm_object_deallocate(source);*/
+               }
        }
+
+done:
        entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 }
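
The shadow operation above is essentially a push-down on the embedded
backing structure: the current head is copied into a heap node, linked in
as backing_ba, and a fresh anonymous object is installed at the head with
offset 0.  A simplified userland model follows; the types and
alloc_anon_object() are hypothetical stand-ins, and the not-shared and
vnode special cases of the real routine are omitted.

/*
 * Userland model (not kernel code) of the push-down performed by
 * vm_map_entry_shadow().
 */
#include <stdlib.h>
#include <string.h>

struct object {
        long            length;         /* stand-in for a vm_object */
};

struct backing {
        struct backing  *backing_ba;    /* deeper layer */
        struct object   *object;
        long            offset;
        long            refs;
};

struct map_entry {
        struct backing  ba;             /* embedded head, like entry->ba */
};

/* stand-in for vm_object_allocate(OBJT_DEFAULT, length) */
static struct object *
alloc_anon_object(long length)
{
        struct object *obj = malloc(sizeof(*obj));

        obj->length = length;
        return obj;
}

static void
entry_shadow(struct map_entry *entry, long length)
{
        struct backing *ba = malloc(sizeof(*ba));

        *ba = entry->ba;                /* old head becomes a chain node */
        ba->refs = 1;                   /* the node is now refcounted */

        entry->ba.object = alloc_anon_object(length);
        entry->ba.backing_ba = ba;      /* new head fronts the old one */
        entry->ba.offset = 0;
        entry->ba.refs = 0;             /* embedded head is never counted */
}

int
main(void)
{
        struct map_entry entry;

        memset(&entry, 0, sizeof(entry));
        entry.ba.object = alloc_anon_object(16);        /* existing anon obj */
        entry_shadow(&entry, 16);
        /* entry.ba.object is now the fronting object; the original is
         * reachable via entry.ba.backing_ba->object. */
        return 0;
}
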
 
@@ -746,14 +869,25 @@ vm_map_entry_allocate_object(vm_map_entry_t entry)
 {
        vm_object_t obj;
 
+       /*
+        * ba.offset is added cumulatively in the backing_ba scan, so we
+        * can only reset it to zero if ba.backing_ba is NULL.  We reset
+        * it to 0 only for debugging convenience.
+        *
+        * ba.offset cannot otherwise be modified because it affects
+        * the offsets for the entire backing_ba chain.
+        */
+       if (entry->ba.backing_ba == NULL)
+               entry->ba.offset = 0;
+
        if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
        } else {
                obj = vm_object_allocate(OBJT_DEFAULT,
-                                        atop(entry->end - entry->start));
+                                        atop(entry->end - entry->start) +
+                                        OFF_TO_IDX(entry->ba.offset));
        }
-       entry->object.vm_object = obj;
-       entry->offset = 0;
+       entry->ba.object = obj;
 }
 
 /*
@@ -944,6 +1078,32 @@ static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
 {
        struct globaldata *gd = mycpu;
+       vm_map_backing_t *ba;
+       long refs;
+
+       /*
+        * We must also dispose of the vm_map_backing, kfree()ing the
+        * possibly shared element on the 1->0 transition.  We only iterate
+        * to the next backing_ba when the previous one went through a
+        * 1->0 transition.
+        */
+       while ((ba = entry->ba.backing_ba) != NULL) {
+               refs = atomic_fetchadd_long(&ba->refs, -1);
+               if (refs > 1)
+                       break;
+               KKASSERT(refs == 1);    /* transitioned 1->0 */
+               if (ba->object)
+                       vm_object_deallocate(ba->object);
+               entry->ba.backing_ba = ba->backing_ba;
+               kfree(ba, M_MAP_BACKING);
+       }
+
+       /*
+        * Cleanup for safety.
+        */
+       entry->ba.backing_ba = NULL;
+       entry->ba.object = NULL;
+       entry->ba.offset = 0;
 
        ++*countp;
        crit_enter();
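
Disposal of the chain is driven purely by the per-node ref count: a node is
freed only on the 1->0 transition and the walk stops at the first node that
is still shared.  A userland model of that rule follows, using C11 atomics
in place of the kernel's atomic_fetchadd_long and omitting the vm_object
deallocation.

/*
 * Userland model (not kernel code) of the 1->0 disposal rule for the
 * backing chain.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct backing {
        struct backing  *backing_ba;
        atomic_long     refs;
};

static void
dispose_chain(struct backing **headp)
{
        struct backing *ba;
        long refs;

        while ((ba = *headp) != NULL) {
                /* fetch-and-sub returns the previous value */
                refs = atomic_fetch_sub(&ba->refs, 1);
                if (refs > 1)
                        break;          /* still shared by another entry */
                *headp = ba->backing_ba;
                free(ba);               /* transitioned 1->0 */
        }
        *headp = NULL;                  /* mirror the kernel's safety reset */
}

int
main(void)
{
        struct backing *deep = malloc(sizeof(*deep));
        struct backing *top = malloc(sizeof(*top));

        deep->backing_ba = NULL;
        atomic_init(&deep->refs, 2);    /* still shared with a sibling */
        top->backing_ba = deep;
        atomic_init(&top->refs, 1);     /* only this chain references it */

        dispose_chain(&top);            /* frees top, leaves deep alive */
        printf("deep refs now %ld\n", (long)atomic_load(&deep->refs));
        return 0;
}
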
@@ -1049,8 +1209,8 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
        vm_map_entry_t next;
        vm_map_entry_t temp_entry;
        vm_eflags_t protoeflags;
-       int must_drop = 0;
        vm_object_t object;
+       int must_drop = 0;
 
        if (maptype == VM_MAPTYPE_UKSMAP)
                object = NULL;
@@ -1130,9 +1290,10 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
                 (prev_entry->id == id) &&
                 prev_entry->maptype == maptype &&
                 maptype == VM_MAPTYPE_NORMAL &&
-                ((prev_entry->object.vm_object == NULL) ||
-                 vm_object_coalesce(prev_entry->object.vm_object,
-                                    OFF_TO_IDX(prev_entry->offset),
+                prev_entry->ba.backing_ba == NULL &&   /* not backed */
+                ((prev_entry->ba.object == NULL) ||
+                 vm_object_coalesce(prev_entry->ba.object,
+                                    OFF_TO_IDX(prev_entry->ba.offset),
                                     (vm_size_t)(prev_entry->end - prev_entry->start),
                                     (vm_size_t)(end - prev_entry->end)))) {
                /*
@@ -1155,18 +1316,16 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
                 * map entry, we have to create a new map entry.  We
                 * must bump the ref count on the extended object to
                 * account for it.  object may be NULL.
-                *
-                * XXX if object is NULL should we set offset to 0 here ?
                 */
-               object = prev_entry->object.vm_object;
-               offset = prev_entry->offset +
+               object = prev_entry->ba.object;
+               offset = prev_entry->ba.offset +
                        (prev_entry->end - prev_entry->start);
                if (object) {
                        vm_object_hold(object);
-                       vm_object_chain_wait(object, 0);
+                       vm_object_lock_swap(); /* map->token order */
                        vm_object_reference_locked(object);
-                       must_drop = 1;
                        map_object = object;
+                       must_drop = 1;
                }
        }
 
@@ -1179,7 +1338,6 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
        /*
         * Create a new entry
         */
-
        new_entry = vm_map_entry_create(map, countp);
        new_entry->start = start;
        new_entry->end = end;
@@ -1187,10 +1345,13 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
 
        new_entry->maptype = maptype;
        new_entry->eflags = protoeflags;
-       new_entry->object.map_object = map_object;
        new_entry->aux.master_pde = 0;          /* in case size is different */
        new_entry->aux.map_aux = map_aux;
-       new_entry->offset = offset;
+       new_entry->ba.map_object = map_object;
+       new_entry->ba.backing_ba = NULL;
+       new_entry->ba.offset = offset;
+       new_entry->ba.refs = 0;
+       new_entry->ba.flags = 0;
 
        new_entry->inheritance = VM_INHERIT_DEFAULT;
        new_entry->protection = prot;
@@ -1242,10 +1403,10 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
                        vm_object_lock_swap();
                }
        }
+       lwkt_reltoken(&map->token);
        if (must_drop)
                vm_object_drop(object);
 
-       lwkt_reltoken(&map->token);
        return (KERN_SUCCESS);
 }
 
@@ -1471,9 +1632,10 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
                prevsize = prev->end - prev->start;
                if ( (prev->end == entry->start) &&
                     (prev->maptype == entry->maptype) &&
-                    (prev->object.vm_object == entry->object.vm_object) &&
-                    (!prev->object.vm_object ||
-                       (prev->offset + prevsize == entry->offset)) &&
+                    (prev->ba.object == entry->ba.object) &&
+                    (prev->ba.backing_ba == entry->ba.backing_ba) &&
+                    (!prev->ba.object ||
+                       (prev->ba.offset + prevsize == entry->ba.offset)) &&
                     (prev->eflags == entry->eflags) &&
                     (prev->protection == entry->protection) &&
                     (prev->max_protection == entry->max_protection) &&
@@ -1482,9 +1644,9 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
                     (prev->wired_count == entry->wired_count)) {
                        vm_map_entry_unlink(map, prev);
                        entry->start = prev->start;
-                       entry->offset = prev->offset;
-                       if (prev->object.vm_object)
-                               vm_object_deallocate(prev->object.vm_object);
+                       entry->ba.offset = prev->ba.offset;
+                       if (prev->ba.object)
+                               vm_object_deallocate(prev->ba.object);
                        vm_map_entry_dispose(map, prev, countp);
                }
        }
@@ -1494,9 +1656,10 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
                esize = entry->end - entry->start;
                if ((entry->end == next->start) &&
                    (next->maptype == entry->maptype) &&
-                   (next->object.vm_object == entry->object.vm_object) &&
-                    (!entry->object.vm_object ||
-                       (entry->offset + esize == next->offset)) &&
+                   (next->ba.object == entry->ba.object) &&
+                   (next->ba.backing_ba == entry->ba.backing_ba) &&
+                    (!entry->ba.object ||
+                       (entry->ba.offset + esize == next->ba.offset)) &&
                    (next->eflags == entry->eflags) &&
                    (next->protection == entry->protection) &&
                    (next->max_protection == entry->max_protection) &&
@@ -1505,8 +1668,8 @@ vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
                    (next->wired_count == entry->wired_count)) {
                        vm_map_entry_unlink(map, next);
                        entry->end = next->end;
-                       if (next->object.vm_object)
-                               vm_object_deallocate(next->object.vm_object);
+                       if (next->ba.object)
+                               vm_object_deallocate(next->ba.object);
                        vm_map_entry_dispose(map, next, countp);
                }
        }
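
The merge test above gains one condition: adjacent entries may only be
coalesced when they reference the exact same backing chain node, in
addition to the usual matching head object and contiguous offsets.  A
reduced userland model of just that predicate follows (protection, eflags
and wiring checks omitted; the types are hypothetical simplifications).

/*
 * Userland model (not kernel code) of the backing-aware merge test.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct backing {
        struct backing  *backing_ba;
        void            *object;
        uint64_t        offset;
};

struct map_entry {
        uint64_t        start;
        uint64_t        end;
        struct backing  ba;
};

static bool
can_merge(const struct map_entry *prev, const struct map_entry *entry)
{
        uint64_t prevsize = prev->end - prev->start;

        return (prev->end == entry->start &&
                prev->ba.object == entry->ba.object &&
                prev->ba.backing_ba == entry->ba.backing_ba &&
                (prev->ba.object == NULL ||
                 prev->ba.offset + prevsize == entry->ba.offset));
}

int
main(void)
{
        static int dummy_object;        /* placeholder for a shared object */
        struct map_entry a = { 0x1000, 0x2000, { NULL, &dummy_object, 0x0000 } };
        struct map_entry b = { 0x2000, 0x3000, { NULL, &dummy_object, 0x1000 } };

        printf("mergeable: %d\n", can_merge(&a, &b));   /* prints 1 */
        return 0;
}
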
@@ -1548,7 +1711,7 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
         * map.  This is a bit of a hack, but is also about the best place to
         * put this improvement.
         */
-       if (entry->object.vm_object == NULL && !map->system_map &&
+       if (entry->ba.object == NULL && !map->system_map &&
            VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
                vm_map_entry_allocate_object(entry);
        }
@@ -1557,19 +1720,20 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
        *new_entry = *entry;
 
        new_entry->end = start;
-       entry->offset += (start - entry->start);
+       entry->ba.offset += (start - entry->start);
        entry->start = start;
+       if (new_entry->ba.backing_ba)
+               atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
 
        vm_map_entry_link(map, new_entry);
 
        switch(entry->maptype) {
        case VM_MAPTYPE_NORMAL:
        case VM_MAPTYPE_VPAGETABLE:
-               if (new_entry->object.vm_object) {
-                       vm_object_hold(new_entry->object.vm_object);
-                       vm_object_chain_wait(new_entry->object.vm_object, 0);
-                       vm_object_reference_locked(new_entry->object.vm_object);
-                       vm_object_drop(new_entry->object.vm_object);
+               if (new_entry->ba.object) {
+                       vm_object_hold(new_entry->ba.object);
+                       vm_object_reference_locked(new_entry->ba.object);
+                       vm_object_drop(new_entry->ba.object);
                }
                break;
        default:
@@ -1608,7 +1772,7 @@ _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
         * put this improvement.
         */
 
-       if (entry->object.vm_object == NULL && !map->system_map &&
+       if (entry->ba.object == NULL && !map->system_map &&
            VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
                vm_map_entry_allocate_object(entry);
        }
@@ -1620,18 +1784,19 @@ _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
        *new_entry = *entry;
 
        new_entry->start = entry->end = end;
-       new_entry->offset += (end - entry->start);
+       new_entry->ba.offset += (end - entry->start);
+       if (new_entry->ba.backing_ba)
+               atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
 
        vm_map_entry_link(map, new_entry);
 
        switch(entry->maptype) {
        case VM_MAPTYPE_NORMAL:
        case VM_MAPTYPE_VPAGETABLE:
-               if (new_entry->object.vm_object) {
-                       vm_object_hold(new_entry->object.vm_object);
-                       vm_object_chain_wait(new_entry->object.vm_object, 0);
-                       vm_object_reference_locked(new_entry->object.vm_object);
-                       vm_object_drop(new_entry->object.vm_object);
+               if (new_entry->ba.object) {
+                       vm_object_hold(new_entry->ba.object);
+                       vm_object_reference_locked(new_entry->ba.object);
+                       vm_object_drop(new_entry->ba.object);
                }
                break;
        default:
@@ -1887,8 +2052,8 @@ vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
 
        if ((entry->start == start) && (entry->end == end) &&
            ((entry->eflags & MAP_ENTRY_COW) == 0) &&
-           (entry->object.vm_object == NULL)) {
-               entry->object.sub_map = submap;
+           (entry->ba.object == NULL)) {
+               entry->ba.sub_map = submap;
                entry->maptype = VM_MAPTYPE_SUBMAP;
                result = KERN_SUCCESS;
        }
@@ -1956,11 +2121,11 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
                    (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
                    (current->maptype == VM_MAPTYPE_NORMAL ||
                     current->maptype == VM_MAPTYPE_VPAGETABLE) &&
-                   current->object.vm_object &&
-                   current->object.vm_object->type == OBJT_VNODE) {
+                   current->ba.object &&
+                   current->ba.object->type == OBJT_VNODE) {
                        struct vnode *vp;
 
-                       vp = current->object.vm_object->handle;
+                       vp = current->ba.object->handle;
                        if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
                                vfs_timestamp(&vp->v_lastwrite_ts);
                                vsetflags(vp, VLASTWRITETS);
@@ -2176,6 +2341,9 @@ vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
                 * NOTE!  These functions are only supported on normal maps,
                 *        except MADV_INVAL which is also supported on
                 *        virtual page tables.
+                *
+                * NOTE!  These functions only apply to the top-most object.
+                *        They are not applied to backing objects.
                 */
                for (current = entry;
                     current && current->start < end;
@@ -2188,7 +2356,7 @@ vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
                                continue;
                        }
 
-                       pindex = OFF_TO_IDX(current->offset);
+                       pindex = OFF_TO_IDX(current->ba.offset);
                        delta = atop(current->end - current->start);
                        useStart = current->start;
 
@@ -2230,7 +2398,7 @@ vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
                                            useStart + ptoa(delta));
                                vm_map_deinterlock(map, &ilock);
                        } else {
-                               vm_object_madvise(current->object.vm_object,
+                               vm_object_madvise(current->ba.object,
                                                  pindex, delta, behav);
                        }
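
Tying this to the NOTE a few hunks up: madvise is applied only to the top-level ba.object, and the advised page range is derived from the entry's cumulative ba.offset. A rough sketch of that index arithmetic, with PAGE_SHIFT and the macros below standing in for the kernel's OFF_TO_IDX/atop:

#include <stdio.h>

#define PAGE_SHIFT      12                              /* 4KB base pages */
#define OFF_TO_IDX(off) ((unsigned long)(off) >> PAGE_SHIFT)
#define atop(x)         ((unsigned long)(x) >> PAGE_SHIFT)

int
main(void)
{
        unsigned long start = 0x40000000UL;     /* current->start */
        unsigned long end   = 0x40080000UL;     /* current->end */
        unsigned long baoff = 0x20000UL;        /* current->ba.offset */

        unsigned long pindex = OFF_TO_IDX(baoff);  /* first page in ba.object */
        unsigned long delta  = atop(end - start);  /* number of pages to advise */

        /* conceptually: vm_object_madvise(current->ba.object, pindex, delta, behav) */
        printf("advise pages [%lu, %lu) of the top object only\n",
               pindex, pindex + delta);
        return 0;
}
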
 
@@ -2245,7 +2413,7 @@ vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
                                    map->pmap, 
                                    useStart,
                                    current->protection,
-                                   current->object.vm_object,
+                                   current->ba.object,
                                    pindex, 
                                    (count << PAGE_SHIFT),
                                    MAP_PREFAULT_MADVISE
@@ -2367,7 +2535,7 @@ vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
                                if (copyflag && ((entry->protection &
                                                  VM_PROT_WRITE) != 0)) {
                                        vm_map_entry_shadow(entry, 0);
-                               } else if (entry->object.vm_object == NULL &&
+                               } else if (entry->ba.object == NULL &&
                                           !map->system_map) {
                                        vm_map_entry_allocate_object(entry);
                                }
@@ -2568,7 +2736,7 @@ vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
                                if (copyflag && ((entry->protection &
                                                  VM_PROT_WRITE) != 0)) {
                                        vm_map_entry_shadow(entry, 0);
-                               } else if (entry->object.vm_object == NULL &&
+                               } else if (entry->ba.object == NULL &&
                                           !map->system_map) {
                                        vm_map_entry_allocate_object(entry);
                                }
@@ -2732,9 +2900,9 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
        vm_map_entry_t current;
        vm_map_entry_t next;
        vm_map_entry_t entry;
+       vm_map_backing_t *ba;
        vm_size_t size;
        vm_object_t object;
-       vm_object_t tobj;
        vm_ooffset_t offset;
 
        vm_map_lock_read(map);
@@ -2775,7 +2943,7 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
         */
        current = entry;
        while (current && current->start < end) {
-               offset = current->offset + (start - current->start);
+               offset = current->ba.offset + (start - current->start);
                size = (end <= current->end ? end : current->end) - start;
 
                switch(current->maptype) {
@@ -2785,17 +2953,17 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
                        vm_map_entry_t tentry;
                        vm_size_t tsize;
 
-                       smap = current->object.sub_map;
+                       smap = current->ba.sub_map;
                        vm_map_lock_read(smap);
                        vm_map_lookup_entry(smap, offset, &tentry);
                        if (tentry == NULL) {
                                tsize = vm_map_max(smap) - offset;
-                               object = NULL;
+                               ba = NULL;
                                offset = 0 + (offset - vm_map_min(smap));
                        } else {
                                tsize = tentry->end - offset;
-                               object = tentry->object.vm_object;
-                               offset = tentry->offset +
+                               ba = &tentry->ba;
+                               offset = tentry->ba.offset +
                                         (offset - tentry->start);
                        }
                        vm_map_unlock_read(smap);
@@ -2805,15 +2973,19 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
                }
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       object = current->object.vm_object;
+                       ba = &current->ba;
                        break;
                default:
-                       object = NULL;
+                       ba = NULL;
                        break;
                }
-
-               if (object)
-                       vm_object_hold(object);
+               if (ba) {
+                       object = ba->object;
+                       if (object)
+                               vm_object_hold(object);
+               } else {
+                       object = NULL;
+               }
 
                /*
                 * Note that there is absolutely no sense in writing out
@@ -2824,21 +2996,30 @@ vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
                 *
                 * note: certain anonymous maps, such as MAP_NOSYNC maps,
                 * may start out with a NULL object.
+                *
+                * XXX do we really want to stop at the first backing store
+                * here if there are more? XXX
                 */
-               while (object && (tobj = object->backing_object) != NULL) {
-                       vm_object_hold(tobj);
-                       if (tobj == object->backing_object) {
-                               vm_object_lock_swap();
-                               offset += object->backing_object_offset;
-                               vm_object_drop(object);
+               if (ba) {
+                       vm_object_t tobj;
+
+                       tobj = object;
+                       while (ba->backing_ba != NULL) {
+                               ba = ba->backing_ba;
+                               offset += ba->offset;
+                               tobj = ba->object;
+                               if (tobj->size < OFF_TO_IDX(offset + size))
+                                       size = IDX_TO_OFF(tobj->size) - offset;
+                               break; /* XXX this break is not correct */
+                       }
+                       if (object != tobj) {
+                               if (object)
+                                       vm_object_drop(object);
                                object = tobj;
-                               if (object->size < OFF_TO_IDX(offset + size))
-                                       size = IDX_TO_OFF(object->size) -
-                                              offset;
-                               break;
+                               vm_object_hold(object);
                        }
-                       vm_object_drop(tobj);
                }
+
                if (object && (object->type == OBJT_VNODE) && 
                    (current->protection & VM_PROT_WRITE) &&
                    (object->flags & OBJ_NOMSYNC) == 0) {
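
The msync path above now descends the backing_ba list instead of object->backing_object, adding up the cumulative offsets and clamping the request size to the object it lands on. A compact, self-contained sketch of that walk follows; the committed code bails out after one level (the XXX break above), whereas the loop here shows the general form:

#include <stdio.h>

#define PAGE_SHIFT      12
#define OFF_TO_IDX(off) ((unsigned long)(off) >> PAGE_SHIFT)
#define IDX_TO_OFF(idx) ((unsigned long)(idx) << PAGE_SHIFT)

struct obj { unsigned long size; };     /* object size in pages */
struct ba  { struct ba *backing_ba; struct obj *object; unsigned long offset; };

/*
 * Resolve the deepest backing object for a clean/msync request and clamp
 * the request size to it.  Offsets are cumulative per node.
 */
static struct obj *
resolve_backing(struct ba *ba, unsigned long *offsetp, unsigned long *sizep)
{
        struct obj *obj = ba->object;

        while (ba->backing_ba != NULL) {
                ba = ba->backing_ba;
                *offsetp += ba->offset;
                obj = ba->object;
                if (obj->size < OFF_TO_IDX(*offsetp + *sizep))
                        *sizep = IDX_TO_OFF(obj->size) - *offsetp;
        }
        return obj;
}

int
main(void)
{
        struct obj deep = { 16 };                       /* 16 pages = 64KB */
        struct ba  b1   = { NULL, &deep, 0x4000 };
        struct obj top  = { 64 };
        struct ba  b0   = { &b1, &top, 0 };
        unsigned long offset = 0x8000, size = 0x20000;

        struct obj *o = resolve_backing(&b0, &offset, &size);
        printf("terminal object %p, offset %#lx, clamped size %#lx\n",
               (void *)o, offset, size);
        return 0;
}
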
@@ -2943,7 +3124,7 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
        case VM_MAPTYPE_NORMAL:
        case VM_MAPTYPE_VPAGETABLE:
        case VM_MAPTYPE_SUBMAP:
-               vm_object_deallocate(entry->object.vm_object);
+               vm_object_deallocate(entry->ba.object);
                break;
        case VM_MAPTYPE_UKSMAP:
                /* XXX TODO */
@@ -3032,14 +3213,14 @@ again:
                e = entry->end;
                next = vm_map_rb_tree_RB_NEXT(entry);
 
-               offidxstart = OFF_TO_IDX(entry->offset);
+               offidxstart = OFF_TO_IDX(entry->ba.offset);
                count = OFF_TO_IDX(e - s);
 
                switch(entry->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
                case VM_MAPTYPE_SUBMAP:
-                       object = entry->object.vm_object;
+                       object = entry->ba.object;
                        break;
                default:
                        object = NULL;
@@ -3079,7 +3260,6 @@ again:
                        vm_object_drop(object);
                } else if (object) {
                        vm_object_hold(object);
-                       vm_object_chain_acquire(object, 0);
                        pmap_remove(map->pmap, s, e);
 
                        if (object != NULL &&
@@ -3092,7 +3272,7 @@ again:
                                 * When ONEMAPPING is set we can destroy the
                                 * pages underlying the entry's range.
                                 */
-                               vm_object_collapse(object, NULL);
+                               /*vm_object_collapse(object, NULL);*/
                                vm_object_page_remove(object, offidxstart,
                                                      offidxend, FALSE);
                                if (object->type == OBJT_SWAP) {
@@ -3105,7 +3285,6 @@ again:
                                        object->size = offidxstart;
                                }
                        }
-                       vm_object_chain_release(object);
                        vm_object_drop(object);
                } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
                        pmap_remove(map->pmap, s, e);
@@ -3218,283 +3397,8 @@ vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 }
 
 /*
- * If appropriate this function shadows the original object with a new object
- * and moves the VM pages from the original object to the new object.
- * The original object will also be collapsed, if possible.
- *
- * Caller must supply entry->object.vm_object held and chain_acquired, and
- * should chain_release and drop the object upon return.
- *
- * We can only do this for normal memory objects with a single mapping, and
- * it only makes sense to do it if there are 2 or more refs on the original
- * object.  i.e. typically a memory object that has been extended into
- * multiple vm_map_entry's with non-overlapping ranges.
- *
- * This makes it easier to remove unused pages and keeps object inheritance
- * from being a negative impact on memory usage.
- *
- * On return the (possibly new) entry->object.vm_object will have an
- * additional ref on it for the caller to dispose of (usually by cloning
- * the vm_map_entry).  The additional ref had to be done in this routine
- * to avoid racing a collapse.  The object's ONEMAPPING flag will also be
- * cleared.
- *
- * The vm_map must be locked and its token held.
- */
-static void
-vm_map_split(vm_map_entry_t entry, vm_object_t oobject)
-{
-       /* OPTIMIZED */
-       vm_object_t nobject, bobject;
-       vm_offset_t s, e;
-       vm_page_t m;
-       vm_pindex_t offidxstart, offidxend, idx;
-       vm_size_t size;
-       vm_ooffset_t offset;
-       int useshadowlist;
-
-       /*
-        * Optimize away object locks for vnode objects.  Important exit/exec
-        * critical path.
-        *
-        * OBJ_ONEMAPPING doesn't apply to vnode objects but clear the flag
-        * anyway.
-        */
-       if (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) {
-               vm_object_reference_quick(oobject);
-               vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
-               return;
-       }
-
-#if 0
-       /*
-        * Original object cannot be split?
-        */
-       if (oobject->handle == NULL) {
-               vm_object_reference_locked_chain_held(oobject);
-               vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
-               return;
-       }
-#endif
-
-       /*
-        * Collapse original object with its backing store as an
-        * optimization to reduce chain lengths when possible.
-        *
-        * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
-        * for oobject, so there's no point collapsing it.
-        *
-        * Then re-check whether the object can be split.
-        */
-       vm_object_collapse(oobject, NULL);
-
-       if (oobject->ref_count <= 1 ||
-           (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
-           (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
-               vm_object_reference_locked_chain_held(oobject);
-               vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
-               return;
-       }
-
-       /*
-        * Acquire the chain lock on the backing object.
-        *
-        * Give bobject an additional ref count for when it will be shadowed
-        * by nobject.
-        */
-       useshadowlist = 0;
-       if ((bobject = oobject->backing_object) != NULL) {
-               if (bobject->type != OBJT_VNODE) {
-                       useshadowlist = 1;
-                       vm_object_hold(bobject);
-                       vm_object_chain_wait(bobject, 0);
-                       /* ref for shadowing below */
-                       vm_object_reference_locked(bobject);
-                       vm_object_chain_acquire(bobject, 0);
-                       KKASSERT(oobject->backing_object == bobject);
-                       KKASSERT((bobject->flags & OBJ_DEAD) == 0);
-               } else {
-                       /*
-                        * vnodes are not placed on the shadow list but
-                        * they still get another ref for the backing_object
-                        * reference.
-                        */
-                       vm_object_reference_quick(bobject);
-               }
-       }
-
-       /*
-        * Calculate the object page range and allocate the new object.
-        */
-       offset = entry->offset;
-       s = entry->start;
-       e = entry->end;
-
-       offidxstart = OFF_TO_IDX(offset);
-       offidxend = offidxstart + OFF_TO_IDX(e - s);
-       size = offidxend - offidxstart;
-
-       switch(oobject->type) {
-       case OBJT_DEFAULT:
-               nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
-                                             VM_PROT_ALL, 0);
-               break;
-       case OBJT_SWAP:
-               nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
-                                          VM_PROT_ALL, 0);
-               break;
-       default:
-               /* not reached */
-               nobject = NULL;
-               KKASSERT(0);
-       }
-
-       /*
-        * If we could not allocate nobject just clear ONEMAPPING on
-        * oobject and return.
-        */
-       if (nobject == NULL) {
-               if (bobject) {
-                       if (useshadowlist) {
-                               vm_object_chain_release(bobject);
-                               vm_object_deallocate(bobject);
-                               vm_object_drop(bobject);
-                       } else {
-                               vm_object_deallocate(bobject);
-                       }
-               }
-               vm_object_reference_locked_chain_held(oobject);
-               vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
-               return;
-       }
-
-       /*
-        * The new object will replace entry->object.vm_object so it needs
-        * a second reference (the caller expects an additional ref).
-        */
-       vm_object_hold(nobject);
-       vm_object_reference_locked(nobject);
-       vm_object_chain_acquire(nobject, 0);
-
-       /*
-        * nobject shadows bobject (oobject already shadows bobject).
-        *
-        * Adding an object to bobject's shadow list requires refing bobject
-        * which we did above in the useshadowlist case.
-        *
-        * XXX it is unclear if we need to clear ONEMAPPING on bobject here
-        *     or not.
-        */
-       if (bobject) {
-               nobject->backing_object_offset =
-                   oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
-               nobject->backing_object = bobject;
-               if (useshadowlist) {
-                       atomic_add_int(&bobject->generation, 1);
-                       vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /*XXX*/
-                       vm_object_set_flag(nobject, OBJ_ONSHADOW);
-               }
-       }
-
-       /*
-        * Move the VM pages from oobject to nobject
-        */
-       for (idx = 0; idx < size; idx++) {
-               vm_page_t m;
-
-               m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
-                                            TRUE, "vmpg");
-               if (m == NULL)
-                       continue;
-
-               /*
-                * We must wait for pending I/O to complete before we can
-                * rename the page.
-                *
-                * We do not have to VM_PROT_NONE the page as mappings should
-                * not be changed by this operation.
-                *
-                * NOTE: The act of renaming a page updates chaingen for both
-                *       objects.
-                */
-               vm_page_rename(m, nobject, idx);
-               /* page automatically made dirty by rename and cache handled */
-               /* page remains busy */
-       }
-
-       if (oobject->type == OBJT_SWAP) {
-               vm_object_pip_add(oobject, 1);
-               /*
-                * copy oobject pages into nobject and destroy unneeded
-                * pages in shadow object.
-                */
-               swap_pager_copy(oobject, nobject, offidxstart, 0);
-               vm_object_pip_wakeup(oobject);
-       }
-
-       /*
-        * Wakeup the pages we played with.  No spl protection is needed
-        * for a simple wakeup.
-        */
-       for (idx = 0; idx < size; idx++) {
-               m = vm_page_lookup(nobject, idx);
-               if (m) {
-                       KKASSERT(m->busy_count & PBUSY_LOCKED);
-                       vm_page_wakeup(m);
-               }
-       }
-       entry->object.vm_object = nobject;
-       entry->offset = 0LL;
-
-       /*
-        * The map is being split and nobject is going to wind up on both
-        * vm_map_entry's, so make sure OBJ_ONEMAPPING is cleared on
-        * nobject.
-        */
-       vm_object_clear_flag(nobject, OBJ_ONEMAPPING);
-
-       /*
-        * Cleanup
-        *
-        * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
-        *       related pages were moved and are no longer applicable to the
-        *       original object.
-        *
-        * NOTE: Deallocate oobject (due to its entry->object.vm_object being
-        *       replaced by nobject).
-        */
-       vm_object_chain_release(nobject);
-       vm_object_drop(nobject);
-       if (bobject && useshadowlist) {
-               vm_object_chain_release(bobject);
-               vm_object_drop(bobject);
-       }
-
-#if 0
-       if (oobject->resident_page_count) {
-               kprintf("oobject %p still contains %jd pages!\n",
-                       oobject, (intmax_t)oobject->resident_page_count);
-               for (idx = 0; idx < size; idx++) {
-                       vm_page_t m;
-
-                       m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
-                                                    TRUE, "vmpg");
-                       if (m) {
-                               kprintf("oobject %p idx %jd\n",
-                                       oobject,
-                                       offidxstart + idx);
-                               vm_page_wakeup(m);
-                       }
-               }
-       }
-#endif
-       /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
-       vm_object_deallocate_locked(oobject);
-}
-
-/*
- * Copies the contents of the source entry to the destination
- * entry.  The entries *must* be aligned properly.
+ * Handles the dirty work of making src_entry and dst_entry copy-on-write
+ * after src_entry has been cloned to dst_entry.
  *
  * The vm_maps must be exclusively locked.
  * The vm_map's token must be held.
@@ -3509,6 +3413,9 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
        vm_object_t src_object;
        vm_object_t oobject;
 
+       /*
+        * Nothing to do for special map types
+        */
        if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
            dst_entry->maptype == VM_MAPTYPE_UKSMAP)
                return;
@@ -3525,11 +3432,9 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
                 * released its vm_map, we must acquire the fronting
                 * object.
                 */
-               oobject = src_entry->object.vm_object;
-               if (oobject) {
+               oobject = src_entry->ba.object;
+               if (oobject)
                        vm_object_hold(oobject);
-                       vm_object_chain_acquire(oobject, 0);
-               }
 
                if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
                        pmap_protect(src_map->pmap,
@@ -3541,47 +3446,48 @@ vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
                /*
                 * Make a copy of the object.
                 *
-                * The object must be locked prior to checking the object type
-                * and for the call to vm_object_collapse() and vm_map_split().
-                * We cannot use *_hold() here because the split code will
-                * probably try to destroy the object.  The lock is a pool
-                * token and doesn't care.
+                * The object pointer has already been copied to dst_entry;
+                * we just bump the ref-count and mark both entries NEEDS_COPY.
+                *
+                * If there is no object then we are golden.  Also, in
+                * this situation if there are no backing_ba linkages then
+                * we can set ba.offset to 0 for debugging convenience.
                 *
-                * We must bump src_map->timestamp when setting
-                * MAP_ENTRY_NEEDS_COPY to force any concurrent fault
-                * to retry, otherwise the concurrent fault might improperly
-                * install a RW pte when its supposed to be a RO(COW) pte.
-                * This race can occur because a vnode-backed fault may have
-                * to temporarily release the map lock.  This was handled
-                * when the caller locked the map exclusively.
+                * ba.offset cannot otherwise be modified because it affects
+                * the offsets for the entire backing_ba chain.
                 */
+               KKASSERT(dst_entry->ba.object == oobject);
                if (oobject) {
-                       vm_map_split(src_entry, oobject);
+                       vm_object_reference_locked(oobject);
+                       vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
 
-                       src_object = src_entry->object.vm_object;
-                       dst_entry->object.vm_object = src_object;
+                       src_object = src_entry->ba.object;
+                       dst_entry->ba.object = src_object;
                        src_entry->eflags |= (MAP_ENTRY_COW |
                                              MAP_ENTRY_NEEDS_COPY);
                        dst_entry->eflags |= (MAP_ENTRY_COW |
                                              MAP_ENTRY_NEEDS_COPY);
-                       dst_entry->offset = src_entry->offset;
+                       dst_entry->ba.offset = src_entry->ba.offset;
                } else {
-                       dst_entry->object.vm_object = NULL;
-                       dst_entry->offset = 0;
+                       if (dst_entry->ba.backing_ba == NULL)
+                               dst_entry->ba.offset = 0;
                }
                pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
                          dst_entry->end - dst_entry->start,
                          src_entry->start);
-               if (oobject) {
-                       vm_object_chain_release(oobject);
+               if (oobject)
                        vm_object_drop(oobject);
-               }
        } else {
                /*
                 * Of course, wired down pages can't be set copy-on-write.
                 * Cause wired pages to be copied into the new map by
                 * simulating faults (the new pages are pageable)
+                *
+                * Scrap ba.object (its ref-count has not yet been adjusted
+                * so we can just NULL out the field).  vm_fault_copy_entry()
+                * will create a new object.
                 */
+               dst_entry->ba.object = NULL;
                vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
        }
 }
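
For the unwired case just above, the convention is now simply: share the existing top object, give it one more reference for the new entry, and mark both entries COW/NEEDS_COPY so the first write fault breaks the sharing. A minimal userland sketch of that convention, with simplified stand-in types and flag values that are illustrative rather than the kernel's:

#include <stdio.h>

#define MAP_ENTRY_COW           0x0001  /* illustrative values, not the kernel's */
#define MAP_ENTRY_NEEDS_COPY    0x0002

struct obj   { int ref_count; };
struct entry { struct obj *object; unsigned eflags; unsigned long offset; };

/*
 * After dst has been cloned from src by structural copy, make the pair
 * copy-on-write: share the object, add one reference for the new entry,
 * and flag both sides so the first write fault breaks the sharing.
 */
static void
make_cow_pair(struct entry *src, struct entry *dst)
{
        if (src->object) {
                src->object->ref_count++;       /* kernel: vm_object_reference_locked() */
                dst->object = src->object;
                src->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
                dst->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
                dst->offset = src->offset;
        }
}

int
main(void)
{
        struct obj   o   = { 1 };
        struct entry src = { &o, 0, 0x1000 };
        struct entry dst = src;                 /* like "*new_entry = *old_entry" */

        make_cow_pair(&src, &dst);
        printf("refs=%d src.eflags=%#x dst.eflags=%#x\n",
               o.ref_count, src.eflags, dst.eflags);
        return 0;
}
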
@@ -3648,8 +3554,8 @@ vmspace_fork(struct vmspace *vm1)
        }
 
        new_map->size = old_map->size;
-       vm_map_unlock(old_map);
        vm_map_unlock(new_map);
+       vm_map_unlock(old_map);
        vm_map_entry_release(count);
 
        lwkt_reltoken(&vm2->vm_map.token);
@@ -3671,25 +3577,28 @@ vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
                break;
        case VM_INHERIT_SHARE:
                /*
-                * Clone the entry, creating the shared object if
-                * necessary.
+                * Clone the entry as a shared entry.  This will look like
+                * shared memory across the old and the new process.  We must
+                * ensure that the object is allocated.
                 */
-               if (old_entry->object.vm_object == NULL)
+               if (old_entry->ba.object == NULL)
                        vm_map_entry_allocate_object(old_entry);
 
                if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
                        /*
-                        * Shadow a map_entry which needs a copy,
-                        * replacing its object with a new object
-                        * that points to the old one.  Ask the
-                        * shadow code to automatically add an
-                        * additional ref.  We can't do it afterwords
-                        * because we might race a collapse.  The call
-                        * to vm_map_entry_shadow() will also clear
+                        * Create the fronting vm_map_backing for
+                        * an entry which needs a copy, plus an extra
+                        * ref because we are going to duplicate it
+                        * in the fork.
+                        *
+                        * The call to vm_map_entry_shadow() will also clear
                         * OBJ_ONEMAPPING.
+                        *
+                        * XXX no more collapse.  Still need extra ref
+                        * for the fork.
                         */
                        vm_map_entry_shadow(old_entry, 1);
-               } else if (old_entry->object.vm_object) {
+               } else if (old_entry->ba.object) {
                        /*
                         * We will make a shared copy of the object,
                         * and must clear OBJ_ONEMAPPING.
@@ -3702,29 +3611,30 @@ vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
                         * XXX assert that object.vm_object != NULL
                         *     since we allocate it above.
                         */
-                       object = old_entry->object.vm_object;
+                       object = old_entry->ba.object;
                        if (object->type == OBJT_VNODE) {
                                vm_object_reference_quick(object);
                                vm_object_clear_flag(object,
                                                     OBJ_ONEMAPPING);
                        } else {
                                vm_object_hold(object);
-                               vm_object_chain_wait(object, 0);
                                vm_object_reference_locked(object);
-                               vm_object_clear_flag(object,
-                                                    OBJ_ONEMAPPING);
+                               vm_object_clear_flag(object, OBJ_ONEMAPPING);
                                vm_object_drop(object);
                        }
                }
 
                /*
                 * Clone the entry.  We've already bumped the ref on
-                * any vm_object.
+                * the vm_object for our new entry.
                 */
                new_entry = vm_map_entry_create(new_map, countp);
                *new_entry = *old_entry;
+
                new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                new_entry->wired_count = 0;
+               if (new_entry->ba.backing_ba)
+                       atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
 
                /*
                 * Insert the entry into the new map -- we know we're
@@ -3742,16 +3652,27 @@ vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
                break;
        case VM_INHERIT_COPY:
                /*
-                * Clone the entry and link into the map.
+                * Clone the entry and link the copy into the new map.
+                *
+                * Note that ref-counting adjustment for old_entry->ba.object
+                * (unless this is a special map type) is handled by
+                * vm_map_copy_entry().
                 */
                new_entry = vm_map_entry_create(new_map, countp);
                *new_entry = *old_entry;
+
                new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
                new_entry->wired_count = 0;
-               new_entry->object.vm_object = NULL;
+               if (new_entry->ba.backing_ba)
+                       atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
+
                vm_map_entry_link(new_map, new_entry);
-               vm_map_copy_entry(old_map, new_map, old_entry,
-                                 new_entry);
+
+               /*
+                * This does the actual dirty work of making both entries
+                * copy-on-write, and will also handle the fronting object.
+                */
+               vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
                break;
        }
 }
@@ -3769,8 +3690,12 @@ vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
 
        new_entry = vm_map_entry_create(new_map, countp);
        *new_entry = *old_entry;
+
        new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
        new_entry->wired_count = 0;
+       if (new_entry->ba.backing_ba)
+               atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
+
        vm_map_entry_link(new_map, new_entry);
 }
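
All three fork clone paths above end with the same two-line pattern: after the structural copy, if the new entry carries a backing chain, the first chain node gains one reference because a second embedded vm_map_backing now points at it. A hypothetical helper expressing that pattern (the kernel open-codes the atomic_add_long() instead of using such a helper):

#include <stdio.h>

struct ba    { struct ba *backing_ba; long refs; };
struct entry { struct ba ba; };

/*
 * Hypothetical helper: after "*new_entry = *old_entry" the new embedded
 * ba aliases old_entry's chain, so the first shared node needs one more
 * reference.  The kernel open-codes this as
 * atomic_add_long(&new_entry->ba.backing_ba->refs, 1).
 */
static void
backing_chain_ref(struct entry *new_entry)
{
        if (new_entry->ba.backing_ba)
                new_entry->ba.backing_ba->refs++;
}

int
main(void)
{
        struct ba    shared = { NULL, 1 };
        struct entry parent = { { &shared, 0 } };
        struct entry child  = parent;           /* the clone in the fork path */

        backing_chain_ref(&child);
        printf("shared node refs after fork: %ld\n", shared.refs);      /* 2 */
        return 0;
}
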
 
@@ -4220,7 +4145,7 @@ vm_map_lookup(vm_map_t *var_map,          /* IN/OUT */
              vm_offset_t vaddr,
              vm_prot_t fault_typea,
              vm_map_entry_t *out_entry,        /* OUT */
-             vm_object_t *object,              /* OUT */
+             vm_map_backing_t **bap,           /* OUT */
              vm_pindex_t *pindex,              /* OUT */
              vm_prot_t *out_prot,              /* OUT */
              int *wflags)                      /* OUT */
@@ -4259,7 +4184,7 @@ RetryLookup:
         */
        cpu_ccfence();
        *out_entry = NULL;
-       *object = NULL;
+       *bap = NULL;
 
        {
                vm_map_entry_t tmp_entry;
@@ -4278,7 +4203,7 @@ RetryLookup:
        if (entry->maptype == VM_MAPTYPE_SUBMAP) {
                vm_map_t old_map = map;
 
-               *var_map = map = entry->object.sub_map;
+               *var_map = map = entry->ba.sub_map;
                if (use_read_lock)
                        vm_map_unlock_read(old_map);
                else
@@ -4343,7 +4268,7 @@ RetryLookup:
         */
        if (entry->maptype != VM_MAPTYPE_NORMAL &&
            entry->maptype != VM_MAPTYPE_VPAGETABLE) {
-               *object = NULL;
+               *bap = NULL;
                goto skip;
        }
 
@@ -4370,10 +4295,10 @@ RetryLookup:
                        }
 
                        /*
-                        * Make a new object, and place it in the object
-                        * chain.  Note that no new references have appeared
-                        * -- one just moved from the map to the new
-                        * object.
+                        * Make a new vm_map_backing + object, and place it
+                        * in the object chain.  Note that no new references
+                        * have appeared -- one just moved from the map to
+                        * the new object.
                         */
                        if (use_read_lock && vm_map_lock_upgrade(map)) {
                                /* lost lock */
@@ -4396,7 +4321,7 @@ RetryLookup:
         * Create an object if necessary.  This code also handles
         * partitioning large entries to improve vm_fault performance.
         */
-       if (entry->object.vm_object == NULL && !map->system_map) {
+       if (entry->ba.object == NULL && !map->system_map) {
                if (use_read_lock && vm_map_lock_upgrade(map))  {
                        /* lost lock */
                        use_read_lock = 0;
@@ -4429,10 +4354,10 @@ RetryLookup:
         * Return the object/offset from this entry.  If the entry was
         * copy-on-write or empty, it has been fixed up.
         */
-       *object = entry->object.vm_object;
+       *bap = &entry->ba;
 
 skip:
-       *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+       *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->ba.offset);
 
        /*
         * Return whether this is the only map sharing this data.  On
@@ -4574,34 +4499,33 @@ DB_SHOW_COMMAND(map, vm_map_print)
                }
                switch(entry->maptype) {
                case VM_MAPTYPE_SUBMAP:
-                       /* XXX no %qd in kernel.  Truncate entry->offset. */
+                       /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
                        db_printf(", share=%p, offset=0x%lx\n",
-                           (void *)entry->object.sub_map,
-                           (long)entry->offset);
+                           (void *)entry->ba.sub_map,
+                           (long)entry->ba.offset);
                        nlines++;
 
                        db_indent += 2;
-                       vm_map_print((db_expr_t)(intptr_t)
-                                    entry->object.sub_map,
+                       vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
                                     full, 0, NULL);
                        db_indent -= 2;
                        break;
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       /* XXX no %qd in kernel.  Truncate entry->offset. */
+                       /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
                        db_printf(", object=%p, offset=0x%lx",
-                           (void *)entry->object.vm_object,
-                           (long)entry->offset);
+                           (void *)entry->ba.object,
+                           (long)entry->ba.offset);
                        if (entry->eflags & MAP_ENTRY_COW)
                                db_printf(", copy (%s)",
                                    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
                        db_printf("\n");
                        nlines++;
 
-                       if (entry->object.vm_object) {
+                       if (entry->ba.object) {
                                db_indent += 2;
                                vm_object_print((db_expr_t)(intptr_t)
-                                               entry->object.vm_object,
+                                               entry->ba.object,
                                                full, 0, NULL);
                                nlines += 4;
                                db_indent -= 2;
@@ -4609,8 +4533,8 @@ DB_SHOW_COMMAND(map, vm_map_print)
                        break;
                case VM_MAPTYPE_UKSMAP:
                        db_printf(", uksmap=%p, offset=0x%lx",
-                           (void *)entry->object.uksmap,
-                           (long)entry->offset);
+                           (void *)entry->ba.uksmap,
+                           (long)entry->ba.offset);
                        if (entry->eflags & MAP_ENTRY_COW)
                                db_printf(", copy (%s)",
                                    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
index 605074f..da28508 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 1991, 1993
  *     The Regents of the University of California.  All rights reserved.
- * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
@@ -60,8 +60,6 @@
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
- *
- * $FreeBSD: src/sys/vm/vm_map.h,v 1.54.2.5 2003/01/13 22:51:17 dillon Exp $
  */
 
 /*
@@ -114,17 +112,6 @@ RB_PROTOTYPE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
 typedef u_int vm_flags_t;
 typedef u_int vm_eflags_t;
 
-/*
- * A vm_map_entry may reference an object, a submap, a uksmap, or a
- * direct user-kernel shared map.
- */
-union vm_map_object {
-       struct vm_object *vm_object;    /* object object */
-       struct vm_map *sub_map;         /* belongs to another map */
-       int     (*uksmap)(struct cdev *dev, vm_page_t fake);
-       void    *map_object;            /* generic */
-};
-
 union vm_map_aux {
        vm_offset_t avail_ssize;        /* amt can grow if this is a stack */
        vpte_t master_pde;              /* virtual page table root */
@@ -174,24 +161,59 @@ typedef enum {
 } vm_subsys_t;
 
 /*
- *     Address map entries consist of start and end addresses,
- *     a VM object (or sharing map) and offset into that object,
- *     and user-exported inheritance and protection information.
- *     Also included is control information for virtual copy operations.
+ * vm_map backing structure for specifying multiple backings.  This
+ * structure is NOT shared across pmaps but may be shared within a pmap.
+ * The offset is cumulatively added from its parent, allowing easy splits
+ * and merges.
+ */
+union vm_map_object;
+
+struct vm_map_backing {
+       struct vm_map_backing   *backing_ba;    /* backing store */
+
+       /*
+        * A vm_map_entry may reference an object, a submap, a uksmap, or a
+        * direct user-kernel shared map.
+        */
+       union {
+               struct vm_object *object;       /* vm_object */
+               struct vm_map *sub_map;         /* belongs to another map */
+               int     (*uksmap)(struct cdev *dev, vm_page_t fake);
+               void    *map_object;            /* generic */
+       };
+
+       vm_ooffset_t            offset;         /* cumulative offset */
+       long                    refs;           /* shared refs */
+       uint32_t                flags;
+       uint32_t                unused01;
+};
+
+typedef struct vm_map_backing vm_map_backing_t;
+
+#define VM_MAP_BACK_EXCL_HEUR  0x00000001U
+
+/*
+ * Address map entries consist of start and end addresses, a VM object
+ * (or sharing map) and offset into that object, and user-exported
+ * inheritance and protection information.  Also included is control
+ * information for virtual copy operations.
+ *
+ * The object information is now encapsulated in a vm_map_backing
+ * structure which contains the backing store chain, if any.  This
+ * structure is NOT shared.
  *
- *     When used with MAP_STACK, avail_ssize is used to determine the
- *     limits of stack growth.
+ * When used with MAP_STACK, avail_ssize is used to determine the limits
+ * of stack growth.
  *
- *     When used with VM_MAPTYPE_VPAGETABLE, avail_ssize stores the
- *     page directory index.
+ * When used with VM_MAPTYPE_VPAGETABLE, avail_ssize stores the page
+ * directory index.
  */
 struct vm_map_entry {
        RB_ENTRY(vm_map_entry) rb_entry;
        vm_offset_t start;              /* start address */
        vm_offset_t end;                /* end address */
        union vm_map_aux aux;           /* auxillary data */
-       union vm_map_object object;     /* object I point to */
-       vm_ooffset_t offset;            /* offset into object */
+       vm_map_backing_t ba;            /* backing object chain */
        vm_eflags_t eflags;             /* map entry flags */
        vm_maptype_t maptype;           /* type of VM mapping */
        vm_prot_t protection;           /* protection code */
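
The anonymous union in vm_map_backing above takes over the role of the removed union vm_map_object; which member is live is still implied by the entry's maptype rather than by anything stored in vm_map_backing itself. A small sketch of that discrimination, using cut-down stand-ins for the kernel types:

#include <stdio.h>

struct vm_object;                       /* opaque stand-ins */
struct vm_map;
struct cdev;
struct vm_page;

typedef enum { MAPTYPE_NORMAL, MAPTYPE_SUBMAP, MAPTYPE_UKSMAP } maptype_t;

struct backing {
        struct backing  *backing_ba;
        union {                         /* C11 anonymous union, as in the header */
                struct vm_object *object;
                struct vm_map    *sub_map;
                int  (*uksmap)(struct cdev *dev, struct vm_page *fake);
                void *map_object;
        };
        unsigned long   offset;
        long            refs;
};

struct entry { struct backing ba; maptype_t maptype; };

/* The live union member is chosen by the entry's maptype, not by ba itself. */
static const char *
describe(const struct entry *e)
{
        switch (e->maptype) {
        case MAPTYPE_NORMAL:  return e->ba.object ? "vm_object backed" : "unbacked";
        case MAPTYPE_SUBMAP:  return "submap";
        case MAPTYPE_UKSMAP:  return "user/kernel shared map callback";
        }
        return "?";
}

int
main(void)
{
        struct entry e;

        e.maptype = MAPTYPE_NORMAL;
        e.ba.backing_ba = NULL;
        e.ba.object = NULL;
        e.ba.offset = 0;
        e.ba.refs = 0;
        printf("%s\n", describe(&e));
        return 0;
}
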
@@ -546,7 +568,7 @@ vmspace_president_count(struct vmspace *vmspace)
                switch(cur->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
-                       if ((object = cur->object.vm_object) == NULL)
+                       if ((object = cur->ba.object) == NULL)
                                break;
                        if (object->type != OBJT_DEFAULT &&
                            object->type != OBJT_SWAP) {
@@ -642,7 +664,7 @@ int vm_map_insert (vm_map_t, int *, void *, void *,
                   vm_maptype_t, vm_subsys_t id,
                   vm_prot_t, vm_prot_t, int);
 int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t,
-               vm_map_entry_t *, vm_object_t *,
+               vm_map_entry_t *, vm_map_backing_t **,
                vm_pindex_t *, vm_prot_t *, int *);
 void vm_map_lookup_done (vm_map_t, vm_map_entry_t, int);
 boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
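
With the prototype above, vm_map_lookup() now hands its caller a pointer to the entry's vm_map_backing (plus a page index already adjusted by ba.offset) instead of a bare vm_object_t. The fragment below mocks that contract in userland so the shape of the new out-parameters is visible; locking, COW resolution and submap redirection are all omitted, and every name here is a stand-in:

#include <stdio.h>

#define PAGE_SHIFT      12
#define OFF_TO_IDX(off) ((unsigned long)(off) >> PAGE_SHIFT)

struct obj     { int id; };
struct backing { struct backing *backing_ba; struct obj *object; unsigned long offset; };
struct entry   { unsigned long start, end; struct backing ba; };

/*
 * Mock of the new lookup contract: the caller receives a pointer to the
 * entry's embedded vm_map_backing plus the page index within ba.object,
 * instead of a bare vm_object_t.
 */
static int
lookup(struct entry *e, unsigned long vaddr,
       struct backing **bap, unsigned long *pindex)
{
        if (vaddr < e->start || vaddr >= e->end)
                return -1;
        *bap = &e->ba;
        *pindex = OFF_TO_IDX((vaddr - e->start) + e->ba.offset);
        return 0;
}

int
main(void)
{
        struct obj o = { 1 };
        struct entry e = { 0x10000, 0x20000, { NULL, &o, 0x3000 } };
        struct backing *ba;
        unsigned long pindex;

        if (lookup(&e, 0x15000, &ba, &pindex) == 0)
                printf("fault object %d, page index %lu\n", ba->object->id, pindex);
        return 0;
}
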
index c00305b..6450cc8 100644 (file)
@@ -827,7 +827,7 @@ RestartScan:
                    current->maptype != VM_MAPTYPE_VPAGETABLE) {
                        continue;
                }
-               if (current->object.vm_object == NULL)
+               if (current->ba.object == NULL)
                        continue;
                
                /*
@@ -863,7 +863,8 @@ RestartScan:
                                /*
                                 * calculate the page index into the object
                                 */
-                               offset = current->offset + (addr - current->start);
+                               offset = current->ba.offset +
+                                        (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
 
                                /*
@@ -877,9 +878,8 @@ RestartScan:
                                 *     in x86 and vkernel pmap code.
                                 */
                                lwkt_gettoken(&vm_token);
-                               vm_object_hold(current->object.vm_object);
-                               m = vm_page_lookup(current->object.vm_object,
-                                                   pindex);
+                               vm_object_hold(current->ba.object);
+                               m = vm_page_lookup(current->ba.object, pindex);
                                if (m && m->valid) {
                                        mincoreinfo = MINCORE_INCORE;
                                        if (m->dirty || pmap_is_modified(m))
@@ -890,7 +890,7 @@ RestartScan:
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                }
-                               vm_object_drop(current->object.vm_object);
+                               vm_object_drop(current->ba.object);
                                lwkt_reltoken(&vm_token);
                        }
 
index 5bd4290..6bc4710 100644 (file)
@@ -95,8 +95,6 @@
 
 #define EASY_SCAN_FACTOR       8
 
-static void    vm_object_qcollapse(vm_object_t object,
-                                   vm_object_t backing_object);
 static void    vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
                                             int pagerflags);
 static void    vm_object_lock_init(vm_object_t);
@@ -129,9 +127,6 @@ static void vm_object_lock_init(vm_object_t);
 
 struct vm_object kernel_object;
 
-static long object_collapses;
-static long object_bypasses;
-
 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
 
 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
@@ -263,7 +258,7 @@ vm_object_assert_held(vm_object_t obj)
        ASSERT_LWKT_TOKEN_HELD(&obj->token);
 }
 
-static __inline int
+int
 vm_quickcolor(void)
 {
        globaldata_t gd = mycpu;
@@ -403,8 +398,6 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
        /* cpu localization twist */
        object->pg_color = vm_quickcolor();
        object->handle = NULL;
-       object->backing_object = NULL;
-       object->backing_object_offset = (vm_ooffset_t)0;
 
        atomic_add_int(&object->generation, 1);
        object->swblock_count = 0;
@@ -494,38 +487,11 @@ vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
  * held.  The original non-lock version is no longer supported.  The object
  * must NOT be chain locked by anyone at the time the reference is added.
  *
- * Referencing a chain-locked object can blow up the fairly sensitive
- * ref_count tests in the deallocator.  Most callers
- * will call vm_object_chain_wait() prior to calling
- * vm_object_reference_locked() to avoid the case.  The held token
- * allows the caller to pair the wait and ref.
- *
  * The object must be held, but may be held shared if desired (hence why
  * we use an atomic op).
  */
 void
 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
-{
-       KKASSERT(object != NULL);
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
-       KKASSERT((object->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) == 0);
-       atomic_add_int(&object->ref_count, 1);
-       if (object->type == OBJT_VNODE) {
-               vref(object->handle);
-               /* XXX what if the vnode is being destroyed? */
-       }
-#if defined(DEBUG_LOCKS)
-       debugvm_object_add(object, file, line, 1);
-#endif
-}
-
-/*
- * This version explicitly allows the chain to be held (i.e. by the
- * caller).  The token must also be held.
- */
-void
-VMOBJDEBUG(vm_object_reference_locked_chain_held)(vm_object_t object
-          VMOBJDBARGS)
 {
        KKASSERT(object != NULL);
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
@@ -553,193 +519,6 @@ VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
 #endif
 }
 
-/*
- * Object OBJ_CHAINLOCK lock handling.
- *
- * The caller can chain-lock backing objects recursively and then
- * use vm_object_chain_release_all() to undo the whole chain.
- *
- * Chain locks are used to prevent collapses and are only applicable
- * to OBJT_DEFAULT and OBJT_SWAP objects.  Chain locking operations
- * on other object types are ignored.  This is also important because
- * it allows e.g. the vnode underlying a memory mapping to take concurrent
- * faults.
- *
- * The object must usually be held on entry, though intermediate
- * objects need not be held on release.  The object must be held exclusively,
- * NOT shared.  Note that the prefault path checks the shared state and
- * avoids using the chain functions.
- */
-void
-vm_object_chain_wait(vm_object_t object, int shared)
-{
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
-       for (;;) {
-               uint32_t chainlk = object->chainlk;
-
-               cpu_ccfence();
-               if (shared) {
-                       if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
-                               tsleep_interlock(object, 0);
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     chainlk | CHAINLK_WAIT)) {
-                                       tsleep(object, PINTERLOCKED,
-                                              "objchns", 0);
-                               }
-                               /* retry */
-                       } else {
-                               break;
-                       }
-                       /* retry */
-               } else {
-                       if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
-                               tsleep_interlock(object, 0);
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     chainlk | CHAINLK_WAIT))
-                               {
-                                       tsleep(object, PINTERLOCKED,
-                                              "objchnx", 0);
-                               }
-                               /* retry */
-                       } else {
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     chainlk & ~CHAINLK_WAIT))
-                               {
-                                       if (chainlk & CHAINLK_WAIT)
-                                               wakeup(object);
-                                       break;
-                               }
-                               /* retry */
-                       }
-               }
-               /* retry */
-       }
-}
-
-void
-vm_object_chain_acquire(vm_object_t object, int shared)
-{
-       if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
-               return;
-       if (vm_shared_fault == 0)
-               shared = 0;
-
-       for (;;) {
-               uint32_t chainlk = object->chainlk;
-
-               cpu_ccfence();
-               if (shared) {
-                       if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
-                               tsleep_interlock(object, 0);
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     chainlk | CHAINLK_WAIT)) {
-                                       tsleep(object, PINTERLOCKED,
-                                              "objchns", 0);
-                               }
-                               /* retry */
-                       } else if (atomic_cmpset_int(&object->chainlk,
-                                             chainlk, chainlk + 1)) {
-                               break;
-                       }
-                       /* retry */
-               } else {
-                       if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
-                               tsleep_interlock(object, 0);
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     chainlk |
-                                                      CHAINLK_WAIT |
-                                                      CHAINLK_EXCLREQ)) {
-                                       tsleep(object, PINTERLOCKED,
-                                              "objchnx", 0);
-                               }
-                               /* retry */
-                       } else {
-                               if (atomic_cmpset_int(&object->chainlk,
-                                                     chainlk,
-                                                     (chainlk | CHAINLK_EXCL) &
-                                                     ~(CHAINLK_EXCLREQ |
-                                                       CHAINLK_WAIT))) {
-                                       if (chainlk & CHAINLK_WAIT)
-                                               wakeup(object);
-                                       break;
-                               }
-                               /* retry */
-                       }
-               }
-               /* retry */
-       }
-}
-
-void
-vm_object_chain_release(vm_object_t object)
-{
-       /*ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));*/
-       if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
-               return;
-       KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
-       for (;;) {
-               uint32_t chainlk = object->chainlk;
-
-               cpu_ccfence();
-               if (chainlk & CHAINLK_MASK) {
-                       if ((chainlk & CHAINLK_MASK) == 1 &&
-                           atomic_cmpset_int(&object->chainlk,
-                                             chainlk,
-                                             (chainlk - 1) & ~CHAINLK_WAIT)) {
-                               if (chainlk & CHAINLK_WAIT)
-                                       wakeup(object);
-                               break;
-                       }
-                       if ((chainlk & CHAINLK_MASK) > 1 &&
-                           atomic_cmpset_int(&object->chainlk,
-                                             chainlk, chainlk - 1)) {
-                               break;
-                       }
-                       /* retry */
-               } else {
-                       KKASSERT(chainlk & CHAINLK_EXCL);
-                       if (atomic_cmpset_int(&object->chainlk,
-                                             chainlk,
-                                             chainlk & ~(CHAINLK_EXCL |
-                                                         CHAINLK_WAIT))) {
-                               if (chainlk & CHAINLK_WAIT)
-                                       wakeup(object);
-                               break;
-                       }
-               }
-       }
-}
-
-/*
- * Release the chain from first_object through and including stopobj.
- * The caller is typically holding the first and last object locked
- * (shared or exclusive) to prevent destruction races.
- *
- * We release stopobj first as an optimization as this object is most
- * likely to be shared across multiple processes.
- */
-void
-vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
-{
-       vm_object_t backing_object;
-       vm_object_t object;
-
-       vm_object_chain_release(stopobj);
-       object = first_object;
-
-       while (object != stopobj) {
-               KKASSERT(object);
-               backing_object = object->backing_object;
-               vm_object_chain_release(object);
-               object = backing_object;
-       }
-}
-
 /*
  * Dereference an object and its underlying vnode.  The object may be
  * held shared.  On return the object will remain held.
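For readers of the removal above: the chainlk word that vm_object_chain_acquire/release manipulated packed a shared-hold count into the CHAINLK_MASK bits, an exclusive hold into CHAINLK_EXCL, and sleeper/exclusive-request state into CHAINLK_WAIT and CHAINLK_EXCLREQ.  A shared release therefore boiled down to an atomic_cmpset_int() loop over the whole word, roughly as sketched below (illustration only, assuming obj is a vm_object_t with a shared chain hold; this is not part of the source):

	for (;;) {
		uint32_t v = obj->chainlk;

		cpu_ccfence();
		if ((v & CHAINLK_MASK) > 1) {
			/* other shared holders remain, just decrement */
			if (atomic_cmpset_int(&obj->chainlk, v, v - 1))
				break;
		} else {
			/* last shared hold, also clear WAIT and wake sleepers */
			if (atomic_cmpset_int(&obj->chainlk, v,
					      (v - 1) & ~CHAINLK_WAIT)) {
				if (v & CHAINLK_WAIT)
					wakeup(obj);
				break;
			}
		}
		/* lost a cmpset race, retry */
	}
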
@@ -881,288 +660,58 @@ VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
 void
 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
 {
-       struct vm_object_dealloc_list *dlist = NULL;
-       struct vm_object_dealloc_list *dtmp;
-       vm_object_t temp;
-       int must_drop = 0;
-
        /*
-        * We may chain deallocate object, but additional objects may
-        * collect on the dlist which also have to be deallocated.  We
-        * must avoid a recursion, vm_object chains can get deep.
+        * Degenerate case
         */
+       if (object == NULL)
+               return;
 
-again:
-       while (object != NULL) {
-               /*
-                * vnode case, caller either locked the object exclusively
-                * or this is a recursion with must_drop != 0 and the vnode
-                * object will be locked shared.
-                *
-                * If locked shared we have to drop the object before we can
-                * call vrele() or risk a shared/exclusive livelock.
-                */
-               if (object->type == OBJT_VNODE) {
-                       ASSERT_LWKT_TOKEN_HELD(&object->token);
-                       if (must_drop) {
-                               struct vnode *tmp_vp;
-
-                               vm_object_vndeallocate(object, &tmp_vp);
-                               vm_object_drop(object);
-                               must_drop = 0;
-                               object = NULL;
-                               vrele(tmp_vp);
-                       } else {
-                               vm_object_vndeallocate(object, NULL);
-                       }
-                       break;
-               }
-               ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
-
-               /*
-                * Normal case (object is locked exclusively)
-                */
-               if (object->ref_count == 0) {
-                       panic("vm_object_deallocate: object deallocated "
-                             "too many times: %d", object->type);
-               }
-               if (object->ref_count > 2) {
-                       atomic_add_int(&object->ref_count, -1);
-#if defined(DEBUG_LOCKS)
-                       debugvm_object_add(object, file, line, -1);
-#endif
-                       break;
-               }
-
-#if 0
-               /*
-                * CODE REMOVAL IN PROGRESS.
-                *
-                * This code handled setting ONEMAPPING again on a DEFAULT
-                * or SWAP object on the 2->1 transition of ref_count,
-                *
-                * This code also handled collapsing object chains on the
-                * 2->1 transition when the second ref was due to a shadow.
-                */
-               /*
-                * The ref_count is either 1 or 2.
-                *
-                * Here on ref_count of one or two, which are special cases for
-                * objects.
-                *
-                * Nominal ref_count > 1 case if the second ref is not from
-                * a shadow.
-                *
-                * (ONEMAPPING only applies to DEFAULT AND SWAP objects)
-                */
-               if (object->ref_count == 2 && object->shadow_count == 0) {
-                       if (object->type == OBJT_DEFAULT ||
-                           object->type == OBJT_SWAP) {
-                               vm_object_set_flag(object, OBJ_ONEMAPPING);
-                       }
-                       atomic_add_int(&object->ref_count, -1);
-#if defined(DEBUG_LOCKS)
-                       debugvm_object_add(object, file, line, -1);
-#endif
-                       break;
-               }
-
-               /*
-                * If the second ref is from a shadow we chain along it
-                * upwards if object's handle is exhausted.
-                *
-                * We have to decrement object->ref_count before potentially
-                * collapsing the first shadow object or the collapse code
-                * will not be able to handle the degenerate case to remove
-                * object.  However, if we do it too early the object can
-                * get ripped out from under us.
-                */
-               if (object->ref_count == 2 && object->shadow_count == 1 &&
-                   object->handle == NULL && (object->type == OBJT_DEFAULT ||
-                                              object->type == OBJT_SWAP)) {
-                       temp = LIST_FIRST(&object->shadow_head);
-                       KKASSERT(temp != NULL);
-                       vm_object_hold(temp);
-
-                       /*
-                        * Wait for any paging to complete so the collapse
-                        * doesn't (or isn't likely to) qcollapse.  pip
-                        * waiting must occur before we acquire the
-                        * chainlock.
-                        */
-                       while (
-                               temp->paging_in_progress ||
-                               object->paging_in_progress
-                       ) {
-                               vm_object_pip_wait(temp, "objde1");
-                               vm_object_pip_wait(object, "objde2");
-                       }
-
-                       /*
-                        * If the parent is locked we have to give up, as
-                        * otherwise we would be acquiring locks in the
-                        * wrong order and potentially deadlock.
-                        */
-                       if (temp->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) {
-                               vm_object_drop(temp);
-                               goto skip;
-                       }
-                       vm_object_chain_acquire(temp, 0);
-
-                       /*
-                        * Recheck/retry after the hold and the paging
-                        * wait, both of which can block us.
-                        */
-                       if (object->ref_count != 2 ||
-                           object->shadow_count != 1 ||
-                           object->handle ||
-                           LIST_FIRST(&object->shadow_head) != temp ||
-                           (object->type != OBJT_DEFAULT &&
-                            object->type != OBJT_SWAP)) {
-                               vm_object_chain_release(temp);
-                               vm_object_drop(temp);
-                               continue;
-                       }
-
-                       /*
-                        * We can safely drop object's ref_count now.
-                        */
-                       KKASSERT(object->ref_count == 2);
-                       atomic_add_int(&object->ref_count, -1);
-#if defined(DEBUG_LOCKS)
-                       debugvm_object_add(object, file, line, -1);
-#endif
-
-                       /*
-                        * If our single parent is not collapseable just
-                        * decrement ref_count (2->1) and stop.
-                        */
-                       if (temp->handle || (temp->type != OBJT_DEFAULT &&
-                                            temp->type != OBJT_SWAP)) {
-                               vm_object_chain_release(temp);
-                               vm_object_drop(temp);
-                               break;
-                       }
-
-                       /*
-                        * At this point we have already dropped object's
-                        * ref_count so it is possible for a race to
-                        * deallocate obj out from under us.  Any collapse
-                        * will re-check the situation.  We must not block
-                        * until we are able to collapse.
-                        *
-                        * Bump temp's ref_count to avoid an unwanted
-                        * degenerate recursion (can't call
-                        * vm_object_reference_locked() because it asserts
-                        * that CHAINLOCK is not set).
-                        */
-                       atomic_add_int(&temp->ref_count, 1);
-                       KKASSERT(temp->ref_count > 1);
-
-                       /*
-                        * Collapse temp, then deallocate the extra ref
-                        * formally.
-                        */
-                       vm_object_collapse(temp, &dlist);
-                       vm_object_chain_release(temp);
-                       if (must_drop) {
-                               vm_object_lock_swap();
-                               vm_object_drop(object);
-                       }
-                       object = temp;
-                       must_drop = 1;
-                       continue;
-               }
-skip:
-               ;
-#endif
+       /*
+        * vnode case.  The caller holds the object token, either
+        * exclusively or shared.
+        *
+        * vm_object_vndeallocate() drops the reference and releases
+        * the underlying vnode; the object itself remains held on
+        * return.
+        */
+       if (object->type == OBJT_VNODE) {
+               ASSERT_LWKT_TOKEN_HELD(&object->token);
+               vm_object_vndeallocate(object, NULL);
+               return;
+       }
+       ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
 
-               /*
-                * Drop the ref and handle termination on the 1->0 transition.
-                * We may have blocked above so we have to recheck.
-                */
-               KKASSERT(object->ref_count != 0);
-               if (object->ref_count >= 2) {
-                       atomic_add_int(&object->ref_count, -1);
+       /*
+        * Normal case (object is locked exclusively)
+        */
+       if (object->ref_count == 0) {
+               panic("vm_object_deallocate: object deallocated "
+                     "too many times: %d", object->type);
+       }
+       if (object->ref_count > 2) {
+               atomic_add_int(&object->ref_count, -1);
 #if defined(DEBUG_LOCKS)
-                       debugvm_object_add(object, file, line, -1);
+               debugvm_object_add(object, file, line, -1);
 #endif
-                       break;
-               }
-               KKASSERT(object->ref_count == 1);
-
-               /*
-                * 1->0 transition.  Chain through the backing_object.
-                * Maintain the ref until we've located the backing object,
-                * then re-check.
-                */
-               while ((temp = object->backing_object) != NULL) {
-                       if (temp->type == OBJT_VNODE)
-                               vm_object_hold_shared(temp);
-                       else
-                               vm_object_hold(temp);
-                       if (temp == object->backing_object)
-                               break;
-                       vm_object_drop(temp);
-               }
-
-               /*
-                * 1->0 transition verified, retry if ref_count is no longer
-                * 1.  Otherwise disconnect the backing_object (temp) and
-                * clean up.
-                */
-               if (object->ref_count != 1) {
-                       vm_object_drop(temp);
-                       continue;
-               }
-
-               /*
-                * It shouldn't be possible for the object to be chain locked
-                * if we're removing the last ref on it.
-                *
-                * Removing object from temp's shadow list requires dropping
-                * temp, which we will do on loop.
-                *
-                * NOTE! vnodes do not use the shadow list, but still have
-                *       the backing_object reference.
-                */
-               KKASSERT((object->chainlk & (CHAINLK_EXCL|CHAINLK_MASK)) == 0);
-
-               if (temp) {
-                       if (object->flags & OBJ_ONSHADOW) {
-                               atomic_add_int(&temp->generation, 1);
-                               vm_object_clear_flag(object, OBJ_ONSHADOW);
-                       }
-                       object->backing_object = NULL;
-               }
-
-               atomic_add_int(&object->ref_count, -1);
-               if ((object->flags & OBJ_DEAD) == 0)
-                       vm_object_terminate(object);
-               if (must_drop && temp)
-                       vm_object_lock_swap();
-               if (must_drop)
-                       vm_object_drop(object);
-               object = temp;
-               must_drop = 1;
+               return;
        }
 
-       if (must_drop && object)
-               vm_object_drop(object);
-
        /*
-        * Additional tail recursion on dlist.  Avoid a recursion.  Objects
-        * on the dlist have a hold count but are not locked.
+        * Drop the ref and handle termination on the 1->0 transition.
+        * The ref_count >= 2 case simply decrements; 1->0 terminates.
         */
-       if ((dtmp = dlist) != NULL) {
-               dlist = dtmp->next;
-               object = dtmp->object;
-               kfree(dtmp, M_TEMP);
-
-               vm_object_lock(object); /* already held, add lock */
-               must_drop = 1;          /* and we're responsible for it */
-               goto again;
+       KKASSERT(object->ref_count != 0);
+       if (object->ref_count >= 2) {
+               atomic_add_int(&object->ref_count, -1);
+#if defined(DEBUG_LOCKS)
+               debugvm_object_add(object, file, line, -1);
+#endif
+               return;
        }
+
+       atomic_add_int(&object->ref_count, -1);
+       if ((object->flags & OBJ_DEAD) == 0)
+               vm_object_terminate(object);
 }
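
With the chaining and dlist recursion gone, the function above reduces to a plain reference drop.  A caller that does not already hold the object token would wrap it roughly as follows (a minimal sketch; vm_object_deallocate(), named in the hunk header above, is the real unlocked wrapper):

	/* sketch: drop a reference on an object we do not yet hold */
	vm_object_hold(object);
	vm_object_deallocate_locked(object);
	vm_object_drop(object);
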
 
 /*
@@ -1772,9 +1321,7 @@ void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
                  vm_pindex_t count, int advise)
 {
-       vm_pindex_t end, tpindex;
-       vm_object_t tobject;
-       vm_object_t xobj;
+       vm_pindex_t end;
        vm_page_t m;
        int error;
 
@@ -1784,31 +1331,26 @@ vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
        end = pindex + count;
 
        vm_object_hold(object);
-       tobject = object;
 
        /*
-        * Locate and adjust resident pages
+        * Locate and adjust resident pages.  This only applies to the
+        * primary object in the mapping.
         */
        for (; pindex < end; pindex += 1) {
 relookup:
-               if (tobject != object)
-                       vm_object_drop(tobject);
-               tobject = object;
-               tpindex = pindex;
-shadowlookup:
                /*
                 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
                 * and those pages must be OBJ_ONEMAPPING.
                 */
                if (advise == MADV_FREE) {
-                       if ((tobject->type != OBJT_DEFAULT &&
-                            tobject->type != OBJT_SWAP) ||
-                           (tobject->flags & OBJ_ONEMAPPING) == 0) {
+                       if ((object->type != OBJT_DEFAULT &&
+                            object->type != OBJT_SWAP) ||
+                           (object->flags & OBJ_ONEMAPPING) == 0) {
                                continue;
                        }
                }
 
-               m = vm_page_lookup_busy_try(tobject, tpindex, TRUE, &error);
+               m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
 
                if (error) {
                        vm_page_sleep_busy(m, TRUE, "madvpo");
@@ -1818,28 +1360,12 @@ shadowlookup:
                        /*
                         * There may be swap even if there is no backing page
                         */
-                       if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
-                               swap_pager_freespace(tobject, tpindex, 1);
-
-                       /*
-                        * next object
-                        */
-                       while ((xobj = tobject->backing_object) != NULL) {
-                               KKASSERT(xobj != object);
-                               vm_object_hold(xobj);
-                               if (xobj == tobject->backing_object)
-                                       break;
-                               vm_object_drop(xobj);
-                       }
-                       if (xobj == NULL)
-                               continue;
-                       tpindex += OFF_TO_IDX(tobject->backing_object_offset);
-                       if (tobject != object) {
-                               vm_object_lock_swap();
-                               vm_object_drop(tobject);
+                       if (advise == MADV_FREE &&
+                           object->type == OBJT_SWAP &&
+                           m->object == object) {
+                               swap_pager_freespace(object, pindex, 1);
                        }
-                       tobject = xobj;
-                       goto shadowlookup;
+                       continue;
                }
 
                /*
@@ -1860,7 +1386,6 @@ shadowlookup:
                 * Theoretically once a page is known not to be busy, an
                 * interrupt cannot come along and rip it out from under us.
                 */
-
                if (advise == MADV_WILLNEED) {
                        vm_page_activate(m);
                } else if (advise == MADV_DONTNEED) {
@@ -1885,826 +1410,45 @@ shadowlookup:
                        m->dirty = 0;
                        m->act_count = 0;
                        vm_page_dontneed(m);
-                       if (tobject->type == OBJT_SWAP)
-                               swap_pager_freespace(tobject, tpindex, 1);
+                       if (object->type == OBJT_SWAP)
+                               swap_pager_freespace(object, pindex, 1);
                }
                vm_page_wakeup(m);
        }       
-       if (tobject != object)
-               vm_object_drop(tobject);
        vm_object_drop(object);
 }
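
With the shadow walk removed, vm_object_madvise() now acts only on the object it is handed, with the index and count given in pages.  A hypothetical direct caller converting a byte range might look like this (illustration only; the real callers live in the map-level madvise path, which is not part of this hunk):

	/* sketch: apply MADV_DONTNEED to [offset, offset + size) of obj */
	vm_object_madvise(obj, OFF_TO_IDX(offset), atop(size), MADV_DONTNEED);
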
 
 /*
- * Create a new object which is backed by the specified existing object
- * range.  Replace the pointer and offset that was pointing at the existing
- * object with the pointer/offset for the new object.
- *
- * If addref is non-zero the returned object is given an additional reference.
- * This mechanic exists to avoid the situation where refs might be 1 and
- * race against a collapse when the caller intends to bump it.  So the
- * caller cannot add the ref after the fact.  Used when the caller is
- * duplicating a vm_map_entry.
+ * Removes all physical pages in the specified object range from the
+ * object's list of pages.
  *
- * No other requirements.
+ * No requirements.
  */
+static int vm_object_page_remove_callback(vm_page_t p, void *data);
+
 void
-vm_object_shadow(vm_object_t *objectp, vm_ooffset_t *offset, vm_size_t length,
-                int addref)
+vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
+                     boolean_t clean_only)
 {
-       vm_object_t source;
-       vm_object_t result;
-       int useshadowlist;
-
-       source = *objectp;
+       struct rb_vm_page_scan_info info;
+       int all;
 
        /*
-        * Don't create the new object if the old object isn't shared.
-        * We have to chain wait before adding the reference to avoid
-        * racing a collapse or deallocation.
-        *
-        * Clear OBJ_ONEMAPPING flag when shadowing.
-        *
-        * The caller owns a ref on source via *objectp which we are going
-        * to replace.  This ref is inherited by the backing_object assignment.
-        * from nobject and does not need to be incremented here.
-        *
-        * However, we add a temporary extra reference to the original source
-        * prior to holding nobject in case we block, to avoid races where
-        * someone else might believe that the source can be collapsed.
+        * Degenerate cases and assertions
         */
-       useshadowlist = 0;
-       if (source) {
-               if (source->type != OBJT_VNODE) {
-                       useshadowlist = 1;
-                       vm_object_hold(source);
-                       vm_object_chain_wait(source, 0);
-                       if (source->ref_count == 1 &&
-                           source->handle == NULL &&
-                           (source->type == OBJT_DEFAULT ||
-                            source->type == OBJT_SWAP)) {
-                               if (addref) {
-                                       vm_object_reference_locked(source);
-                                       vm_object_clear_flag(source,
-                                                            OBJ_ONEMAPPING);
-                               }
-                               vm_object_drop(source);
-                               return;
-                       }
-                       vm_object_reference_locked(source);
-                       vm_object_clear_flag(source, OBJ_ONEMAPPING);
-               } else {
-                       vm_object_reference_quick(source);
-                       vm_object_clear_flag(source, OBJ_ONEMAPPING);
-               }
+       vm_object_hold(object);
+       if (object == NULL ||
+           (object->resident_page_count == 0 && object->swblock_count == 0)) {
+               vm_object_drop(object);
+               return;
        }
+       KASSERT(object->type != OBJT_PHYS,
+               ("attempt to remove pages from a physical object"));
 
        /*
-        * Allocate a new object with the given length.  The new object
-        * is returned referenced but we may have to add another one.
-        * If we are adding a second reference we must clear OBJ_ONEMAPPING.
-        * (typically because the caller is about to clone a vm_map_entry).
-        *
-        * The source object currently has an extra reference to prevent
-        * collapses into it while we mess with its shadow list, which
-        * we will remove later in this routine.
-        *
-        * The target object may require a second reference if asked for one
-        * by the caller.
+        * Indicate that paging is occurring on the object
         */
-       result = vm_object_allocate(OBJT_DEFAULT, length);
-       if (result == NULL)
-               panic("vm_object_shadow: no object for shadowing");
-       vm_object_hold(result);
-       if (addref) {
-               vm_object_reference_locked(result);
-               vm_object_clear_flag(result, OBJ_ONEMAPPING);
-       }
-
-       /*
-        * The new object shadows the source object.
-        *
-        * Try to optimize the result object's page color when shadowing
-        * in order to maintain page coloring consistency in the combined 
-        * shadowed object.
-        *
-        * The backing_object reference to source requires adding a ref to
-        * source.  We simply inherit the ref from the original *objectp
-        * (which we are replacing) so no additional refs need to be added.
-        * (we must still clean up the extra ref we had to prevent collapse
-        * races).
-        *
-        * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
-        */
-       KKASSERT(result->backing_object == NULL);
-       result->backing_object = source;
-       if (source) {
-               if (useshadowlist) {
-                       vm_object_chain_wait(source, 0);
-                       atomic_add_int(&source->generation, 1);
-                       vm_object_set_flag(result, OBJ_ONSHADOW);
-               }
-               /* cpu localization twist */
-               result->pg_color = vm_quickcolor();
-       }
-
-       /*
-        * Adjust the return storage.  Drop the ref on source before
-        * returning.
-        */
-       result->backing_object_offset = *offset;
-       vm_object_drop(result);
-       *offset = 0;
-       if (source) {
-               if (useshadowlist) {
-                       vm_object_deallocate_locked(source);
-                       vm_object_drop(source);
-               } else {
-                       vm_object_deallocate(source);
-               }
-       }
-
-       /*
-        * Return the new things
-        */
-       *objectp = result;
-}
-
-#define        OBSC_TEST_ALL_SHADOWED  0x0001
-#define        OBSC_COLLAPSE_NOWAIT    0x0002
-#define        OBSC_COLLAPSE_WAIT      0x0004
-
-static int vm_object_backing_scan_callback(vm_page_t p, void *data);
-
-/*
- * The caller must hold the object.
- */
-static __inline int
-vm_object_backing_scan(vm_object_t object, vm_object_t backing_object, int op)
-{
-       struct rb_vm_page_scan_info info;
-       struct vm_object_hash *hash;
-
-       vm_object_assert_held(object);
-       vm_object_assert_held(backing_object);
-
-       KKASSERT(backing_object == object->backing_object);
-       info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
-
-       /*
-        * Initial conditions
-        */
-       if (op & OBSC_TEST_ALL_SHADOWED) {
-               /*
-                * We do not want to have to test for the existence of
-                * swap pages in the backing object.  XXX but with the
-                * new swapper this would be pretty easy to do.
-                *
-                * XXX what about anonymous MAP_SHARED memory that hasn't
-                * been ZFOD faulted yet?  If we do not test for this, the
-                * shadow test may succeed! XXX
-                */
-               if (backing_object->type != OBJT_DEFAULT)
-                       return(0);
-       }
-       if (op & OBSC_COLLAPSE_WAIT) {
-               KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
-               vm_object_set_flag(backing_object, OBJ_DEAD);
-
-               hash = vmobj_hash(backing_object);
-               lwkt_gettoken(&hash->token);
-               TAILQ_REMOVE(&hash->list, backing_object, object_list);
-               lwkt_reltoken(&hash->token);
-       }
-
-       /*
-        * Our scan.   We have to retry if a negative error code is returned,
-        * otherwise 0 or 1 will be returned in info.error.  0 Indicates that
-        * the scan had to be stopped because the parent does not completely
-        * shadow the child.
-        */
-       info.object = object;
-       info.backing_object = backing_object;
-       info.limit = op;
-       info.count = 0;
-       do {
-               info.error = 1;
-               vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
-                                       vm_object_backing_scan_callback,
-                                       &info);
-       } while (info.error < 0);
-
-       return(info.error);
-}
-
-/*
- * The caller must hold the object.
- */
-static int
-vm_object_backing_scan_callback(vm_page_t p, void *data)
-{
-       struct rb_vm_page_scan_info *info = data;
-       vm_object_t backing_object;
-       vm_object_t object;
-       vm_pindex_t pindex;
-       vm_pindex_t new_pindex;
-       vm_pindex_t backing_offset_index;
-       int op;
-
-       pindex = p->pindex;
-       new_pindex = pindex - info->backing_offset_index;
-       op = info->limit;
-       object = info->object;
-       backing_object = info->backing_object;
-       backing_offset_index = info->backing_offset_index;
-
-       if (op & OBSC_TEST_ALL_SHADOWED) {
-               vm_page_t pp;
-
-               /*
-                * Ignore pages outside the parent object's range
-                * and outside the parent object's mapping of the 
-                * backing object.
-                *
-                * note that we do not busy the backing object's
-                * page.
-                */
-               if (pindex < backing_offset_index ||
-                   new_pindex >= object->size
-               ) {
-                       return(0);
-               }
-
-               /*
-                * See if the parent has the page or if the parent's
-                * object pager has the page.  If the parent has the
-                * page but the page is not valid, the parent's
-                * object pager must have the page.
-                *
-                * If this fails, the parent does not completely shadow
-                * the object and we might as well give up now.
-                */
-               pp = vm_page_lookup(object, new_pindex);
-               if ((pp == NULL || pp->valid == 0) &&
-                   !vm_pager_has_page(object, new_pindex)
-               ) {
-                       info->error = 0;        /* problemo */
-                       return(-1);             /* stop the scan */
-               }
-       }
-
-       /*
-        * Check for busy page.  Note that we may have lost (p) when we
-        * possibly blocked above.
-        */
-       if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
-               vm_page_t pp;
-
-               if (vm_page_busy_try(p, TRUE)) {
-                       if (op & OBSC_COLLAPSE_NOWAIT) {
-                               return(0);
-                       } else {
-                               /*
-                                * If we slept, anything could have
-                                * happened.   Ask that the scan be restarted.
-                                *
-                                * Since the object is marked dead, the
-                                * backing offset should not have changed.  
-                                */
-                               vm_page_sleep_busy(p, TRUE, "vmocol");
-                               info->error = -1;
-                               return(-1);
-                       }
-               }
-
-               /*
-                * If (p) is no longer valid restart the scan.
-                */
-               if (p->object != backing_object || p->pindex != pindex) {
-                       kprintf("vm_object_backing_scan: Warning: page "
-                               "%p ripped out from under us\n", p);
-                       vm_page_wakeup(p);
-                       info->error = -1;
-                       return(-1);
-               }
-
-               if (op & OBSC_COLLAPSE_NOWAIT) {
-                       if (p->valid == 0 ||
-                           p->wire_count ||
-                           (p->flags & PG_NEED_COMMIT)) {
-                               vm_page_wakeup(p);
-                               return(0);
-                       }
-               } else {
-                       /* XXX what if p->valid == 0 , hold_count, etc? */
-               }
-
-               KASSERT(
-                   p->object == backing_object,
-                   ("vm_object_qcollapse(): object mismatch")
-               );
-
-               /*
-                * Destroy any associated swap
-                */
-               if (backing_object->type == OBJT_SWAP)
-                       swap_pager_freespace(backing_object, p->pindex, 1);
-
-               if (
-                   p->pindex < backing_offset_index ||
-                   new_pindex >= object->size
-               ) {
-                       /*
-                        * Page is out of the parent object's range, we 
-                        * can simply destroy it. 
-                        */
-                       vm_page_protect(p, VM_PROT_NONE);
-                       vm_page_free(p);
-                       return(0);
-               }
-
-               pp = vm_page_lookup(object, new_pindex);
-               if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
-                       /*
-                        * page already exists in parent OR swap exists
-                        * for this location in the parent.  Destroy 
-                        * the original page from the backing object.
-                        *
-                        * Leave the parent's page alone
-                        */
-                       vm_page_protect(p, VM_PROT_NONE);
-                       vm_page_free(p);
-                       return(0);
-               }
-
-               /*
-                * Page does not exist in parent, rename the
-                * page from the backing object to the main object. 
-                *
-                * If the page was mapped to a process, it can remain 
-                * mapped through the rename.
-                */
-               if ((p->queue - p->pc) == PQ_CACHE)
-                       vm_page_deactivate(p);
-
-               vm_page_rename(p, object, new_pindex);
-               vm_page_wakeup(p);
-               /* page automatically made dirty by rename */
-       }
-       return(0);
-}
-
-/*
- * This version of collapse allows the operation to occur earlier and
- * when paging_in_progress is true for an object...  This is not a complete
- * operation, but should plug 99.9% of the rest of the leaks.
- *
- * The caller must hold the object and backing_object and both must be
- * chainlocked.
- *
- * (only called from vm_object_collapse)
- */
-static void
-vm_object_qcollapse(vm_object_t object, vm_object_t backing_object)
-{
-       if (backing_object->ref_count == 1) {
-               atomic_add_int(&backing_object->ref_count, 2);
-#if defined(DEBUG_LOCKS)
-               debugvm_object_add(backing_object, "qcollapse", 1, 2);
-#endif
-               vm_object_backing_scan(object, backing_object,
-                                      OBSC_COLLAPSE_NOWAIT);
-               atomic_add_int(&backing_object->ref_count, -2);
-#if defined(DEBUG_LOCKS)
-               debugvm_object_add(backing_object, "qcollapse", 2, -2);
-#endif
-       }
-}
-
-/*
- * Collapse an object with the object backing it.  Pages in the backing
- * object are moved into the parent, and the backing object is deallocated.
- * Any conflict is resolved in favor of the parent's existing pages.
- *
- * object must be held and chain-locked on call.
- *
- * The caller must have an extra ref on object to prevent a race from
- * destroying it during the collapse.
- */
-void
-vm_object_collapse(vm_object_t object, struct vm_object_dealloc_list **dlistp)
-{
-       struct vm_object_dealloc_list *dlist = NULL;
-       vm_object_t backing_object;
-
-       /*
-        * Only one thread is attempting a collapse at any given moment.
-        * There are few restrictions for (object) that callers of this
-        * function check so reentrancy is likely.
-        */
-       KKASSERT(object != NULL);
-       vm_object_assert_held(object);
-       KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
-
-       for (;;) {
-               vm_object_t bbobj;
-               int dodealloc;
-
-               /*
-                * We can only collapse a DEFAULT/SWAP object with a
-                * DEFAULT/SWAP object.
-                */
-               if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) {
-                       backing_object = NULL;
-                       break;
-               }
-
-               backing_object = object->backing_object;
-               if (backing_object == NULL)
-                       break;
-               if (backing_object->type != OBJT_DEFAULT &&
-                   backing_object->type != OBJT_SWAP) {
-                       backing_object = NULL;
-                       break;
-               }
-
-               /*
-                * Hold (token lock) the backing_object and retest conditions.
-                */
-               vm_object_hold(backing_object);
-               if (backing_object != object->backing_object ||
-                   (backing_object->type != OBJT_DEFAULT &&
-                    backing_object->type != OBJT_SWAP)) {
-                       vm_object_drop(backing_object);
-                       continue;
-               }
-
-               /*
-                * Chain-lock the backing object too because if we
-                * successfully merge its pages into the top object we
-                * will collapse backing_object->backing_object as the
-                * new backing_object.  Re-check that it is still our
-                * backing object.
-                */
-               vm_object_chain_acquire(backing_object, 0);
-               if (backing_object != object->backing_object) {
-                       vm_object_chain_release(backing_object);
-                       vm_object_drop(backing_object);
-                       continue;
-               }
-
-               /*
-                * We check the backing object first, because it is most
-                * likely not collapsable.
-                */
-               if (backing_object->handle != NULL ||
-                   (backing_object->type != OBJT_DEFAULT &&
-                    backing_object->type != OBJT_SWAP) ||
-                   (backing_object->flags & OBJ_DEAD) ||
-                   object->handle != NULL ||
-                   (object->type != OBJT_DEFAULT &&
-                    object->type != OBJT_SWAP) ||
-                   (object->flags & OBJ_DEAD)) {
-                       break;
-               }
-
-               /*
-                * If paging is in progress we can't do a normal collapse.
-                */
-               if (object->paging_in_progress != 0 ||
-                   backing_object->paging_in_progress != 0
-               ) {
-                       vm_object_qcollapse(object, backing_object);
-                       break;
-               }
-
-               /*
-                * We know that we can either collapse the backing object (if
-                * the parent is the only reference to it) or (perhaps) have
-                * the parent bypass the object if the parent happens to shadow
-                * all the resident pages in the entire backing object.
-                *
-                * This is ignoring pager-backed pages such as swap pages.
-                * vm_object_backing_scan fails the shadowing test in this
-                * case.
-                */
-               if (backing_object->ref_count == 1) {
-                       /*
-                        * If there is exactly one reference to the backing
-                        * object, we can collapse it into the parent.  
-                        */
-                       KKASSERT(object->backing_object == backing_object);
-                       vm_object_backing_scan(object, backing_object,
-                                              OBSC_COLLAPSE_WAIT);
-
-                       /*
-                        * Move the pager from backing_object to object.
-                        */
-                       if (backing_object->type == OBJT_SWAP) {
-                               vm_object_pip_add(backing_object, 1);
-
-                               /*
-                                * scrap the paging_offset junk and do a 
-                                * discrete copy.  This also removes major 
-                                * assumptions about how the swap-pager 
-                                * works from where it doesn't belong.  The
-                                * new swapper is able to optimize the
-                                * destroy-source case.
-                                */
-                               vm_object_pip_add(object, 1);
-                               swap_pager_copy(backing_object, object,
-                                   OFF_TO_IDX(object->backing_object_offset),
-                                   TRUE);
-                               vm_object_pip_wakeup(object);
-                               vm_object_pip_wakeup(backing_object);
-                       }
-
-                       /*
-                        * Object now shadows whatever backing_object did.
-                        *
-                        * Removing object from backing_objects shadow list
-                        * requires releasing object, which we will do below.
-                        */
-                       KKASSERT(object->backing_object == backing_object);
-                       if (object->flags & OBJ_ONSHADOW) {
-                               atomic_add_int(&backing_object->generation, 1);
-                               vm_object_clear_flag(object, OBJ_ONSHADOW);
-                       }
-
-                       /*
-                        * backing_object->backing_object moves from within
-                        * backing_object to within object.
-                        *
-                        * OBJT_VNODE bbobj's should have empty shadow lists.
-                        */
-                       while ((bbobj = backing_object->backing_object) != NULL) {
-                               if (bbobj->type == OBJT_VNODE)
-                                       vm_object_hold_shared(bbobj);
-                               else
-                                       vm_object_hold(bbobj);
-                               if (bbobj == backing_object->backing_object)
-                                       break;
-                               vm_object_drop(bbobj);
-                       }
-
-                       /*
-                        * We are removing backing_object from bbobj's
-                        * shadow list and adding object to bbobj's shadow
-                        * list, so the ref_count on bbobj is unchanged.
-                        */
-                       if (bbobj) {
-                               if (backing_object->flags & OBJ_ONSHADOW) {
-                                       /* not locked exclusively if vnode */
-                                       KKASSERT(bbobj->type != OBJT_VNODE);
-                                       atomic_add_int(&bbobj->generation, 1);
-                                       vm_object_clear_flag(backing_object,
-                                                            OBJ_ONSHADOW);
-                               }
-                               backing_object->backing_object = NULL;
-                       }
-                       object->backing_object = bbobj;
-                       if (bbobj) {
-                               if (bbobj->type != OBJT_VNODE) {
-                                       atomic_add_int(&bbobj->generation, 1);
-                                       vm_object_set_flag(object,
-                                                          OBJ_ONSHADOW);
-                               }
-                       }
-
-                       object->backing_object_offset +=
-                               backing_object->backing_object_offset;
-
-                       vm_object_drop(bbobj);
-
-                       /*
-                        * Discard the old backing_object.  Nothing should be
-                        * able to ref it, other than a vm_map_split(),
-                        * and vm_map_split() will stall on our chain lock.
-                        * And we control the parent so it shouldn't be
-                        * possible for it to go away either.
-                        *
-                        * Since the backing object has no pages, no pager
-                        * left, and no object references within it, all
-                        * that is necessary is to dispose of it.
-                        */
-                       KASSERT(backing_object->ref_count == 1,
-                               ("backing_object %p was somehow "
-                                "re-referenced during collapse!",
-                                backing_object));
-                       KASSERT(RB_EMPTY(&backing_object->rb_memq),
-                               ("backing_object %p somehow has left "
-                                "over pages during collapse!",
-                                backing_object));
-
-                       /*
-                        * The object can be destroyed.
-                        *
-                        * XXX just fall through and dodealloc instead
-                        *     of forcing destruction?
-                        */
-                       atomic_add_int(&backing_object->ref_count, -1);
-#if defined(DEBUG_LOCKS)
-                       debugvm_object_add(backing_object, "collapse", 1, -1);
-#endif
-                       if ((backing_object->flags & OBJ_DEAD) == 0)
-                               vm_object_terminate(backing_object);
-                       object_collapses++;
-                       dodealloc = 0;
-               } else {
-                       /*
-                        * If we do not entirely shadow the backing object,
-                        * there is nothing we can do so we give up.
-                        */
-                       if (vm_object_backing_scan(object, backing_object,
-                                               OBSC_TEST_ALL_SHADOWED) == 0) {
-                               break;
-                       }
-
-                       /*
-                        * bbobj is backing_object->backing_object.  Since
-                        * object completely shadows backing_object we can
-                        * bypass it and become backed by bbobj instead.
-                        *
-                        * The shadow list for vnode backing objects is not
-                        * used and a shared hold is allowed.
-                        */
-                       while ((bbobj = backing_object->backing_object) != NULL) {
-                               if (bbobj->type == OBJT_VNODE)
-                                       vm_object_hold_shared(bbobj);
-                               else
-                                       vm_object_hold(bbobj);
-                               if (bbobj == backing_object->backing_object)
-                                       break;
-                               vm_object_drop(bbobj);
-                       }
-
-                       /*
-                        * Make object shadow bbobj instead of backing_object.
-                        * Remove object from backing_object's shadow list.
-                        *
-                        * Deallocating backing_object will not remove
-                        * it, since its reference count is at least 2.
-                        *
-                        * Removing object from backing_object's shadow
-                        * list requires releasing a ref, which we do
-                        * below by setting dodealloc to 1.
-                        */
-                       KKASSERT(object->backing_object == backing_object);
-                       if (object->flags & OBJ_ONSHADOW) {
-                               atomic_add_int(&backing_object->generation, 1);
-                               vm_object_clear_flag(object, OBJ_ONSHADOW);
-                       }
-
-                       /*
-                        * Add a ref to bbobj, bbobj now shadows object.
-                        *
-                        * NOTE: backing_object->backing_object still points
-                        *       to bbobj.  That relationship remains intact
-                        *       because backing_object has > 1 ref, so
-                        *       someone else is pointing to it (hence why
-                        *       we can't collapse it into object and can
-                        *       only handle the all-shadowed bypass case).
-                        */
-                       if (bbobj) {
-                               if (bbobj->type != OBJT_VNODE) {
-                                       vm_object_chain_wait(bbobj, 0);
-                                       vm_object_reference_locked(bbobj);
-                                       atomic_add_int(&bbobj->generation, 1);
-                                       vm_object_set_flag(object,
-                                                          OBJ_ONSHADOW);
-                               } else {
-                                       vm_object_reference_quick(bbobj);
-                               }
-                               object->backing_object_offset +=
-                                       backing_object->backing_object_offset;
-                               object->backing_object = bbobj;
-                               vm_object_drop(bbobj);
-                       } else {
-                               object->backing_object = NULL;
-                       }
-
-                       /*
-                        * Drop the reference count on backing_object.  To
-                        * handle ref_count races properly we can't assume
-                        * that the ref_count is still at least 2 so we
-                        * have to actually call vm_object_deallocate()
-                        * (after clearing the chainlock).
-                        */
-                       object_bypasses++;
-                       dodealloc = 1;
-               }
-
-               /*
-                * Ok, we want to loop on the new object->bbobj association,
-                * possibly collapsing it further.  However if dodealloc is
-                * non-zero we have to deallocate the backing_object which
-                * itself can potentially undergo a collapse, creating a
-                * recursion depth issue with the LWKT token subsystem.
-                *
-                * In the case where we must deallocate the backing_object
-                * it is possible now that the backing_object has a single
-                * shadow count on some other object (not represented here
-                * as yet), since it no longer shadows us.  Thus when we
-                * call vm_object_deallocate() it may attempt to collapse
-                * itself into its remaining parent.
-                */
-               if (dodealloc) {
-                       struct vm_object_dealloc_list *dtmp;
-
-                       vm_object_chain_release(backing_object);
-                       vm_object_unlock(backing_object);
-                       /* backing_object remains held */
-
-                       /*
-                        * Auto-deallocation list for caller convenience.
-                        */
-                       if (dlistp == NULL)
-                               dlistp = &dlist;
-
-                       dtmp = kmalloc(sizeof(*dtmp), M_TEMP, M_WAITOK);
-                       dtmp->object = backing_object;
-                       dtmp->next = *dlistp;
-                       *dlistp = dtmp;
-               } else {
-                       vm_object_chain_release(backing_object);
-                       vm_object_drop(backing_object);
-               }
-               /* backing_object = NULL; not needed */
-               /* loop */
-       }
-
-       /*
-        * Clean up any left over backing_object
-        */
-       if (backing_object) {
-               vm_object_chain_release(backing_object);
-               vm_object_drop(backing_object);
-       }
-
-       /*
-        * Clean up any auto-deallocation list.  This is a convenience
-        * for top-level callers so they don't have to pass &dlist.
-        * Do not clean up any caller-passed dlistp, the caller will
-        * do that.
-        */
-       if (dlist)
-               vm_object_deallocate_list(&dlist);
-
-}
-
-/*
- * vm_object_collapse() may collect additional objects in need of
- * deallocation.  This routine deallocates these objects.  The
- * deallocation itself can trigger additional collapses (which the
- * deallocate function takes care of).  This procedure is used to
- * reduce procedural recursion since these vm_object shadow chains
- * can become quite long.
- */
-void
-vm_object_deallocate_list(struct vm_object_dealloc_list **dlistp)
-{
-       struct vm_object_dealloc_list *dlist;
-
-       while ((dlist = *dlistp) != NULL) {
-               *dlistp = dlist->next;
-               vm_object_lock(dlist->object);
-               vm_object_deallocate_locked(dlist->object);
-               vm_object_drop(dlist->object);
-               kfree(dlist, M_TEMP);
-       }
-}
-
-/*
- * Removes all physical pages in the specified object range from the
- * object's list of pages.
- *
- * No requirements.
- */
-static int vm_object_page_remove_callback(vm_page_t p, void *data);
-
-void
-vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
-                     boolean_t clean_only)
-{
-       struct rb_vm_page_scan_info info;
-       int all;
-
-       /*
-        * Degenerate cases and assertions
-        */
-       vm_object_hold(object);
-       if (object == NULL ||
-           (object->resident_page_count == 0 && object->swblock_count == 0)) {
-               vm_object_drop(object);
-               return;
-       }
-       KASSERT(object->type != OBJT_PHYS, 
-               ("attempt to remove pages from a physical object"));
-
-       /*
-        * Indicate that paging is occuring on the object
-        */
-       vm_object_pip_add(object, 1);
+       vm_object_pip_add(object, 1);
 
        /*
         * Figure out the actual removal range and whether we are removing
@@ -2859,12 +1603,16 @@ vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
                return (FALSE);
        }
 
+#if 0
+       /* caller now checks this */
        /*
         * Try to collapse the object first
         */
-       vm_object_chain_acquire(prev_object, 0);
        vm_object_collapse(prev_object, NULL);
+#endif
 
+#if 0
+       /* caller now checks this */
        /*
         * We can't coalesce if we shadow another object (figuring out the
         * relationships become too complex).
@@ -2874,6 +1622,7 @@ vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
                vm_object_drop(prev_object);
                return (FALSE);
        }
+#endif
 
        prev_size >>= PAGE_SHIFT;
        next_size >>= PAGE_SHIFT;
@@ -2885,7 +1634,6 @@ vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
         */
        if (prev_object->ref_count > 1 &&
            prev_object->size != next_pindex) {
-               vm_object_chain_release(prev_object);
                vm_object_drop(prev_object);
                return (FALSE);
        }
@@ -2908,7 +1656,6 @@ vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
         */
        if (next_pindex + next_size > prev_object->size)
                prev_object->size = next_pindex + next_size;
-       vm_object_chain_release(prev_object);
        vm_object_drop(prev_object);
 
        return (TRUE);
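
The two #if 0 blocks above shift work to the caller: with the chain lock gone, vm_object_coalesce() no longer collapses the object or refuses shadowed objects itself, so the map code is expected to attempt a coalesce only for entries without a backing chain.  A hypothetical caller-side guard is sketched below; the ba.offset field name and the prev_size/next_size byte counts are assumptions, not taken from this diff:

	/* hypothetical guard in the entry-extension path */
	if (entry->ba.backing_ba == NULL &&
	    vm_object_coalesce(entry->ba.object,
			       OFF_TO_IDX(entry->ba.offset), /* assumed field */
			       prev_size, next_size)) {
		/* safe to grow the existing entry in place */
	}
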
@@ -2978,9 +1725,9 @@ static int        vm_object_in_map (vm_object_t object);
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
+       vm_map_backing_t *ba;
        vm_map_t tmpm;
        vm_map_entry_t tmpe;
-       vm_object_t obj, nobj;
        int entcount;
 
        if (map == NULL)
@@ -2998,7 +1745,7 @@ _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
        }
        switch(entry->maptype) {
        case VM_MAPTYPE_SUBMAP:
-               tmpm = entry->object.sub_map;
+               tmpm = entry->ba.sub_map;
                tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
                entcount = tmpm->nentries;
                while (entcount-- && tmpe) {
@@ -3010,25 +1757,11 @@ _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
                break;
        case VM_MAPTYPE_NORMAL:
        case VM_MAPTYPE_VPAGETABLE:
-               obj = entry->object.vm_object;
-               while (obj) {
-                       if (obj == object) {
-                               if (obj != entry->object.vm_object)
-                                       vm_object_drop(obj);
-                               return 1;
-                       }
-                       while ((nobj = obj->backing_object) != NULL) {
-                               vm_object_hold(nobj);
-                               if (nobj == obj->backing_object)
-                                       break;
-                               vm_object_drop(nobj);
-                       }
-                       if (obj != entry->object.vm_object) {
-                               if (nobj)
-                                       vm_object_lock_swap();
-                               vm_object_drop(obj);
-                       }
-                       obj = nobj;
+               ba = &entry->ba;
+               while (ba) {
+                       if (ba->object == object)
+                               return TRUE;
+                       ba = ba->backing_ba;
                }
                break;
        default:
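
Note: where the old loop had to hold, re-check and drop each backing_object while
walking the chain, the new code is a plain traversal of the singly linked
ba->backing_ba list.  A standalone sketch of that lookup pattern; the struct layout
is inferred from the usage above and is not the real vm_map.h definition:

    /*
     * Standalone sketch of the backing-chain lookup pattern used above.
     * The struct layout is illustrative only.
     */
    #include <stdio.h>
    #include <stddef.h>

    struct vm_object { int id; };

    struct vm_map_backing {
            struct vm_map_backing *backing_ba;  /* deeper layer, or NULL */
            struct vm_object *object;
    };

    /* Returns 1 if 'object' appears anywhere in the backing chain. */
    static int
    backing_chain_contains(struct vm_map_backing *ba, struct vm_object *object)
    {
            while (ba) {
                    if (ba->object == object)
                            return 1;
                    ba = ba->backing_ba;
            }
            return 0;
    }

    int
    main(void)
    {
            struct vm_object file = { 1 }, anon = { 2 }, other = { 3 };
            struct vm_map_backing bottom = { NULL, &file };
            struct vm_map_backing top = { &bottom, &anon };

            printf("%d %d\n", backing_chain_contains(&top, &file),
                backing_chain_contains(&top, &other));      /* prints "1 0" */
            return 0;
    }
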
@@ -3114,11 +1847,9 @@ DB_SHOW_COMMAND(vmochk, vm_object_check)
                        if (vm_object_in_map(object))
                                continue;
                        db_printf("vmochk: internal obj is not in a map: "
-                                 "ref: %d, size: %lu: 0x%lx, "
-                                 "backing_object: %p\n",
+                                 "ref: %d, size: %lu: 0x%lx\n",
                                  object->ref_count, (u_long)object->size,
-                                 (u_long)object->size,
-                                 (void *)object->backing_object);
+                                 (u_long)object->size);
                }
        }
 }
@@ -3149,10 +1880,7 @@ DB_SHOW_COMMAND(object, vm_object_print_static)
        /*
         * XXX no %qd in kernel.  Truncate object->backing_object_offset.
         */
-       db_iprintf(" backing_object(%d)=(%p)+0x%lx\n",
-           (object->backing_object ? object->backing_object->ref_count : 0),
-           object->backing_object,
-           (long)object->backing_object_offset);
+       db_iprintf("\n");
 
        if (!full)
                return;
index 21fff14..391c445 100644 (file)
@@ -1,10 +1,14 @@
 /*
+ * Copyright (c) 2019 The DragonFly Project.  All rights reserved.
  * Copyright (c) 1991, 1993
  *     The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -125,16 +129,11 @@ typedef u_char objtype_t;
 /*
  * A VM object which represents an arbitrarily sized data store.
  *
- * NOTE:
- *     shadow_head is only used by OBJT_DEFAULT or OBJT_SWAP objects.
- *     OBJT_VNODE objects explicitly do not keep track of who is shadowing
- *     them.
- *
  * LOCKING:
  *     vmobj_tokens[n] for object_list, hashed by address.
  *
- *     vm_object_hold/drop() for most vm_object related operations.
- *     OBJ_CHAINLOCK to avoid chain/shadow object collisions.
+ *     vm_object_hold/drop() for most vm_object related operations
+ *     to avoid ref confusion in the deallocator.
  */
 struct vm_object {
        TAILQ_ENTRY(vm_object) object_list; /* locked by vmobj_tokens[n] */
@@ -148,8 +147,6 @@ struct vm_object {
        u_short pg_color;               /* color of first page in obj */
        u_int paging_in_progress;       /* Paging (in or out) so don't collapse or destroy */
        long resident_page_count;       /* number of resident pages */
-       struct vm_object *backing_object; /* object that I'm a shadow of */
-       vm_ooffset_t backing_object_offset;/* Offset in backing object */
        TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
        void *handle;                   /* control handle: vp, etc */
        int hold_count;                 /* count prevents destruction */
@@ -188,7 +185,6 @@ struct vm_object {
        long    swblock_count;
        struct  lwkt_token      token;
        struct md_object        md;     /* machine specific (typ pmap) */
-       uint32_t                chainlk;/* chaining lock */
 };
 
 /*
@@ -200,18 +196,19 @@ struct vm_object {
  *                 object types (particularly OBJT_VNODE).
  *
  *                 This flag indicates that any given page index within the
- *                 object is only mapped to a single vm_map_entry.  Split
- *                 vm_map_entry's (denoting distinct non-overlapping page
- *                 ranges) do not clear this flag.  This flag is typically
- *                 cleared on fork().
+ *                 object is only mapped to at most one vm_map_entry.
+ *
+ *                 WARNING!  An obj->refs of 1 does NOT allow you to
+ *                 re-set this bit because the object might be part of
+ *                 a shared chain of vm_map_backing structures.
  *
  * OBJ_NOPAGEIN   - vn and tmpfs set this flag, indicating to swapoff
  *                 that the objects aren't intended to have any vm_page's,
  *                 only swap blocks.  vn and tmpfs don't know how to deal
  *                 with any actual pages.
  */
-#define OBJ_UNUSED0001 0x0001          /* backing_object/shadow changing */
-#define OBJ_ONSHADOW   0x0002          /* backing_object on shadow list */
+#define OBJ_UNUSED0001 0x0001
+#define OBJ_UNUSED0002 0x0002
 #define OBJ_ACTIVE     0x0004          /* active objects */
 #define OBJ_DEAD       0x0008          /* dead objects (during rundown) */
 #define        OBJ_NOSPLIT     0x0010          /* dont split this object */
@@ -223,11 +220,6 @@ struct vm_object {
 #define        OBJ_ONEMAPPING  0x2000
 #define OBJ_NOMSYNC    0x4000          /* disable msync() system call */
 
-#define CHAINLK_EXCL   0x80000000
-#define CHAINLK_WAIT   0x40000000
-#define CHAINLK_EXCLREQ        0x20000000
-#define CHAINLK_MASK   0x1FFFFFFF
-
 #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
 
@@ -240,6 +232,7 @@ struct vm_object {
 #define OBJPC_INVAL    0x2                     /* invalidate */
 #define OBJPC_NOSYNC   0x4                     /* skip if PG_NOSYNC */
 
+#if 0
 /*
  * Used to chain vm_object deallocations
  */
@@ -247,6 +240,7 @@ struct vm_object_dealloc_list {
        struct vm_object_dealloc_list *next;
        vm_object_t     object;
 };
+#endif
 
 TAILQ_HEAD(object_q, vm_object);
 
@@ -315,8 +309,6 @@ vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 vm_object_t vm_object_allocate_hold (objtype_t, vm_pindex_t);
 void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t);
 boolean_t vm_object_coalesce (vm_object_t, vm_pindex_t, vm_size_t, vm_size_t);
-void vm_object_collapse (vm_object_t, struct vm_object_dealloc_list **);
-void vm_object_deallocate_list(struct vm_object_dealloc_list **);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init(vm_object_t, vm_pindex_t);
@@ -326,11 +318,6 @@ void vm_object_page_remove (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
 void vm_object_pmap_copy (vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_pmap_copy_1 (vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_pmap_remove (vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_object_chain_wait (vm_object_t object, int shared);
-void vm_object_chain_acquire(vm_object_t object, int shared);
-void vm_object_chain_release(vm_object_t object);
-void vm_object_chain_release_all(vm_object_t object, vm_object_t stopobj);
-void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t, int);
 void vm_object_madvise (vm_object_t, vm_pindex_t, vm_pindex_t, int);
 void vm_object_init2 (void);
 vm_page_t vm_fault_object_page(vm_object_t, vm_ooffset_t,
@@ -358,9 +345,11 @@ void vm_object_unlock(vm_object_t);
                debugvm_object_reference_quick(obj, __FILE__, __LINE__)
 #define vm_object_reference_locked(obj)                \
                debugvm_object_reference_locked(obj, __FILE__, __LINE__)
+#if 0
 #define vm_object_reference_locked_chain_held(obj)             \
                debugvm_object_reference_locked_chain_held(     \
                                        obj, __FILE__, __LINE__)
+#endif
 #define vm_object_deallocate(obj)              \
                debugvm_object_deallocate(obj, __FILE__, __LINE__)
 #define vm_object_deallocate_locked(obj)       \
@@ -380,13 +369,16 @@ void VMOBJDEBUG(vm_object_hold_shared)(vm_object_t object VMOBJDBARGS);
 void VMOBJDEBUG(vm_object_drop)(vm_object_t object VMOBJDBARGS);
 void VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS);
 void VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS);
+#if 0
 void VMOBJDEBUG(vm_object_reference_locked_chain_held)(
                        vm_object_t object VMOBJDBARGS);
+#endif
 void VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS);
 void VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS);
 
 void vm_object_upgrade(vm_object_t);
 void vm_object_downgrade(vm_object_t);
+int vm_quickcolor(void);
 
 #endif                         /* _KERNEL */
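
Note: the header keeps the debug wrappers that route vm_object_hold/drop and friends
through VMOBJDEBUG variants carrying __FILE__/__LINE__.  A small standalone sketch of
that wrapping technique, with hypothetical names:

    /*
     * Illustrative sketch of the __FILE__/__LINE__ wrapper technique used
     * by the VMOBJDEBUG macros above; names here are hypothetical.
     */
    #include <stdio.h>

    #define DEBUG_REFS 1

    #if DEBUG_REFS
    #define DBGARGS         , const char *file, int line
    #define obj_hold(obj)   debug_obj_hold(obj, __FILE__, __LINE__)
    #else
    #define DBGARGS
    #define obj_hold(obj)   debug_obj_hold(obj)
    #endif

    struct obj { int hold_count; };

    static void
    debug_obj_hold(struct obj *o DBGARGS)
    {
    #if DEBUG_REFS
            printf("hold %p from %s:%d\n", (void *)o, file, line);
    #endif
            ++o->hold_count;
    }

    int
    main(void)
    {
            struct obj o = { 0 };

            obj_hold(&o);           /* records this call site when debugging */
            return o.hold_count == 1 ? 0 : 1;
    }
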
 
index f58eef1..ccaa8ab 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * (MPSAFE)
  *
- * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2010,2019 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
 #include <sys/spinlock2.h>
 #include <vm/vm_page2.h>
 
+struct swmarker {
+       struct vm_object dummy_obj;
+       struct vm_object *save_obj;
+       vm_ooffset_t save_off;
+};
+
+typedef struct swmarker swmarker_t;
+
 /* the kernel process "vm_pageout"*/
 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
 static int vm_swapcache_test(vm_page_t m);
 static int vm_swapcache_writing_heuristic(void);
 static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
-static void vm_swapcache_cleaning(vm_object_t marker,
+static void vm_swapcache_cleaning(swmarker_t *marker,
                        struct vm_object_hash **swindexp);
-static void vm_swapcache_movemarker(vm_object_t marker,
+static void vm_swapcache_movemarker(swmarker_t *marker,
                        struct vm_object_hash *swindex, vm_object_t object);
 struct thread *swapcached_thread;
 
@@ -172,7 +180,7 @@ vm_swapcached_thread(void)
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
        static struct vm_page page_marker[PQ_L2_SIZE];
-       static struct vm_object swmarker;
+       static swmarker_t swmarker;
        static struct vm_object_hash *swindex;
        int q;
 
@@ -210,10 +218,10 @@ vm_swapcached_thread(void)
         * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
         */
        bzero(&swmarker, sizeof(swmarker));
-       swmarker.type = OBJT_MARKER;
+       swmarker.dummy_obj.type = OBJT_MARKER;
        swindex = &vm_object_hash[0];
        lwkt_gettoken(&swindex->token);
-       TAILQ_INSERT_HEAD(&swindex->list, &swmarker, object_list);
+       TAILQ_INSERT_HEAD(&swindex->list, &swmarker.dummy_obj, object_list);
        lwkt_reltoken(&swindex->token);
 
        for (;;) {
@@ -324,7 +332,7 @@ vm_swapcached_thread(void)
        }
 
        lwkt_gettoken(&swindex->token);
-       TAILQ_REMOVE(&swindex->list, &swmarker, object_list);
+       TAILQ_REMOVE(&swindex->list, &swmarker.dummy_obj, object_list);
        lwkt_reltoken(&swindex->token);
 }
 
@@ -693,7 +701,7 @@ vm_swapcache_test(vm_page_t m)
  */
 static
 void
-vm_swapcache_cleaning(vm_object_t marker, struct vm_object_hash **swindexp)
+vm_swapcache_cleaning(swmarker_t *marker, struct vm_object_hash **swindexp)
 {
        vm_object_t object;
        struct vnode *vp;
@@ -712,7 +720,7 @@ vm_swapcache_cleaning(vm_object_t marker, struct vm_object_hash **swindexp)
 
        didmove = 0;
 outerloop:
-       while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
+       while ((object = TAILQ_NEXT(&marker->dummy_obj, object_list)) != NULL) {
                /*
                 * We have to skip markers.  We cannot hold/drop marker
                 * objects!
@@ -756,10 +764,10 @@ outerloop:
                 * Reset the object pindex stored in the marker if the
                 * working object has changed.
                 */
-               if (marker->backing_object != object || didmove) {
-                       marker->size = 0;
-                       marker->backing_object_offset = 0;
-                       marker->backing_object = object;
+               if (marker->save_obj != object || didmove) {
+                       marker->dummy_obj.size = 0;
+                       marker->save_off = 0;
+                       marker->save_obj = object;
                        didmove = 0;
                }
 
@@ -783,7 +791,7 @@ outerloop:
                lwkt_token_swap();
                lwkt_reltoken(&(*swindexp)->token);
 
-               n = swap_pager_condfree(object, &marker->size,
+               n = swap_pager_condfree(object, &marker->dummy_obj.size,
                                    (count + SWAP_META_MASK) & ~SWAP_META_MASK);
 
                vm_object_drop(object);         /* object may be invalid now */
@@ -795,7 +803,7 @@ outerloop:
                 * the current object may no longer be on the vm_object_list.
                 */
                if (n <= 0 ||
-                   marker->backing_object_offset > vm_swapcache_cleanperobj) {
+                   marker->save_off > vm_swapcache_cleanperobj) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        didmove = 1;
                }
@@ -804,7 +812,7 @@ outerloop:
                 * If we have exhausted our max-launder stop for now.
                 */
                count -= n;
-               marker->backing_object_offset += n * PAGE_SIZE;
+               marker->save_off += n * PAGE_SIZE;
                if (count < 0)
                        goto breakout;
        }
@@ -812,12 +820,12 @@ outerloop:
        /*
         * Iterate vm_object_lists[] hash table
         */
-       TAILQ_REMOVE(&(*swindexp)->list, marker, object_list);
+       TAILQ_REMOVE(&(*swindexp)->list, &marker->dummy_obj, object_list);
        lwkt_reltoken(&(*swindexp)->token);
        if (++*swindexp >= &vm_object_hash[VMOBJ_HSIZE])
                *swindexp = &vm_object_hash[0];
        lwkt_gettoken(&(*swindexp)->token);
-       TAILQ_INSERT_HEAD(&(*swindexp)->list, marker, object_list);
+       TAILQ_INSERT_HEAD(&(*swindexp)->list, &marker->dummy_obj, object_list);
 
        if (*swindexp != &vm_object_hash[0])
                goto outerloop;
@@ -833,11 +841,12 @@ breakout:
  * the marker past it.
  */
 static void
-vm_swapcache_movemarker(vm_object_t marker, struct vm_object_hash *swindex,
+vm_swapcache_movemarker(swmarker_t *marker, struct vm_object_hash *swindex,
                        vm_object_t object)
 {
-       if (TAILQ_NEXT(marker, object_list) == object) {
-               TAILQ_REMOVE(&swindex->list, marker, object_list);
-               TAILQ_INSERT_AFTER(&swindex->list, object, marker, object_list);
+       if (TAILQ_NEXT(&marker->dummy_obj, object_list) == object) {
+               TAILQ_REMOVE(&swindex->list, &marker->dummy_obj, object_list);
+               TAILQ_INSERT_AFTER(&swindex->list, object,
+                                  &marker->dummy_obj, object_list);
        }
 }
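
Note: swmarker embeds a dummy OBJT_MARKER vm_object so the marker can still sit on the
object_list hash chains, while the per-scan state that used to be stashed in the
marker's now-removed backing_object fields moves to save_obj/save_off.  A standalone
sketch of this embedded-marker scan, with simplified stand-in types:

    /*
     * Standalone sketch of the embedded-marker scan pattern: a dummy list
     * node is wrapped in a larger struct that carries private scan state.
     */
    #include <stdio.h>
    #include <sys/queue.h>

    struct obj {
            TAILQ_ENTRY(obj) object_list;
            int is_marker;                  /* analogous to OBJT_MARKER */
            int id;
    };

    TAILQ_HEAD(objlist, obj);

    struct scan_marker {
            struct obj dummy_obj;           /* this part sits on the list */
            struct obj *save_obj;           /* private scan state, off-list */
            long save_off;
    };

    int
    main(void)
    {
            struct objlist list = TAILQ_HEAD_INITIALIZER(list);
            struct obj a = { .id = 1 }, b = { .id = 2 };
            struct scan_marker m = { .dummy_obj = { .is_marker = 1 } };
            struct obj *o;

            TAILQ_INSERT_TAIL(&list, &a, object_list);
            TAILQ_INSERT_TAIL(&list, &b, object_list);
            TAILQ_INSERT_HEAD(&list, &m.dummy_obj, object_list);

            /* Visit each object after the marker, moving the marker past it. */
            while ((o = TAILQ_NEXT(&m.dummy_obj, object_list)) != NULL) {
                    TAILQ_REMOVE(&list, &m.dummy_obj, object_list);
                    TAILQ_INSERT_AFTER(&list, o, &m.dummy_obj, object_list);
                    if (o->is_marker)       /* never touch other markers */
                            continue;
                    if (m.save_obj != o) {  /* reset per-object scan state */
                            m.save_obj = o;
                            m.save_off = 0;
                    }
                    printf("visit obj %d at offset %ld\n", o->id, m.save_off);
                    m.save_off += 4096;
            }
            TAILQ_REMOVE(&list, &m.dummy_obj, object_list);
            return 0;
    }
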
index a7dc91c..75d5662 100644 (file)
@@ -783,38 +783,27 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
 /*
  * Run the chain and if the bottom-most object is a vnode-type lock the
  * underlying vnode.  A locked vnode or NULL is returned.
+ *
+ * Caller must hold the first object.
  */
 struct vnode *
-vnode_pager_lock(vm_object_t object)
+vnode_pager_lock(vm_map_backing_t *ba)
 {
-       struct vnode *vp = NULL;
+       vm_map_backing_t *lba;
+       struct vnode *vp;
        vm_object_t lobject;
-       vm_object_t tobject;
        int error;
 
-       if (object == NULL)
-               return(NULL);
-
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
-       lobject = object;
-
-       while (lobject->type != OBJT_VNODE) {
-               if (lobject->flags & OBJ_DEAD)
-                       break;
-               tobject = lobject->backing_object;
-               if (tobject == NULL)
-                       break;
-               vm_object_hold_shared(tobject);
-               if (tobject == lobject->backing_object) {
-                       if (lobject != object) {
-                               vm_object_lock_swap();
-                               vm_object_drop(lobject);
-                       }
-                       lobject = tobject;
-               } else {
-                       vm_object_drop(tobject);
-               }
-       }
+       if (ba == NULL)
+               return NULL;
+       lba = ba;
+       while (lba->backing_ba)
+               lba = lba->backing_ba;
+       if ((lobject = lba->object) == NULL)
+               return NULL;
+       if (lba != ba)
+               vm_object_hold_shared(lobject);
+
        while (lobject->type == OBJT_VNODE &&
               (lobject->flags & OBJ_DEAD) == 0) {
                /*
@@ -831,11 +820,11 @@ vnode_pager_lock(vm_object_t object)
                                "lockstatus %d, retrying\n",
                                vp, error,
                                lockstatus(&vp->v_lock, curthread));
-                       tsleep(object->handle, 0, "vnpgrl", hz);
+                       tsleep(lobject->handle, 0, "vnpgrl", hz);
                }
                vp = NULL;
        }
-       if (lobject != object)
+       if (lba != ba)
                vm_object_drop(lobject);
        return (vp);
 }
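
Note: the rewritten vnode_pager_lock() resolves the terminal vm_map_backing first and
only takes an extra shared hold when that terminal object differs from the one the
caller already holds.  A standalone sketch of the same hold/drop discipline, with
simplified stand-in types:

    /*
     * Standalone sketch of the hold/drop discipline above: the caller already
     * holds the first layer's object, so an extra hold is taken only when the
     * terminal layer resolves to a different object.
     */
    #include <assert.h>
    #include <stddef.h>

    struct obj { int holds; };

    struct backing {
            struct backing *backing_ba;
            struct obj *object;
    };

    static void hold(struct obj *o) { ++o->holds; }
    static void drop(struct obj *o) { --o->holds; }

    /* Resolve the bottom-most object, mirroring the loop in the diff. */
    static struct obj *
    resolve_terminal(struct backing *ba)
    {
            struct backing *lba = ba;
            struct obj *lobject;

            while (lba->backing_ba)
                    lba = lba->backing_ba;
            if ((lobject = lba->object) == NULL)
                    return NULL;
            if (lba != ba)
                    hold(lobject);          /* caller only holds ba->object */
            /* ... use lobject here (e.g. lock its vnode) ... */
            if (lba != ba)
                    drop(lobject);
            return lobject;
    }

    int
    main(void)
    {
            struct obj file = { 0 }, anon = { 0 };
            struct backing bottom = { NULL, &file };
            struct backing top = { &bottom, &anon };

            hold(&anon);                    /* caller's hold on ba->object */
            assert(resolve_terminal(&top) == &file);
            assert(file.holds == 0 && anon.holds == 1);
            drop(&anon);
            return 0;
    }
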
index 22e4b27..dfefe5e 100644 (file)
 #include <vm/vm_object.h>
 #endif
 
+struct vm_map_backing;
+
 void vnode_pager_freepage (vm_page_t);
-struct vnode *vnode_pager_lock (vm_object_t);
+struct vnode *vnode_pager_lock (struct vm_map_backing *);
 
 /*
  * XXX Generic routines; currently called by badly written FS code; these
index 0bc51e3..a173743 100644 (file)
@@ -395,6 +395,7 @@ dommap(struct proc *p)
        struct vm_map_entry entry;
        vm_map_entry_t ken;
        struct vm_object object;
+       vm_map_backing_t ba;
        vm_object_t objp;
        int prot, fflags;
 
@@ -410,21 +411,33 @@ dommap(struct proc *p)
                if (entry.maptype == VM_MAPTYPE_SUBMAP)
                        continue;
 
-               if ((objp = entry.object.vm_object) == NULL)
+               if (entry.ba.object == NULL)
                        continue;
-
-               for (; objp; objp = object.backing_object) {
-                       if (!kread(objp, &object, sizeof(object))) {
+               ba = entry.ba;
+               for (;;) {
+                       if ((objp = ba.object) != NULL) {
+                               if (!kread(objp, &object, sizeof(object))) {
+                                       dprintf(stderr,
+                                           "can't read vm_object at %p "
+                                           "for pid %d\n",
+                                           (void *)objp, Pid);
+                                       return;
+                               }
+                       }
+                       if (ba.backing_ba == NULL)
+                               break;
+                       if (!kread(ba.backing_ba, &ba, sizeof(ba))) {
                                dprintf(stderr,
-                                   "can't read vm_object at %p for pid %d\n",
-                                   (void *)objp, Pid);
+                                   "can't read map_backing at %p "
+                                   "for pid %d\n",
+                                   (void *)ba.backing_ba, Pid);
                                return;
                        }
                }
 
                prot = entry.protection;
                fflags = (prot & VM_PROT_READ ? FREAD : 0) |
-                   (prot & VM_PROT_WRITE ? FWRITE : 0);
+                        (prot & VM_PROT_WRITE ? FWRITE : 0);
 
                switch (object.type) {
                case OBJT_VNODE: