kernel - VM rework part 4 - Implement vm_fault_collapse()
author    Matthew Dillon <dillon@apollo.backplane.com>
          Fri, 10 May 2019 01:37:10 +0000 (18:37 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Fri, 10 May 2019 16:24:58 +0000 (09:24 -0700)
* Add the function vm_fault_collapse().  This function simulates
  faults to copy all pages from backing objects into the front
  object, allowing the backing objects to be disconnected
  from the map entry.

  This function is called under certain conditions from the
  vmspace_fork*() code prior to a fork to potentially collapse
  the entry's backing objects into the front object.  The
  caller then disconnects the backing objects, truncating the
  list to a single object (the front object); both steps are
  sketched in condensed form at the end of this message.

  This optimization is necessary to prevent the backing_ba list
  from growing in an unbounded fashion.  In addition, being able
  to disconnect the graph allows redundant backing store to
  be freed more quickly, reducing memory use.

* Add sysctl vm.map_backing_shadow_test (default enabled).
  The vmspace_fork*() code now does a quick all-shadowed test on
  the first backing object and calls vm_fault_collapse()
  if it comes back true, regardless of the chain length.

* Add sysctl vm.map_backing_limit (default 5).
  The vmspace_fork*() code calls vm_fault_collapse() when the
  ba.backing_ba list exceeds the specified number of entries.

* Performance is a tad faster than the original collapse
  code.
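
* For reference, the simulated-fault loop at the heart of
  vm_fault_collapse() condenses to the sketch below.  This is an
  illustrative extract of the vm_fault.c hunk further down, with the
  faultstate setup and error paths omitted, not additional code:

	scan = entry->start;
	all_shadowed = 1;
	while (scan < entry->end) {
		pindex = OFF_TO_IDX(entry->ba.offset + (scan - entry->start));
		if (vm_page_lookup(object, pindex)) {
			/* page already resident in the front object */
			scan += PAGE_SIZE;
			continue;
		}
		all_shadowed = 0;
		/* simulate a write fault to copy the page up */
		rv = vm_fault_object(&fs, pindex, fs.first_prot, 1);
		if (rv == KERN_TRY_AGAIN)
			continue;
		if (rv != KERN_SUCCESS)
			break;
		vm_page_activate(fs.m);
		vm_page_wakeup(fs.m);
		scan += PAGE_SIZE;
	}
	/* stale ptes may still map backing pages; force refaults */
	if (all_shadowed == 0)
		pmap_remove(map->pmap, entry->start, entry->end);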
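
* At fork time the new decision and cleanup in vmspace_fork_normal_entry()
  reduce to the sketch below (condensed from the vm_map.c hunk further
  down; the disabled #if 0 collapse path and most comments are omitted):

	object = old_entry->ba.object;
	if (old_entry->ba.backing_ba &&
	    (old_entry->ba.backing_count >= vm_map_backing_limit ||
	     (vm_map_backing_shadow_test && object &&
	      object->size == object->resident_page_count))) {
		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
			vm_map_entry_shadow(old_entry, 0);
		if (object == NULL)
			vm_map_entry_allocate_object(old_entry);
		if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
			/* all pages copied up; discard the backing chain */
			ba = old_entry->ba.backing_ba;
			old_entry->ba.backing_ba = NULL;
			old_entry->ba.backing_count = 0;
			vm_map_entry_dispose_ba(ba);
		}
	}

  Both knobs are CTLFLAG_RW sysctls, so vm.map_backing_limit and
  vm.map_backing_shadow_test can be tuned at run time.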

sys/vm/vm_extern.h
sys/vm/vm_fault.c
sys/vm/vm_map.c
sys/vm/vm_object.h

diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index 74157db..c6b4634 100644
@@ -81,6 +81,7 @@ int swaponvp(struct thread *, struct vnode *, u_quad_t);
 void swapout_procs(int);
 int useracc(c_caddr_t, int, int);
 int vm_fault(struct vm_map *, vm_offset_t, vm_prot_t, int);
+int vm_fault_collapse(struct vm_map *, struct vm_map_entry *);
 vm_page_t vm_fault_page(struct vm_map *, vm_offset_t,
                        vm_prot_t, int, int *, int *);
 vm_page_t vm_fault_page_quick(vm_offset_t, vm_prot_t, int *, int *);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 051d5aa..5fd6455 100644
@@ -137,14 +137,14 @@ struct faultstate {
        vm_prot_t first_prot;
        vm_map_t map;
        vm_map_entry_t entry;
-       int lookup_still_valid;
+       int lookup_still_valid; /* 0=inv 1=valid/rel -1=valid/atomic */
        int hardfault;
        int fault_flags;
        int shared;
        int msoftonly;
        int first_shared;
        int wflags;
-       int first_ba_held;
+       int first_ba_held;      /* 0=unlocked 1=locked/rel -1=lock/atomic */
        struct vnode *vp;
 };
 
@@ -205,46 +205,25 @@ release_page(struct faultstate *fs)
        fs->m = NULL;
 }
 
-/*
- * NOTE: Once unlocked any cached fs->entry becomes invalid, any reuse
- *      requires relocking and then checking the timestamp.
- *
- * NOTE: vm_map_lock_read() does not bump fs->map->timestamp.
- *
- * NOTE: This function can fail due to a deadlock against the caller's
- *      holding of a vm_page BUSY.
- */
-#if 0
-static __inline int
-relock_map(struct faultstate *fs)
-{
-       int error;
-
-       if (fs->lookup_still_valid == FALSE && fs->map) {
-               error = vm_map_lock_read_to(fs->map);
-               if (error == 0)
-                       fs->lookup_still_valid = TRUE;
-       } else {
-               error = 0;
-       }
-       return error;
-}
-#endif
-
 static __inline void
 unlock_map(struct faultstate *fs)
 {
        if (fs->ba != fs->first_ba)
                vm_object_drop(fs->ba->object);
-       if (fs->first_ba && fs->first_ba_held) {
+       if (fs->first_ba && fs->first_ba_held == 1) {
                vm_object_drop(fs->first_ba->object);
                fs->first_ba_held = 0;
+               fs->first_ba = NULL;
        }
        fs->ba = NULL;
-       fs->first_ba = NULL;
-       if (fs->lookup_still_valid && fs->map) {
+
+       /*
+        * NOTE: If lookup_still_valid == -1 the map is assumed to be locked
+        *       and caller expects it to remain locked atomically.
+        */
+       if (fs->lookup_still_valid == 1 && fs->map) {
                vm_map_lookup_done(fs->map, fs->entry, 0);
-               fs->lookup_still_valid = FALSE;
+               fs->lookup_still_valid = 0;
                fs->entry = NULL;
        }
 }
@@ -352,15 +331,15 @@ virtual_copy_ok(struct faultstate *fs)
                /*
                 * Grab the lock and re-test changeable items.
                 */
-               if (fs->lookup_still_valid == FALSE && fs->map) {
+               if (fs->lookup_still_valid == 0 && fs->map) {
                        if (lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT))
                                return 0;
-                       fs->lookup_still_valid = TRUE;
+                       fs->lookup_still_valid = 1;
                        if (virtual_copy_test(fs)) {
                                fs->map_generation = ++fs->map->timestamp;
                                return 1;
                        }
-                       fs->lookup_still_valid = FALSE;
+                       fs->lookup_still_valid = 0;
                        lockmgr(&fs->map->lock, LK_RELEASE);
                }
        }
@@ -526,7 +505,7 @@ RetryFault:
         *
         * Misc checks.  Save the map generation number to detect races.
         */
-       fs.lookup_still_valid = TRUE;
+       fs.lookup_still_valid = 1;
        fs.first_m = NULL;
        fs.ba = fs.first_ba;            /* so unlock_things() works */
        fs.prot = fs.first_prot;        /* default (used by uksmap) */
@@ -742,7 +721,7 @@ success:
         * WARNING! Soft-busied fs.m's can only be manipulated in limited
         *          ways.
         */
-       KKASSERT(fs.lookup_still_valid == TRUE);
+       KKASSERT(fs.lookup_still_valid != 0);
        vm_page_flag_set(fs.m, PG_REFERENCED);
        pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot | inherit_prot,
                   fs.wflags & FW_WIRED, fs.entry);
@@ -822,7 +801,7 @@ done_success:
 
        result = KERN_SUCCESS;
 done:
-       if (fs.first_ba && fs.first_ba->object && fs.first_ba_held) {
+       if (fs.first_ba && fs.first_ba->object && fs.first_ba_held == 1) {
                vm_object_drop(fs.first_ba->object);
                fs.first_ba_held = 0;
        }
@@ -1166,7 +1145,7 @@ RetryFault:
         *
         * Misc checks.  Save the map generation number to detect races.
         */
-       fs.lookup_still_valid = TRUE;
+       fs.lookup_still_valid = 1;
        fs.first_m = NULL;
        fs.ba = fs.first_ba;
 
@@ -1429,7 +1408,7 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
        fs.first_shared = *sharedp;
        fs.msoftonly = 0;
        fs.vp = NULL;
-       fs.first_ba_held = 0;   /* object held across call, prevent drop */
+       fs.first_ba_held = -1;  /* object held across call, prevent drop */
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
 
        /*
@@ -1471,7 +1450,7 @@ RetryFault:
        if (fs.vp == NULL)
                fs.vp = vnode_pager_lock(fs.first_ba);
 
-       fs.lookup_still_valid = TRUE;
+       fs.lookup_still_valid = 1;
        fs.first_m = NULL;
 
 #if 0
@@ -1972,7 +1951,6 @@ readrest:
                        vm_object_t object;
                        vm_page_t first_m;
                        int seqaccess;
-                       int ohold;
                        int rv;
 
                        if (behavior == MAP_ENTRY_BEHAV_RANDOM)
@@ -2009,43 +1987,9 @@ readrest:
                                return (KERN_TRY_AGAIN);
                        }
 
-                       /*
-                        * Unlock the map, retaining fs->ba->object.  This
-                        * is necessary to avoid a deadlock and it will also
-                        * allow concurrent faults on the same map and ba
-                        * (albeit a bit inefficiently).
-                        *
-                        * Some fancy footwork is needed due to token
-                        * ordering.
-                        *
-                        * Additional footwork is needed because we are
-                        * blowing away ba vs first_ba, so fs->first_m
-                        * will not be cleaned up automatically.  Pull
-                        * it out.
-                        *
-                        * Because we unlocked the map, we will have to
-                        * return a KERN_TRY_AGAIN for any successful I/O.
-                        */
                        object = fs->ba->object;
-#if 1
-                       ohold = 0;
                        first_m = NULL;
-#else
-                       if (fs->ba != fs->first_ba) {
-                               first_m = fs->first_m;
-                               vm_object_pip_wakeup(fs->first_ba->object);
-                               vm_object_lock_swap();
-                               vm_object_drop(fs->first_ba->object);
-                       } else {
-                               first_m = NULL;
-                       }
-                       ohold = fs->first_ba_held;
-                       fs->ba = NULL;
-                       fs->first_ba = NULL;
-                       fs->first_m = NULL;
-                       fs->first_ba_held = 0;
-                       unlock_map(fs);
-#endif
+
                        /* object is held, no more access to entry or ba's */
 
                        /*
@@ -2085,24 +2029,6 @@ readrest:
                                        fs->m = NULL;
                                }
 
-                               /*
-                                * first_m could be completely valid and we
-                                * got here because of a PG_RAM, don't
-                                * mistakenly free it!
-                                */
-                               if (first_m) {
-                                       if ((first_m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
-                                               vm_page_wakeup(first_m);
-                                       } else if (fault_type & VM_PROT_WRITE) {
-                                               vm_page_deactivate(first_m);
-                                               vm_page_wakeup(first_m);
-                                       } else {
-                                               vm_page_free(first_m);
-                                       }
-                                       first_m = NULL;         /* safety */
-                               }
-
-#if 1
                                if (fs->m) {
                                        /* have page */
                                        break;
@@ -2110,22 +2036,6 @@ readrest:
                                vm_object_pip_wakeup(fs->first_ba->object);
                                unlock_things(fs);
                                return (KERN_TRY_AGAIN);
-#else
-                               vm_object_pip_wakeup(object);
-                               unlock_things(fs);
-
-                               /*
-                                * HACK! The object is always held on call,
-                                *       but vm_fault_object_page() needs
-                                *       to leave the object held across
-                                *       the entire operation and will clear
-                                *       first_ba_held to prevent the object
-                                *       from being dropped.
-                                */
-                               if (ohold)
-                                       vm_object_drop(object);
-                               return (KERN_TRY_AGAIN);
-#endif
                        }
 
                        /*
@@ -2181,8 +2091,7 @@ readrest:
                        }
                        vm_object_pip_wakeup(object);
                        unlock_things(fs);
-                       if (ohold)
-                               vm_object_drop(object);
+
                        switch(rv) {
                        case VM_PAGER_ERROR:
                                return (KERN_FAILURE);
@@ -2418,9 +2327,9 @@ next:
         * NOTE: The relock_map() can fail due to a deadlock against
         *       the vm_page we are holding BUSY.
         */
-       KKASSERT(fs->lookup_still_valid == TRUE);
+       KKASSERT(fs->lookup_still_valid != 0);
 #if 0
-       if (fs->lookup_still_valid == FALSE && fs->map) {
+       if (fs->lookup_still_valid == 0 && fs->map) {
                if (relock_map(fs) ||
                    fs->map->timestamp != fs->map_generation) {
                        release_page(fs);
@@ -2516,7 +2425,7 @@ next:
  * deadlock.  Note that the entry may be clipped while we are blocked but
  * will never be freed.
  *
- * No requirements.
+ * The map must be locked on entry.
  */
 int
 vm_fault_wire(vm_map_t map, vm_map_entry_t entry,
@@ -2629,6 +2538,79 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
        }
 }
 
+/*
+ * Simulate write faults to bring all data into the head object.  Returns
+ * KERN_SUCCESS on success, which should always be the case unless the
+ * system runs out of memory.
+ *
+ * The caller will handle destroying the backing_ba's.
+ */
+int
+vm_fault_collapse(vm_map_t map, vm_map_entry_t entry)
+{
+       struct faultstate fs;
+       vm_ooffset_t scan;
+       vm_pindex_t pindex;
+       vm_object_t object;
+       int rv;
+       int all_shadowed;
+
+       bzero(&fs, sizeof(fs));
+       object = entry->ba.object;
+
+       fs.first_prot = entry->max_protection | /* optional VM_PROT_EXECUTE */
+                       VM_PROT_READ | VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE;
+       fs.fault_flags = VM_FAULT_NORMAL;
+       fs.map = map;
+       fs.entry = entry;
+       fs.lookup_still_valid = -1;     /* leave map atomically locked */
+       fs.first_ba = &entry->ba;
+       fs.first_ba_held = -1;          /* leave object held */
+
+       /* fs.hardfault */
+
+       vm_object_hold(object);
+       rv = KERN_SUCCESS;
+
+       scan = entry->start;
+       all_shadowed = 1;
+
+       while (scan < entry->end) {
+               pindex = OFF_TO_IDX(entry->ba.offset + (scan - entry->start));
+
+               if (vm_page_lookup(object, pindex)) {
+                       scan += PAGE_SIZE;
+                       continue;
+               }
+
+               all_shadowed = 0;
+               fs.ba = fs.first_ba;
+               fs.prot = fs.first_prot;
+
+               rv = vm_fault_object(&fs, pindex, fs.first_prot, 1);
+               if (rv == KERN_TRY_AGAIN)
+                       continue;
+               if (rv != KERN_SUCCESS)
+                       break;
+               vm_page_flag_set(fs.m, PG_REFERENCED);
+               vm_page_activate(fs.m);
+               vm_page_wakeup(fs.m);
+               scan += PAGE_SIZE;
+       }
+       KKASSERT(entry->ba.object == object);
+       vm_object_drop(object);
+
+       /*
+        * If the fronting object did not already have every page, clear
+        * the pmap range because the underlying pages have changed and the
+        * proper pages must be faulted in again.
+        */
+       if (all_shadowed == 0)
+               pmap_remove(map->pmap, entry->start, entry->end);
+
+       return rv;
+}
+
 /*
  * Copy all of the pages from one map entry to another.  If the source
  * is wired down we just use vm_page_lookup().  If not we use
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 1cd8df1..6174091 100644
@@ -164,6 +164,9 @@ SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
 static int vm_map_backing_limit = 5;
 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
           &vm_map_backing_limit, 0, "ba.backing_ba link depth");
+static int vm_map_backing_shadow_test = 1;
+SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
+          &vm_map_backing_shadow_test, 0, "ba.object shadow test");
 
 static void vmspace_drop_notoken(struct vmspace *vm);
 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
@@ -3284,7 +3287,6 @@ again:
                                 * When ONEMAPPING is set we can destroy the
                                 * pages underlying the entry's range.
                                 */
-                               /*vm_object_collapse(object, NULL);*/
                                vm_object_page_remove(object, offidxstart,
                                                      offidxend, FALSE);
                                if (object->type == OBJT_SWAP) {
@@ -3585,8 +3587,105 @@ vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
                          vm_map_entry_t old_entry, int *countp)
 {
        vm_map_entry_t new_entry;
+       vm_map_backing_t ba;
        vm_object_t object;
 
+#if 0
+       /*
+        * Any uninterrupted sequence of ba->refs == 1 in the backing_ba
+        * list can be collapsed.  It's a good time to do this check with
+        * regards to prior forked children likely having exited or execd.
+        *
+        * Only the specific page ranges within the object(s) specified by
+        * the entry can be collapsed.
+        *
+        * Once we hit ba->refs > 1, or a non-anonymous-memory object,
+        * we're done.  Even if later ba's beyond this parent ba have
+        * a ref count of 1, the whole sub-list could be shared at this
+        * parent ba, so we have to stop.
+        *
+        * We do not have to test OBJ_ONEMAPPING here (it probably won't be
+        * set anyway due to previous sharing of the object).  Also the objects
+        * themselves might have a ref_count > 1 due to clips and forks
+        * related to OTHER page ranges.  That is, the vm_object itself might
+        * still be associated with multiple pmaps... just not this particular
+        * page range within the object.
+        */
+       while ((ba = old_entry->ba.backing_ba) && ba->refs == 1) {
+               if (ba->object->type != OBJT_DEFAULT &&
+                   ba->object->type != OBJT_SWAP) {
+                       break;
+               }
+               object = vm_object_collapse(old_entry->ba.object, ba->object);
+               if (object == old_entry->ba.object) {
+                       /*
+                        * Merged into base, remove intermediate ba.
+                        */
+                       kprintf("A");
+                       --old_entry->ba.backing_count;
+                       old_entry->ba.backing_ba = ba->backing_ba;
+                       if (ba->backing_ba)
+                               ba->backing_ba->offset += ba->offset;
+                       ba->backing_ba = NULL;
+                       vm_map_entry_dispose_ba(ba);
+               } else if (object == ba->object) {
+                       /*
+                        * Merged into intermediate ba, shift it into
+                        * the base.
+                        */
+                       kprintf("B");
+                       vm_object_deallocate(old_entry->ba.object);
+                       --old_entry->ba.backing_count;
+                       old_entry->ba.backing_ba = ba->backing_ba;
+                       old_entry->ba.object = ba->object;
+                       old_entry->ba.offset += ba->offset;
+                       ba->object = NULL;
+                       ba->backing_ba = NULL;
+                       vm_map_entry_dispose_ba(ba);
+               } else {
+                       break;
+               }
+       }
+#endif
+
+       /*
+        * If the backing_ba link list gets too long then fault it
+        * all into the head object and dispose of the list.  We do
+        * this in old_entry prior to cloning in order to benefit both
+        * parent and child.
+        *
+        * We can test our fronting object's size against its
+        * resident_page_count for a really cheap (but probably not perfect)
+        * all-shadowed test, allowing us to disconnect the backing_ba
+        * link list early.
+        */
+       object = old_entry->ba.object;
+       if (old_entry->ba.backing_ba &&
+           (old_entry->ba.backing_count >= vm_map_backing_limit ||
+            (vm_map_backing_shadow_test && object &&
+             object->size == object->resident_page_count))) {
+               /*
+                * If there are too many backing_ba linkages we collapse
+                * everything into the head object.
+                *
+                * This will also remove all the pte's.
+                */
+               if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
+                       vm_map_entry_shadow(old_entry, 0);
+               if (object == NULL)
+                       vm_map_entry_allocate_object(old_entry);
+               if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
+                       ba = old_entry->ba.backing_ba;
+                       old_entry->ba.backing_ba = NULL;
+                       old_entry->ba.backing_count = 0;
+                       vm_map_entry_dispose_ba(ba);
+               }
+       }
+       object = NULL;  /* object variable is now invalid */
+
+       /*
+        * Fork the entry
+        */
        switch (old_entry->inheritance) {
        case VM_INHERIT_NONE:
                break;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 391c445..b3bd831 100644
@@ -309,6 +309,7 @@ vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 vm_object_t vm_object_allocate_hold (objtype_t, vm_pindex_t);
 void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t);
 boolean_t vm_object_coalesce (vm_object_t, vm_pindex_t, vm_size_t, vm_size_t);
+vm_object_t vm_object_collapse (vm_object_t, vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init(vm_object_t, vm_pindex_t);