kernel - Fix races in the vm_map and vm_object page-scanning code
author Matthew Dillon <dillon@apollo.backplane.com>
Sat, 28 Jan 2017 22:08:39 +0000 (14:08 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sat, 28 Jan 2017 22:08:39 +0000 (14:08 -0800)
* Calling lwkt_yield() while depending on the vm_object token to stabilize
  a vm_page_t is a bad idea.  Move the calls to the end of the routine
  where they don't hurt anyone (the RB tree scanning code can handle
  ripouts, the callback functions cannot).

* Unconditionally busying a vm_page_t in an RB tree scanning callback
  and then trying to re-check it to see if it's still ok is a bad idea.
  Instead, use vm_page_busy_try() so as not to break the object token
  lock.  If it fails, sleep on the page and set a retry condition, but
  do not act on it (see the sketch below).

* Fix an issue in vm_object_pmap_remove() where the function operates
  incorrectly when passed a 0-length address range.

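  For illustration, here is a minimal sketch of the retry-capable scan
  pattern the bullets above describe, assuming the usual sys/vm headers.
  The names example_scan() and example_scan_callback() are hypothetical
  stand-ins; the real callbacks in vm_object.c carry additional per-page
  logic.

    /*
     * Sketch only.  example_scan() and example_scan_callback() are
     * hypothetical; see vm_object_terminate() and friends for the
     * real callbacks.
     */
    static int
    example_scan_callback(vm_page_t p, void *data)
    {
            struct rb_vm_page_scan_info *info = data;

            /*
             * Soft-busy only.  A blocking busy-wait could lose the
             * object token and destabilize the page, so on failure we
             * sleep, flag a retry, and let the outer loop rescan.
             */
            if (vm_page_busy_try(p, TRUE)) {
                    vm_page_sleep_busy(p, TRUE, "exscan");
                    info->error = 1;
                    return 0;
            }

            /* ... act on the busied page here ... */
            vm_page_wakeup(p);

            /*
             * Yield only after we are completely done with the page so
             * the object token held by the caller still stabilizes it.
             */
            if ((++info->count & 63) == 0)
                    lwkt_user_yield();
            return 0;
    }

    static void
    example_scan(vm_object_t object)
    {
            struct rb_vm_page_scan_info info;

            info.count = 0;
            info.object = object;
            vm_object_hold(object);
            do {
                    info.error = 0;
                    vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
                                            example_scan_callback, &info);
            } while (info.error);
            vm_object_drop(object);
    }

  The do/while around the RB_SCAN mirrors the changes below: the scan
  itself tolerates ripouts, while a callback that cannot busy a page
  merely flags info->error instead of acting on it.
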
sys/vm/vm_map.c
sys/vm/vm_map.h
sys/vm/vm_object.c
sys/vm/vm_pageout.c

diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index b89c9d0..3265b79 100644
@@ -143,6 +143,7 @@ static int vm_map_relock_enable = 1;
 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
           &vm_map_relock_enable, 0, "Randomize mmap offsets");
 
+static void vmspace_drop_notoken(struct vmspace *vm);
 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
@@ -210,7 +211,7 @@ vmspace_ctor(void *obj, void *privdata, int ocflags)
        struct vmspace *vm = obj;
 
        bzero(vm, sizeof(*vm));
-       vm->vm_refcnt = (u_int)-1;
+       vm->vm_refcnt = VM_REF_DELETED;
 
        return 1;
 }
@@ -221,7 +222,7 @@ vmspace_dtor(void *obj, void *privdata)
 {
        struct vmspace *vm = obj;
 
-       KKASSERT(vm->vm_refcnt == (u_int)-1);
+       KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
        pmap_puninit(vmspace_pmap(vm));
 }
 
@@ -252,7 +253,7 @@ void
 vmspace_initrefs(struct vmspace *vm)
 {
        vm->vm_refcnt = 1;
-       vm->vm_holdcnt = 1;
+       vm->vm_holdcnt = 0;
 }
 
 /*
@@ -285,7 +286,7 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max)
          * two stages, one on refs 1->0, and the second on hold 1->0.
         */
        KKASSERT(vm->vm_holdcnt == 0);
-       KKASSERT(vm->vm_refcnt == (u_int)-1);
+       KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
        vmspace_initrefs(vm);
        vmspace_hold(vm);
        pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
@@ -299,39 +300,18 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max)
 }
 
 /*
- * NOTE: Can return -1 if the vmspace is exiting.
+ * NOTE: Can return 0 if the vmspace is exiting.
  */
 int
 vmspace_getrefs(struct vmspace *vm)
 {
-       return ((int)vm->vm_refcnt);
-}
-
-/*
- * A vmspace object must already have a non-zero hold to be able to gain
- * further holds on it.
- */
-static void
-vmspace_hold_notoken(struct vmspace *vm)
-{
-       KKASSERT(vm->vm_holdcnt != 0);
-       refcount_acquire(&vm->vm_holdcnt);
-}
-
-static void
-vmspace_drop_notoken(struct vmspace *vm)
-{
-       if (refcount_release(&vm->vm_holdcnt)) {
-               if (vm->vm_refcnt == (u_int)-1) {
-                       vmspace_terminate(vm, 1);
-               }
-       }
+       return ((int)(vm->vm_refcnt & ~VM_REF_DELETED));
 }
 
 void
 vmspace_hold(struct vmspace *vm)
 {
-       vmspace_hold_notoken(vm);
+       atomic_add_int(&vm->vm_holdcnt, 1);
        lwkt_gettoken(&vm->vm_map.token);
 }
 
@@ -342,18 +322,32 @@ vmspace_drop(struct vmspace *vm)
        vmspace_drop_notoken(vm);
 }
 
+static void
+vmspace_drop_notoken(struct vmspace *vm)
+{
+       if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
+               if (vm->vm_refcnt & VM_REF_DELETED)
+                       vmspace_terminate(vm, 1);
+       }
+}
+
 /*
  * A vmspace object must not be in a terminated state to be able to obtain
  * additional refs on it.
  *
- * Ref'ing a vmspace object also increments its hold count.
+ * These are official references to the vmspace, the count is used to check
+ * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
+ *
+ * XXX we need to combine hold & ref together into one 64-bit field to allow
+ * holds to prevent stage-1 termination.
  */
 void
 vmspace_ref(struct vmspace *vm)
 {
-       KKASSERT((int)vm->vm_refcnt >= 0);
-       vmspace_hold_notoken(vm);
-       refcount_acquire(&vm->vm_refcnt);
+       uint32_t n;
+
+       n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
+       KKASSERT((n & VM_REF_DELETED) == 0);
 }
 
 /*
@@ -364,11 +358,30 @@ vmspace_ref(struct vmspace *vm)
 void
 vmspace_rel(struct vmspace *vm)
 {
-       if (refcount_release(&vm->vm_refcnt)) {
-               vm->vm_refcnt = (u_int)-1;      /* no other refs possible */
-               vmspace_terminate(vm, 0);
+       uint32_t n;
+
+       for (;;) {
+               n = vm->vm_refcnt;
+               cpu_ccfence();
+               KKASSERT((int)n > 0);   /* at least one ref & not deleted */
+
+               if (n == 1) {
+                       /*
+                        * We must have a hold first to interlock the
+                        * VM_REF_DELETED check that the drop tests.
+                        */
+                       atomic_add_int(&vm->vm_holdcnt, 1);
+                       if (atomic_cmpset_int(&vm->vm_refcnt, n,
+                                             VM_REF_DELETED)) {
+                               vmspace_terminate(vm, 0);
+                               vmspace_drop_notoken(vm);
+                               break;
+                       }
+                       vmspace_drop_notoken(vm);
+               } else if (atomic_cmpset_int(&vm->vm_refcnt, n, n - 1)) {
+                       break;
+               } /* else retry */
        }
-       vmspace_drop_notoken(vm);
 }
 
 /*
@@ -376,17 +389,17 @@ vmspace_rel(struct vmspace *vm)
  * longer in use by an exiting process, but the process has not yet
  * been reaped.
  *
- * We release the refcnt but not the associated holdcnt.
+ * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
+ * to prevent stage-2 until the process is reaped.  Note the order of
+ * operations: we must hold first.
  *
  * No requirements.
  */
 void
 vmspace_relexit(struct vmspace *vm)
 {
-       if (refcount_release(&vm->vm_refcnt)) {
-               vm->vm_refcnt = (u_int)-1;      /* no other refs possible */
-               vmspace_terminate(vm, 0);
-       }
+       atomic_add_int(&vm->vm_holdcnt, 1);
+       vmspace_rel(vm);
 }
 
 /*
@@ -426,6 +439,7 @@ vmspace_terminate(struct vmspace *vm, int final)
        lwkt_gettoken(&vm->vm_map.token);
        if (final == 0) {
                KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
+               vm->vm_flags |= VMSPACE_EXIT1;
 
                /*
                 * Get rid of most of the resources.  Leave the kernel pmap
@@ -453,7 +467,6 @@ vmspace_terminate(struct vmspace *vm, int final)
                                      VM_MAX_USER_ADDRESS);
                }
                lwkt_reltoken(&vm->vm_map.token);
-               vm->vm_flags |= VMSPACE_EXIT1;
        } else {
                KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
                KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index c9f0da3..4f848ed 100644
@@ -313,6 +313,8 @@ struct vmspace {
        u_int   vm_refcnt;      /* normal ref count */
 };
 
+#define VM_REF_DELETED         0x80000000U
+
 #define VMSPACE_EXIT1          0x0001  /* partial exit */
 #define VMSPACE_EXIT2          0x0002  /* full exit */
 
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 2114d06..8874578 100644
@@ -1194,8 +1194,11 @@ vm_object_terminate(vm_object_t object)
         */
        info.count = 0;
        info.object = object;
-       vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
-                               vm_object_terminate_callback, &info);
+       do {
+               info.error = 0;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
+                                       vm_object_terminate_callback, &info);
+       } while (info.error);
 
        /*
         * Let the pager know object is dead.
@@ -1251,19 +1254,19 @@ vm_object_terminate_callback(vm_page_t p, void *data)
        struct rb_vm_page_scan_info *info = data;
        vm_object_t object;
 
-       if ((++info->count & 63) == 0)
-               lwkt_user_yield();
        object = p->object;
-       if (object != info->object) {
-               kprintf("vm_object_terminate_callback: obj/pg race %p/%p\n",
-                       info->object, p);
-               return(0);
+       KKASSERT(object == info->object);
+       if (vm_page_busy_try(p, TRUE)) {
+               vm_page_sleep_busy(p, TRUE, "vmotrm");
+               info->error = 1;
+               return 0;
        }
-       vm_page_busy_wait(p, TRUE, "vmpgtrm");
        if (object != p->object) {
+               /* XXX remove once we determine it can't happen */
                kprintf("vm_object_terminate: Warning: Encountered "
                        "busied page %p on queue %d\n", p, p->queue);
                vm_page_wakeup(p);
+               info->error = 1;
        } else if (p->wire_count == 0) {
                /*
                 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
@@ -1277,6 +1280,12 @@ vm_object_terminate_callback(vm_page_t p, void *data)
                vm_page_remove(p);
                vm_page_wakeup(p);
        }
+
+       /*
+        * Must be at end to avoid SMP races, caller holds object token
+        */
+       if ((++info->count & 63) == 0)
+               lwkt_user_yield();
        return(0);
 }
 
@@ -1390,25 +1399,24 @@ vm_object_page_clean_pass1(struct vm_page *p, void *data)
 {
        struct rb_vm_page_scan_info *info = data;
 
-       if ((++info->count & 63) == 0)
-               lwkt_user_yield();
-       if (p->object != info->object ||
-           p->pindex < info->start_pindex ||
-           p->pindex > info->end_pindex) {
-               kprintf("vm_object_page_clean_pass1: obj/pg race %p/%p\n",
-                       info->object, p);
-               return(0);
-       }
+       KKASSERT(p->object == info->object);
+
        vm_page_flag_set(p, PG_CLEANCHK);
        if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
                info->error = 1;
-       } else if (vm_page_busy_try(p, FALSE) == 0) {
-               if (p->object == info->object)
-                       vm_page_protect(p, VM_PROT_READ);
-               vm_page_wakeup(p);
-       } else {
+       } else if (vm_page_busy_try(p, FALSE)) {
                info->error = 1;
+       } else {
+               KKASSERT(p->object == info->object);
+               vm_page_protect(p, VM_PROT_READ);
+               vm_page_wakeup(p);
        }
+
+       /*
+        * Must be at end to avoid SMP races, caller holds object token
+        */
+       if ((++info->count & 63) == 0)
+               lwkt_user_yield();
        return(0);
 }
 
@@ -1422,13 +1430,7 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data)
        struct rb_vm_page_scan_info *info = data;
        int generation;
 
-       if (p->object != info->object ||
-           p->pindex < info->start_pindex ||
-           p->pindex > info->end_pindex) {
-               kprintf("vm_object_page_clean_pass2: obj/pg race %p/%p\n",
-                       info->object, p);
-               return(0);
-       }
+       KKASSERT(p->object == info->object);
 
        /*
         * Do not mess with pages that were inserted after we started
@@ -1438,17 +1440,16 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data)
                goto done;
 
        generation = info->object->generation;
-       vm_page_busy_wait(p, TRUE, "vpcwai");
 
-       if (p->object != info->object ||
-           p->pindex < info->start_pindex ||
-           p->pindex > info->end_pindex ||
-           info->object->generation != generation) {
+       if (vm_page_busy_try(p, TRUE)) {
+               vm_page_sleep_busy(p, TRUE, "vpcwai");
                info->error = 1;
-               vm_page_wakeup(p);
                goto done;
        }
 
+       KKASSERT(p->object == info->object &&
+                info->object->generation == generation);
+
        /*
         * Before wasting time traversing the pmaps, check for trivial
         * cases where the page cannot be dirty.
@@ -1491,10 +1492,13 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data)
         */
        vm_object_page_collect_flush(info->object, p, info->pagerflags);
        /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
+
+       /*
+        * Must be at end to avoid SMP races, caller holds object token
+        */
 done:
        if ((++info->count & 63) == 0)
                lwkt_user_yield();
-
        return(0);
 }
 
@@ -1644,14 +1648,19 @@ vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 
        if (object == NULL)
                return;
+       if (start == end)
+               return;
        info.start_pindex = start;
        info.end_pindex = end - 1;
        info.count = 0;
        info.object = object;
 
        vm_object_hold(object);
-       vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
-                               vm_object_pmap_remove_callback, &info);
+       do {
+               info.error = 0;
+               vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
+                                       vm_object_pmap_remove_callback, &info);
+       } while (info.error);
        if (start == 0 && end == object->size)
                vm_object_clear_flag(object, OBJ_WRITEABLE);
        vm_object_drop(object);
@@ -1665,19 +1674,22 @@ vm_object_pmap_remove_callback(vm_page_t p, void *data)
 {
        struct rb_vm_page_scan_info *info = data;
 
-       if ((++info->count & 63) == 0)
-               lwkt_user_yield();
-
        if (info->object != p->object ||
            p->pindex < info->start_pindex ||
            p->pindex > info->end_pindex) {
                kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
                        info->object, p);
+               info->error = 1;
                return(0);
        }
 
        vm_page_protect(p, VM_PROT_NONE);
 
+       /*
+        * Must be at end to avoid SMP races, caller holds object token
+        */
+       if ((++info->count & 63) == 0)
+               lwkt_user_yield();
        return(0);
 }
 
@@ -2714,9 +2726,6 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
 {
        struct rb_vm_page_scan_info *info = data;
 
-       if ((++info->count & 63) == 0)
-               lwkt_user_yield();
-
        if (info->object != p->object ||
            p->pindex < info->start_pindex ||
            p->pindex > info->end_pindex) {
@@ -2753,7 +2762,7 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
                if (info->limit == 0)
                        p->valid = 0;
                vm_page_wakeup(p);
-               return(0);
+               goto done;
        }
 
        /*
@@ -2765,7 +2774,7 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
                vm_page_test_dirty(p);
                if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
                        vm_page_wakeup(p);
-                       return(0);
+                       goto done;
                }
        }
 
@@ -2775,6 +2784,13 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
        vm_page_protect(p, VM_PROT_NONE);
        vm_page_free(p);
 
+       /*
+        * Must be at end to avoid SMP races, caller holds object token
+        */
+done:
+       if ((++info->count & 63) == 0)
+               lwkt_user_yield();
+
        return(0);
 }
 
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index a1f814e..071101d 100644
@@ -649,6 +649,10 @@ vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
        } else {
                vm_page_wakeup(p);
        }
+
+       /*
+        * Must be at end to avoid SMP races.
+        */
 done:
        lwkt_user_yield();
        return 0;