From 07540d375a7f9dfdadc9976982dd8d0a1fd6794e Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 28 Jan 2017 14:08:39 -0800
Subject: [PATCH] kernel - Fix races in the vm_map and vm_object page-scanning code

* Calling lwkt_yield() while depending on the vm_object token to stabilize
  a vm_page_t is a bad idea.  Move the calls to the end of the routine
  where they don't hurt anyone (the RB tree scanning code can handle
  ripouts, the callback functions cannot).

* Unconditionally busying a vm_page_t in an RB tree scanning callback and
  then trying to re-check it to see if it's still ok is a bad idea.
  Instead, use vm_page_busy_try() so as not to break the object token
  lock.  If it fails, sleep on the page and set a retry condition but do
  not act on it.

* Fix an issue in vm_object_pmap_remove() where the function operates
  incorrectly if passed a 0-length address range.
---
 sys/vm/vm_map.c     |  97 +++++++++++++++++++++-----------------
 sys/vm/vm_map.h     |   2 +
 sys/vm/vm_object.c  | 110 +++++++++++++++++++++++++-------------------
 sys/vm/vm_pageout.c |   4 ++
 4 files changed, 124 insertions(+), 89 deletions(-)

diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index b89c9d084c..3265b79698 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -143,6 +143,7 @@ static int vm_map_relock_enable = 1;
 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
 	   &vm_map_relock_enable, 0, "Randomize mmap offsets");
 
+static void vmspace_drop_notoken(struct vmspace *vm);
 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
@@ -210,7 +211,7 @@ vmspace_ctor(void *obj, void *privdata, int ocflags)
 	struct vmspace *vm = obj;
 
 	bzero(vm, sizeof(*vm));
-	vm->vm_refcnt = (u_int)-1;
+	vm->vm_refcnt = VM_REF_DELETED;
 
 	return 1;
 }
@@ -221,7 +222,7 @@ vmspace_dtor(void *obj, void *privdata)
 {
 	struct vmspace *vm = obj;
 
-	KKASSERT(vm->vm_refcnt == (u_int)-1);
+	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 	pmap_puninit(vmspace_pmap(vm));
 }
 
@@ -252,7 +253,7 @@ void
 vmspace_initrefs(struct vmspace *vm)
 {
 	vm->vm_refcnt = 1;
-	vm->vm_holdcnt = 1;
+	vm->vm_holdcnt = 0;
 }
 
 /*
@@ -285,7 +286,7 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max)
 	 * two stages, one on refs 1->0, and the second on hold 1->0.
 	 */
 	KKASSERT(vm->vm_holdcnt == 0);
-	KKASSERT(vm->vm_refcnt == (u_int)-1);
+	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
 	vmspace_initrefs(vm);
 	vmspace_hold(vm);
 	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
@@ -299,39 +300,18 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max)
 }
 
 /*
- * NOTE: Can return -1 if the vmspace is exiting.
+ * NOTE: Can return 0 if the vmspace is exiting.
  */
 int
 vmspace_getrefs(struct vmspace *vm)
 {
-	return ((int)vm->vm_refcnt);
-}
-
-/*
- * A vmspace object must already have a non-zero hold to be able to gain
- * further holds on it.
- */
-static void
-vmspace_hold_notoken(struct vmspace *vm)
-{
-	KKASSERT(vm->vm_holdcnt != 0);
-	refcount_acquire(&vm->vm_holdcnt);
-}
-
-static void
-vmspace_drop_notoken(struct vmspace *vm)
-{
-	if (refcount_release(&vm->vm_holdcnt)) {
-		if (vm->vm_refcnt == (u_int)-1) {
-			vmspace_terminate(vm, 1);
-		}
-	}
+	return ((int)(vm->vm_refcnt & ~VM_REF_DELETED));
 }
 
 void
 vmspace_hold(struct vmspace *vm)
 {
-	vmspace_hold_notoken(vm);
+	atomic_add_int(&vm->vm_holdcnt, 1);
 	lwkt_gettoken(&vm->vm_map.token);
 }
 
@@ -342,18 +322,32 @@ vmspace_drop(struct vmspace *vm)
 	vmspace_drop_notoken(vm);
 }
 
+static void
+vmspace_drop_notoken(struct vmspace *vm)
+{
+	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
+		if (vm->vm_refcnt & VM_REF_DELETED)
+			vmspace_terminate(vm, 1);
+	}
+}
+
 /*
  * A vmspace object must not be in a terminated state to be able to obtain
  * additional refs on it.
  *
- * Ref'ing a vmspace object also increments its hold count.
+ * These are official references to the vmspace, the count is used to check
+ * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
+ *
+ * XXX we need to combine hold & ref together into one 64-bit field to allow
+ * holds to prevent stage-1 termination.
  */
 void
 vmspace_ref(struct vmspace *vm)
 {
-	KKASSERT((int)vm->vm_refcnt >= 0);
-	vmspace_hold_notoken(vm);
-	refcount_acquire(&vm->vm_refcnt);
+	uint32_t n;
+
+	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
+	KKASSERT((n & VM_REF_DELETED) == 0);
 }
 
 /*
@@ -364,11 +358,30 @@ vmspace_ref(struct vmspace *vm)
 void
 vmspace_rel(struct vmspace *vm)
 {
-	if (refcount_release(&vm->vm_refcnt)) {
-		vm->vm_refcnt = (u_int)-1;	/* no other refs possible */
-		vmspace_terminate(vm, 0);
+	uint32_t n;
+
+	for (;;) {
+		n = vm->vm_refcnt;
+		cpu_ccfence();
+		KKASSERT((int)n > 0);	/* at least one ref & not deleted */
+
+		if (n == 1) {
+			/*
+			 * We must have a hold first to interlock the
+			 * VM_REF_DELETED check that the drop tests.
+			 */
+			atomic_add_int(&vm->vm_holdcnt, 1);
+			if (atomic_cmpset_int(&vm->vm_refcnt, n,
+					      VM_REF_DELETED)) {
+				vmspace_terminate(vm, 0);
+				vmspace_drop_notoken(vm);
+				break;
+			}
+			vmspace_drop_notoken(vm);
+		} else if (atomic_cmpset_int(&vm->vm_refcnt, n, n - 1)) {
+			break;
+		} /* else retry */
 	}
-	vmspace_drop_notoken(vm);
 }
 
 /*
@@ -376,17 +389,17 @@ vmspace_rel(struct vmspace *vm)
  * longer in use by an exiting process, but the process has not yet
  * been reaped.
  *
- * We release the refcnt but not the associated holdcnt.
+ * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
+ * to prevent stage-2 until the process is reaped.  Note the order of
+ * operation, we must hold first.
  *
  * No requirements.
  */
 void
 vmspace_relexit(struct vmspace *vm)
 {
-	if (refcount_release(&vm->vm_refcnt)) {
-		vm->vm_refcnt = (u_int)-1;	/* no other refs possible */
-		vmspace_terminate(vm, 0);
-	}
+	atomic_add_int(&vm->vm_holdcnt, 1);
+	vmspace_rel(vm);
 }
 
 /*
@@ -426,6 +439,7 @@ vmspace_terminate(struct vmspace *vm, int final)
 	lwkt_gettoken(&vm->vm_map.token);
 	if (final == 0) {
 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
+		vm->vm_flags |= VMSPACE_EXIT1;
 
 		/*
 		 * Get rid of most of the resources.
Leave the kernel pmap @@ -453,7 +467,6 @@ vmspace_terminate(struct vmspace *vm, int final) VM_MAX_USER_ADDRESS); } lwkt_reltoken(&vm->vm_map.token); - vm->vm_flags |= VMSPACE_EXIT1; } else { KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0); KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0); diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index c9f0da35d2..4f848ed98e 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -313,6 +313,8 @@ struct vmspace { u_int vm_refcnt; /* normal ref count */ }; +#define VM_REF_DELETED 0x80000000U + #define VMSPACE_EXIT1 0x0001 /* partial exit */ #define VMSPACE_EXIT2 0x0002 /* full exit */ diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 2114d06e34..88745786a4 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -1194,8 +1194,11 @@ vm_object_terminate(vm_object_t object) */ info.count = 0; info.object = object; - vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, - vm_object_terminate_callback, &info); + do { + info.error = 0; + vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, + vm_object_terminate_callback, &info); + } while (info.error); /* * Let the pager know object is dead. @@ -1251,19 +1254,19 @@ vm_object_terminate_callback(vm_page_t p, void *data) struct rb_vm_page_scan_info *info = data; vm_object_t object; - if ((++info->count & 63) == 0) - lwkt_user_yield(); object = p->object; - if (object != info->object) { - kprintf("vm_object_terminate_callback: obj/pg race %p/%p\n", - info->object, p); - return(0); + KKASSERT(object == info->object); + if (vm_page_busy_try(p, TRUE)) { + vm_page_sleep_busy(p, TRUE, "vmotrm"); + info->error = 1; + return 0; } - vm_page_busy_wait(p, TRUE, "vmpgtrm"); if (object != p->object) { + /* XXX remove once we determine it can't happen */ kprintf("vm_object_terminate: Warning: Encountered " "busied page %p on queue %d\n", p, p->queue); vm_page_wakeup(p); + info->error = 1; } else if (p->wire_count == 0) { /* * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 
@@ -1277,6 +1280,12 @@ vm_object_terminate_callback(vm_page_t p, void *data) vm_page_remove(p); vm_page_wakeup(p); } + + /* + * Must be at end to avoid SMP races, caller holds object token + */ + if ((++info->count & 63) == 0) + lwkt_user_yield(); return(0); } @@ -1390,25 +1399,24 @@ vm_object_page_clean_pass1(struct vm_page *p, void *data) { struct rb_vm_page_scan_info *info = data; - if ((++info->count & 63) == 0) - lwkt_user_yield(); - if (p->object != info->object || - p->pindex < info->start_pindex || - p->pindex > info->end_pindex) { - kprintf("vm_object_page_clean_pass1: obj/pg race %p/%p\n", - info->object, p); - return(0); - } + KKASSERT(p->object == info->object); + vm_page_flag_set(p, PG_CLEANCHK); if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { info->error = 1; - } else if (vm_page_busy_try(p, FALSE) == 0) { - if (p->object == info->object) - vm_page_protect(p, VM_PROT_READ); - vm_page_wakeup(p); - } else { + } else if (vm_page_busy_try(p, FALSE)) { info->error = 1; + } else { + KKASSERT(p->object == info->object); + vm_page_protect(p, VM_PROT_READ); + vm_page_wakeup(p); } + + /* + * Must be at end to avoid SMP races, caller holds object token + */ + if ((++info->count & 63) == 0) + lwkt_user_yield(); return(0); } @@ -1422,13 +1430,7 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data) struct rb_vm_page_scan_info *info = data; int generation; - if (p->object != info->object || - p->pindex < info->start_pindex || - p->pindex > info->end_pindex) { - kprintf("vm_object_page_clean_pass2: obj/pg race %p/%p\n", - info->object, p); - return(0); - } + KKASSERT(p->object == info->object); /* * Do not mess with pages that were inserted after we started @@ -1438,17 +1440,16 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data) goto done; generation = info->object->generation; - vm_page_busy_wait(p, TRUE, "vpcwai"); - if (p->object != info->object || - p->pindex < info->start_pindex || - p->pindex > info->end_pindex || - info->object->generation != generation) { + if (vm_page_busy_try(p, TRUE)) { + vm_page_sleep_busy(p, TRUE, "vpcwai"); info->error = 1; - vm_page_wakeup(p); goto done; } + KKASSERT(p->object == info->object && + info->object->generation == generation); + /* * Before wasting time traversing the pmaps, check for trivial * cases where the page cannot be dirty. 
@@ -1491,10 +1492,13 @@ vm_object_page_clean_pass2(struct vm_page *p, void *data) */ vm_object_page_collect_flush(info->object, p, info->pagerflags); /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ + + /* + * Must be at end to avoid SMP races, caller holds object token + */ done: if ((++info->count & 63) == 0) lwkt_user_yield(); - return(0); } @@ -1644,14 +1648,19 @@ vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end) if (object == NULL) return; + if (start == end) + return; info.start_pindex = start; info.end_pindex = end - 1; info.count = 0; info.object = object; vm_object_hold(object); - vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, - vm_object_pmap_remove_callback, &info); + do { + info.error = 0; + vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, + vm_object_pmap_remove_callback, &info); + } while (info.error); if (start == 0 && end == object->size) vm_object_clear_flag(object, OBJ_WRITEABLE); vm_object_drop(object); @@ -1665,19 +1674,22 @@ vm_object_pmap_remove_callback(vm_page_t p, void *data) { struct rb_vm_page_scan_info *info = data; - if ((++info->count & 63) == 0) - lwkt_user_yield(); - if (info->object != p->object || p->pindex < info->start_pindex || p->pindex > info->end_pindex) { kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n", info->object, p); + info->error = 1; return(0); } vm_page_protect(p, VM_PROT_NONE); + /* + * Must be at end to avoid SMP races, caller holds object token + */ + if ((++info->count & 63) == 0) + lwkt_user_yield(); return(0); } @@ -2714,9 +2726,6 @@ vm_object_page_remove_callback(vm_page_t p, void *data) { struct rb_vm_page_scan_info *info = data; - if ((++info->count & 63) == 0) - lwkt_user_yield(); - if (info->object != p->object || p->pindex < info->start_pindex || p->pindex > info->end_pindex) { @@ -2753,7 +2762,7 @@ vm_object_page_remove_callback(vm_page_t p, void *data) if (info->limit == 0) p->valid = 0; vm_page_wakeup(p); - return(0); + goto done; } /* @@ -2765,7 +2774,7 @@ vm_object_page_remove_callback(vm_page_t p, void *data) vm_page_test_dirty(p); if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { vm_page_wakeup(p); - return(0); + goto done; } } @@ -2775,6 +2784,13 @@ vm_object_page_remove_callback(vm_page_t p, void *data) vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); + /* + * Must be at end to avoid SMP races, caller holds object token + */ +done: + if ((++info->count & 63) == 0) + lwkt_user_yield(); + return(0); } diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index a1f814ef81..071101ddb0 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -649,6 +649,10 @@ vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, } else { vm_page_wakeup(p); } + + /* + * Must be at end to avoid SMP races. + */ done: lwkt_user_yield(); return 0; -- 2.41.0
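
For readers who want to see the retry pattern from the vm_object.c hunks in
isolation, below is a minimal user-space C sketch of the same idea.  It is
not DragonFly kernel code: page_busy_try(), page_wakeup(), struct scan_info
and terminate_callback() are hypothetical stand-ins for vm_page_busy_try(),
vm_page_wakeup(), struct rb_vm_page_scan_info and the RB tree scan
callbacks.  The callback only attempts a non-blocking busy, records that a
retry is needed instead of blocking while the object token is the only
thing stabilizing the page, and the caller re-runs the whole scan until a
pass completes without contention; any cooperative yield happens only at
the end of the callback, after the page has been dealt with.

/*
 * Minimal user-space sketch of the "busy-try + retry" scan pattern used in
 * the patch above.  All names are hypothetical; not DragonFly kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct page {
	int busy;		/* nonzero while another thread "owns" the page */
	int wire_count;
	int freed;
};

struct scan_info {
	int error;		/* set when the whole scan must be retried */
	int count;
};

/* Non-blocking acquire; returns true on failure, like vm_page_busy_try(). */
static bool
page_busy_try(struct page *p)
{
	if (p->busy)
		return true;
	p->busy = 1;
	return false;
}

static void
page_wakeup(struct page *p)
{
	p->busy = 0;
}

/*
 * Per-page callback.  It never blocks while the caller's lock is the only
 * thing stabilizing 'p'; on contention it just records that a retry is
 * needed and moves on.
 */
static int
terminate_callback(struct page *p, void *data)
{
	struct scan_info *info = data;

	if (page_busy_try(p)) {
		/* In the kernel this would sleep on the page, then retry. */
		info->error = 1;
		return 0;
	}
	if (p->wire_count == 0)
		p->freed = 1;		/* stand-in for vm_page_free() */
	page_wakeup(p);

	/*
	 * A periodic cooperative yield (lwkt_user_yield() in the kernel)
	 * belongs here, at the end, after the page has been dealt with.
	 */
	++info->count;
	return 0;
}

int
main(void)
{
	struct page pages[4] = { {0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 0} };
	struct scan_info info;
	int passes = 0;

	/* Re-run the scan until a full pass completes without contention. */
	do {
		info.error = 0;
		info.count = 0;
		for (int i = 0; i < 4; i++)
			terminate_callback(&pages[i], &info);
		pages[1].busy = 0;	/* simulate the contending thread finishing */
		passes++;
	} while (info.error && passes < 10);

	printf("completed after %d pass(es)\n", passes);
	return 0;
}

The same structure appears in vm_object_terminate() and
vm_object_pmap_remove() above: the do/while on info.error replaces the old
single-pass scan, and lwkt_user_yield() moves to the tail of each callback.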