From 6d538b476425b98b7d28d50562ab45c9a9949911 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 6 Oct 2013 18:50:36 -0700 Subject: [PATCH] kernel - Rewrite do_vmtotal and change the way VM statistics are collected * The vmtotal sysctl was iterating through all VM objects. This is a problem on machines with huge amounts of memory which might have millions of VM objects. * Collect running VM statistics in the swap pager and vm_page modules, on a per-cpu basis. Add a struct vmtotal structure to globaldata. Active real memory use is how many VM pages are mapped to processes. Total real memory use is how many VM pages are allocated whether they are mapped to processes or not. Shared real memory use represents VM pages mapped to more than one process. Total virtual memory use uses total real memory plus allocated swap space. Remaining fields are left 0 and not currently supported. * Represents a more realistic view of memory and VM. In particular, totalling up the file sizes for all mmap()'d files is no longer a collected statistic because the system really has no way of knowing how little or how much of the file is 'active', or even ever accessed. * The vmtotal sysctl (e.g. used by systat -vm 1) now just iterates cpus to aggregate gd_vmtotal for VM statistics. This is basically O(1) for VM statistics. It still iterates processes (which we will want to fix too, eventually), but the main scaling issue was with VM objects and that has been fixed. 
--- sys/platform/pc64/include/pmap.h | 3 +- sys/platform/pc64/x86_64/pmap.c | 35 ++++++++- sys/sys/globaldata.h | 1 + sys/sys/vmmeter.h | 3 +- sys/vm/swap_pager.c | 8 +- sys/vm/vm_meter.c | 125 +++++++------------------------ sys/vm/vm_page.c | 6 +- 7 files changed, 74 insertions(+), 107 deletions(-) diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h index 0923318b53..cc413dca40 100644 --- a/sys/platform/pc64/include/pmap.h +++ b/sys/platform/pc64/include/pmap.h @@ -214,11 +214,12 @@ struct vm_page; struct vm_object; struct vmspace; +TAILQ_HEAD(md_page_pv_list, pv_entry); /* * vm_page structures embed a list of related pv_entry's */ struct md_page { - TAILQ_HEAD(,pv_entry) pv_list; + struct md_page_pv_list pv_list; }; /* diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index e679d3998f..5b9c58790c 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -308,6 +308,34 @@ pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, pv_entry_compare, vm_pindex_t, pv_pindex); +static __inline +void +pmap_page_stats_adding(vm_page_t m) +{ + globaldata_t gd = mycpu; + + if (TAILQ_EMPTY(&m->md.pv_list)) { + ++gd->gd_vmtotal.t_arm; + } else if (TAILQ_FIRST(&m->md.pv_list) == + TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { + ++gd->gd_vmtotal.t_armshr; + } +} + +static __inline +void +pmap_page_stats_deleting(vm_page_t m) +{ + globaldata_t gd = mycpu; + + if (TAILQ_EMPTY(&m->md.pv_list)) { + --gd->gd_vmtotal.t_arm; + } else if (TAILQ_FIRST(&m->md.pv_list) == + TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) { + --gd->gd_vmtotal.t_armshr; + } +} + /* * Move the kernel virtual free pointer to the next * 2MB. 
This is used to help improve performance @@ -1879,6 +1907,7 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) vm_wait(0); } vm_page_spin_lock(m); + pmap_page_stats_adding(m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pv->pv_m = m; vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); @@ -2540,6 +2569,7 @@ pmap_remove_pv_page(pv_entry_t pv) vm_page_spin_lock(m); pv->pv_m = NULL; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + pmap_page_stats_deleting(m); /* if (m->object) atomic_add_int(&m->object->agg_pv_list_count, -1); @@ -4082,11 +4112,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, KKASSERT(pte_pv->pv_m == NULL); vm_page_spin_lock(m); pte_pv->pv_m = m; + pmap_page_stats_adding(m); TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); - /* - if (m->object) - atomic_add_int(&m->object->agg_pv_list_count, 1); - */ vm_page_flag_set(m, PG_MAPPED); vm_page_spin_unlock(m); } else if (pt_pv && opa == 0) { diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index ca97a64921..3ac989b57f 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -133,6 +133,7 @@ struct globaldata { struct timeval gd_stattv; int gd_intr_nesting_level; /* hard code, intrs, ipis */ struct vmmeter gd_cnt; + struct vmtotal gd_vmtotal; cpumask_t gd_ipimask; /* pending ipis from cpus */ struct lwkt_ipiq *gd_ipiq; /* array[ncpu] of ipiq's */ struct lwkt_ipiq gd_cpusyncq; /* ipiq for cpu synchro */ diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h index 0bb1265f89..1a62ec08b0 100644 --- a/sys/sys/vmmeter.h +++ b/sys/sys/vmmeter.h @@ -142,8 +142,7 @@ extern struct vmstats vmstats; #endif /* systemwide totals computed every five seconds */ -struct vmtotal -{ +struct vmtotal { long t_rq; /* length of the run queue */ long t_dw; /* jobs in ``disk wait'' (neg priority) */ long t_pw; /* jobs in page wait */ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index d2bb9bf02e..b413c6fb9c 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ 
-2185,14 +2185,17 @@ retry: /* can block */ swp_pager_freeswapspace(object, v, 1); --swap->swb_count; + --mycpu->gd_vmtotal.t_vm; } /* * Enter block into metadata */ swap->swb_pages[index] = swapblk; - if (swapblk != SWAPBLK_NONE) + if (swapblk != SWAPBLK_NONE) { ++swap->swb_count; + ++mycpu->gd_vmtotal.t_vm; + } } /* @@ -2279,6 +2282,7 @@ swp_pager_meta_free_callback(struct swblock *swap, void *data) swap->swb_pages[index] = SWAPBLK_NONE; /* can block */ swp_pager_freeswapspace(object, v, 1); + --mycpu->gd_vmtotal.t_vm; if (--swap->swb_count == 0) { swp_pager_remove(object, swap); zfree(swap_zone, swap); @@ -2322,6 +2326,7 @@ swp_pager_meta_free_all(vm_object_t object) /* can block */ swp_pager_freeswapspace(object, v, 1); --swap->swb_count; + --mycpu->gd_vmtotal.t_vm; } } if (swap->swb_count != 0) @@ -2374,6 +2379,7 @@ swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags) if (r1 != SWAPBLK_NONE) { if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[index] = SWAPBLK_NONE; + --mycpu->gd_vmtotal.t_vm; if (--swap->swb_count == 0) { swp_pager_remove(object, swap); zfree(swap_zone, swap); diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 5c3268c2c2..bd5fd17b21 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -90,114 +90,45 @@ static int do_vmtotal(SYSCTL_HANDLER_ARGS) { struct vmtotal total; - struct vmtotal *totalp; - struct vm_object marker; - vm_object_t object; - long collisions; - int burst; + globaldata_t gd; + int n; bzero(&total, sizeof(total)); - totalp = &total; - bzero(&marker, sizeof(marker)); - marker.type = OBJT_MARKER; - collisions = vmobj_token.t_collisions; - -#if 0 - /* - * Mark all objects as inactive. 
- */ - lwkt_gettoken(&vmobj_token); - for (object = TAILQ_FIRST(&vm_object_list); - object != NULL; - object = TAILQ_NEXT(object,object_list)) { - if (object->type == OBJT_MARKER) - continue; - vm_object_clear_flag(object, OBJ_ACTIVE); + for (n = 0; n < ncpus; ++n) { + gd = globaldata_find(n); + + /* total.t_rq calculated separately */ + /* total.t_dw calculated separately */ + /* total.t_pw calculated separately */ + /* total.t_sl calculated separately */ + /* total.t_sw calculated separately */ + total.t_vm += gd->gd_vmtotal.t_vm; + total.t_avm += gd->gd_vmtotal.t_avm; + total.t_rm += gd->gd_vmtotal.t_rm; + total.t_arm += gd->gd_vmtotal.t_arm; + total.t_vmshr += gd->gd_vmtotal.t_vmshr; + total.t_avmshr += gd->gd_vmtotal.t_avmshr; + total.t_rmshr += gd->gd_vmtotal.t_rmshr; + total.t_armshr += gd->gd_vmtotal.t_armshr; + /* total.t_free calculated separately */ } - lwkt_reltoken(&vmobj_token); -#endif /* * Calculate process statistics. */ - allproc_scan(do_vmtotal_callback, totalp); + allproc_scan(do_vmtotal_callback, &total); /* - * Calculate object memory usage statistics. + * Adjust for sysctl return. Add real memory into virtual memory. + * Set t_free. + * + * t_rm - Real memory + * t_vm - Virtual memory (real + swap) */ - lwkt_gettoken(&vmobj_token); - TAILQ_INSERT_HEAD(&vm_object_list, &marker, object_list); - burst = 0; - - for (object = TAILQ_FIRST(&vm_object_list); - object != NULL; - object = TAILQ_NEXT(object, object_list)) { - /* - * devices, like /dev/mem, will badly skew our totals. - * markers aren't real objects. - */ - if (object->type == OBJT_MARKER) - continue; - if (object->type == OBJT_DEVICE) - continue; - if (object->type == OBJT_MGTDEVICE) - continue; - if (object->size >= 0x7FFFFFFF) { - /* - * Probably unbounded anonymous memory (really - * bounded by related vm_map_entry structures which - * we do not have access to in this loop). 
- */ - totalp->t_vm += object->resident_page_count; - } else { - /* - * It's questionable how useful this is but... - */ - totalp->t_vm += object->size; - } - totalp->t_rm += object->resident_page_count; - if (object->flags & OBJ_ACTIVE) { - totalp->t_avm += object->size; - totalp->t_arm += object->resident_page_count; - } - if (object->shadow_count > 1) { - /* shared object */ - totalp->t_vmshr += object->size; - totalp->t_rmshr += object->resident_page_count; - if (object->flags & OBJ_ACTIVE) { - totalp->t_avmshr += object->size; - totalp->t_armshr += object->resident_page_count; - } - } - - /* - * Don't waste time unnecessarily - */ - if (++burst < 25) - continue; - burst = 0; - - /* - * Don't hog the vmobj_token if someone else wants it. - */ - TAILQ_REMOVE(&vm_object_list, &marker, object_list); - TAILQ_INSERT_AFTER(&vm_object_list, object, - &marker, object_list); - object = &marker; - if (collisions != vmobj_token.t_collisions) { - tsleep(&vm_object_list, 0, "breath", 1); - collisions = vmobj_token.t_collisions; - } else { - lwkt_yield(); - } - } - - TAILQ_REMOVE(&vm_object_list, &marker, object_list); - lwkt_reltoken(&vmobj_token); - - totalp->t_free = vmstats.v_free_count + vmstats.v_cache_count; + total.t_vm += total.t_rm; + total.t_free = vmstats.v_free_count + vmstats.v_cache_count; - return (sysctl_handle_opaque(oidp, totalp, sizeof total, req)); + return (sysctl_handle_opaque(oidp, &total, sizeof(total), req)); } /* diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 1159994e15..584ad31427 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -973,7 +973,8 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) vm_page_spin_unlock(m); return FALSE; } - object->resident_page_count++; + ++object->resident_page_count; + ++mycpu->gd_vmtotal.t_rm; /* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */ vm_page_spin_unlock(m); @@ -1027,7 +1028,8 @@ vm_page_remove(vm_page_t m) */ vm_page_spin_lock(m); 
vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m); - object->resident_page_count--; + --object->resident_page_count; + --mycpu->gd_vmtotal.t_rm; /* atomic_add_int(&object->agg_pv_list_count, -m->md.pv_list_count); */ m->object = NULL; vm_page_spin_unlock(m); -- 2.41.0