From 79634a6643a5f76dd3cf8995a5c054ba6ad27192 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 12 Aug 2009 09:52:29 -0700
Subject: [PATCH] swap, amd64 - increase maximum swap space to 1TB x 4

* The radix can overflow a 32 bit integer even if swblk_t fits in 32 bits.
  Expand the radix to 64 bits and thus allow the subr_blist code to operate
  on up to 2 billion blocks (8TB total).

* Shortcut the common single-swap-device case.  We do not have to scan the
  radix tree to get available space in the single-device case.

* Change maxswzone and maxbcache to longs and add TUNABLE_LONG_FETCH().

* All the TUNABLE_*_FETCH() calls and kgetenv_*() calls for integers
  call kgetenv_quad().  Adjust kgetenv_quad() to accept a suffix for
  kilobytes, megabytes, gigabytes, and terabytes.
---
 lib/libkvm/kvm_getswapinfo.c  |  97 ++++++++++++++++++++-----------
 sys/cpu/amd64/include/param.h |   7 ++-
 sys/kern/kern_environment.c   |  40 +++++++++++++
 sys/kern/subr_blist.c         | 104 ++++++++++++++++++++--------------
 sys/kern/subr_param.c         |   8 +--
 sys/sys/blist.h               |  14 ++++-
 sys/sys/buf.h                 |   4 +-
 sys/sys/kernel.h              |   1 +
 sys/sys/systm.h               |   1 +
 sys/vm/swap_pager.c           |  12 ++--
 sys/vm/vm_swap.c              |  43 +++++++-------
 11 files changed, 219 insertions(+), 112 deletions(-)
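Note (free-form area, ignored by git-am): the standalone userland sketch below
is not part of the diff; it mirrors the k/m/g/t suffix handling this patch adds
to kgetenv_quad(), so the multiplier logic can be sanity-checked outside the
kernel.  The helper name parse_size_suffix() is invented for the example; the
kernel routine itself parses with strtoq() into a quad_t.

    #include <stdio.h>
    #include <stdlib.h>

    /* Mirrors kgetenv_quad()'s parse: returns 0 on failure, non-zero on success. */
    static int
    parse_size_suffix(const char *value, long long *data)
    {
            char *vtp;
            long long iv;

            iv = strtoll(value, &vtp, 0);
            switch (*vtp) {
            case 't': case 'T':
                    iv <<= 10;
                    /* fall through */
            case 'g': case 'G':
                    iv <<= 10;
                    /* fall through */
            case 'm': case 'M':
                    iv <<= 10;
                    /* fall through */
            case 'k': case 'K':
                    iv <<= 10;
                    ++vtp;
                    break;
            default:
                    break;
            }
            if (vtp == value || *vtp != '\0')
                    return (0);
            *data = iv;
            return (1);
    }

    int
    main(void)
    {
            long long v;

            if (parse_size_suffix("512m", &v))      /* 536870912 */
                    printf("512m -> %lld\n", v);
            if (parse_size_suffix("1T", &v))        /* 1099511627776 */
                    printf("1T   -> %lld\n", v);
            return (0);
    }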
diff --git a/lib/libkvm/kvm_getswapinfo.c b/lib/libkvm/kvm_getswapinfo.c
index 7a9bc3c0ba..66c9259074 100644
--- a/lib/libkvm/kvm_getswapinfo.c
+++ b/lib/libkvm/kvm_getswapinfo.c
@@ -234,10 +234,11 @@ kvm_getswapinfo(
 static int
 scanradix(
 	blmeta_t *scan,
-	daddr_t blk,
-	daddr_t radix,
-	daddr_t skip,
-	daddr_t count,
+	blmeta_t *scan_cache,
+	swblk_t blk,
+	int64_t radix,
+	swblk_t skip,
+	swblk_t count,
 	kvm_t *kd,
 	int dmmax,
 	int nswdev,
@@ -247,19 +248,30 @@ scanradix(
 	int flags
 ) {
 	blmeta_t meta;
+	blmeta_t scan_array[BLIST_BMAP_RADIX];
 	int ti = (unswdev >= swap_max) ?
 			swap_max - 1 : unswdev;
 
-	KGET2(scan, &meta, sizeof(meta), "blmeta_t");
+	if (scan_cache) {
+		meta = *scan_cache;
+	} else if (skip == BLIST_META_RADIX) {
+		if (kvm_read(kd, (u_long)scan, scan_array,
+			     sizeof(scan_array)) != sizeof(scan_array)) {
+			warnx("cannot read %s: %s", "blmeta_t",
+			      kvm_geterr(kd));
+			bzero(scan_array, sizeof(scan_array));
+		}
+		meta = scan_array[0];
+	} else {
+		KGET2(scan, &meta, sizeof(meta), "blmeta_t");
+	}
 
 	/*
 	 * Terminator
 	 */
-	if (meta.bm_bighint == (daddr_t)-1) {
+	if (meta.bm_bighint == (swblk_t)-1) {
 		if (flags & SWIF_DUMP_TREE) {
-			printf("%*.*s(0x%06x,%d) Terminator\n",
+			printf("%*.*s(0x%06x,%lld) Terminator\n",
 				TABME,
 				blk,
-				radix
+				(long long)radix
 			);
 		}
 		return(-1);
@@ -272,10 +284,10 @@ scanradix(
 		int i;
 
 		if (flags & SWIF_DUMP_TREE) {
-			printf("%*.*s(0x%06x,%d) Bitmap %08x big=%d\n",
+			printf("%*.*s(0x%06x,%lld) Bitmap %08x big=%d\n",
 				TABME,
 				blk,
-				radix,
+				(long long)radix,
 				(int)meta.u.bmu_bitmap,
 				meta.bm_bighint
 			);
@@ -306,10 +318,10 @@
 		 * Meta node if all free
 		 */
 		if (flags & SWIF_DUMP_TREE) {
-			printf("%*.*s(0x%06x,%d) Submap ALL-FREE {\n",
+			printf("%*.*s(0x%06x,%lld) Submap ALL-FREE {\n",
 				TABME,
 				blk,
-				radix
+				(long long)radix
 			);
 		}
 		/*
@@ -338,10 +350,10 @@
 		 * Meta node if all used
 		 */
 		if (flags & SWIF_DUMP_TREE) {
-			printf("%*.*s(0x%06x,%d) Submap ALL-ALLOCATED\n",
+			printf("%*.*s(0x%06x,%lld) Submap ALL-ALLOCATED\n",
 				TABME,
 				blk,
-				radix
+				(long long)radix
 			);
 		}
 	} else {
@@ -352,10 +364,10 @@
 		int next_skip;
 
 		if (flags & SWIF_DUMP_TREE) {
-			printf("%*.*s(0x%06x,%d) Submap avail=%d big=%d {\n",
+			printf("%*.*s(0x%06x,%lld) Submap avail=%d big=%d {\n",
 				TABME,
 				blk,
-				radix,
+				(long long)radix,
 				(int)meta.u.bmu_avail,
 				meta.bm_bighint
 			);
@@ -366,10 +378,12 @@
 
 		for (i = 1; i <= skip; i += next_skip) {
 			int r;
-			daddr_t vcount = (count > radix) ? radix : count;
+			swblk_t vcount = (count > radix) ?
+					 (swblk_t)radix : count;
 
 			r = scanradix(
 			    &scan[i],
+			    ((next_skip == 1) ? &scan_array[i] : NULL),
 			    blk,
 			    radix,
 			    next_skip - 1,
@@ -384,7 +398,7 @@
 			);
 			if (r < 0)
 				break;
-			blk += radix;
+			blk += (swblk_t)radix;
 		}
 		if (flags & SWIF_DUMP_TREE) {
 			printf("%*.*s}\n", TABME);
@@ -410,26 +424,41 @@ getswapinfo_radix(kvm_t *kd, struct kvm_swap *swap_ary, int swap_max, int flags)
 	KGET2(swapblist, &blcopy, sizeof(blcopy), "*swapblist");
 
 	if (flags & SWIF_DUMP_TREE) {
-		printf("radix tree: %d/%d/%d blocks, %dK wired\n",
+		printf("radix tree: %d/%d/%lld blocks, %dK wired\n",
 			blcopy.bl_free,
 			blcopy.bl_blocks,
-			blcopy.bl_radix,
+			(long long)blcopy.bl_radix,
 			(int)((blcopy.bl_rootblks * sizeof(blmeta_t) + 1023)/
 			    1024)
 		);
 	}
-	scanradix(
-		blcopy.bl_root,
-		0,
-		blcopy.bl_radix,
-		blcopy.bl_skip,
-		blcopy.bl_rootblks,
-		kd,
-		dmmax,
-		nswdev,
-		swap_ary,
-		swap_max,
-		0,
-		flags
-	);
+
+	/*
+	 * XXX Scan the radix tree in the kernel if we have more than one
+	 *     swap device so we can get per-device statistics.  This can
+	 *     get nasty because swap devices are interleaved based on the
+	 *     maximum of (4), so the blist winds up not using any shortcuts.
+	 *
+	 *     Otherwise just pull the free count out of the blist header,
+	 *     which is a billion times faster.
+	 */
+	if ((flags & SWIF_DUMP_TREE) || unswdev > 1) {
+		scanradix(
+			blcopy.bl_root,
+			NULL,
+			0,
+			blcopy.bl_radix,
+			blcopy.bl_skip,
+			blcopy.bl_rootblks,
+			kd,
+			dmmax,
+			nswdev,
+			swap_ary,
+			swap_max,
+			0,
+			flags
+		);
+	} else {
+		swap_ary[0].ksw_used -= blcopy.bl_free;
+	}
 }
diff --git a/sys/cpu/amd64/include/param.h b/sys/cpu/amd64/include/param.h
index 1a5ed832f6..bc82c29826 100644
--- a/sys/cpu/amd64/include/param.h
+++ b/sys/cpu/amd64/include/param.h
@@ -146,9 +146,12 @@
 /*
  * Ceiling on amount of swblock kva space, can be changed via
  * kern.maxswzone /boot/loader.conf variable.
+ *
+ * Approximately size / 160 x 32 x PAGE_SIZE bytes of swap.  This
+ * comes to approximately 1GB of swap space per 1MB of kernel memory.
  */
 #ifndef VM_SWZONE_SIZE_MAX
-#define VM_SWZONE_SIZE_MAX	(32 * 1024 * 1024)
+#define VM_SWZONE_SIZE_MAX	(32L * 1024 * 1024)
 #endif
 
 /*
@@ -157,7 +160,7 @@
  * kern.maxbcache /boot/loader.conf variable.
  */
 #ifndef VM_BCACHE_SIZE_MAX
-#define VM_BCACHE_SIZE_MAX	(200 * 1024 * 1024)
+#define VM_BCACHE_SIZE_MAX	(200L * 1024 * 1024)
 #endif
 
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
index c0c1602af4..bd52f12aec 100644
--- a/sys/kern/kern_environment.c
+++ b/sys/kern/kern_environment.c
@@ -280,6 +280,21 @@ kgetenv_int(const char *name, int *data)
 	return (rval);
 }
 
+/*
+ * Return a long value from an environment variable.
+ */
+int
+kgetenv_long(const char *name, long *data)
+{
+	quad_t tmp;
+	int rval;
+
+	rval = kgetenv_quad(name, &tmp);
+	if (rval)
+		*data = (long)tmp;
+	return (rval);
+}
+
 /*
  * Return an unsigned long value from an environment variable.
  */
@@ -297,6 +312,9 @@ kgetenv_ulong(const char *name, unsigned long *data)
 
 /*
  * Return a quad_t value from an environment variable.
+ *
+ * A single character kmgtKMGT suffix multiplies the value
+ * by 1024, 1024*1024, etc.
  */
 int
 kgetenv_quad(const char *name, quad_t *data)
@@ -309,6 +327,28 @@ kgetenv_quad(const char *name, quad_t *data)
 		return(0);
 
 	iv = strtoq(value, &vtp, 0);
+	switch(*vtp) {
+	case 't':
+	case 'T':
+		iv <<= 10;
+		/* fall through */
+	case 'g':
+	case 'G':
+		iv <<= 10;
+		/* fall through */
+	case 'm':
+	case 'M':
+		iv <<= 10;
+		/* fall through */
+	case 'k':
+	case 'K':
+		iv <<= 10;
+		++vtp;
+		break;
+	default:
+		break;
+	}
 	if ((vtp == value) || (*vtp != '\0')) {
 		kfreeenv(value);
 		return(0);
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
index cb01775881..8a2895b9e4 100644
--- a/sys/kern/subr_blist.c
+++ b/sys/kern/subr_blist.c
@@ -79,7 +79,7 @@
  *	to cover the number of blocks requested at creation time even if it
  *	must be encompassed in larger root-node radix.
  *
- *	NOTE: the allocator cannot currently allocate more then
+ *	NOTE: The allocator cannot currently allocate more than
  *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
  *	large' if you try.  This is an area that could use improvement.  The
  *	radix is large enough that this restriction does not effect the swap
@@ -87,6 +87,10 @@
  *	system, though.  Currently only the allocation code is affected by
  *	this algorithmic unfeature.  The freeing code can handle arbitrary
  *	ranges.
  *
+ *	NOTE: The radix may exceed 32 bits in order to support up to 2^31
+ *	blocks.  The first division will drop the radix down and fit
+ *	it within a signed 32 bit integer.
+ *
  *	This code can be compiled stand-alone for debugging.
 *
 * $FreeBSD: src/sys/kern/subr_blist.c,v 1.5.2.2 2003/01/12 09:23:12 dillon Exp $
@@ -123,6 +127,8 @@
 
 #define kmalloc(a,b,c)	malloc(a)
 #define kfree(a,b)	free(a)
+#define kprintf		printf
+#define KKASSERT(exp)
 
 #include
@@ -136,17 +142,17 @@ void panic(const char *ctl, ...);
 
 static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count);
 static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blk,
-				swblk_t count, swblk_t radix, int skip);
+				swblk_t count, int64_t radix, int skip);
 static void blst_leaf_free(blmeta_t *scan, swblk_t relblk, int count);
 static void blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
-					swblk_t radix, int skip, swblk_t blk);
-static void blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
+					int64_t radix, int skip, swblk_t blk);
+static void blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
 				swblk_t skip, blist_t dest, swblk_t count);
-static swblk_t	blst_radix_init(blmeta_t *scan, swblk_t radix,
+static swblk_t	blst_radix_init(blmeta_t *scan, int64_t radix,
 				int skip, swblk_t count);
 #ifndef _KERNEL
 static void	blst_radix_print(blmeta_t *scan, swblk_t blk,
-				swblk_t radix, int skip, int tab);
+				int64_t radix, int skip, int tab);
 #endif
 
 #ifdef _KERNEL
@@ -167,17 +173,20 @@
 blist_t
 blist_create(swblk_t blocks)
 {
 	blist_t bl;
-	int radix;
+	int64_t radix;
 	int skip = 0;
 
 	/*
 	 * Calculate radix and skip field used for scanning.
+	 *
+	 * Radix can exceed 32 bits even if swblk_t is limited to 32 bits.
 	 */
 	radix = BLIST_BMAP_RADIX;
 	while (radix < blocks) {
 		radix *= BLIST_META_RADIX;
 		skip = (skip + 1) * BLIST_META_RADIX;
+		KKASSERT(skip > 0);
 	}
 
 	bl = kmalloc(sizeof(struct blist), M_SWAP, M_WAITOK);
@@ -391,7 +400,7 @@ blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count)
 
 static swblk_t
 blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
-		swblk_t radix, int skip)
+		int64_t radix, int skip)
 {
 	int i;
 	int next_skip = ((u_int)skip / BLIST_META_RADIX);
@@ -404,6 +413,9 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
 		return(SWAPBLK_NONE);
 	}
 
+	/*
+	 * note: radix may exceed 32 bits until first division.
+	 */
 	if (scan->u.bmu_avail == radix) {
 		radix /= BLIST_META_RADIX;
 
@@ -418,8 +430,8 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
 				scan[i].u.bmu_bitmap = (u_swblk_t)-1;
 				scan[i].bm_bighint = BLIST_BMAP_RADIX;
 			} else {
-				scan[i].bm_bighint = radix;
-				scan[i].u.bmu_avail = radix;
+				scan[i].bm_bighint = (swblk_t)radix;
+				scan[i].u.bmu_avail = (swblk_t)radix;
 			}
 		}
 	} else {
@@ -448,14 +460,14 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
 			 * Terminator
 			 */
 			break;
-		} else if (count > radix) {
+		} else if (count > (swblk_t)radix) {
 			/*
 			 * count does not fit in object even if it were
 			 * complete free.
 			 */
 			panic("blist_meta_alloc: allocation too large");
 		}
-		blk += radix;
+		blk += (swblk_t)radix;
 	}
 
 	/*
@@ -514,18 +526,21 @@ blst_leaf_free(blmeta_t *scan, swblk_t blk, int count)
 
 static void
 blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
-	       swblk_t radix, int skip, swblk_t blk)
+	       int64_t radix, int skip, swblk_t blk)
 {
 	int i;
 	int next_skip = ((u_int)skip / BLIST_META_RADIX);
 
 #if 0
-	kprintf("FREE (%x,%d) FROM (%x,%d)\n",
+	kprintf("FREE (%x,%d) FROM (%x,%lld)\n",
 	    freeBlk, count,
-	    blk, radix
+	    blk, (long long)radix
 	);
 #endif
 
+	/*
+	 * NOTE: radix may exceed 32 bits until first division.
+	 */
 	if (scan->u.bmu_avail == 0) {
 		/*
 		 * ALL-ALLOCATED special case, with possible
@@ -559,7 +574,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
 	if (scan->u.bmu_avail == radix)
 		return;
 	if (scan->u.bmu_avail > radix)
-		panic("blst_meta_free: freeing already free blocks (%d) %d/%d", count, scan->u.bmu_avail, radix);
+		panic("blst_meta_free: freeing already free blocks (%d) %d/%lld", count, scan->u.bmu_avail, (long long)radix);
 
 	/*
 	 * Break the free down into its components
@@ -567,14 +582,14 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
 
 	radix /= BLIST_META_RADIX;
 
-	i = (freeBlk - blk) / radix;
-	blk += i * radix;
+	i = (freeBlk - blk) / (swblk_t)radix;
+	blk += i * (swblk_t)radix;
 	i = i * next_skip + 1;
 
 	while (i <= skip && blk < freeBlk + count) {
 		swblk_t v;
 
-		v = blk + radix - freeBlk;
+		v = blk + (swblk_t)radix - freeBlk;
 		if (v > count)
 			v = count;
 
@@ -590,7 +605,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
 			scan->bm_bighint = scan[i].bm_bighint;
 		count -= v;
 		freeBlk += v;
-		blk += radix;
+		blk += (swblk_t)radix;
 		i += next_skip;
 	}
 }
@@ -603,7 +618,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
  */
 
 static void
-blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
+blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
 	  swblk_t skip, blist_t dest, swblk_t count)
 {
 	int next_skip;
@@ -646,7 +661,7 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
 		if (count < radix)
 			blist_free(dest, blk, count);
 		else
-			blist_free(dest, blk, radix);
+			blist_free(dest, blk, (swblk_t)radix);
 		return;
 	}
 
@@ -658,16 +673,16 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
 		if (scan[i].bm_bighint == (swblk_t)-1)
 			break;
 
-		if (count >= radix) {
+		if (count >= (swblk_t)radix) {
 			blst_copy(
 			    &scan[i],
 			    blk,
 			    radix,
 			    next_skip - 1,
 			    dest,
-			    radix
+			    (swblk_t)radix
 			);
-			count -= radix;
+			count -= (swblk_t)radix;
 		} else {
 			if (count) {
 				blst_copy(
@@ -681,7 +696,7 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
 				}
 				count = 0;
 			}
-			blk += radix;
+			blk += (swblk_t)radix;
 		}
 	}
@@ -695,7 +710,7 @@
  */
 
 static swblk_t
-blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
+blst_radix_init(blmeta_t *scan, int64_t radix, int skip, swblk_t count)
 {
 	int i;
 	int next_skip;
@@ -728,7 +743,7 @@ blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
 	next_skip = ((u_int)skip / BLIST_META_RADIX);
 
 	for (i = 1; i <= skip; i += next_skip) {
-		if (count >= radix) {
+		if (count >= (swblk_t)radix) {
 			/*
 			 * Allocate the entire object
 			 */
 			memindex = i + blst_radix_init(
 			    ((scan) ? &scan[i] : NULL),
 			    radix,
 			    next_skip - 1,
-			    radix
+			    (swblk_t)radix
 			);
-			count -= radix;
+			count -= (swblk_t)radix;
 		} else if (count > 0) {
 			/*
 			 * Allocate a partial object
 			 */
@@ -767,7 +782,7 @@ blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
 
 #ifdef BLIST_DEBUG
 
 static void
-blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
+blst_radix_print(blmeta_t *scan, swblk_t blk, int64_t radix, int skip, int tab)
 {
 	int i;
 	int next_skip;
@@ -775,9 +790,9 @@ blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
 
 	if (radix == BLIST_BMAP_RADIX) {
 		kprintf(
-		    "%*.*s(%04x,%d): bitmap %08x big=%d\n",
+		    "%*.*s(%04x,%lld): bitmap %08x big=%d\n",
 		    tab, tab, "",
-		    blk, radix,
+		    blk, (long long)radix,
 		    scan->u.bmu_bitmap,
 		    scan->bm_bighint
 		);
@@ -786,29 +801,29 @@
 	if (scan->u.bmu_avail == 0) {
 		kprintf(
-		    "%*.*s(%04x,%d) ALL ALLOCATED\n",
+		    "%*.*s(%04x,%lld) ALL ALLOCATED\n",
 		    tab, tab, "",
 		    blk,
-		    radix
+		    (long long)radix
 		);
 		return;
 	}
 	if (scan->u.bmu_avail == radix) {
 		kprintf(
-		    "%*.*s(%04x,%d) ALL FREE\n",
+		    "%*.*s(%04x,%lld) ALL FREE\n",
 		    tab, tab, "",
 		    blk,
-		    radix
+		    (long long)radix
 		);
 		return;
 	}
 
 	kprintf(
-	    "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+	    "%*.*s(%04x,%lld): subtree (%d/%lld) big=%d {\n",
 	    tab, tab, "",
-	    blk, radix,
+	    blk, (long long)radix,
 	    scan->u.bmu_avail,
-	    radix,
+	    (long long)radix,
 	    scan->bm_bighint
 	);
 
@@ -819,9 +834,9 @@
 	for (i = 1; i <= skip; i += next_skip) {
 		if (scan[i].bm_bighint == (swblk_t)-1) {
 			kprintf(
-			    "%*.*s(%04x,%d): Terminator\n",
+			    "%*.*s(%04x,%lld): Terminator\n",
 			    tab, tab, "",
-			    blk, radix
+			    blk, (long long)radix
 			);
 			lastState = 0;
 			break;
@@ -833,7 +848,7 @@
 		    next_skip - 1,
 		    tab
 		);
-		blk += radix;
+		blk += (swblk_t)radix;
 	}
 	tab -= 4;
@@ -873,7 +888,8 @@ main(int ac, char **av)
 		swblk_t count = 0;
 
-		kprintf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+		kprintf("%d/%d/%lld> ",
+			bl->bl_free, size, (long long)bl->bl_radix);
 		fflush(stdout);
 		if (fgets(buf, sizeof(buf), stdin) == NULL)
 			break;
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 5b693e8ea4..2497fa5bec 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -85,8 +85,8 @@ int	ncallout;		/* maximum # of timer events */
 int	mbuf_wait = 32;		/* mbuf sleep time in ticks */
 int	nbuf;
 int	nswbuf;
-int	maxswzone;		/* max swmeta KVA storage */
-int	maxbcache;		/* max buffer cache KVA storage */
+long	maxswzone;		/* max swmeta KVA storage */
+long	maxbcache;		/* max buffer cache KVA storage */
 u_quad_t	maxtsiz;	/* max text size */
 u_quad_t	dfldsiz;	/* initial data size limit */
 u_quad_t	maxdsiz;	/* max data size */
@@ -121,11 +121,11 @@ init_param1(void)
 #ifdef VM_SWZONE_SIZE_MAX
 	maxswzone = VM_SWZONE_SIZE_MAX;
 #endif
-	TUNABLE_INT_FETCH("kern.maxswzone", &maxswzone);
+	TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone);
 #ifdef VM_BCACHE_SIZE_MAX
 	maxbcache = VM_BCACHE_SIZE_MAX;
 #endif
-	TUNABLE_INT_FETCH("kern.maxbcache", &maxbcache);
+	TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache);
 	maxtsiz = MAXTSIZ;
 	TUNABLE_QUAD_FETCH("kern.maxtsiz", &maxtsiz);
 	dfldsiz = DFLDSIZ;
diff --git a/sys/sys/blist.h b/sys/sys/blist.h
index b1a9f2e0d4..a3c3ae9760 100644
--- a/sys/sys/blist.h
+++ b/sys/sys/blist.h
@@ -81,7 +81,7 @@ typedef struct blmeta {
 
 typedef struct blist {
 	swblk_t		bl_blocks;	/* area of coverage */
-	swblk_t		bl_radix;	/* coverage radix */
+	int64_t		bl_radix;	/* coverage radix */
 	swblk_t		bl_skip;	/* starting skip */
 	swblk_t		bl_free;	/* number of free blocks */
 	blmeta_t	*bl_root;	/* root of radix tree */
@@ -91,6 +91,18 @@ typedef struct blist {
 #define BLIST_META_RADIX	16
 #define BLIST_BMAP_RADIX	(sizeof(u_swblk_t)*8)
 
+/*
+ * The radix can be up to BLIST_BMAP_RADIX x the largest skip,
+ * based on the initial skip calculation in blist_create().
+ *
+ * The radix will exceed the size of a 32 bit signed (or unsigned) int
+ * when the maximal number of blocks is allocated.  This corresponds
+ * to ~1G x PAGE_SIZE = 4096GB.  The swap code usually divides this
+ * by 4, leaving us with a capability of up to four 1TB swap devices.
+ */
+#define BLIST_MAXBLKS		(0x40000000 /			\
+				 (BLIST_BMAP_RADIX / BLIST_META_RADIX))
+
 #define BLIST_MAX_ALLOC		BLIST_BMAP_RADIX
 
 extern blist_t blist_create(swblk_t blocks);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 1a6668d3a7..64cf191b94 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -365,8 +365,8 @@ struct cluster_save {
 #ifdef _KERNEL
 
 extern int	nbuf;			/* The number of buffer headers */
-extern int	maxswzone;		/* Max KVA for swap structures */
-extern int	maxbcache;		/* Max KVA for buffer cache */
+extern long	maxswzone;		/* Max KVA for swap structures */
+extern long	maxbcache;		/* Max KVA for buffer cache */
 extern int	runningbufspace;
 extern int	runningbufcount;
 extern int	hidirtybufspace;
diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h
index 0168bbc7e2..3eb54b5a73 100644
--- a/sys/sys/kernel.h
+++ b/sys/sys/kernel.h
@@ -308,6 +308,7 @@ struct tunable_int {
 	tunable_int_init, &__tunable_int_ ## line)
 
 #define	TUNABLE_INT_FETCH(path, var)	kgetenv_int((path), (var))
+#define	TUNABLE_LONG_FETCH(path, var)	kgetenv_long((path), (var))
 
 /* Backwards compatibility with the old deprecated TUNABLE_INT_DECL API */
 #define TUNABLE_INT_DECL(path, defval, var)	\
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index d1df7691be..977e36dfd9 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -253,6 +253,7 @@ int	kgetenv_int (const char *name, int *data);
 int	kgetenv_string (const char *name, char *data, int size);
 int	kgetenv_ulong(const char *name, unsigned long *data);
 int	kgetenv_quad (const char *name, quad_t *data);
+int	kgetenv_long(const char *name, long *data);
 extern char *kern_envp;
 
 #ifdef APM_FIXUP_CALLTODO
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 1b9eabcc22..fde1d25400 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -333,13 +333,15 @@ swap_pager_swap_init(void)
 	nsw_wcount_async_max = nsw_wcount_async;
 
 	/*
-	 * Initialize our zone.  Right now I'm just guessing on the number
-	 * we need based on the number of pages in the system.  Each swblock
-	 * can hold 16 pages, so this is probably overkill.  This reservation
-	 * is typically limited to around 32MB by default.
+	 * The zone is dynamically allocated so generally size it to
+	 * maxswzone (32MB to 512MB of KVM).  Set a minimum size based
+	 * on physical memory of around 8x (each swblock can hold 16 pages).
+	 *
+	 * With the advent of SSDs (vs HDs) the practical (swap:memory)
+	 * ratio has increased dramatically.
	 */
 	n = vmstats.v_page_count / 2;
-	if (maxswzone && n > maxswzone / sizeof(struct swblock))
+	if (maxswzone && n < maxswzone / sizeof(struct swblock))
 		n = maxswzone / sizeof(struct swblock);
 	n2 = n;
 
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index ac6fb0cdd8..dd30a4d0d8 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -67,7 +67,7 @@
 #endif
 
 static struct swdevt should_be_malloced[NSWAPDEV];
 struct swdevt *swdevt = should_be_malloced;	/* exported to pstat/systat */
-static int nswap;		/* first block after the interleaved devs */
+static swblk_t nswap;		/* first block after the interleaved devs */
 int nswdev = NSWAPDEV;				/* exported to pstat/systat */
 int vm_swap_size;
 
@@ -227,9 +227,9 @@ sys_swapon(struct swapon_args *uap)
  *	XXX locking when multiple swapon's run in parallel
  */
 int
-swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
+swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks)
 {
-	u_long aligned_nblks;
+	swblk_t aligned_nblks;
 	int64_t dpsize;
 	struct ucred *cred;
 	struct swdevt *sp;
@@ -282,47 +282,50 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 			VOP_CLOSE(vp, FREAD | FWRITE);
 			return (ENXIO);
 		}
-		if ((u_int64_t)dpsize < 0x100000000ULL)
-			nblks = (u_long)dpsize;
-		else
-			nblks = 0xffffffffU;
+		nblks = (u_quad_t)dpsize;
 	}
 	if (nblks == 0) {
 		VOP_CLOSE(vp, FREAD | FWRITE);
 		return (ENXIO);
 	}
 
-	/*
-	 * If we go beyond this, we get overflows in the radix
-	 * tree bitmap code.
-	 */
-	if (nblks > 0x40000000 / BLIST_META_RADIX / nswdev) {
-		kprintf("exceeded maximum of %d blocks per swap unit\n",
-			0x40000000 / BLIST_META_RADIX / nswdev);
-		VOP_CLOSE(vp, FREAD | FWRITE);
-		return (ENXIO);
-	}
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 *
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
-	nblks &= ~(ctodb(1) - 1);
+	nblks &= ~(u_quad_t)(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
+	/*
+	 * Post-conversion nblks must not be >= BLIST_MAXBLKS, and
+	 * we impose a 4-swap-device limit, so we have to divide it out
+	 * further.  Going beyond this will result in overflows in the
+	 * blist code.
+	 *
+	 * Post-conversion nblks must fit within a (swblk_t), which
+	 * this test also ensures.
+	 */
+	if (nblks > BLIST_MAXBLKS / nswdev) {
+		kprintf("exceeded maximum of %d blocks per swap unit\n",
+			(int)BLIST_MAXBLKS / nswdev);
+		VOP_CLOSE(vp, FREAD | FWRITE);
+		return (ENXIO);
+	}
+
 	sp->sw_vp = vp;
 	sp->sw_dev = dev2udev(dev);
 	sp->sw_device = dev;
 	sp->sw_flags |= SW_FREED;
-	sp->sw_nblks = nblks;
+	sp->sw_nblks = (swblk_t)nblks;
 
 	/*
 	 * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
 	 * DEV_BSIZE'd.  aligned_nblks is used to calculate the
 	 * size of the swap bitmap, taking into account the stripe size.
 	 */
-	aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);
+	aligned_nblks = (swblk_t)((nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1));
 
 	if (aligned_nblks * nswdev > nswap)
 		nswap = aligned_nblks * nswdev;
-- 
2.41.0
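
Postscript, as a sanity check on the radix overflow described in the first
point of the commit message: the standalone sketch below (not part of the
patch) runs the radix/skip sizing loop from blist_create(), assuming the
32-bit u_swblk_t bitmap leaf this code uses (BLIST_BMAP_RADIX = 32,
BLIST_META_RADIX = 16).  For a block count near 2^31 the computed radix
reaches 2^33, which is why bl_radix and the radix parameters move to int64_t
while swblk_t itself stays 32 bits.

    #include <stdio.h>
    #include <stdint.h>

    #define BLIST_BMAP_RADIX        32      /* bits in a 32-bit u_swblk_t bitmap */
    #define BLIST_META_RADIX        16

    int
    main(void)
    {
            int64_t radix = BLIST_BMAP_RADIX;
            int64_t blocks = 0x7fffffffLL;  /* ~2 billion blocks */
            int skip = 0;

            /* Same sizing loop as blist_create() in subr_blist.c. */
            while (radix < blocks) {
                    radix *= BLIST_META_RADIX;
                    skip = (skip + 1) * BLIST_META_RADIX;
            }
            /* radix is now 8589934592 (2^33): too big for a 32-bit swblk_t. */
            printf("blocks=%lld radix=%lld skip=%d\n",
                (long long)blocks, (long long)radix, skip);
            return (0);
    }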