swap, amd64 - increase maximum swap space to 1TB x 4
authorMatthew Dillon <dillon@apollo.backplane.com>
Wed, 12 Aug 2009 16:52:29 +0000 (09:52 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Wed, 12 Aug 2009 16:52:29 +0000 (09:52 -0700)
* The radix can overflow a 32 bit integer even if swblk_t fits in 32 bits.
  Expand the radix to 64 bits and thus allow the subr_blist code to operate
  up to 2 billion blocks (8TB total).

* Shortcut the common single-swap-device case.  We do not have to scan
  the radix tree to get available space in the single-device case.

* Change maxswzone and maxbcache to longs and add TUNABLE_LONG_FETCH().

* All the TUNABLE_*_FETCH() calls and kgetenv_*() calls for integers
  call kgetenv_quad().

  Adjust kgetenv_quad() to accept a suffix for kilobytes, megabytes,
  gigabytes, and terabytes.

lib/libkvm/kvm_getswapinfo.c
sys/cpu/amd64/include/param.h
sys/kern/kern_environment.c
sys/kern/subr_blist.c
sys/kern/subr_param.c
sys/sys/blist.h
sys/sys/buf.h
sys/sys/kernel.h
sys/sys/systm.h
sys/vm/swap_pager.c
sys/vm/vm_swap.c

index 7a9bc3c..66c9259 100644 (file)
@@ -234,10 +234,11 @@ kvm_getswapinfo(
 static int
 scanradix(
        blmeta_t *scan, 
-       daddr_t blk,
-       daddr_t radix,
-       daddr_t skip, 
-       daddr_t count,
+       blmeta_t *scan_cache,
+       swblk_t blk,
+       int64_t radix,
+       swblk_t skip,
+       swblk_t count,
        kvm_t *kd,
        int dmmax, 
        int nswdev,
@@ -247,19 +248,30 @@ scanradix(
        int flags
 ) {
        blmeta_t meta;
+       blmeta_t scan_array[BLIST_BMAP_RADIX];
        int ti = (unswdev >= swap_max) ? swap_max - 1 : unswdev;
 
-       KGET2(scan, &meta, sizeof(meta), "blmeta_t");
+       if (scan_cache) {
+               meta = *scan_cache;
+       } else if (skip == BLIST_META_RADIX) {
+               if (kvm_read(kd, (u_long)scan, scan_array, sizeof(scan_array)) != sizeof(scan_array)) {
+                       warnx("cannot read %s: %s", "blmeta_t", kvm_geterr(kd));
+                       bzero(scan_array, sizeof(scan_array));
+               }
+               meta = scan_array[0];
+       } else {
+               KGET2(scan, &meta, sizeof(meta), "blmeta_t");
+       }
 
        /*
         * Terminator
         */
-       if (meta.bm_bighint == (daddr_t)-1) {
+       if (meta.bm_bighint == (swblk_t)-1) {
                if (flags & SWIF_DUMP_TREE) {
-                       printf("%*.*s(0x%06x,%d) Terminator\n", 
+                       printf("%*.*s(0x%06x,%lld) Terminator\n",
                            TABME,
                            blk, 
-                           radix
+                           (long long)radix
                        );
                }
                return(-1);
@@ -272,10 +284,10 @@ scanradix(
                int i;
 
                if (flags & SWIF_DUMP_TREE) {
-                       printf("%*.*s(0x%06x,%d) Bitmap %08x big=%d\n", 
+                       printf("%*.*s(0x%06x,%lld) Bitmap %08x big=%d\n",
                            TABME,
                            blk, 
-                           radix,
+                           (long long)radix,
                            (int)meta.u.bmu_bitmap,
                            meta.bm_bighint
                        );
@@ -306,10 +318,10 @@ scanradix(
                 * Meta node if all free
                 */
                if (flags & SWIF_DUMP_TREE) {
-                       printf("%*.*s(0x%06x,%d) Submap ALL-FREE {\n", 
+                       printf("%*.*s(0x%06x,%lld) Submap ALL-FREE {\n",
                            TABME,
                            blk, 
-                           radix
+                           (long long)radix
                        );
                }
                /*
@@ -338,10 +350,10 @@ scanradix(
                 * Meta node if all used
                 */
                if (flags & SWIF_DUMP_TREE) {
-                       printf("%*.*s(0x%06x,%d) Submap ALL-ALLOCATED\n", 
+                       printf("%*.*s(0x%06x,%lld) Submap ALL-ALLOCATED\n",
                            TABME,
                            blk, 
-                           radix
+                           (long long)radix
                        );
                }
        } else {
@@ -352,10 +364,10 @@ scanradix(
                int next_skip;
 
                if (flags & SWIF_DUMP_TREE) {
-                       printf("%*.*s(0x%06x,%d) Submap avail=%d big=%d {\n", 
+                       printf("%*.*s(0x%06x,%lld) Submap avail=%d big=%d {\n",
                            TABME,
                            blk, 
-                           radix,
+                           (long long)radix,
                            (int)meta.u.bmu_avail,
                            meta.bm_bighint
                        );
@@ -366,10 +378,12 @@ scanradix(
 
                for (i = 1; i <= skip; i += next_skip) {
                        int r;
-                       daddr_t vcount = (count > radix) ? radix : count;
+                       swblk_t vcount = (count > radix) ?
+                                       (swblk_t)radix : count;
 
                        r = scanradix(
                            &scan[i],
+                           ((next_skip == 1) ? &scan_array[i] : NULL),
                            blk,
                            radix,
                            next_skip - 1,
@@ -384,7 +398,7 @@ scanradix(
                        );
                        if (r < 0)
                                break;
-                       blk += radix;
+                       blk += (swblk_t)radix;
                }
                if (flags & SWIF_DUMP_TREE) {
                        printf("%*.*s}\n", TABME);
@@ -410,26 +424,41 @@ getswapinfo_radix(kvm_t *kd, struct kvm_swap *swap_ary, int swap_max, int flags)
        KGET2(swapblist, &blcopy, sizeof(blcopy), "*swapblist");
 
        if (flags & SWIF_DUMP_TREE) {
-               printf("radix tree: %d/%d/%d blocks, %dK wired\n",
+               printf("radix tree: %d/%d/%lld blocks, %dK wired\n",
                        blcopy.bl_free,
                        blcopy.bl_blocks,
-                       blcopy.bl_radix,
+                       (long long)blcopy.bl_radix,
                        (int)((blcopy.bl_rootblks * sizeof(blmeta_t) + 1023)/
                            1024)
                );
        }
-       scanradix(
-           blcopy.bl_root, 
-           0, 
-           blcopy.bl_radix, 
-           blcopy.bl_skip, 
-           blcopy.bl_rootblks, 
-           kd,
-           dmmax,
-           nswdev, 
-           swap_ary,
-           swap_max,
-           0,
-           flags
-       );
+
+       /*
+        * XXX Scan the radix tree in the kernel if we have more than one
+        *     swap device so we can get per-device statistics.  This can
+        *     get nasty because swap devices are interleaved based on the
+        *     maximum of (4), so the blist winds up not using any shortcuts.
+        *
+        *     Otherwise just pull the free count out of the blist header,
+        *     which is a billion times faster.
+        */
+       if ((flags & SWIF_DUMP_TREE) || unswdev > 1) {
+               scanradix(
+                   blcopy.bl_root,
+                   NULL,
+                   0,
+                   blcopy.bl_radix,
+                   blcopy.bl_skip,
+                   blcopy.bl_rootblks,
+                   kd,
+                   dmmax,
+                   nswdev,
+                   swap_ary,
+                   swap_max,
+                   0,
+                   flags
+               );
+       } else {
+               swap_ary[0].ksw_used -= blcopy.bl_free;
+       }
 }
index 1a5ed83..bc82c29 100644 (file)
 /*
  * Ceiling on amount of swblock kva space, can be changed via
  * kern.maxswzone /boot/loader.conf variable.
+ *
+ * Approximately size / 160 x 32 x PAGE_SIZE bytes of swap.  This
+ * comes to approximately 1GB of swap space per 1MB of kernel memory.
  */
 #ifndef VM_SWZONE_SIZE_MAX
-#define VM_SWZONE_SIZE_MAX     (32 * 1024 * 1024)
+#define VM_SWZONE_SIZE_MAX     (32L * 1024 * 1024)
 #endif
 
 /*
  * kern.maxbcache /boot/loader.conf variable.
  */
 #ifndef VM_BCACHE_SIZE_MAX
-#define VM_BCACHE_SIZE_MAX     (200 * 1024 * 1024)
+#define VM_BCACHE_SIZE_MAX     (200L * 1024 * 1024)
 #endif
 
 
index c0c1602..bd52f12 100644 (file)
@@ -280,6 +280,21 @@ kgetenv_int(const char *name, int *data)
        return (rval);
 }
 
+/*
+ * Return a long value from an environment variable.
+ */
+int
+kgetenv_long(const char *name, long *data)
+{
+       quad_t tmp;
+       int rval;
+
+       rval = kgetenv_quad(name, &tmp);
+       if (rval)
+               *data = (long)tmp;
+       return (rval);
+}
+
 /*
  * Return an unsigned long value from an environment variable.
  */
@@ -297,6 +312,9 @@ kgetenv_ulong(const char *name, unsigned long *data)
 
 /*
  * Return a quad_t value from an environment variable.
+ *
+ * A single character kmgtKMGT extension multiplies the value
+ * by 1024, 1024*1024, etc.
  */
 int
 kgetenv_quad(const char *name, quad_t *data)
@@ -309,6 +327,28 @@ kgetenv_quad(const char *name, quad_t *data)
                return(0);
 
        iv = strtoq(value, &vtp, 0);
+       switch(*vtp) {
+       case 't':
+       case 'T':
+               iv <<= 10;
+               /* fall through */
+       case 'g':
+       case 'G':
+               iv <<= 10;
+               /* fall through */
+       case 'm':
+       case 'M':
+               iv <<= 10;
+               /* fall through */
+       case 'k':
+       case 'K':
+               iv <<= 10;
+               ++vtp;
+               break;
+       default:
+               break;
+       }
+
        if ((vtp == value) || (*vtp != '\0')) {
                kfreeenv(value);
                return(0);
index cb01775..8a2895b 100644 (file)
@@ -79,7 +79,7 @@
  *     to cover the number of blocks requested at creation time even if it
  *     must be encompassed in larger root-node radix.
  *
- *     NOTE: the allocator cannot currently allocate more then 
+ *     NOTE: The allocator cannot currently allocate more than
  *     BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too 
  *     large' if you try.  This is an area that could use improvement.  The 
  *     radix is large enough that this restriction does not effect the swap 
  *     this algorithmic unfeature.  The freeing code can handle arbitrary
  *     ranges.
  *
+ *     NOTE: The radix may exceed 32 bits in order to support up to 2^31
+ *           blocks.  The first division will drop the radix down and fit
+ *           it within a signed 32 bit integer.
+ *
  *     This code can be compiled stand-alone for debugging.
  *
  * $FreeBSD: src/sys/kern/subr_blist.c,v 1.5.2.2 2003/01/12 09:23:12 dillon Exp $
 
 #define kmalloc(a,b,c) malloc(a)
 #define kfree(a,b)     free(a)
+#define kprintf                printf
+#define KKASSERT(exp)
 
 #include <sys/blist.h>
 
@@ -136,17 +142,17 @@ void panic(const char *ctl, ...);
 
 static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count);
 static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blk, 
-                               swblk_t count, swblk_t radix, int skip);
+                               swblk_t count, int64_t radix, int skip);
 static void blst_leaf_free(blmeta_t *scan, swblk_t relblk, int count);
 static void blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count, 
-                                       swblk_t radix, int skip, swblk_t blk);
-static void blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix, 
+                                       int64_t radix, int skip, swblk_t blk);
+static void blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
                                swblk_t skip, blist_t dest, swblk_t count);
-static swblk_t blst_radix_init(blmeta_t *scan, swblk_t radix, 
+static swblk_t blst_radix_init(blmeta_t *scan, int64_t radix,
                                                int skip, swblk_t count);
 #ifndef _KERNEL
 static void    blst_radix_print(blmeta_t *scan, swblk_t blk, 
-                                       swblk_t radix, int skip, int tab);
+                                       int64_t radix, int skip, int tab);
 #endif
 
 #ifdef _KERNEL
@@ -167,17 +173,20 @@ blist_t
 blist_create(swblk_t blocks)
 {
        blist_t bl;
-       int radix;
+       int64_t radix;
        int skip = 0;
 
        /*
         * Calculate radix and skip field used for scanning.
+        *
+        * Radix can exceed 32 bits even if swblk_t is limited to 32 bits.
         */
        radix = BLIST_BMAP_RADIX;
 
        while (radix < blocks) {
                radix *= BLIST_META_RADIX;
                skip = (skip + 1) * BLIST_META_RADIX;
+               KKASSERT(skip > 0);
        }
 
        bl = kmalloc(sizeof(struct blist), M_SWAP, M_WAITOK);
@@ -391,7 +400,7 @@ blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count)
 
 static swblk_t
 blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
-               swblk_t radix, int skip)
+               int64_t radix, int skip)
 {
        int i;
        int next_skip = ((u_int)skip / BLIST_META_RADIX);
@@ -404,6 +413,9 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
                return(SWAPBLK_NONE);
        }
 
+       /*
+        * note: radix may exceed 32 bits until first division.
+        */
        if (scan->u.bmu_avail == radix) {
                radix /= BLIST_META_RADIX;
 
@@ -418,8 +430,8 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
                                scan[i].u.bmu_bitmap = (u_swblk_t)-1;
                                scan[i].bm_bighint = BLIST_BMAP_RADIX;
                        } else {
-                               scan[i].bm_bighint = radix;
-                               scan[i].u.bmu_avail = radix;
+                               scan[i].bm_bighint = (swblk_t)radix;
+                               scan[i].u.bmu_avail = (swblk_t)radix;
                        }
                }
        } else {
@@ -448,14 +460,14 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
                         * Terminator
                         */
                        break;
-               } else if (count > radix) {
+               } else if (count > (swblk_t)radix) {
                        /*
                         * count does not fit in object even if it were
                         * complete free.
                         */
                        panic("blist_meta_alloc: allocation too large");
                }
-               blk += radix;
+               blk += (swblk_t)radix;
        }
 
        /*
@@ -514,18 +526,21 @@ blst_leaf_free(blmeta_t *scan, swblk_t blk, int count)
 
 static void 
 blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
-              swblk_t radix, int skip, swblk_t blk)
+              int64_t radix, int skip, swblk_t blk)
 {
        int i;
        int next_skip = ((u_int)skip / BLIST_META_RADIX);
 
 #if 0
-       kprintf("FREE (%x,%d) FROM (%x,%d)\n",
+       kprintf("FREE (%x,%d) FROM (%x,%lld)\n",
            freeBlk, count,
-           blk, radix
+           blk, (long long)radix
        );
 #endif
 
+       /*
+        * NOTE: radix may exceed 32 bits until first division.
+        */
        if (scan->u.bmu_avail == 0) {
                /*
                 * ALL-ALLOCATED special case, with possible
@@ -559,7 +574,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
        if (scan->u.bmu_avail == radix)
                return;
        if (scan->u.bmu_avail > radix)
-               panic("blst_meta_free: freeing already free blocks (%d) %d/%d", count, scan->u.bmu_avail, radix);
+               panic("blst_meta_free: freeing already free blocks (%d) %d/%lld", count, scan->u.bmu_avail, (long long)radix);
 
        /*
         * Break the free down into its components
@@ -567,14 +582,14 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
 
        radix /= BLIST_META_RADIX;
 
-       i = (freeBlk - blk) / radix;
-       blk += i * radix;
+       i = (freeBlk - blk) / (swblk_t)radix;
+       blk += i * (swblk_t)radix;
        i = i * next_skip + 1;
 
        while (i <= skip && blk < freeBlk + count) {
                swblk_t v;
 
-               v = blk + radix - freeBlk;
+               v = blk + (swblk_t)radix - freeBlk;
                if (v > count)
                        v = count;
 
@@ -590,7 +605,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
                    scan->bm_bighint = scan[i].bm_bighint;
                count -= v;
                freeBlk += v;
-               blk += radix;
+               blk += (swblk_t)radix;
                i += next_skip;
        }
 }
@@ -603,7 +618,7 @@ blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count,
  */
 
 static void
-blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix, 
+blst_copy(blmeta_t *scan, swblk_t blk, int64_t radix,
          swblk_t skip, blist_t dest, swblk_t count) 
 {
        int next_skip;
@@ -646,7 +661,7 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
                if (count < radix)
                        blist_free(dest, blk, count);
                else
-                       blist_free(dest, blk, radix);
+                       blist_free(dest, blk, (swblk_t)radix);
                return;
        }
 
@@ -658,16 +673,16 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
                if (scan[i].bm_bighint == (swblk_t)-1)
                        break;
 
-               if (count >= radix) {
+               if (count >= (swblk_t)radix) {
                        blst_copy(
                            &scan[i],
                            blk,
                            radix,
                            next_skip - 1,
                            dest,
-                           radix
+                           (swblk_t)radix
                        );
-                       count -= radix;
+                       count -= (swblk_t)radix;
                } else {
                        if (count) {
                                blst_copy(
@@ -681,7 +696,7 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
                        }
                        count = 0;
                }
-               blk += radix;
+               blk += (swblk_t)radix;
        }
 }
 
@@ -695,7 +710,7 @@ blst_copy(blmeta_t *scan, swblk_t blk, swblk_t radix,
  */
 
 static swblk_t 
-blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
+blst_radix_init(blmeta_t *scan, int64_t radix, int skip, swblk_t count)
 {
        int i;
        int next_skip;
@@ -728,7 +743,7 @@ blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
        next_skip = ((u_int)skip / BLIST_META_RADIX);
 
        for (i = 1; i <= skip; i += next_skip) {
-               if (count >= radix) {
+               if (count >= (swblk_t)radix) {
                        /*
                         * Allocate the entire object
                         */
@@ -736,9 +751,9 @@ blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
                            ((scan) ? &scan[i] : NULL),
                            radix,
                            next_skip - 1,
-                           radix
+                           (swblk_t)radix
                        );
-                       count -= radix;
+                       count -= (swblk_t)radix;
                } else if (count > 0) {
                        /*
                         * Allocate a partial object
@@ -767,7 +782,7 @@ blst_radix_init(blmeta_t *scan, swblk_t radix, int skip, swblk_t count)
 #ifdef BLIST_DEBUG
 
 static void    
-blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
+blst_radix_print(blmeta_t *scan, swblk_t blk, int64_t radix, int skip, int tab)
 {
        int i;
        int next_skip;
@@ -775,9 +790,9 @@ blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
 
        if (radix == BLIST_BMAP_RADIX) {
                kprintf(
-                   "%*.*s(%04x,%d): bitmap %08x big=%d\n", 
+                   "%*.*s(%04x,%lld): bitmap %08x big=%d\n",
                    tab, tab, "",
-                   blk, radix,
+                   blk, (long long)radix,
                    scan->u.bmu_bitmap,
                    scan->bm_bighint
                );
@@ -786,29 +801,29 @@ blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
 
        if (scan->u.bmu_avail == 0) {
                kprintf(
-                   "%*.*s(%04x,%d) ALL ALLOCATED\n",
+                   "%*.*s(%04x,%lld) ALL ALLOCATED\n",
                    tab, tab, "",
                    blk,
-                   radix
+                   (long long)radix
                );
                return;
        }
        if (scan->u.bmu_avail == radix) {
                kprintf(
-                   "%*.*s(%04x,%d) ALL FREE\n",
+                   "%*.*s(%04x,%lld) ALL FREE\n",
                    tab, tab, "",
                    blk,
-                   radix
+                   (long long)radix
                );
                return;
        }
 
        kprintf(
-           "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+           "%*.*s(%04x,%lld): subtree (%d/%lld) big=%d {\n",
            tab, tab, "",
-           blk, radix,
+           blk, (long long)radix,
            scan->u.bmu_avail,
-           radix,
+           (long long)radix,
            scan->bm_bighint
        );
 
@@ -819,9 +834,9 @@ blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
        for (i = 1; i <= skip; i += next_skip) {
                if (scan[i].bm_bighint == (swblk_t)-1) {
                        kprintf(
-                           "%*.*s(%04x,%d): Terminator\n",
+                           "%*.*s(%04x,%lld): Terminator\n",
                            tab, tab, "",
-                           blk, radix
+                           blk, (long long)radix
                        );
                        lastState = 0;
                        break;
@@ -833,7 +848,7 @@ blst_radix_print(blmeta_t *scan, swblk_t blk, swblk_t radix, int skip, int tab)
                    next_skip - 1,
                    tab
                );
-               blk += radix;
+               blk += (swblk_t)radix;
        }
        tab -= 4;
 
@@ -873,7 +888,8 @@ main(int ac, char **av)
                swblk_t count = 0;
 
 
-               kprintf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+               kprintf("%d/%d/%lld> ",
+                       bl->bl_free, size, (long long)bl->bl_radix);
                fflush(stdout);
                if (fgets(buf, sizeof(buf), stdin) == NULL)
                        break;
index 5b693e8..2497fa5 100644 (file)
@@ -85,8 +85,8 @@ int   ncallout;                       /* maximum # of timer events */
 int    mbuf_wait = 32;                 /* mbuf sleep time in ticks */
 int    nbuf;
 int    nswbuf;
-int    maxswzone;                      /* max swmeta KVA storage */
-int    maxbcache;                      /* max buffer cache KVA storage */
+long   maxswzone;                      /* max swmeta KVA storage */
+long   maxbcache;                      /* max buffer cache KVA storage */
 u_quad_t       maxtsiz;                        /* max text size */
 u_quad_t       dfldsiz;                        /* initial data size limit */
 u_quad_t       maxdsiz;                        /* max data size */
@@ -121,11 +121,11 @@ init_param1(void)
 #ifdef VM_SWZONE_SIZE_MAX
        maxswzone = VM_SWZONE_SIZE_MAX;
 #endif
-       TUNABLE_INT_FETCH("kern.maxswzone", &maxswzone);
+       TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone);
 #ifdef VM_BCACHE_SIZE_MAX
        maxbcache = VM_BCACHE_SIZE_MAX;
 #endif
-       TUNABLE_INT_FETCH("kern.maxbcache", &maxbcache);
+       TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache);
        maxtsiz = MAXTSIZ;
        TUNABLE_QUAD_FETCH("kern.maxtsiz", &maxtsiz);
        dfldsiz = DFLDSIZ;
index b1a9f2e..a3c3ae9 100644 (file)
@@ -81,7 +81,7 @@ typedef struct blmeta {
 
 typedef struct blist {
        swblk_t         bl_blocks;      /* area of coverage             */
-       swblk_t         bl_radix;       /* coverage radix               */
+       int64_t         bl_radix;       /* coverage radix               */
        swblk_t         bl_skip;        /* starting skip                */
        swblk_t         bl_free;        /* number of free blocks        */
        blmeta_t        *bl_root;       /* root of radix tree           */
@@ -91,6 +91,18 @@ typedef struct blist {
 #define BLIST_META_RADIX       16
 #define BLIST_BMAP_RADIX       (sizeof(u_swblk_t)*8)
 
+/*
+ * The radix can be up to BLIST_BMAP_RADIX times the largest skip,
+ * based on the initial skip calculation in blist_create().
+ *
+ * The radix will exceed the size of a 32 bit signed (or unsigned) int
+ * when the maximal number of blocks is allocated.  This corresponds
+ * to ~1G x PAGE_SIZE = 4096GB.  The swap code usually divides this
+ * by 4, leaving us with a capability of up to four 1TB swap devices.
+ */
+#define BLIST_MAXBLKS          (0x40000000 /           \
+                                (BLIST_BMAP_RADIX / BLIST_META_RADIX))
+
 #define BLIST_MAX_ALLOC                BLIST_BMAP_RADIX
 
 extern blist_t blist_create(swblk_t blocks);
index 1a6668d..64cf191 100644 (file)
@@ -365,8 +365,8 @@ struct cluster_save {
 
 #ifdef _KERNEL
 extern int     nbuf;                   /* The number of buffer headers */
-extern int     maxswzone;              /* Max KVA for swap structures */
-extern int     maxbcache;              /* Max KVA for buffer cache */
+extern long    maxswzone;              /* Max KVA for swap structures */
+extern long    maxbcache;              /* Max KVA for buffer cache */
 extern int     runningbufspace;
 extern int     runningbufcount;
 extern int     hidirtybufspace;
index 0168bbc..3eb54b5 100644 (file)
@@ -308,6 +308,7 @@ struct tunable_int {
                tunable_int_init, &__tunable_int_ ## line)
 
 #define        TUNABLE_INT_FETCH(path, var)    kgetenv_int((path), (var))
+#define        TUNABLE_LONG_FETCH(path, var)   kgetenv_long((path), (var))
 
 /* Backwards compatibility with the old deprecated TUNABLE_INT_DECL API */
 #define TUNABLE_INT_DECL(path, defval, var)    \
index d1df769..977e36d 100644 (file)
@@ -253,6 +253,7 @@ int kgetenv_int (const char *name, int *data);
 int    kgetenv_string (const char *name, char *data, int size);
 int    kgetenv_ulong(const char *name, unsigned long *data);
 int    kgetenv_quad (const char *name, quad_t *data);
+int    kgetenv_long(const char *name, long *data);
 extern char *kern_envp;
 
 #ifdef APM_FIXUP_CALLTODO 
index 1b9eabc..fde1d25 100644 (file)
@@ -333,13 +333,15 @@ swap_pager_swap_init(void)
        nsw_wcount_async_max = nsw_wcount_async;
 
        /*
-        * Initialize our zone.  Right now I'm just guessing on the number
-        * we need based on the number of pages in the system.  Each swblock
-        * can hold 16 pages, so this is probably overkill.  This reservation
-        * is typically limited to around 32MB by default.
+        * The zone is dynamically allocated so generally size it to
+        * maxswzone (32MB to 512MB of KVM).  Set a minimum size based
+        * on physical memory of around 8x (each swblock can hold 16 pages).
+        *
+        * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
+        * has increased dramatically.
         */
        n = vmstats.v_page_count / 2;
-       if (maxswzone && n > maxswzone / sizeof(struct swblock))
+       if (maxswzone && n < maxswzone / sizeof(struct swblock))
                n = maxswzone / sizeof(struct swblock);
        n2 = n;
 
index ac6fb0c..dd30a4d 100644 (file)
@@ -67,7 +67,7 @@
 #endif
 static struct swdevt should_be_malloced[NSWAPDEV];
 struct swdevt *swdevt = should_be_malloced;    /* exported to pstat/systat */
-static int nswap;              /* first block after the interleaved devs */
+static swblk_t nswap;          /* first block after the interleaved devs */
 int nswdev = NSWAPDEV;                         /* exported to pstat/systat */
 int vm_swap_size;
 
@@ -227,9 +227,9 @@ sys_swapon(struct swapon_args *uap)
  * XXX locking when multiple swapon's run in parallel
  */
 int
-swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
+swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks)
 {
-       u_long aligned_nblks;
+       swblk_t aligned_nblks;
        int64_t dpsize;
        struct ucred *cred;
        struct swdevt *sp;
@@ -282,47 +282,50 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
                        VOP_CLOSE(vp, FREAD | FWRITE);
                        return (ENXIO);
                }
-               if ((u_int64_t)dpsize < 0x100000000ULL)
-                       nblks = (u_long)dpsize;
-               else
-                       nblks = 0xffffffffU;
+               nblks = (u_quad_t)dpsize;
        }
        if (nblks == 0) {
                VOP_CLOSE(vp, FREAD | FWRITE);
                return (ENXIO);
        }
 
-       /*
-        * If we go beyond this, we get overflows in the radix
-        * tree bitmap code.
-        */
-       if (nblks > 0x40000000 / BLIST_META_RADIX / nswdev) {
-               kprintf("exceeded maximum of %d blocks per swap unit\n",
-                       0x40000000 / BLIST_META_RADIX / nswdev);
-               VOP_CLOSE(vp, FREAD | FWRITE);
-               return (ENXIO);
-       }
        /*
         * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
         * First chop nblks off to page-align it, then convert.
         * 
         * sw->sw_nblks is in page-sized chunks now too.
         */
-       nblks &= ~(ctodb(1) - 1);
+       nblks &= ~(u_quad_t)(ctodb(1) - 1);
        nblks = dbtoc(nblks);
 
+       /*
+        * Post-conversion nblks must not be >= BLIST_MAXBLKS, and
+        * we impose a 4-swap-device limit so we have to divide it out
+        * further.  Going beyond this will result in overflows in the
+        * blist code.
+        *
+        * Post-conversion nblks must fit within a (swblk_t), which
+        * this test also ensures.
+        */
+       if (nblks > BLIST_MAXBLKS / nswdev) {
+               kprintf("exceeded maximum of %d blocks per swap unit\n",
+                       (int)BLIST_MAXBLKS / nswdev);
+               VOP_CLOSE(vp, FREAD | FWRITE);
+               return (ENXIO);
+       }
+
        sp->sw_vp = vp;
        sp->sw_dev = dev2udev(dev);
        sp->sw_device = dev;
        sp->sw_flags |= SW_FREED;
-       sp->sw_nblks = nblks;
+       sp->sw_nblks = (swblk_t)nblks;
 
        /*
         * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
         * DEV_BSIZE'd.   aligned_nblks is used to calculate the
         * size of the swap bitmap, taking into account the stripe size.
         */
-       aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);
+       aligned_nblks = (swblk_t)((nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1));
 
        if (aligned_nblks * nswdev > nswap)
                nswap = aligned_nblks * nswdev;