kernel - Add swap block allocation iterator
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 3 Mar 2013 01:03:17 +0000 (17:03 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 3 Mar 2013 01:03:17 +0000 (17:03 -0800)
* Instead of allocating the first available block, the swap allocator
  now has an iterator and attempts to allocate a block near the iterator.
  On failure, the iterator resets to the beginning of swap (0) and the
  allocation is retried.

* This theoretically should result in more linearized allocations of
  swap space, allowing the pageout daemon to flush memory to a
  hard-drive-based swap at much higher bandwidth.

* Greatly improves poudriere when using stressful memory parameters,
  at least in the first pass.

* There are still some obvious linearity issues that can occur once
  the iterator recycles back to 0, and these still need to be addressed.
  However, this change is certainly not going to be worse and should
  prevent degenerate swap situations where performance winds up
  being permanently bad due to fragmented data laid down earlier that
  is never paged back into memory.

sys/kern/subr_blist.c
sys/sys/blist.h
sys/vm/swap_pager.c

index 0d491d0..d5ad45a 100644 (file)
@@ -132,9 +132,11 @@ void panic(const char *ctl, ...);
  * static support functions
  */
 
-static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count);
-static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blk, 
-                               swblk_t count, int64_t radix, int skip);
+static swblk_t blst_leaf_alloc(blmeta_t *scan, swblk_t blkat,
+                               swblk_t blk, int count);
+static swblk_t blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
+                               swblk_t blk, swblk_t count,
+                               int64_t radix, int skip);
 static void blst_leaf_free(blmeta_t *scan, swblk_t relblk, int count);
 static void blst_meta_free(blmeta_t *scan, swblk_t freeBlk, swblk_t count, 
                                        int64_t radix, int skip, swblk_t blk);
@@ -228,9 +230,27 @@ blist_alloc(blist_t bl, swblk_t count)
 
        if (bl) {
                if (bl->bl_radix == BLIST_BMAP_RADIX)
-                       blk = blst_leaf_alloc(bl->bl_root, 0, count);
+                       blk = blst_leaf_alloc(bl->bl_root, 0, 0, count);
                else
-                       blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
+                       blk = blst_meta_alloc(bl->bl_root, 0, 0, count,
+                                             bl->bl_radix, bl->bl_skip);
+               if (blk != SWAPBLK_NONE)
+                       bl->bl_free -= count;
+       }
+       return(blk);
+}
+
+swblk_t
+blist_allocat(blist_t bl, swblk_t count, swblk_t blkat)
+{
+       swblk_t blk = SWAPBLK_NONE;
+
+       if (bl) {
+               if (bl->bl_radix == BLIST_BMAP_RADIX)
+                       blk = blst_leaf_alloc(bl->bl_root, blkat, 0, count);
+               else
+                       blk = blst_meta_alloc(bl->bl_root, blkat, 0, count,
+                                             bl->bl_radix, bl->bl_skip);
                if (blk != SWAPBLK_NONE)
                        bl->bl_free -= count;
        }
@@ -345,7 +365,7 @@ blist_print(blist_t bl)
  */
 
 static swblk_t
-blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count)
+blst_leaf_alloc(blmeta_t *scan, swblk_t blkat __unused, swblk_t blk, int count)
 {
        u_swblk_t orig = scan->u.bmu_bitmap;
 
@@ -417,11 +437,13 @@ blst_leaf_alloc(blmeta_t *scan, swblk_t blk, int count)
  *     and we have a few optimizations strewn in as well.
  */
 static swblk_t
-blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
+blst_meta_alloc(blmeta_t *scan, swblk_t blkat,
+               swblk_t blk, swblk_t count,
                int64_t radix, int skip)
 {
        int i;
        int next_skip = ((u_int)skip / BLIST_META_RADIX);
+       int hintok = (blk >= blkat);
 
        /*
         * ALL-ALLOCATED special case
@@ -457,15 +479,18 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
        }
 
        for (i = 1; i <= skip; i += next_skip) {
-               if (count <= scan[i].bm_bighint) {
+               if (count <= scan[i].bm_bighint &&
+                   blk + (swblk_t)radix > blkat) {
                        /*
                         * count fits in object
                         */
                        swblk_t r;
                        if (next_skip == 1) {
-                               r = blst_leaf_alloc(&scan[i], blk, count);
+                               r = blst_leaf_alloc(&scan[i], blkat,
+                                                   blk, count);
                        } else {
-                               r = blst_meta_alloc(&scan[i], blk, count,
+                               r = blst_meta_alloc(&scan[i], blkat,
+                                                   blk, count,
                                                    radix, next_skip - 1);
                        }
                        if (r != SWAPBLK_NONE) {
@@ -493,7 +518,7 @@ blst_meta_alloc(blmeta_t *scan, swblk_t blk, swblk_t count,
        /*
         * We couldn't allocate count in this subtree, update bighint.
         */
-       if (scan->bm_bighint >= count)
+       if (hintok && scan->bm_bighint >= count)
                scan->bm_bighint = count - 1;
        return(SWAPBLK_NONE);
 }
@@ -1022,6 +1047,7 @@ main(int ac, char **av)
                char buf[1024];
                swblk_t da = 0;
                swblk_t count = 0;
+               swblk_t blkat;
 
 
                kprintf("%d/%d/%lld> ",
@@ -1041,9 +1067,12 @@ main(int ac, char **av)
                        blist_print(bl);
                        break;
                case 'a':
-                       if (sscanf(buf + 1, "%d", &count) == 1) {
+                       if (sscanf(buf + 1, "%d %d", &count, &blkat) == 1) {
                                swblk_t blk = blist_alloc(bl, count);
                                kprintf("    R=%04x\n", blk);
+                       } else if (sscanf(buf + 1, "%d %d", &count, &blkat) == 2) {
+                               swblk_t blk = blist_allocat(bl, count, blkat);
+                               kprintf("    R=%04x\n", blk);
                        } else {
                                kprintf("?\n");
                        }
index 1ff7e75..f9062ff 100644 (file)
@@ -116,6 +116,7 @@ typedef struct blist {
 extern blist_t blist_create(swblk_t blocks);
 extern void blist_destroy(blist_t blist);
 extern swblk_t blist_alloc(blist_t blist, swblk_t count);
+extern swblk_t blist_allocat(blist_t blist, swblk_t count, swblk_t blkat);
 extern void blist_free(blist_t blist, swblk_t blkno, swblk_t count);
 extern swblk_t blist_fill(blist_t blist, swblk_t blkno, swblk_t count);
 extern void blist_print(blist_t blist);
index dc08225..2883d6a 100644 (file)
@@ -165,6 +165,7 @@ static int nsw_cluster_max; /* maximum VOP I/O allowed              */
 struct blist *swapblist;
 static int swap_async_max = 4; /* maximum in-progress async I/O's      */
 static int swap_burst_read = 0;        /* allow burst reading */
+static swblk_t swapiterator;   /* linearize allocations */
 
 /* from vm_swap.c */
 extern struct vnode *swapdev_vp;
@@ -481,7 +482,10 @@ swp_pager_getswapspace(vm_object_t object, int npages)
        swblk_t blk;
 
        lwkt_gettoken(&vm_token);
-       if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
+       blk = blist_allocat(swapblist, npages, swapiterator);
+       if (blk == SWAPBLK_NONE)
+               blk = blist_allocat(swapblist, npages, 0);
+       if (blk == SWAPBLK_NONE) {
                if (swap_pager_full != 2) {
                        kprintf("swap_pager_getswapspace: failed alloc=%d\n",
                                npages);
@@ -489,6 +493,7 @@ swp_pager_getswapspace(vm_object_t object, int npages)
                        swap_pager_almost_full = 1;
                }
        } else {
+               swapiterator = blk;
                swapacctspace(blk, -npages);
                if (object->type == OBJT_SWAP)
                        vm_swap_anon_use += npages;
@@ -900,8 +905,11 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        char *data;
        struct bio *biox;
        struct buf *bufx;
+#if 0
        struct bio_track *track;
+#endif
 
+#if 0
        /*
         * tracking for swapdev vnode I/Os
         */
@@ -909,6 +917,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                track = &swapdev_vp->v_track_read;
        else
                track = &swapdev_vp->v_track_write;
+#endif
 
        if (bp->b_bcount & PAGE_MASK) {
                bp->b_error = EINVAL;