kernel - split the pbuf subsystem into two (kva and non-kva)
authorMatthew Dillon <dillon@apollo.backplane.com>
Sun, 15 Aug 2010 18:57:53 +0000 (11:57 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sun, 15 Aug 2010 18:57:53 +0000 (11:57 -0700)
Most pbufs do not require kva reservations.  Split the API to provide
pbufs without kva reservations out of a much larger pool of pbufs, fixing
deadlock issues with the DM subsystem.

Note in particular that the hammer reblocker can eat upwards of 5000 pbufs
when operating on multi-layered DM-based storage, which far exceeds the
256 kva-reserved pbufs the old API normally made available.

* Getpbuf() / trypbuf() now return pbufs without KVA reservations.

* Add getpbuf_kva() and trypbuf_kva() to get pbufs with KVA reservations.

* Fixes pbuf deadlocks in the low level I/O subsystem, particularly DM
  crypt, stripe, and mirror.

sys/bus/cam/cam_periph.c
sys/kern/kern_dsched.c
sys/kern/kern_physio.c
sys/kern/vfs_aio.c
sys/kern/vfs_bio.c
sys/kern/vfs_cluster.c
sys/sys/buf.h
sys/vfs/ufs/ffs_rawread.c
sys/vm/swap_pager.c
sys/vm/vm_pager.c

index 839902c..2571643 100644 (file)
@@ -701,7 +701,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
                /*
                 * Get the buffer.
                 */
-               bp = getpbuf(NULL);
+               bp = getpbuf_kva(NULL);
 
                /* save the original user pointer */
                mapinfo->saved_ptrs[i] = *data_ptrs[i];
index 813f009..3b09494 100644 (file)
@@ -498,8 +498,13 @@ dsched_strategy_sync(struct disk *dp, struct bio *bio)
        nbp->b_bcount = bp->b_bcount;
        nbp->b_resid = bp->b_resid;
        nbp->b_data = bp->b_data;
+#if 0
+       /*
+        * Buffers undergoing device I/O do not need a kvabase/size.
+        */
        nbp->b_kvabase = bp->b_kvabase;
        nbp->b_kvasize = bp->b_kvasize;
+#endif
        nbp->b_dirtyend = bp->b_dirtyend;
 
        nbio->bio_done = biodone_sync;
@@ -514,6 +519,10 @@ dsched_strategy_sync(struct disk *dp, struct bio *bio)
        bp->b_resid = nbp->b_resid;
        bp->b_error = nbp->b_error;
        biodone(bio);
+#if 0
+       nbp->b_kvabase = NULL;
+       nbp->b_kvasize = 0;
+#endif
        relpbuf(nbp, NULL);
 }
 
index f930f4e..4bcf743 100644 (file)
@@ -44,7 +44,7 @@ physio(cdev_t dev, struct uio *uio, int ioflag)
        caddr_t ubase;
        struct buf *bp;
 
-       bp = getpbuf(NULL);
+       bp = getpbuf_kva(NULL);
        saflags = bp->b_flags;
        error = 0;
 
index 22a0e86..e87558a 100644 (file)
@@ -912,7 +912,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
                lj->lioj_buffer_count++;
 
        /* Create and build a buffer header for a transfer. */
-       bp = getpbuf(NULL);
+       bp = getpbuf_kva(NULL);
        BUF_KERNPROC(bp);
 
        /*
index 720ce86..3ca7175 100644 (file)
@@ -790,6 +790,7 @@ bfreekva(struct buf *bp)
                vm_map_unlock(&buffer_map);
                vm_map_entry_release(count);
                bp->b_kvasize = 0;
+               bp->b_kvabase = NULL;
                bufspacewakeup();
                rel_mplock();
        }
@@ -4413,6 +4414,7 @@ vmapbuf(struct buf *bp, caddr_t udata, int bytes)
         */
        KKASSERT(bp->b_cmd != BUF_CMD_DONE);
        KKASSERT(bp->b_flags & B_PAGING);
+       KKASSERT(bp->b_kvabase);
 
        if (bytes < 0)
                return (-1);
index 54ad63b..a3787e3 100644 (file)
@@ -380,7 +380,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
                return tbp;
        }
 
-       bp = trypbuf(&cluster_pbuf_freecnt);
+       bp = trypbuf_kva(&cluster_pbuf_freecnt);
        if (bp == NULL) {
                return tbp;
        }
@@ -828,7 +828,7 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
                    (tbp->b_bcount != tbp->b_bufsize) ||
                    (tbp->b_bcount != blksize) ||
                    (bytes == blksize) ||
-                   ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+                   ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
                        totalwritten += tbp->b_bufsize;
                        bawrite(tbp);
                        start_loffset += blksize;
index 2fdc7dc..2978c62 100644 (file)
@@ -415,6 +415,7 @@ void        brelse (struct buf *);
 void   bqrelse (struct buf *);
 int    vfs_bio_awrite (struct buf *);
 struct buf *getpbuf (int *);
+struct buf *getpbuf_kva (int *);
 int    inmem (struct vnode *, off_t);
 struct buf *findblk (struct vnode *, off_t, int);
 struct buf *getblk (struct vnode *, off_t, int, int, int);
@@ -448,6 +449,7 @@ int allocbuf (struct buf *bp, int size);
 int    scan_all_buffers (int (*)(struct buf *, void *), void *);
 void   reassignbuf (struct buf *);
 struct buf *trypbuf (int *);
+struct buf *trypbuf_kva (int *);
 void   bio_ops_sync(struct mount *mp);
 void   vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to);
 void   vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to);
index 0876e37..72dfbbb 100644 (file)
@@ -249,7 +249,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio)
                
                if (bp == NULL) { /* Setup first read */
                        /* XXX: Leave some bufs for swap */
-                       bp = getpbuf(&ffsrawbufcnt);
+                       bp = getpbuf_kva(&ffsrawbufcnt);
                        error = ffs_rawread_readahead(vp, udata, offset, resid,
                                                      bp, &baseticks);
                        if (error != 0)
@@ -258,7 +258,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio)
                        if (resid > bp->b_bufsize) { /* Setup fist readahead */
                                /* XXX: Leave bufs for swap */
                                if (rawreadahead != 0) 
-                                       nbp = trypbuf(&ffsrawbufcnt);
+                                       nbp = trypbuf_kva(&ffsrawbufcnt);
                                else
                                        nbp = NULL;
                                if (nbp != NULL) {
index 89faeef..8554ce9 100644 (file)
@@ -1313,7 +1313,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        /*
         * map our page(s) into kva for input
         */
-       bp = getpbuf(&nsw_rcount);
+       bp = getpbuf_kva(&nsw_rcount);
        bio = &bp->b_bio1;
        kva = (vm_offset_t) bp->b_kvabase;
        bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
@@ -1561,9 +1561,9 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * request and assign the swap space.
                 */
                if (sync == TRUE)
-                       bp = getpbuf(&nsw_wcount_sync);
+                       bp = getpbuf_kva(&nsw_wcount_sync);
                else
-                       bp = getpbuf(&nsw_wcount_async);
+                       bp = getpbuf_kva(&nsw_wcount_async);
                bio = &bp->b_bio1;
 
                pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
index 7021669..6899f3b 100644 (file)
 #include <sys/malloc.h>
 #include <sys/dsched.h>
 #include <sys/proc.h>
+#include <sys/sysctl.h>
 #include <sys/thread2.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
+#include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
@@ -177,13 +179,24 @@ int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
  */
 #define PAGER_MAP_SIZE (8 * 1024 * 1024)
 
+TAILQ_HEAD(swqueue, buf);
+
 int pager_map_size = PAGER_MAP_SIZE;
 struct vm_map pager_map;
 
-static int bswneeded;
+static int bswneeded_raw;
+static int bswneeded_kva;
+static int nswbuf_raw;
+static struct buf *swbuf_raw;
 static vm_offset_t swapbkva;           /* swap buffers kva */
-static TAILQ_HEAD(swqueue, buf) bswlist;
+static struct swqueue bswlist_raw;     /* without kva */
+static struct swqueue bswlist_kva;     /* with kva */
 static struct spinlock bswspin = SPINLOCK_INITIALIZER(&bswspin);
+static int pbuf_raw_count;
+static int pbuf_kva_count;
+
+SYSCTL_INT(_vfs, OID_AUTO, pbuf_raw_count, CTLFLAG_RD, &pbuf_raw_count, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, pbuf_kva_count, CTLFLAG_RD, &pbuf_kva_count, 0, "");
 
 /*
  * Initialize the swap buffer list.
@@ -193,7 +206,8 @@ static struct spinlock bswspin = SPINLOCK_INITIALIZER(&bswspin);
 static void
 vm_pager_init(void *arg __unused)
 {
-       TAILQ_INIT(&bswlist);
+       TAILQ_INIT(&bswlist_raw);
+       TAILQ_INIT(&bswlist_kva);
 }
 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_SECOND, vm_pager_init, NULL)
 
@@ -214,15 +228,32 @@ vm_pager_bufferinit(void)
                panic("Not enough pager_map VM space for physical buffers");
 
        /*
-        * Initial pbuf setup.
+        * Initial pbuf setup.  These pbufs have KVA reservations.
         */
        bp = swbuf;
        for (i = 0; i < nswbuf; ++i, ++bp) {
                bp->b_kvabase = (caddr_t)((intptr_t)i * MAXPHYS) + swapbkva;
                bp->b_kvasize = MAXPHYS;
-               TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
                BUF_LOCKINIT(bp);
                buf_dep_init(bp);
+               TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist);
+               ++pbuf_kva_count;
+       }
+
+       /*
+        * Initial pbuf setup.  These pbufs do not have KVA reservations,
+        * so we can have a lot more of them.  These are typically used
+        * to massage low level buf/bio requests.
+        */
+       nswbuf_raw = nbuf * 2;
+       swbuf_raw = (void *)kmem_alloc(&kernel_map,
+                               round_page(nswbuf_raw * sizeof(struct buf)));
+       bp = swbuf_raw;
+       for (i = 0; i < nswbuf_raw; ++i, ++bp) {
+               BUF_LOCKINIT(bp);
+               buf_dep_init(bp);
+               TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist);
+               ++pbuf_raw_count;
        }
 
        /*
@@ -276,8 +307,8 @@ vm_pager_sync(void)
 static void
 initpbuf(struct buf *bp)
 {
-       bp->b_qindex = 0; /* BQUEUE_NONE */
-       bp->b_data = bp->b_kvabase;
+       bp->b_qindex = 0;               /* BQUEUE_NONE */
+       bp->b_data = bp->b_kvabase;     /* NULL if pbuf sans kva */
        bp->b_flags = B_PAGING;
        bp->b_cmd = BUF_CMD_DONE;
        bp->b_error = 0;
@@ -303,6 +334,11 @@ initpbuf(struct buf *bp)
  *     NOTE: pfreecnt can be NULL, but this 'feature' will be removed
  *     relatively soon when the rest of the subsystems get smart about it. XXX
  *
+ *     Physical buffers can be with or without KVA space reserved.  There
+ *     are severe limitations on the ones with KVA reserved, and fewer
+ *     limitations on the ones without.  getpbuf() gets one without,
+ *     getpbuf_kva() gets one with.
+ *
  * No requirements.
  */
 struct buf *
@@ -319,13 +355,14 @@ getpbuf(int *pfreecnt)
                }
 
                /* get a bp from the swap buffer header pool */
-               if ((bp = TAILQ_FIRST(&bswlist)) != NULL)
+               if ((bp = TAILQ_FIRST(&bswlist_raw)) != NULL)
                        break;
-               bswneeded = 1;
-               ssleep(&bswneeded, &bswspin, 0, "wswbuf1", 0);
+               bswneeded_raw = 1;
+               ssleep(&bswneeded_raw, &bswspin, 0, "wswbuf1", 0);
                /* loop in case someone else grabbed one */
        }
-       TAILQ_REMOVE(&bswlist, bp, b_freelist);
+       TAILQ_REMOVE(&bswlist_raw, bp, b_freelist);
+       --pbuf_raw_count;
        if (pfreecnt)
                --*pfreecnt;
 
@@ -333,7 +370,41 @@ getpbuf(int *pfreecnt)
 
        initpbuf(bp);
        KKASSERT(dsched_is_clear_buf_priv(bp));
-       return bp;
+
+       return (bp);
+}
+
+struct buf *
+getpbuf_kva(int *pfreecnt)
+{
+       struct buf *bp;
+
+       spin_lock_wr(&bswspin);
+
+       for (;;) {
+               if (pfreecnt) {
+                       while (*pfreecnt == 0)
+                               ssleep(pfreecnt, &bswspin, 0, "wswbuf0", 0);
+               }
+
+               /* get a bp from the swap buffer header pool */
+               if ((bp = TAILQ_FIRST(&bswlist_kva)) != NULL)
+                       break;
+               bswneeded_kva = 1;
+               ssleep(&bswneeded_kva, &bswspin, 0, "wswbuf1", 0);
+               /* loop in case someone else grabbed one */
+       }
+       TAILQ_REMOVE(&bswlist_kva, bp, b_freelist);
+       --pbuf_kva_count;
+       if (pfreecnt)
+               --*pfreecnt;
+
+       spin_unlock_wr(&bswspin);
+
+       initpbuf(bp);
+       KKASSERT(dsched_is_clear_buf_priv(bp));
+
+       return (bp);
 }
 
 /*
@@ -351,11 +422,34 @@ trypbuf(int *pfreecnt)
 
        spin_lock_wr(&bswspin);
 
-       if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) {
+       if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_raw)) == NULL) {
+               spin_unlock_wr(&bswspin);
+               return NULL;
+       }
+       TAILQ_REMOVE(&bswlist_raw, bp, b_freelist);
+       --pbuf_raw_count;
+       --*pfreecnt;
+
+       spin_unlock_wr(&bswspin);
+
+       initpbuf(bp);
+
+       return bp;
+}
+
+struct buf *
+trypbuf_kva(int *pfreecnt)
+{
+       struct buf *bp;
+
+       spin_lock_wr(&bswspin);
+
+       if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_kva)) == NULL) {
                spin_unlock_wr(&bswspin);
                return NULL;
        }
-       TAILQ_REMOVE(&bswlist, bp, b_freelist);
+       TAILQ_REMOVE(&bswlist_kva, bp, b_freelist);
+       --pbuf_kva_count;
        --*pfreecnt;
 
        spin_unlock_wr(&bswspin);
@@ -376,7 +470,8 @@ trypbuf(int *pfreecnt)
 void
 relpbuf(struct buf *bp, int *pfreecnt)
 {
-       int wake_bsw = 0;
+       int wake_bsw_kva = 0;
+       int wake_bsw_raw = 0;
        int wake_freecnt = 0;
 
        KKASSERT(bp->b_flags & B_PAGING);
@@ -385,10 +480,20 @@ relpbuf(struct buf *bp, int *pfreecnt)
        spin_lock_wr(&bswspin);
 
        BUF_UNLOCK(bp);
-       TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);
-       if (bswneeded) {
-               bswneeded = 0;
-               wake_bsw = 1;
+       if (bp->b_kvabase) {
+               TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist);
+               ++pbuf_kva_count;
+       } else {
+               TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist);
+               ++pbuf_raw_count;
+       }
+       if (bswneeded_kva) {
+               bswneeded_kva = 0;
+               wake_bsw_kva = 1;
+       }
+       if (bswneeded_raw) {
+               bswneeded_raw = 0;
+               wake_bsw_raw = 1;
        }
        if (pfreecnt) {
                if (++*pfreecnt == 1)
@@ -397,8 +502,10 @@ relpbuf(struct buf *bp, int *pfreecnt)
 
        spin_unlock_wr(&bswspin);
 
-       if (wake_bsw)
-               wakeup(&bswneeded);
+       if (wake_bsw_kva)
+               wakeup(&bswneeded_kva);
+       if (wake_bsw_raw)
+               wakeup(&bswneeded_raw);
        if (wake_freecnt)
                wakeup(pfreecnt);
 }