From: Matthew Dillon Date: Sun, 15 Aug 2010 18:57:53 +0000 (-0700) Subject: kernel - split the pbuf subsystem into two (kva and non-kva) X-Git-Tag: v2.9.0~520 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/9a82e536c033562735802be193d648f04ed8bf6d kernel - split the pbuf subsystem into two (kva and non-kva) Most pbufs do not require kva reservations. Split the API to provide pbufs without kva reservations out of a much larger pool of pbufs, fixing deadlock issues with the DM subsystem. Note in particular that the hammer reblocker can eat upwards of 5000 pbufs when operating on multi-layered DM-based storage, which blows away the 256 pbufs normally available with kva reservations that the old API had. * Getpbuf() / trypbuf() now return pbufs without KVA reservations. * Add getpbuf_kva() and trypbuf_kva() to get pbufs with KVA reservations. * Fixes pbuf deadlocks in the low level I/O subsystem, particularly DM crypt, stripe, and mirror. --- diff --git a/sys/bus/cam/cam_periph.c b/sys/bus/cam/cam_periph.c index 839902cf88..2571643735 100644 --- a/sys/bus/cam/cam_periph.c +++ b/sys/bus/cam/cam_periph.c @@ -701,7 +701,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) /* * Get the buffer. */ - bp = getpbuf(NULL); + bp = getpbuf_kva(NULL); /* save the original user pointer */ mapinfo->saved_ptrs[i] = *data_ptrs[i]; diff --git a/sys/kern/kern_dsched.c b/sys/kern/kern_dsched.c index 813f00963b..3b09494392 100644 --- a/sys/kern/kern_dsched.c +++ b/sys/kern/kern_dsched.c @@ -498,8 +498,13 @@ dsched_strategy_sync(struct disk *dp, struct bio *bio) nbp->b_bcount = bp->b_bcount; nbp->b_resid = bp->b_resid; nbp->b_data = bp->b_data; +#if 0 + /* + * Buffers undergoing device I/O do not need a kvabase/size. + */ nbp->b_kvabase = bp->b_kvabase; nbp->b_kvasize = bp->b_kvasize; +#endif nbp->b_dirtyend = bp->b_dirtyend; nbio->bio_done = biodone_sync; @@ -514,6 +519,10 @@ dsched_strategy_sync(struct disk *dp, struct bio *bio) bp->b_resid = nbp->b_resid; bp->b_error = nbp->b_error; biodone(bio); +#if 0 + nbp->b_kvabase = NULL; + nbp->b_kvasize = 0; +#endif relpbuf(nbp, NULL); } diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index f930f4e6ff..4bcf743065 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -44,7 +44,7 @@ physio(cdev_t dev, struct uio *uio, int ioflag) caddr_t ubase; struct buf *bp; - bp = getpbuf(NULL); + bp = getpbuf_kva(NULL); saflags = bp->b_flags; error = 0; diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 22a0e86997..e87558a3e9 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -912,7 +912,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe) lj->lioj_buffer_count++; /* Create and build a buffer header for a transfer. */ - bp = getpbuf(NULL); + bp = getpbuf_kva(NULL); BUF_KERNPROC(bp); /* diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 720ce8617a..3ca7175015 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -790,6 +790,7 @@ bfreekva(struct buf *bp) vm_map_unlock(&buffer_map); vm_map_entry_release(count); bp->b_kvasize = 0; + bp->b_kvabase = NULL; bufspacewakeup(); rel_mplock(); } @@ -4413,6 +4414,7 @@ vmapbuf(struct buf *bp, caddr_t udata, int bytes) */ KKASSERT(bp->b_cmd != BUF_CMD_DONE); KKASSERT(bp->b_flags & B_PAGING); + KKASSERT(bp->b_kvabase); if (bytes < 0) return (-1); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 54ad63b4a6..a3787e30ce 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -380,7 +380,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset, return tbp; } - bp = trypbuf(&cluster_pbuf_freecnt); + bp = trypbuf_kva(&cluster_pbuf_freecnt); if (bp == NULL) { return tbp; } @@ -828,7 +828,7 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != blksize) || (bytes == blksize) || - ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); start_loffset += blksize; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 2fdc7dc684..2978c62686 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -415,6 +415,7 @@ void brelse (struct buf *); void bqrelse (struct buf *); int vfs_bio_awrite (struct buf *); struct buf *getpbuf (int *); +struct buf *getpbuf_kva (int *); int inmem (struct vnode *, off_t); struct buf *findblk (struct vnode *, off_t, int); struct buf *getblk (struct vnode *, off_t, int, int, int); @@ -448,6 +449,7 @@ int allocbuf (struct buf *bp, int size); int scan_all_buffers (int (*)(struct buf *, void *), void *); void reassignbuf (struct buf *); struct buf *trypbuf (int *); +struct buf *trypbuf_kva (int *); void bio_ops_sync(struct mount *mp); void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c index 0876e37a42..72dfbbb8c9 100644 --- a/sys/vfs/ufs/ffs_rawread.c +++ b/sys/vfs/ufs/ffs_rawread.c @@ -249,7 +249,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (bp == NULL) { /* Setup first read */ /* XXX: Leave some bufs for swap */ - bp = getpbuf(&ffsrawbufcnt); + bp = getpbuf_kva(&ffsrawbufcnt); error = ffs_rawread_readahead(vp, udata, offset, resid, bp, &baseticks); if (error != 0) @@ -258,7 +258,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (resid > bp->b_bufsize) { /* Setup fist readahead */ /* XXX: Leave bufs for swap */ if (rawreadahead != 0) - nbp = trypbuf(&ffsrawbufcnt); + nbp = trypbuf_kva(&ffsrawbufcnt); else nbp = NULL; if (nbp != NULL) { diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 89faeef53b..8554ce9863 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1313,7 +1313,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) /* * map our page(s) into kva for input */ - bp = getpbuf(&nsw_rcount); + bp = getpbuf_kva(&nsw_rcount); bio = &bp->b_bio1; kva = (vm_offset_t) bp->b_kvabase; bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t)); @@ -1561,9 +1561,9 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, * request and assign the swap space. */ if (sync == TRUE) - bp = getpbuf(&nsw_wcount_sync); + bp = getpbuf_kva(&nsw_wcount_sync); else - bp = getpbuf(&nsw_wcount_async); + bp = getpbuf_kva(&nsw_wcount_async); bio = &bp->b_bio1; pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 7021669436..6899f3ba70 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -81,10 +81,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -177,13 +179,24 @@ int npagers = sizeof(pagertab) / sizeof(pagertab[0]); */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) +TAILQ_HEAD(swqueue, buf); + int pager_map_size = PAGER_MAP_SIZE; struct vm_map pager_map; -static int bswneeded; +static int bswneeded_raw; +static int bswneeded_kva; +static int nswbuf_raw; +static struct buf *swbuf_raw; static vm_offset_t swapbkva; /* swap buffers kva */ -static TAILQ_HEAD(swqueue, buf) bswlist; +static struct swqueue bswlist_raw; /* without kva */ +static struct swqueue bswlist_kva; /* with kva */ static struct spinlock bswspin = SPINLOCK_INITIALIZER(&bswspin); +static int pbuf_raw_count; +static int pbuf_kva_count; + +SYSCTL_INT(_vfs, OID_AUTO, pbuf_raw_count, CTLFLAG_RD, &pbuf_raw_count, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, pbuf_kva_count, CTLFLAG_RD, &pbuf_kva_count, 0, ""); /* * Initialize the swap buffer list. @@ -193,7 +206,8 @@ static struct spinlock bswspin = SPINLOCK_INITIALIZER(&bswspin); static void vm_pager_init(void *arg __unused) { - TAILQ_INIT(&bswlist); + TAILQ_INIT(&bswlist_raw); + TAILQ_INIT(&bswlist_kva); } SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_SECOND, vm_pager_init, NULL) @@ -214,15 +228,32 @@ vm_pager_bufferinit(void) panic("Not enough pager_map VM space for physical buffers"); /* - * Initial pbuf setup. + * Initial pbuf setup. These pbufs have KVA reservations. */ bp = swbuf; for (i = 0; i < nswbuf; ++i, ++bp) { bp->b_kvabase = (caddr_t)((intptr_t)i * MAXPHYS) + swapbkva; bp->b_kvasize = MAXPHYS; - TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); buf_dep_init(bp); + TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist); + ++pbuf_kva_count; + } + + /* + * Initial pbuf setup. These pbufs do not have KVA reservations, + * so we can have a lot more of them. These are typically used + * to massage low level buf/bio requests. + */ + nswbuf_raw = nbuf * 2; + swbuf_raw = (void *)kmem_alloc(&kernel_map, + round_page(nswbuf_raw * sizeof(struct buf))); + bp = swbuf_raw; + for (i = 0; i < nswbuf_raw; ++i, ++bp) { + BUF_LOCKINIT(bp); + buf_dep_init(bp); + TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist); + ++pbuf_raw_count; } /* @@ -276,8 +307,8 @@ vm_pager_sync(void) static void initpbuf(struct buf *bp) { - bp->b_qindex = 0; /* BQUEUE_NONE */ - bp->b_data = bp->b_kvabase; + bp->b_qindex = 0; /* BQUEUE_NONE */ + bp->b_data = bp->b_kvabase; /* NULL if pbuf sans kva */ bp->b_flags = B_PAGING; bp->b_cmd = BUF_CMD_DONE; bp->b_error = 0; @@ -303,6 +334,11 @@ initpbuf(struct buf *bp) * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX * + * Physical buffers can be with or without KVA space reserved. There + * are severe limitations on the ones with KVA reserved, and fewer + * limitations on the ones without. getpbuf() gets one without, + * getpbuf_kva() gets one with. + * * No requirements. */ struct buf * @@ -319,13 +355,14 @@ getpbuf(int *pfreecnt) } /* get a bp from the swap buffer header pool */ - if ((bp = TAILQ_FIRST(&bswlist)) != NULL) + if ((bp = TAILQ_FIRST(&bswlist_raw)) != NULL) break; - bswneeded = 1; - ssleep(&bswneeded, &bswspin, 0, "wswbuf1", 0); + bswneeded_raw = 1; + ssleep(&bswneeded_raw, &bswspin, 0, "wswbuf1", 0); /* loop in case someone else grabbed one */ } - TAILQ_REMOVE(&bswlist, bp, b_freelist); + TAILQ_REMOVE(&bswlist_raw, bp, b_freelist); + --pbuf_raw_count; if (pfreecnt) --*pfreecnt; @@ -333,7 +370,41 @@ getpbuf(int *pfreecnt) initpbuf(bp); KKASSERT(dsched_is_clear_buf_priv(bp)); - return bp; + + return (bp); +} + +struct buf * +getpbuf_kva(int *pfreecnt) +{ + struct buf *bp; + + spin_lock_wr(&bswspin); + + for (;;) { + if (pfreecnt) { + while (*pfreecnt == 0) + ssleep(pfreecnt, &bswspin, 0, "wswbuf0", 0); + } + + /* get a bp from the swap buffer header pool */ + if ((bp = TAILQ_FIRST(&bswlist_kva)) != NULL) + break; + bswneeded_kva = 1; + ssleep(&bswneeded_kva, &bswspin, 0, "wswbuf1", 0); + /* loop in case someone else grabbed one */ + } + TAILQ_REMOVE(&bswlist_kva, bp, b_freelist); + --pbuf_kva_count; + if (pfreecnt) + --*pfreecnt; + + spin_unlock_wr(&bswspin); + + initpbuf(bp); + KKASSERT(dsched_is_clear_buf_priv(bp)); + + return (bp); } /* @@ -351,11 +422,34 @@ trypbuf(int *pfreecnt) spin_lock_wr(&bswspin); - if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { + if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_raw)) == NULL) { + spin_unlock_wr(&bswspin); + return NULL; + } + TAILQ_REMOVE(&bswlist_raw, bp, b_freelist); + --pbuf_raw_count; + --*pfreecnt; + + spin_unlock_wr(&bswspin); + + initpbuf(bp); + + return bp; +} + +struct buf * +trypbuf_kva(int *pfreecnt) +{ + struct buf *bp; + + spin_lock_wr(&bswspin); + + if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_kva)) == NULL) { spin_unlock_wr(&bswspin); return NULL; } - TAILQ_REMOVE(&bswlist, bp, b_freelist); + TAILQ_REMOVE(&bswlist_kva, bp, b_freelist); + --pbuf_kva_count; --*pfreecnt; spin_unlock_wr(&bswspin); @@ -376,7 +470,8 @@ trypbuf(int *pfreecnt) void relpbuf(struct buf *bp, int *pfreecnt) { - int wake_bsw = 0; + int wake_bsw_kva = 0; + int wake_bsw_raw = 0; int wake_freecnt = 0; KKASSERT(bp->b_flags & B_PAGING); @@ -385,10 +480,20 @@ relpbuf(struct buf *bp, int *pfreecnt) spin_lock_wr(&bswspin); BUF_UNLOCK(bp); - TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); - if (bswneeded) { - bswneeded = 0; - wake_bsw = 1; + if (bp->b_kvabase) { + TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist); + ++pbuf_kva_count; + } else { + TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist); + ++pbuf_raw_count; + } + if (bswneeded_kva) { + bswneeded_kva = 0; + wake_bsw_kva = 1; + } + if (bswneeded_raw) { + bswneeded_raw = 0; + wake_bsw_raw = 1; } if (pfreecnt) { if (++*pfreecnt == 1) @@ -397,8 +502,10 @@ relpbuf(struct buf *bp, int *pfreecnt) spin_unlock_wr(&bswspin); - if (wake_bsw) - wakeup(&bswneeded); + if (wake_bsw_kva) + wakeup(&bswneeded_kva); + if (wake_bsw_raw) + wakeup(&bswneeded_raw); if (wake_freecnt) wakeup(pfreecnt); }