From: Matthew Dillon Date: Sun, 5 Mar 2006 18:38:39 +0000 (+0000) Subject: Replace the global buffer cache hash table with a per-vnode red-black tree. X-Git-Tag: v2.0.1~5262 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/1f1ea522b5b3ee752ad73fca1c965b6937d2bf05 Replace the global buffer cache hash table with a per-vnode red-black tree. Add a B_HASHED b_flags bit as a sanity check. Remove the invalhash junk and replace with assertions in several cases where the buffer must already not be hashed. Get rid of incore() and gbincore() and replace with a new function called findblk(). Merge the new RB management with bgetvp(), the two are now fully integrated. Previous work has turned reassignbuf() into a mostly degenerate call, simplify its arguments and functionality to match. Remove an unnecessary reassignbuf() call from the NFS code. Get rid of pbreassignbuf(). Adjust the code in several places where it was assumed that calling BUF_LOCK() with LK_SLEEPFAIL after previously failing with LK_NOWAIT would always fail. This code was used to sleep before a retry. Instead, if the second lock unexpectedly succeeds, simply issue an unlock and retry anyway. Testing-by: Stefan Krueger --- diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index c535fb388e..d9fa4bb00d 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -36,7 +36,7 @@ * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ - * $DragonFly: src/sys/i386/i386/Attic/machdep.c,v 1.86 2006/02/17 19:18:06 dillon Exp $ + * $DragonFly: src/sys/i386/i386/Attic/machdep.c,v 1.87 2006/03/05 18:38:32 dillon Exp $ */ #include "use_apm.h" @@ -353,7 +353,6 @@ again: valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); - v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 0ffaeefbf7..261df18255 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -12,7 +12,7 @@ * John S. Dyson. * * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ - * $DragonFly: src/sys/kern/vfs_bio.c,v 1.57 2006/03/02 20:28:49 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_bio.c,v 1.58 2006/03/05 18:38:34 dillon Exp $ */ /* @@ -164,9 +164,6 @@ SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), "sizeof(struct buf)"); -static int bufhashmask; -static int bufhashshift; -static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; @@ -176,51 +173,6 @@ extern int vm_swap_size; #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ -/* - * Buffer hash table code. Note that the logical block scans linearly, which - * gives us some L1 cache locality. - */ - -static __inline -struct bufhashhdr * -bufhash(struct vnode *vnp, daddr_t bn) -{ - u_int64_t hashkey64; - int hashkey; - - /* - * A variation on the Fibonacci hash that Knuth credits to - * R. W. Floyd, see Knuth's _Art of Computer Programming, - * Volume 3 / Sorting and Searching_ - * - * We reduce the argument to 32 bits before doing the hash to - * avoid the need for a slow 64x64 multiply on 32 bit platforms. 
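The multiplicative step being removed here is Fibonacci hashing: fold the key to 32 bits, multiply by a constant derived from the golden ratio (split as the comment describes, with the low 16 bits tuned toward 3/7), and keep the high-order bits as the bucket index. A minimal standalone sketch of the scheme, using the removed code's constant and a hypothetical 1024-bucket table:

    #include <stdint.h>
    #include <stdio.h>

    #define TBL_SHIFT 10                    /* hypothetical: 2^10 buckets */

    /* Fibonacci-style hash: multiply, keep the high bits */
    static unsigned
    fibhash(uint64_t key)
    {
            uint32_t k32 = (uint32_t)(key + (key >> 32));  /* fold to 32 bits */

            return ((k32 * 0x9E376DB1u) >> (32 - TBL_SHIFT));
    }

    int
    main(void)
    {
            /* adjacent keys (e.g. neighboring block numbers) scatter widely */
            for (uint64_t bn = 0; bn < 4; ++bn)
                    printf("%llu -> %u\n", (unsigned long long)bn, fibhash(bn));
            return (0);
    }

The per-vnode red-black trees introduced below make this whole tuning exercise moot: lookups become ordered searches keyed directly on b_lblkno, with no bucket distribution to worry about.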
- * - * sizeof(struct vnode) is 168 on i386, so toss some of the lower - * bits of the vnode address to reduce the key range, which - * improves the distribution of keys across buckets. - * - * The file system cylinder group blocks are very heavily - * used. They are located at invervals of fbg, which is - * on the order of 89 to 94 * 2^10, depending on other - * filesystem parameters, for a 16k block size. Smaller block - * sizes will reduce fpg approximately proportionally. This - * will cause the cylinder group index to be hashed using the - * lower bits of the hash multiplier, which will not distribute - * the keys as uniformly in a classic Fibonacci hash where a - * relatively small number of the upper bits of the result - * are used. Using 2^16 as a close-enough approximation to - * fpg, split the hash multiplier in half, with the upper 16 - * bits being the inverse of the golden ratio, and the lower - * 16 bits being a fraction between 1/3 and 3/7 (closer to - * 3/7 in this case), that gives good experimental results. - */ - hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn; - hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >> - bufhashshift) & bufhashmask; - return(&bufhashtbl[hashkey]); -} - /* * numdirtywakeup: * @@ -383,25 +335,6 @@ bd_speedup(void) bd_wakeup(1); } -/* - * bufhashinit: - * - * Initialize buffer headers and related structures. - */ - -caddr_t -bufhashinit(caddr_t vaddr) -{ - /* first, make a null hash table */ - bufhashshift = 29; - for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) - bufhashshift--; - bufhashtbl = (void *)vaddr; - vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; - --bufhashmask; - return(vaddr); -} - /* * bufinit: * @@ -415,11 +348,6 @@ bufinit(void) vm_offset_t bogus_offset; int i; - LIST_INIT(&invalhash); - - for (i = 0; i <= bufhashmask; i++) - LIST_INIT(&bufhashtbl[i]); - /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); @@ -435,7 +363,6 @@ bufinit(void) LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist); - LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* @@ -923,7 +850,7 @@ bdirty(struct buf *bp) if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; - reassignbuf(bp, bp->b_vp); + reassignbuf(bp); ++numdirtybuffers; bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } @@ -949,7 +876,7 @@ bundirty(struct buf *bp) { if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; - reassignbuf(bp, bp->b_vp); + reassignbuf(bp); --numdirtybuffers; numdirtywakeup(lodirtybuffers); } @@ -1274,6 +1201,7 @@ brelse(struct buf * bp) bp->b_flags |= B_INVAL; bp->b_xflags &= ~BX_BKGRDWRITE; KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); + KKASSERT((bp->b_flags & B_HASHED) == 0); if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { @@ -1282,22 +1210,19 @@ brelse(struct buf * bp) bp->b_qindex = BQUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - LIST_REMOVE(bp, b_hash); - LIST_INSERT_HEAD(&invalhash, bp, b_hash); } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { /* * Buffers with junk contents. Again these buffers had better * already be disassociated from their vnode. 
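With the vnode argument gone, reassignbuf() is purely a notification that B_DELWRI changed state, as the bdirty()/bundirty() hunks above show. The calling convention reduces to a sketch like this (illustrative helper names; assumes a locked buffer already associated with a vnode):

    /*
     * Flip B_DELWRI, then let reassignbuf() migrate the buffer
     * between v_rbclean_tree and v_rbdirty_tree to match.
     */
    static void
    mark_dirty(struct buf *bp)
    {
            if ((bp->b_flags & B_DELWRI) == 0) {
                    bp->b_flags |= B_DELWRI;
                    reassignbuf(bp);        /* clean tree -> dirty tree */
            }
    }

    static void
    mark_clean(struct buf *bp)
    {
            if (bp->b_flags & B_DELWRI) {
                    bp->b_flags &= ~B_DELWRI;
                    reassignbuf(bp);        /* dirty tree -> clean tree */
            }
    }

The real transitions live in bdirty() and bundirty(); everything else in the tree now calls reassignbuf() with no arguments to pass along.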
*/ KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); + KKASSERT((bp->b_flags & B_HASHED) == 0); bp->b_flags |= B_INVAL; bp->b_xflags &= ~BX_BKGRDWRITE; if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = BQUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); - LIST_REMOVE(bp, b_hash); - LIST_INSERT_HEAD(&invalhash, bp, b_hash); } else if (bp->b_flags & B_LOCKED) { /* * Buffers that are locked. @@ -1496,25 +1421,6 @@ vfs_vmio_release(struct buf *bp) brelvp(bp); } -/* - * gbincore: - * - * Check to see if a block is currently memory resident. - */ -struct buf * -gbincore(struct vnode * vp, daddr_t blkno) -{ - struct buf *bp; - struct bufhashhdr *bh; - - bh = bufhash(vp, blkno); - LIST_FOREACH(bp, bh, b_hash) { - if (bp->b_vp == vp && bp->b_lblkno == blkno) - break; - } - return (bp); -} - /* * vfs_bio_awrite: * @@ -1556,7 +1462,7 @@ vfs_bio_awrite(struct buf *bp) maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { - if ((bpa = gbincore(vp, lblkno + i)) && + if ((bpa = findblk(vp, lblkno + i)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && @@ -1570,7 +1476,7 @@ vfs_bio_awrite(struct buf *bp) } } for (j = 1; i + j <= maxcl && j <= lblkno; j++) { - if ((bpa = gbincore(vp, lblkno - j)) && + if ((bpa = findblk(vp, lblkno - j)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && @@ -1785,12 +1691,11 @@ restart: */ KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08lx vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex)); + KKASSERT((bp->b_flags & B_HASHED) == 0); if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 3"); - LIST_REMOVE(bp, b_hash); - LIST_INSERT_HEAD(&invalhash, bp, b_hash); /* * critical section protection is not required when @@ -2054,27 +1959,11 @@ flushbufqueues(void) return (r); } -/* - * incore: - * - * Check to see if a block is currently resident in memory. - */ -struct buf * -incore(struct vnode * vp, daddr_t blkno) -{ - struct buf *bp; - - crit_enter(); - bp = gbincore(vp, blkno); - crit_exit(); - return (bp); -} - /* * inmem: * * Returns true if no I/O is needed to access the associated VM object. - * This is like incore except it also hunts around in the VM system for + * This is like findblk except it also hunts around in the VM system for * the data. * * Note that we ignore vm_page_free() races from interrupts against our @@ -2089,7 +1978,7 @@ inmem(struct vnode * vp, daddr_t blkno) vm_page_t m; vm_ooffset_t off; - if (incore(vp, blkno)) + if (findblk(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; @@ -2207,6 +2096,25 @@ vfs_setdirty(struct buf *bp) } } +/* + * findblk: + * + * Locate and return the specified buffer, or NULL if the buffer does + * not exist. Do not attempt to lock the buffer or manipulate it in + * any way. The caller must validate that the correct buffer has been + * obtain after locking it. + */ +struct buf * +findblk(struct vnode *vp, daddr_t blkno) +{ + struct buf *bp; + + crit_enter(); + bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, blkno); + crit_exit(); + return(bp); +} + /* * getblk: * @@ -2251,10 +2159,9 @@ vfs_setdirty(struct buf *bp) * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
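findblk() returns the buffer unlocked, so any blocking lock attempt opens a window in which the buffer can be recycled for another (vp, blkno). The revalidation idiom getblk() relies on, sketched without the slpflag/slptimeo handling:

    struct buf *bp;

    again:
            if ((bp = findblk(vp, blkno)) != NULL) {
                    if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
                            goto again;     /* or sleep first, then retry */
                    /*
                     * Locked, but the buffer may have changed identity
                     * while we raced for the lock: re-check before use.
                     */
                    if (bp->b_vp != vp || bp->b_lblkno != blkno) {
                            BUF_UNLOCK(bp);
                            goto again;
                    }
            }

This is exactly the contract stated in findblk()'s header comment: the caller, not the lookup, validates the buffer after locking it.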
*/ struct buf * -getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; - struct bufhashhdr *bh; if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); @@ -2266,7 +2173,7 @@ loop: * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to - * move it into the else, when gbincore() fails. At the moment + * move it into the else, when findblk() fails. At the moment * it isn't a problem. * * XXX remove, we cannot afford to block anywhere if holding a vnode @@ -2279,7 +2186,7 @@ loop: tsleep(&needsbuffer, slpflag, "newbuf", slptimeo); } - if ((bp = gbincore(vp, blkno))) { + if ((bp = findblk(vp, blkno))) { /* * The buffer was found in the cache, but we need to lock it. * Even with LK_NOWAIT the lockmgr may break our critical @@ -2458,7 +2365,7 @@ loop: * from the point of the duplicate buffer creation through * to here, and we've locked the buffer. */ - if (gbincore(vp, blkno)) { + if (findblk(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; @@ -2466,18 +2373,16 @@ loop: /* * Insert the buffer into the hash, so that it can - * be found by incore. bgetvp() and bufhash() - * must be synchronized with each other. Make sure the - * translation layer has been cleared. + * be found by findblk(). + * + * Make sure the translation layer has been cleared. */ bp->b_lblkno = blkno; bp->b_loffset = offset; bp->b_bio2.bio_blkno = (daddr_t)-1; + /* bp->b_bio2.bio_next = NULL; */ bgetvp(vp, bp); - LIST_REMOVE(bp, b_hash); - bh = bufhash(vp, blkno); - LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 710d1b9cde..0b7652a881 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -34,7 +34,7 @@ * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $ - * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.16 2006/02/21 18:46:56 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.17 2006/03/05 18:38:34 dillon Exp $ */ #include "opt_debug_cluster.h" @@ -133,14 +133,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, bp->b_flags &= ~B_RAM; /* * We do the crit here so that there is no window - * between the incore and the b_usecount increment + * between the findblk and the b_usecount increment * below. We opt to keep the crit out of the loop * for efficiency. */ crit_enter(); for (i = 1; i < maxra; i++) { - if (!(tbp = incore(vp, lblkno+i))) { + if (!(tbp = findblk(vp, lblkno+i))) { break; } @@ -371,7 +371,7 @@ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, * would block in the lock. The same checks have to * be made again after we officially get the buffer. */ - if ((tbp = incore(vp, lbn + i)) != NULL) { + if ((tbp = findblk(vp, lbn + i)) != NULL) { if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) break; BUF_UNLOCK(tbp); @@ -749,7 +749,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len) * is delayed-write but either locked or inval, it cannot * partake in the clustered write. 
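The cluster code probes residency the same way on both sides of a starting block: walk adjacent logical block numbers with findblk() and stop at the first miss. The shape of the read-ahead scan in cluster_read(), with the per-buffer flag checks elided:

    int i;
    struct buf *tbp;

    crit_enter();           /* no window between lookup and use */
    for (i = 1; i < maxra; ++i) {
            if ((tbp = findblk(vp, lblkno + i)) == NULL)
                    break;  /* first hole ends the resident run */
            /* ... B_RAM/usecount checks on tbp go here ... */
    }
    crit_exit();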
*/ - if (((tbp = gbincore(vp, start_lbn)) == NULL) || + if (((tbp = findblk(vp, start_lbn)) == NULL) || ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { ++start_lbn; @@ -818,7 +818,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len) * If the adjacent data is not even in core it * can't need to be written. */ - if ((tbp = gbincore(vp, start_lbn)) == NULL) { + if ((tbp = findblk(vp, start_lbn)) == NULL) { crit_exit(); break; } @@ -903,7 +903,6 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len) bundirty(tbp); tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR); tbp->b_flags |= B_ASYNC; - reassignbuf(tbp, tbp->b_vp); /* put on clean list */ crit_exit(); BUF_KERNPROC(tbp); cluster_append(&bp->b_bio1, tbp); diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 05870f8d4c..d319bd7466 100644 --- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/vfs_lock.c,v 1.9 2006/03/02 19:07:59 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_lock.c,v 1.10 2006/03/05 18:38:34 dillon Exp $ */ /* @@ -494,6 +494,7 @@ allocvnode(int lktimeout, int lkflags) RB_INIT(&vp->v_rbclean_tree); RB_INIT(&vp->v_rbdirty_tree); + RB_INIT(&vp->v_rbhash_tree); vp->v_type = VNON; vp->v_tag = 0; vp->v_ops = NULL; diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index ee9107eca6..ff25457c9a 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -37,7 +37,7 @@ * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $ - * $DragonFly: src/sys/kern/vfs_subr.c,v 1.69 2006/03/02 20:28:49 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_subr.c,v 1.70 2006/03/05 18:38:34 dillon Exp $ */ /* @@ -136,7 +136,8 @@ extern struct vnodeopv_entry_desc spec_vnodeop_entries[]; * Red black tree functions */ static int rb_buf_compare(struct buf *b1, struct buf *b2); -RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare); +RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno); +RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno); static int rb_buf_compare(struct buf *b1, struct buf *b2) @@ -356,6 +357,8 @@ vinvalbuf(struct vnode *vp, int flags, struct thread *td, if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree)) panic("vinvalbuf: flush failed"); + if (!RB_EMPTY(&vp->v_rbhash_tree)) + panic("vinvalbuf: flush failed, buffers still present"); return (0); } @@ -785,17 +788,22 @@ void bgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI)) == 0); + KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0); vhold(vp); - bp->b_vp = vp; /* * Insert onto list for new vnode. 
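The RB_PROTOTYPE2/RB_GENERATE2 variants used for these trees also emit a direct lookup function keyed on a struct field, so callers such as findblk() need not build a dummy buf just to run the comparator. Roughly, with the comparator from vfs_subr.c and the generated lookup paraphrased:

    /* one comparator serves all three trees: order bufs by logical block */
    static int
    rb_buf_compare(struct buf *b1, struct buf *b2)
    {
            if (b1->b_lblkno < b2->b_lblkno)
                    return (-1);
            if (b1->b_lblkno > b2->b_lblkno)
                    return (1);
            return (0);
    }

    /*
     * RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare,
     *              daddr_t, b_lblkno)
     * in effect also generates:
     */
    struct buf *
    buf_rb_hash_RB_LOOKUP(struct buf_rb_hash *head, daddr_t lblkno)
    {
            struct buf *bp = RB_ROOT(head);

            while (bp != NULL) {
                    if (lblkno < bp->b_lblkno)
                            bp = RB_LEFT(bp, b_rbhash);
                    else if (lblkno > bp->b_lblkno)
                            bp = RB_RIGHT(bp, b_rbhash);
                    else
                            return (bp);    /* exact match */
            }
            return (NULL);
    }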
*/ crit_enter(); + bp->b_vp = vp; + bp->b_flags |= B_HASHED; + if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) + panic("reassignbuf: dup lblk vp %p bp %p", vp, bp); + bp->b_xflags |= BX_VNCLEAN; - bp->b_xflags &= ~BX_VNDIRTY; if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) - panic("reassignbuf: dup lblk vp %p bp %p", vp, bp); + panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp); crit_exit(); } @@ -821,6 +829,10 @@ brelvp(struct buf *bp) buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } + if (bp->b_flags & B_HASHED) { + buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp); + bp->b_flags &= ~B_HASHED; + } if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); @@ -841,6 +853,7 @@ void pbgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + KKASSERT((bp->b_flags & B_HASHED) == 0); bp->b_vp = vp; bp->b_flags |= B_PAGING; @@ -853,37 +866,23 @@ void pbrelvp(struct buf *bp) { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + KKASSERT((bp->b_flags & B_HASHED) == 0); bp->b_vp = NULL; bp->b_flags &= ~B_PAGING; } -void -pbreassignbuf(struct buf *bp, struct vnode *newvp) -{ - if ((bp->b_flags & B_PAGING) == 0) { - panic( - "pbreassignbuf() on non phys bp %p", - bp - ); - } - bp->b_vp = newvp; -} - /* - * Reassign a buffer from one vnode to another. - * Used to assign file specific control information - * (indirect blocks) to the vnode to which they belong. + * Reassign the buffer to the proper clean/dirty list based on B_DELWRI. + * This routine is called when the state of the B_DELWRI bit is changed. */ void -reassignbuf(struct buf *bp, struct vnode *newvp) +reassignbuf(struct buf *bp) { + struct vnode *vp = bp->b_vp; int delay; - if (newvp == NULL) { - printf("reassignbuf: NULL"); - return; - } + KKASSERT(vp != NULL); ++reassignbufcalls; /* @@ -894,34 +893,30 @@ reassignbuf(struct buf *bp, struct vnode *newvp) panic("cannot reassign paging buffer"); crit_enter(); - /* - * Delete from old vnode list, if on one. - */ - if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { - if (bp->b_xflags & BX_VNDIRTY) - buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp); - else - buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp); - bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); - if (bp->b_vp != newvp) { - vdrop(bp->b_vp); - bp->b_vp = NULL; /* for clarification */ - } - } - /* - * If dirty, put on list of dirty buffers; otherwise insert onto list - * of clean buffers. 
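The duplicate-lblk panics scattered through this patch lean on a tree(3) convention: RB_INSERT returns NULL on success and the already-present node on a key collision, which doubles as a free sanity check. In isolation:

    struct buf *dup;

    dup = buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp);
    if (dup != NULL)
            panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);

Under the old hash table a duplicate insertion would simply have chained silently; the tree makes the corruption loud.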
- */ if (bp->b_flags & B_DELWRI) { - if ((newvp->v_flag & VONWORKLST) == 0) { - switch (newvp->v_type) { + /* + * Move to the dirty list, add the vnode to the worklist + */ + if (bp->b_xflags & BX_VNCLEAN) { + buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp); + bp->b_xflags &= ~BX_VNCLEAN; + } + if ((bp->b_xflags & BX_VNDIRTY) == 0) { + if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) { + panic("reassignbuf: dup lblk vp %p bp %p", + vp, bp); + } + bp->b_xflags |= BX_VNDIRTY; + } + if ((vp->v_flag & VONWORKLST) == 0) { + switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: case VBLK: - if (newvp->v_rdev && - newvp->v_rdev->si_mountpoint != NULL) { + if (vp->v_rdev && + vp->v_rdev->si_mountpoint != NULL) { delay = metadelay; break; } @@ -929,24 +924,29 @@ reassignbuf(struct buf *bp, struct vnode *newvp) default: delay = filedelay; } - vn_syncer_add_to_worklist(newvp, delay); + vn_syncer_add_to_worklist(vp, delay); } - bp->b_xflags |= BX_VNDIRTY; - if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp)) - panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp); } else { - bp->b_xflags |= BX_VNCLEAN; - if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp)) - panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp); - if ((newvp->v_flag & VONWORKLST) && - RB_EMPTY(&newvp->v_rbdirty_tree)) { - newvp->v_flag &= ~VONWORKLST; - LIST_REMOVE(newvp, v_synclist); + /* + * Move to the clean list, remove the vnode from the worklist + * if no dirty blocks remain. + */ + if (bp->b_xflags & BX_VNDIRTY) { + buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp); + bp->b_xflags &= ~BX_VNDIRTY; + } + if ((bp->b_xflags & BX_VNCLEAN) == 0) { + if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) { + panic("reassignbuf: dup lblk vp %p bp %p", + vp, bp); + } + bp->b_xflags |= BX_VNCLEAN; + } + if ((vp->v_flag & VONWORKLST) && + RB_EMPTY(&vp->v_rbdirty_tree)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); } - } - if (bp->b_vp != newvp) { - bp->b_vp = newvp; - vhold(bp->b_vp); } crit_exit(); } diff --git a/sys/platform/pc32/i386/machdep.c b/sys/platform/pc32/i386/machdep.c index ae7ea32367..5531088640 100644 --- a/sys/platform/pc32/i386/machdep.c +++ b/sys/platform/pc32/i386/machdep.c @@ -36,7 +36,7 @@ * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ - * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.86 2006/02/17 19:18:06 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/machdep.c,v 1.87 2006/03/05 18:38:32 dillon Exp $ */ #include "use_apm.h" @@ -353,7 +353,6 @@ again: valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); - v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory diff --git a/sys/sys/buf.h b/sys/sys/buf.h index aaa98ed019..7e337c9a55 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -37,7 +37,7 @@ * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD: src/sys/sys/buf.h,v 1.88.2.10 2003/01/25 19:02:23 dillon Exp $ - * $DragonFly: src/sys/sys/buf.h,v 1.23 2006/03/02 19:26:17 dillon Exp $ + * $DragonFly: src/sys/sys/buf.h,v 1.24 2006/03/05 18:38:36 dillon Exp $ */ #ifndef _SYS_BUF_H_ @@ -75,7 +75,9 @@ struct xio; #define NBUF_BIO 4 struct buf_rb_tree; -RB_PROTOTYPE(buf_rb_tree, buf, b_rbnode, rb_buf_compare); +struct buf_rb_hash; +RB_PROTOTYPE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno); +RB_PROTOTYPE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno); /* * To avoid including @@ -138,8 +140,8 @@ extern struct bio_ops { * 
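Taken together, the assertions added in brelse(), getnewbuf(), bgetvp(), pbgetvp() and pbrelvp() pin down a small set of invariants. A hypothetical debugging helper, not part of this commit, just to state them in one place:

    static void
    buf_check_state(struct buf *bp)
    {
            /* an unassociated buffer must not be hashed */
            if (bp->b_vp == NULL)
                    KKASSERT((bp->b_flags & B_HASHED) == 0);

            /* paging buffers bypass the hash and clean/dirty trees */
            if (bp->b_flags & B_PAGING) {
                    KKASSERT((bp->b_flags & B_HASHED) == 0);
                    KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0);
            }

            /* a buffer is on at most one of the clean/dirty trees */
            KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) !=
                     (BX_VNCLEAN|BX_VNDIRTY));
    }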
unrelated to the vnode/device whos strategy routine was called. */ struct buf { - LIST_ENTRY(buf) b_hash; /* Hash chain. */ - RB_ENTRY(buf) b_rbnode; /* Red-Black node in vnode RB tree */ + RB_ENTRY(buf) b_rbnode; /* RB node in vnode clean/dirty tree */ + RB_ENTRY(buf) b_rbhash; /* RB node in vnode hash tree */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_cluster_next; /* Next buffer (cluster code) */ struct vnode *b_vp; /* (vp, lblkno) index */ @@ -229,7 +231,7 @@ struct buf { #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ -#define B_UNUSED40 0x00000040 /* Unused */ +#define B_HASHED 0x00000040 /* Indexed via v_rbhash_tree */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */ #define B_DONE 0x00000200 /* I/O completed. */ @@ -339,7 +341,6 @@ extern int nswbuf; /* Number of swap I/O buffer headers. */ struct uio; -caddr_t bufhashinit (caddr_t); void bufinit (void); void bwillwrite (void); int buf_dirty_count_severe (void); @@ -360,9 +361,8 @@ void brelse (struct buf *); void bqrelse (struct buf *); int vfs_bio_awrite (struct buf *); struct buf *getpbuf (int *); -struct buf *incore (struct vnode *, daddr_t); -struct buf *gbincore (struct vnode *, daddr_t); int inmem (struct vnode *, daddr_t); +struct buf *findblk (struct vnode *, daddr_t); struct buf *getblk (struct vnode *, daddr_t, int, int, int); struct buf *geteblk (int); struct bio *push_bio(struct bio *); @@ -390,8 +390,7 @@ void bgetvp (struct vnode *, struct buf *); void pbgetvp (struct vnode *, struct buf *); void pbrelvp (struct buf *); int allocbuf (struct buf *bp, int size); -void reassignbuf (struct buf *, struct vnode *); -void pbreassignbuf (struct buf *, struct vnode *); +void reassignbuf (struct buf *); struct buf *trypbuf (int *); #endif /* _KERNEL */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 7e417c2668..2b450308de 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -32,7 +32,7 @@ * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD: src/sys/sys/vnode.h,v 1.111.2.19 2002/12/29 18:19:53 dillon Exp $ - * $DragonFly: src/sys/sys/vnode.h,v 1.40 2006/02/17 19:18:07 dillon Exp $ + * $DragonFly: src/sys/sys/vnode.h,v 1.41 2006/03/05 18:38:36 dillon Exp $ */ #ifndef _SYS_VNODE_H_ @@ -155,6 +155,7 @@ vrange_lock_excl(struct vnode *vp, struct vrangelock *vr, * journaling is turned on or off. 
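A buffer can sit in two trees at once because it carries two independent sets of red-black linkage, and each vnode carries a root per index. Stripped to the fields this commit touches:

    struct buf {
            RB_ENTRY(buf)   b_rbnode;   /* linkage: clean OR dirty tree */
            RB_ENTRY(buf)   b_rbhash;   /* linkage: lookup tree         */
            daddr_t         b_lblkno;   /* key for all three trees      */
            /* ... */
    };

    struct vnode {
            struct buf_rb_tree v_rbclean_tree;  /* clean buffers        */
            struct buf_rb_tree v_rbdirty_tree;  /* dirty buffers        */
            struct buf_rb_hash v_rbhash_tree;   /* every associated buf */
            /* ... */
    };

allocvnode() RB_INITs all three roots, and vinvalbuf() now asserts that the hash tree drains along with the clean and dirty trees.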
*/ RB_HEAD(buf_rb_tree, buf); +RB_HEAD(buf_rb_hash, buf); struct vnode { u_long v_flag; /* vnode flags (see below) */ @@ -171,6 +172,7 @@ struct vnode { TAILQ_ENTRY(vnode) v_nmntvnodes; /* vnodes for mount point */ struct buf_rb_tree v_rbclean_tree; /* RB tree of clean bufs */ struct buf_rb_tree v_rbdirty_tree; /* RB tree of dirty bufs */ + struct buf_rb_hash v_rbhash_tree; /* RB tree general lookup */ LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */ enum vtype v_type; /* vnode type */ union { diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index 7d4ebaaa31..75b11b6e17 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -35,7 +35,7 @@ * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $ - * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.26 2006/02/17 19:18:07 dillon Exp $ + * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.27 2006/03/05 18:38:37 dillon Exp $ */ @@ -458,7 +458,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag) for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { rabn = lbn + 1 + nra; - if (!incore(vp, rabn)) { + if (!findblk(vp, rabn)) { rabp = nfs_getcacheblk(vp, rabn, biosize, td); if (!rabp) return (EINTR); @@ -642,7 +642,7 @@ again: (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && !(np->n_flag & NQNFSNONCACHE) && - !incore(vp, lbn + 1)) { + !findblk(vp, lbn + 1)) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { diff --git a/sys/vfs/nfs/nfs_serv.c b/sys/vfs/nfs/nfs_serv.c index 94ae44d370..d30f668a69 100644 --- a/sys/vfs/nfs/nfs_serv.c +++ b/sys/vfs/nfs/nfs_serv.c @@ -35,7 +35,7 @@ * * @(#)nfs_serv.c 8.8 (Berkeley) 7/31/95 * $FreeBSD: src/sys/nfs/nfs_serv.c,v 1.93.2.6 2002/12/29 18:19:53 dillon Exp $ - * $DragonFly: src/sys/vfs/nfs/nfs_serv.c,v 1.27 2006/03/04 17:39:08 dillon Exp $ + * $DragonFly: src/sys/vfs/nfs/nfs_serv.c,v 1.28 2006/03/05 18:38:37 dillon Exp $ */ /* @@ -3542,9 +3542,10 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, * have to lock and write it. Otherwise the prior * write is assumed to have already been committed. */ - if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) { + if ((bp = findblk(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { - BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0) + BUF_UNLOCK(bp); continue; /* retry */ } bremfree(bp); diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index 93ff56937a..52a7ed23d4 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -35,7 +35,7 @@ * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 * $FreeBSD: src/sys/nfs/nfs_vnops.c,v 1.150.2.5 2001/12/20 19:56:28 dillon Exp $ - * $DragonFly: src/sys/vfs/nfs/nfs_vnops.c,v 1.48 2006/03/02 19:26:19 dillon Exp $ + * $DragonFly: src/sys/vfs/nfs/nfs_vnops.c,v 1.49 2006/03/05 18:38:37 dillon Exp $ */ @@ -3297,11 +3297,7 @@ nfs_bwrite(struct vop_bwrite_args *ap) int nfs_writebp(struct buf *bp, int force, struct thread *td) { - int oldflags = bp->b_flags; -#if 0 - int retv = 1; - off_t off; -#endif + int error; if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not locked???"); @@ -3316,7 +3312,6 @@ nfs_writebp(struct buf *bp, int force, struct thread *td) /* * Undirty the bp. We will redirty it later if the I/O fails. 
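The locking adjustment called out in the commit message is visible in the nfsrv_commit() hunk above: a BUF_LOCK with LK_SLEEPFAIL issued after an LK_NOWAIT failure was assumed to always fail, serving only as a sleep. Since the second lock can in fact succeed, the result must be handled. Distilled:

    /*
     * Inside a lookup/retry loop: try-lock first; on contention,
     * sleep via LK_SLEEPFAIL.  If that lock unexpectedly succeeds,
     * the buffer may have changed identity while we slept, so
     * unlock and retry the lookup rather than trusting it.
     */
    if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
            if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
                    BUF_UNLOCK(bp);
            continue;       /* retry from the top of the loop */
    }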
*/ - crit_enter(); bundirty(bp); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); @@ -3327,24 +3322,17 @@ nfs_writebp(struct buf *bp, int force, struct thread *td) * assign b_runningbufspace. */ vfs_busy_pages(bp, 1); - BUF_KERNPROC(bp); - vn_strategy(bp->b_vp, &bp->b_bio1); - - if((oldflags & B_ASYNC) == 0) { - int rtval = biowait(bp); - - if (oldflags & B_DELWRI) { - crit_enter(); - reassignbuf(bp, bp->b_vp); - crit_exit(); - } + if (bp->b_flags & B_ASYNC) { + vn_strategy(bp->b_vp, &bp->b_bio1); + error = 0; + } else { + vn_strategy(bp->b_vp, &bp->b_bio1); + error = biowait(bp); brelse(bp); - return (rtval); } - - return (0); + return (error); } /* diff --git a/sys/vfs/ufs/ffs_softdep.c b/sys/vfs/ufs/ffs_softdep.c index 5f582b50d4..b2ce3b7f52 100644 --- a/sys/vfs/ufs/ffs_softdep.c +++ b/sys/vfs/ufs/ffs_softdep.c @@ -37,7 +37,7 @@ * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.57.2.11 2002/02/05 18:46:53 dillon Exp $ - * $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.34 2006/02/17 19:18:08 dillon Exp $ + * $DragonFly: src/sys/vfs/ufs/ffs_softdep.c,v 1.35 2006/03/05 18:38:39 dillon Exp $ */ /* @@ -2266,8 +2266,11 @@ indir_trunc(ip, dbn, level, lbn, countp) * Otherwise we have to read the blocks in from the disk. */ ACQUIRE_LOCK(&lk); - if ((bp = incore(ip->i_devvp, dbn)) != NULL && + if ((bp = findblk(ip->i_devvp, dbn)) != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { + /* + * bp must be ir_savebp, which is held locked for our use. + */ if (wk->wk_type != D_INDIRDEP || (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) { diff --git a/sys/vfs/ufs/ufs_bmap.c b/sys/vfs/ufs/ufs_bmap.c index 730ce1f44b..2808bbb0fe 100644 --- a/sys/vfs/ufs/ufs_bmap.c +++ b/sys/vfs/ufs/ufs_bmap.c @@ -37,7 +37,7 @@ * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 * $FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.34.2.1 2000/03/17 10:12:14 ps Exp $ - * $DragonFly: src/sys/vfs/ufs/ufs_bmap.c,v 1.7 2006/02/17 19:18:08 dillon Exp $ + * $DragonFly: src/sys/vfs/ufs/ufs_bmap.c,v 1.8 2006/03/05 18:38:39 dillon Exp $ */ #include @@ -165,7 +165,7 @@ ufs_bmaparray(struct vnode *vp, ufs_daddr_t bn, ufs_daddr_t *bnp, */ metalbn = xap->in_lbn; - if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) + if ((daddr == 0 && !findblk(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache
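The remaining conversions, like the ufs_bmaparray() test above, use findblk() purely as a residency predicate, the role incore() used to fill. Because no locking or manipulation is attempted, a bare NULL check suffices; the nfs_bioread() read-ahead hunk is the same pattern:

    /* issue read-ahead only for blocks not already resident */
    if (findblk(vp, rabn) == NULL) {
            rabp = nfs_getcacheblk(vp, rabn, biosize, td);
            /* ... queue the asynchronous read ... */
    }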