From: Matthew Dillon
Date: Mon, 13 Jul 2009 22:57:53 +0000 (-0700)
Subject: BUF/BIO - MPSAFE work on core buffer cache routines.
X-Git-Tag: v2.4.0~461
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/b1c20cfa90fe1b1b6ab1a34b5ea98c33b5336ec9

BUF/BIO - MPSAFE work on core buffer cache routines.

Do initial MPSAFE work on getblk(), bread(), brelse(), bqrelse(),
biowait(), and friends.  Use get_mplock()/rel_mplock() in areas not yet
converted.

Add a flags argument to findblk() and have it return a locked buffer
cache buffer by default.  Callers may request that the buffer be locked
non-blocking, or that it not be locked at all.

Adjust all callers of findblk() that request an unlocked buffer so that
they lock the returned buffer and re-check its parameters, since the
buffer can otherwise change out from under the caller whether or not a
critical section is used (for SMP).
---

diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 97c70ed51a..e78950ccc3 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -256,8 +256,9 @@ runningbufwakeup(struct buf *bp)
  * account for the buffer and to wakeup anyone waiting for free buffers.
  * This typically occurs when large amounts of metadata are being handled
  * by the buffer cache ( else buffer space runs out first, usually ).
+ *
+ * MPSAFE
  */
-
 static __inline void
 bufcountwakeup(void)
 {
@@ -328,6 +329,8 @@ vfs_buf_test_cache(struct buf *bp,
  *
  * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the
  * low water mark.
+ *
+ * MPSAFE
  */
 static __inline__ void
@@ -359,6 +362,8 @@ bd_speedup(void)
  *
  * Get the buf_daemon heated up when the number of running and dirty
  * buffers exceeds the mid-point.
+ *
+ * MPSAFE
  */
 int
 bd_heatup(void)
@@ -387,6 +392,8 @@ bd_heatup(void)
  *
  * Regardless this function blocks while the number of dirty buffers
  * exceeds hidirtybufspace.
+ *
+ * MPSAFE
  */
 void
 bd_wait(int totalspace)
@@ -405,8 +412,13 @@ bd_wait(int totalspace)
 		count = totalspace / BKVASIZE;
 		if (count >= BD_WAKE_SIZE)
 			count = BD_WAKE_SIZE - 1;
+
+		spin_lock_wr(&needsbuffer_spin);
 		i = (bd_wake_index + count) & BD_WAKE_MASK;
 		++bd_wake_ary[i];
+		tsleep_interlock(&bd_wake_ary[i]);
+		spin_unlock_wr(&needsbuffer_spin);
+
 		tsleep(&bd_wake_ary[i], 0, "flstik", hz);
 		crit_exit();
@@ -420,20 +432,30 @@ bd_wait(int totalspace)
  * This function is called whenever runningbufspace or dirtybufspace
  * is reduced.  Track threads waiting for run+dirty buffer I/O
  * complete.
+ *
+ * MPSAFE
  */
 static void
 bd_signal(int totalspace)
 {
 	u_int i;
 
-	while (totalspace > 0) {
-		i = atomic_fetchadd_int(&bd_wake_index, 1);
-		i &= BD_WAKE_MASK;
-		if (bd_wake_ary[i]) {
-			bd_wake_ary[i] = 0;
-			wakeup(&bd_wake_ary[i]);
+	if (totalspace > 0) {
+		if (totalspace > BKVASIZE * BD_WAKE_SIZE)
+			totalspace = BKVASIZE * BD_WAKE_SIZE;
+		spin_lock_wr(&needsbuffer_spin);
+		while (totalspace > 0) {
+			i = bd_wake_index++;
+			i &= BD_WAKE_MASK;
+			if (bd_wake_ary[i]) {
+				bd_wake_ary[i] = 0;
+				spin_unlock_wr(&needsbuffer_spin);
+				wakeup(&bd_wake_ary[i]);
+				spin_lock_wr(&needsbuffer_spin);
+			}
+			totalspace -= BKVASIZE;
 		}
-		totalspace -= BKVASIZE;
+		spin_unlock_wr(&needsbuffer_spin);
 	}
 }
@@ -698,6 +720,8 @@ clearbiocache(struct bio *bio)
  * buffer_map.
  *
  * Since this call frees up buffer space, we call bufspacewakeup().
+ * + * MPALMOSTSAFE */ static void bfreekva(struct buf *bp) @@ -705,6 +729,7 @@ bfreekva(struct buf *bp) int count; if (bp->b_kvasize) { + get_mplock(); ++buffreekvacnt; count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&buffer_map); @@ -718,6 +743,7 @@ bfreekva(struct buf *bp) vm_map_entry_release(count); bp->b_kvasize = 0; bufspacewakeup(); + rel_mplock(); } } @@ -748,7 +774,7 @@ bremfree(struct buf *bp) spin_unlock_wr(&bufspin); } -void +static void bremfree_locked(struct buf *bp) { _bremfree(bp); @@ -794,6 +820,8 @@ bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp) * read-ahead blocks. We must clear B_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. + * + * MPALMOSTSAFE */ int breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, @@ -807,11 +835,13 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { + get_mplock(); bp->b_flags &= ~(B_ERROR | B_INVAL); bp->b_cmd = BUF_CMD_READ; vfs_busy_pages(vp, bp); vn_strategy(vp, &bp->b_bio1); ++readwait; + rel_mplock(); } for (i = 0; i < cnt; i++, raoffset++, rabsize++) { @@ -820,20 +850,20 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, rabp = getblk(vp, *raoffset, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { + rel_mplock(); rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~(B_ERROR | B_INVAL); rabp->b_cmd = BUF_CMD_READ; vfs_busy_pages(vp, rabp); BUF_KERNPROC(rabp); vn_strategy(vp, &rabp->b_bio1); + rel_mplock(); } else { brelse(rabp); } } - - if (readwait) { + if (readwait) rv = biowait(bp); - } return (rv); } @@ -989,11 +1019,11 @@ bdirty(struct buf *bp) if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DELWRI; reassignbuf(bp); - ++dirtybufcount; + atomic_add_int(&dirtybufcount, 1); dirtybufspace += bp->b_bufsize; if (bp->b_flags & B_HEAVY) { - ++dirtybufcounthw; - dirtybufspacehw += bp->b_bufsize; + atomic_add_int(&dirtybufcounthw, 1); + atomic_add_int(&dirtybufspacehw, bp->b_bufsize); } bd_heatup(); } @@ -1010,8 +1040,8 @@ bheavy(struct buf *bp) if ((bp->b_flags & B_HEAVY) == 0) { bp->b_flags |= B_HEAVY; if (bp->b_flags & B_DELWRI) { - ++dirtybufcounthw; - dirtybufspacehw += bp->b_bufsize; + atomic_add_int(&dirtybufcounthw, 1); + atomic_add_int(&dirtybufspacehw, bp->b_bufsize); } } } @@ -1026,19 +1056,20 @@ bheavy(struct buf *bp) * The buffer is typically on BQUEUE_NONE but there is one case in * brelse() that calls this function after placing the buffer on * a different queue. + * + * MPSAFE */ - void bundirty(struct buf *bp) { if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); - --dirtybufcount; - dirtybufspace -= bp->b_bufsize; + atomic_subtract_int(&dirtybufcount, 1); + atomic_subtract_int(&dirtybufspace, bp->b_bufsize); if (bp->b_flags & B_HEAVY) { - --dirtybufcounthw; - dirtybufspacehw -= bp->b_bufsize; + atomic_subtract_int(&dirtybufcounthw, 1); + atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); } bd_signal(bp->b_bufsize); } @@ -1097,6 +1128,8 @@ buf_dirty_count_severe(void) * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. 
+ * + * MPALMOSTSAFE */ void brelse(struct buf *bp) @@ -1139,14 +1172,16 @@ brelse(struct buf *bp) */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL) { + get_mplock(); buf_deallocate(bp); + rel_mplock(); } if (bp->b_flags & B_DELWRI) { - --dirtybufcount; - dirtybufspace -= bp->b_bufsize; + atomic_subtract_int(&dirtybufcount, 1); + atomic_subtract_int(&dirtybufspace, bp->b_bufsize); if (bp->b_flags & B_HEAVY) { - --dirtybufcounthw; - dirtybufspacehw -= bp->b_bufsize; + atomic_subtract_int(&dirtybufcounthw, 1); + atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); } bd_signal(bp->b_bufsize); } @@ -1174,8 +1209,11 @@ brelse(struct buf *bp) if (bp->b_flags & (B_DELWRI | B_LOCKED)) { bp->b_flags &= ~B_RELBUF; } else if (vm_page_count_severe()) { - if (LIST_FIRST(&bp->b_dep) != NULL) + if (LIST_FIRST(&bp->b_dep) != NULL) { + get_mplock(); buf_deallocate(bp); /* can set B_LOCKED */ + rel_mplock(); + } if (bp->b_flags & (B_DELWRI | B_LOCKED)) bp->b_flags &= ~B_RELBUF; else @@ -1237,6 +1275,7 @@ brelse(struct buf *bp) resid = bp->b_bufsize; foff = bp->b_loffset; + get_mplock(); for (i = 0; i < bp->b_xio.xio_npages; i++) { m = bp->b_xio.xio_pages[i]; vm_page_flag_clear(m, PG_ZERO); @@ -1314,20 +1353,19 @@ brelse(struct buf *bp) } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); + rel_mplock(); } else { /* * Rundown for non-VMIO buffers. */ if (bp->b_flags & (B_INVAL | B_RELBUF)) { -#if 0 - if (bp->b_vp) - kprintf("brelse bp %p %08x/%08x: Warning, caught and fixed brelvp bug\n", bp, saved_flags, bp->b_flags); -#endif + get_mplock(); if (bp->b_bufsize) allocbuf(bp, 0); KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); if (bp->b_vp) brelvp(bp); + rel_mplock(); } } @@ -1337,8 +1375,7 @@ brelse(struct buf *bp) /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("brelse: multiple refs"); - /* do not release to free list */ - BUF_UNLOCK(bp); + /* NOT REACHED */ return; } @@ -1450,6 +1487,8 @@ brelse(struct buf *bp) * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. + * + * MPSAFE */ void bqrelse(struct buf *bp) @@ -1576,8 +1615,11 @@ vfs_vmio_release(struct buf *bp) bp->b_xio.xio_npages = 0; bp->b_flags &= ~B_VMIO; KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); - if (bp->b_vp) + if (bp->b_vp) { + get_mplock(); brelvp(bp); + rel_mplock(); + } } /* @@ -1602,7 +1644,6 @@ vfs_bio_awrite(struct buf *bp) int nwritten; int size; - crit_enter(); /* * right now we support clustered writing only to regular files. 
If * we find a clusterable block we could be in the middle of a cluster @@ -1618,7 +1659,7 @@ vfs_bio_awrite(struct buf *bp) size = vp->v_mount->mnt_stat.f_iosize; for (i = size; i < MAXPHYS; i += size) { - if ((bpa = findblk(vp, loffset + i)) && + if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && @@ -1632,7 +1673,7 @@ vfs_bio_awrite(struct buf *bp) } } for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) { - if ((bpa = findblk(vp, loffset - j)) && + if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && @@ -1647,6 +1688,7 @@ vfs_bio_awrite(struct buf *bp) } j -= size; nbytes = (i + j); + /* * this is a possible cluster write */ @@ -1654,7 +1696,6 @@ vfs_bio_awrite(struct buf *bp) BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, loffset - j, nbytes); - crit_exit(); return nwritten; } } @@ -1662,7 +1703,6 @@ vfs_bio_awrite(struct buf *bp) bremfree(bp); bp->b_flags |= B_ASYNC; - crit_exit(); /* * default (old) behavior, writing out only one block * @@ -1692,6 +1732,8 @@ vfs_bio_awrite(struct buf *bp) * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. + * + * MPALMOSTSAFE */ static struct buf * getnewbuf(int blkflags, int slptimeo, int size, int maxsize) @@ -1860,7 +1902,9 @@ restart: * NOTE: bufspin is UNLOCKED now. */ if (LIST_FIRST(&bp->b_dep) != NULL) { + get_mplock(); buf_deallocate(bp); + rel_mplock(); if (bp->b_flags & B_LOCKED) { bqrelse(bp); goto restart; @@ -1869,12 +1913,16 @@ restart: } if (qindex == BQUEUE_CLEAN) { + get_mplock(); if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; + get_mplock(); vfs_vmio_release(bp); + rel_mplock(); } if (bp->b_vp) brelvp(bp); + rel_mplock(); } /* @@ -1893,8 +1941,11 @@ restart: * scrapping a buffer's contents because it is already * wired. */ - if (bp->b_bufsize) + if (bp->b_bufsize) { + get_mplock(); allocbuf(bp, 0); + rel_mplock(); + } bp->b_flags = B_BNOCLIP; bp->b_cmd = BUF_CMD_DONE; @@ -1987,6 +2038,7 @@ restart: bfreekva(bp); + get_mplock(); count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&buffer_map); @@ -2002,6 +2054,7 @@ restart: ++bufdefragcnt; defrag = 1; bp->b_flags |= B_INVAL; + rel_mplock(); brelse(bp); goto restart; } @@ -2020,6 +2073,7 @@ restart: } vm_map_unlock(&buffer_map); vm_map_entry_release(count); + rel_mplock(); } bp->b_data = bp->b_kvabase; } @@ -2103,6 +2157,7 @@ recoverbufpages(void) bytes += bp->b_bufsize; + get_mplock(); if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; bp->b_flags |= B_DIRECT; /* try to free pages */ @@ -2121,6 +2176,7 @@ recoverbufpages(void) */ if (bp->b_bufsize) allocbuf(bp, 0); + rel_mplock(); bp->b_flags = B_BNOCLIP; bp->b_cmd = BUF_CMD_DONE; @@ -2380,7 +2436,7 @@ inmem(struct vnode *vp, off_t loffset) vm_offset_t toff, tinc, size; vm_page_t m; - if (findblk(vp, loffset)) + if (findblk(vp, loffset, FINDBLK_TEST)) return 1; if (vp->v_mount == NULL) return 0; @@ -2500,23 +2556,44 @@ vfs_setdirty(struct buf *bp) /* * findblk: * - * Locate and return the specified buffer, or NULL if the buffer does - * not exist. Do not attempt to lock the buffer or manipulate it in - * any way. The caller must validate that the correct buffer has been - * obtain after locking it. + * Locate and return the specified buffer. 
Unless flagged otherwise, + * a locked buffer will be returned if it exists or NULL if it does not. * + * FINDBLK_TEST - Do not lock the buffer. The caller is responsible + * for locking the buffer and ensuring that it remains + * the desired buffer after locking. * + * FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable + * to acquire the lock we return NULL, even if the + * buffer exists. + * + * MPSAFE */ struct buf * -findblk(struct vnode *vp, off_t loffset) +findblk(struct vnode *vp, off_t loffset, int flags) { lwkt_tokref vlock; struct buf *bp; + int lkflags; + + lkflags = LK_EXCLUSIVE; + if (flags & FINDBLK_NBLOCK) + lkflags |= LK_NOWAIT; - lwkt_gettoken(&vlock, &vp->v_token); -/* ASSERT_LWKT_TOKEN_HELD(&vp->v_token);*/ - bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); - lwkt_reltoken(&vlock); + for (;;) { + lwkt_gettoken(&vlock, &vp->v_token); + bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); + lwkt_reltoken(&vlock); + if (bp == NULL || (flags & FINDBLK_TEST)) + break; + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + bp = NULL; + break; + } + if (bp->b_vp == vp && bp->b_loffset == loffset) + break; + BUF_UNLOCK(bp); + } return(bp); } @@ -2565,6 +2642,8 @@ findblk(struct vnode *vp, off_t loffset) * * GETBLK_PCATCH - catch signal if blocked, can cause NULL return * GETBLK_BHEAVY - heavy-weight buffer cache buffer + * + * MPALMOSTSAFE */ struct buf * getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) @@ -2572,15 +2651,15 @@ getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) struct buf *bp; int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; int error; + int lkflags; if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); if (vp->v_object == NULL) panic("getblk: vnode %p has no object!", vp); - crit_enter(); loop: - if ((bp = findblk(vp, loffset))) { + if ((bp = findblk(vp, loffset, FINDBLK_TEST)) != NULL) { /* * The buffer was found in the cache, but we need to lock it. * Even with LK_NOWAIT the lockmgr may break our critical @@ -2588,20 +2667,18 @@ loop: * once the lock has been obtained. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { - if (blkflags & GETBLK_NOWAIT) { - crit_exit(); + if (blkflags & GETBLK_NOWAIT) return(NULL); - } - int lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; + lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; if (blkflags & GETBLK_PCATCH) lkflags |= LK_PCATCH; error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); if (error) { if (error == ENOLCK) goto loop; - crit_exit(); return (NULL); } + /* buffer may have changed on us */ } /* @@ -2625,7 +2702,6 @@ loop: */ if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) { BUF_UNLOCK(bp); - crit_exit(); return(NULL); } @@ -2661,6 +2737,7 @@ loop: * the buffer in such circumstances can lead to problems. */ if (size != bp->b_bcount) { + get_mplock(); if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); @@ -2671,6 +2748,7 @@ loop: bp->b_flags |= B_RELBUF; brelse(bp); } + rel_mplock(); goto loop; } KKASSERT(size <= bp->b_kvasize); @@ -2706,11 +2784,12 @@ loop: */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + get_mplock(); bp->b_flags |= B_NOCACHE; bwrite(bp); + rel_mplock(); goto loop; } - crit_exit(); } else { /* * Buffer is not in-core, create new buffer. 
The buffer @@ -2744,33 +2823,19 @@ loop: maxsize = size + (loffset & PAGE_MASK); maxsize = imax(maxsize, bsize); - if ((bp = getnewbuf(blkflags, slptimeo, size, maxsize)) == NULL) { - if (slpflags || slptimeo) { - crit_exit(); + bp = getnewbuf(blkflags, slptimeo, size, maxsize); + if (bp == NULL) { + if (slpflags || slptimeo) return NULL; - } goto loop; } /* - * This code is used to make sure that a buffer is not - * created while the getnewbuf routine is blocked. - * This can be a problem whether the vnode is locked or not. - * If the buffer is created out from under us, we have to - * throw away the one we just created. There is no window - * race because we are safely running in a critical section - * from the point of the duplicate buffer creation through - * to here, and we've locked the buffer. - */ - if (findblk(vp, loffset)) { - bp->b_flags |= B_INVAL; - brelse(bp); - goto loop; - } - - /* - * Insert the buffer into the hash, so that it can - * be found by findblk(). + * Atomically insert the buffer into the hash, so that it can + * be found by findblk(). + * + * If bgetvp() returns non-zero a collision occured, and the + * bp will not be associated with the vnode. * * Make sure the translation layer has been cleared. */ @@ -2778,7 +2843,11 @@ loop: bp->b_bio2.bio_offset = NOOFFSET; /* bp->b_bio2.bio_next = NULL; */ - bgetvp(vp, bp); + if (bgetvp(vp, bp)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } /* * All vnode-based buffers must be backed by a VM object. @@ -2787,9 +2856,9 @@ loop: bp->b_flags |= B_VMIO; KKASSERT(bp->b_cmd == BUF_CMD_DONE); + get_mplock(); allocbuf(bp, size); - - crit_exit(); + rel_mplock(); } return (bp); } @@ -2803,15 +2872,15 @@ loop: * * To this end, either B_LOCKED must be set or the dependancy list must be * non-empty. + * + * MPSAFE */ void regetblk(struct buf *bp) { KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL); BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY); - crit_enter(); bremfree(bp); - crit_exit(); } /* @@ -2822,6 +2891,8 @@ regetblk(struct buf *bp) * * critical section protection is not required for the allocbuf() * call because races are impossible here. + * + * MPALMOSTSAFE */ struct buf * geteblk(int size) @@ -2831,11 +2902,11 @@ geteblk(int size) maxsize = (size + BKVAMASK) & ~BKVAMASK; - crit_enter(); while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) ; - crit_exit(); + get_mplock(); allocbuf(bp, size); + rel_mplock(); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } @@ -2859,6 +2930,8 @@ geteblk(int size) * * This routine does not need to be called from a critical section but you * must own the buffer. + * + * NOTMPSAFE */ int allocbuf(struct buf *bp, int size) @@ -3208,7 +3281,6 @@ vn_strategy(struct vnode *vp, struct bio *bio) vop_strategy(*vp->v_ops, vp, bio); } - /* * biodone: * diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index b1f43c4c56..c0c11d8b48 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -132,30 +132,25 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, } else { struct buf *tbp; bp->b_flags &= ~B_RAM; + /* - * We do the crit here so that there is no window - * between the findblk and the b_usecount increment - * below. We opt to keep the crit out of the loop - * for efficiency. + * Set read-ahead-mark only if we can passively lock + * the buffer. Note that with these flags the bp + * could very exist even though NULL is returned. 
*/ - crit_enter(); for (i = 1; i < maxra; i++) { - if (!(tbp = findblk(vp, loffset + i * blksize))) { + tbp = findblk(vp, loffset + i * blksize, + FINDBLK_NBLOCK); + if (tbp == NULL) break; - } - - /* - * Set another read-ahead mark so we know - * to check again. - */ if (((i % racluster) == (racluster - 1)) || - (i == (maxra - 1))) + (i == (maxra - 1))) { tbp->b_flags |= B_RAM; + } + BUF_UNLOCK(tbp); } - crit_exit(); - if (i >= maxra) { + if (i >= maxra) return 0; - } loffset += i * blksize; } reqbp = bp = NULL; @@ -758,24 +753,23 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) int maxiosize = vmaxiosize(vp); while (bytes > 0) { - crit_enter(); /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ - if (((tbp = findblk(vp, start_loffset)) == NULL) || - ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || - (LIST_FIRST(&tbp->b_dep) != NULL && buf_checkwrite(tbp)) || - BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { + tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK); + if (tbp == NULL || + (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI || + (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) { + if (tbp) + BUF_UNLOCK(tbp); start_loffset += blksize; bytes -= blksize; - crit_exit(); continue; } bremfree(tbp); KKASSERT(tbp->b_cmd == BUF_CMD_DONE); - crit_exit(); /* * Extra memory in the buffer, punt on this buffer. @@ -786,10 +780,10 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || - (tbp->b_bcount != tbp->b_bufsize) || - (tbp->b_bcount != blksize) || - (bytes == blksize) || - ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != blksize) || + (bytes == blksize) || + ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); start_loffset += blksize; @@ -823,6 +817,7 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) bp->b_bio1.bio_done = cluster_callback; bp->b_bio1.bio_caller_info1.cluster_head = NULL; bp->b_bio1.bio_caller_info2.cluster_tail = NULL; + /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to @@ -830,31 +825,29 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) */ for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) { if (i != 0) { /* If not the first buffer */ - crit_enter(); + tbp = findblk(vp, start_loffset, + FINDBLK_NBLOCK); /* - * If the adjacent data is not even in core it - * can't need to be written. + * Buffer not found or could not be locked + * non-blocking. */ - if ((tbp = findblk(vp, start_loffset)) == NULL) { - crit_exit(); + if (tbp == NULL) break; - } /* * If it IS in core, but has different - * characteristics, or is locked (which - * means it could be undergoing a background - * I/O or be in a weird state), then don't - * cluster with it. + * characteristics, then don't cluster + * with it. 
*/ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | - B_INVAL | B_DELWRI | B_NEEDCOMMIT)) - != (B_DELWRI | B_CLUSTEROK | - (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || + B_INVAL | B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || (tbp->b_flags & B_LOCKED) || - (LIST_FIRST(&tbp->b_dep) != NULL && buf_checkwrite(tbp)) || - BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { - crit_exit(); + (LIST_FIRST(&tbp->b_dep) && + buf_checkwrite(tbp)) + ) { + BUF_UNLOCK(tbp); break; } @@ -869,7 +862,6 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) > (maxiosize / PAGE_SIZE))) { BUF_UNLOCK(tbp); - crit_exit(); break; } /* @@ -879,7 +871,6 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) */ bremfree(tbp); KKASSERT(tbp->b_cmd == BUF_CMD_DONE); - crit_exit(); } /* end of code for non-first buffers only */ /* @@ -917,12 +908,10 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) bp->b_bcount += blksize; bp->b_bufsize += blksize; - crit_enter(); bundirty(tbp); tbp->b_flags &= ~B_ERROR; tbp->b_flags |= B_ASYNC; tbp->b_cmd = BUF_CMD_WRITE; - crit_exit(); BUF_KERNPROC(tbp); cluster_append(&bp->b_bio1, tbp); diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 00a5d50444..221ce768ad 100644 --- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -182,6 +182,8 @@ vrele(struct vnode *vp) * * An auxiliary reference DOES NOT move a vnode out of the VFREE state * once it has entered it. + * + * MPSAFE */ void vhold(struct vnode *vp) diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index b063ead413..de7d0f1b02 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -822,8 +822,10 @@ vfsync_bp(struct buf *bp, void *data) /* * Associate a buffer with a vnode. + * + * MPSAFE */ -void +int bgetvp(struct vnode *vp, struct buf *bp) { lwkt_tokref vlock; @@ -831,24 +833,22 @@ bgetvp(struct vnode *vp, struct buf *bp) KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0); - /* - * vp is held for each bp associated with it. - */ - vhold(vp); - /* * Insert onto list for new vnode. */ lwkt_gettoken(&vlock, &vp->v_token); + if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) { + lwkt_reltoken(&vlock); + return (EEXIST); + } bp->b_vp = vp; bp->b_flags |= B_HASHED; - if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) - panic("reassignbuf: dup lblk vp %p bp %p", vp, bp); - bp->b_flags |= B_VNCLEAN; if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp); + vhold(vp); lwkt_reltoken(&vlock); + return(0); } /* @@ -891,6 +891,8 @@ brelvp(struct buf *bp) /* * Reassign the buffer to the proper clean/dirty list based on B_DELWRI. * This routine is called when the state of the B_DELWRI bit is changed. + * + * MPSAFE */ void reassignbuf(struct buf *bp) diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 233a2f1ec5..3650dd212c 100644 --- a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -151,6 +151,8 @@ vfs_sync_init(void) /* * Add an item to the syncer work queue. 
+ *
+ * MPSAFE
  */
 void
 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 9d2fd2ddc4..d97f7e2a96 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -207,6 +207,9 @@ struct buf {
 #define GETBLK_SZMATCH	0x0004	/* pre-existing buffer must match */
 #define GETBLK_NOWAIT	0x0008	/* non-blocking */
 
+#define FINDBLK_TEST	0x0010	/* test only, do not lock */
+#define FINDBLK_NBLOCK	0x0020	/* use non-blocking lock, can return NULL */
+
 /*
  * These flags are kept in b_flags.
  *
@@ -403,7 +406,7 @@ void	bqrelse (struct buf *);
 int	vfs_bio_awrite (struct buf *);
 struct buf *getpbuf (int *);
 int	inmem (struct vnode *, off_t);
-struct buf *findblk (struct vnode *, off_t);
+struct buf *findblk (struct vnode *, off_t, int);
 struct buf *getblk (struct vnode *, off_t, int, int, int);
 struct buf *geteblk (int);
 void	regetblk(struct buf *bp);
@@ -427,7 +430,7 @@ int	vmapbuf (struct buf *, caddr_t, int);
 void	vunmapbuf (struct buf *);
 void	relpbuf (struct buf *, int *);
 void	brelvp (struct buf *);
-void	bgetvp (struct vnode *, struct buf *);
+int	bgetvp (struct vnode *, struct buf *);
 int	allocbuf (struct buf *bp, int size);
 int	scan_all_buffers (int (*)(struct buf *, void *), void *);
 void	reassignbuf (struct buf *);
diff --git a/sys/vfs/gnu/ext2fs/ext2_bmap.c b/sys/vfs/gnu/ext2fs/ext2_bmap.c
index 5ac1ab30b1..228beaf5e9 100644
--- a/sys/vfs/gnu/ext2fs/ext2_bmap.c
+++ b/sys/vfs/gnu/ext2fs/ext2_bmap.c
@@ -195,8 +195,11 @@ ext2_bmaparray(struct vnode *vp, ext2_daddr_t bn, ext2_daddr_t *bnp,
 	 */
 	metalbn = xap->in_lbn;
-	if ((daddr == 0 && !findblk(vp, dbtodoff(fs, metalbn))) || metalbn == bn)
+	if ((daddr == 0 &&
+	     !findblk(vp, dbtodoff(fs, metalbn), FINDBLK_TEST)) ||
+	    metalbn == bn) {
 		break;
+	}
 	/*
 	 * If we get here, we've either got the block in the cache
 	 * or we have a disk address for it, go fetch it.
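
The sketch below is illustrative only and is not part of the diff.  It shows
the FINDBLK_TEST idiom the hunks above rely on: probe the buffer cache
without locking the buffer and treat the result purely as an existence hint,
the way inmem() and the bmap/read-ahead checks do.  The helper name and the
include list are assumptions made for the example.

/*
 * Illustrative sketch only; buf_is_cached() is a hypothetical helper,
 * not a function in the tree.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>

static __inline int
buf_is_cached(struct vnode *vp, off_t loffset)
{
	/*
	 * FINDBLK_TEST returns the buffer unlocked (or NULL).  The result
	 * is only a hint, since the buffer can be reused the moment the
	 * vnode token is released, so treat it as a boolean and do not
	 * dereference the pointer without locking and re-validating it.
	 */
	return (findblk(vp, loffset, FINDBLK_TEST) != NULL);
}
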
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 8b40c76a6f..62b13bc486 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -274,7 +274,7 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) phys_offset = volume->ondisk->vol_buf_beg + (zone2_offset & HAMMER_OFF_SHORT_MASK); crit_enter(); - if ((bp = findblk(volume->devvp, phys_offset)) != NULL) + if ((bp = findblk(volume->devvp, phys_offset, FINDBLK_TEST)) != NULL) bp = getblk(volume->devvp, phys_offset, bp->b_bufsize, 0, 0); else bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0); @@ -1405,7 +1405,7 @@ hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data) hammer_ref(&ip->lock); if (hammer_get_vnode(ip, &vp) == 0) { - if ((bp = findblk(ip->vp, file_offset)) != NULL && + if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL && bp->b_bio2.bio_offset != NOOFFSET) { bp = getblk(ip->vp, file_offset, blksize, 0, 0); bp->b_bio2.bio_offset = NOOFFSET; diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index f6a501a627..7256e04ee5 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -429,7 +429,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag) (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { rabn = lbn + 1 + nra; raoffset = (off_t)rabn * biosize; - if (!findblk(vp, raoffset)) { + if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) { rabp = nfs_getcacheblk(vp, raoffset, biosize, td); if (!rabp) return (EINTR); @@ -618,7 +618,8 @@ again: (np->n_direofoffset == 0 || loffset + NFS_DIRBLKSIZ < np->n_direofoffset) && (np->n_flag & NDONTCACHE) == 0 && - !findblk(vp, loffset + NFS_DIRBLKSIZ)) { + findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL + ) { rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ, NFS_DIRBLKSIZ, td); if (rabp) { diff --git a/sys/vfs/nfs/nfs_serv.c b/sys/vfs/nfs/nfs_serv.c index 06121293d7..3d22d2975a 100644 --- a/sys/vfs/nfs/nfs_serv.c +++ b/sys/vfs/nfs/nfs_serv.c @@ -3481,12 +3481,13 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, * have to lock and write it. Otherwise the prior * write is assumed to have already been committed. 
*/ - if ((bp = findblk(vp, loffset)) != NULL && (bp->b_flags & B_DELWRI)) { - if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { - if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0) - BUF_UNLOCK(bp); - continue; /* retry */ - } + if ((bp = findblk(vp, loffset, FINDBLK_TEST)) != NULL) { + if (bp->b_flags & B_DELWRI) + bp = findblk(vp, loffset, 0); + else + bp = NULL; + } + if (bp && (bp->b_flags & B_DELWRI)) { bremfree(bp); bp->b_flags &= ~B_ASYNC; bwrite(bp); diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index 31b08f9f68..34ad5e9598 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -3068,42 +3068,44 @@ int nfs_flush_bp(struct buf *bp, void *data) { struct nfs_flush_info *info = data; - off_t toff; + int lkflags; int error; + off_t toff; error = 0; switch(info->mode) { case NFI_FLUSHNEW: - crit_enter(); - if (info->loops && info->waitfor == MNT_WAIT) { + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT); + if (error && info->loops && info->waitfor == MNT_WAIT) { error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT); if (error) { - int lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; + lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; if (info->slpflag & PCATCH) lkflags |= LK_PCATCH; error = BUF_TIMELOCK(bp, lkflags, "nfsfsync", info->slptimeo); } - } else { - error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT); } - if (error == 0) { - KKASSERT(bp->b_vp == info->vp); - - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfs_fsync: not dirty"); - if (bp->b_flags & B_NEEDCOMMIT) { - BUF_UNLOCK(bp); - crit_exit(); - break; - } - bremfree(bp); - crit_exit(); + /* + * Ignore locking errors + */ + if (error) { + error = 0; + break; + } + + /* + * The buffer may have changed out from under us, even if + * we did not block (MPSAFE). Check again now that it is + * locked. + */ + if (bp->b_vp == info->vp && + (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == B_DELWRI) { + bremfree(bp); bawrite(bp); } else { - crit_exit(); - error = 0; + BUF_UNLOCK(bp); } break; case NFI_COMMIT: @@ -3113,16 +3115,22 @@ nfs_flush_bp(struct buf *bp, void *data) * committed, but the normal flush loop will block on the * same buffer so we shouldn't get into an endless loop. */ - crit_enter(); if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != - (B_DELWRI | B_NEEDCOMMIT) || - BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { - crit_exit(); + (B_DELWRI | B_NEEDCOMMIT)) { break; } + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) + break; - KKASSERT(bp->b_vp == info->vp); - bremfree(bp); + /* + * We must recheck after successfully locking the buffer. + */ + if (bp->b_vp != info->vp || + (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != + (B_DELWRI | B_NEEDCOMMIT)) { + BUF_UNLOCK(bp); + break; + } /* * NOTE: storing the bp in the bvary[] basically sets @@ -3135,6 +3143,7 @@ nfs_flush_bp(struct buf *bp, void *data) * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ + bremfree(bp); bp->b_cmd = BUF_CMD_WRITE; vfs_busy_pages(bp->b_vp, bp); info->bvary[info->bvsize] = bp; @@ -3149,7 +3158,6 @@ nfs_flush_bp(struct buf *bp, void *data) error = nfs_flush_docommit(info, 0); KKASSERT(info->bvsize == 0); } - crit_exit(); } return (error); } @@ -3179,7 +3187,7 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error) retv = -error; } else { retv = nfs_commit(vp, info->beg_off, - (int)bytes, info->td); + (int)bytes, info->td); if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); } @@ -3211,12 +3219,10 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error) * start the transaction in order to * immediately biodone() it. 
 			 */
-			crit_enter();
 			bp->b_flags |= B_ASYNC;
 			bundirty(bp);
 			bp->b_flags &= ~B_ERROR;
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
-			crit_exit();
 			biodone(&bp->b_bio1);
 		}
 	}
diff --git a/sys/vfs/ufs/ffs_softdep.c b/sys/vfs/ufs/ffs_softdep.c
index e0af38e90c..370819b7b1 100644
--- a/sys/vfs/ufs/ffs_softdep.c
+++ b/sys/vfs/ufs/ffs_softdep.c
@@ -2314,7 +2314,7 @@ indir_trunc(struct inode *ip, off_t doffset, int level, ufs_lbn_t lbn,
 	 * Otherwise we have to read the blocks in from the disk.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((bp = findblk(ip->i_devvp, doffset)) != NULL &&
+	if ((bp = findblk(ip->i_devvp, doffset, FINDBLK_TEST)) != NULL &&
 	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		/*
 		 * bp must be ir_savebp, which is held locked for our use.
diff --git a/sys/vfs/ufs/ufs_bmap.c b/sys/vfs/ufs/ufs_bmap.c
index 7ed38da343..d26bf0fc4b 100644
--- a/sys/vfs/ufs/ufs_bmap.c
+++ b/sys/vfs/ufs/ufs_bmap.c
@@ -186,10 +186,12 @@ ufs_bmaparray(struct vnode *vp, ufs_daddr_t bn, ufs_daddr_t *bnp,
 		 * the indirect block isn't in the cache, or if we were
 		 * looking for an indirect block and we've found it.
 		 */
-		metalbn = xap->in_lbn;
-		if ((daddr == 0 && !findblk(vp, dbtodoff(fs, metalbn))) || metalbn == bn)
+		if ((daddr == 0 &&
+		     !findblk(vp, dbtodoff(fs, metalbn), FINDBLK_TEST)) ||
+		    metalbn == bn) {
 			break;
+		}
 		/*
 		 * If we get here, we've either got the block in the cache
 		 * or we have a disk address for it, go fetch it.
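
As a closing illustration (again not part of the diff), here is a hedged
sketch of the lock-and-recheck pattern the commit message describes for
callers that take the unlocked FINDBLK_TEST path, mirroring the nfs_serv.c
and nfs_vnops.c hunks.  The helper name, return convention, and include list
are assumptions made for the example.

/*
 * Illustrative sketch only; flush_if_dirty() is a hypothetical helper,
 * not a function in the tree.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/vnode.h>

static int
flush_if_dirty(struct vnode *vp, off_t loffset)
{
	struct buf *bp;

	/*
	 * Unlocked lookup.  The returned pointer is only a hint until the
	 * buffer has been locked and re-validated.
	 */
	if ((bp = findblk(vp, loffset, FINDBLK_TEST)) == NULL)
		return (0);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
		return (0);		/* busy; someone else owns it */

	/*
	 * Now that the buffer is locked, verify that it is still the one
	 * we looked up and that it is still dirty; another cpu may have
	 * reused it between the lookup and the lock.
	 */
	if (bp->b_vp != vp || bp->b_loffset != loffset ||
	    (bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (0);
	}
	bremfree(bp);
	bp->b_flags &= ~B_ASYNC;
	bwrite(bp);
	return (1);
}

Note that the default locked form, findblk(vp, loffset, 0), performs the
identity re-check internally after acquiring the buffer lock, so callers
using it only need to re-verify buffer state such as B_DELWRI.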