From: Matthew Dillon Date: Fri, 22 Jan 2010 05:09:51 +0000 (-0800) Subject: kernel - NFS - fix another B_CLUSTEROK / B_NEEDCOMMIT races X-Git-Tag: v2.7.1~229^2~66 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/8ae5c7e01448df9bc6797560538ae7b4b16be6f4 kernel - NFS - fix another B_CLUSTEROK / B_NEEDCOMMIT races * nfs_flush_docommit() was not handling the case where B_NEEDCOMMIT might get cleared by vfs_busy_pages() due to late detection of a modified VM page. This appears to be responsible for at least one fsx issue. * Catch an edge case when clearing the PMAP modify bit in vfs_busy_pages(). * NFS no longer tries to cluster commit operations via the buffer cache's cluster code. nfs_flush_docommit() will still do its own manual clustering of commit ops. The problem with using B_CLUSTEROK is that the cluster code will collect bufs together but vfs_busy_pages() might have to clear B_NEEDCOMMIT when a late detection of a modified VM pages occurs. This doesn't propagate back to the underlying bufs making up the cluster. This appears to be responsible for at least one fsx issue too. --- diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 19f0f8c65e..07679e8a43 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -132,6 +132,7 @@ static u_int bd_wake_ary[BD_WAKE_SIZE]; static u_int bd_wake_index; static u_int vm_cycle_point = ACT_INIT + ACT_ADVANCE * 6; static struct spinlock needsbuffer_spin; +static int debug_commit; static struct thread *bufdaemon_td; static struct thread *bufdaemonhw_td; @@ -191,6 +192,7 @@ SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0, "Amount of time KVA space was deallocated in an arbitrary buffer"); SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, "Amount of time buffer re-use operations were successful"); +SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, ""); SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), "sizeof(struct buf)"); @@ -3933,14 +3935,29 @@ vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m) */ vm_page_test_dirty(m); if (m->dirty) { - pmap_clear_modify(m); if ((bp->b_flags & B_NEEDCOMMIT) && (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) { + if (debug_commit) kprintf("Warning: vfs_clean_one_page: bp %p " - "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT\n", + "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT" + " cmd %d vd %02x/%02x x/s/e %d %d %d " + "doff/end %d %d\n", bp, (intmax_t)bp->b_loffset, bp->b_bcount, - bp->b_flags); + bp->b_flags, bp->b_cmd, + m->valid, m->dirty, xoff, soff, eoff, + bp->b_dirtyoff, bp->b_dirtyend); bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); + if (debug_commit) + print_backtrace(); + } + /* + * Only clear the pmap modified bits if ALL the dirty bits + * are set, otherwise the system might mis-clear portions + * of a page. + */ + if (m->dirty == VM_PAGE_BITS_ALL && + (bp->b_flags & B_NEEDCOMMIT) == 0) { + pmap_clear_modify(m); } if (bp->b_dirtyoff > soff - xoff) bp->b_dirtyoff = soff - xoff; diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index 364891d212..29cee4c619 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -1153,23 +1153,23 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td) error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit); /* - * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try - * to cluster the buffers needing commit. This will allow - * the system to submit a single commit rpc for the whole - * cluster. We can do this even if the buffer is not 100% - * dirty (relative to the NFS blocksize), so we optimize the - * append-to-file-case. - * - * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be - * cleared because write clustering only works for commit - * rpc's, not for the data portion of the write). + * We no longer try to use kern/vfs_bio's cluster code to + * cluster commits, so B_CLUSTEROK is no longer set with + * B_NEEDCOMMIT. The problem is that a vfs_busy_pages() + * may have to clear B_NEEDCOMMIT if it finds underlying + * pages have been redirtied through a memory mapping + * and doing this on a clustered bp will probably cause + * a panic, plus the flag in the underlying NFS bufs + * making up the cluster bp will not be properly cleared. */ - if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; +#if 0 + /* XXX do not enable commit clustering */ if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; +#endif } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } @@ -1547,21 +1547,16 @@ nfsmout: /* * End of RPC. Now clean up the bp. * - * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try - * to cluster the buffers needing commit. This will allow - * the system to submit a single commit rpc for the whole - * cluster. We can do this even if the buffer is not 100% - * dirty (relative to the NFS blocksize), so we optimize the - * append-to-file-case. - * - * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be - * cleared because write clustering only works for commit - * rpc's, not for the data portion of the write). + * We no longer enable write clustering for commit operations, + * See around line 1157 for a more detailed comment. */ if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; +#if 0 + /* XXX do not enable commit clustering */ if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; +#endif } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index 24f7b2d484..ae039ac6ca 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -3288,14 +3288,19 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error) */ for (i = 0; i < info->bvsize; ++i) { bp = info->bvary[i]; - bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); - if (retv) { + if (retv || (bp->b_flags & B_NEEDCOMMIT) == 0) { /* - * Error, leave B_DELWRI intact + * Either an error or the original + * vfs_busy_pages() cleared B_NEEDCOMMIT + * due to finding new dirty VM pages in + * the buffer. + * + * Leave B_DELWRI intact. */ + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); vfs_unbusy_pages(bp); bp->b_cmd = BUF_CMD_DONE; - brelse(bp); + bqrelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). @@ -3310,6 +3315,7 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error) */ bundirty(bp); bp->b_flags &= ~B_ERROR; + bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_dirtyoff = bp->b_dirtyend = 0; biodone(&bp->b_bio1); }