kernel - NFS - fix another B_CLUSTEROK / B_NEEDCOMMIT races
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 22 Jan 2010 05:09:51 +0000 (21:09 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 22 Jan 2010 05:09:51 +0000 (21:09 -0800)
* nfs_flush_docommit() was not handling the case where B_NEEDCOMMIT
  might get cleared by vfs_busy_pages() due to late detection of
  a modified VM page.

  This appears to be responsible for at least one fsx issue.

* Catch an edge case when clearing the PMAP modify bit in vfs_busy_pages().

* NFS no longer tries to cluster commit operations via the buffer cache's
  cluster code.  nfs_flush_docommit() will still do its own manual
  clustering of commit ops.  The problem with using B_CLUSTEROK is that
  the cluster code will collect bufs together but vfs_busy_pages() might
  have to clear B_NEEDCOMMIT when a late detection of a modified VM
  pages occurs.  This doesn't propagate back to the underlying bufs making
  up the cluster.

  This appears to be responsible for at least one fsx issue too.

sys/kern/vfs_bio.c
sys/vfs/nfs/nfs_bio.c
sys/vfs/nfs/nfs_vnops.c

index 19f0f8c..07679e8 100644 (file)
@@ -132,6 +132,7 @@ static u_int bd_wake_ary[BD_WAKE_SIZE];
 static u_int bd_wake_index;
 static u_int vm_cycle_point = ACT_INIT + ACT_ADVANCE * 6;
 static struct spinlock needsbuffer_spin;
+static int debug_commit;
 
 static struct thread *bufdaemon_td;
 static struct thread *bufdaemonhw_td;
@@ -191,6 +192,7 @@ SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0,
        "Amount of time KVA space was deallocated in an arbitrary buffer");
 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0,
        "Amount of time buffer re-use operations were successful");
+SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, "");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
        "sizeof(struct buf)");
 
@@ -3933,14 +3935,29 @@ vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
         */
        vm_page_test_dirty(m);
        if (m->dirty) {
-               pmap_clear_modify(m);
                if ((bp->b_flags & B_NEEDCOMMIT) &&
                    (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) {
+                       if (debug_commit)
                        kprintf("Warning: vfs_clean_one_page: bp %p "
-                               "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT\n",
+                               "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT"
+                               " cmd %d vd %02x/%02x x/s/e %d %d %d "
+                               "doff/end %d %d\n",
                                bp, (intmax_t)bp->b_loffset, bp->b_bcount,
-                               bp->b_flags);
+                               bp->b_flags, bp->b_cmd,
+                               m->valid, m->dirty, xoff, soff, eoff,
+                               bp->b_dirtyoff, bp->b_dirtyend);
                        bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+                       if (debug_commit)
+                               print_backtrace();
+               }
+               /*
+                * Only clear the pmap modified bits if ALL the dirty bits
+                * are set, otherwise the system might mis-clear portions
+                * of a page.
+                */
+               if (m->dirty == VM_PAGE_BITS_ALL &&
+                   (bp->b_flags & B_NEEDCOMMIT) == 0) {
+                       pmap_clear_modify(m);
                }
                if (bp->b_dirtyoff > soff - xoff)
                        bp->b_dirtyoff = soff - xoff;
index 364891d..29cee4c 100644 (file)
@@ -1153,23 +1153,23 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
                error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit);
 
                /*
-                * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
-                * to cluster the buffers needing commit.  This will allow
-                * the system to submit a single commit rpc for the whole
-                * cluster.  We can do this even if the buffer is not 100% 
-                * dirty (relative to the NFS blocksize), so we optimize the
-                * append-to-file-case.
-                *
-                * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
-                * cleared because write clustering only works for commit
-                * rpc's, not for the data portion of the write).
+                * We no longer try to use kern/vfs_bio's cluster code to
+                * cluster commits, so B_CLUSTEROK is no longer set with
+                * B_NEEDCOMMIT.  The problem is that a vfs_busy_pages()
+                * may have to clear B_NEEDCOMMIT if it finds underlying
+                * pages have been redirtied through a memory mapping
+                * and doing this on a clustered bp will probably cause
+                * a panic, plus the flag in the underlying NFS bufs
+                * making up the cluster bp will not be properly cleared.
                 */
-
                if (!error && iomode == NFSV3WRITE_UNSTABLE) {
                    bp->b_flags |= B_NEEDCOMMIT;
+#if 0
+                   /* XXX do not enable commit clustering */
                    if (bp->b_dirtyoff == 0
                        && bp->b_dirtyend == bp->b_bcount)
                        bp->b_flags |= B_CLUSTEROK;
+#endif
                } else {
                    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
                }
@@ -1547,21 +1547,16 @@ nfsmout:
        /*
         * End of RPC.  Now clean up the bp.
         *
-        * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
-        * to cluster the buffers needing commit.  This will allow
-        * the system to submit a single commit rpc for the whole
-        * cluster.  We can do this even if the buffer is not 100%
-        * dirty (relative to the NFS blocksize), so we optimize the
-        * append-to-file-case.
-        *
-        * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
-        * cleared because write clustering only works for commit
-        * rpc's, not for the data portion of the write).
+        * We no longer enable write clustering for commit operations,
+        * See around line 1157 for a more detailed comment.
         */
        if (!error && iomode == NFSV3WRITE_UNSTABLE) {
                bp->b_flags |= B_NEEDCOMMIT;
+#if 0
+               /* XXX do not enable commit clustering */
                if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
                        bp->b_flags |= B_CLUSTEROK;
+#endif
        } else {
                bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
        }
index 24f7b2d..ae039ac 100644 (file)
@@ -3288,14 +3288,19 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error)
                 */
                for (i = 0; i < info->bvsize; ++i) {
                        bp = info->bvary[i];
-                       bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
-                       if (retv) {
+                       if (retv || (bp->b_flags & B_NEEDCOMMIT) == 0) {
                                /*
-                                * Error, leave B_DELWRI intact
+                                * Either an error or the original
+                                * vfs_busy_pages() cleared B_NEEDCOMMIT
+                                * due to finding new dirty VM pages in
+                                * the buffer.
+                                *
+                                * Leave B_DELWRI intact.
                                 */
+                               bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
                                vfs_unbusy_pages(bp);
                                bp->b_cmd = BUF_CMD_DONE;
-                               brelse(bp);
+                               bqrelse(bp);
                        } else {
                                /*
                                 * Success, remove B_DELWRI ( bundirty() ).
@@ -3310,6 +3315,7 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error)
                                 */
                                bundirty(bp);
                                bp->b_flags &= ~B_ERROR;
+                               bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
                                bp->b_dirtyoff = bp->b_dirtyend = 0;
                                biodone(&bp->b_bio1);
                        }