NFS - Implement async write BIO, greatly increase sockbuf limits, better rexmit
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 18 Jul 2009 22:03:49 +0000 (15:03 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 18 Jul 2009 22:03:49 +0000 (15:03 -0700)
* Write BIOs are now executed asynchronously.

* Commit BIOs are now executed asynchronously.

* Increase the sockbuf limit to around 2MBytes, which is what is needed
  to handle the burst writing a DragonFly client can do now when the
  kernel flushes its buffers (at least for a UDP socket).

* Adjust the rexmit code.  Also detect duplicate packets received and
  adjust the SRTT up a little since no RTT is calculated when a timeout
  occurs.

* NEW SYSCTLS:

  vfs.nfs.soreserve - This parameter now overrides all other kernel and
  user parameters to set the NFS sockets limit.

  vfs.nfs.maxasyncbio - This is set to the hard-coded maximum by default
  and may be reduced to accommodate insufficiently
  endowed remote servers.

* NOTE ON UDP MOUNTS TO REMOTE SERVERS.  If the remote server does not have
  sufficient sockbuf space to receive a large burst of UDP packets performance
  will suffer greatly.  To alleviate this issue you can decrease
  vfs.nfs.maxasyncbio on the DragonFly client, or you can simply use a
  TCP mount.

sys/vfs/nfs/nfs.h
sys/vfs/nfs/nfs_bio.c
sys/vfs/nfs/nfs_iod.c
sys/vfs/nfs/nfs_socket.c
sys/vfs/nfs/nfs_syscalls.c
sys/vfs/nfs/nfs_vnops.c
sys/vfs/nfs/nfsm_subs.c
sys/vfs/nfs/nfsm_subs.h
sys/vfs/nfs/nfsmount.h

index a6d57ec..dfaed7c 100644 (file)
@@ -182,7 +182,7 @@ struct nfs_args {
 #define        NFSSTA_MNTD             0x00200000  /* Mnt server for mnt point */
 #define        NFSSTA_DISMINPROG       0x00400000  /* Dismount in progress */
 #define        NFSSTA_DISMNT           0x00800000  /* Dismounted */
-#define        NFSSTA_UNUSED24         0x01000000
+#define        NFSSTA_SENDSPACE        0x01000000  /* Printed sendspace warning */
 #define        NFSSTA_UNUSED25         0x02000000
 #define        NFSSTA_UNUSED26         0x04000000
 #define        NFSSTA_UNUSED27         0x08000000
@@ -313,6 +313,8 @@ extern vm_zone_t nfsmount_zone;
 
 extern struct callout nfs_timer_handle;
 extern int nfs_async;
+extern int nfs_maxasyncbio;
+extern int nfs_soreserve;
 
 struct uio;
 struct buf;
@@ -663,18 +665,20 @@ int       nfs_disct (struct mbuf **, caddr_t *, int, int, caddr_t *);
 int    nfs_vinvalbuf (struct vnode *, int, int);
 int    nfs_readrpc_uio (struct vnode *, struct uio *);
 void   nfs_readrpc_bio (struct vnode *, struct bio *);
-int    nfs_writerpc (struct vnode *, struct uio *, int *, int *);
-int    nfs_commit (struct vnode *vp, u_quad_t offset, int cnt, 
+int    nfs_writerpc_uio (struct vnode *, struct uio *, int *, int *);
+void   nfs_writerpc_bio (struct vnode *, struct bio *);
+int    nfs_commitrpc_uio (struct vnode *vp, u_quad_t offset, int cnt,
                        struct thread *td);
-int    nfs_readdirrpc (struct vnode *, struct uio *);
-void   nfs_startio (struct vnode *vp, struct bio *, struct thread *);
+void   nfs_commitrpc_bio (struct vnode *vp, struct bio *);
+int    nfs_readdirrpc_uio (struct vnode *, struct uio *);
+void   nfs_startio(struct vnode *vp, struct bio *, struct thread *);
+int    nfs_doio(struct vnode *vp, struct bio *, struct thread *);
 void   nfs_asyncio(struct vnode *vp, struct bio *bio);
 int    nfs_asyncok(struct nfsmount *nmp);
-int    nfs_iowait (struct bio *bio);
 
-int    nfs_readlinkrpc (struct vnode *, struct uio *);
+int    nfs_readlinkrpc_uio (struct vnode *, struct uio *);
 int    nfs_sigintr (struct nfsmount *, struct nfsreq *, struct thread *);
-int    nfs_readdirplusrpc (struct vnode *, struct uio *);
+int    nfs_readdirplusrpc_uio (struct vnode *, struct uio *);
 int    netaddr_match (int, union nethostaddr *, struct sockaddr *);
 
 int    nfs_loadattrcache (struct vnode *, struct mbuf **, caddr_t *,
index 5b48974..7f92d7d 100644 (file)
@@ -74,6 +74,9 @@ static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
                                   int size, struct thread *td);
 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
 static void nfsiodone_sync(struct bio *bio);
+static void nfs_readrpc_bio_done(nfsm_info_t info);
+static void nfs_writerpc_bio_done(nfsm_info_t info);
+static void nfs_commitrpc_bio_done(nfsm_info_t info);
 
 /*
  * Vnode op for VM getpages.
@@ -311,7 +314,7 @@ nfs_putpages(struct vop_putpages_args *ap)
        else
            iomode = NFSV3WRITE_FILESYNC;
 
-       error = nfs_writerpc(vp, &uio, &iomode, &must_commit);
+       error = nfs_writerpc_uio(vp, &uio, &iomode, &must_commit);
 
        msf_buf_free(msf);
 
@@ -407,7 +410,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
                case VREG:
                        return (nfs_readrpc_uio(vp, uio));
                case VLNK:
-                       return (nfs_readlinkrpc(vp, uio));
+                       return (nfs_readlinkrpc_uio(vp, uio));
                case VDIR:
                        break;
                default:
@@ -438,14 +441,6 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
                                rabp->b_cmd = BUF_CMD_READ;
                                vfs_busy_pages(vp, rabp);
                                nfs_asyncio(vp, &rabp->b_bio2);
-#if 0
-                               if (nfs_startio(vp, &rabp->b_bio2, td)) {
-                                   rabp->b_flags |= B_INVAL|B_ERROR;
-                                   vfs_unbusy_pages(rabp);
-                                   brelse(rabp);
-                                   break;
-                               }
-#endif
                            } else {
                                brelse(rabp);
                            }
@@ -503,8 +498,7 @@ again:
                    bp->b_bio2.bio_done = nfsiodone_sync;
                    bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
-                   nfs_startio(vp, &bp->b_bio2, td);
-                   error = nfs_iowait(&bp->b_bio2);
+                   error = nfs_doio(vp, &bp->b_bio2, td);
                    if (error) {
                        brelse(bp);
                        return (error);
@@ -534,8 +528,7 @@ again:
                    bp->b_bio2.bio_done = nfsiodone_sync;
                    bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
-                   nfs_startio(vp, &bp->b_bio2, td);
-                   error = nfs_iowait(&bp->b_bio2);
+                   error = nfs_doio(vp, &bp->b_bio2, td);
                    if (error) {
                        bp->b_flags |= B_ERROR | B_INVAL;
                        brelse(bp);
@@ -563,11 +556,9 @@ again:
                    bp->b_bio2.bio_done = nfsiodone_sync;
                    bp->b_bio2.bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
-                   nfs_startio(vp, &bp->b_bio2, td);
-                   error = nfs_iowait(&bp->b_bio2);
-                   if (error) {
+                   error = nfs_doio(vp, &bp->b_bio2, td);
+                   if (error)
                            brelse(bp);
-                   }
                    while (error == NFSERR_BAD_COOKIE) {
                        kprintf("got bad cookie vp %p bp %p\n", vp, bp);
                        nfs_invaldir(vp);
@@ -595,8 +586,7 @@ again:
                                    bp->b_bio2.bio_done = nfsiodone_sync;
                                    bp->b_bio2.bio_flags |= BIO_SYNC;
                                    vfs_busy_pages(vp, bp);
-                                   nfs_startio(vp, &bp->b_bio2, td);
-                                   error = nfs_iowait(&bp->b_bio2);
+                                   error = nfs_doio(vp, &bp->b_bio2, td);
                                    /*
                                     * no error + B_INVAL == directory EOF,
                                     * use the block.
@@ -642,13 +632,6 @@ again:
                                rabp->b_cmd = BUF_CMD_READ;
                                vfs_busy_pages(vp, rabp);
                                nfs_asyncio(vp, &rabp->b_bio2);
-#if 0
-                               if (nfs_startio(vp, &rabp->b_bio2, td)) {
-                                   rabp->b_flags |= B_INVAL|B_ERROR;
-                                   vfs_unbusy_pages(rabp);
-                                   brelse(rabp);
-                               }
-#endif
                            } else {
                                brelse(rabp);
                            }
@@ -876,7 +859,7 @@ restart:
        do {
                if ((np->n_flag & NDONTCACHE) && uio->uio_iovcnt == 1) {
                    iomode = NFSV3WRITE_FILESYNC;
-                   error = nfs_writerpc(vp, uio, &iomode, &must_commit);
+                   error = nfs_writerpc_uio(vp, uio, &iomode, &must_commit);
                    if (must_commit)
                            nfs_clearcommit(vp->v_mount);
                    break;
@@ -972,8 +955,7 @@ again:
                        bp->b_bio2.bio_done = nfsiodone_sync;
                        bp->b_bio2.bio_flags |= BIO_SYNC;
                        vfs_busy_pages(vp, bp);
-                       nfs_startio(vp, &bp->b_bio2, td);
-                       error = nfs_iowait(&bp->b_bio2);
+                       error = nfs_doio(vp, &bp->b_bio2, td);
                        if (error) {
                                brelse(bp);
                                break;
@@ -1205,7 +1187,7 @@ nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
 int
 nfs_asyncok(struct nfsmount *nmp)
 {
-       return (nmp->nm_bioqlen < NFS_MAXASYNCBIO &&
+       return (nmp->nm_bioqlen < nfs_maxasyncbio &&
                nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE &&
                nmp->nm_rxstate <= NFSSVC_PENDING &&
                nmp->nm_txstate <= NFSSVC_PENDING);
@@ -1238,30 +1220,102 @@ nfs_asyncio(struct vnode *vp, struct bio *bio)
 }
 
 /*
- * Initiate an I/O operation to/from a cache block.  If the BIO is
- * flagged BIO_SYNC, or if the async thread is not running, the
- * operation will be executed synchronously.
+ * nfs_doio()  - Execute a BIO operation synchronously.  The BIO will be
+ *               completed and its error returned.  The caller is responsible
+ *               for brelse()ing it.  ONLY USE FOR BIO_SYNC IOs!  Otherwise
+ *               our error probe will be against an invalid pointer.
  *
- * Typically for BIO_SYNC the caller set up the completion and will
- * call nfs_iowait() to obtain the error code, then brelse().
- * iowait is a degenerate routine.
+ * nfs_startio()- Execute a BIO operation asynchronously.
  *
- * For async operation we set up a request and queue it the transmit
- * thread along with a done function to deal with cleanup after
- * the RPC completes.  The presence of a done function causes the
- * state machine to automatically move the req onto the reqrxq when
- * a reponse is received.
+ * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation,
+ *      which basically just queues it to the txthread.  nfs_startio()
+ *      actually initiates the I/O AFTER it has gotten to the txthread.
  *
- * NOTE! TD MIGHT BE NULL
+ * NOTE: td might be NULL.
  */
 void
 nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
 {
        struct buf *bp = bio->bio_buf;
+       struct nfsnode *np;
+       struct nfsmount *nmp;
+
+       KKASSERT(vp->v_tag == VT_NFS);
+       np = VTONFS(vp);
+       nmp = VFSTONFS(vp->v_mount);
+
+       /*
+        * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
+        * do this here so we do not have to do it in all the code that
+        * calls us.
+        */
+       bp->b_flags &= ~(B_ERROR | B_INVAL);
+
+       KASSERT(bp->b_cmd != BUF_CMD_DONE,
+               ("nfs_doio: bp %p already marked done!", bp));
+
+       if (bp->b_cmd == BUF_CMD_READ) {
+           switch (vp->v_type) {
+           case VREG:
+               nfsstats.read_bios++;
+               nfs_readrpc_bio(vp, bio);
+               break;
+           case VLNK:
+#if 0
+               bio->bio_offset = 0;
+               nfsstats.readlink_bios++;
+               nfs_readlinkrpc_bio(vp, bio);
+#else
+               nfs_doio(vp, bio, td);
+#endif
+               break;
+           case VDIR:
+               /*
+                * NOTE: If nfs_readdirplusrpc_bio() is requested but
+                *       not supported, it will chain to
+                *       nfs_readdirrpc_bio().
+                */
+#if 0
+               nfsstats.readdir_bios++;
+               uiop->uio_offset = bio->bio_offset;
+               if (nmp->nm_flag & NFSMNT_RDIRPLUS)
+                       nfs_readdirplusrpc_bio(vp, bio);
+               else
+                       nfs_readdirrpc_bio(vp, bio);
+#else
+               nfs_doio(vp, bio, td);
+#endif
+               break;
+           default:
+               kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
+               bp->b_flags |= B_ERROR;
+               bp->b_error = EINVAL;
+               biodone(bio);
+               break;
+           }
+       } else {
+           /*
+            * If we only need to commit, try to commit.  If this fails
+            * it will chain through to the write.  Basically all the logic
+            * in nfs_doio() is replicated.
+            */
+           KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
+           if (bp->b_flags & B_NEEDCOMMIT)
+               nfs_commitrpc_bio(vp, bio);
+           else
+               nfs_writerpc_bio(vp, bio);
+       }
+}
+
+int
+nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
+{
+       struct buf *bp = bio->bio_buf;
        struct uio *uiop;
        struct nfsnode *np;
        struct nfsmount *nmp;
-       int error = 0, iomode, must_commit = 0;
+       int error = 0;
+       int iomode, must_commit;
        struct uio uio;
        struct iovec io;
 
@@ -1278,9 +1332,6 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
         * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
         * do this here so we do not have to do it in all the code that
         * calls us.
-        *
-        * NOTE: An EINPROGRESS response can be returned if the bio was
-        *       asynchronous.
         */
        bp->b_flags &= ~(B_ERROR | B_INVAL);
 
@@ -1294,15 +1345,7 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
 
            switch (vp->v_type) {
            case VREG:
-               /*
-                * Note: NFS assumes BIO_SYNC is run synchronously, so
-                *       be sure to do that.
-                */
                nfsstats.read_bios++;
-               if ((bio->bio_flags & BIO_SYNC) == 0) {
-                       nfs_readrpc_bio(vp, bio);
-                       return;
-               }
                uiop->uio_offset = bio->bio_offset;
                error = nfs_readrpc_uio(vp, uiop);
                if (error == 0) {
@@ -1332,18 +1375,18 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
            case VLNK:
                uiop->uio_offset = 0;
                nfsstats.readlink_bios++;
-               error = nfs_readlinkrpc(vp, uiop);
+               error = nfs_readlinkrpc_uio(vp, uiop);
                break;
            case VDIR:
                nfsstats.readdir_bios++;
                uiop->uio_offset = bio->bio_offset;
                if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
-                       error = nfs_readdirplusrpc(vp, uiop);
+                       error = nfs_readdirplusrpc_uio(vp, uiop);
                        if (error == NFSERR_NOTSUPP)
                                nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
                }
                if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
-                       error = nfs_readdirrpc(vp, uiop);
+                       error = nfs_readdirrpc_uio(vp, uiop);
                /*
                 * end-of-directory sets B_INVAL but does not generate an
                 * error.
@@ -1359,6 +1402,7 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
                bp->b_flags |= B_ERROR;
                bp->b_error = error;
            }
+           bp->b_resid = uiop->uio_resid;
        } else {
            /* 
             * If we only need to commit, try to commit
@@ -1369,14 +1413,15 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
                    off_t off;
 
                    off = bio->bio_offset + bp->b_dirtyoff;
-                   retv = nfs_commit(vp, off, 
-                               bp->b_dirtyend - bp->b_dirtyoff, td);
+                   retv = nfs_commitrpc_uio(vp, off,
+                                            bp->b_dirtyend - bp->b_dirtyoff,
+                                            td);
                    if (retv == 0) {
                            bp->b_dirtyoff = bp->b_dirtyend = 0;
                            bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
                            bp->b_resid = 0;
                            biodone(bio);
-                           return;
+                           return(0);
                    }
                    if (retv == NFSERR_STALEWRITEVERF) {
                            nfs_clearcommit(vp->v_mount);
@@ -1386,7 +1431,6 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
            /*
             * Setup for actual write
             */
-
            if (bio->bio_offset + bp->b_dirtyend > np->n_size)
                bp->b_dirtyend = np->n_size - bio->bio_offset;
 
@@ -1403,7 +1447,8 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
                else
                    iomode = NFSV3WRITE_FILESYNC;
 
-               error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
+               must_commit = 0;
+               error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit);
 
                /*
                 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
@@ -1462,16 +1507,25 @@ nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
                    }
                    bp->b_dirtyoff = bp->b_dirtyend = 0;
                }
+               if (must_commit)
+                   nfs_clearcommit(vp->v_mount);
+               bp->b_resid = uiop->uio_resid;
            } else {
                bp->b_resid = 0;
-               biodone(bio);
-               return;
            }
        }
-       bp->b_resid = uiop->uio_resid;
-       if (must_commit)
-           nfs_clearcommit(vp->v_mount);
+
+       /*
+        * I/O was run synchronously, biodone() it and calculate the
+        * error to return.
+        */
        biodone(bio);
+       KKASSERT(bp->b_cmd == BUF_CMD_DONE);
+       if (bp->b_flags & B_EINTR)
+               return (EINTR);
+       if (bp->b_flags & B_ERROR)
+               return (bp->b_error ? bp->b_error : EIO);
+       return (0);
 }
 
 /*
@@ -1532,28 +1586,8 @@ nfsiodone_sync(struct bio *bio)
 }
 
 /*
- * If nfs_startio() was told to do the request BIO_SYNC it will
- * complete the request before returning, so assert that the
- * request is in-fact complete.
- */
-int
-nfs_iowait(struct bio *bio)
-{
-       struct buf *bp = bio->bio_buf;
-
-       KKASSERT(bp->b_cmd == BUF_CMD_DONE);
-       if (bp->b_flags & B_EINTR)
-               return (EINTR);
-       if (bp->b_flags & B_ERROR)
-               return (bp->b_error ? bp->b_error : EIO);
-       return (0);
-}
-
-/*
  * nfs read rpc - BIO version
  */
-static void nfs_readrpc_bio_done(nfsm_info_t info);
-
 void
 nfs_readrpc_bio(struct vnode *vp, struct bio *bio)
 {
@@ -1569,12 +1603,13 @@ nfs_readrpc_bio(struct vnode *vp, struct bio *bio)
 
        nmp = VFSTONFS(vp->v_mount);
        tsiz = bp->b_bcount;
+       KKASSERT(tsiz <= nmp->nm_rsize);
        if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) {
                error = EFBIG;
                goto nfsmout;
        }
        nfsstats.rpccnt[NFSPROC_READ]++;
-       len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz;
+       len = tsiz;
        nfsm_reqhead(info, vp, NFSPROC_READ,
                     NFSX_FH(info->v3) + NFSX_UNSIGNED * 3);
        ERROROUT(nfsm_fhtom(info, vp));
@@ -1654,125 +1689,317 @@ nfsmout:
        biodone(bio);
 }
 
-#if 0
-
 /*
  * nfs write call - BIO version
  */
-int
-nfs_writerpc_bio(struct vnode *vp, struct bio *bio, int *iomode, int *must_commit)
+void
+nfs_writerpc_bio(struct vnode *vp, struct bio *bio)
 {
-       u_int32_t *tl;
-       int32_t backup;
        struct nfsmount *nmp = VFSTONFS(vp->v_mount);
-       int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
-       int  committed = NFSV3WRITE_FILESYNC;
-       struct nfsm_info info;
+       struct nfsnode *np = VTONFS(vp);
+       struct buf *bp = bio->bio_buf;
+       u_int32_t *tl;
+       int len;
+       int iomode;
+       int error = 0;
+       struct nfsm_info *info;
+       off_t offset;
 
-       info.mrep = NULL;
-       info.v3 = NFS_ISV3(vp);
+       /*
+        * Setup for actual write.  Just clean up the bio if there
+        * is nothing to do.
+        */
+       if (bio->bio_offset + bp->b_dirtyend > np->n_size)
+               bp->b_dirtyend = np->n_size - bio->bio_offset;
 
-#ifndef DIAGNOSTIC
-       if (uiop->uio_iovcnt != 1)
-               panic("nfs: writerpc iovcnt > 1");
-#endif
-       *must_commit = 0;
-       tsiz = uiop->uio_resid;
-       if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize)
-               return (EFBIG);
-       while (tsiz > 0) {
-               nfsstats.rpccnt[NFSPROC_WRITE]++;
-               len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
-               nfsm_reqhead(&info, vp, NFSPROC_WRITE,
-                            NFSX_FH(info.v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
-               ERROROUT(nfsm_fhtom(&info, vp));
-               if (info.v3) {
-                       tl = nfsm_build(&info, 5 * NFSX_UNSIGNED);
-                       txdr_hyper(uiop->uio_offset, tl);
-                       tl += 2;
-                       *tl++ = txdr_unsigned(len);
-                       *tl++ = txdr_unsigned(*iomode);
-                       *tl = txdr_unsigned(len);
-               } else {
-                       u_int32_t x;
-
-                       tl = nfsm_build(&info, 4 * NFSX_UNSIGNED);
-                       /* Set both "begin" and "current" to non-garbage. */
-                       x = txdr_unsigned((u_int32_t)uiop->uio_offset);
-                       *tl++ = x;      /* "begin offset" */
-                       *tl++ = x;      /* "current offset" */
-                       x = txdr_unsigned(len);
-                       *tl++ = x;      /* total to this offset */
-                       *tl = x;        /* size of this write */
-               }
-               ERROROUT(nfsm_uiotom(&info, uiop, len));
-               NEGKEEPOUT(nfsm_request(&info, vp, NFSPROC_WRITE, uiop->uio_td,
-                                       nfs_vpcred(vp, ND_WRITE), &error));
-               if (info.v3) {
-                       /*
-                        * The write RPC returns a before and after mtime.  The
-                        * nfsm_wcc_data() macro checks the before n_mtime
-                        * against the before time and stores the after time
-                        * in the nfsnode's cached vattr and n_mtime field.
-                        * The NRMODIFIED bit will be set if the before
-                        * time did not match the original mtime.
-                        */
-                       wccflag = NFSV3_WCCCHK;
-                       ERROROUT(nfsm_wcc_data(&info, vp, &wccflag));
-                       if (error == 0) {
-                               NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF));
-                               rlen = fxdr_unsigned(int, *tl++);
-                               if (rlen == 0) {
-                                       error = NFSERR_IO;
-                                       m_freem(info.mrep);
-                                       info.mrep = NULL;
-                                       break;
-                               } else if (rlen < len) {
-                                       backup = len - rlen;
-                                       uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup;
-                                       uiop->uio_iov->iov_len += backup;
-                                       uiop->uio_offset -= backup;
-                                       uiop->uio_resid += backup;
-                                       len = rlen;
-                               }
-                               commit = fxdr_unsigned(int, *tl++);
+       if (bp->b_dirtyend <= bp->b_dirtyoff) {
+               bp->b_resid = 0;
+               biodone(bio);
+               return;
+       }
+       len = bp->b_dirtyend - bp->b_dirtyoff;
+       offset = bio->bio_offset + bp->b_dirtyoff;
+       if (offset + len > nmp->nm_maxfilesize) {
+               bp->b_flags |= B_ERROR;
+               bp->b_error = EFBIG;
+               biodone(bio);
+               return;
+       }
+       bp->b_resid = len;
+       nfsstats.write_bios++;
+
+       info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
+       info->mrep = NULL;
+       info->v3 = NFS_ISV3(vp);
+       info->info_writerpc.must_commit = 0;
+       if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
+               iomode = NFSV3WRITE_UNSTABLE;
+       else
+               iomode = NFSV3WRITE_FILESYNC;
 
+       KKASSERT(len <= nmp->nm_wsize);
+
+       nfsstats.rpccnt[NFSPROC_WRITE]++;
+       nfsm_reqhead(info, vp, NFSPROC_WRITE,
+                    NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
+       ERROROUT(nfsm_fhtom(info, vp));
+       if (info->v3) {
+               tl = nfsm_build(info, 5 * NFSX_UNSIGNED);
+               txdr_hyper(offset, tl);
+               tl += 2;
+               *tl++ = txdr_unsigned(len);
+               *tl++ = txdr_unsigned(iomode);
+               *tl = txdr_unsigned(len);
+       } else {
+               u_int32_t x;
+
+               tl = nfsm_build(info, 4 * NFSX_UNSIGNED);
+               /* Set both "begin" and "current" to non-garbage. */
+               x = txdr_unsigned((u_int32_t)offset);
+               *tl++ = x;      /* "begin offset" */
+               *tl++ = x;      /* "current offset" */
+               x = txdr_unsigned(len);
+               *tl++ = x;      /* total to this offset */
+               *tl = x;        /* size of this write */
+       }
+       ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len));
+       info->bio = bio;
+       info->done = nfs_writerpc_bio_done;
+       nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL,
+                        nfs_vpcred(vp, ND_WRITE));
+       return;
+nfsmout:
+       kfree(info, M_NFSREQ);
+       bp->b_error = error;
+       bp->b_flags |= B_ERROR;
+       biodone(bio);
+}
+
+static void
+nfs_writerpc_bio_done(nfsm_info_t info)
+{
+       struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
+       struct nfsnode *np = VTONFS(info->vp);
+       struct bio *bio = info->bio;
+       struct buf *bp = bio->bio_buf;
+       int wccflag = NFSV3_WCCRATTR;
+       int iomode = NFSV3WRITE_FILESYNC;
+       int commit;
+       int rlen;
+       int error;
+       int len = bp->b_resid;  /* b_resid was set to shortened length */
+       u_int32_t *tl;
+
+       if (info->v3) {
+               /*
+                * The write RPC returns a before and after mtime.  The
+                * nfsm_wcc_data() macro checks the before n_mtime
+                * against the before time and stores the after time
+                * in the nfsnode's cached vattr and n_mtime field.
+                * The NRMODIFIED bit will be set if the before
+                * time did not match the original mtime.
+                */
+               wccflag = NFSV3_WCCCHK;
+               ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
+               if (error == 0) {
+                       NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF));
+                       rlen = fxdr_unsigned(int, *tl++);
+                       if (rlen == 0) {
+                               error = NFSERR_IO;
+                               m_freem(info->mrep);
+                               info->mrep = NULL;
+                               goto nfsmout;
+                       } else if (rlen < len) {
+#if 0
                                /*
-                                * Return the lowest committment level
-                                * obtained by any of the RPCs.
+                                * XXX what do we do here?
                                 */
-                               if (committed == NFSV3WRITE_FILESYNC)
-                                       committed = commit;
-                               else if (committed == NFSV3WRITE_DATASYNC &&
-                                       commit == NFSV3WRITE_UNSTABLE)
-                                       committed = commit;
-                               if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
-                                   bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
-                                       NFSX_V3WRITEVERF);
-                                   nmp->nm_state |= NFSSTA_HASWRITEVERF;
-                               } else if (bcmp((caddr_t)tl,
-                                   (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
-                                   *must_commit = 1;
-                                   bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
-                                       NFSX_V3WRITEVERF);
-                               }
+                               backup = len - rlen;
+                               uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup;
+                               uiop->uio_iov->iov_len += backup;
+                               uiop->uio_offset -= backup;
+                               uiop->uio_resid += backup;
+                               len = rlen;
+#endif
+                       }
+                       commit = fxdr_unsigned(int, *tl++);
+
+                       /*
+                        * Return the lowest commitment level
+                        * obtained by any of the RPCs.
+                        */
+                       if (iomode == NFSV3WRITE_FILESYNC)
+                               iomode = commit;
+                       else if (iomode == NFSV3WRITE_DATASYNC &&
+                               commit == NFSV3WRITE_UNSTABLE)
+                               iomode = commit;
+                       if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
+                           bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
+                           nmp->nm_state |= NFSSTA_HASWRITEVERF;
+                       } else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) {
+                           info->info_writerpc.must_commit = 1;
+                           bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
                        }
-               } else {
-                       ERROROUT(nfsm_loadattr(&info, vp, NULL));
                }
-               m_freem(info.mrep);
-               info.mrep = NULL;
+       } else {
+               ERROROUT(nfsm_loadattr(info, info->vp, NULL));
+       }
+       m_freem(info->mrep);
+       info->mrep = NULL;
+       len = 0;
+nfsmout:
+       if (info->vp->v_mount->mnt_flag & MNT_ASYNC)
+               iomode = NFSV3WRITE_FILESYNC;
+       bp->b_resid = len;
+
+       /*
+        * End of RPC.  Now clean up the bp.
+        *
+        * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
+        * to cluster the buffers needing commit.  This will allow
+        * the system to submit a single commit rpc for the whole
+        * cluster.  We can do this even if the buffer is not 100%
+        * dirty (relative to the NFS blocksize), so we optimize the
+        * append-to-file-case.
+        *
+        * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
+        * cleared because write clustering only works for commit
+        * rpc's, not for the data portion of the write).
+        */
+       if (!error && iomode == NFSV3WRITE_UNSTABLE) {
+               bp->b_flags |= B_NEEDCOMMIT;
+               if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
+                       bp->b_flags |= B_CLUSTEROK;
+       } else {
+               bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+       }
+
+       /*
+        * For an interrupted write, the buffer is still valid
+        * and the write hasn't been pushed to the server yet,
+        * so we can't set B_ERROR and report the interruption
+        * by setting B_EINTR. For the async case, B_EINTR
+        * is not relevant, so the rpc attempt is essentially
+        * a noop.  For the case of a V3 write rpc not being
+        * committed to stable storage, the block is still
+        * dirty and requires either a commit rpc or another
+        * write rpc with iomode == NFSV3WRITE_FILESYNC before
+        * the block is reused. This is indicated by setting
+        * the B_DELWRI and B_NEEDCOMMIT flags.
+        *
+        * If the buffer is marked B_PAGING, it does not reside on
+        * the vp's paging queues so we cannot call bdirty().  The
+        * bp in this case is not an NFS cache block so we should
+        * be safe. XXX
+        */
+       if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
+               crit_enter();
+               bp->b_flags &= ~(B_INVAL|B_NOCACHE);
+               if ((bp->b_flags & B_PAGING) == 0)
+                       bdirty(bp);
                if (error)
-                       break;
-               tsiz -= len;
+                       bp->b_flags |= B_EINTR;
+               crit_exit();
+       } else {
+               if (error) {
+                       bp->b_flags |= B_ERROR;
+                       bp->b_error = np->n_error = error;
+                       np->n_flag |= NWRITEERR;
+               }
+               bp->b_dirtyoff = bp->b_dirtyend = 0;
+       }
+       if (info->info_writerpc.must_commit)
+               nfs_clearcommit(info->vp->v_mount);
+       kfree(info, M_NFSREQ);
+       if (error) {
+               bp->b_flags |= B_ERROR;
+               bp->b_error = error;
+       }
+       biodone(bio);
+}
+
+/*
+ * Nfs Version 3 commit rpc - BIO version
+ *
+ * This function issues the commit rpc and will chain to a write
+ * rpc if necessary.
+ */
+void
+nfs_commitrpc_bio(struct vnode *vp, struct bio *bio)
+{
+       struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+       struct buf *bp = bio->bio_buf;
+       struct nfsm_info *info;
+       int error = 0;
+       u_int32_t *tl;
+
+       /*
+        * If we have never received a write verifier from the server
+        * there is nothing to commit against; just clean up the buffer
+        * state and complete the bio.
+        */
+       if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
+               bp->b_dirtyoff = bp->b_dirtyend = 0;
+               bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+               bp->b_resid = 0;
+               biodone(bio);
+               return;
+       }
+
+       /*
+        * The info structure carries the async RPC state.  It is freed
+        * either by the done callback or by the early-error path below.
+        */
+       info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
+       info->mrep = NULL;
+       info->v3 = 1;
+
+       nfsstats.rpccnt[NFSPROC_COMMIT]++;
+       nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1));
+       ERROROUT(nfsm_fhtom(info, vp));
+       tl = nfsm_build(info, 3 * NFSX_UNSIGNED);
+       /* encode the 64-bit offset and 32-bit byte count of the dirty range */
+       txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl);
+       tl += 2;
+       *tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff);
+       info->bio = bio;
+       info->done = nfs_commitrpc_bio_done;
+       /*
+        * Issue the RPC asynchronously; nfs_commitrpc_bio_done runs
+        * when the reply is processed.
+        */
+       nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL,
+                        nfs_vpcred(vp, ND_WRITE));
+       return;
+nfsmout:
+       /*
+        * Chain to write RPC on (early) error
+        */
+       kfree(info, M_NFSREQ);
+       nfs_writerpc_bio(vp, bio);
+}
+
+/*
+ * Async completion for the commit rpc.  Validates the server's write
+ * verifier; on success the buffer's dirty range and commit flags are
+ * cleared and the bio completed, otherwise we chain to a write rpc.
+ */
+static void
+nfs_commitrpc_bio_done(nfsm_info_t info)
+{
+       struct vnode *vp = info->vp;
+       struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+       struct bio *bio = info->bio;
+       struct buf *bp = bio->bio_buf;
+       u_int32_t *tl;
+       int wccflag = NFSV3_WCCRATTR;
+       int error = 0;
+
+       ERROROUT(nfsm_wcc_data(info, vp, &wccflag));
+       if (error == 0) {
+               NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF));
+               /*
+                * A changed write verifier indicates the server may have
+                * lost our unstable data; record the new verifier and
+                * fail the commit so the data gets rewritten.
+                */
+               if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) {
+                       bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF);
+                       error = NFSERR_STALEWRITEVERF;
+               }
         }
+       m_freem(info->mrep);
+       info->mrep = NULL;
+
+       /*
+        * On completion we must chain to a write bio if an
+        * error occurred.
+        *
+        * NOTE: info is freed below, use the cached vp (not info->vp)
+        *       past this point to avoid a use-after-free.
+        */
 nfsmout:
-       if (vp->v_mount->mnt_flag & MNT_ASYNC)
-               committed = NFSV3WRITE_FILESYNC;
-       *iomode = committed;
-       if (error)
-               uiop->uio_resid = tsiz;
-       return (error);
+       kfree(info, M_NFSREQ);
+       if (error == 0) {
+               bp->b_dirtyoff = bp->b_dirtyend = 0;
+               bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+               bp->b_resid = 0;
+               biodone(bio);
+       } else {
+               kprintf("commitrpc_bioC %lld -> CHAIN WRITE\n",
+                       (long long)bio->bio_offset);
+               nfs_writerpc_bio(vp, bio);
+       }
 }
 
-#endif
index b524941..4bded66 100644 (file)
@@ -177,7 +177,13 @@ nfssvc_iod_writer(void *arg)
                        break;
                nmp->nm_txstate = NFSSVC_WAITING;
 
+               /*
+                * Eep, we could blow out the mbuf allocator if we just
+                * did everything the kernel wanted us to do.
+                */
                while ((bio = TAILQ_FIRST(&nmp->nm_bioq)) != NULL) {
+                       if (nmp->nm_reqqlen >= NFS_MAXASYNCBIO)
+                               break;
                        TAILQ_REMOVE(&nmp->nm_bioq, bio, bio_act);
                        vp = bio->bio_driver_info;
                        crit_exit();
index 02bfb56..f7460d7 100644 (file)
  * 4 - write
  */
 static int proct[NFS_NPROCS] = {
-       0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-       0, 0, 0,
+       0, 1, 0, 2, 1, 3, 3, 4, 0, 0,   /* 00-09        */
+       0, 0, 0, 0, 0, 0, 3, 3, 0, 0,   /* 10-19        */
+       0, 5, 0, 0, 0, 0,               /* 20-29        */
+};
+
+static int multt[NFS_NPROCS] = {
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 00-09        */
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 10-19        */
+       1, 2, 1, 1, 1, 1,               /* 20-29        */
 };
 
 static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
 static int nfs_realign_test;
 static int nfs_realign_count;
-static int nfs_bufpackets = 4;
 static int nfs_showrtt;
 static int nfs_showrexmit;
+int nfs_maxasyncbio = NFS_MAXASYNCBIO;
 
 SYSCTL_DECL(_vfs_nfs);
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
-SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, "");
 
 static int nfs_request_setup(nfsm_info_t info);
 static int nfs_request_auth(struct nfsreq *rep);
@@ -183,8 +190,7 @@ int
 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 {
        struct socket *so;
-       int error, rcvreserve, sndreserve;
-       int pktscale;
+       int error;
        struct sockaddr *saddr;
        struct sockaddr_in *sin;
        struct thread *td = &thread0; /* only used for socreate and sobind */
@@ -283,23 +289,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
         * Get buffer reservation size from sysctl, but impose reasonable
         * limits.
         */
-       pktscale = nfs_bufpackets;
-       if (pktscale < 2)
-               pktscale = 2;
-       if (pktscale > 64)
-               pktscale = 64;
-
-       if (nmp->nm_sotype == SOCK_DGRAM) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else {
-               if (nmp->nm_sotype != SOCK_STREAM)
-                       panic("nfscon sotype");
+       if (nmp->nm_sotype == SOCK_STREAM) {
                if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
                        struct sockopt sopt;
                        int val;
@@ -324,13 +314,8 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
                        val = 1;
                        sosetopt(so, &sopt);
                }
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
-               rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
        }
-       error = soreserve(so, sndreserve, rcvreserve,
-                         &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
+       error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL);
        if (error)
                goto bad;
        so->so_rcv.ssb_flags |= SSB_NOINTR;
@@ -464,8 +449,17 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
                /*
                 * do backoff retransmit on client
                 */
-               if (rep)
+               if (rep) {
+                       if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) {
+                               rep->r_nmp->nm_state |= NFSSTA_SENDSPACE;
+                               kprintf("Warning: NFS: Insufficient sendspace "
+                                       "(%lu),\n"
+                                       "\t You must increase vfs.nfs.soreserve"
+                                       "or decrease vfs.nfs.maxasyncbio\n",
+                                       so->so_snd.ssb_hiwat);
+                       }
                        rep->r_flags |= R_NEEDSXMIT;
+               }
        }
 
        if (error) {
@@ -850,6 +844,8 @@ nfsmout:
 
                /*
                 * Fill in the rest of the reply if we found a match.
+                *
+                * Deal with duplicate responses if there was no match.
                 */
                if (rep) {
                        rep->r_md = info.md;
@@ -888,7 +884,7 @@ nfsmout:
                         *
                         * NOTE SRTT/SDRTT are only good if R_TIMING is set.
                         */
-                       if (rep->r_flags & R_TIMING) {
+                       if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) {
                                /*
                                 * Since the timer resolution of
                                 * NFS_HZ is so course, it can often
@@ -923,6 +919,25 @@ nfsmout:
                        nmp->nm_timeouts = 0;
                        rep->r_mrep = info.mrep;
                        nfs_hardterm(rep, 0);
+               } else {
+                       /*
+                        * Extract vers, prog, nfsver, procnum.  A duplicate
+                        * response means we didn't wait long enough so
+                        * we increase the SRTT to avoid future spurious
+                        * timeouts.
+                        */
+                       u_int procnum = nmp->nm_lastreprocnum;
+                       int n;
+
+                       if (procnum < NFS_NPROCS && proct[procnum]) {
+                               if (nfs_showrexmit)
+                                       kprintf("D");
+                               n = nmp->nm_srtt[proct[procnum]];
+                               n += NFS_ASYSCALE * NFS_HZ;
+                               if (n < NFS_ASYSCALE * NFS_HZ * 10)
+                                       n = NFS_ASYSCALE * NFS_HZ * 10;
+                               nmp->nm_srtt[proct[procnum]] = n;
+                       }
                }
                nfs_rcvunlock(nmp);
                crit_exit();
@@ -1338,6 +1353,10 @@ nfs_request_waitreply(struct nfsreq *rep)
        TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
        rep->r_flags &= ~R_ONREQQ;
        --nmp->nm_reqqlen;
+       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+           nmp->nm_reqqlen == NFS_MAXASYNCBIO * 2 / 3) {
+               nfssvc_iod_writer_wakeup(nmp);
+       }
        crit_exit();
 
        /*
@@ -1705,6 +1724,9 @@ nfs_timer_req(struct nfsreq *req)
         * to multiply by fairly large numbers.
         */
        if (req->r_rtt >= 0) {
+               /*
+                * Calculate the timeout to test against.
+                */
                req->r_rtt++;
                if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
                        timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
@@ -1713,6 +1735,7 @@ nfs_timer_req(struct nfsreq *req)
                } else {
                        timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
                }
+               timeo *= multt[req->r_procnum];
                /* timeo is still scaled by SCALE_BITS */
 
 #define NFSFS  (NFS_RTT_SCALE * NFS_HZ)
@@ -1784,13 +1807,12 @@ nfs_timer_req(struct nfsreq *req)
        /*
         * If there is enough space and the window allows.. resend it.
         *
-        * Set r_rtt to -1 in case we fail to send it now.
+        * r_rtt is left intact in case we get an answer after the
+        * retry that was a reply to the original packet.
         */
-       req->r_rtt = -1;
        if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
            (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
           (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
-               req->r_flags &= ~R_NEEDSXMIT;
                if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
                    error = so_pru_send(so, 0, m, NULL, NULL, td);
                else
@@ -1812,21 +1834,24 @@ nfs_timer_req(struct nfsreq *req)
                         * been filled in.  R_LOCKED will prevent
                         * the request from being ripped out from under
                         * us entirely.
+                        *
+                        * Record the last resent procnum to aid us
+                        * in duplicate detection on receive.
                         */
-                       if (req->r_flags & R_SENT) {
+                       if ((req->r_flags & R_NEEDSXMIT) == 0) {
                                if (nfs_showrexmit)
                                        kprintf("X");
-                               req->r_flags &= ~R_TIMING;
                                if (++req->r_rexmit > NFS_MAXREXMIT)
                                        req->r_rexmit = NFS_MAXREXMIT;
                                nmp->nm_maxasync_scaled >>= 1;
                                if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
                                        nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
                                nfsstats.rpcretries++;
+                               nmp->nm_lastreprocnum = req->r_procnum;
                        } else {
                                req->r_flags |= R_SENT;
+                               req->r_flags &= ~R_NEEDSXMIT;
                        }
-                       req->r_rtt = 0;
                }
        }
 }
@@ -1921,6 +1946,10 @@ nfs_hardterm(struct nfsreq *rep, int islocked)
                                 rep->r_info->state == NFSM_STATE_WAITREPLY);
                        rep->r_info->state = NFSM_STATE_PROCESSREPLY;
                        nfssvc_iod_reader_wakeup(nmp);
+                       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+                           nmp->nm_reqqlen == NFS_MAXASYNCBIO * 2 / 3) {
+                               nfssvc_iod_writer_wakeup(nmp);
+                       }
                }
                mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
        }
index 88f2a1b..cc49340 100644 (file)
@@ -104,7 +104,7 @@ static int nfs_privport = 0;
 SYSCTL_INT(_vfs_nfs, NFS_NFSPRIVPORT, nfs_privport, CTLFLAG_RW, &nfs_privport, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, gatherdelay, CTLFLAG_RW, &nfsrvw_procrastinate, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, gatherdelay_v3, CTLFLAG_RW, &nfsrvw_procrastinate_v3, 0, "");
-static int     nfs_soreserve = 65535;
+int    nfs_soreserve = NFS_MAXPACKET * NFS_MAXASYNCBIO;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, soreserve, CTLFLAG_RW, &nfs_soreserve, 0, "");
 
 /*
@@ -335,6 +335,9 @@ nfssvc_addsock(struct file *fp, struct sockaddr *mynam, struct thread *td)
         * Reserve buffer space in the socket.  Note that due to bugs in
         * Linux's delayed-ack code, serious performance degredation may
         * occur with linux hosts if the minimum is used.
+        *
+        * NFS sockets are not limited to the standard sb_max or by
+        * resource limits.
         */
        if (so->so_type == SOCK_STREAM)
                siz = NFS_MAXPACKET + sizeof (u_long);
@@ -342,14 +345,8 @@ nfssvc_addsock(struct file *fp, struct sockaddr *mynam, struct thread *td)
                siz = NFS_MAXPACKET;
        if (siz < nfs_soreserve)
            siz = nfs_soreserve;
-       if (siz > sb_max_adj) {
-           kprintf("Warning: vfs.nfs.soreserve (%d) "
-               "limited to adjusted sb_max (%ld)\n",
-               nfs_soreserve, sb_max_adj);
-           siz = sb_max_adj;
-       }
 
-       error = soreserve(so, siz, siz, &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
+       error = soreserve(so, siz, siz, NULL);
        if (error) {
                if (mynam != NULL)
                        FREE(mynam, M_SONAME);
index 3f71526..02421b2 100644 (file)
@@ -405,10 +405,10 @@ nfs_access(struct vop_access_args *ap)
                                bp = kmalloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
                                aiov.iov_base = bp;
                                aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
-                               error = nfs_readdirrpc(vp, &auio);
+                               error = nfs_readdirrpc_uio(vp, &auio);
                                kfree(bp, M_TEMP);
                        } else if (vp->v_type == VLNK) {
-                               error = nfs_readlinkrpc(vp, &auio);
+                               error = nfs_readlinkrpc_uio(vp, &auio);
                        } else {
                                error = EACCES;
                        }
@@ -1199,7 +1199,7 @@ nfs_readlink(struct vop_readlink_args *ap)
  * Called by nfs_doio() from below the buffer cache.
  */
 int
-nfs_readlinkrpc(struct vnode *vp, struct uio *uiop)
+nfs_readlinkrpc_uio(struct vnode *vp, struct uio *uiop)
 {
        int error = 0, len, attrflag;
        struct nfsm_info info;
@@ -1300,7 +1300,8 @@ nfsmout:
  * nfs write call
  */
 int
-nfs_writerpc(struct vnode *vp, struct uio *uiop, int *iomode, int *must_commit)
+nfs_writerpc_uio(struct vnode *vp, struct uio *uiop,
+                int *iomode, int *must_commit)
 {
        u_int32_t *tl;
        int32_t backup;
@@ -2232,7 +2233,7 @@ done:
  * be block-bounded.  It must convert to cookies for the actual RPC.
  */
 int
-nfs_readdirrpc(struct vnode *vp, struct uio *uiop)
+nfs_readdirrpc_uio(struct vnode *vp, struct uio *uiop)
 {
        int len, left;
        struct nfs_dirent *dp = NULL;
@@ -2446,7 +2447,7 @@ nfsmout:
  * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
  */
 int
-nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop)
+nfs_readdirplusrpc_uio(struct vnode *vp, struct uio *uiop)
 {
        int len, left;
        struct nfs_dirent *dp;
@@ -2846,9 +2847,12 @@ nfsmout:
 
 /*
  * Nfs Version 3 commit rpc
+ *
+ * We call it 'uio' to distinguish it from 'bio' but there is no real uio
+ * involved.
  */
 int
-nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct thread *td)
+nfs_commitrpc_uio(struct vnode *vp, u_quad_t offset, int cnt, struct thread *td)
 {
        struct nfsmount *nmp = VFSTONFS(vp->v_mount);
        int error = 0, wccflag = NFSV3_WCCRATTR;
@@ -2919,6 +2923,7 @@ nfs_strategy(struct vop_strategy_args *ap)
        struct bio *nbio;
        struct buf *bp = bio->bio_buf;
        struct thread *td;
+       int error;
 
        KASSERT(bp->b_cmd != BUF_CMD_DONE,
                ("nfs_strategy: buffer %p unexpectedly marked done", bp));
@@ -2950,11 +2955,12 @@ nfs_strategy(struct vop_strategy_args *ap)
         * otherwise just do it ourselves.
         */
        if (bio->bio_flags & BIO_SYNC) {
-               nfs_startio(ap->a_vp, nbio, td);
+               error = nfs_doio(ap->a_vp, nbio, td);
        } else {
                nfs_asyncio(ap->a_vp, nbio);
+               error = 0;
        }
-       return(0);
+       return (error);
 }
 
 /*
@@ -3243,8 +3249,8 @@ nfs_flush_docommit(struct nfs_flush_info *info, int error)
                if (error) {
                        retv = -error;
                } else {
-                       retv = nfs_commit(vp, info->beg_off, 
-                                         (int)bytes, info->td);
+                       retv = nfs_commitrpc_uio(vp, info->beg_off,
+                                                (int)bytes, info->td);
                        if (retv == NFSERR_STALEWRITEVERF)
                                nfs_clearcommit(vp->v_mount);
                }
index 13dd892..4c19e87 100644 (file)
@@ -738,7 +738,22 @@ nfsm_uiotom(nfsm_info_t info, struct uio *uiop, int len)
 {
        int error;
 
-       if ((error = nfsm_uiotombuf(uiop, &info->mb, len, &info->bpos)) != 0) {
+       error = nfsm_uiotombuf(uiop, &info->mb, len, &info->bpos);
+       if (error) {
+               m_freem(info->mreq);
+               info->mreq = NULL;
+               return (error);
+       }
+       return(0);
+}
+
+int
+nfsm_biotom(nfsm_info_t info, struct bio *bio, int off, int len)
+{
+       int error;
+
+       error = nfsm_biotombuf(bio, &info->mb, off, len, &info->bpos);
+       if (error) {
                m_freem(info->mreq);
                info->mreq = NULL;
                return (error);
@@ -1218,6 +1233,65 @@ nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos)
        return (0);
 }
 
+/*
+ * Copy siz bytes of buffer data, starting at offset off within the bio's
+ * underlying buffer, into the mbuf chain ending at *mq.  New mbufs (or
+ * clusters for large transfers) are appended as needed and the data is
+ * zero-padded out to the XDR 4-byte boundary.  *mq and *bpos are updated
+ * to the new chain tail and next free position.  Always returns 0.
+ */
+int
+nfsm_biotombuf(struct bio *bio, struct mbuf **mq, int off,
+              int siz, caddr_t *bpos)
+{
+       struct buf *bp = bio->bio_buf;
+       struct mbuf *mp, *mp2;
+       char *bio_cp;
+       int bio_left;
+       int xfer, mlen;
+       int rem;
+       boolean_t getcluster;
+       char *cp;
+
+       /* use mbuf clusters for payloads large enough to warrant them */
+       if (siz >= MINCLSIZE)
+               getcluster = TRUE;
+       else
+               getcluster = FALSE;
+       /* rem = zero-fill needed to round the payload up to XDR alignment */
+       rem = nfsm_rndup(siz) - siz;
+       mp = mp2 = *mq;
+
+       bio_cp = bp->b_data + off;
+       bio_left = siz;
+
+       /*
+        * Copy the data, appending a new mbuf/cluster whenever the
+        * current chain tail runs out of trailing space.
+        */
+       while (bio_left) {
+               mlen = M_TRAILINGSPACE(mp);
+               if (mlen == 0) {
+                       if (getcluster)
+                               mp = m_getcl(MB_WAIT, MT_DATA, 0);
+                       else
+                               mp = m_get(MB_WAIT, MT_DATA);
+                       mp->m_len = 0;
+                       mp2->m_next = mp;
+                       mp2 = mp;
+                       mlen = M_TRAILINGSPACE(mp);
+               }
+               xfer = (bio_left < mlen) ? bio_left : mlen;
+               bcopy(bio_cp, mtod(mp, caddr_t) + mp->m_len, xfer);
+               mp->m_len += xfer;
+               bio_left -= xfer;
+               bio_cp += xfer;
+       }
+       /*
+        * Zero-fill the XDR pad bytes (rem < 4, so a fresh mbuf always
+        * has room for them).
+        */
+       if (rem > 0) {
+               if (rem > M_TRAILINGSPACE(mp)) {
+                       MGET(mp, MB_WAIT, MT_DATA);
+                       mp->m_len = 0;
+                       mp2->m_next = mp;
+               }
+               cp = mtod(mp, caddr_t) + mp->m_len;
+               for (mlen = 0; mlen < rem; mlen++)
+                       *cp++ = '\0';
+               mp->m_len += rem;
+               *bpos = cp;
+       } else {
+               *bpos = mtod(mp, caddr_t) + mp->m_len;
+       }
+       *mq = mp;
+       return(0);
+}
+
 /*
  * Help break down an mbuf chain by setting the first siz bytes contiguous
  * pointed to by returned val.
index b10c763..c0f9369 100644 (file)
@@ -89,8 +89,15 @@ struct nfsm_info {
         */
        struct bio      *bio;
        void            (*done)(struct nfsm_info *);
+       union {
+               struct {
+                       int     must_commit;
+               } writerpc;
+       } u;
 };
 
+#define info_writerpc  u.writerpc
+
 typedef struct nfsm_info *nfsm_info_t;
 
 #define NULLOUT(nfsmexp)                               \
@@ -156,7 +163,7 @@ int nfsm_mtouio(nfsm_info_t info, struct uio *uiop, int len);
 int    nfsm_mtobio(nfsm_info_t info, struct bio *bio, int len);
 
 int    nfsm_uiotom(nfsm_info_t info, struct uio *uiop, int len);
-int    nfsm_biotom(nfsm_info_t info, struct bio *bio, int len);
+int    nfsm_biotom(nfsm_info_t info, struct bio *bio, int off, int len);
 int    nfsm_request(nfsm_info_t info, struct vnode *vp, int procnum,
                                thread_t td, struct ucred *cred, int *errorp);
 void   nfsm_request_bio(nfsm_info_t info, struct vnode *vp, int procnum,
@@ -178,7 +185,7 @@ int nfsm_mbuftobio(struct mbuf **mrep, struct bio *bio,
                                int siz, caddr_t *dpos);
 int    nfsm_uiotombuf (struct uio *uiop, struct mbuf **mq,
                                int siz, caddr_t *bpos);
-int    nfsm_biotombuf (struct bio *bio, struct mbuf **mq,
+int    nfsm_biotombuf (struct bio *bio, struct mbuf **mq, int off,
                                int siz, caddr_t *bpos);
 int    nfsm_disct(struct mbuf **mdp, caddr_t *dposp, int siz,
                                int left, caddr_t *cp2);
index 50e90ac..88b8673 100644 (file)
@@ -78,11 +78,12 @@ struct      nfsmount {
        struct  sockaddr *nm_nam;       /* Addr of server */
        int     nm_timeo;               /* Init timer for NFSMNT_DUMBTIMR */
        int     nm_retry;               /* Max retries */
-       int     nm_srtt[5];             /* Timers for rpcs */
-       int     nm_sdrtt[5];
+       int     nm_srtt[6];             /* Timers for rpcs (see proct[]) */
+       int     nm_sdrtt[6];
        int     nm_maxasync_scaled;     /* Used to control congestion */
        int     nm_timeouts;            /* Request timeouts */
        int     nm_deadthresh;          /* Threshold of timeouts-->dead server*/
+       u_int32_t nm_lastreprocnum;     /* Last resent procnum for dup detect */
        int     nm_rsize;               /* Max size of read rpc */
        int     nm_wsize;               /* Max size of write rpc */
        int     nm_readdirsize;         /* Size of a readdir rpc */