kernel - Fix excessive mbuf use in nfs_realign()
[dragonfly.git] / sys / vfs / nfs / nfs_socket.c
index f0b09ae..c241c3c 100644 (file)
 #include <sys/tprintf.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
+#include <sys/mutex.h>
+
 #include <sys/signal2.h>
+#include <sys/mutex2.h>
 
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #define        FALSE   0
 
 /*
- * Estimate rto for an nfs rpc sent via. an unreliable datagram.
- * Use the mean and mean deviation of rtt for the appropriate type of rpc
- * for the frequent rpcs and a default for the others.
- * The justification for doing "other" this way is that these rpcs
- * happen so infrequently that timer est. would probably be stale.
- * Also, since many of these rpcs are
- * non-idempotent, a conservative timeout is desired.
- * getattr, lookup - A+2D
- * read, write     - A+4D
- * other           - nm_timeo
- */
-#define        NFS_RTO(n, t) \
-       ((t) == 0 ? (n)->nm_timeo : \
-        ((t) < 3 ? \
-         (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
-         ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
-#define        NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
-#define        NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
-/*
- * External data, mostly RPC constants in XDR form
+ * RTT calculations are scaled by 256 (8 bits).  A proper fractional
+ * RTT will still be calculated even with a slow NFS timer.
  */
-extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
-       rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
-       rpc_auth_kerb;
-extern u_int32_t nfs_prog;
-extern struct nfsstats nfsstats;
-extern int nfsv3_procid[NFS_NPROCS];
-extern int nfs_ticks;
+#define        NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum]]
+#define        NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]]
+#define NFS_RTT_SCALE_BITS     8       /* bits */
+#define NFS_RTT_SCALE          256     /* value */
 
 /*
  * Defines which timer to use for the procnum.
@@ -118,52 +100,54 @@ extern int nfs_ticks;
  * 4 - write
  */
 static int proct[NFS_NPROCS] = {
-       0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-       0, 0, 0,
+       0, 1, 0, 2, 1, 3, 3, 4, 0, 0,   /* 00-09        */
+       0, 0, 0, 0, 0, 0, 3, 3, 0, 0,   /* 10-19        */
+       0, 5, 0, 0, 0, 0,               /* 20-29        */
 };
 
+static int multt[NFS_NPROCS] = {
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 00-09        */
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 10-19        */
+       1, 2, 1, 1, 1, 1,               /* 20-29        */
+};
+
+static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
 static int nfs_realign_test;
 static int nfs_realign_count;
-static int nfs_bufpackets = 4;
-static int nfs_timer_raced;
+static int nfs_showrtt;
+static int nfs_showrexmit;
+int nfs_maxasyncbio = NFS_MAXASYNCBIO;
 
 SYSCTL_DECL(_vfs_nfs);
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
-SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, "");
 
+static int nfs_request_setup(nfsm_info_t info);
+static int nfs_request_auth(struct nfsreq *rep);
+static int nfs_request_try(struct nfsreq *rep);
+static int nfs_request_waitreply(struct nfsreq *rep);
+static int nfs_request_processreply(nfsm_info_t info, int);
 
-/*
- * There is a congestion window for outstanding rpcs maintained per mount
- * point. The cwnd size is adjusted in roughly the way that:
- * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
- * SIGCOMM '88". ACM, August 1988.
- * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
- * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
- * of rpcs is in progress.
- * (The sent count and cwnd are scaled for integer arith.)
- * Variants of "slow start" were tried and were found to be too much of a
- * performance hit (ave. rtt 3 times larger),
- * I suspect due to the large rtt that nfs rpcs have.
- */
-#define        NFS_CWNDSCALE   256
-#define        NFS_MAXCWND     (NFS_CWNDSCALE * 32)
-static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
 int nfsrtton = 0;
 struct nfsrtt nfsrtt;
 struct callout nfs_timer_handle;
 
 static int     nfs_msg (struct thread *,char *,char *);
-static int     nfs_rcvlock (struct nfsreq *);
-static void    nfs_rcvunlock (struct nfsreq *);
+static int     nfs_rcvlock (struct nfsmount *nmp, struct nfsreq *myreq);
+static void    nfs_rcvunlock (struct nfsmount *nmp);
 static void    nfs_realign (struct mbuf **pm, int hsiz);
-static int     nfs_receive (struct nfsreq *rep, struct sockaddr **aname,
-                                struct mbuf **mp);
-static void    nfs_softterm (struct nfsreq *rep);
-static int     nfs_reconnect (struct nfsreq *rep);
+static int     nfs_receive (struct nfsmount *nmp, struct nfsreq *rep,
+                               struct sockaddr **aname, struct mbuf **mp);
+static void    nfs_softterm (struct nfsreq *rep, int islocked);
+static void    nfs_hardterm (struct nfsreq *rep, int islocked);
+static int     nfs_reconnect (struct nfsmount *nmp, struct nfsreq *rep);
 #ifndef NFS_NOSERVER 
 static int     nfsrv_getstream (struct nfssvc_sock *, int, int *);
+static void    nfs_timer_req(struct nfsreq *req);
 
 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
                                    struct nfssvc_sock *slp,
@@ -206,19 +190,19 @@ int
 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 {
        struct socket *so;
-       int error, rcvreserve, sndreserve;
-       int pktscale;
+       int error;
        struct sockaddr *saddr;
        struct sockaddr_in *sin;
        struct thread *td = &thread0; /* only used for socreate and sobind */
 
-       nmp->nm_so = (struct socket *)0;
+       nmp->nm_so = so = NULL;
+       if (nmp->nm_flag & NFSMNT_FORCE)
+               return (EINVAL);
        saddr = nmp->nm_nam;
-       error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
+       error = socreate(saddr->sa_family, &so, nmp->nm_sotype,
                nmp->nm_soproto, td);
        if (error)
                goto bad;
-       so = nmp->nm_so;
        nmp->nm_soflags = so->so_proto->pr_flags;
 
        /*
@@ -306,23 +290,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
         * Get buffer reservation size from sysctl, but impose reasonable
         * limits.
         */
-       pktscale = nfs_bufpackets;
-       if (pktscale < 2)
-               pktscale = 2;
-       if (pktscale > 64)
-               pktscale = 64;
-
-       if (nmp->nm_sotype == SOCK_DGRAM) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else {
-               if (nmp->nm_sotype != SOCK_STREAM)
-                       panic("nfscon sotype");
+       if (nmp->nm_sotype == SOCK_STREAM) {
                if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
                        struct sockopt sopt;
                        int val;
@@ -347,13 +315,8 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
                        val = 1;
                        sosetopt(so, &sopt);
                }
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
-               rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
        }
-       error = soreserve(so, sndreserve, rcvreserve,
-                         &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
+       error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL);
        if (error)
                goto bad;
        so->so_rcv.ssb_flags |= SSB_NOINTR;
@@ -361,16 +324,24 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 
        /* Initialize other non-zero congestion variables */
        nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = 
-               nmp->nm_srtt[3] = (NFS_TIMEO << 3);
+               nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS);
        nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
                nmp->nm_sdrtt[3] = 0;
-       nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
-       nmp->nm_sent = 0;
+       nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
        nmp->nm_timeouts = 0;
+
+       /*
+        * Assign nm_so last.  The moment nm_so is assigned the nfs_timer()
+        * can mess with the socket.
+        */
+       nmp->nm_so = so;
        return (0);
 
 bad:
-       nfs_disconnect(nmp);
+       if (so) {
+               soshutdown(so, SHUT_RDWR);
+               soclose(so, FNONBLOCK);
+       }
        return (error);
 }
 
@@ -379,21 +350,26 @@ bad:
  * Called when a connection is broken on a reliable protocol.
  * - clean up the old socket
  * - nfs_connect() again
- * - set R_MUSTRESEND for all outstanding requests on mount point
+ * - set R_NEEDSXMIT for all outstanding requests on mount point
  * If this fails the mount point is DEAD!
  * nb: Must be called with the nfs_sndlock() set on the mount point.
  */
 static int
-nfs_reconnect(struct nfsreq *rep)
+nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep)
 {
-       struct nfsreq *rp;
-       struct nfsmount *nmp = rep->r_nmp;
+       struct nfsreq *req;
        int error;
 
        nfs_disconnect(nmp);
+       if (nmp->nm_rxstate >= NFSSVC_STOPPING)
+               return (EINTR);
        while ((error = nfs_connect(nmp, rep)) != 0) {
                if (error == EINTR || error == ERESTART)
                        return (EINTR);
+               if (error == EINVAL)
+                       return (error);
+               if (nmp->nm_rxstate >= NFSSVC_STOPPING)
+                       return (EINTR);
                (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
        }
 
@@ -402,9 +378,9 @@ nfs_reconnect(struct nfsreq *rep)
         * on old socket.
         */
        crit_enter();
-       TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
-               if (rp->r_nmp == nmp)
-                       rp->r_flags |= R_MUSTRESEND;
+       TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
+               KKASSERT(req->r_nmp == nmp);
+               req->r_flags |= R_NEEDSXMIT;
        }
        crit_exit();
        return (0);
@@ -420,7 +396,7 @@ nfs_disconnect(struct nfsmount *nmp)
 
        if (nmp->nm_so) {
                so = nmp->nm_so;
-               nmp->nm_so = (struct socket *)0;
+               nmp->nm_so = NULL;
                soshutdown(so, SHUT_RDWR);
                soclose(so, FNONBLOCK);
        }
@@ -429,14 +405,9 @@ nfs_disconnect(struct nfsmount *nmp)
 void
 nfs_safedisconnect(struct nfsmount *nmp)
 {
-       struct nfsreq dummyreq;
-
-       bzero(&dummyreq, sizeof(dummyreq));
-       dummyreq.r_nmp = nmp;
-       dummyreq.r_td = NULL;
-       nfs_rcvlock(&dummyreq);
+       nfs_rcvlock(nmp, NULL);
        nfs_disconnect(nmp);
-       nfs_rcvunlock(&dummyreq);
+       nfs_rcvunlock(nmp);
 }
 
 /*
@@ -445,7 +416,7 @@ nfs_safedisconnect(struct nfsmount *nmp)
  * "rep == NULL" indicates that it has been called from a server.
  * For the client side:
  * - return EINTR if the RPC is terminated, 0 otherwise
- * - set R_MUSTRESEND if the send fails for any reason
+ * - set R_NEEDSXMIT if the send fails for any reason
  * - do any cleanup required by recoverable socket errors (?)
  * For the server side:
  * - return EINTR or ERESTART if interrupted by a signal
@@ -465,16 +436,17 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
                        return (EINTR);
                }
                if ((so = rep->r_nmp->nm_so) == NULL) {
-                       rep->r_flags |= R_MUSTRESEND;
+                       rep->r_flags |= R_NEEDSXMIT;
                        m_freem(top);
                        return (0);
                }
-               rep->r_flags &= ~R_MUSTRESEND;
+               rep->r_flags &= ~R_NEEDSXMIT;
                soflags = rep->r_nmp->nm_soflags;
-       } else
+       } else {
                soflags = so->so_proto->pr_flags;
+       }
        if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
-               sendnam = (struct sockaddr *)0;
+               sendnam = NULL;
        else
                sendnam = nam;
        if (so->so_type == SOCK_SEQPACKET)
@@ -490,8 +462,20 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
         */
        if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
                error = 0;
-               if (rep)                /* do backoff retransmit on client */
-                       rep->r_flags |= R_MUSTRESEND;
+               /*
+                * do backoff retransmit on client
+                */
+               if (rep) {
+                       if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) {
+                               rep->r_nmp->nm_state |= NFSSTA_SENDSPACE;
+                               kprintf("Warning: NFS: Insufficient sendspace "
+                                       "(%lu),\n"
+                                       "\t You must increase vfs.nfs.soreserve "
+                                       "or decrease vfs.nfs.maxasyncbio\n",
+                                       so->so_snd.ssb_hiwat);
+                       }
+                       rep->r_flags |= R_NEEDSXMIT;
+               }
        }
 
        if (error) {
@@ -504,9 +488,10 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
                        if (rep->r_flags & R_SOFTTERM)
                                error = EINTR;
                        else
-                               rep->r_flags |= R_MUSTRESEND;
-               } else
+                               rep->r_flags |= R_NEEDSXMIT;
+               } else {
                        log(LOG_INFO, "nfsd send error %d\n", error);
+               }
 
                /*
                 * Handle any recoverable (soft) socket errors here. (?)
@@ -528,7 +513,8 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
  * we have read any of it, even if the system call has been interrupted.
  */
 static int
-nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
+nfs_receive(struct nfsmount *nmp, struct nfsreq *rep,
+           struct sockaddr **aname, struct mbuf **mp)
 {
        struct socket *so;
        struct sockbuf sio;
@@ -546,7 +532,7 @@ nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
         */
        *mp = NULL;
        *aname = NULL;
-       sotype = rep->r_nmp->nm_sotype;
+       sotype = nmp->nm_sotype;
 
        /*
         * For reliable protocols, lock against other senders/receivers
@@ -557,7 +543,7 @@ nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
         * until we have an entire rpc request/reply.
         */
        if (sotype != SOCK_DGRAM) {
-               error = nfs_sndlock(rep);
+               error = nfs_sndlock(nmp, rep);
                if (error)
                        return (error);
 tryagain:
@@ -570,33 +556,33 @@ tryagain:
                 * attempt that has essentially shut down this
                 * mount point.
                 */
-               if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
-                       nfs_sndunlock(rep);
+               if (rep && (rep->r_mrep || (rep->r_flags & R_SOFTTERM))) {
+                       nfs_sndunlock(nmp);
                        return (EINTR);
                }
-               so = rep->r_nmp->nm_so;
-               if (!so) {
-                       error = nfs_reconnect(rep);
+               so = nmp->nm_so;
+               if (so == NULL) {
+                       error = nfs_reconnect(nmp, rep);
                        if (error) {
-                               nfs_sndunlock(rep);
+                               nfs_sndunlock(nmp);
                                return (error);
                        }
                        goto tryagain;
                }
-               while (rep->r_flags & R_MUSTRESEND) {
+               while (rep && (rep->r_flags & R_NEEDSXMIT)) {
                        m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
                        nfsstats.rpcretries++;
                        error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
                        if (error) {
                                if (error == EINTR || error == ERESTART ||
-                                   (error = nfs_reconnect(rep)) != 0) {
-                                       nfs_sndunlock(rep);
+                                   (error = nfs_reconnect(nmp, rep)) != 0) {
+                                       nfs_sndunlock(nmp);
                                        return (error);
                                }
                                goto tryagain;
                        }
                }
-               nfs_sndunlock(rep);
+               nfs_sndunlock(nmp);
                if (sotype == SOCK_STREAM) {
                        /*
                         * Get the length marker from the stream
@@ -629,7 +615,7 @@ tryagain:
                                 "short receive (%d/%d) from nfs server %s\n",
                                 (int)(sizeof(u_int32_t) - auio.uio_resid),
                                 (int)sizeof(u_int32_t),
-                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                                nmp->nm_mountp->mnt_stat.f_mntfromname);
                            error = EPIPE;
                        }
                        if (error)
@@ -643,7 +629,7 @@ tryagain:
                            log(LOG_ERR, "%s (%d) from nfs server %s\n",
                                "impossible packet length",
                                len,
-                               rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                               nmp->nm_mountp->mnt_stat.f_mntfromname);
                            error = EFBIG;
                            goto errout;
                        }
@@ -661,9 +647,9 @@ tryagain:
                        if (error == 0 && sio.sb_cc != len) {
                            if (sio.sb_cc != 0)
                            log(LOG_INFO,
-                               "short receive (%d/%d) from nfs server %s\n",
-                               len - auio.uio_resid, len,
-                               rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                               "short receive (%zu/%d) from nfs server %s\n",
+                               (size_t)len - auio.uio_resid, len,
+                               nmp->nm_mountp->mnt_stat.f_mntfromname);
                            error = EPIPE;
                        }
                        *mp = sio.sb_mb;
@@ -707,19 +693,19 @@ errout:
                                log(LOG_INFO,
                                    "receive error %d from nfs server %s\n",
                                    error,
-                                rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
+                                nmp->nm_mountp->mnt_stat.f_mntfromname);
                        }
-                       error = nfs_sndlock(rep);
+                       error = nfs_sndlock(nmp, rep);
                        if (!error) {
-                               error = nfs_reconnect(rep);
+                               error = nfs_reconnect(nmp, rep);
                                if (!error)
                                        goto tryagain;
                                else
-                                       nfs_sndunlock(rep);
+                                       nfs_sndunlock(nmp);
                        }
                }
        } else {
-               if ((so = rep->r_nmp->nm_so) == NULL)
+               if ((so = nmp->nm_so) == NULL)
                        return (EACCES);
                if (so->so_state & SS_ISCONNECTED)
                        getnam = NULL;
@@ -730,19 +716,28 @@ errout:
                        rcvflg = 0;
                        error =  so_pru_soreceive(so, getnam, NULL, &sio,
                                                  NULL, &rcvflg);
-                       if (error == EWOULDBLOCK &&
+                       if (error == EWOULDBLOCK && rep &&
                            (rep->r_flags & R_SOFTTERM)) {
                                m_freem(sio.sb_mb);
                                return (EINTR);
                        }
                } while (error == EWOULDBLOCK);
+
                len = sio.sb_cc;
                *mp = sio.sb_mb;
+
+               /*
+                * A shutdown may result in no error and no mbuf.
+                * Convert to EPIPE.
+                */
+               if (*mp == NULL && error == 0)
+                       error = EPIPE;
        }
        if (error) {
                m_freem(*mp);
                *mp = NULL;
        }
+
        /*
         * Search for any mbufs that are not a multiple of 4 bytes long
         * or with m_data not longword aligned.
@@ -755,21 +750,23 @@ errout:
 
 /*
  * Implement receipt of reply on a socket.
+ *
  * We must search through the list of received datagrams matching them
  * with outstanding requests using the xid, until ours is found.
+ *
+ * If myrep is NULL we process packets on the socket until
+ * interrupted or until nm_reqrxq is non-empty.
  */
 /* ARGSUSED */
 int
-nfs_reply(struct nfsreq *myrep)
+nfs_reply(struct nfsmount *nmp, struct nfsreq *myrep)
 {
        struct nfsreq *rep;
-       struct nfsmount *nmp = myrep->r_nmp;
-       int32_t t1;
-       struct mbuf *mrep, *md;
        struct sockaddr *nam;
-       u_int32_t rxid, *tl;
-       caddr_t dpos, cp2;
+       u_int32_t rxid;
+       u_int32_t *tl;
        int error;
+       struct nfsm_info info;
 
        /*
         * Loop around until we get our own reply
@@ -780,30 +777,51 @@ nfs_reply(struct nfsreq *myrep)
                 * sbwait() after someone else has received my reply for me.
                 * Also necessary for connection based protocols to avoid
                 * race conditions during a reconnect.
+                *
                 * If nfs_rcvlock() returns EALREADY, that means that
                 * the reply has already been received by another
                 * process and we can return immediately.  In this
                 * case, the lock is not taken to avoid races with
                 * other processes.
                 */
-               error = nfs_rcvlock(myrep);
+               info.mrep = NULL;
+
+               error = nfs_rcvlock(nmp, myrep);
                if (error == EALREADY)
                        return (0);
                if (error)
                        return (error);
+
+               /*
+                * If myrep is NULL we are the receiver helper thread.
+                * Stop waiting for incoming replies if there are
+                * messages sitting on reqrxq that we need to process,
+                * or if a shutdown request is pending.
+                */
+               if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) ||
+                   nmp->nm_rxstate > NFSSVC_PENDING)) {
+                       nfs_rcvunlock(nmp);
+                       return(EWOULDBLOCK);
+               }
+
                /*
                 * Get the next Rpc reply off the socket
+                *
+                * We cannot release the receive lock until we've
+                * filled in rep->r_mrep, otherwise a waiting
+                * thread may deadlock in soreceive with no incoming
+                * packets expected.
                 */
-               error = nfs_receive(myrep, &nam, &mrep);
-               nfs_rcvunlock(myrep);
+               error = nfs_receive(nmp, myrep, &nam, &info.mrep);
                if (error) {
                        /*
                         * Ignore routing errors on connectionless protocols??
                         */
+                       nfs_rcvunlock(nmp);
                        if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
+                               if (nmp->nm_so == NULL)
+                                       return (error);
                                nmp->nm_so->so_error = 0;
-                               if (myrep->r_flags & R_GETONEREP)
-                                       return (0);
                                continue;
                        }
                        return (error);
@@ -814,16 +832,16 @@ nfs_reply(struct nfsreq *myrep)
                /*
                 * Get the xid and check that it is an rpc reply
                 */
-               md = mrep;
-               dpos = mtod(md, caddr_t);
-               nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
+               info.md = info.mrep;
+               info.dpos = mtod(info.md, caddr_t);
+               NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED));
                rxid = *tl++;
                if (*tl != rpc_reply) {
                        nfsstats.rpcinvalid++;
-                       m_freem(mrep);
+                       m_freem(info.mrep);
+                       info.mrep = NULL;
 nfsmout:
-                       if (myrep->r_flags & R_GETONEREP)
-                               return (0);
+                       nfs_rcvunlock(nmp);
                        continue;
                }
 
@@ -835,28 +853,27 @@ nfsmout:
                 * section.
                 */
                crit_enter();
-               TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
-                       if (rep->r_mrep == NULL && rxid == rep->r_xid) {
-                               rep->r_mrep = mrep;
+               TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) {
+                       if (rep->r_mrep == NULL && rxid == rep->r_xid)
                                break;
-                       }
                }
-               crit_exit();
 
                /*
                 * Fill in the rest of the reply if we found a match.
+                *
+                * Deal with duplicate responses if there was no match.
                 */
                if (rep) {
-                       rep->r_md = md;
-                       rep->r_dpos = dpos;
+                       rep->r_md = info.md;
+                       rep->r_dpos = info.dpos;
                        if (nfsrtton) {
                                struct rttl *rt;
 
                                rt = &nfsrtt.rttl[nfsrtt.pos];
                                rt->proc = rep->r_procnum;
-                               rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
-                               rt->sent = nmp->nm_sent;
-                               rt->cwnd = nmp->nm_cwnd;
+                               rt->rto = 0;
+                               rt->sent = 0;
+                               rt->cwnd = nmp->nm_maxasync_scaled;
                                rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
                                rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
                                rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
@@ -867,29 +884,23 @@ nfsmout:
                                        rt->rtt = 1000000;
                                nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
                        }
+
                        /*
-                        * Update congestion window.
-                        * Do the additive increase of
-                        * one rpc/rtt.
+                        * New congestion control is based only on async
+                        * requests.
                         */
-                       if (nmp->nm_cwnd <= nmp->nm_sent) {
-                               nmp->nm_cwnd +=
-                                  (NFS_CWNDSCALE * NFS_CWNDSCALE +
-                                  (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
-                               if (nmp->nm_cwnd > NFS_MAXCWND)
-                                       nmp->nm_cwnd = NFS_MAXCWND;
-                       }
-                       crit_enter();   /* nfs_timer interlock for nm_sent */
+                       if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED)
+                               ++nmp->nm_maxasync_scaled;
                        if (rep->r_flags & R_SENT) {
                                rep->r_flags &= ~R_SENT;
-                               nmp->nm_sent -= NFS_CWNDSCALE;
                        }
-                       crit_exit();
                        /*
                         * Update rtt using a gain of 0.125 on the mean
                         * and a gain of 0.25 on the deviation.
+                        *
+                        * NOTE SRTT/SDRTT are only good if R_TIMING is set.
                         */
-                       if (rep->r_flags & R_TIMING) {
+                       if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) {
                                /*
                                 * Since the timer resolution of
                                 * NFS_HZ is so coarse, it can often
@@ -898,33 +909,194 @@ nfsmout:
                                 * rtt is between N+dt and N+2-dt ticks,
                                 * add 1.
                                 */
-                               t1 = rep->r_rtt + 1;
-                               t1 -= (NFS_SRTT(rep) >> 3);
-                               NFS_SRTT(rep) += t1;
-                               if (t1 < 0)
-                                       t1 = -t1;
-                               t1 -= (NFS_SDRTT(rep) >> 2);
-                               NFS_SDRTT(rep) += t1;
+                               int n;
+                               int d;
+
+#define NFSRSB NFS_RTT_SCALE_BITS
+                               n = ((NFS_SRTT(rep) * 7) +
+                                    (rep->r_rtt << NFSRSB)) >> 3;
+                               d = n - NFS_SRTT(rep);
+                               NFS_SRTT(rep) = n;
+
+                               /*
+                                * Don't let the jitter calculation decay
+                                * too quickly, but we want a fast rampup.
+                                */
+                               if (d < 0)
+                                       d = -d;
+                               d <<= NFSRSB;
+                               if (d < NFS_SDRTT(rep))
+                                       n = ((NFS_SDRTT(rep) * 15) + d) >> 4;
+                               else
+                                       n = ((NFS_SDRTT(rep) * 3) + d) >> 2;
+                               NFS_SDRTT(rep) = n;
+#undef NFSRSB
                        }
                        nmp->nm_timeouts = 0;
+                       rep->r_mrep = info.mrep;
+                       nfs_hardterm(rep, 0);
+               } else {
+                       /*
+                        * Extract vers, prog, nfsver, procnum.  A duplicate
+                        * response means we didn't wait long enough so
+                        * we increase the SRTT to avoid future spurious
+                        * timeouts.
+                        */
+                       u_int procnum = nmp->nm_lastreprocnum;
+                       int n;
+
+                       if (procnum < NFS_NPROCS && proct[procnum]) {
+                               if (nfs_showrexmit)
+                                       kprintf("D");
+                               n = nmp->nm_srtt[proct[procnum]];
+                               n += NFS_ASYSCALE * NFS_HZ;
+                               if (n < NFS_ASYSCALE * NFS_HZ * 10)
+                                       n = NFS_ASYSCALE * NFS_HZ * 10;
+                               nmp->nm_srtt[proct[procnum]] = n;
+                       }
                }
+               nfs_rcvunlock(nmp);
+               crit_exit();
+
                /*
                 * If not matched to a request, drop it.
                 * If it's mine, get out.
                 */
                if (rep == NULL) {
                        nfsstats.rpcunexpected++;
-                       m_freem(mrep);
+                       m_freem(info.mrep);
+                       info.mrep = NULL;
                } else if (rep == myrep) {
                        if (rep->r_mrep == NULL)
                                panic("nfsreply nil");
                        return (0);
                }
-               if (myrep->r_flags & R_GETONEREP)
-                       return (0);
        }
 }
 
+/*
+ * Run the request state machine until the target state is reached
+ * or a fatal error occurs.  The target state is not run.  Specifying
+ * a target of NFSM_STATE_DONE runs the state machine until the rpc
+ * is complete.
+ *
+ * EINPROGRESS is returned for all states other than the DONE state,
+ * indicating that the rpc is still in progress.
+ */
+int
+nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate)
+{
+       struct nfsreq *req;
+
+       while (info->state >= bstate && info->state < estate) {
+               switch(info->state) {
+               case NFSM_STATE_SETUP:
+                       /*
+                        * Setup the nfsreq.  Any error which occurs during
+                        * this state is fatal.
+                        */
+                       info->error = nfs_request_setup(info);
+                       if (info->error) {
+                               info->state = NFSM_STATE_DONE;
+                               return (info->error);
+                       } else {
+                               req = info->req;
+                               req->r_mrp = &info->mrep;
+                               req->r_mdp = &info->md;
+                               req->r_dposp = &info->dpos;
+                               info->state = NFSM_STATE_AUTH;
+                       }
+                       break;
+               case NFSM_STATE_AUTH:
+                       /*
+                        * Authenticate the nfsreq.  Any error which occurs
+                        * during this state is fatal.
+                        */
+                       info->error = nfs_request_auth(info->req);
+                       if (info->error) {
+                               info->state = NFSM_STATE_DONE;
+                               return (info->error);
+                       } else {
+                               info->state = NFSM_STATE_TRY;
+                       }
+                       break;
+               case NFSM_STATE_TRY:
+                       /*
+                        * Transmit or retransmit attempt.  An error in this
+                        * state is ignored and we always move on to the
+                        * next state.
+                        *
+                        * This can trivially race the receiver if the
+                        * request is asynchronous.  nfs_request_try()
+                        * will thus set the state for us and we
+                        * must also return immediately if we are
+                        * running an async state machine, because
+                        * info can become invalid due to races after
+                        * try() returns.
+                        */
+                       if (info->req->r_flags & R_ASYNC) {
+                               nfs_request_try(info->req);
+                               if (estate == NFSM_STATE_WAITREPLY)
+                                       return (EINPROGRESS);
+                       } else {
+                               nfs_request_try(info->req);
+                               info->state = NFSM_STATE_WAITREPLY;
+                       }
+                       break;
+               case NFSM_STATE_WAITREPLY:
+                       /*
+                        * Wait for a reply or timeout and move on to the
+                        * next state.  The error returned by this state
+                        * is passed to the processing code in the next
+                        * state.
+                        */
+                       info->error = nfs_request_waitreply(info->req);
+                       info->state = NFSM_STATE_PROCESSREPLY;
+                       break;
+               case NFSM_STATE_PROCESSREPLY:
+                       /*
+                        * Process the reply or timeout.  Errors which occur
+                        * in this state may cause the state machine to
+                        * go back to an earlier state, and are fatal
+                        * otherwise.
+                        */
+                       info->error = nfs_request_processreply(info,
+                                                              info->error);
+                       switch(info->error) {
+                       case ENEEDAUTH:
+                               info->state = NFSM_STATE_AUTH;
+                               break;
+                       case EAGAIN:
+                               info->state = NFSM_STATE_TRY;
+                               break;
+                       default:
+                               /*
+                                * Operation complete, with or without an
+                                * error.  We are done.
+                                */
+                               info->req = NULL;
+                               info->state = NFSM_STATE_DONE;
+                               return (info->error);
+                       }
+                       break;
+               case NFSM_STATE_DONE:
+                       /*
+                        * Shouldn't be reached
+                        */
+                       return (info->error);
+                       /* NOT REACHED */
+               }
+       }
+
+       /*
+        * If we are done return the error code (if any).
+        * Otherwise return EINPROGRESS.
+        */
+       if (info->state == NFSM_STATE_DONE)
+               return (info->error);
+       return (EINPROGRESS);
+}
+
 /*
  * nfs_request - goes something like this
  *     - fill in request struct
@@ -935,64 +1107,89 @@ nfsmout:
  *       by mrep or error
  * nb: always frees up mreq mbuf list
  */
-int
-nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
-           struct thread *td, struct ucred *cred, struct mbuf **mrp,
-           struct mbuf **mdp, caddr_t *dposp)
+static int
+nfs_request_setup(nfsm_info_t info)
 {
-       struct mbuf *mrep, *m2;
-       struct nfsreq *rep;
-       u_int32_t *tl;
-       int i;
+       struct nfsreq *req;
        struct nfsmount *nmp;
-       struct mbuf *m, *md, *mheadend;
-       char nickv[RPCX_NICKVERF];
-       time_t waituntil;
-       caddr_t dpos, cp2;
-       int t1, error = 0, mrest_len, auth_len, auth_type;
-       int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0;
-       int verf_len, verf_type;
-       u_int32_t xid;
-       char *auth_str, *verf_str;
-       NFSKERBKEY_T key;               /* save session key */
+       struct mbuf *m;
+       int i;
 
-       /* Reject requests while attempting a forced unmount. */
-       if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
-               m_freem(mrest);
+       /*
+        * Reject requests while attempting a forced unmount.
+        */
+       if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
+               m_freem(info->mreq);
+               info->mreq = NULL;
                return (ESTALE);
        }
-       nmp = VFSTONFS(vp->v_mount);
-       MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
-       rep->r_nmp = nmp;
-       rep->r_vp = vp;
-       rep->r_td = td;
-       rep->r_procnum = procnum;
-       rep->r_mreq = NULL;
+       nmp = VFSTONFS(info->vp->v_mount);
+       req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
+       req->r_nmp = nmp;
+       req->r_vp = info->vp;
+       req->r_td = info->td;
+       req->r_procnum = info->procnum;
+       req->r_mreq = NULL;
+       req->r_cred = info->cred;
+
        i = 0;
-       m = mrest;
+       m = info->mreq;
        while (m) {
                i += m->m_len;
                m = m->m_next;
        }
-       mrest_len = i;
+       req->r_mrest = info->mreq;
+       req->r_mrest_len = i;
+
+       /*
+        * The presence of a non-NULL r_info in req indicates
+        * async completion via our helper threads.  See the receiver
+        * code.
+        */
+       if (info->bio) {
+               req->r_info = info;
+               req->r_flags = R_ASYNC;
+       } else {
+               req->r_info = NULL;
+               req->r_flags = 0;
+       }
+       info->req = req;
+       return(0);
+}
+
+static int
+nfs_request_auth(struct nfsreq *rep)
+{
+       struct nfsmount *nmp = rep->r_nmp;
+       struct mbuf *m;
+       char nickv[RPCX_NICKVERF];
+       int error = 0, auth_len, auth_type;
+       int verf_len;
+       u_int32_t xid;
+       char *auth_str, *verf_str;
+       struct ucred *cred;
+
+       cred = rep->r_cred;
+       rep->r_failed_auth = 0;
 
        /*
         * Get the RPC header with authorization.
         */
-kerbauth:
-       verf_str = auth_str = (char *)0;
+       verf_str = auth_str = NULL;
        if (nmp->nm_flag & NFSMNT_KERB) {
                verf_str = nickv;
                verf_len = sizeof (nickv);
                auth_type = RPCAUTH_KERB4;
-               bzero((caddr_t)key, sizeof (key));
-               if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
-                       &auth_len, verf_str, verf_len)) {
+               bzero((caddr_t)rep->r_key, sizeof(rep->r_key));
+               if (rep->r_failed_auth ||
+                   nfs_getnickauth(nmp, cred, &auth_str, &auth_len,
+                                   verf_str, verf_len)) {
                        error = nfs_getauth(nmp, rep, cred, &auth_str,
-                               &auth_len, verf_str, &verf_len, key);
+                               &auth_len, verf_str, &verf_len, rep->r_key);
                        if (error) {
+                               m_freem(rep->r_mrest);
+                               rep->r_mrest = NULL;
                                kfree((caddr_t)rep, M_NFSREQ);
-                               m_freem(mrest);
                                return (error);
                        }
                }
@@ -1004,8 +1201,10 @@ kerbauth:
                        nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
                        5 * NFSX_UNSIGNED;
        }
-       m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
-            auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
+       m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type,
+                       auth_len, auth_str, verf_len, verf_str,
+                       rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid);
+       rep->r_mrest = NULL;
        if (auth_str)
                kfree(auth_str, M_TEMP);
 
@@ -1023,16 +1222,37 @@ kerbauth:
        }
        rep->r_mreq = m;
        rep->r_xid = xid;
-tryagain:
+       return (0);
+}
+
+static int
+nfs_request_try(struct nfsreq *rep)
+{
+       struct nfsmount *nmp = rep->r_nmp;
+       struct mbuf *m2;
+       int error;
+
+       /*
+        * Request is not on any queue, only the owner has access to it
+        * so it should not be locked by anyone atm.
+        *
+        * Interlock to prevent races.  While locked the only remote
+        * action possible is for r_mrep to be set (once we enqueue it).
+        */
+       if (rep->r_flags == 0xdeadc0de) {
+               print_backtrace(-1);
+               panic("flags nbad\n");
+       }
+       KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0);
        if (nmp->nm_flag & NFSMNT_SOFT)
                rep->r_retry = nmp->nm_retry;
        else
                rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
        rep->r_rtt = rep->r_rexmit = 0;
-       if (proct[procnum] > 0)
-               rep->r_flags = R_TIMING | R_MASKTIMER;
+       if (proct[rep->r_procnum] > 0)
+               rep->r_flags |= R_TIMING | R_LOCKED;
        else
-               rep->r_flags = R_MASKTIMER;
+               rep->r_flags |= R_LOCKED;
        rep->r_mrep = NULL;
 
        /*
@@ -1040,117 +1260,198 @@ tryagain:
         */
        nfsstats.rpcrequests++;
 
+       if (nmp->nm_flag & NFSMNT_FORCE) {
+               rep->r_flags |= R_SOFTTERM;
+               rep->r_flags &= ~R_LOCKED;
+               return (0);
+       }
+
        /*
         * Chain request into list of outstanding requests. Be sure
         * to put it LAST so timer finds oldest requests first.  Note
-        * that R_MASKTIMER is set at the moment to prevent any timer
-        * action on this request while we are still doing processing on
-        * it below.  splsoftclock() primarily protects nm_sent.  Note
-        * that we may block in this code so there is no atomicy guarentee.
+        * that our control of R_LOCKED prevents the request from
+        * getting ripped out from under us or transmitted by the
+        * timer code.
+        *
+        * For requests with info structures we must atomically set the
+        * info's state because the structure could become invalid upon
+        * return due to races (i.e., if async)
         */
        crit_enter();
-       TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
+       mtx_link_init(&rep->r_link);
+       TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain);
+       rep->r_flags |= R_ONREQQ;
+       ++nmp->nm_reqqlen;
+       if (rep->r_flags & R_ASYNC)
+               rep->r_info->state = NFSM_STATE_WAITREPLY;
+       crit_exit();
+
+       error = 0;
 
        /*
-        * If backing off another request or avoiding congestion, don't
-        * send this one now but let timer do it.  If not timing a request,
-        * do it now. 
-        *
-        * Even though the timer will not mess with our request there is
-        * still the possibility that we will race a reply (which clears
-        * R_SENT), especially on localhost connections, so be very careful
-        * when setting R_SENT.  We could set R_SENT prior to calling
-        * nfs_send() but why bother if the response occurs that quickly?
+        * Send if we can.  Congestion control is not handled here any more
+        * becausing trying to defer the initial send based on the nfs_timer
+        * requires having a very fast nfs_timer, which is silly.
         */
-       if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
-           (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
-           nmp->nm_sent < nmp->nm_cwnd)) {
+       if (nmp->nm_so) {
                if (nmp->nm_soflags & PR_CONNREQUIRED)
-                       error = nfs_sndlock(rep);
-               if (!error) {
-                       m2 = m_copym(m, 0, M_COPYALL, MB_WAIT);
+                       error = nfs_sndlock(nmp, rep);
+               if (error == 0) {
+                       m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
                        error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
                        if (nmp->nm_soflags & PR_CONNREQUIRED)
-                               nfs_sndunlock(rep);
-               }
-               if (!error && (rep->r_flags & R_MUSTRESEND) == 0 &&
-                   rep->r_mrep == NULL) {
-                       KASSERT((rep->r_flags & R_SENT) == 0,
-                               ("R_SENT ASSERT %p", rep));
-                       nmp->nm_sent += NFS_CWNDSCALE;
-                       rep->r_flags |= R_SENT;
+                               nfs_sndunlock(nmp);
+                       rep->r_flags &= ~R_NEEDSXMIT;
+                       if ((rep->r_flags & R_SENT) == 0) {
+                               rep->r_flags |= R_SENT;
+                       }
+               } else {
+                       rep->r_flags |= R_NEEDSXMIT;
                }
        } else {
+               rep->r_flags |= R_NEEDSXMIT;
                rep->r_rtt = -1;
        }
+       if (error == EPIPE)
+               error = 0;
 
        /*
-        * Let the timer do what it will with the request, then
-        * wait for the reply from our send or the timer's.
+        * Release the lock.  The only remote action that may have occurred
+        * would have been the setting of rep->r_mrep.  If this occurred
+        * and the request was async we have to move it to the reader
+        * thread's queue for action.
+        *
+        * For async requests also make sure the reader is woken up so
+        * it gets on the socket to read responses.
         */
-       if (!error || error == EPIPE) {
-               rep->r_flags &= ~R_MASKTIMER;
-               crit_exit();
-               error = nfs_reply(rep);
-               crit_enter();
+       crit_enter();
+       if (rep->r_flags & R_ASYNC) {
+               if (rep->r_mrep)
+                       nfs_hardterm(rep, 1);
+               rep->r_flags &= ~R_LOCKED;
+               nfssvc_iod_reader_wakeup(nmp);
+       } else {
+               rep->r_flags &= ~R_LOCKED;
+       }
+       if (rep->r_flags & R_WANTED) {
+               rep->r_flags &= ~R_WANTED;
+               wakeup(rep);
        }
+       crit_exit();
+       return (error);
+}
+
+/*
+ * This code is only called for synchronous requests.  Completed synchronous
+ * requests are left on reqq and we remove them before moving on to the
+ * processing state.
+ */
+static int
+nfs_request_waitreply(struct nfsreq *rep)
+{
+       struct nfsmount *nmp = rep->r_nmp;
+       int error;
+
+       KKASSERT((rep->r_flags & R_ASYNC) == 0);
+
+       /*
+        * Wait until the request is finished.
+        */
+       error = nfs_reply(nmp, rep);
 
        /*
         * RPC done, unlink the request, but don't rip it out from under
         * the callout timer.
+        *
+        * Once unlinked no other receiver or the timer will have
+        * visibility, so we do not have to set R_LOCKED.
         */
+       crit_enter();
        while (rep->r_flags & R_LOCKED) {
-               nfs_timer_raced = 1;
-               tsleep(&nfs_timer_raced, 0, "nfstrac", 0);
+               rep->r_flags |= R_WANTED;
+               tsleep(rep, 0, "nfstrac", 0);
+       }
+       KKASSERT(rep->r_flags & R_ONREQQ);
+       TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
+       rep->r_flags &= ~R_ONREQQ;
+       --nmp->nm_reqqlen;
+       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+           nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
+               nfssvc_iod_writer_wakeup(nmp);
        }
-       TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
+       crit_exit();
 
        /*
         * Decrement the outstanding request count.
         */
        if (rep->r_flags & R_SENT) {
                rep->r_flags &= ~R_SENT;
-               nmp->nm_sent -= NFS_CWNDSCALE;
        }
-       crit_exit();
+       return (error);
+}
+
+/*
+ * Process reply with error returned from nfs_request_waitreply().
+ *
+ * Returns EAGAIN if it wants us to loop up to nfs_request_try() again.
+ * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again.
+ */
+static int
+nfs_request_processreply(nfsm_info_t info, int error)
+{
+       struct nfsreq *req = info->req;
+       struct nfsmount *nmp = req->r_nmp;
+       u_int32_t *tl;
+       int verf_type;
+       int i;
 
        /*
         * If there was a successful reply and a tprintf msg.
         * tprintf a response.
         */
-       if (!error && (rep->r_flags & R_TPRINTFMSG))
-               nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
+       if (error == 0 && (req->r_flags & R_TPRINTFMSG)) {
+               nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
                    "is alive again");
-       mrep = rep->r_mrep;
-       md = rep->r_md;
-       dpos = rep->r_dpos;
+       }
+       info->mrep = req->r_mrep;
+       info->md = req->r_md;
+       info->dpos = req->r_dpos;
        if (error) {
-               m_freem(rep->r_mreq);
-               kfree((caddr_t)rep, M_NFSREQ);
+               m_freem(req->r_mreq);
+               req->r_mreq = NULL;
+               kfree(req, M_NFSREQ);
+               info->req = NULL;
                return (error);
        }
 
        /*
         * break down the rpc header and check if ok
         */
-       nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+       NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED));
        if (*tl++ == rpc_msgdenied) {
-               if (*tl == rpc_mismatch)
+               if (*tl == rpc_mismatch) {
                        error = EOPNOTSUPP;
-               else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
-                       if (!failed_auth) {
-                               failed_auth++;
-                               mheadend->m_next = (struct mbuf *)0;
-                               m_freem(mrep);
-                               m_freem(rep->r_mreq);
-                               goto kerbauth;
-                       } else
+               } else if ((nmp->nm_flag & NFSMNT_KERB) &&
+                          *tl++ == rpc_autherr) {
+                       if (req->r_failed_auth == 0) {
+                               req->r_failed_auth++;
+                               req->r_mheadend->m_next = NULL;
+                               m_freem(info->mrep);
+                               info->mrep = NULL;
+                               m_freem(req->r_mreq);
+                               return (ENEEDAUTH);
+                       } else {
                                error = EAUTH;
-               } else
+                       }
+               } else {
                        error = EACCES;
-               m_freem(mrep);
-               m_freem(rep->r_mreq);
-               kfree((caddr_t)rep, M_NFSREQ);
+               }
+               m_freem(info->mrep);
+               info->mrep = NULL;
+               m_freem(req->r_mreq);
+               req->r_mreq = NULL;
+               kfree(req, M_NFSREQ);
+               info->req = NULL;
                return (error);
        }
 
@@ -1160,29 +1461,32 @@ tryagain:
        verf_type = fxdr_unsigned(int, *tl++);
        i = fxdr_unsigned(int32_t, *tl);
        if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
-               error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
+               error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key,
+                                        &info->md, &info->dpos, info->mrep);
                if (error)
                        goto nfsmout;
-       } else if (i > 0)
-               nfsm_adv(nfsm_rndup(i));
-       nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
+       } else if (i > 0) {
+               ERROROUT(nfsm_adv(info, nfsm_rndup(i)));
+       }
+       NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
        /* 0 == ok */
        if (*tl == 0) {
-               nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
+               NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
                if (*tl != 0) {
                        error = fxdr_unsigned(int, *tl);
+
+                       /*
+                        * Does anyone even implement this?  Just impose
+                        * a 1-second delay.
+                        */
                        if ((nmp->nm_flag & NFSMNT_NFSV3) &&
                                error == NFSERR_TRYLATER) {
-                               m_freem(mrep);
+                               m_freem(info->mrep);
+                               info->mrep = NULL;
                                error = 0;
-                               waituntil = time_second + trylater_delay;
-                               while (time_second < waituntil)
-                                       (void) tsleep((caddr_t)&lbolt,
-                                               0, "nqnfstry", 0);
-                               trylater_delay *= nfs_backoff[trylater_cnt];
-                               if (trylater_cnt < 7)
-                                       trylater_cnt++;
-                               goto tryagain;
+
+                               tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0);
+                               return (EAGAIN);        /* goto tryagain */
                        }
 
                        /*
@@ -1193,6 +1497,7 @@ tryagain:
                         * release the vnode lock if we hold it.
                         */
                        if (error == ESTALE) {
+                               struct vnode *vp = req->r_vp;
                                int ltype;
 
                                ltype = lockstatus(&vp->v_lock, curthread);
@@ -1203,29 +1508,37 @@ tryagain:
                                        lockmgr(&vp->v_lock, ltype);
                        }
                        if (nmp->nm_flag & NFSMNT_NFSV3) {
-                               *mrp = mrep;
-                               *mdp = md;
-                               *dposp = dpos;
+                               KKASSERT(*req->r_mrp == info->mrep);
+                               KKASSERT(*req->r_mdp == info->md);
+                               KKASSERT(*req->r_dposp == info->dpos);
                                error |= NFSERR_RETERR;
-                       } else
-                               m_freem(mrep);
-                       m_freem(rep->r_mreq);
-                       kfree((caddr_t)rep, M_NFSREQ);
+                       } else {
+                               m_freem(info->mrep);
+                               info->mrep = NULL;
+                       }
+                       m_freem(req->r_mreq);
+                       req->r_mreq = NULL;
+                       kfree(req, M_NFSREQ);
+                       info->req = NULL;
                        return (error);
                }
 
-               *mrp = mrep;
-               *mdp = md;
-               *dposp = dpos;
-               m_freem(rep->r_mreq);
-               FREE((caddr_t)rep, M_NFSREQ);
+               KKASSERT(*req->r_mrp == info->mrep);
+               KKASSERT(*req->r_mdp == info->md);
+               KKASSERT(*req->r_dposp == info->dpos);
+               m_freem(req->r_mreq);
+               req->r_mreq = NULL;
+               FREE(req, M_NFSREQ);
                return (0);
        }
-       m_freem(mrep);
+       m_freem(info->mrep);
+       info->mrep = NULL;
        error = EPROTONOSUPPORT;
 nfsmout:
-       m_freem(rep->r_mreq);
-       kfree((caddr_t)rep, M_NFSREQ);
+       m_freem(req->r_mreq);
+       req->r_mreq = NULL;
+       kfree(req, M_NFSREQ);
+       info->req = NULL;
        return (error);
 }
 
@@ -1239,22 +1552,21 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
            int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
 {
        u_int32_t *tl;
-       struct mbuf *mreq;
-       caddr_t bpos;
-       struct mbuf *mb, *mb2;
+       struct nfsm_info info;
 
        siz += RPC_REPLYSIZ;
-       mb = mreq = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
-       mreq->m_pkthdr.len = 0;
+       info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
+       info.mreq = info.mb;
+       info.mreq->m_pkthdr.len = 0;
        /*
         * If this is not a cluster, try and leave leading space
         * for the lower level headers.
         */
        if ((max_hdr + siz) < MINCLSIZE)
-               mreq->m_data += max_hdr;
-       tl = mtod(mreq, u_int32_t *);
-       mreq->m_len = 6 * NFSX_UNSIGNED;
-       bpos = ((caddr_t)tl) + mreq->m_len;
+               info.mreq->m_data += max_hdr;
+       tl = mtod(info.mreq, u_int32_t *);
+       info.mreq->m_len = 6 * NFSX_UNSIGNED;
+       info.bpos = ((caddr_t)tl) + info.mreq->m_len;
        *tl++ = txdr_unsigned(nd->nd_retxid);
        *tl++ = rpc_reply;
        if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
@@ -1262,8 +1574,8 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
                if (err & NFSERR_AUTHERR) {
                        *tl++ = rpc_autherr;
                        *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
-                       mreq->m_len -= NFSX_UNSIGNED;
-                       bpos -= NFSX_UNSIGNED;
+                       info.mreq->m_len -= NFSX_UNSIGNED;
+                       info.bpos -= NFSX_UNSIGNED;
                } else {
                        *tl++ = rpc_mismatch;
                        *tl++ = txdr_unsigned(RPC_VER2);
@@ -1299,12 +1611,15 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
                         */
 #ifdef NFSKERB
                        XXX
+#else
+                       ktvout.tv_sec = 0;
+                       ktvout.tv_usec = 0;
 #endif
 
                        *tl++ = rpc_auth_kerb;
                        *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
                        *tl = ktvout.tv_sec;
-                       nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+                       tl = nfsm_build(&info, 3 * NFSX_UNSIGNED);
                        *tl++ = ktvout.tv_usec;
                        *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
                    } else {
@@ -1321,7 +1636,7 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
                        break;
                case EPROGMISMATCH:
                        *tl = txdr_unsigned(RPC_PROGMISMATCH);
-                       nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+                       tl = nfsm_build(&info, 2 * NFSX_UNSIGNED);
                        *tl++ = txdr_unsigned(2);
                        *tl = txdr_unsigned(3);
                        break;
@@ -1334,7 +1649,7 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
                default:
                        *tl = 0;
                        if (err != NFSERR_RETVOID) {
-                               nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
+                               tl = nfsm_build(&info, NFSX_UNSIGNED);
                                if (err)
                                    *tl = txdr_unsigned(nfsrv_errmap(nd, err));
                                else
@@ -1345,9 +1660,9 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
        }
 
        if (mrq != NULL)
-           *mrq = mreq;
-       *mbp = mb;
-       *bposp = bpos;
+           *mrq = info.mreq;
+       *mbp = info.mb;
+       *bposp = info.bpos;
        if (err != 0 && err != NFSERR_RETVOID)
                nfsstats.srvrpc_errs++;
        return (0);
@@ -1355,123 +1670,48 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
 
 
 #endif /* NFS_NOSERVER */
+
 /*
- * Nfs timer routine
+ * Nfs timer routine.
+ *
  * Scan the nfsreq list and retranmit any requests that have timed out
  * To avoid retransmission attempts on STREAM sockets (in the future) make
  * sure to set the r_retry field to 0 (implies nm_retry == 0).
+ *
+ * Requests with attached responses, terminated requests, and
+ * locked requests are ignored.  Locked requests will be picked up
+ * in a later timer call.
  */
 void
 nfs_timer(void *arg /* never used */)
 {
-       struct nfsreq *rep;
-       struct mbuf *m;
-       struct socket *so;
        struct nfsmount *nmp;
-       int timeo;
-       int error;
+       struct nfsreq *req;
 #ifndef NFS_NOSERVER
        struct nfssvc_sock *slp;
        u_quad_t cur_usec;
 #endif /* NFS_NOSERVER */
-       struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
 
        crit_enter();
-       TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
-               nmp = rep->r_nmp;
-               if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER)))
-                       continue;
-               rep->r_flags |= R_LOCKED;
-               if (nfs_sigintr(nmp, rep, rep->r_td)) {
-                       nfs_softterm(rep);
-                       goto skip;
-               }
-               if (rep->r_rtt >= 0) {
-                       rep->r_rtt++;
-                       if (nmp->nm_flag & NFSMNT_DUMBTIMR)
-                               timeo = nmp->nm_timeo;
-                       else
-                               timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
-                       if (nmp->nm_timeouts > 0)
-                               timeo *= nfs_backoff[nmp->nm_timeouts - 1];
-                       if (rep->r_rtt <= timeo)
-                               goto skip;
-                       if (nmp->nm_timeouts < 8)
-                               nmp->nm_timeouts++;
-               }
-               /*
-                * Check for server not responding
-                */
-               if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
-                    rep->r_rexmit > nmp->nm_deadthresh) {
-                       nfs_msg(rep->r_td,
-                           nmp->nm_mountp->mnt_stat.f_mntfromname,
-                           "not responding");
-                       rep->r_flags |= R_TPRINTFMSG;
-               }
-               if (rep->r_rexmit >= rep->r_retry) {    /* too many */
-                       nfsstats.rpctimeouts++;
-                       nfs_softterm(rep);
-                       goto skip;
-               }
-               if (nmp->nm_sotype != SOCK_DGRAM) {
-                       if (++rep->r_rexmit > NFS_MAXREXMIT)
-                               rep->r_rexmit = NFS_MAXREXMIT;
-                       goto skip;
-               }
-               if ((so = nmp->nm_so) == NULL)
-                       goto skip;
-
-               /*
-                * If there is enough space and the window allows..
-                *      Resend it
-                * Set r_rtt to -1 in case we fail to send it now.
-                */
-               rep->r_rtt = -1;
-               if (ssb_space(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
-                  ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
-                   (rep->r_flags & R_SENT) ||
-                   nmp->nm_sent < nmp->nm_cwnd) &&
-                  (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
-                       if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
-                           error = so_pru_send(so, 0, m, (struct sockaddr *)0,
-                                    (struct mbuf *)0, td);
-                       else
-                           error = so_pru_send(so, 0, m, nmp->nm_nam,
-                               (struct mbuf *)0, td);
-                       if (error) {
-                               if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
-                                       so->so_error = 0;
-                       } else if (rep->r_mrep == NULL) {
-                               /*
-                                * Iff first send, start timing
-                                * else turn timing off, backoff timer
-                                * and divide congestion window by 2.
-                                *
-                                * It is possible for the so_pru_send() to
-                                * block and for us to race a reply so we
-                                * only do this if the reply field has not 
-                                * been filled in.  R_LOCKED will prevent
-                                * the request from being ripped out from under
-                                * us entirely.
-                                */
-                               if (rep->r_flags & R_SENT) {
-                                       rep->r_flags &= ~R_TIMING;
-                                       if (++rep->r_rexmit > NFS_MAXREXMIT)
-                                               rep->r_rexmit = NFS_MAXREXMIT;
-                                       nmp->nm_cwnd >>= 1;
-                                       if (nmp->nm_cwnd < NFS_CWNDSCALE)
-                                               nmp->nm_cwnd = NFS_CWNDSCALE;
-                                       nfsstats.rpcretries++;
-                               } else {
-                                       rep->r_flags |= R_SENT;
-                                       nmp->nm_sent += NFS_CWNDSCALE;
-                               }
-                               rep->r_rtt = 0;
+       TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {
+               TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
+                       KKASSERT(nmp == req->r_nmp);
+                       if (req->r_mrep)
+                               continue;
+                       if (req->r_flags & (R_SOFTTERM | R_LOCKED))
+                               continue;
+                       req->r_flags |= R_LOCKED;
+                       if (nfs_sigintr(nmp, req, req->r_td)) {
+                               nfs_softterm(req, 1);
+                       } else {
+                               nfs_timer_req(req);
+                       }
+                       req->r_flags &= ~R_LOCKED;
+                       if (req->r_flags & R_WANTED) {
+                               req->r_flags &= ~R_WANTED;
+                               wakeup(req);
                        }
                }
-skip:
-               rep->r_flags &= ~R_LOCKED;
        }
 #ifndef NFS_NOSERVER
 
@@ -1485,23 +1725,169 @@ skip:
                nfsrv_wakenfsd(slp, 1);
        }
 #endif /* NFS_NOSERVER */
+       crit_exit();
+       callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
+}
+
+static
+void
+nfs_timer_req(struct nfsreq *req)
+{
+       struct thread *td = &thread0; /* XXX for creds, will break if sleep */
+       struct nfsmount *nmp = req->r_nmp;
+       struct mbuf *m;
+       struct socket *so;
+       int timeo;
+       int error;
+
+       /*
+        * rtt ticks and timeout calculation.  Return if the timeout
+        * has not been reached yet, unless the packet is flagged
+        * for an immediate send.
+        *
+        * The mean rtt doesn't help when we get random I/Os, we have
+        * to multiply by fairly large numbers.
+        */
+       if (req->r_rtt >= 0) {
+               /*
+                * Calculate the timeout to test against.
+                */
+               req->r_rtt++;
+               if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
+                       timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
+               } else if (req->r_flags & R_TIMING) {
+                       timeo = NFS_SRTT(req) + NFS_SDRTT(req);
+               } else {
+                       timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
+               }
+               timeo *= multt[req->r_procnum];
+               /* timeo is still scaled by SCALE_BITS */
+
+#define NFSFS  (NFS_RTT_SCALE * NFS_HZ)
+               if (req->r_flags & R_TIMING) {
+                       static long last_time;
+                       if (nfs_showrtt && last_time != time_second) {
+                               kprintf("rpccmd %d NFS SRTT %d SDRTT %d "
+                                       "timeo %d.%03d\n",
+                                       proct[req->r_procnum],
+                                       NFS_SRTT(req), NFS_SDRTT(req),
+                                       timeo / NFSFS,
+                                       timeo % NFSFS * 1000 /  NFSFS);
+                               last_time = time_second;
+                       }
+               }
+#undef NFSFS
+
+       /*
+        * Deal with nfs_timer jitter.
+        */
+               timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
+               if (timeo < 2)
+                       timeo = 2;
+
+               if (nmp->nm_timeouts > 0)
+                       timeo *= nfs_backoff[nmp->nm_timeouts - 1];
+               if (timeo > NFS_MAXTIMEO)
+                       timeo = NFS_MAXTIMEO;
+               if (req->r_rtt <= timeo) {
+                       if ((req->r_flags & R_NEEDSXMIT) == 0)
+                               return;
+               } else if (nmp->nm_timeouts < 8) {
+                       nmp->nm_timeouts++;
+               }
+       }
 
        /*
-        * Due to possible blocking, a client operation may be waiting for
-        * us to finish processing this request so it can remove it.
+        * Check for server not responding
         */
-       if (nfs_timer_raced) {
-               nfs_timer_raced = 0;
-               wakeup(&nfs_timer_raced);
+       if ((req->r_flags & R_TPRINTFMSG) == 0 &&
+            req->r_rexmit > nmp->nm_deadthresh) {
+               nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
+                       "not responding");
+               req->r_flags |= R_TPRINTFMSG;
+       }
+       if (req->r_rexmit >= req->r_retry) {    /* too many */
+               nfsstats.rpctimeouts++;
+               nfs_softterm(req, 1);
+               return;
+       }
+
+       /*
+        * Generally disable retransmission on reliable sockets,
+        * unless the request is flagged for immediate send.
+        */
+       if (nmp->nm_sotype != SOCK_DGRAM) {
+               if (++req->r_rexmit > NFS_MAXREXMIT)
+                       req->r_rexmit = NFS_MAXREXMIT;
+               if ((req->r_flags & R_NEEDSXMIT) == 0)
+                       return;
+       }
+
+       /*
+        * Stop here if we do not have a socket!
+        */
+       if ((so = nmp->nm_so) == NULL)
+               return;
+
+       /*
+        * If there is enough space and the window allows.. resend it.
+        *
+        * r_rtt is left intact in case we get an answer after the
+        * retry that was a reply to the original packet.
+        */
+       if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
+           (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
+          (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
+               if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+                   error = so_pru_send(so, 0, m, NULL, NULL, td);
+               else
+                   error = so_pru_send(so, 0, m, nmp->nm_nam,
+                       NULL, td);
+               if (error) {
+                       if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+                               so->so_error = 0;
+                       req->r_flags |= R_NEEDSXMIT;
+               } else if (req->r_mrep == NULL) {
+                       /*
+                        * Iff first send, start timing
+                        * else turn timing off, backoff timer
+                        * and divide congestion window by 2.
+                        *
+                        * It is possible for the so_pru_send() to
+                        * block and for us to race a reply so we
+                        * only do this if the reply field has not
+                        * been filled in.  R_LOCKED will prevent
+                        * the request from being ripped out from under
+                        * us entirely.
+                        *
+                        * Record the last resent procnum to aid us
+                        * in duplicate detection on receive.
+                        */
+                       if ((req->r_flags & R_NEEDSXMIT) == 0) {
+                               if (nfs_showrexmit)
+                                       kprintf("X");
+                               if (++req->r_rexmit > NFS_MAXREXMIT)
+                                       req->r_rexmit = NFS_MAXREXMIT;
+                               nmp->nm_maxasync_scaled >>= 1;
+                               if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
+                                       nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
+                               nfsstats.rpcretries++;
+                               nmp->nm_lastreprocnum = req->r_procnum;
+                       } else {
+                               req->r_flags |= R_SENT;
+                               req->r_flags &= ~R_NEEDSXMIT;
+                       }
+               }
        }
-       crit_exit();
-       callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
 }
 
 /*
  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
  * wait for all requests to complete. This is used by forced unmounts
  * to terminate any outstanding RPCs.
+ *
+ * Locked requests cannot be canceled but will be marked for
+ * soft-termination.
  */
 int
 nfs_nmcancelreqs(struct nfsmount *nmp)
@@ -1510,18 +1896,17 @@ nfs_nmcancelreqs(struct nfsmount *nmp)
        int i;
 
        crit_enter();
-       TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
-               if (nmp != req->r_nmp || req->r_mrep != NULL ||
-                   (req->r_flags & R_SOFTTERM)) {
+       TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
+               if (req->r_mrep != NULL || (req->r_flags & R_SOFTTERM))
                        continue;
-               }
-               nfs_softterm(req);
+               nfs_softterm(req, 0);
        }
+       /* XXX  the other two queues as well */
        crit_exit();
 
        for (i = 0; i < 30; i++) {
                crit_enter();
-               TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
+               TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
                        if (nmp == req->r_nmp)
                                break;
                }
@@ -1534,23 +1919,65 @@ nfs_nmcancelreqs(struct nfsmount *nmp)
 }
 
 /*
- * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
- * The nm_send count is decremented now to avoid deadlocks when the process in
- * soreceive() hasn't yet managed to send its own request.
+ * Soft-terminate a request, effectively marking it as failed.
  *
- * This routine must be called at splsoftclock() to protect r_flags and
- * nm_sent.
+ * Must be called from within a critical section.
  */
-
 static void
-nfs_softterm(struct nfsreq *rep)
+nfs_softterm(struct nfsreq *rep, int islocked)
 {
        rep->r_flags |= R_SOFTTERM;
+       nfs_hardterm(rep, islocked);
+}
 
+/*
+ * Hard-terminate a request, typically after getting a response.
+ *
+ * The state machine can still decide to re-issue it later if necessary.
+ *
+ * Must be called from within a critical section.
+ */
+static void
+nfs_hardterm(struct nfsreq *rep, int islocked)
+{
+       struct nfsmount *nmp = rep->r_nmp;
+
+       /*
+        * The nm_send count is decremented now to avoid deadlocks
+        * when the process in soreceive() hasn't yet managed to send
+        * its own request.
+        */
        if (rep->r_flags & R_SENT) {
-               rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
                rep->r_flags &= ~R_SENT;
        }
+
+       /*
+        * If we locked the request or nobody else has locked the request,
+        * and the request is async, we can move it to the reader thread's
+        * queue now and fix up the state.
+        *
+        * If we locked the request or nobody else has locked the request,
+        * we can wake up anyone blocked waiting for a response on the
+        * request.
+        */
+       if (islocked || (rep->r_flags & R_LOCKED) == 0) {
+               if ((rep->r_flags & (R_ONREQQ | R_ASYNC)) ==
+                   (R_ONREQQ | R_ASYNC)) {
+                       rep->r_flags &= ~R_ONREQQ;
+                       TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
+                       --nmp->nm_reqqlen;
+                       TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain);
+                       KKASSERT(rep->r_info->state == NFSM_STATE_TRY ||
+                                rep->r_info->state == NFSM_STATE_WAITREPLY);
+                       rep->r_info->state = NFSM_STATE_PROCESSREPLY;
+                       nfssvc_iod_reader_wakeup(nmp);
+                       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+                           nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
+                               nfssvc_iod_writer_wakeup(nmp);
+                       }
+               }
+               mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
+       }
 }
 
 /*
@@ -1592,9 +2019,9 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
  * in progress when a reconnect is necessary.
  */
 int
-nfs_sndlock(struct nfsreq *rep)
+nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep)
 {
-       int *statep = &rep->r_nmp->nm_state;
+       mtx_t mtx = &nmp->nm_txlock;
        struct thread *td;
        int slptimeo;
        int slpflag;
@@ -1602,30 +2029,29 @@ nfs_sndlock(struct nfsreq *rep)
 
        slpflag = 0;
        slptimeo = 0;
-       td = rep->r_td;
-       if (rep->r_nmp->nm_flag & NFSMNT_INT)
+       td = rep ? rep->r_td : NULL;
+       if (nmp->nm_flag & NFSMNT_INT)
                slpflag = PCATCH;
 
-       error = 0;
-       crit_enter();
-       while (*statep & NFSSTA_SNDLOCK) {
-               *statep |= NFSSTA_WANTSND;
-               if (nfs_sigintr(rep->r_nmp, rep, td)) {
+       while ((error = mtx_lock_ex_try(mtx)) != 0) {
+               if (nfs_sigintr(nmp, rep, td)) {
                        error = EINTR;
                        break;
                }
-               tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo);
+               error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo);
+               if (error == 0)
+                       break;
                if (slpflag == PCATCH) {
                        slpflag = 0;
                        slptimeo = 2 * hz;
                }
        }
        /* Always fail if our request has been cancelled. */
-       if ((rep->r_flags & R_SOFTTERM))
+       if (rep && (rep->r_flags & R_SOFTTERM)) {
+               if (error == 0)
+                       mtx_unlock(mtx);
                error = EINTR;
-       if (error == 0)
-               *statep |= NFSSTA_SNDLOCK;
-       crit_exit();
+       }
        return (error);
 }
 
@@ -1633,25 +2059,20 @@ nfs_sndlock(struct nfsreq *rep)
  * Unlock the stream socket for others.
  */
 void
-nfs_sndunlock(struct nfsreq *rep)
+nfs_sndunlock(struct nfsmount *nmp)
 {
-       int *statep = &rep->r_nmp->nm_state;
-
-       if ((*statep & NFSSTA_SNDLOCK) == 0)
-               panic("nfs sndunlock");
-       crit_enter();
-       *statep &= ~NFSSTA_SNDLOCK;
-       if (*statep & NFSSTA_WANTSND) {
-               *statep &= ~NFSSTA_WANTSND;
-               wakeup((caddr_t)statep);
-       }
-       crit_exit();
+       mtx_unlock(&nmp->nm_txlock);
 }
 
+/*
+ * Lock the receiver side of the socket.
+ *
+ * rep may be NULL.
+ */
 static int
-nfs_rcvlock(struct nfsreq *rep)
+nfs_rcvlock(struct nfsmount *nmp, struct nfsreq *rep)
 {
-       int *statep = &rep->r_nmp->nm_state;
+       mtx_t mtx = &nmp->nm_rxlock;
        int slpflag;
        int slptimeo;
        int error;
@@ -1665,34 +2086,46 @@ nfs_rcvlock(struct nfsreq *rep)
         * We do not strictly need the second check just before the
         * tsleep(), but it's good defensive programming.
         */
-       if (rep->r_mrep != NULL)
+       if (rep && rep->r_mrep != NULL)
                return (EALREADY);
 
-       if (rep->r_nmp->nm_flag & NFSMNT_INT)
+       if (nmp->nm_flag & NFSMNT_INT)
                slpflag = PCATCH;
        else
                slpflag = 0;
        slptimeo = 0;
-       error = 0;
-       crit_enter();
-       while (*statep & NFSSTA_RCVLOCK) {
-               if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) {
+
+       while ((error = mtx_lock_ex_try(mtx)) != 0) {
+               if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) {
                        error = EINTR;
                        break;
                }
-               if (rep->r_mrep != NULL) {
+               if (rep && rep->r_mrep != NULL) {
                        error = EALREADY;
                        break;
                }
-               *statep |= NFSSTA_WANTRCV;
-               tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
+
+               /*
+                * NOTE: can return ENOLCK, but in that case rep->r_mrep
+                *       will already be set.
+                */
+               if (rep) {
+                       error = mtx_lock_ex_link(mtx, &rep->r_link,
+                                                "nfsrcvlk",
+                                                slpflag, slptimeo);
+               } else {
+                       error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo);
+               }
+               if (error == 0)
+                       break;
+
                /*
                 * If our reply was recieved while we were sleeping,
                 * then just return without taking the lock to avoid a
                 * situation where a single iod could 'capture' the
                 * recieve lock.
                 */
-               if (rep->r_mrep != NULL) {
+               if (rep && rep->r_mrep != NULL) {
                        error = EALREADY;
                        break;
                }
@@ -1702,10 +2135,11 @@ nfs_rcvlock(struct nfsreq *rep)
                }
        }
        if (error == 0) {
-               *statep |= NFSSTA_RCVLOCK;
-               rep->r_nmp->nm_rcvlock_td = curthread;  /* DEBUGGING */
+               if (rep && rep->r_mrep != NULL) {
+                       error = EALREADY;
+                       mtx_unlock(mtx);
+               }
        }
-       crit_exit();
        return (error);
 }
 
@@ -1713,66 +2147,54 @@ nfs_rcvlock(struct nfsreq *rep)
  * Unlock the stream socket for others.
  */
 static void
-nfs_rcvunlock(struct nfsreq *rep)
+nfs_rcvunlock(struct nfsmount *nmp)
 {
-       int *statep = &rep->r_nmp->nm_state;
-
-       if ((*statep & NFSSTA_RCVLOCK) == 0)
-               panic("nfs rcvunlock");
-       crit_enter();
-       rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */
-       *statep &= ~NFSSTA_RCVLOCK;
-       if (*statep & NFSSTA_WANTRCV) {
-               *statep &= ~NFSSTA_WANTRCV;
-               wakeup((caddr_t)statep);
-       }
-       crit_exit();
+       mtx_unlock(&nmp->nm_rxlock);
 }
 
 /*
- *     nfs_realign:
+ * nfs_realign:
  *
- *     Check for badly aligned mbuf data and realign by copying the unaligned
- *     portion of the data into a new mbuf chain and freeing the portions
- *     of the old chain that were replaced.
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
  *
- *     We cannot simply realign the data within the existing mbuf chain
- *     because the underlying buffers may contain other rpc commands and
- *     we cannot afford to overwrite them.
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
  *
- *     We would prefer to avoid this situation entirely.  The situation does
- *     not occur with NFS/UDP and is supposed to only occassionally occur
- *     with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
- *     We would prefer to avoid this situation entirely.  The situation does
- *     not occur with NFS/UDP and is supposed to only occassionally occur
+ * We would prefer to avoid this situation entirely.  The situation does
+ * not occur with NFS/UDP and is supposed to only occasionally occur
+ *
+ * NOTE!  MB_DONTWAIT cannot be used here.  The mbufs must be acquired
+ *       because the rpc request OR reply cannot be thrown away.  TCP NFS
+ *       mounts do not retry their RPCs unless the TCP connection itself
+ *       is dropped so throwing away a RPC will basically cause the NFS
+ *       operation to lockup indefinitely.
  */
 static void
 nfs_realign(struct mbuf **pm, int hsiz)
 {
        struct mbuf *m;
        struct mbuf *n = NULL;
-       int off = 0;
 
+       /*
+        * Check for misalignment
+        */
        ++nfs_realign_test;
-
        while ((m = *pm) != NULL) {
-               if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
-                       n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
-                       n->m_len = 0;
+               if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3))
                        break;
-               }
                pm = &m->m_next;
        }
 
        /*
-        * If n is non-NULL, loop on m copying data, then replace the
-        * portion of the chain that had to be realigned.
+        * If misalignment found make a completely new copy.
         */
-       if (n != NULL) {
+       if (m) {
                ++nfs_realign_count;
-               while (m) {
-                       m_copyback(n, off, m->m_len, mtod(m, caddr_t));
-                       off += m->m_len;
-                       m = m->m_next;
-               }
+               n = m_dup_data(m, MB_WAIT);
                m_freem(*pm);
                *pm = n;
        }
@@ -1790,32 +2212,33 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 {
        int len, i;
        u_int32_t *tl;
-       int32_t t1;
        struct uio uio;
        struct iovec iov;
-       caddr_t dpos, cp2, cp;
+       caddr_t cp;
        u_int32_t nfsvers, auth_type;
        uid_t nickuid;
        int error = 0, ticklen;
-       struct mbuf *mrep, *md;
        struct nfsuid *nuidp;
        struct timeval tvin, tvout;
+       struct nfsm_info info;
 #if 0                          /* until encrypted keys are implemented */
        NFSKERBKEYSCHED_T keys; /* stores key schedule */
 #endif
 
-       mrep = nd->nd_mrep;
-       md = nd->nd_md;
-       dpos = nd->nd_dpos;
+       info.mrep = nd->nd_mrep;
+       info.md = nd->nd_md;
+       info.dpos = nd->nd_dpos;
+
        if (has_header) {
-               nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
+               NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED));
                nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
                if (*tl++ != rpc_call) {
-                       m_freem(mrep);
+                       m_freem(info.mrep);
                        return (EBADRPC);
                }
-       } else
-               nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
+       } else {
+               NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED));
+       }
        nd->nd_repstat = 0;
        nd->nd_flag = 0;
        if (*tl++ != rpc_vers) {
@@ -1852,7 +2275,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
        auth_type = *tl++;
        len = fxdr_unsigned(int, *tl++);
        if (len < 0 || len > RPCAUTH_MAXSIZ) {
-               m_freem(mrep);
+               m_freem(info.mrep);
                return (EBADRPC);
        }
 
@@ -1863,21 +2286,23 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
        if (auth_type == rpc_auth_unix) {
                len = fxdr_unsigned(int, *++tl);
                if (len < 0 || len > NFS_MAXNAMLEN) {
-                       m_freem(mrep);
+                       m_freem(info.mrep);
                        return (EBADRPC);
                }
-               nfsm_adv(nfsm_rndup(len));
-               nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+               ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
+               NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
                bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
                nd->nd_cr.cr_ref = 1;
                nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
+               nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid;
                nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
+               nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid;
                len = fxdr_unsigned(int, *tl);
                if (len < 0 || len > RPCAUTH_UNIXGIDS) {
-                       m_freem(mrep);
+                       m_freem(info.mrep);
                        return (EBADRPC);
                }
-               nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
+               NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED));
                for (i = 1; i <= len; i++)
                    if (i < NGROUPS)
                        nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
@@ -1888,11 +2313,12 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                    nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
                len = fxdr_unsigned(int, *++tl);
                if (len < 0 || len > RPCAUTH_MAXSIZ) {
-                       m_freem(mrep);
+                       m_freem(info.mrep);
                        return (EBADRPC);
                }
-               if (len > 0)
-                       nfsm_adv(nfsm_rndup(len));
+               if (len > 0) {
+                       ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
+               }
        } else if (auth_type == rpc_auth_kerb) {
                switch (fxdr_unsigned(int, *tl++)) {
                case RPCAKN_FULLNAME:
@@ -1901,7 +2327,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                        uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
                        nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
                        if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
-                               m_freem(mrep);
+                               m_freem(info.mrep);
                                return (EBADRPC);
                        }
                        uio.uio_offset = 0;
@@ -1910,8 +2336,8 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                        uio.uio_segflg = UIO_SYSSPACE;
                        iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
                        iov.iov_len = RPCAUTH_MAXSIZ - 4;
-                       nfsm_mtouio(&uio, uio.uio_resid);
-                       nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+                       ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid));
+                       NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
                        if (*tl++ != rpc_auth_kerb ||
                                fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
                                kprintf("Bad kerb verifier\n");
@@ -1919,7 +2345,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
-                       nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
+                       NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED));
                        tl = (u_int32_t *)cp;
                        if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
                                kprintf("Not fullname kerb verifier\n");
@@ -1941,7 +2367,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                                return (0);
                        }
                        nickuid = fxdr_unsigned(uid_t, *tl);
-                       nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+                       NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
                        if (*tl++ != rpc_auth_kerb ||
                                fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
                                kprintf("Kerb nick verifier bad\n");
@@ -1949,7 +2375,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
-                       nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+                       NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
                        tvin.tv_sec = *tl++;
                        tvin.tv_usec = *tl;
 
@@ -1974,6 +2400,9 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                         */
 #ifdef NFSKERB
                        XXX
+#else
+                       tvout.tv_sec = 0;
+                       tvout.tv_usec = 0;
 #endif
 
                        tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
@@ -1997,8 +2426,8 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                return (0);
        }
 
-       nd->nd_md = md;
-       nd->nd_dpos = dpos;
+       nd->nd_md = info.md;
+       nd->nd_dpos = info.dpos;
        return (0);
 nfsmout:
        return (error);
@@ -2240,12 +2669,12 @@ nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
            recm = NULL;
            if (slp->ns_cc == slp->ns_reclen) {
                recm = slp->ns_raw;
-               slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
+               slp->ns_raw = slp->ns_rawend = NULL;
                slp->ns_cc = slp->ns_reclen = 0;
            } else if (slp->ns_cc > slp->ns_reclen) {
                len = 0;
                m = slp->ns_raw;
-               om = (struct mbuf *)0;
+               om = NULL;
 
                while (len < slp->ns_reclen) {
                        if ((len + m->m_len) > slp->ns_reclen) {
@@ -2268,7 +2697,7 @@ nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
                                len += m->m_len;
                                m = m->m_next;
                                recm = slp->ns_raw;
-                               om->m_next = (struct mbuf *)0;
+                               om->m_next = NULL;
                        } else {
                                om = m;
                                len += m->m_len;
@@ -2297,13 +2726,13 @@ nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
                    m_freem(slp->ns_frag);
                } else {
                    nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
-                   rec->nr_address = (struct sockaddr *)0;
+                   rec->nr_address = NULL;
                    rec->nr_packet = slp->ns_frag;
                    STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
                    ++slp->ns_numrec;
                    ++*countp;
                }
-               slp->ns_frag = (struct mbuf *)0;
+               slp->ns_frag = NULL;
            }
        }
 }