diff --git a/sys/vfs/nfs/nfs_socket.c b/sys/vfs/nfs/nfs_socket.c index 1b620c732b..c241c3cead 100644 --- a/sys/vfs/nfs/nfs_socket.c +++ b/sys/vfs/nfs/nfs_socket.c @@ -61,7 +61,10 @@ #include #include #include +#include + #include +#include #include #include @@ -80,34 +83,13 @@ #define FALSE 0 /* - * Estimate rto for an nfs rpc sent via. an unreliable datagram. - * Use the mean and mean deviation of rtt for the appropriate type of rpc - * for the frequent rpcs and a default for the others. - * The justification for doing "other" this way is that these rpcs - * happen so infrequently that timer est. would probably be stale. - * Also, since many of these rpcs are - * non-idempotent, a conservative timeout is desired. - * getattr, lookup - A+2D - * read, write - A+4D - * other - nm_timeo - */ -#define NFS_RTO(n, t) \ - ((t) == 0 ? (n)->nm_timeo : \ - ((t) < 3 ? \ - (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ - ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) -#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] -#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] -/* - * External data, mostly RPC constants in XDR form + * RTT calculations are scaled by 256 (8 bits). A proper fractional + * RTT will still be calculated even with a slow NFS timer. */ -extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, - rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr, - rpc_auth_kerb; -extern u_int32_t nfs_prog; -extern struct nfsstats nfsstats; -extern int nfsv3_procid[NFS_NPROCS]; -extern int nfs_ticks; +#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum]] +#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]] +#define NFS_RTT_SCALE_BITS 8 /* bits */ +#define NFS_RTT_SCALE 256 /* value */ /* * Defines which timer to use for the procnum. 
@@ -118,52 +100,54 @@ extern int nfs_ticks; * 4 - write */ static int proct[NFS_NPROCS] = { - 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, - 0, 0, 0, + 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, /* 00-09 */ + 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, /* 10-19 */ + 0, 5, 0, 0, 0, 0, /* 20-25 */ }; +static int multt[NFS_NPROCS] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-09 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-19 */ + 1, 2, 1, 1, 1, 1, /* 20-25 */ +}; + +static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 }; static int nfs_realign_test; static int nfs_realign_count; -static int nfs_bufpackets = 4; -static int nfs_timer_raced; +static int nfs_showrtt; +static int nfs_showrexmit; +int nfs_maxasyncbio = NFS_MAXASYNCBIO; SYSCTL_DECL(_vfs_nfs); SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, ""); SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, ""); -SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, ""); +SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, ""); +static int nfs_request_setup(nfsm_info_t info); +static int nfs_request_auth(struct nfsreq *rep); +static int nfs_request_try(struct nfsreq *rep); +static int nfs_request_waitreply(struct nfsreq *rep); +static int nfs_request_processreply(nfsm_info_t info, int); -/* - * There is a congestion window for outstanding rpcs maintained per mount - * point. The cwnd size is adjusted in roughly the way that: - * Van Jacobson, Congestion avoidance and Control, In "Proceedings of - * SIGCOMM '88". ACM, August 1988. - * describes for TCP. The cwnd size is chopped in half on a retransmit timeout - * and incremented by 1/cwnd when each rpc reply is received and a full cwnd - * of rpcs is in progress. - * (The sent count and cwnd are scaled for integer arith.) - * Variants of "slow start" were tried and were found to be too much of a - * performance hit (ave. rtt 3 times larger), - * I suspect due to the large rtt that nfs rpcs have. 
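(Editor's aside, not part of the patch: the fixed-point convention introduced above is easy to misread, so here is a tiny worked illustration. NFS_RTT_SCALE_BITS, nm_srtt[] and nfs_backoff[] are from the diff; the numbers are made up.)

/*
 * Editor's illustration only.  With the scale at 256, an nm_srtt[]
 * entry of 640 represents 640 / 256 = 2.5 ticks.  Converting back to
 * whole ticks for a timeout test, as nfs_timer_req() does further
 * down, is a shift plus a round-up:
 *
 *	timeo = (640 >> NFS_RTT_SCALE_BITS) + 1;	/* = 3 ticks */
 *
 * A mount that has already timed out N times is then stretched by
 * the fibonacci-ish table: timeo *= nfs_backoff[N - 1].
 */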
- */ -#define NFS_CWNDSCALE 256 -#define NFS_MAXCWND (NFS_CWNDSCALE * 32) -static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; int nfsrtton = 0; struct nfsrtt nfsrtt; struct callout nfs_timer_handle; static int nfs_msg (struct thread *,char *,char *); -static int nfs_rcvlock (struct nfsreq *); -static void nfs_rcvunlock (struct nfsreq *); +static int nfs_rcvlock (struct nfsmount *nmp, struct nfsreq *myreq); +static void nfs_rcvunlock (struct nfsmount *nmp); static void nfs_realign (struct mbuf **pm, int hsiz); -static int nfs_receive (struct nfsreq *rep, struct sockaddr **aname, - struct mbuf **mp); -static void nfs_softterm (struct nfsreq *rep); -static int nfs_reconnect (struct nfsreq *rep); +static int nfs_receive (struct nfsmount *nmp, struct nfsreq *rep, + struct sockaddr **aname, struct mbuf **mp); +static void nfs_softterm (struct nfsreq *rep, int islocked); +static void nfs_hardterm (struct nfsreq *rep, int islocked); +static int nfs_reconnect (struct nfsmount *nmp, struct nfsreq *rep); #ifndef NFS_NOSERVER static int nfsrv_getstream (struct nfssvc_sock *, int, int *); +static void nfs_timer_req(struct nfsreq *req); int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd, struct nfssvc_sock *slp, @@ -206,19 +190,19 @@ int nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) { struct socket *so; - int error, rcvreserve, sndreserve; - int pktscale; + int error; struct sockaddr *saddr; struct sockaddr_in *sin; struct thread *td = &thread0; /* only used for socreate and sobind */ - nmp->nm_so = NULL; + nmp->nm_so = so = NULL; + if (nmp->nm_flag & NFSMNT_FORCE) + return (EINVAL); saddr = nmp->nm_nam; - error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype, + error = socreate(saddr->sa_family, &so, nmp->nm_sotype, nmp->nm_soproto, td); if (error) goto bad; - so = nmp->nm_so; nmp->nm_soflags = so->so_proto->pr_flags; /* @@ -306,23 +290,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) * Get buffer reservation size from sysctl, but impose reasonable * limits. 
*/ - pktscale = nfs_bufpackets; - if (pktscale < 2) - pktscale = 2; - if (pktscale > 64) - pktscale = 64; - - if (nmp->nm_sotype == SOCK_DGRAM) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale; - rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) + - NFS_MAXPKTHDR) * pktscale; - } else if (nmp->nm_sotype == SOCK_SEQPACKET) { - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale; - rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) + - NFS_MAXPKTHDR) * pktscale; - } else { - if (nmp->nm_sotype != SOCK_STREAM) - panic("nfscon sotype"); + if (nmp->nm_sotype == SOCK_STREAM) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { struct sockopt sopt; int val; @@ -347,13 +315,8 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) val = 1; sosetopt(so, &sopt); } - sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + - sizeof (u_int32_t)) * pktscale; - rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + - sizeof (u_int32_t)) * pktscale; } - error = soreserve(so, sndreserve, rcvreserve, - &td->td_proc->p_rlimit[RLIMIT_SBSIZE]); + error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL); if (error) goto bad; so->so_rcv.ssb_flags |= SSB_NOINTR; @@ -361,16 +324,24 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) /* Initialize other non-zero congestion variables */ nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = - nmp->nm_srtt[3] = (NFS_TIMEO << 3); + nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS); nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = nmp->nm_sdrtt[3] = 0; - nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ - nmp->nm_sent = 0; + nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED; nmp->nm_timeouts = 0; + + /* + * Assign nm_so last. The moment nm_so is assigned the nfs_timer() + * can mess with the socket. + */ + nmp->nm_so = so; return (0); bad: - nfs_disconnect(nmp); + if (so) { + soshutdown(so, SHUT_RDWR); + soclose(so, FNONBLOCK); + } return (error); } @@ -379,21 +350,26 @@ bad: * Called when a connection is broken on a reliable protocol. * - clean up the old socket * - nfs_connect() again - * - set R_MUSTRESEND for all outstanding requests on mount point + * - set R_NEEDSXMIT for all outstanding requests on mount point * If this fails the mount point is DEAD! * nb: Must be called with the nfs_sndlock() set on the mount point. */ static int -nfs_reconnect(struct nfsreq *rep) +nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep) { - struct nfsreq *rp; - struct nfsmount *nmp = rep->r_nmp; + struct nfsreq *req; int error; nfs_disconnect(nmp); + if (nmp->nm_rxstate >= NFSSVC_STOPPING) + return (EINTR); while ((error = nfs_connect(nmp, rep)) != 0) { if (error == EINTR || error == ERESTART) return (EINTR); + if (error == EINVAL) + return (error); + if (nmp->nm_rxstate >= NFSSVC_STOPPING) + return (EINTR); (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0); } @@ -402,9 +378,9 @@ nfs_reconnect(struct nfsreq *rep) * on old socket. 
*/ crit_enter(); - TAILQ_FOREACH(rp, &nfs_reqq, r_chain) { - if (rp->r_nmp == nmp) - rp->r_flags |= R_MUSTRESEND; + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { + KKASSERT(req->r_nmp == nmp); + req->r_flags |= R_NEEDSXMIT; } crit_exit(); return (0); } @@ -429,14 +405,9 @@ nfs_disconnect(struct nfsmount *nmp) void nfs_safedisconnect(struct nfsmount *nmp) { - struct nfsreq dummyreq; - - bzero(&dummyreq, sizeof(dummyreq)); - dummyreq.r_nmp = nmp; - dummyreq.r_td = NULL; - nfs_rcvlock(&dummyreq); + nfs_rcvlock(nmp, NULL); nfs_disconnect(nmp); - nfs_rcvunlock(&dummyreq); + nfs_rcvunlock(nmp); } /* @@ -445,7 +416,7 @@ nfs_safedisconnect(struct nfsmount *nmp) * "rep == NULL" indicates that it has been called from a server. * For the client side: * - return EINTR if the RPC is terminated, 0 otherwise - * - set R_MUSTRESEND if the send fails for any reason + * - set R_NEEDSXMIT if the send fails for any reason * - do any cleanup required by recoverable socket errors (?) * For the server side: * - return EINTR or ERESTART if interrupted by a signal @@ -465,14 +436,15 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, return (EINTR); } if ((so = rep->r_nmp->nm_so) == NULL) { - rep->r_flags |= R_MUSTRESEND; + rep->r_flags |= R_NEEDSXMIT; m_freem(top); return (0); } - rep->r_flags &= ~R_MUSTRESEND; + rep->r_flags &= ~R_NEEDSXMIT; soflags = rep->r_nmp->nm_soflags; - } else + } else { soflags = so->so_proto->pr_flags; + } if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) sendnam = NULL; else @@ -490,8 +462,20 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, */ if (error == ENOBUFS && so->so_type == SOCK_DGRAM) { error = 0; - if (rep) /* do backoff retransmit on client */ - rep->r_flags |= R_MUSTRESEND; + /* + * do backoff retransmit on client + */ + if (rep) { + if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) { + rep->r_nmp->nm_state |= NFSSTA_SENDSPACE; + kprintf("Warning: NFS: Insufficient sendspace " + "(%lu),\n" + "\t You must increase vfs.nfs.soreserve " + "or decrease vfs.nfs.maxasyncbio\n", + so->so_snd.ssb_hiwat); + } + rep->r_flags |= R_NEEDSXMIT; + } } if (error) { @@ -504,9 +488,10 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, if (rep->r_flags & R_SOFTTERM) error = EINTR; else - rep->r_flags |= R_MUSTRESEND; + rep->r_flags |= R_NEEDSXMIT; - } else + } else { log(LOG_INFO, "nfsd send error %d\n", error); + } /* * Handle any recoverable (soft) socket errors here. (?) @@ -528,7 +513,8 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, * we have read any of it, even if the system call has been interrupted. */ static int -nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp) +nfs_receive(struct nfsmount *nmp, struct nfsreq *rep, + struct sockaddr **aname, struct mbuf **mp) { struct socket *so; struct sockbuf sio; @@ -546,7 +532,7 @@ nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp) */ *mp = NULL; *aname = NULL; - sotype = rep->r_nmp->nm_sotype; /* * For reliable protocols, lock against other senders/receivers * in case a reconnect is necessary. * For SOCK_STREAM, first get the Record Mark to find out how much * more there is to get. * We must lock the socket against other receivers * until we have an entire rpc request/reply. */ if (sotype != SOCK_DGRAM) { - error = nfs_sndlock(rep); + error = nfs_sndlock(nmp, rep); if (error) return (error); tryagain: @@ -570,33 +556,33 @@ tryagain: * attempt that has essentially shut down this * mount point. 
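(Editor's background note, not from the patch: on stream sockets, ONC RPC messages are delimited by a 4-byte record mark, per RFC 1831. That is the "length marker" the stream-receive code below pulls off the socket before it can soreceive() the message body. A hedged sketch of the decode, with illustrative variable names:)

/*
 * Editor's sketch of RPC record-mark decoding (RFC 1831); "raw_mark"
 * stands in for the 4 bytes just read from the stream.
 */
u_int32_t mark = ntohl(raw_mark);
int last_frag = (mark & 0x80000000) != 0;	/* high bit: last fragment */
long frag_len = (long)(mark & 0x7fffffff);	/* low 31 bits: byte count */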
*/ - if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { - nfs_sndunlock(rep); + if (rep && (rep->r_mrep || (rep->r_flags & R_SOFTTERM))) { + nfs_sndunlock(nmp); return (EINTR); } - so = rep->r_nmp->nm_so; - if (!so) { - error = nfs_reconnect(rep); + so = nmp->nm_so; + if (so == NULL) { + error = nfs_reconnect(nmp, rep); if (error) { - nfs_sndunlock(rep); + nfs_sndunlock(nmp); return (error); } goto tryagain; } - while (rep->r_flags & R_MUSTRESEND) { + while (rep && (rep->r_flags & R_NEEDSXMIT)) { m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT); nfsstats.rpcretries++; error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); if (error) { if (error == EINTR || error == ERESTART || - (error = nfs_reconnect(rep)) != 0) { - nfs_sndunlock(rep); + (error = nfs_reconnect(nmp, rep)) != 0) { + nfs_sndunlock(nmp); return (error); } goto tryagain; } } - nfs_sndunlock(rep); + nfs_sndunlock(nmp); if (sotype == SOCK_STREAM) { /* * Get the length marker from the stream @@ -629,7 +615,7 @@ tryagain: "short receive (%d/%d) from nfs server %s\n", (int)(sizeof(u_int32_t) - auio.uio_resid), (int)sizeof(u_int32_t), - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } if (error) @@ -643,7 +629,7 @@ tryagain: log(LOG_ERR, "%s (%d) from nfs server %s\n", "impossible packet length", len, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + nmp->nm_mountp->mnt_stat.f_mntfromname); error = EFBIG; goto errout; } @@ -661,9 +647,9 @@ tryagain: if (error == 0 && sio.sb_cc != len) { if (sio.sb_cc != 0) log(LOG_INFO, - "short receive (%d/%d) from nfs server %s\n", - len - auio.uio_resid, len, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + "short receive (%zu/%d) from nfs server %s\n", + (size_t)len - auio.uio_resid, len, + nmp->nm_mountp->mnt_stat.f_mntfromname); error = EPIPE; } *mp = sio.sb_mb; @@ -707,19 +693,19 @@ errout: log(LOG_INFO, "receive error %d from nfs server %s\n", error, - rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + nmp->nm_mountp->mnt_stat.f_mntfromname); } - error = nfs_sndlock(rep); + error = nfs_sndlock(nmp, rep); if (!error) { - error = nfs_reconnect(rep); + error = nfs_reconnect(nmp, rep); if (!error) goto tryagain; else - nfs_sndunlock(rep); + nfs_sndunlock(nmp); } } } else { - if ((so = rep->r_nmp->nm_so) == NULL) + if ((so = nmp->nm_so) == NULL) return (EACCES); if (so->so_state & SS_ISCONNECTED) getnam = NULL; @@ -730,19 +716,28 @@ errout: rcvflg = 0; error = so_pru_soreceive(so, getnam, NULL, &sio, NULL, &rcvflg); - if (error == EWOULDBLOCK && + if (error == EWOULDBLOCK && rep && (rep->r_flags & R_SOFTTERM)) { m_freem(sio.sb_mb); return (EINTR); } } while (error == EWOULDBLOCK); + len = sio.sb_cc; *mp = sio.sb_mb; + + /* + * A shutdown may result in no error and no mbuf. + * Convert to EPIPE. + */ + if (*mp == NULL && error == 0) + error = EPIPE; } if (error) { m_freem(*mp); *mp = NULL; } + /* * Search for any mbufs that are not a multiple of 4 bytes long * or with m_data not longword aligned. @@ -755,21 +750,23 @@ errout: /* * Implement receipt of reply on a socket. + * * We must search through the list of received datagrams matching them * with outstanding requests using the xid, until ours is found. + * + * If myrep is NULL we process packets on the socket until + * interrupted or until nm_reqrxq is non-empty. 
*/ /* ARGSUSED */ int -nfs_reply(struct nfsreq *myrep) +nfs_reply(struct nfsmount *nmp, struct nfsreq *myrep) { struct nfsreq *rep; - struct nfsmount *nmp = myrep->r_nmp; - int32_t t1; - struct mbuf *mrep, *md; struct sockaddr *nam; - u_int32_t rxid, *tl; - caddr_t dpos, cp2; + u_int32_t rxid; + u_int32_t *tl; int error; + struct nfsm_info info; /* * Loop around until we get our own reply */ for (;;) { /* * Lock against other receivers so that I don't get stuck in * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. + * * If nfs_rcvlock() returns EALREADY, that means that * the reply has already been received by another * process and we can return immediately. In this * case, the lock is not taken to avoid races with * other processes. */ - error = nfs_rcvlock(myrep); + info.mrep = NULL; + + error = nfs_rcvlock(nmp, myrep); if (error == EALREADY) return (0); if (error) return (error); + + /* + * If myrep is NULL we are the receiver helper thread. + * Stop waiting for incoming replies if there are + * messages sitting on reqrxq that we need to process, + * or if a shutdown request is pending. + */ + if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) || + nmp->nm_rxstate > NFSSVC_PENDING)) { + nfs_rcvunlock(nmp); + return(EWOULDBLOCK); + } + /* * Get the next Rpc reply off the socket + * + * We cannot release the receive lock until we've + * filled in rep->r_mrep, otherwise a waiting + * thread may deadlock in soreceive with no incoming + * packets expected. */ - error = nfs_receive(myrep, &nam, &mrep); - nfs_rcvunlock(myrep); + error = nfs_receive(nmp, myrep, &nam, &info.mrep); if (error) { /* * Ignore routing errors on connectionless protocols?? */ + nfs_rcvunlock(nmp); if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { + if (nmp->nm_so == NULL) + return (error); nmp->nm_so->so_error = 0; - if (myrep->r_flags & R_GETONEREP) - return (0); continue; } return (error); } @@ -814,16 +832,16 @@ /* * Get the xid and check that it is an rpc reply */ - md = mrep; - dpos = mtod(md, caddr_t); - nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED); + info.md = info.mrep; + info.dpos = mtod(info.md, caddr_t); + NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED)); rxid = *tl++; if (*tl != rpc_reply) { nfsstats.rpcinvalid++; - m_freem(mrep); + m_freem(info.mrep); + info.mrep = NULL; nfsmout: - if (myrep->r_flags & R_GETONEREP) - return (0); + nfs_rcvunlock(nmp); continue; } /* @@ -835,28 +853,27 @@ nfsmout: * section. */ crit_enter(); - TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { - if (rep->r_mrep == NULL && rxid == rep->r_xid) { - rep->r_mrep = mrep; + TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) { + if (rep->r_mrep == NULL && rxid == rep->r_xid) break; - } } - crit_exit(); /* * Fill in the rest of the reply if we found a match. + * + * Deal with duplicate responses if there was no match. 
*/ if (rep) { - rep->r_md = md; - rep->r_dpos = dpos; + rep->r_md = info.md; + rep->r_dpos = info.dpos; if (nfsrtton) { struct rttl *rt; rt = &nfsrtt.rttl[nfsrtt.pos]; rt->proc = rep->r_procnum; - rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]); - rt->sent = nmp->nm_sent; - rt->cwnd = nmp->nm_cwnd; + rt->rto = 0; + rt->sent = 0; + rt->cwnd = nmp->nm_maxasync_scaled; rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; @@ -867,29 +884,23 @@ nfsmout: rt->rtt = 1000000; nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; } + /* - * Update congestion window. - * Do the additive increase of - * one rpc/rtt. + * New congestion control is based only on async + * requests. */ - if (nmp->nm_cwnd <= nmp->nm_sent) { - nmp->nm_cwnd += - (NFS_CWNDSCALE * NFS_CWNDSCALE + - (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; - if (nmp->nm_cwnd > NFS_MAXCWND) - nmp->nm_cwnd = NFS_MAXCWND; - } - crit_enter(); /* nfs_timer interlock for nm_sent */ + if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED) + ++nmp->nm_maxasync_scaled; if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; - nmp->nm_sent -= NFS_CWNDSCALE; } - crit_exit(); /* * Update rtt using a gain of 0.125 on the mean * and a gain of 0.25 on the deviation. + * + * NOTE SRTT/SDRTT are only good if R_TIMING is set. */ - if (rep->r_flags & R_TIMING) { + if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) { /* * Since the timer resolution of * NFS_HZ is so coarse, it can often * result in r_rtt values of 0 even though * rtt is between N+dt and N+2-dt ticks, * add 1. */ - t1 = rep->r_rtt + 1; - t1 -= (NFS_SRTT(rep) >> 3); - NFS_SRTT(rep) += t1; - if (t1 < 0) - t1 = -t1; - t1 -= (NFS_SDRTT(rep) >> 2); - NFS_SDRTT(rep) += t1; + int n; + int d; + +#define NFSRSB NFS_RTT_SCALE_BITS + n = ((NFS_SRTT(rep) * 7) + + (rep->r_rtt << NFSRSB)) >> 3; + d = n - NFS_SRTT(rep); + NFS_SRTT(rep) = n; + + /* + * Don't let the jitter calculation decay + * too quickly, but we want a fast rampup. + */ + if (d < 0) + d = -d; + d <<= NFSRSB; + if (d < NFS_SDRTT(rep)) + n = ((NFS_SDRTT(rep) * 15) + d) >> 4; + else + n = ((NFS_SDRTT(rep) * 3) + d) >> 2; + NFS_SDRTT(rep) = n; +#undef NFSRSB } nmp->nm_timeouts = 0; + rep->r_mrep = info.mrep; + nfs_hardterm(rep, 0); + } else { + /* + * Extract vers, prog, nfsver, procnum. A duplicate + * response means we didn't wait long enough so + * we increase the SRTT to avoid future spurious + * timeouts. + */ + u_int procnum = nmp->nm_lastreprocnum; + int n; + + if (procnum < NFS_NPROCS && proct[procnum]) { + if (nfs_showrexmit) + kprintf("D"); + n = nmp->nm_srtt[proct[procnum]]; + n += NFS_ASYSCALE * NFS_HZ; + if (n < NFS_ASYSCALE * NFS_HZ * 10) + n = NFS_ASYSCALE * NFS_HZ * 10; + nmp->nm_srtt[proct[procnum]] = n; + } } + nfs_rcvunlock(nmp); + crit_exit(); + /* * If not matched to a request, drop it. * If it's mine, get out. */ if (rep == NULL) { nfsstats.rpcunexpected++; - m_freem(mrep); + m_freem(info.mrep); + info.mrep = NULL; } else if (rep == myrep) { if (rep->r_mrep == NULL) panic("nfsreply nil"); return (0); } - if (myrep->r_flags & R_GETONEREP) - return (0); } } +/* + * Run the request state machine until the target state is reached + * or a fatal error occurs. The target state is not run. Specifying + * a target of NFSM_STATE_DONE runs the state machine until the rpc + * is complete. + * + * EINPROGRESS is returned for all states other than the DONE state, + * indicating that the rpc is still in progress. 
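(Editor's worked example, not in the patch, tracing the gain-0.125 smoothing above with NFS_RTT_SCALE_BITS == 8:)

/*
 * Editor's worked example for the SRTT/SDRTT update above.  Assume
 * NFS_SRTT(rep) == 320 (1.25 ticks) and a reply measured at
 * rep->r_rtt == 2 ticks:
 *
 *	n = (320 * 7 + (2 << 8)) >> 3 = (2240 + 512) >> 3 = 344
 *	d = 344 - 320 = 24;  d <<= 8;		now d == 6144
 *
 * The mean moved 1/8 of the way toward the sample (gain 0.125).
 * With NFS_SDRTT(rep) == 4096, d is not below SDRTT, so the fast
 * ramp-up branch applies:
 *
 *	SDRTT = (4096 * 3 + 6144) >> 2 = 4608
 */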
+ */ +int +nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate) +{ + struct nfsreq *req; + + while (info->state >= bstate && info->state < estate) { + switch(info->state) { + case NFSM_STATE_SETUP: + /* + * Setup the nfsreq. Any error which occurs during + * this state is fatal. + */ + info->error = nfs_request_setup(info); + if (info->error) { + info->state = NFSM_STATE_DONE; + return (info->error); + } else { + req = info->req; + req->r_mrp = &info->mrep; + req->r_mdp = &info->md; + req->r_dposp = &info->dpos; + info->state = NFSM_STATE_AUTH; + } + break; + case NFSM_STATE_AUTH: + /* + * Authenticate the nfsreq. Any error which occurs + * during this state is fatal. + */ + info->error = nfs_request_auth(info->req); + if (info->error) { + info->state = NFSM_STATE_DONE; + return (info->error); + } else { + info->state = NFSM_STATE_TRY; + } + break; + case NFSM_STATE_TRY: + /* + * Transmit or retransmit attempt. An error in this + * state is ignored and we always move on to the + * next state. + * + * This can trivially race the receiver if the + * request is asynchronous. nfs_request_try() + * will thus set the state for us and we + * must also return immediately if we are + * running an async state machine, because + * info can become invalid due to races after + * try() returns. + */ + if (info->req->r_flags & R_ASYNC) { + nfs_request_try(info->req); + if (estate == NFSM_STATE_WAITREPLY) + return (EINPROGRESS); + } else { + nfs_request_try(info->req); + info->state = NFSM_STATE_WAITREPLY; + } + break; + case NFSM_STATE_WAITREPLY: + /* + * Wait for a reply or timeout and move on to the + * next state. The error returned by this state + * is passed to the processing code in the next + * state. + */ + info->error = nfs_request_waitreply(info->req); + info->state = NFSM_STATE_PROCESSREPLY; + break; + case NFSM_STATE_PROCESSREPLY: + /* + * Process the reply or timeout. Errors which occur + * in this state may cause the state machine to + * go back to an earlier state, and are fatal + * otherwise. + */ + info->error = nfs_request_processreply(info, + info->error); + switch(info->error) { + case ENEEDAUTH: + info->state = NFSM_STATE_AUTH; + break; + case EAGAIN: + info->state = NFSM_STATE_TRY; + break; + default: + /* + * Operation complete, with or without an + * error. We are done. + */ + info->req = NULL; + info->state = NFSM_STATE_DONE; + return (info->error); + } + break; + case NFSM_STATE_DONE: + /* + * Shouldn't be reached + */ + return (info->error); + /* NOT REACHED */ + } + } + + /* + * If we are done return the error code (if any). + * Otherwise return EINPROGRESS. 
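(Editor's usage sketch, not a verbatim caller from the tree: per the comment above, a synchronous caller runs the machine all the way to NFSM_STATE_DONE, while an async caller stops at NFSM_STATE_WAITREPLY and lets the nfsiod threads finish the job. Field setup is abbreviated; all names used come from this diff.)

/*
 * Editor's sketch of driving the state machine.
 */
struct nfsm_info info;

info.state = NFSM_STATE_SETUP;
/* ... fill in info.vp, info.procnum, info.cred, info.mreq ... */
info.bio = NULL;			/* NULL => synchronous request */
error = nfs_request(&info, NFSM_STATE_SETUP, NFSM_STATE_DONE);

/*
 * An async caller sets info.bio and stops at the wait state; an
 * EINPROGRESS return means the iod reader thread now owns the
 * request and will run NFSM_STATE_PROCESSREPLY itself.
 */
error = nfs_request(&info, NFSM_STATE_SETUP, NFSM_STATE_WAITREPLY);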
+ */ + if (info->state == NFSM_STATE_DONE) + return (info->error); + return (EINPROGRESS); +} + /* * nfs_request - goes something like this * - fill in request struct @@ -935,64 +1107,89 @@ nfsmout: * by mrep or error * nb: always frees up mreq mbuf list */ -int -nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum, - struct thread *td, struct ucred *cred, struct mbuf **mrp, - struct mbuf **mdp, caddr_t *dposp) +static int +nfs_request_setup(nfsm_info_t info) { - struct mbuf *mrep, *m2; - struct nfsreq *rep; - u_int32_t *tl; - int i; + struct nfsreq *req; struct nfsmount *nmp; - struct mbuf *m, *md, *mheadend; - char nickv[RPCX_NICKVERF]; - time_t waituntil; - caddr_t dpos, cp2; - int t1, error = 0, mrest_len, auth_len, auth_type; - int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0; - int verf_len, verf_type; - u_int32_t xid; - char *auth_str, *verf_str; - NFSKERBKEY_T key; /* save session key */ + struct mbuf *m; + int i; - /* Reject requests while attempting a forced unmount. */ - if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) { - m_freem(mrest); + /* + * Reject requests while attempting a forced unmount. + */ + if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) { + m_freem(info->mreq); + info->mreq = NULL; return (ESTALE); } - nmp = VFSTONFS(vp->v_mount); - MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); - rep->r_nmp = nmp; - rep->r_vp = vp; - rep->r_td = td; - rep->r_procnum = procnum; - rep->r_mreq = NULL; + nmp = VFSTONFS(info->vp->v_mount); + req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); + req->r_nmp = nmp; + req->r_vp = info->vp; + req->r_td = info->td; + req->r_procnum = info->procnum; + req->r_mreq = NULL; + req->r_cred = info->cred; + i = 0; - m = mrest; + m = info->mreq; while (m) { i += m->m_len; m = m->m_next; } - mrest_len = i; + req->r_mrest = info->mreq; + req->r_mrest_len = i; + + /* + * The presence of a non-NULL r_info in req indicates + * async completion via our helper threads. See the receiver + * code. + */ + if (info->bio) { + req->r_info = info; + req->r_flags = R_ASYNC; + } else { + req->r_info = NULL; + req->r_flags = 0; + } + info->req = req; + return(0); +} + +static int +nfs_request_auth(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + struct mbuf *m; + char nickv[RPCX_NICKVERF]; + int error = 0, auth_len, auth_type; + int verf_len; + u_int32_t xid; + char *auth_str, *verf_str; + struct ucred *cred; + + cred = rep->r_cred; + rep->r_failed_auth = 0; /* * Get the RPC header with authorization. 
*/ -kerbauth: verf_str = auth_str = NULL; if (nmp->nm_flag & NFSMNT_KERB) { verf_str = nickv; verf_len = sizeof (nickv); auth_type = RPCAUTH_KERB4; - bzero((caddr_t)key, sizeof (key)); - if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str, - &auth_len, verf_str, verf_len)) { + bzero((caddr_t)rep->r_key, sizeof(rep->r_key)); + if (rep->r_failed_auth || + nfs_getnickauth(nmp, cred, &auth_str, &auth_len, + verf_str, verf_len)) { error = nfs_getauth(nmp, rep, cred, &auth_str, - &auth_len, verf_str, &verf_len, key); + &auth_len, verf_str, &verf_len, rep->r_key); if (error) { + m_freem(rep->r_mrest); + rep->r_mrest = NULL; kfree((caddr_t)rep, M_NFSREQ); - m_freem(mrest); return (error); } } @@ -1004,8 +1201,10 @@ kerbauth: nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + 5 * NFSX_UNSIGNED; } - m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, - auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid); + m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type, + auth_len, auth_str, verf_len, verf_str, + rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid); + rep->r_mrest = NULL; if (auth_str) kfree(auth_str, M_TEMP); @@ -1023,16 +1222,37 @@ kerbauth: } rep->r_mreq = m; rep->r_xid = xid; -tryagain: + return (0); +} + +static int +nfs_request_try(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + struct mbuf *m2; + int error; + + /* + * Request is not on any queue, only the owner has access to it + * so it should not be locked by anyone atm. + * + * Interlock to prevent races. While locked the only remote + * action possible is for r_mrep to be set (once we enqueue it). + */ + if (rep->r_flags == 0xdeadc0de) { + print_backtrace(-1); + panic("flags nbad\n"); + } + KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0); if (nmp->nm_flag & NFSMNT_SOFT) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ rep->r_rtt = rep->r_rexmit = 0; - if (proct[procnum] > 0) - rep->r_flags = R_TIMING | R_MASKTIMER; + if (proct[rep->r_procnum] > 0) + rep->r_flags |= R_TIMING | R_LOCKED; else - rep->r_flags = R_MASKTIMER; + rep->r_flags |= R_LOCKED; rep->r_mrep = NULL; /* @@ -1040,117 +1260,198 @@ tryagain: */ nfsstats.rpcrequests++; + if (nmp->nm_flag & NFSMNT_FORCE) { + rep->r_flags |= R_SOFTTERM; + rep->r_flags &= ~R_LOCKED; + return (0); + } + /* * Chain request into list of outstanding requests. Be sure * to put it LAST so timer finds oldest requests first. Note - * that R_MASKTIMER is set at the moment to prevent any timer - * action on this request while we are still doing processing on - * it below. splsoftclock() primarily protects nm_sent. Note - * that we may block in this code so there is no atomicy guarentee. + * that our control of R_LOCKED prevents the request from + * getting ripped out from under us or transmitted by the + * timer code. + * + * For requests with info structures we must atomically set the + * info's state because the structure could become invalid upon + * return due to races (i.e., if async) */ crit_enter(); - TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); + mtx_link_init(&rep->r_link); + TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain); + rep->r_flags |= R_ONREQQ; + ++nmp->nm_reqqlen; + if (rep->r_flags & R_ASYNC) + rep->r_info->state = NFSM_STATE_WAITREPLY; + crit_exit(); + + error = 0; /* - * If backing off another request or avoiding congestion, don't - * send this one now but let timer do it. If not timing a request, - * do it now. 
- * - * Even though the timer will not mess with our request there is - * still the possibility that we will race a reply (which clears - * R_SENT), especially on localhost connections, so be very careful - * when setting R_SENT. We could set R_SENT prior to calling - * nfs_send() but why bother if the response occurs that quickly? + * Send if we can. Congestion control is not handled here any more + * because trying to defer the initial send based on the nfs_timer + * requires having a very fast nfs_timer, which is silly. */ - if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || - (nmp->nm_flag & NFSMNT_DUMBTIMR) || - nmp->nm_sent < nmp->nm_cwnd)) { + if (nmp->nm_so) { if (nmp->nm_soflags & PR_CONNREQUIRED) - error = nfs_sndlock(rep); - if (!error) { - m2 = m_copym(m, 0, M_COPYALL, MB_WAIT); + error = nfs_sndlock(nmp, rep); + if (error == 0) { + m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT); error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); if (nmp->nm_soflags & PR_CONNREQUIRED) - nfs_sndunlock(rep); - } - if (!error && (rep->r_flags & R_MUSTRESEND) == 0 && - rep->r_mrep == NULL) { - KASSERT((rep->r_flags & R_SENT) == 0, - ("R_SENT ASSERT %p", rep)); - nmp->nm_sent += NFS_CWNDSCALE; - rep->r_flags |= R_SENT; + nfs_sndunlock(nmp); + rep->r_flags &= ~R_NEEDSXMIT; + if ((rep->r_flags & R_SENT) == 0) { + rep->r_flags |= R_SENT; + } + } else { + rep->r_flags |= R_NEEDSXMIT; } } else { + rep->r_flags |= R_NEEDSXMIT; rep->r_rtt = -1; } + if (error == EPIPE) + error = 0; /* - * Let the timer do what it will with the request, then - * wait for the reply from our send or the timer's. + * Release the lock. The only remote action that may have occurred + * would have been the setting of rep->r_mrep. If this occurred + * and the request was async we have to move it to the reader + * thread's queue for action. + * + * For async requests also make sure the reader is woken up so + * it gets on the socket to read responses. */ - if (!error || error == EPIPE) { - rep->r_flags &= ~R_MASKTIMER; - crit_exit(); - error = nfs_reply(rep); - crit_enter(); + crit_enter(); + if (rep->r_flags & R_ASYNC) { + if (rep->r_mrep) + nfs_hardterm(rep, 1); + rep->r_flags &= ~R_LOCKED; + nfssvc_iod_reader_wakeup(nmp); + } else { + rep->r_flags &= ~R_LOCKED; + } + if (rep->r_flags & R_WANTED) { + rep->r_flags &= ~R_WANTED; + wakeup(rep); } + crit_exit(); + return (error); +} + +/* + * This code is only called for synchronous requests. Completed synchronous + * requests are left on reqq and we remove them before moving on to the + * processing state. + */ +static int +nfs_request_waitreply(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + int error; + + KKASSERT((rep->r_flags & R_ASYNC) == 0); + + /* + * Wait until the request is finished. + */ + error = nfs_reply(nmp, rep); /* * RPC done, unlink the request, but don't rip it out from under * the callout timer. + * + * Once unlinked no other receiver or the timer will have + * visibility, so we do not have to set R_LOCKED. */ + crit_enter(); while (rep->r_flags & R_LOCKED) { - nfs_timer_raced = 1; - tsleep(&nfs_timer_raced, 0, "nfstrac", 0); + rep->r_flags |= R_WANTED; + tsleep(rep, 0, "nfstrac", 0); + } + KKASSERT(rep->r_flags & R_ONREQQ); + TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain); + rep->r_flags &= ~R_ONREQQ; + --nmp->nm_reqqlen; + if (TAILQ_FIRST(&nmp->nm_bioq) && + nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) { + nfssvc_iod_writer_wakeup(nmp); } - TAILQ_REMOVE(&nfs_reqq, rep, r_chain); + crit_exit(); /* * Decrement the outstanding request count. 
*/ if (rep->r_flags & R_SENT) { rep->r_flags &= ~R_SENT; - nmp->nm_sent -= NFS_CWNDSCALE; } - crit_exit(); + return (error); +} + +/* + * Process reply with error returned from nfs_request_waitreply(). + * + * Returns EAGAIN if it wants us to loop up to nfs_request_try() again. + * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again. + */ +static int +nfs_request_processreply(nfsm_info_t info, int error) +{ + struct nfsreq *req = info->req; + struct nfsmount *nmp = req->r_nmp; + u_int32_t *tl; + int verf_type; + int i; /* * If there was a successful reply and a tprintf msg. * tprintf a response. */ - if (!error && (rep->r_flags & R_TPRINTFMSG)) - nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname, + if (error == 0 && (req->r_flags & R_TPRINTFMSG)) { + nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname, "is alive again"); - mrep = rep->r_mrep; - md = rep->r_md; - dpos = rep->r_dpos; + } + info->mrep = req->r_mrep; + info->md = req->r_md; + info->dpos = req->r_dpos; if (error) { - m_freem(rep->r_mreq); - kfree((caddr_t)rep, M_NFSREQ); + m_freem(req->r_mreq); + req->r_mreq = NULL; + kfree(req, M_NFSREQ); + info->req = NULL; return (error); } /* * break down the rpc header and check if ok */ - nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); + NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED)); if (*tl++ == rpc_msgdenied) { - if (*tl == rpc_mismatch) + if (*tl == rpc_mismatch) { error = EOPNOTSUPP; - else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { - if (!failed_auth) { - failed_auth++; - mheadend->m_next = NULL; - m_freem(mrep); - m_freem(rep->r_mreq); - goto kerbauth; - } else + } else if ((nmp->nm_flag & NFSMNT_KERB) && + *tl++ == rpc_autherr) { + if (req->r_failed_auth == 0) { + req->r_failed_auth++; + req->r_mheadend->m_next = NULL; + m_freem(info->mrep); + info->mrep = NULL; + m_freem(req->r_mreq); + return (ENEEDAUTH); + } else { error = EAUTH; - } else + } + } else { error = EACCES; - m_freem(mrep); - m_freem(rep->r_mreq); - kfree((caddr_t)rep, M_NFSREQ); + } + m_freem(info->mrep); + info->mrep = NULL; + m_freem(req->r_mreq); + req->r_mreq = NULL; + kfree(req, M_NFSREQ); + info->req = NULL; return (error); } @@ -1160,29 +1461,32 @@ tryagain: verf_type = fxdr_unsigned(int, *tl++); i = fxdr_unsigned(int32_t, *tl); if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) { - error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep); + error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key, + &info->md, &info->dpos, info->mrep); if (error) goto nfsmout; - } else if (i > 0) - nfsm_adv(nfsm_rndup(i)); - nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); + } else if (i > 0) { + ERROROUT(nfsm_adv(info, nfsm_rndup(i))); + } + NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED)); /* 0 == ok */ if (*tl == 0) { - nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); + NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED)); if (*tl != 0) { error = fxdr_unsigned(int, *tl); + + /* + * Does anyone even implement this? Just impose + * a 1-second delay. 
+ */ if ((nmp->nm_flag & NFSMNT_NFSV3) && error == NFSERR_TRYLATER) { - m_freem(mrep); + m_freem(info->mrep); + info->mrep = NULL; error = 0; - waituntil = time_second + trylater_delay; - while (time_second < waituntil) - (void) tsleep((caddr_t)&lbolt, - 0, "nqnfstry", 0); - trylater_delay *= nfs_backoff[trylater_cnt]; - if (trylater_cnt < 7) - trylater_cnt++; - goto tryagain; + + tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0); + return (EAGAIN); /* goto tryagain */ } /* @@ -1193,6 +1497,7 @@ tryagain: * release the vnode lock if we hold it. */ if (error == ESTALE) { + struct vnode *vp = req->r_vp; int ltype; ltype = lockstatus(&vp->v_lock, curthread); @@ -1203,29 +1508,37 @@ tryagain: lockmgr(&vp->v_lock, ltype); } if (nmp->nm_flag & NFSMNT_NFSV3) { - *mrp = mrep; - *mdp = md; - *dposp = dpos; + KKASSERT(*req->r_mrp == info->mrep); + KKASSERT(*req->r_mdp == info->md); + KKASSERT(*req->r_dposp == info->dpos); error |= NFSERR_RETERR; - } else - m_freem(mrep); - m_freem(rep->r_mreq); - kfree((caddr_t)rep, M_NFSREQ); + } else { + m_freem(info->mrep); + info->mrep = NULL; + } + m_freem(req->r_mreq); + req->r_mreq = NULL; + kfree(req, M_NFSREQ); + info->req = NULL; return (error); } - *mrp = mrep; - *mdp = md; - *dposp = dpos; - m_freem(rep->r_mreq); - FREE((caddr_t)rep, M_NFSREQ); + KKASSERT(*req->r_mrp == info->mrep); + KKASSERT(*req->r_mdp == info->md); + KKASSERT(*req->r_dposp == info->dpos); + m_freem(req->r_mreq); + req->r_mreq = NULL; + FREE(req, M_NFSREQ); return (0); } - m_freem(mrep); + m_freem(info->mrep); + info->mrep = NULL; error = EPROTONOSUPPORT; nfsmout: - m_freem(rep->r_mreq); - kfree((caddr_t)rep, M_NFSREQ); + m_freem(req->r_mreq); + req->r_mreq = NULL; + kfree(req, M_NFSREQ); + info->req = NULL; return (error); } @@ -1239,22 +1552,21 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp) { u_int32_t *tl; - struct mbuf *mreq; - caddr_t bpos; - struct mbuf *mb, *mb2; + struct nfsm_info info; siz += RPC_REPLYSIZ; - mb = mreq = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL); - mreq->m_pkthdr.len = 0; + info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL); + info.mreq = info.mb; + info.mreq->m_pkthdr.len = 0; /* * If this is not a cluster, try and leave leading space * for the lower level headers. 
*/ if ((max_hdr + siz) < MINCLSIZE) - mreq->m_data += max_hdr; - tl = mtod(mreq, u_int32_t *); - mreq->m_len = 6 * NFSX_UNSIGNED; - bpos = ((caddr_t)tl) + mreq->m_len; + info.mreq->m_data += max_hdr; + tl = mtod(info.mreq, u_int32_t *); + info.mreq->m_len = 6 * NFSX_UNSIGNED; + info.bpos = ((caddr_t)tl) + info.mreq->m_len; *tl++ = txdr_unsigned(nd->nd_retxid); *tl++ = rpc_reply; if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) { *tl++ = rpc_msgdenied; if (err & NFSERR_AUTHERR) { *tl++ = rpc_autherr; *tl = txdr_unsigned(err & ~NFSERR_AUTHERR); - mreq->m_len -= NFSX_UNSIGNED; - bpos -= NFSX_UNSIGNED; + info.mreq->m_len -= NFSX_UNSIGNED; + info.bpos -= NFSX_UNSIGNED; } else { *tl++ = rpc_mismatch; *tl++ = txdr_unsigned(RPC_VER2); @@ -1299,12 +1611,15 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, */ #ifdef NFSKERB XXX +#else + ktvout.tv_sec = 0; + ktvout.tv_usec = 0; #endif *tl++ = rpc_auth_kerb; *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED); *tl = ktvout.tv_sec; - nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED); + tl = nfsm_build(&info, 3 * NFSX_UNSIGNED); *tl++ = ktvout.tv_usec; *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid); } else { @@ -1321,7 +1636,7 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, break; case EPROGMISMATCH: *tl = txdr_unsigned(RPC_PROGMISMATCH); - nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); + tl = nfsm_build(&info, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(2); *tl = txdr_unsigned(3); break; @@ -1334,7 +1649,7 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, default: *tl = 0; if (err != NFSERR_RETVOID) { - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); + tl = nfsm_build(&info, NFSX_UNSIGNED); if (err) *tl = txdr_unsigned(nfsrv_errmap(nd, err)); else @@ -1345,9 +1660,9 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, } if (mrq != NULL) - *mrq = mreq; - *mbp = mb; - *bposp = bpos; + *mrq = info.mreq; + *mbp = info.mb; + *bposp = info.bpos; if (err != 0 && err != NFSERR_RETVOID) nfsstats.srvrpc_errs++; return (0); @@ -1355,122 +1670,48 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, #endif /* NFS_NOSERVER */ + /* - * Nfs timer routine + * Nfs timer routine. + * * Scan the nfsreq list and retransmit any requests that have timed out * To avoid retransmission attempts on STREAM sockets (in the future) make * sure to set the r_retry field to 0 (implies nm_retry == 0). + * + * Requests with attached responses, terminated requests, and + * locked requests are ignored. Locked requests will be picked up + * in a later timer call. 
*/ void nfs_timer(void *arg /* never used */) { - struct nfsreq *rep; - struct mbuf *m; - struct socket *so; struct nfsmount *nmp; - int timeo; - int error; + struct nfsreq *req; #ifndef NFS_NOSERVER struct nfssvc_sock *slp; u_quad_t cur_usec; #endif /* NFS_NOSERVER */ - struct thread *td = &thread0; /* XXX for credentials, will break if sleep */ crit_enter(); - TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { - nmp = rep->r_nmp; - if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER))) - continue; - rep->r_flags |= R_LOCKED; - if (nfs_sigintr(nmp, rep, rep->r_td)) { - nfs_softterm(rep); - goto skip; - } - if (rep->r_rtt >= 0) { - rep->r_rtt++; - if (nmp->nm_flag & NFSMNT_DUMBTIMR) - timeo = nmp->nm_timeo; - else - timeo = NFS_RTO(nmp, proct[rep->r_procnum]); - if (nmp->nm_timeouts > 0) - timeo *= nfs_backoff[nmp->nm_timeouts - 1]; - if (rep->r_rtt <= timeo) - goto skip; - if (nmp->nm_timeouts < 8) - nmp->nm_timeouts++; - } - /* - * Check for server not responding - */ - if ((rep->r_flags & R_TPRINTFMSG) == 0 && - rep->r_rexmit > nmp->nm_deadthresh) { - nfs_msg(rep->r_td, - nmp->nm_mountp->mnt_stat.f_mntfromname, - "not responding"); - rep->r_flags |= R_TPRINTFMSG; - } - if (rep->r_rexmit >= rep->r_retry) { /* too many */ - nfsstats.rpctimeouts++; - nfs_softterm(rep); - goto skip; - } - if (nmp->nm_sotype != SOCK_DGRAM) { - if (++rep->r_rexmit > NFS_MAXREXMIT) - rep->r_rexmit = NFS_MAXREXMIT; - goto skip; - } - if ((so = nmp->nm_so) == NULL) - goto skip; - - /* - * If there is enough space and the window allows.. - * Resend it - * Set r_rtt to -1 in case we fail to send it now. - */ - rep->r_rtt = -1; - if (ssb_space(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && - ((nmp->nm_flag & NFSMNT_DUMBTIMR) || - (rep->r_flags & R_SENT) || - nmp->nm_sent < nmp->nm_cwnd) && - (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){ - if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) - error = so_pru_send(so, 0, m, NULL, NULL, td); - else - error = so_pru_send(so, 0, m, nmp->nm_nam, - NULL, td); - if (error) { - if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) - so->so_error = 0; - } else if (rep->r_mrep == NULL) { - /* - * Iff first send, start timing - * else turn timing off, backoff timer - * and divide congestion window by 2. - * - * It is possible for the so_pru_send() to - * block and for us to race a reply so we - * only do this if the reply field has not - * been filled in. R_LOCKED will prevent - * the request from being ripped out from under - * us entirely. 
- */ - if (rep->r_flags & R_SENT) { - rep->r_flags &= ~R_TIMING; - if (++rep->r_rexmit > NFS_MAXREXMIT) - rep->r_rexmit = NFS_MAXREXMIT; - nmp->nm_cwnd >>= 1; - if (nmp->nm_cwnd < NFS_CWNDSCALE) - nmp->nm_cwnd = NFS_CWNDSCALE; - nfsstats.rpcretries++; - } else { - rep->r_flags |= R_SENT; - nmp->nm_sent += NFS_CWNDSCALE; - } - rep->r_rtt = 0; + TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { + KKASSERT(nmp == req->r_nmp); + if (req->r_mrep) + continue; + if (req->r_flags & (R_SOFTTERM | R_LOCKED)) + continue; + req->r_flags |= R_LOCKED; + if (nfs_sigintr(nmp, req, req->r_td)) { + nfs_softterm(req, 1); + } else { + nfs_timer_req(req); + } + req->r_flags &= ~R_LOCKED; + if (req->r_flags & R_WANTED) { + req->r_flags &= ~R_WANTED; + wakeup(req); } } -skip: - rep->r_flags &= ~R_LOCKED; } #ifndef NFS_NOSERVER @@ -1484,23 +1725,169 @@ skip: nfsrv_wakenfsd(slp, 1); } #endif /* NFS_NOSERVER */ + crit_exit(); + callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL); +} + +static +void +nfs_timer_req(struct nfsreq *req) +{ + struct thread *td = &thread0; /* XXX for creds, will break if sleep */ + struct nfsmount *nmp = req->r_nmp; + struct mbuf *m; + struct socket *so; + int timeo; + int error; + + /* + * rtt ticks and timeout calculation. Return if the timeout + * has not been reached yet, unless the packet is flagged + * for an immediate send. + * + * The mean rtt doesn't help when we get random I/Os, we have + * to multiply by fairly large numbers. + */ + if (req->r_rtt >= 0) { + /* + * Calculate the timeout to test against. + */ + req->r_rtt++; + if (nmp->nm_flag & NFSMNT_DUMBTIMR) { + timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS; + } else if (req->r_flags & R_TIMING) { + timeo = NFS_SRTT(req) + NFS_SDRTT(req); + } else { + timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS; + } + timeo *= multt[req->r_procnum]; + /* timeo is still scaled by SCALE_BITS */ + +#define NFSFS (NFS_RTT_SCALE * NFS_HZ) + if (req->r_flags & R_TIMING) { + static long last_time; + if (nfs_showrtt && last_time != time_second) { + kprintf("rpccmd %d NFS SRTT %d SDRTT %d " + "timeo %d.%03d\n", + proct[req->r_procnum], + NFS_SRTT(req), NFS_SDRTT(req), + timeo / NFSFS, + timeo % NFSFS * 1000 / NFSFS); + last_time = time_second; + } + } +#undef NFSFS + + /* + * deal with nfs_timer jitter. + */ + timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1; + if (timeo < 2) + timeo = 2; + + if (nmp->nm_timeouts > 0) + timeo *= nfs_backoff[nmp->nm_timeouts - 1]; + if (timeo > NFS_MAXTIMEO) + timeo = NFS_MAXTIMEO; + if (req->r_rtt <= timeo) { + if ((req->r_flags & R_NEEDSXMIT) == 0) + return; + } else if (nmp->nm_timeouts < 8) { + nmp->nm_timeouts++; + } + } /* - * Due to possible blocking, a client operation may be waiting for - * us to finish processing this request so it can remove it. + * Check for server not responding */ - if (nfs_timer_raced) { - nfs_timer_raced = 0; - wakeup(&nfs_timer_raced); + if ((req->r_flags & R_TPRINTFMSG) == 0 && + req->r_rexmit > nmp->nm_deadthresh) { + nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname, + "not responding"); + req->r_flags |= R_TPRINTFMSG; + } + if (req->r_rexmit >= req->r_retry) { /* too many */ + nfsstats.rpctimeouts++; + nfs_softterm(req, 1); + return; + } + + /* + * Generally disable retransmission on reliable sockets, + * unless the request is flagged for immediate send. 
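(Editor's worked numbers, not in the patch, for the timeout computation above, continuing the SRTT/SDRTT values from the earlier example:)

/*
 * Editor's worked example.  With NFS_SRTT(req) == 344,
 * NFS_SDRTT(req) == 4608 and multt[req->r_procnum] == 1:
 *
 *	timeo = 344 + 4608 = 4952			still scaled
 *	timeo = (4952 >> NFS_RTT_SCALE_BITS) + 1 = 20	ticks
 *
 * After three prior timeouts on the mount, nfs_backoff[2] == 5
 * stretches that to 100 ticks, clipped at NFS_MAXTIMEO.
 */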
+ */ + if (nmp->nm_sotype != SOCK_DGRAM) { + if (++req->r_rexmit > NFS_MAXREXMIT) + req->r_rexmit = NFS_MAXREXMIT; + if ((req->r_flags & R_NEEDSXMIT) == 0) + return; + } + + /* + * Stop here if we do not have a socket! + */ + if ((so = nmp->nm_so) == NULL) + return; + + /* + * If there is enough space and the window allows.. resend it. + * + * r_rtt is left intact in case we get an answer after the + * retry that was a reply to the original packet. + */ + if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len && + (req->r_flags & (R_SENT | R_NEEDSXMIT)) && + (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){ + if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) + error = so_pru_send(so, 0, m, NULL, NULL, td); + else + error = so_pru_send(so, 0, m, nmp->nm_nam, + NULL, td); + if (error) { + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) + so->so_error = 0; + req->r_flags |= R_NEEDSXMIT; + } else if (req->r_mrep == NULL) { + /* + * Iff first send, start timing + * else turn timing off, backoff timer + * and divide congestion window by 2. + * + * It is possible for the so_pru_send() to + * block and for us to race a reply so we + * only do this if the reply field has not + * been filled in. R_LOCKED will prevent + * the request from being ripped out from under + * us entirely. + * + * Record the last resent procnum to aid us + * in duplicate detection on receive. + */ + if ((req->r_flags & R_NEEDSXMIT) == 0) { + if (nfs_showrexmit) + kprintf("X"); + if (++req->r_rexmit > NFS_MAXREXMIT) + req->r_rexmit = NFS_MAXREXMIT; + nmp->nm_maxasync_scaled >>= 1; + if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED) + nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED; + nfsstats.rpcretries++; + nmp->nm_lastreprocnum = req->r_procnum; + } else { + req->r_flags |= R_SENT; + req->r_flags &= ~R_NEEDSXMIT; + } + } } - crit_exit(); - callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL); } /* * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and * wait for all requests to complete. This is used by forced unmounts * to terminate any outstanding RPCs. + * + * Locked requests cannot be canceled but will be marked for + * soft-termination. */ int nfs_nmcancelreqs(struct nfsmount *nmp) @@ -1509,18 +1896,17 @@ nfs_nmcancelreqs(struct nfsmount *nmp) int i; crit_enter(); - TAILQ_FOREACH(req, &nfs_reqq, r_chain) { - if (nmp != req->r_nmp || req->r_mrep != NULL || - (req->r_flags & R_SOFTTERM)) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { + if (req->r_mrep != NULL || (req->r_flags & R_SOFTTERM)) continue; - } - nfs_softterm(req); + nfs_softterm(req, 0); } + /* XXX the other two queues as well */ crit_exit(); for (i = 0; i < 30; i++) { crit_enter(); - TAILQ_FOREACH(req, &nfs_reqq, r_chain) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { if (nmp == req->r_nmp) break; } @@ -1533,23 +1919,65 @@ nfs_nmcancelreqs(struct nfsmount *nmp) } /* - * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT). - * The nm_send count is decremented now to avoid deadlocks when the process in - * soreceive() hasn't yet managed to send its own request. + * Soft-terminate a request, effectively marking it as failed. * - * This routine must be called at splsoftclock() to protect r_flags and - * nm_sent. + * Must be called from within a critical section. */ - static void -nfs_softterm(struct nfsreq *rep) +nfs_softterm(struct nfsreq *rep, int islocked) { rep->r_flags |= R_SOFTTERM; + nfs_hardterm(rep, islocked); +} +/* + * Hard-terminate a request, typically after getting a response. 
+ * + * The state machine can still decide to re-issue it later if necessary. + * + * Must be called from within a critical section. + */ +static void +nfs_hardterm(struct nfsreq *rep, int islocked) +{ + struct nfsmount *nmp = rep->r_nmp; + + /* + * The nm_send count is decremented now to avoid deadlocks + * when the process in soreceive() hasn't yet managed to send + * its own request. + */ if (rep->r_flags & R_SENT) { - rep->r_nmp->nm_sent -= NFS_CWNDSCALE; rep->r_flags &= ~R_SENT; } + + /* + * If we locked the request or nobody else has locked the request, + * and the request is async, we can move it to the reader thread's + * queue now and fix up the state. + * + * If we locked the request or nobody else has locked the request, + * we can wake up anyone blocked waiting for a response on the + * request. + */ + if (islocked || (rep->r_flags & R_LOCKED) == 0) { + if ((rep->r_flags & (R_ONREQQ | R_ASYNC)) == + (R_ONREQQ | R_ASYNC)) { + rep->r_flags &= ~R_ONREQQ; + TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain); + --nmp->nm_reqqlen; + TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain); + KKASSERT(rep->r_info->state == NFSM_STATE_TRY || + rep->r_info->state == NFSM_STATE_WAITREPLY); + rep->r_info->state = NFSM_STATE_PROCESSREPLY; + nfssvc_iod_reader_wakeup(nmp); + if (TAILQ_FIRST(&nmp->nm_bioq) && + nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) { + nfssvc_iod_writer_wakeup(nmp); + } + } + mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link); + } } /* @@ -1591,9 +2019,9 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td) * in progress when a reconnect is necessary. */ int -nfs_sndlock(struct nfsreq *rep) +nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &nmp->nm_txlock; struct thread *td; int slptimeo; int slpflag; @@ -1601,30 +2029,29 @@ nfs_sndlock(struct nfsreq *rep) slpflag = 0; slptimeo = 0; - td = rep->r_td; - if (rep->r_nmp->nm_flag & NFSMNT_INT) + td = rep ? rep->r_td : NULL; + if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; - error = 0; - crit_enter(); - while (*statep & NFSSTA_SNDLOCK) { - *statep |= NFSSTA_WANTSND; - if (nfs_sigintr(rep->r_nmp, rep, td)) { + while ((error = mtx_lock_ex_try(mtx)) != 0) { + if (nfs_sigintr(nmp, rep, td)) { error = EINTR; break; } - tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo); + error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo); + if (error == 0) + break; if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* Always fail if our request has been cancelled. */ - if ((rep->r_flags & R_SOFTTERM)) + if (rep && (rep->r_flags & R_SOFTTERM)) { + if (error == 0) + mtx_unlock(mtx); error = EINTR; - if (error == 0) - *statep |= NFSSTA_SNDLOCK; - crit_exit(); + } return (error); } @@ -1632,25 +2059,20 @@ nfs_sndlock(struct nfsreq *rep) * Unlock the stream socket for others. */ void -nfs_sndunlock(struct nfsreq *rep) +nfs_sndunlock(struct nfsmount *nmp) { - int *statep = &rep->r_nmp->nm_state; - - if ((*statep & NFSSTA_SNDLOCK) == 0) - panic("nfs sndunlock"); - crit_enter(); - *statep &= ~NFSSTA_SNDLOCK; - if (*statep & NFSSTA_WANTSND) { - *statep &= ~NFSSTA_WANTSND; - wakeup((caddr_t)statep); - } - crit_exit(); + mtx_unlock(&nmp->nm_txlock); } +/* + * Lock the receiver side of the socket. + * + * rep may be NULL. 
+ */ static int -nfs_rcvlock(struct nfsreq *rep) +nfs_rcvlock(struct nfsmount *nmp, struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &nmp->nm_rxlock; int slpflag; int slptimeo; int error; @@ -1664,34 +2086,46 @@ * We do not strictly need the second check just before the * tsleep(), but it's good defensive programming. */ - if (rep->r_mrep != NULL) + if (rep && rep->r_mrep != NULL) return (EALREADY); - if (rep->r_nmp->nm_flag & NFSMNT_INT) + if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; else slpflag = 0; slptimeo = 0; - error = 0; - crit_enter(); - while (*statep & NFSSTA_RCVLOCK) { - if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) { + + while ((error = mtx_lock_ex_try(mtx)) != 0) { + if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) { error = EINTR; break; } - if (rep->r_mrep != NULL) { + if (rep && rep->r_mrep != NULL) { error = EALREADY; break; } - *statep |= NFSSTA_WANTRCV; - tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo); + + /* + * NOTE: can return ENOLCK, but in that case rep->r_mrep + * will already be set. + */ + if (rep) { + error = mtx_lock_ex_link(mtx, &rep->r_link, + "nfsrcvlk", + slpflag, slptimeo); + } else { + error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo); + } + if (error == 0) + break; + /* * If our reply was received while we were sleeping, * then just return without taking the lock to avoid a * situation where a single iod could 'capture' the * receive lock. */ - if (rep->r_mrep != NULL) { + if (rep && rep->r_mrep != NULL) { error = EALREADY; break; } @@ -1701,10 +2135,11 @@ } } if (error == 0) { - *statep |= NFSSTA_RCVLOCK; - rep->r_nmp->nm_rcvlock_td = curthread; /* DEBUGGING */ + if (rep && rep->r_mrep != NULL) { + error = EALREADY; + mtx_unlock(mtx); + } } - crit_exit(); return (error); } @@ -1712,66 +2147,54 @@ * Unlock the stream socket for others. */ static void -nfs_rcvunlock(struct nfsreq *rep) +nfs_rcvunlock(struct nfsmount *nmp) { - int *statep = &rep->r_nmp->nm_state; - - if ((*statep & NFSSTA_RCVLOCK) == 0) - panic("nfs rcvunlock"); - crit_enter(); - rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */ - *statep &= ~NFSSTA_RCVLOCK; - if (*statep & NFSSTA_WANTRCV) { - *statep &= ~NFSSTA_WANTRCV; - wakeup((caddr_t)statep); - } - crit_exit(); + mtx_unlock(&nmp->nm_rxlock); } /* - * nfs_realign: + * nfs_realign: * - * Check for badly aligned mbuf data and realign by copying the unaligned - * portion of the data into a new mbuf chain and freeing the portions - * of the old chain that were replaced. + * Check for badly aligned mbuf data and realign by copying the unaligned + * portion of the data into a new mbuf chain and freeing the portions + * of the old chain that were replaced. * - * We cannot simply realign the data within the existing mbuf chain - * because the underlying buffers may contain other rpc commands and - * we cannot afford to overwrite them. + * We cannot simply realign the data within the existing mbuf chain + * because the underlying buffers may contain other rpc commands and + * we cannot afford to overwrite them. * - * We would prefer to avoid this situation entirely. The situation does - * not occur with NFS/UDP and is supposed to only occassionally occur - * with TCP. + * We would prefer to avoid this situation entirely. The situation does + * not occur with NFS/UDP and is supposed to only occasionally occur + * with TCP. 
 /*
- * nfs_realign:
+ * nfs_realign:
  *
- * Check for badly aligned mbuf data and realign by copying the unaligned
- * portion of the data into a new mbuf chain and freeing the portions
- * of the old chain that were replaced.
+ *	Check for badly aligned mbuf data and realign by copying the unaligned
+ *	portion of the data into a new mbuf chain and freeing the portions
+ *	of the old chain that were replaced.
  *
- * We cannot simply realign the data within the existing mbuf chain
- * because the underlying buffers may contain other rpc commands and
- * we cannot afford to overwrite them.
+ *	We cannot simply realign the data within the existing mbuf chain
+ *	because the underlying buffers may contain other rpc commands and
+ *	we cannot afford to overwrite them.
  *
- * We would prefer to avoid this situation entirely.  The situation does
- * not occur with NFS/UDP and is supposed to only occasionally occur
- * with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
+ *	We would prefer to avoid this situation entirely.  The situation does
+ *	not occur with NFS/UDP and is supposed to only occasionally occur
+ *	with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
+ *
+ *	NOTE!  MB_DONTWAIT cannot be used here.  The mbufs must be acquired
+ *	       because the rpc request OR reply cannot be thrown away.  TCP NFS
+ *	       mounts do not retry their RPCs unless the TCP connection itself
+ *	       is dropped so throwing away a RPC will basically cause the NFS
+ *	       operation to lockup indefinitely.
  */
 static void
 nfs_realign(struct mbuf **pm, int hsiz)
 {
 	struct mbuf *m;
 	struct mbuf *n = NULL;
-	int off = 0;
 
+	/*
+	 * Check for misalignment
+	 */
 	++nfs_realign_test;
-
 	while ((m = *pm) != NULL) {
-		if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
-			n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
-			n->m_len = 0;
+		if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3))
 			break;
-		}
 		pm = &m->m_next;
 	}
 
 	/*
-	 * If n is non-NULL, loop on m copying data, then replace the
-	 * portion of the chain that had to be realigned.
+	 * If misalignment found make a completely new copy.
 	 */
-	if (n != NULL) {
+	if (m) {
 		++nfs_realign_count;
-		while (m) {
-			m_copyback(n, off, m->m_len, mtod(m, caddr_t));
-			off += m->m_len;
-			m = m->m_next;
-		}
+		n = m_dup_data(m, MB_WAIT);
 		m_freem(*pm);
 		*pm = n;
 	}
@@ -1789,32 +2212,33 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 {
 	int len, i;
 	u_int32_t *tl;
-	int32_t t1;
 	struct uio uio;
 	struct iovec iov;
-	caddr_t dpos, cp2, cp;
+	caddr_t cp;
 	u_int32_t nfsvers, auth_type;
 	uid_t nickuid;
 	int error = 0, ticklen;
-	struct mbuf *mrep, *md;
 	struct nfsuid *nuidp;
 	struct timeval tvin, tvout;
+	struct nfsm_info info;
 #if 0				/* until encrypted keys are implemented */
 	NFSKERBKEYSCHED_T keys;	/* stores key schedule */
 #endif
 
-	mrep = nd->nd_mrep;
-	md = nd->nd_md;
-	dpos = nd->nd_dpos;
+	info.mrep = nd->nd_mrep;
+	info.md = nd->nd_md;
+	info.dpos = nd->nd_dpos;
+
 	if (has_header) {
-		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
+		NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED));
 		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
 		if (*tl++ != rpc_call) {
-			m_freem(mrep);
+			m_freem(info.mrep);
 			return (EBADRPC);
 		}
-	} else
-		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
+	} else {
+		NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED));
+	}
 	nd->nd_repstat = 0;
 	nd->nd_flag = 0;
 	if (*tl++ != rpc_vers) {
@@ -1851,7 +2275,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 	auth_type = *tl++;
 	len = fxdr_unsigned(int, *tl++);
 	if (len < 0 || len > RPCAUTH_MAXSIZ) {
-		m_freem(mrep);
+		m_freem(info.mrep);
 		return (EBADRPC);
 	}
 
@@ -1862,21 +2286,23 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 	if (auth_type == rpc_auth_unix) {
 		len = fxdr_unsigned(int, *++tl);
 		if (len < 0 || len > NFS_MAXNAMLEN) {
-			m_freem(mrep);
+			m_freem(info.mrep);
 			return (EBADRPC);
 		}
-		nfsm_adv(nfsm_rndup(len));
-		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+		ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
+		NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
 		bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
 		nd->nd_cr.cr_ref = 1;
 		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
+		nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid;
 		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
+		nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid;
 		len = fxdr_unsigned(int, *tl);
 		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
-			m_freem(mrep);
+			m_freem(info.mrep);
 			return (EBADRPC);
 		}
-		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
+		NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED));
 		for (i = 1; i <= len; i++)
 			if (i < NGROUPS)
 				nd->nd_cr.cr_groups[i] =
 					fxdr_unsigned(gid_t, *tl++);
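The alignment rule the realign loop applies is simply that XDR data must start on a 4-byte boundary and span a whole number of 32-bit words. A standalone sketch of the same test and recovery, with ordinary heap memory standing in for mbufs and m_dup_data() approximated by malloc+memcpy (illustrative only):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* XDR works in 32-bit words, so both the address and the length matter. */
static int
xdr_misaligned(const void *base, size_t len)
{
	return (((uintptr_t)base & 0x3) != 0 || (len & 0x3) != 0);
}

/* Copy into fresh, naturally aligned storage, as m_dup_data() does. */
static void *
realign_copy(const void *base, size_t len)
{
	void *n = malloc(len);	/* malloc returns aligned memory */

	if (n != NULL)
		memcpy(n, base, len);
	return (n);
}

Copying into a fresh buffer, rather than shifting bytes in place, matches the comment above: the underlying buffers may be shared with other RPCs and cannot be overwritten.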
@@ -1887,11 +2313,12 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 		nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
 		len = fxdr_unsigned(int, *++tl);
 		if (len < 0 || len > RPCAUTH_MAXSIZ) {
-			m_freem(mrep);
+			m_freem(info.mrep);
 			return (EBADRPC);
 		}
-		if (len > 0)
-			nfsm_adv(nfsm_rndup(len));
+		if (len > 0) {
+			ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
+		}
 	} else if (auth_type == rpc_auth_kerb) {
 		switch (fxdr_unsigned(int, *tl++)) {
 		case RPCAKN_FULLNAME:
@@ -1900,7 +2327,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 			uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
 			nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
 			if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
-				m_freem(mrep);
+				m_freem(info.mrep);
 				return (EBADRPC);
 			}
 			uio.uio_offset = 0;
@@ -1909,8 +2336,8 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 			uio.uio_segflg = UIO_SYSSPACE;
 			iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
 			iov.iov_len = RPCAUTH_MAXSIZ - 4;
-			nfsm_mtouio(&uio, uio.uio_resid);
-			nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+			ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid));
+			NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
 			if (*tl++ != rpc_auth_kerb ||
 				fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
 				kprintf("Bad kerb verifier\n");
@@ -1918,7 +2345,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 				nd->nd_procnum = NFSPROC_NOOP;
 				return (0);
 			}
-			nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
+			NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED));
 			tl = (u_int32_t *)cp;
 			if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
 				kprintf("Not fullname kerb verifier\n");
@@ -1940,7 +2367,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 				return (0);
 			}
 			nickuid = fxdr_unsigned(uid_t, *tl);
-			nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+			NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
 			if (*tl++ != rpc_auth_kerb ||
 				fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
 				kprintf("Kerb nick verifier bad\n");
@@ -1948,7 +2375,7 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 				nd->nd_procnum = NFSPROC_NOOP;
 				return (0);
 			}
-			nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+			NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
 			tvin.tv_sec = *tl++;
 			tvin.tv_usec = *tl;
 
@@ -1973,6 +2400,9 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 			 */
 #ifdef NFSKERB
 			XXX
+#else
+			tvout.tv_sec = 0;
+			tvout.tv_usec = 0;
 #endif
 
 			tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
@@ -1996,8 +2426,8 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
 		return (0);
 	}
 
-	nd->nd_md = md;
-	nd->nd_dpos = dpos;
+	nd->nd_md = info.md;
+	nd->nd_dpos = info.dpos;
 	return (0);
 nfsmout:
 	return (error);
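For reference, the AUTH_UNIX decode above follows the authsys_parms layout of RFC 5531: a stamp, the machine name (opaque data padded to a 4-byte boundary), then uid, gid and up to 16 supplementary gids, each a big-endian 32-bit XDR word. The sketch below shows the same uid/gid extraction against a plain buffer; it is an illustration only, not the kernel decoder, and on the wire fxdr_unsigned() reduces to ntohl():

#include <arpa/inet.h>
#include <stdint.h>

static const uint32_t *
decode_authunix_ids(const uint32_t *tl, uint32_t *uid, uint32_t *gid,
		    uint32_t *ngroups)
{
	*uid = ntohl(*tl++);		/* becomes nd_cr.cr_uid above */
	*gid = ntohl(*tl++);		/* becomes nd_cr.cr_gid */
	*ngroups = ntohl(*tl++);	/* checked against RPCAUTH_UNIXGIDS */
	return (tl);			/* the gid array follows */
}

The bounds checks in the kernel code (len against NFS_MAXNAMLEN, RPCAUTH_UNIXGIDS and RPCAUTH_MAXSIZ) are what keep a malformed credential from walking the dissect pointer off the end of the mbuf chain.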