gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1991, 1993, 1995
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* This code is derived from software contributed to Berkeley by
	6	* Rick Macklem at The University of Guelph.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. All advertising materials mentioning features or use of this software
	17	* must display the following acknowledgement:
	18	* This product includes software developed by the University of
	19	* California, Berkeley and its contributors.
	20	* 4. Neither the name of the University nor the names of its contributors
	21	* may be used to endorse or promote products derived from this software
	22	* without specific prior written permission.
	23	*
	24	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	25	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	26	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	27	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	28	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	29	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	30	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	31	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	32	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	33	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
	37	* $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
	38	* $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.45 2007/05/18 17:05:13 dillon Exp $
	39	*/
	40
	41	/*
	42	* Socket operations for use by nfs
	43	*/
	44
	45	#include <sys/param.h>
	46	#include <sys/systm.h>
	47	#include <sys/proc.h>
	48	#include <sys/malloc.h>
	49	#include <sys/mount.h>
	50	#include <sys/kernel.h>
	51	#include <sys/mbuf.h>
	52	#include <sys/vnode.h>
	53	#include <sys/fcntl.h>
	54	#include <sys/protosw.h>
	55	#include <sys/resourcevar.h>
	56	#include <sys/socket.h>
	57	#include <sys/socketvar.h>
	58	#include <sys/socketops.h>
	59	#include <sys/syslog.h>
	60	#include <sys/thread.h>
	61	#include <sys/tprintf.h>
	62	#include <sys/sysctl.h>
	63	#include <sys/signalvar.h>
	64
	65	#include <sys/signal2.h>
	66	#include <sys/mutex2.h>
	67	#include <sys/socketvar2.h>
	68
	69	#include <netinet/in.h>
	70	#include <netinet/tcp.h>
	71	#include <sys/thread2.h>
	72
	73	#include "rpcv2.h"
	74	#include "nfsproto.h"
	75	#include "nfs.h"
	76	#include "xdr_subs.h"
	77	#include "nfsm_subs.h"
	78	#include "nfsmount.h"
	79	#include "nfsnode.h"
	80	#include "nfsrtt.h"
	81
	82	#define TRUE 1
	83	#define FALSE 0
	84
	85	/*
	86	* RTT calculations are scaled by 256 (8 bits). A proper fractional
	87	* RTT will still be calculated even with a slow NFS timer.
		*
		* NFS_SRTT(r) and NFS_SDRTT(r) name the nm_srtt[] / nm_sdrtt[] slot of
		* request r's mount, selected through proct[] by the request's
		* procedure number. Both expand to lvalues: nfs_reply() reads them
		* and assigns the updated estimate back through them.
	88	*/
	89	#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum]]
	90	#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]]
	91	#define NFS_RTT_SCALE_BITS 8 /* bits */
	92	#define NFS_RTT_SCALE 256 /* value */
	93
	94	/*
	95	* Defines which timer to use for the procnum.
	96	* 0 - default
	97	* 1 - getattr
	98	* 2 - lookup
	99	* 3 - read
	100	* 4 - write
		* 5 - used only by procedure 21 (commit, going by nfsrv3_procs[])
		*
		* The table is indexed by procedure number; the value is the index
		* into nm_srtt[] / nm_sdrtt[] used by NFS_SRTT() / NFS_SDRTT().
	101	*/
	102	static int proct[NFS_NPROCS] = {
	103	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, /* 00-09 */
	104	0, 0, 0, 0, 0, 0, 3, 3, 0, 0, /* 10-19 */
	105	0, 5, 0, 0, 0, 0, /* 20-25 */
	106	};
	107
		/*
		* Per-procedure multiplier, indexed by procedure number like proct[].
		* Every entry is 1 except procedure 21, which is 2.
		* NOTE(review): no use of multt[] is visible in this part of the
		* file; presumably it scales a timeout -- confirm against the timer
		* code.
		*/
	108	static int multt[NFS_NPROCS] = {
	109	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-09 */
	110	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-19 */
	111	1, 2, 1, 1, 1, 1, /* 20-29 */
	112	};
	113
		/*
		* File-scope tunables and counters.
		* nfs_showrexmit gates the "D" console marker nfs_reply() prints when
		* a reply matches no outstanding request. nfs_maxasyncbio is the
		* value the "Insufficient sendspace" warning in nfs_send() asks the
		* administrator to decrease.
		* NOTE(review): nfs_backoff[], nfs_realign_test, nfs_realign_count
		* and nfs_showrtt are not referenced in the part of the file shown
		* here -- confirm their use elsewhere.
		*/
	114	static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
	115	static int nfs_realign_test;
	116	static int nfs_realign_count;
	117	static int nfs_showrtt;
	118	static int nfs_showrexmit;
	119	int nfs_maxasyncbio = NFS_MAXASYNCBIO;
	120
	121	SYSCTL_DECL(_vfs_nfs);
	122
		/* Read-write integer sysctls under _vfs_nfs, one per variable above. */
	123	SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
	124	SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
	125	SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
	126	SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
	127	SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, "");
	128
	129	static int nfs_request_setup(nfsm_info_t info);
	130	static int nfs_request_auth(struct nfsreq *rep);
	131	static int nfs_request_try(struct nfsreq *rep);
	132	static int nfs_request_waitreply(struct nfsreq *rep);
	133	static int nfs_request_processreply(nfsm_info_t info, int);
	134
	135	int nfsrtton = 0;
	136	struct nfsrtt nfsrtt;
	137	struct callout nfs_timer_handle;
	138
	139	static int nfs_msg (struct thread ,char ,char *);
	140	static int nfs_rcvlock (struct nfsmount nmp, struct nfsreq myreq);
	141	static void nfs_rcvunlock (struct nfsmount *nmp);
	142	static void nfs_realign (struct mbuf **pm, int hsiz);
	143	static int nfs_receive (struct nfsmount nmp, struct nfsreq rep,
	144	struct sockaddr aname, struct mbuf mp);
	145	static void nfs_softterm (struct nfsreq *rep, int islocked);
	146	static void nfs_hardterm (struct nfsreq *rep, int islocked);
	147	static int nfs_reconnect (struct nfsmount nmp, struct nfsreq rep);
	148	#ifndef NFS_NOSERVER
	149	static int nfsrv_getstream (struct nfssvc_sock , int, int );
	150	static void nfs_timer_req(struct nfsreq *req);
	151	static void nfs_checkpkt(struct mbuf *m, int len);
	152
	153	int (nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript nd,
	154	struct nfssvc_sock *slp,
	155	struct thread *td,
	156	struct mbuf **mreqp) = {
	157	nfsrv_null,
	158	nfsrv_getattr,
	159	nfsrv_setattr,
	160	nfsrv_lookup,
	161	nfsrv3_access,
	162	nfsrv_readlink,
	163	nfsrv_read,
	164	nfsrv_write,
	165	nfsrv_create,
	166	nfsrv_mkdir,
	167	nfsrv_symlink,
	168	nfsrv_mknod,
	169	nfsrv_remove,
	170	nfsrv_rmdir,
	171	nfsrv_rename,
	172	nfsrv_link,
	173	nfsrv_readdir,
	174	nfsrv_readdirplus,
	175	nfsrv_statfs,
	176	nfsrv_fsinfo,
	177	nfsrv_pathconf,
	178	nfsrv_commit,
	179	nfsrv_noop,
	180	nfsrv_noop,
	181	nfsrv_noop,
	182	nfsrv_noop
	183	};
	184	#endif /* NFS_NOSERVER */
	185
	186	/*
	187	* Initialize sockets and congestion for a new NFS connection.
	188	* We do not free the sockaddr if error.
	189	*/
	190	int
	191	nfs_connect(struct nfsmount nmp, struct nfsreq rep)
	192	{
	193	struct socket *so;
	194	int error;
	195	struct sockaddr *saddr;
	196	struct sockaddr_in *sin;
	197	struct thread td = &thread0; / only used for socreate and sobind */
	198
	199	nmp->nm_so = so = NULL;
	200	if (nmp->nm_flag & NFSMNT_FORCE)
	201	return (EINVAL);
	202	saddr = nmp->nm_nam;
	203	error = socreate(saddr->sa_family, &so, nmp->nm_sotype,
	204	nmp->nm_soproto, td);
	205	if (error)
	206	goto bad;
	207	nmp->nm_soflags = so->so_proto->pr_flags;
	208
	209	/*
	210	* Some servers require that the client port be a reserved port number.
	211	*/
	212	if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
	213	struct sockopt sopt;
	214	int ip;
	215	struct sockaddr_in ssin;
	216
	217	bzero(&sopt, sizeof sopt);
	218	ip = IP_PORTRANGE_LOW;
	219	sopt.sopt_level = IPPROTO_IP;
	220	sopt.sopt_name = IP_PORTRANGE;
	221	sopt.sopt_val = (void *)&ip;
	222	sopt.sopt_valsize = sizeof(ip);
	223	sopt.sopt_td = NULL;
	224	error = sosetopt(so, &sopt);
	225	if (error)
	226	goto bad;
	227	bzero(&ssin, sizeof ssin);
	228	sin = &ssin;
	229	sin->sin_len = sizeof (struct sockaddr_in);
	230	sin->sin_family = AF_INET;
	231	sin->sin_addr.s_addr = INADDR_ANY;
	232	sin->sin_port = htons(0);
	233	error = sobind(so, (struct sockaddr *)sin, td);
	234	if (error)
	235	goto bad;
	236	bzero(&sopt, sizeof sopt);
	237	ip = IP_PORTRANGE_DEFAULT;
	238	sopt.sopt_level = IPPROTO_IP;
	239	sopt.sopt_name = IP_PORTRANGE;
	240	sopt.sopt_val = (void *)&ip;
	241	sopt.sopt_valsize = sizeof(ip);
	242	sopt.sopt_td = NULL;
	243	error = sosetopt(so, &sopt);
	244	if (error)
	245	goto bad;
	246	}
	247
	248	/*
	249	* Protocols that do not require connections may be optionally left
	250	* unconnected for servers that reply from a port other than NFS_PORT.
	251	*/
	252	if (nmp->nm_flag & NFSMNT_NOCONN) {
	253	if (nmp->nm_soflags & PR_CONNREQUIRED) {
	254	error = ENOTCONN;
	255	goto bad;
	256	}
	257	} else {
	258	error = soconnect(so, nmp->nm_nam, td);
	259	if (error)
	260	goto bad;
	261
	262	/*
	263	* Wait for the connection to complete. Cribbed from the
	264	* connect system call but with the wait timing out so
	265	* that interruptible mounts don't hang here for a long time.
	266	*/
	267	crit_enter();
	268	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
	269	(void) tsleep((caddr_t)&so->so_timeo, 0,
	270	"nfscon", 2 * hz);
	271	if ((so->so_state & SS_ISCONNECTING) &&
	272	so->so_error == 0 && rep &&
	273	(error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
	274	soclrstate(so, SS_ISCONNECTING);
	275	crit_exit();
	276	goto bad;
	277	}
	278	}
	279	if (so->so_error) {
	280	error = so->so_error;
	281	so->so_error = 0;
	282	crit_exit();
	283	goto bad;
	284	}
	285	crit_exit();
	286	}
	287	so->so_rcv.ssb_timeo = (5 * hz);
	288	so->so_snd.ssb_timeo = (5 * hz);
	289
	290	/*
	291	* Get buffer reservation size from sysctl, but impose reasonable
	292	* limits.
	293	*/
	294	if (nmp->nm_sotype == SOCK_STREAM) {
	295	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
	296	struct sockopt sopt;
	297	int val;
	298
	299	bzero(&sopt, sizeof sopt);
	300	sopt.sopt_level = SOL_SOCKET;
	301	sopt.sopt_name = SO_KEEPALIVE;
	302	sopt.sopt_val = &val;
	303	sopt.sopt_valsize = sizeof val;
	304	val = 1;
	305	sosetopt(so, &sopt);
	306	}
	307	if (so->so_proto->pr_protocol == IPPROTO_TCP) {
	308	struct sockopt sopt;
	309	int val;
	310
	311	bzero(&sopt, sizeof sopt);
	312	sopt.sopt_level = IPPROTO_TCP;
	313	sopt.sopt_name = TCP_NODELAY;
	314	sopt.sopt_val = &val;
	315	sopt.sopt_valsize = sizeof val;
	316	val = 1;
	317	sosetopt(so, &sopt);
	318
	319	bzero(&sopt, sizeof sopt);
	320	sopt.sopt_level = IPPROTO_TCP;
	321	sopt.sopt_name = TCP_FASTKEEP;
	322	sopt.sopt_val = &val;
	323	sopt.sopt_valsize = sizeof val;
	324	val = 1;
	325	sosetopt(so, &sopt);
	326	}
	327	}
	328	error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL);
	329	if (error)
	330	goto bad;
	331	atomic_set_int(&so->so_rcv.ssb_flags, SSB_NOINTR);
	332	atomic_set_int(&so->so_snd.ssb_flags, SSB_NOINTR);
	333
	334	/* Initialize other non-zero congestion variables */
	335	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
	336	nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS);
	337	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
	338	nmp->nm_sdrtt[3] = 0;
	339	nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
	340	nmp->nm_timeouts = 0;
	341
	342	/*
	343	* Assign nm_so last. The moment nm_so is assigned the nfs_timer()
	344	* can mess with the socket.
	345	*/
	346	nmp->nm_so = so;
	347	return (0);
	348
	349	bad:
	350	if (so) {
	351	soshutdown(so, SHUT_RDWR);
	352	soclose(so, FNONBLOCK);
	353	}
	354	return (error);
	355	}
	356
	357	/*
	358	* Reconnect routine:
	359	* Called when a connection is broken on a reliable protocol.
	360	* - clean up the old socket
	361	* - nfs_connect() again
	362	* - set R_NEEDSXMIT for all outstanding requests on mount point
	363	* If this fails the mount point is DEAD!
	364	* nb: Must be called with the nfs_sndlock() set on the mount point.
	365	*/
	366	static int
	367	nfs_reconnect(struct nfsmount nmp, struct nfsreq rep)
	368	{
	369	struct nfsreq *req;
	370	int error;
	371
	372	nfs_disconnect(nmp);
	373	if (nmp->nm_rxstate >= NFSSVC_STOPPING)
	374	return (EINTR);
	375	while ((error = nfs_connect(nmp, rep)) != 0) {
	376	if (error == EINTR \|\| error == ERESTART)
	377	return (EINTR);
	378	if (error == EINVAL)
	379	return (error);
	380	if (nmp->nm_rxstate >= NFSSVC_STOPPING)
	381	return (EINTR);
	382	(void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
	383	}
	384
	385	/*
	386	* Loop through outstanding request list and fix up all requests
	387	* on old socket.
	388	*/
	389	crit_enter();
	390	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
	391	KKASSERT(req->r_nmp == nmp);
	392	req->r_flags \|= R_NEEDSXMIT;
	393	}
	394	crit_exit();
	395	return (0);
	396	}
	397
	398	/*
	399	* NFS disconnect. Clean up and unlink.
	400	*/
	401	void
	402	nfs_disconnect(struct nfsmount *nmp)
	403	{
	404	struct socket *so;
	405
	406	if (nmp->nm_so) {
	407	so = nmp->nm_so;
	408	nmp->nm_so = NULL;
	409	soshutdown(so, SHUT_RDWR);
	410	soclose(so, FNONBLOCK);
	411	}
	412	}
	413
	414	void
	415	nfs_safedisconnect(struct nfsmount *nmp)
	416	{
	417	nfs_rcvlock(nmp, NULL);
	418	nfs_disconnect(nmp);
	419	nfs_rcvunlock(nmp);
	420	}
	421
	422	/*
	423	* This is the nfs send routine. For connection based socket types, it
	424	* must be called with an nfs_sndlock() on the socket.
	425	* "rep == NULL" indicates that it has been called from a server.
	426	* For the client side:
	427	* - return EINTR if the RPC is terminated, 0 otherwise
	428	* - set R_NEEDSXMIT if the send fails for any reason
	429	* - do any cleanup required by recoverable socket errors (?)
	430	* For the server side:
	431	* - return EINTR or ERESTART if interrupted by a signal
	432	* - return EPIPE if a connection is lost for connection based sockets (TCP...)
	433	* - do any cleanup required by recoverable socket errors (?)
	434	*/
	435	int
	436	nfs_send(struct socket so, struct sockaddr nam, struct mbuf *top,
	437	struct nfsreq *rep)
	438	{
	439	struct sockaddr *sendnam;
	440	int error, soflags, flags;
	441
	442	if (rep) {
	443	if (rep->r_flags & R_SOFTTERM) {
	444	m_freem(top);
	445	return (EINTR);
	446	}
	447	if ((so = rep->r_nmp->nm_so) == NULL) {
	448	rep->r_flags \|= R_NEEDSXMIT;
	449	m_freem(top);
	450	return (0);
	451	}
	452	rep->r_flags &= ~R_NEEDSXMIT;
	453	soflags = rep->r_nmp->nm_soflags;
	454	} else {
	455	soflags = so->so_proto->pr_flags;
	456	}
	457	if ((soflags & PR_CONNREQUIRED) \|\| (so->so_state & SS_ISCONNECTED))
	458	sendnam = NULL;
	459	else
	460	sendnam = nam;
	461	if (so->so_type == SOCK_SEQPACKET)
	462	flags = MSG_EOR;
	463	else
	464	flags = 0;
	465
	466	/*
	467	* calls pru_sosend -> sosend -> so_pru_send -> netrpc
	468	*/
	469	error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
	470	curthread /XXX/);
	471
	472	/*
	473	* ENOBUFS for dgram sockets is transient and non fatal.
	474	* No need to log, and no need to break a soft mount.
	475	*/
	476	if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
	477	error = 0;
	478	/*
	479	* do backoff retransmit on client
	480	*/
	481	if (rep) {
	482	if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) {
	483	rep->r_nmp->nm_state \|= NFSSTA_SENDSPACE;
	484	kprintf("Warning: NFS: Insufficient sendspace "
	485	"(%lu),\n"
	486	"\t You must increase vfs.nfs.soreserve"
	487	"or decrease vfs.nfs.maxasyncbio\n",
	488	so->so_snd.ssb_hiwat);
	489	}
	490	rep->r_flags \|= R_NEEDSXMIT;
	491	}
	492	}
	493
	494	if (error) {
	495	if (rep) {
	496	log(LOG_INFO, "nfs send error %d for server %s\n",error,
	497	rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
	498	/*
	499	* Deal with errors for the client side.
	500	*/
	501	if (rep->r_flags & R_SOFTTERM)
	502	error = EINTR;
	503	else
	504	rep->r_flags \|= R_NEEDSXMIT;
	505	} else {
	506	log(LOG_INFO, "nfsd send error %d\n", error);
	507	}
	508
	509	/*
	510	* Handle any recoverable (soft) socket errors here. (?)
	511	*/
	512	if (error != EINTR && error != ERESTART &&
	513	error != EWOULDBLOCK && error != EPIPE)
	514	error = 0;
	515	}
	516	return (error);
	517	}
	518
	519	/*
	520	* Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
	521	* done by soreceive(), but for SOCK_STREAM we must deal with the Record
	522	* Mark and consolidate the data into a new mbuf list.
	523	* nb: Sometimes TCP passes the data up to soreceive() in long lists of
	524	* small mbufs.
	525	* For SOCK_STREAM we must be very careful to read an entire record once
	526	* we have read any of it, even if the system call has been interrupted.
	527	*/
	528	static int
	529	nfs_receive(struct nfsmount nmp, struct nfsreq rep,
	530	struct sockaddr aname, struct mbuf mp)
	531	{
	532	struct socket *so;
	533	struct sockbuf sio;
	534	struct uio auio;
	535	struct iovec aio;
	536	struct mbuf *m;
	537	struct mbuf *control;
	538	u_int32_t len;
	539	struct sockaddr **getnam;
	540	int error, sotype, rcvflg;
	541	struct thread td = curthread; / XXX */
	542
	543	/*
	544	* Set up arguments for soreceive()
	545	*/
	546	*mp = NULL;
	547	*aname = NULL;
	548	sotype = nmp->nm_sotype;
	549
	550	/*
	551	* For reliable protocols, lock against other senders/receivers
	552	* in case a reconnect is necessary.
	553	* For SOCK_STREAM, first get the Record Mark to find out how much
	554	* more there is to get.
	555	* We must lock the socket against other receivers
	556	* until we have an entire rpc request/reply.
	557	*/
	558	if (sotype != SOCK_DGRAM) {
	559	error = nfs_sndlock(nmp, rep);
	560	if (error)
	561	return (error);
	562	tryagain:
	563	/*
	564	* Check for fatal errors and resending request.
	565	*/
	566	/*
	567	* Ugh: If a reconnect attempt just happened, nm_so
	568	* would have changed. NULL indicates a failed
	569	* attempt that has essentially shut down this
	570	* mount point.
	571	*/
	572	if (rep && (rep->r_mrep \|\| (rep->r_flags & R_SOFTTERM))) {
	573	nfs_sndunlock(nmp);
	574	return (EINTR);
	575	}
	576	so = nmp->nm_so;
	577	if (so == NULL) {
	578	error = nfs_reconnect(nmp, rep);
	579	if (error) {
	580	nfs_sndunlock(nmp);
	581	return (error);
	582	}
	583	goto tryagain;
	584	}
	585	while (rep && (rep->r_flags & R_NEEDSXMIT)) {
	586	m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
	587	nfsstats.rpcretries++;
	588	error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
	589	if (error) {
	590	if (error == EINTR \|\| error == ERESTART \|\|
	591	(error = nfs_reconnect(nmp, rep)) != 0) {
	592	nfs_sndunlock(nmp);
	593	return (error);
	594	}
	595	goto tryagain;
	596	}
	597	}
	598	nfs_sndunlock(nmp);
	599	if (sotype == SOCK_STREAM) {
	600	/*
	601	* Get the length marker from the stream
	602	*/
	603	aio.iov_base = (caddr_t)&len;
	604	aio.iov_len = sizeof(u_int32_t);
	605	auio.uio_iov = &aio;
	606	auio.uio_iovcnt = 1;
	607	auio.uio_segflg = UIO_SYSSPACE;
	608	auio.uio_rw = UIO_READ;
	609	auio.uio_offset = 0;
	610	auio.uio_resid = sizeof(u_int32_t);
	611	auio.uio_td = td;
	612	do {
	613	rcvflg = MSG_WAITALL;
	614	error = so_pru_soreceive(so, NULL, &auio, NULL,
	615	NULL, &rcvflg);
	616	if (error == EWOULDBLOCK && rep) {
	617	if (rep->r_flags & R_SOFTTERM)
	618	return (EINTR);
	619	}
	620	} while (error == EWOULDBLOCK);
	621
	622	if (error == 0 && auio.uio_resid > 0) {
	623	/*
	624	* Only log short packets if not EOF
	625	*/
	626	if (auio.uio_resid != sizeof(u_int32_t))
	627	log(LOG_INFO,
	628	"short receive (%d/%d) from nfs server %s\n",
	629	(int)(sizeof(u_int32_t) - auio.uio_resid),
	630	(int)sizeof(u_int32_t),
	631	nmp->nm_mountp->mnt_stat.f_mntfromname);
	632	error = EPIPE;
	633	}
	634	if (error)
	635	goto errout;
	636	len = ntohl(len) & ~0x80000000;
	637	/*
	638	* This is SERIOUS! We are out of sync with the sender
	639	* and forcing a disconnect/reconnect is all I can do.
	640	*/
	641	if (len > NFS_MAXPACKET) {
	642	log(LOG_ERR, "%s (%d) from nfs server %s\n",
	643	"impossible packet length",
	644	len,
	645	nmp->nm_mountp->mnt_stat.f_mntfromname);
	646	error = EFBIG;
	647	goto errout;
	648	}
	649
	650	/*
	651	* Get the rest of the packet as an mbuf chain
	652	*/
	653	sbinit(&sio, len);
	654	do {
	655	rcvflg = MSG_WAITALL;
	656	error = so_pru_soreceive(so, NULL, NULL, &sio,
	657	NULL, &rcvflg);
	658	} while (error == EWOULDBLOCK \|\| error == EINTR \|\|
	659	error == ERESTART);
	660	if (error == 0 && sio.sb_cc != len) {
	661	if (sio.sb_cc != 0)
	662	log(LOG_INFO,
	663	"short receive (%zu/%d) from nfs server %s\n",
	664	(size_t)len - auio.uio_resid, len,
	665	nmp->nm_mountp->mnt_stat.f_mntfromname);
	666	error = EPIPE;
	667	}
	668	*mp = sio.sb_mb;
	669	} else {
	670	/*
	671	* Non-stream, so get the whole packet by not
	672	* specifying MSG_WAITALL and by specifying a large
	673	* length.
	674	*
	675	* We have no use for control msg., but must grab them
	676	* and then throw them away so we know what is going
	677	* on.
	678	*/
	679	sbinit(&sio, 100000000);
	680	do {
	681	rcvflg = 0;
	682	error = so_pru_soreceive(so, NULL, NULL, &sio,
	683	&control, &rcvflg);
	684	if (control)
	685	m_freem(control);
	686	if (error == EWOULDBLOCK && rep) {
	687	if (rep->r_flags & R_SOFTTERM) {
	688	m_freem(sio.sb_mb);
	689	return (EINTR);
	690	}
	691	}
	692	} while (error == EWOULDBLOCK \|\|
	693	(error == 0 && sio.sb_mb == NULL && control));
	694	if ((rcvflg & MSG_EOR) == 0)
	695	kprintf("Egad!!\n");
	696	if (error == 0 && sio.sb_mb == NULL)
	697	error = EPIPE;
	698	len = sio.sb_cc;
	699	*mp = sio.sb_mb;
	700	}
	701	errout:
	702	if (error && error != EINTR && error != ERESTART) {
	703	m_freem(*mp);
	704	*mp = NULL;
	705	if (error != EPIPE) {
	706	log(LOG_INFO,
	707	"receive error %d from nfs server %s\n",
	708	error,
	709	nmp->nm_mountp->mnt_stat.f_mntfromname);
	710	}
	711	error = nfs_sndlock(nmp, rep);
	712	if (!error) {
	713	error = nfs_reconnect(nmp, rep);
	714	if (!error)
	715	goto tryagain;
	716	else
	717	nfs_sndunlock(nmp);
	718	}
	719	}
	720	} else {
	721	if ((so = nmp->nm_so) == NULL)
	722	return (EACCES);
	723	if (so->so_state & SS_ISCONNECTED)
	724	getnam = NULL;
	725	else
	726	getnam = aname;
	727	sbinit(&sio, 100000000);
	728	do {
	729	rcvflg = 0;
	730	error = so_pru_soreceive(so, getnam, NULL, &sio,
	731	NULL, &rcvflg);
	732	if (error == EWOULDBLOCK && rep &&
	733	(rep->r_flags & R_SOFTTERM)) {
	734	m_freem(sio.sb_mb);
	735	return (EINTR);
	736	}
	737	} while (error == EWOULDBLOCK);
	738
	739	len = sio.sb_cc;
	740	*mp = sio.sb_mb;
	741
	742	/*
	743	* A shutdown may result in no error and no mbuf.
	744	* Convert to EPIPE.
	745	*/
	746	if (*mp == NULL && error == 0)
	747	error = EPIPE;
	748	}
	749	if (error) {
	750	m_freem(*mp);
	751	*mp = NULL;
	752	}
	753
	754	/*
	755	* Search for any mbufs that are not a multiple of 4 bytes long
	756	* or with m_data not longword aligned.
	757	* These could cause pointer alignment problems, so copy them to
	758	* well aligned mbufs.
	759	*/
	760	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	761	return (error);
	762	}
	763
	764	/*
	765	* Implement receipt of reply on a socket.
	766	*
	767	* We must search through the list of received datagrams matching them
	768	* with outstanding requests using the xid, until ours is found.
	769	*
	770	* If myrep is NULL we process packets on the socket until
	771	* interrupted or until nm_reqrxq is non-empty.
	772	*/
	773	/* ARGSUSED */
	774	int
	775	nfs_reply(struct nfsmount nmp, struct nfsreq myrep)
	776	{
	777	struct nfsreq *rep;
	778	struct sockaddr *nam;
	779	u_int32_t rxid;
	780	u_int32_t *tl;
	781	int error;
	782	struct nfsm_info info;
	783
	784	/*
	785	* Loop around until we get our own reply
	786	*/
	787	for (;;) {
	788	/*
	789	* Lock against other receivers so that I don't get stuck in
	790	* sbwait() after someone else has received my reply for me.
	791	* Also necessary for connection based protocols to avoid
	792	* race conditions during a reconnect.
	793	*
	794	* If nfs_rcvlock() returns EALREADY, that means that
	795	* the reply has already been recieved by another
	796	* process and we can return immediately. In this
	797	* case, the lock is not taken to avoid races with
	798	* other processes.
	799	*/
	800	info.mrep = NULL;
	801
	802	error = nfs_rcvlock(nmp, myrep);
	803	if (error == EALREADY)
	804	return (0);
	805	if (error)
	806	return (error);
	807
	808	/*
	809	* If myrep is NULL we are the receiver helper thread.
	810	* Stop waiting for incoming replies if there are
	811	* messages sitting on reqrxq that we need to process,
	812	* or if a shutdown request is pending.
	813	*/
	814	if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) \|\|
	815	nmp->nm_rxstate > NFSSVC_PENDING)) {
	816	nfs_rcvunlock(nmp);
	817	return(EWOULDBLOCK);
	818	}
	819
	820	/*
	821	* Get the next Rpc reply off the socket
	822	*
	823	* We cannot release the receive lock until we've
	824	* filled in rep->r_mrep, otherwise a waiting
	825	* thread may deadlock in soreceive with no incoming
	826	* packets expected.
	827	*/
	828	error = nfs_receive(nmp, myrep, &nam, &info.mrep);
	829	if (error) {
	830	/*
	831	* Ignore routing errors on connectionless protocols??
	832	*/
	833	nfs_rcvunlock(nmp);
	834	if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
	835	if (nmp->nm_so == NULL)
	836	return (error);
	837	nmp->nm_so->so_error = 0;
	838	continue;
	839	}
	840	return (error);
	841	}
	842	if (nam)
	843	FREE(nam, M_SONAME);
	844
	845	/*
	846	* Get the xid and check that it is an rpc reply
	847	*/
	848	info.md = info.mrep;
	849	info.dpos = mtod(info.md, caddr_t);
	850	NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED));
	851	rxid = *tl++;
	852	if (*tl != rpc_reply) {
	853	nfsstats.rpcinvalid++;
	854	m_freem(info.mrep);
	855	info.mrep = NULL;
	856	nfsmout:
	857	nfs_rcvunlock(nmp);
	858	continue;
	859	}
	860
	861	/*
	862	* Loop through the request list to match up the reply
	863	* Iff no match, just drop the datagram. On match, set
	864	* r_mrep atomically to prevent the timer from messing
	865	* around with the request after we have exited the critical
	866	* section.
	867	*/
	868	crit_enter();
	869	TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) {
	870	if (rep->r_mrep == NULL && rxid == rep->r_xid)
	871	break;
	872	}
	873
	874	/*
	875	* Fill in the rest of the reply if we found a match.
	876	*
	877	* Deal with duplicate responses if there was no match.
	878	*/
	879	if (rep) {
	880	rep->r_md = info.md;
	881	rep->r_dpos = info.dpos;
	882	if (nfsrtton) {
	883	struct rttl *rt;
	884
	885	rt = &nfsrtt.rttl[nfsrtt.pos];
	886	rt->proc = rep->r_procnum;
	887	rt->rto = 0;
	888	rt->sent = 0;
	889	rt->cwnd = nmp->nm_maxasync_scaled;
	890	rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
	891	rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
	892	rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
	893	getmicrotime(&rt->tstamp);
	894	if (rep->r_flags & R_TIMING)
	895	rt->rtt = rep->r_rtt;
	896	else
	897	rt->rtt = 1000000;
	898	nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
	899	}
	900
	901	/*
	902	* New congestion control is based only on async
	903	* requests.
	904	*/
	905	if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED)
	906	++nmp->nm_maxasync_scaled;
	907	if (rep->r_flags & R_SENT) {
	908	rep->r_flags &= ~R_SENT;
	909	}
	910	/*
	911	* Update rtt using a gain of 0.125 on the mean
	912	* and a gain of 0.25 on the deviation.
	913	*
	914	* NOTE SRTT/SDRTT are only good if R_TIMING is set.
	915	*/
	916	if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) {
	917	/*
	918	* Since the timer resolution of
	919	* NFS_HZ is so course, it can often
	920	* result in r_rtt == 0. Since
	921	* r_rtt == N means that the actual
	922	* rtt is between N+dt and N+2-dt ticks,
	923	* add 1.
	924	*/
	925	int n;
	926	int d;
	927
	928	#define NFSRSB NFS_RTT_SCALE_BITS
	929	n = ((NFS_SRTT(rep) * 7) +
	930	(rep->r_rtt << NFSRSB)) >> 3;
	931	d = n - NFS_SRTT(rep);
	932	NFS_SRTT(rep) = n;
	933
	934	/*
	935	* Don't let the jitter calculation decay
	936	* too quickly, but we want a fast rampup.
	937	*/
	938	if (d < 0)
	939	d = -d;
	940	d <<= NFSRSB;
	941	if (d < NFS_SDRTT(rep))
	942	n = ((NFS_SDRTT(rep) * 15) + d) >> 4;
	943	else
	944	n = ((NFS_SDRTT(rep) * 3) + d) >> 2;
	945	NFS_SDRTT(rep) = n;
	946	#undef NFSRSB
	947	}
	948	nmp->nm_timeouts = 0;
	949	rep->r_mrep = info.mrep;
	950	nfs_hardterm(rep, 0);
	951	} else {
	952	/*
	953	* Extract vers, prog, nfsver, procnum. A duplicate
	954	* response means we didn't wait long enough so
	955	* we increase the SRTT to avoid future spurious
	956	* timeouts.
	957	*/
	958	u_int procnum = nmp->nm_lastreprocnum;
	959	int n;
	960
	961	if (procnum < NFS_NPROCS && proct[procnum]) {
	962	if (nfs_showrexmit)
	963	kprintf("D");
	964	n = nmp->nm_srtt[proct[procnum]];
	965	n += NFS_ASYSCALE * NFS_HZ;
	966	if (n < NFS_ASYSCALE * NFS_HZ * 10)
	967	n = NFS_ASYSCALE * NFS_HZ * 10;
	968	nmp->nm_srtt[proct[procnum]] = n;
	969	}
	970	}
	971	nfs_rcvunlock(nmp);
	972	crit_exit();
	973
	974	/*
	975	* If not matched to a request, drop it.
	976	* If it's mine, get out.
	977	*/
	978	if (rep == NULL) {
	979	nfsstats.rpcunexpected++;
	980	m_freem(info.mrep);
	981	info.mrep = NULL;
	982	} else if (rep == myrep) {
	983	if (rep->r_mrep == NULL)
	984	panic("nfsreply nil");
	985	return (0);
	986	}
	987	}
	988	}
	989
	990	/*
	991	* Run the request state machine until the target state is reached
	992	* or a fatal error occurs. The target state is not run. Specifying
	993	* a target of NFSM_STATE_DONE runs the state machine until the rpc
	994	* is complete.
	995	*
	996	* EINPROGRESS is returned for all states other then the DONE state,
	997	* indicating that the rpc is still in progress.
	998	*/
	999	int
	1000	nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate)
	1001	{
	1002	struct nfsreq *req;
	1003
	1004	while (info->state >= bstate && info->state < estate) {
	1005	switch(info->state) {
	1006	case NFSM_STATE_SETUP:
	1007	/*
	1008	* Setup the nfsreq. Any error which occurs during
	1009	* this state is fatal.
	1010	*/
	1011	info->error = nfs_request_setup(info);
	1012	if (info->error) {
	1013	info->state = NFSM_STATE_DONE;
	1014	return (info->error);
	1015	} else {
	1016	req = info->req;
	1017	req->r_mrp = &info->mrep;
	1018	req->r_mdp = &info->md;
	1019	req->r_dposp = &info->dpos;
	1020	info->state = NFSM_STATE_AUTH;
	1021	}
	1022	break;
	1023	case NFSM_STATE_AUTH:
	1024	/*
	1025	* Authenticate the nfsreq. Any error which occurs
	1026	* during this state is fatal.
	1027	*/
	1028	info->error = nfs_request_auth(info->req);
	1029	if (info->error) {
	1030	info->state = NFSM_STATE_DONE;
	1031	return (info->error);
	1032	} else {
	1033	info->state = NFSM_STATE_TRY;
	1034	}
	1035	break;
	1036	case NFSM_STATE_TRY:
	1037	/*
	1038	* Transmit or retransmit attempt. An error in this
	1039	* state is ignored and we always move on to the
	1040	* next state.
	1041	*
	1042	* This can trivially race the receiver if the
	1043	* request is asynchronous. nfs_request_try()
	1044	* will thus set the state for us and we
	1045	* must also return immediately if we are
	1046	* running an async state machine, because
	1047	* info can become invalid due to races after
	1048	* try() returns.
	1049	*/
	1050	if (info->req->r_flags & R_ASYNC) {
	1051	nfs_request_try(info->req);
	1052	if (estate == NFSM_STATE_WAITREPLY)
	1053	return (EINPROGRESS);
	1054	} else {
	1055	nfs_request_try(info->req);
	1056	info->state = NFSM_STATE_WAITREPLY;
	1057	}
	1058	break;
	1059	case NFSM_STATE_WAITREPLY:
	1060	/*
	1061	* Wait for a reply or timeout and move on to the
	1062	* next state. The error returned by this state
	1063	* is passed to the processing code in the next
	1064	* state.
	1065	*/
	1066	info->error = nfs_request_waitreply(info->req);
	1067	info->state = NFSM_STATE_PROCESSREPLY;
	1068	break;
	1069	case NFSM_STATE_PROCESSREPLY:
	1070	/*
	1071	* Process the reply or timeout. Errors which occur
	1072	* in this state may cause the state machine to
	1073	* go back to an earlier state, and are fatal
	1074	* otherwise.
	1075	*/
	1076	info->error = nfs_request_processreply(info,
	1077	info->error);
	1078	switch(info->error) {
	1079	case ENEEDAUTH:
	1080	info->state = NFSM_STATE_AUTH;
	1081	break;
	1082	case EAGAIN:
	1083	info->state = NFSM_STATE_TRY;
	1084	break;
	1085	default:
	1086	/*
	1087	* Operation complete, with or without an
	1088	* error. We are done.
	1089	*/
	1090	info->req = NULL;
	1091	info->state = NFSM_STATE_DONE;
	1092	return (info->error);
	1093	}
	1094	break;
	1095	case NFSM_STATE_DONE:
	1096	/*
	1097	* Shouldn't be reached
	1098	*/
	1099	return (info->error);
	1100	/* NOT REACHED */
	1101	}
	1102	}
	1103
	1104	/*
	1105	* If we are done return the error code (if any).
	1106	* Otherwise return EINPROGRESS.
	1107	*/
	1108	if (info->state == NFSM_STATE_DONE)
	1109	return (info->error);
	1110	return (EINPROGRESS);
	1111	}
	1112
	1113	/*
	1114	* nfs_request - goes something like this
	1115	* - fill in request struct
	1116	* - links it into list
	1117	* - calls nfs_send() for first transmit
	1118	* - calls nfs_receive() to get reply
	1119	* - break down rpc header and return with nfs reply pointed to
	1120	* by mrep or error
	1121	* nb: always frees up mreq mbuf list
	1122	*/
	1123	static int
	1124	nfs_request_setup(nfsm_info_t info)
	1125	{
	1126	struct nfsreq *req;
	1127	struct nfsmount *nmp;
	1128	struct mbuf *m;
	1129	int i;
	1130
	1131	/*
	1132	* Reject requests while attempting a forced unmount.
	1133	*/
	1134	if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
	1135	m_freem(info->mreq);
	1136	info->mreq = NULL;
	1137	return (ESTALE);
	1138	}
	1139	nmp = VFSTONFS(info->vp->v_mount);
	1140	req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
	1141	req->r_nmp = nmp;
	1142	req->r_vp = info->vp;
	1143	req->r_td = info->td;
	1144	req->r_procnum = info->procnum;
	1145	req->r_mreq = NULL;
	1146	req->r_cred = info->cred;
	1147
	1148	i = 0;
	1149	m = info->mreq;
	1150	while (m) {
	1151	i += m->m_len;
	1152	m = m->m_next;
	1153	}
	1154	req->r_mrest = info->mreq;
	1155	req->r_mrest_len = i;
	1156
	1157	/*
	1158	* The presence of a non-NULL r_info in req indicates
	1159	* async completion via our helper threads. See the receiver
	1160	* code.
	1161	*/
	1162	if (info->bio) {
	1163	req->r_info = info;
	1164	req->r_flags = R_ASYNC;
	1165	} else {
	1166	req->r_info = NULL;
	1167	req->r_flags = 0;
	1168	}
	1169	info->req = req;
	1170	return(0);
	1171	}
	1172
	1173	static int
	1174	nfs_request_auth(struct nfsreq *rep)
	1175	{
	1176	struct nfsmount *nmp = rep->r_nmp;
	1177	struct mbuf *m;
	1178	char nickv[RPCX_NICKVERF];
	1179	int error = 0, auth_len, auth_type;
	1180	int verf_len;
	1181	u_int32_t xid;
	1182	char auth_str, verf_str;
	1183	struct ucred *cred;
	1184
	1185	cred = rep->r_cred;
	1186	rep->r_failed_auth = 0;
	1187
	1188	/*
	1189	* Get the RPC header with authorization.
	1190	*/
	1191	verf_str = auth_str = NULL;
	1192	if (nmp->nm_flag & NFSMNT_KERB) {
	1193	verf_str = nickv;
	1194	verf_len = sizeof (nickv);
	1195	auth_type = RPCAUTH_KERB4;
	1196	bzero((caddr_t)rep->r_key, sizeof(rep->r_key));
	1197	if (rep->r_failed_auth \|\|
	1198	nfs_getnickauth(nmp, cred, &auth_str, &auth_len,
	1199	verf_str, verf_len)) {
	1200	error = nfs_getauth(nmp, rep, cred, &auth_str,
	1201	&auth_len, verf_str, &verf_len, rep->r_key);
	1202	if (error) {
	1203	m_freem(rep->r_mrest);
	1204	rep->r_mrest = NULL;
	1205	kfree((caddr_t)rep, M_NFSREQ);
	1206	return (error);
	1207	}
	1208	}
	1209	} else {
	1210	auth_type = RPCAUTH_UNIX;
	1211	if (cred->cr_ngroups < 1)
	1212	panic("nfsreq nogrps");
	1213	auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
	1214	nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
	1215	5 * NFSX_UNSIGNED;
	1216	}
	1217	if (rep->r_mrest)
	1218	nfs_checkpkt(rep->r_mrest, rep->r_mrest_len);
	1219	m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type,
	1220	auth_len, auth_str, verf_len, verf_str,
	1221	rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid);
	1222	rep->r_mrest = NULL;
	1223	if (auth_str)
	1224	kfree(auth_str, M_TEMP);
	1225
	1226	/*
	1227	* For stream protocols, insert a Sun RPC Record Mark.
	1228	*/
	1229	if (nmp->nm_sotype == SOCK_STREAM) {
	1230	M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
	1231	if (m == NULL) {
	1232	kfree(rep, M_NFSREQ);
	1233	return (ENOBUFS);
	1234	}
	1235	mtod(m, u_int32_t ) = htonl(0x80000000 \|
	1236	(m->m_pkthdr.len - NFSX_UNSIGNED));
	1237	}
	1238
	1239	nfs_checkpkt(m, m->m_pkthdr.len);
	1240
	1241	rep->r_mreq = m;
	1242	rep->r_xid = xid;
	1243	return (0);
	1244	}
	1245
	1246	static int
	1247	nfs_request_try(struct nfsreq *rep)
	1248	{
	1249	struct nfsmount *nmp = rep->r_nmp;
	1250	struct mbuf *m2;
	1251	int error;
	1252
	1253	/*
	1254	* Request is not on any queue, only the owner has access to it
	1255	* so it should not be locked by anyone atm.
	1256	*
	1257	* Interlock to prevent races. While locked the only remote
	1258	* action possible is for r_mrep to be set (once we enqueue it).
	1259	*/
	1260	if (rep->r_flags == 0xdeadc0de) {
	1261	print_backtrace(-1);
	1262	panic("flags nbad\n");
	1263	}
	1264	KKASSERT((rep->r_flags & (R_LOCKED \| R_ONREQQ)) == 0);
	1265	if (nmp->nm_flag & NFSMNT_SOFT)
	1266	rep->r_retry = nmp->nm_retry;
	1267	else
	1268	rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
	1269	rep->r_rtt = rep->r_rexmit = 0;
	1270	if (proct[rep->r_procnum] > 0)
	1271	rep->r_flags \|= R_TIMING \| R_LOCKED;
	1272	else
	1273	rep->r_flags \|= R_LOCKED;
	1274	rep->r_mrep = NULL;
	1275
	1276	nfsstats.rpcrequests++;
	1277
	1278	if (nmp->nm_flag & NFSMNT_FORCE) {
	1279	rep->r_flags \|= R_SOFTTERM;
	1280	rep->r_flags &= ~R_LOCKED;
	1281	return (0);
	1282	}
	1283	rep->r_flags \|= R_NEEDSXMIT; /* in case send lock races us */
	1284
	1285	/*
	1286	* Do the client side RPC.
	1287	*
	1288	* Chain request into list of outstanding requests. Be sure
	1289	* to put it LAST so timer finds oldest requests first. Note
	1290	* that our control of R_LOCKED prevents the request from
	1291	* getting ripped out from under us or transmitted by the
	1292	* timer code.
	1293	*
	1294	* For requests with info structures we must atomically set the
	1295	* info's state because the structure could become invalid upon
	1296	* return due to races (i.e., if async)
	1297	*/
	1298	crit_enter();
	1299	mtx_link_init(&rep->r_link);
	1300	KKASSERT((rep->r_flags & R_ONREQQ) == 0);
	1301	TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain);
	1302	rep->r_flags \|= R_ONREQQ;
	1303	++nmp->nm_reqqlen;
	1304	if (rep->r_flags & R_ASYNC)
	1305	rep->r_info->state = NFSM_STATE_WAITREPLY;
	1306	crit_exit();
	1307
	1308	error = 0;
	1309
	1310	/*
	1311	* Send if we can. Congestion control is not handled here any more
	1312	* becausing trying to defer the initial send based on the nfs_timer
	1313	* requires having a very fast nfs_timer, which is silly.
	1314	*/
	1315	if (nmp->nm_so) {
	1316	if (nmp->nm_soflags & PR_CONNREQUIRED)
	1317	error = nfs_sndlock(nmp, rep);
	1318	if (error == 0 && (rep->r_flags & R_NEEDSXMIT)) {
	1319	m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
	1320	error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
	1321	rep->r_flags &= ~R_NEEDSXMIT;
	1322	if ((rep->r_flags & R_SENT) == 0) {
	1323	rep->r_flags \|= R_SENT;
	1324	}
	1325	if (nmp->nm_soflags & PR_CONNREQUIRED)
	1326	nfs_sndunlock(nmp);
	1327	}
	1328	} else {
	1329	rep->r_rtt = -1;
	1330	}
	1331	if (error == EPIPE)
	1332	error = 0;
	1333
	1334	/*
	1335	* Release the lock. The only remote action that may have occurred
	1336	* would have been the setting of rep->r_mrep. If this occured
	1337	* and the request was async we have to move it to the reader
	1338	* thread's queue for action.
	1339	*
	1340	* For async requests also make sure the reader is woken up so
	1341	* it gets on the socket to read responses.
	1342	*/
	1343	crit_enter();
	1344	if (rep->r_flags & R_ASYNC) {
	1345	if (rep->r_mrep)
	1346	nfs_hardterm(rep, 1);
	1347	rep->r_flags &= ~R_LOCKED;
	1348	nfssvc_iod_reader_wakeup(nmp);
	1349	} else {
	1350	rep->r_flags &= ~R_LOCKED;
	1351	}
	1352	if (rep->r_flags & R_WANTED) {
	1353	rep->r_flags &= ~R_WANTED;
	1354	wakeup(rep);
	1355	}
	1356	crit_exit();
	1357	return (error);
	1358	}
	1359
	1360	/*
	1361	* This code is only called for synchronous requests. Completed synchronous
	1362	* requests are left on reqq and we remove them before moving on to the
	1363	* processing state.
	1364	*/
	1365	static int
	1366	nfs_request_waitreply(struct nfsreq *rep)
	1367	{
	1368	struct nfsmount *nmp = rep->r_nmp;
	1369	int error;
	1370
	1371	KKASSERT((rep->r_flags & R_ASYNC) == 0);
	1372
	1373	/*
	1374	* Wait until the request is finished.
	1375	*/
	1376	error = nfs_reply(nmp, rep);
	1377
	1378	/*
	1379	* RPC done, unlink the request, but don't rip it out from under
	1380	* the callout timer.
	1381	*
	1382	* Once unlinked no other receiver or the timer will have
	1383	* visibility, so we do not have to set R_LOCKED.
	1384	*/
	1385	crit_enter();
	1386	while (rep->r_flags & R_LOCKED) {
	1387	rep->r_flags \|= R_WANTED;
	1388	tsleep(rep, 0, "nfstrac", 0);
	1389	}
	1390	KKASSERT(rep->r_flags & R_ONREQQ);
	1391	TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
	1392	rep->r_flags &= ~R_ONREQQ;
	1393	--nmp->nm_reqqlen;
	1394	if (TAILQ_FIRST(&nmp->nm_bioq) &&
	1395	nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
	1396	nfssvc_iod_writer_wakeup(nmp);
	1397	}
	1398	crit_exit();
	1399
	1400	/*
	1401	* Decrement the outstanding request count.
	1402	*/
	1403	if (rep->r_flags & R_SENT) {
	1404	rep->r_flags &= ~R_SENT;
	1405	}
	1406	return (error);
	1407	}
	1408
	1409	/*
	1410	* Process reply with error returned from nfs_requet_waitreply().
	1411	*
	1412	* Returns EAGAIN if it wants us to loop up to nfs_request_try() again.
	1413	* Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again.
	1414	*/
	1415	static int
	1416	nfs_request_processreply(nfsm_info_t info, int error)
	1417	{
	1418	struct nfsreq *req = info->req;
	1419	struct nfsmount *nmp = req->r_nmp;
	1420	u_int32_t *tl;
	1421	int verf_type;
	1422	int i;
	1423
	1424	/*
	1425	* If there was a successful reply and a tprintf msg.
	1426	* tprintf a response.
	1427	*/
	1428	if (error == 0 && (req->r_flags & R_TPRINTFMSG)) {
	1429	nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
	1430	"is alive again");
	1431	}
	1432	info->mrep = req->r_mrep;
	1433	info->md = req->r_md;
	1434	info->dpos = req->r_dpos;
	1435	if (error) {
	1436	m_freem(req->r_mreq);
	1437	req->r_mreq = NULL;
	1438	kfree(req, M_NFSREQ);
	1439	info->req = NULL;
	1440	return (error);
	1441	}
	1442
	1443	/*
	1444	* break down the rpc header and check if ok
	1445	*/
	1446	NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED));
	1447	if (*tl++ == rpc_msgdenied) {
	1448	if (*tl == rpc_mismatch) {
	1449	error = EOPNOTSUPP;
	1450	} else if ((nmp->nm_flag & NFSMNT_KERB) &&
	1451	*tl++ == rpc_autherr) {
	1452	if (req->r_failed_auth == 0) {
	1453	req->r_failed_auth++;
	1454	req->r_mheadend->m_next = NULL;
	1455	m_freem(info->mrep);
	1456	info->mrep = NULL;
	1457	m_freem(req->r_mreq);
	1458	req->r_mreq = NULL;
	1459	return (ENEEDAUTH);
	1460	} else {
	1461	error = EAUTH;
	1462	}
	1463	} else {
	1464	error = EACCES;
	1465	}
	1466	m_freem(info->mrep);
	1467	info->mrep = NULL;
	1468	m_freem(req->r_mreq);
	1469	req->r_mreq = NULL;
	1470	kfree(req, M_NFSREQ);
	1471	info->req = NULL;
	1472	return (error);
	1473	}
	1474
	1475	/*
	1476	* Grab any Kerberos verifier, otherwise just throw it away.
	1477	*/
	1478	verf_type = fxdr_unsigned(int, *tl++);
	1479	i = fxdr_unsigned(int32_t, *tl);
	1480	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
	1481	error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key,
	1482	&info->md, &info->dpos, info->mrep);
	1483	if (error)
	1484	goto nfsmout;
	1485	} else if (i > 0) {
	1486	ERROROUT(nfsm_adv(info, nfsm_rndup(i)));
	1487	}
	1488	NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
	1489	/* 0 == ok */
	1490	if (*tl == 0) {
	1491	NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
	1492	if (*tl != 0) {
	1493	error = fxdr_unsigned(int, *tl);
	1494
	1495	/*
	1496	* Does anyone even implement this? Just impose
	1497	* a 1-second delay.
	1498	*/
	1499	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	1500	error == NFSERR_TRYLATER) {
	1501	m_freem(info->mrep);
	1502	info->mrep = NULL;
	1503	error = 0;
	1504
	1505	tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0);
	1506	return (EAGAIN); /* goto tryagain */
	1507	}
	1508
	1509	/*
	1510	* If the File Handle was stale, invalidate the
	1511	* lookup cache, just in case.
	1512	*
	1513	* To avoid namecache<->vnode deadlocks we must
	1514	* release the vnode lock if we hold it.
	1515	*/
	1516	if (error == ESTALE) {
	1517	struct vnode *vp = req->r_vp;
	1518	int ltype;
	1519
	1520	ltype = lockstatus(&vp->v_lock, curthread);
	1521	if (ltype == LK_EXCLUSIVE \|\| ltype == LK_SHARED)
	1522	lockmgr(&vp->v_lock, LK_RELEASE);
	1523	cache_inval_vp(vp, CINV_CHILDREN);
	1524	if (ltype == LK_EXCLUSIVE \|\| ltype == LK_SHARED)
	1525	lockmgr(&vp->v_lock, ltype);
	1526	}
	1527	if (nmp->nm_flag & NFSMNT_NFSV3) {
	1528	KKASSERT(*req->r_mrp == info->mrep);
	1529	KKASSERT(*req->r_mdp == info->md);
	1530	KKASSERT(*req->r_dposp == info->dpos);
	1531	error \|= NFSERR_RETERR;
	1532	} else {
	1533	m_freem(info->mrep);
	1534	info->mrep = NULL;
	1535	}
	1536	m_freem(req->r_mreq);
	1537	req->r_mreq = NULL;
	1538	kfree(req, M_NFSREQ);
	1539	info->req = NULL;
	1540	return (error);
	1541	}
	1542
	1543	KKASSERT(*req->r_mrp == info->mrep);
	1544	KKASSERT(*req->r_mdp == info->md);
	1545	KKASSERT(*req->r_dposp == info->dpos);
	1546	m_freem(req->r_mreq);
	1547	req->r_mreq = NULL;
	1548	FREE(req, M_NFSREQ);
	1549	return (0);
	1550	}
	1551	m_freem(info->mrep);
	1552	info->mrep = NULL;
	1553	error = EPROTONOSUPPORT;
	1554	nfsmout:
	1555	m_freem(req->r_mreq);
	1556	req->r_mreq = NULL;
	1557	kfree(req, M_NFSREQ);
	1558	info->req = NULL;
	1559	return (error);
	1560	}
	1561
	1562	#ifndef NFS_NOSERVER
	1563	/*
	1564	* Generate the rpc reply header
	1565	* siz arg. is used to decide if adding a cluster is worthwhile
	1566	*/
	1567	int
	1568	nfs_rephead(int siz, struct nfsrv_descript nd, struct nfssvc_sock slp,
	1569	int err, struct mbuf mrq, struct mbuf mbp, caddr_t *bposp)
	1570	{
	1571	u_int32_t *tl;
	1572	struct nfsm_info info;
	1573
	1574	siz += RPC_REPLYSIZ;
	1575	info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
	1576	info.mreq = info.mb;
	1577	info.mreq->m_pkthdr.len = 0;
	1578	/*
	1579	* If this is not a cluster, try and leave leading space
	1580	* for the lower level headers.
	1581	*/
	1582	if ((max_hdr + siz) < MINCLSIZE)
	1583	info.mreq->m_data += max_hdr;
	1584	tl = mtod(info.mreq, u_int32_t *);
	1585	info.mreq->m_len = 6 * NFSX_UNSIGNED;
	1586	info.bpos = ((caddr_t)tl) + info.mreq->m_len;
	1587	*tl++ = txdr_unsigned(nd->nd_retxid);
	1588	*tl++ = rpc_reply;
	1589	if (err == ERPCMISMATCH \|\| (err & NFSERR_AUTHERR)) {
	1590	*tl++ = rpc_msgdenied;
	1591	if (err & NFSERR_AUTHERR) {
	1592	*tl++ = rpc_autherr;
	1593	*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
	1594	info.mreq->m_len -= NFSX_UNSIGNED;
	1595	info.bpos -= NFSX_UNSIGNED;
	1596	} else {
	1597	*tl++ = rpc_mismatch;
	1598	*tl++ = txdr_unsigned(RPC_VER2);
	1599	*tl = txdr_unsigned(RPC_VER2);
	1600	}
	1601	} else {
	1602	*tl++ = rpc_msgaccepted;
	1603
	1604	/*
	1605	* For Kerberos authentication, we must send the nickname
	1606	* verifier back, otherwise just RPCAUTH_NULL.
	1607	*/
	1608	if (nd->nd_flag & ND_KERBFULL) {
	1609	struct nfsuid *nuidp;
	1610	struct timeval ktvin, ktvout;
	1611
	1612	for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
	1613	nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
	1614	if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
	1615	(!nd->nd_nam2 \|\| netaddr_match(NU_NETFAM(nuidp),
	1616	&nuidp->nu_haddr, nd->nd_nam2)))
	1617	break;
	1618	}
	1619	if (nuidp) {
	1620	ktvin.tv_sec =
	1621	txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
	1622	ktvin.tv_usec =
	1623	txdr_unsigned(nuidp->nu_timestamp.tv_usec);
	1624
	1625	/*
	1626	* Encrypt the timestamp in ecb mode using the
	1627	* session key.
	1628	*/
	1629	#ifdef NFSKERB
	1630	XXX
	1631	#else
	1632	ktvout.tv_sec = 0;
	1633	ktvout.tv_usec = 0;
	1634	#endif
	1635
	1636	*tl++ = rpc_auth_kerb;
	1637	tl++ = txdr_unsigned(3 NFSX_UNSIGNED);
	1638	*tl = ktvout.tv_sec;
	1639	tl = nfsm_build(&info, 3 * NFSX_UNSIGNED);
	1640	*tl++ = ktvout.tv_usec;
	1641	*tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
	1642	} else {
	1643	*tl++ = 0;
	1644	*tl++ = 0;
	1645	}
	1646	} else {
	1647	*tl++ = 0;
	1648	*tl++ = 0;
	1649	}
	1650	switch (err) {
	1651	case EPROGUNAVAIL:
	1652	*tl = txdr_unsigned(RPC_PROGUNAVAIL);
	1653	break;
	1654	case EPROGMISMATCH:
	1655	*tl = txdr_unsigned(RPC_PROGMISMATCH);
	1656	tl = nfsm_build(&info, 2 * NFSX_UNSIGNED);
	1657	*tl++ = txdr_unsigned(2);
	1658	*tl = txdr_unsigned(3);
	1659	break;
	1660	case EPROCUNAVAIL:
	1661	*tl = txdr_unsigned(RPC_PROCUNAVAIL);
	1662	break;
	1663	case EBADRPC:
	1664	*tl = txdr_unsigned(RPC_GARBAGE);
	1665	break;
	1666	default:
	1667	*tl = 0;
	1668	if (err != NFSERR_RETVOID) {
	1669	tl = nfsm_build(&info, NFSX_UNSIGNED);
	1670	if (err)
	1671	*tl = txdr_unsigned(nfsrv_errmap(nd, err));
	1672	else
	1673	*tl = 0;
	1674	}
	1675	break;
	1676	};
	1677	}
	1678
	1679	if (mrq != NULL)
	1680	*mrq = info.mreq;
	1681	*mbp = info.mb;
	1682	*bposp = info.bpos;
	1683	if (err != 0 && err != NFSERR_RETVOID)
	1684	nfsstats.srvrpc_errs++;
	1685	return (0);
	1686	}
	1687
	1688
	1689	#endif /* NFS_NOSERVER */
	1690
	1691	/*
	1692	* Nfs timer routine.
	1693	*
	1694	* Scan the nfsreq list and retranmit any requests that have timed out
	1695	* To avoid retransmission attempts on STREAM sockets (in the future) make
	1696	* sure to set the r_retry field to 0 (implies nm_retry == 0).
	1697	*
	1698	* Requests with attached responses, terminated requests, and
	1699	* locked requests are ignored. Locked requests will be picked up
	1700	* in a later timer call.
	1701	*/
	1702	void
	1703	nfs_timer_callout(void arg / never used */)
	1704	{
	1705	struct nfsmount *nmp;
	1706	struct nfsreq *req;
	1707	#ifndef NFS_NOSERVER
	1708	struct nfssvc_sock *slp;
	1709	u_quad_t cur_usec;
	1710	#endif /* NFS_NOSERVER */
	1711
	1712	lwkt_gettoken(&nfs_token);
	1713	TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {
	1714	lwkt_gettoken(&nmp->nm_token);
	1715	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
	1716	KKASSERT(nmp == req->r_nmp);
	1717	if (req->r_mrep)
	1718	continue;
	1719	if (req->r_flags & (R_SOFTTERM \| R_LOCKED))
	1720	continue;
	1721
	1722	/*
	1723	* Handle timeout/retry. Be sure to process r_mrep
	1724	* for async requests that completed while we had
	1725	* the request locked or they will hang in the reqq
	1726	* forever.
	1727	*/
	1728	req->r_flags \|= R_LOCKED;
	1729	if (nfs_sigintr(nmp, req, req->r_td)) {
	1730	nfs_softterm(req, 1);
	1731	req->r_flags &= ~R_LOCKED;
	1732	} else {
	1733	nfs_timer_req(req);
	1734	if (req->r_flags & R_ASYNC) {
	1735	if (req->r_mrep)
	1736	nfs_hardterm(req, 1);
	1737	req->r_flags &= ~R_LOCKED;
	1738	nfssvc_iod_reader_wakeup(nmp);
	1739	} else {
	1740	req->r_flags &= ~R_LOCKED;
	1741	}
	1742	}
	1743	if (req->r_flags & R_WANTED) {
	1744	req->r_flags &= ~R_WANTED;
	1745	wakeup(req);
	1746	}
	1747	}
	1748	lwkt_reltoken(&nmp->nm_token);
	1749	}
	1750	#ifndef NFS_NOSERVER
	1751
	1752	/*
	1753	* Scan the write gathering queues for writes that need to be
	1754	* completed now.
	1755	*/
	1756	cur_usec = nfs_curusec();
	1757
	1758	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
	1759	/* XXX race against removal */
	1760	if (lwkt_trytoken(&slp->ns_token)) {
	1761	if (slp->ns_tq.lh_first &&
	1762	(slp->ns_tq.lh_first->nd_time <= cur_usec)) {
	1763	nfsrv_wakenfsd(slp, 1);
	1764	}
	1765	lwkt_reltoken(&slp->ns_token);
	1766	}
	1767	}
	1768	#endif /* NFS_NOSERVER */
	1769
	1770	callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer_callout, NULL);
	1771	lwkt_reltoken(&nfs_token);
	1772	}
	1773
	1774	static
	1775	void
	1776	nfs_timer_req(struct nfsreq *req)
	1777	{
	1778	struct thread td = &thread0; / XXX for creds, will break if sleep */
	1779	struct nfsmount *nmp = req->r_nmp;
	1780	struct mbuf *m;
	1781	struct socket *so;
	1782	int timeo;
	1783	int error;
	1784
	1785	/*
	1786	* rtt ticks and timeout calculation. Return if the timeout
	1787	* has not been reached yet, unless the packet is flagged
	1788	* for an immediate send.
	1789	*
	1790	* The mean rtt doesn't help when we get random I/Os, we have
	1791	* to multiply by fairly large numbers.
	1792	*/
	1793	if (req->r_rtt >= 0) {
	1794	/*
	1795	* Calculate the timeout to test against.
	1796	*/
	1797	req->r_rtt++;
	1798	if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
	1799	timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
	1800	} else if (req->r_flags & R_TIMING) {
	1801	timeo = NFS_SRTT(req) + NFS_SDRTT(req);
	1802	} else {
	1803	timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
	1804	}
	1805	timeo *= multt[req->r_procnum];
	1806	/* timeo is still scaled by SCALE_BITS */
	1807
	1808	#define NFSFS (NFS_RTT_SCALE * NFS_HZ)
	1809	if (req->r_flags & R_TIMING) {
	1810	static long last_time;
	1811	if (nfs_showrtt && last_time != time_second) {
	1812	kprintf("rpccmd %d NFS SRTT %d SDRTT %d "
	1813	"timeo %d.%03d\n",
	1814	proct[req->r_procnum],
	1815	NFS_SRTT(req), NFS_SDRTT(req),
	1816	timeo / NFSFS,
	1817	timeo % NFSFS * 1000 / NFSFS);
	1818	last_time = time_second;
	1819	}
	1820	}
	1821	#undef NFSFS
	1822
	1823	/*
	1824	* deal with nfs_timer jitter.
	1825	*/
	1826	timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
	1827	if (timeo < 2)
	1828	timeo = 2;
	1829
	1830	if (nmp->nm_timeouts > 0)
	1831	timeo *= nfs_backoff[nmp->nm_timeouts - 1];
	1832	if (timeo > NFS_MAXTIMEO)
	1833	timeo = NFS_MAXTIMEO;
	1834	if (req->r_rtt <= timeo) {
	1835	if ((req->r_flags & R_NEEDSXMIT) == 0)
	1836	return;
	1837	} else if (nmp->nm_timeouts < 8) {
	1838	nmp->nm_timeouts++;
	1839	}
	1840	}
	1841
	1842	/*
	1843	* Check for server not responding
	1844	*/
	1845	if ((req->r_flags & R_TPRINTFMSG) == 0 &&
	1846	req->r_rexmit > nmp->nm_deadthresh) {
	1847	nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
	1848	"not responding");
	1849	req->r_flags \|= R_TPRINTFMSG;
	1850	}
	1851	if (req->r_rexmit >= req->r_retry) { /* too many */
	1852	nfsstats.rpctimeouts++;
	1853	nfs_softterm(req, 1);
	1854	return;
	1855	}
	1856
	1857	/*
	1858	* Generally disable retransmission on reliable sockets,
	1859	* unless the request is flagged for immediate send.
	1860	*/
	1861	if (nmp->nm_sotype != SOCK_DGRAM) {
	1862	if (++req->r_rexmit > NFS_MAXREXMIT)
	1863	req->r_rexmit = NFS_MAXREXMIT;
	1864	if ((req->r_flags & R_NEEDSXMIT) == 0)
	1865	return;
	1866	}
	1867
	1868	/*
	1869	* Stop here if we do not have a socket!
	1870	*/
	1871	if ((so = nmp->nm_so) == NULL)
	1872	return;
	1873
	1874	/*
	1875	* If there is enough space and the window allows.. resend it.
	1876	*
	1877	* r_rtt is left intact in case we get an answer after the
	1878	* retry that was a reply to the original packet.
	1879	*
	1880	* NOTE: so_pru_send()
	1881	*/
	1882	if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
	1883	(req->r_flags & (R_SENT \| R_NEEDSXMIT)) &&
	1884	(m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
	1885	if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
	1886	error = so_pru_send(so, 0, m, NULL, NULL, td);
	1887	else
	1888	error = so_pru_send(so, 0, m, nmp->nm_nam, NULL, td);
	1889	if (error) {
	1890	if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
	1891	so->so_error = 0;
	1892	req->r_flags \|= R_NEEDSXMIT;
	1893	} else if (req->r_mrep == NULL) {
	1894	/*
	1895	* Iff first send, start timing
	1896	* else turn timing off, backoff timer
	1897	* and divide congestion window by 2.
	1898	*
	1899	* It is possible for the so_pru_send() to
	1900	* block and for us to race a reply so we
	1901	* only do this if the reply field has not
	1902	* been filled in. R_LOCKED will prevent
	1903	* the request from being ripped out from under
	1904	* us entirely.
	1905	*
	1906	* Record the last resent procnum to aid us
	1907	* in duplicate detection on receive.
	1908	*/
	1909	if ((req->r_flags & R_NEEDSXMIT) == 0) {
	1910	if (nfs_showrexmit)
	1911	kprintf("X");
	1912	if (++req->r_rexmit > NFS_MAXREXMIT)
	1913	req->r_rexmit = NFS_MAXREXMIT;
	1914	nmp->nm_maxasync_scaled >>= 1;
	1915	if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
	1916	nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
	1917	nfsstats.rpcretries++;
	1918	nmp->nm_lastreprocnum = req->r_procnum;
	1919	} else {
	1920	req->r_flags \|= R_SENT;
	1921	req->r_flags &= ~R_NEEDSXMIT;
	1922	}
	1923	}
	1924	}
	1925	}
	1926
	1927	/*
	1928	* Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
	1929	* wait for all requests to complete. This is used by forced unmounts
	1930	* to terminate any outstanding RPCs.
	1931	*
	1932	* Locked requests cannot be canceled but will be marked for
	1933	* soft-termination.
	1934	*/
	1935	int
	1936	nfs_nmcancelreqs(struct nfsmount *nmp)
	1937	{
	1938	struct nfsreq *req;
	1939	int i;
	1940
	1941	crit_enter();
	1942	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
	1943	if (req->r_mrep != NULL \|\| (req->r_flags & R_SOFTTERM))
	1944	continue;
	1945	nfs_softterm(req, 0);
	1946	}
	1947	/* XXX the other two queues as well */
	1948	crit_exit();
	1949
	1950	for (i = 0; i < 30; i++) {
	1951	crit_enter();
	1952	TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
	1953	if (nmp == req->r_nmp)
	1954	break;
	1955	}
	1956	crit_exit();
	1957	if (req == NULL)
	1958	return (0);
	1959	tsleep(&lbolt, 0, "nfscancel", 0);
	1960	}
	1961	return (EBUSY);
	1962	}
	1963
	1964	/*
	1965	* Soft-terminate a request, effectively marking it as failed.
	1966	*
	1967	* Must be called from within a critical section.
	1968	*/
	1969	static void
	1970	nfs_softterm(struct nfsreq *rep, int islocked)
	1971	{
	1972	rep->r_flags \|= R_SOFTTERM;
	1973	nfs_hardterm(rep, islocked);
	1974	}
	1975
	1976	/*
	1977	* Hard-terminate a request, typically after getting a response.
	1978	*
	1979	* The state machine can still decide to re-issue it later if necessary.
	1980	*
	1981	* Must be called from within a critical section.
	1982	*/
	1983	static void
	1984	nfs_hardterm(struct nfsreq *rep, int islocked)
	1985	{
	1986	struct nfsmount *nmp = rep->r_nmp;
	1987
	1988	/*
	1989	* The nm_send count is decremented now to avoid deadlocks
	1990	* when the process in soreceive() hasn't yet managed to send
	1991	* its own request.
	1992	*/
	1993	if (rep->r_flags & R_SENT) {
	1994	rep->r_flags &= ~R_SENT;
	1995	}
	1996
	1997	/*
	1998	* If we locked the request or nobody else has locked the request,
	1999	* and the request is async, we can move it to the reader thread's
	2000	* queue now and fix up the state.
	2001	*
	2002	* If we locked the request or nobody else has locked the request,
	2003	* we can wake up anyone blocked waiting for a response on the
	2004	* request.
	2005	*/
	2006	if (islocked \|\| (rep->r_flags & R_LOCKED) == 0) {
	2007	if ((rep->r_flags & (R_ONREQQ \| R_ASYNC)) ==
	2008	(R_ONREQQ \| R_ASYNC)) {
	2009	rep->r_flags &= ~R_ONREQQ;
	2010	TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
	2011	--nmp->nm_reqqlen;
	2012	TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain);
	2013	KKASSERT(rep->r_info->state == NFSM_STATE_TRY \|\|
	2014	rep->r_info->state == NFSM_STATE_WAITREPLY);
	2015	rep->r_info->state = NFSM_STATE_PROCESSREPLY;
	2016	nfssvc_iod_reader_wakeup(nmp);
	2017	if (TAILQ_FIRST(&nmp->nm_bioq) &&
	2018	nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
	2019	nfssvc_iod_writer_wakeup(nmp);
	2020	}
	2021	}
	2022	mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
	2023	}
	2024	}
	2025
	2026	/*
	2027	* Test for a termination condition pending on the process.
	2028	* This is used for NFSMNT_INT mounts.
	2029	*/
	2030	int
	2031	nfs_sigintr(struct nfsmount nmp, struct nfsreq rep, struct thread *td)
	2032	{
	2033	sigset_t tmpset;
	2034	struct proc *p;
	2035	struct lwp *lp;
	2036
	2037	if (rep && (rep->r_flags & R_SOFTTERM))
	2038	return (EINTR);
	2039	/* Terminate all requests while attempting a forced unmount. */
	2040	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
	2041	return (EINTR);
	2042	if (!(nmp->nm_flag & NFSMNT_INT))
	2043	return (0);
	2044	/* td might be NULL YYY */
	2045	if (td == NULL \|\| (p = td->td_proc) == NULL)
	2046	return (0);
	2047
	2048	lp = td->td_lwp;
	2049	tmpset = lwp_sigpend(lp);
	2050	SIGSETNAND(tmpset, lp->lwp_sigmask);
	2051	SIGSETNAND(tmpset, p->p_sigignore);
	2052	if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
	2053	return (EINTR);
	2054
	2055	return (0);
	2056	}
	2057
	2058	/*
	2059	* Lock a socket against others.
	2060	* Necessary for STREAM sockets to ensure you get an entire rpc request/reply
	2061	* and also to avoid race conditions between the processes with nfs requests
	2062	* in progress when a reconnect is necessary.
	2063	*/
	2064	int
	2065	nfs_sndlock(struct nfsmount nmp, struct nfsreq rep)
	2066	{
	2067	mtx_t mtx = &nmp->nm_txlock;
	2068	struct thread *td;
	2069	int slptimeo;
	2070	int slpflag;
	2071	int error;
	2072
	2073	slpflag = 0;
	2074	slptimeo = 0;
	2075	td = rep ? rep->r_td : NULL;
	2076	if (nmp->nm_flag & NFSMNT_INT)
	2077	slpflag = PCATCH;
	2078
	2079	while ((error = mtx_lock_ex_try(mtx)) != 0) {
	2080	if (nfs_sigintr(nmp, rep, td)) {
	2081	error = EINTR;
	2082	break;
	2083	}
	2084	error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo);
	2085	if (error == 0)
	2086	break;
	2087	if (slpflag == PCATCH) {
	2088	slpflag = 0;
	2089	slptimeo = 2 * hz;
	2090	}
	2091	}
	2092	/* Always fail if our request has been cancelled. */
	2093	if (rep && (rep->r_flags & R_SOFTTERM)) {
	2094	if (error == 0)
	2095	mtx_unlock(mtx);
	2096	error = EINTR;
	2097	}
	2098	return (error);
	2099	}
	2100
	2101	/*
	2102	* Unlock the stream socket for others.
	2103	*/
	2104	void
	2105	nfs_sndunlock(struct nfsmount *nmp)
	2106	{
	2107	mtx_unlock(&nmp->nm_txlock);
	2108	}
	2109
	2110	/*
	2111	* Lock the receiver side of the socket.
	2112	*
	2113	* rep may be NULL.
	2114	*/
	2115	static int
	2116	nfs_rcvlock(struct nfsmount nmp, struct nfsreq rep)
	2117	{
	2118	mtx_t mtx = &nmp->nm_rxlock;
	2119	int slpflag;
	2120	int slptimeo;
	2121	int error;
	2122
	2123	/*
	2124	* Unconditionally check for completion in case another nfsiod
	2125	* get the packet while the caller was blocked, before the caller
	2126	* called us. Packet reception is handled by mainline code which
	2127	* is protected by the BGL at the moment.
	2128	*
	2129	* We do not strictly need the second check just before the
	2130	* tsleep(), but it's good defensive programming.
	2131	*/
	2132	if (rep && rep->r_mrep != NULL)
	2133	return (EALREADY);
	2134
	2135	if (nmp->nm_flag & NFSMNT_INT)
	2136	slpflag = PCATCH;
	2137	else
	2138	slpflag = 0;
	2139	slptimeo = 0;
	2140
	2141	while ((error = mtx_lock_ex_try(mtx)) != 0) {
	2142	if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) {
	2143	error = EINTR;
	2144	break;
	2145	}
	2146	if (rep && rep->r_mrep != NULL) {
	2147	error = EALREADY;
	2148	break;
	2149	}
	2150
	2151	/*
	2152	* NOTE: can return ENOLCK, but in that case rep->r_mrep
	2153	* will already be set.
	2154	*/
	2155	if (rep) {
	2156	error = mtx_lock_ex_link(mtx, &rep->r_link,
	2157	"nfsrcvlk",
	2158	slpflag, slptimeo);
	2159	} else {
	2160	error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo);
	2161	}
	2162	if (error == 0)
	2163	break;
	2164
	2165	/*
	2166	* If our reply was recieved while we were sleeping,
	2167	* then just return without taking the lock to avoid a
	2168	* situation where a single iod could 'capture' the
	2169	* recieve lock.
	2170	*/
	2171	if (rep && rep->r_mrep != NULL) {
	2172	error = EALREADY;
	2173	break;
	2174	}
	2175	if (slpflag == PCATCH) {
	2176	slpflag = 0;
	2177	slptimeo = 2 * hz;
	2178	}
	2179	}
	2180	if (error == 0) {
	2181	if (rep && rep->r_mrep != NULL) {
	2182	error = EALREADY;
	2183	mtx_unlock(mtx);
	2184	}
	2185	}
	2186	return (error);
	2187	}
	2188
	2189	/*
	2190	* Unlock the stream socket for others.
	2191	*/
	2192	static void
	2193	nfs_rcvunlock(struct nfsmount *nmp)
	2194	{
	2195	mtx_unlock(&nmp->nm_rxlock);
	2196	}
	2197
	2198	/*
	2199	* nfs_realign:
	2200	*
	2201	* Check for badly aligned mbuf data and realign by copying the unaligned
	2202	* portion of the data into a new mbuf chain and freeing the portions
	2203	* of the old chain that were replaced.
	2204	*
	2205	* We cannot simply realign the data within the existing mbuf chain
	2206	* because the underlying buffers may contain other rpc commands and
	2207	* we cannot afford to overwrite them.
	2208	*
	2209	* We would prefer to avoid this situation entirely. The situation does
	2210	* not occur with NFS/UDP and is supposed to only occassionally occur
	2211	* with TCP. Use vfs.nfs.realign_count and realign_test to check this.
	2212	*
	2213	* NOTE! MB_DONTWAIT cannot be used here. The mbufs must be acquired
	2214	* because the rpc request OR reply cannot be thrown away. TCP NFS
	2215	* mounts do not retry their RPCs unless the TCP connection itself
	2216	* is dropped so throwing away a RPC will basically cause the NFS
	2217	* operation to lockup indefinitely.
	2218	*/
	2219	static void
	2220	nfs_realign(struct mbuf **pm, int hsiz)
	2221	{
	2222	struct mbuf *m;
	2223	struct mbuf *n = NULL;
	2224
	2225	/*
	2226	* Check for misalignemnt
	2227	*/
	2228	++nfs_realign_test;
	2229	while ((m = *pm) != NULL) {
	2230	if ((m->m_len & 0x3) \|\| (mtod(m, intptr_t) & 0x3))
	2231	break;
	2232	pm = &m->m_next;
	2233	}
	2234
	2235	/*
	2236	* If misalignment found make a completely new copy.
	2237	*/
	2238	if (m) {
	2239	++nfs_realign_count;
	2240	n = m_dup_data(m, MB_WAIT);
	2241	m_freem(*pm);
	2242	*pm = n;
	2243	}
	2244	}
	2245
	2246	#ifndef NFS_NOSERVER
	2247
	2248	/*
	2249	* Parse an RPC request
	2250	* - verify it
	2251	* - fill in the cred struct.
	2252	*/
	2253	int
	2254	nfs_getreq(struct nfsrv_descript nd, struct nfsd nfsd, int has_header)
	2255	{
	2256	int len, i;
	2257	u_int32_t *tl;
	2258	struct uio uio;
	2259	struct iovec iov;
	2260	caddr_t cp;
	2261	u_int32_t nfsvers, auth_type;
	2262	uid_t nickuid;
	2263	int error = 0, ticklen;
	2264	struct nfsuid *nuidp;
	2265	struct timeval tvin, tvout;
	2266	struct nfsm_info info;
	2267	#if 0 /* until encrypted keys are implemented */
	2268	NFSKERBKEYSCHED_T keys; /* stores key schedule */
	2269	#endif
	2270
	2271	info.mrep = nd->nd_mrep;
	2272	info.md = nd->nd_md;
	2273	info.dpos = nd->nd_dpos;
	2274
	2275	if (has_header) {
	2276	NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED));
	2277	nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
	2278	if (*tl++ != rpc_call) {
	2279	m_freem(info.mrep);
	2280	return (EBADRPC);
	2281	}
	2282	} else {
	2283	NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED));
	2284	}
	2285	nd->nd_repstat = 0;
	2286	nd->nd_flag = 0;
	2287	if (*tl++ != rpc_vers) {
	2288	nd->nd_repstat = ERPCMISMATCH;
	2289	nd->nd_procnum = NFSPROC_NOOP;
	2290	return (0);
	2291	}
	2292	if (*tl != nfs_prog) {
	2293	nd->nd_repstat = EPROGUNAVAIL;
	2294	nd->nd_procnum = NFSPROC_NOOP;
	2295	return (0);
	2296	}
	2297	tl++;
	2298	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	2299	if (nfsvers < NFS_VER2 \|\| nfsvers > NFS_VER3) {
	2300	nd->nd_repstat = EPROGMISMATCH;
	2301	nd->nd_procnum = NFSPROC_NOOP;
	2302	return (0);
	2303	}
	2304	if (nfsvers == NFS_VER3)
	2305	nd->nd_flag = ND_NFSV3;
	2306	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	2307	if (nd->nd_procnum == NFSPROC_NULL)
	2308	return (0);
	2309	if (nd->nd_procnum >= NFS_NPROCS \|\|
	2310	(nd->nd_procnum >= NQNFSPROC_GETLEASE) \|\|
	2311	(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
	2312	nd->nd_repstat = EPROCUNAVAIL;
	2313	nd->nd_procnum = NFSPROC_NOOP;
	2314	return (0);
	2315	}
	2316	if ((nd->nd_flag & ND_NFSV3) == 0)
	2317	nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	2318	auth_type = *tl++;
	2319	len = fxdr_unsigned(int, *tl++);
	2320	if (len < 0 \|\| len > RPCAUTH_MAXSIZ) {
	2321	m_freem(info.mrep);
	2322	return (EBADRPC);
	2323	}
	2324
	2325	nd->nd_flag &= ~ND_KERBAUTH;
	2326	/*
	2327	* Handle auth_unix or auth_kerb.
	2328	*/
	2329	if (auth_type == rpc_auth_unix) {
	2330	len = fxdr_unsigned(int, *++tl);
	2331	if (len < 0 \|\| len > NFS_MAXNAMLEN) {
	2332	m_freem(info.mrep);
	2333	return (EBADRPC);
	2334	}
	2335	ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
	2336	NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
	2337	bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
	2338	nd->nd_cr.cr_ref = 1;
	2339	nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
	2340	nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid;
	2341	nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
	2342	nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid;
	2343	len = fxdr_unsigned(int, *tl);
	2344	if (len < 0 \|\| len > RPCAUTH_UNIXGIDS) {
	2345	m_freem(info.mrep);
	2346	return (EBADRPC);
	2347	}
	2348	NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED));
	2349	for (i = 1; i <= len; i++)
	2350	if (i < NGROUPS)
	2351	nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
	2352	else
	2353	tl++;
	2354	nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
	2355	if (nd->nd_cr.cr_ngroups > 1)
	2356	nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
	2357	len = fxdr_unsigned(int, *++tl);
	2358	if (len < 0 \|\| len > RPCAUTH_MAXSIZ) {
	2359	m_freem(info.mrep);
	2360	return (EBADRPC);
	2361	}
	2362	if (len > 0) {
	2363	ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
	2364	}
	2365	} else if (auth_type == rpc_auth_kerb) {
	2366	switch (fxdr_unsigned(int, *tl++)) {
	2367	case RPCAKN_FULLNAME:
	2368	ticklen = fxdr_unsigned(int, *tl);
	2369	((u_int32_t )nfsd->nfsd_authstr) = *tl;
	2370	uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
	2371	nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
	2372	if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
	2373	m_freem(info.mrep);
	2374	return (EBADRPC);
	2375	}
	2376	uio.uio_offset = 0;
	2377	uio.uio_iov = &iov;
	2378	uio.uio_iovcnt = 1;
	2379	uio.uio_segflg = UIO_SYSSPACE;
	2380	iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
	2381	iov.iov_len = RPCAUTH_MAXSIZ - 4;
	2382	ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid));
	2383	NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
	2384	if (*tl++ != rpc_auth_kerb \|\|
	2385	fxdr_unsigned(int, tl) != 4 NFSX_UNSIGNED) {
	2386	kprintf("Bad kerb verifier\n");
	2387	nd->nd_repstat = (NFSERR_AUTHERR\|AUTH_BADVERF);
	2388	nd->nd_procnum = NFSPROC_NOOP;
	2389	return (0);
	2390	}
	2391	NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED));
	2392	tl = (u_int32_t *)cp;
	2393	if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
	2394	kprintf("Not fullname kerb verifier\n");
	2395	nd->nd_repstat = (NFSERR_AUTHERR\|AUTH_BADVERF);
	2396	nd->nd_procnum = NFSPROC_NOOP;
	2397	return (0);
	2398	}
	2399	cp += NFSX_UNSIGNED;
	2400	bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
	2401	nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
	2402	nd->nd_flag \|= ND_KERBFULL;
	2403	nfsd->nfsd_flag \|= NFSD_NEEDAUTH;
	2404	break;
	2405	case RPCAKN_NICKNAME:
	2406	if (len != 2 * NFSX_UNSIGNED) {
	2407	kprintf("Kerb nickname short\n");
	2408	nd->nd_repstat = (NFSERR_AUTHERR\|AUTH_BADCRED);
	2409	nd->nd_procnum = NFSPROC_NOOP;
	2410	return (0);
	2411	}
	2412	nickuid = fxdr_unsigned(uid_t, *tl);
	2413	NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
	2414	if (*tl++ != rpc_auth_kerb \|\|
	2415	fxdr_unsigned(int, tl) != 3 NFSX_UNSIGNED) {
	2416	kprintf("Kerb nick verifier bad\n");
	2417	nd->nd_repstat = (NFSERR_AUTHERR\|AUTH_BADVERF);
	2418	nd->nd_procnum = NFSPROC_NOOP;
	2419	return (0);
	2420	}
	2421	NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
	2422	tvin.tv_sec = *tl++;
	2423	tvin.tv_usec = *tl;
	2424
	2425	for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
	2426	nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
	2427	if (nuidp->nu_cr.cr_uid == nickuid &&
	2428	(!nd->nd_nam2 \|\|
	2429	netaddr_match(NU_NETFAM(nuidp),
	2430	&nuidp->nu_haddr, nd->nd_nam2)))
	2431	break;
	2432	}
	2433	if (!nuidp) {
	2434	nd->nd_repstat =
	2435	(NFSERR_AUTHERR\|AUTH_REJECTCRED);
	2436	nd->nd_procnum = NFSPROC_NOOP;
	2437	return (0);
	2438	}
	2439
	2440	/*
	2441	* Now, decrypt the timestamp using the session key
	2442	* and validate it.
	2443	*/
	2444	#ifdef NFSKERB
	2445	XXX
	2446	#else
	2447	tvout.tv_sec = 0;
	2448	tvout.tv_usec = 0;
	2449	#endif
	2450
	2451	tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
	2452	tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
	2453	if (nuidp->nu_expire < time_second \|\|
	2454	nuidp->nu_timestamp.tv_sec > tvout.tv_sec \|\|
	2455	(nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
	2456	nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
	2457	nuidp->nu_expire = 0;
	2458	nd->nd_repstat =
	2459	(NFSERR_AUTHERR\|AUTH_REJECTVERF);
	2460	nd->nd_procnum = NFSPROC_NOOP;
	2461	return (0);
	2462	}
	2463	nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
	2464	nd->nd_flag \|= ND_KERBNICK;
	2465	};
	2466	} else {
	2467	nd->nd_repstat = (NFSERR_AUTHERR \| AUTH_REJECTCRED);
	2468	nd->nd_procnum = NFSPROC_NOOP;
	2469	return (0);
	2470	}
	2471
	2472	nd->nd_md = info.md;
	2473	nd->nd_dpos = info.dpos;
	2474	return (0);
	2475	nfsmout:
	2476	return (error);
	2477	}
	2478
	2479	#endif
	2480
	2481	/*
	2482	* Send a message to the originating process's terminal. The thread and/or
	2483	* process may be NULL. YYY the thread should not be NULL but there may
	2484	* still be some uio_td's that are still being passed as NULL through to
	2485	* nfsm_request().
	2486	*/
	2487	static int
	2488	nfs_msg(struct thread td, char server, char *msg)
	2489	{
	2490	tpr_t tpr;
	2491
	2492	if (td && td->td_proc)
	2493	tpr = tprintf_open(td->td_proc);
	2494	else
	2495	tpr = NULL;
	2496	tprintf(tpr, "nfs server %s: %s\n", server, msg);
	2497	tprintf_close(tpr);
	2498	return (0);
	2499	}
	2500
	2501	#ifndef NFS_NOSERVER
	2502
	2503	/*
	2504	* Socket upcall routine for nfsd sockets. This runs in the protocol
	2505	* thread and passes waitflag == MB_DONTWAIT.
	2506	*/
	2507	void
	2508	nfsrv_rcv_upcall(struct socket so, void arg, int waitflag)
	2509	{
	2510	struct nfssvc_sock slp = (struct nfssvc_sock )arg;
	2511
	2512	if (slp->ns_needq_upcall == 0) {
	2513	slp->ns_needq_upcall = 1; /* ok to race */
	2514	lwkt_gettoken(&nfs_token);
	2515	nfsrv_wakenfsd(slp, 1);
	2516	lwkt_reltoken(&nfs_token);
	2517	}
	2518	#if 0
	2519	lwkt_gettoken(&slp->ns_token);
	2520	slp->ns_flag \|= SLP_NEEDQ;
	2521	nfsrv_rcv(so, arg, waitflag);
	2522	lwkt_reltoken(&slp->ns_token);
	2523	#endif
	2524	}
	2525
	2526	/*
	2527	* Process new data on a receive socket. Essentially do as much as we can
	2528	* non-blocking, else punt and it will be called with MB_WAIT from an nfsd.
	2529	*
	2530	* slp->ns_token is held on call
	2531	*/
	2532	void
	2533	nfsrv_rcv(struct socket so, void arg, int waitflag)
	2534	{
	2535	struct nfssvc_sock slp = (struct nfssvc_sock )arg;
	2536	struct mbuf *m;
	2537	struct sockaddr *nam;
	2538	struct sockbuf sio;
	2539	int flags, error;
	2540	int nparallel_wakeup = 0;
	2541
	2542	ASSERT_LWKT_TOKEN_HELD(&slp->ns_token);
	2543
	2544	if ((slp->ns_flag & SLP_VALID) == 0)
	2545	return;
	2546
	2547	/*
	2548	* Do not allow an infinite number of completed RPC records to build
	2549	* up before we stop reading data from the socket. Otherwise we could
	2550	* end up holding onto an unreasonable number of mbufs for requests
	2551	* waiting for service.
	2552	*
	2553	* This should give pretty good feedback to the TCP layer and
	2554	* prevents a memory crunch for other protocols.
	2555	*
	2556	* Note that the same service socket can be dispatched to several
	2557	* nfs servers simultaniously. The tcp protocol callback calls us
	2558	* with MB_DONTWAIT. nfsd calls us with MB_WAIT (typically).
	2559	*/
	2560	if (NFSRV_RECLIMIT(slp))
	2561	return;
	2562
	2563	/*
	2564	* Handle protocol specifics to parse an RPC request. We always
	2565	* pull from the socket using non-blocking I/O.
	2566	*/
	2567	if (so->so_type == SOCK_STREAM) {
	2568	/*
	2569	* The data has to be read in an orderly fashion from a TCP
	2570	* stream, unlike a UDP socket. It is possible for soreceive
	2571	* and/or nfsrv_getstream() to block, so make sure only one
	2572	* entity is messing around with the TCP stream at any given
	2573	* moment. The receive sockbuf's lock in soreceive is not
	2574	* sufficient.
	2575	*/
	2576	if (slp->ns_flag & SLP_GETSTREAM)
	2577	return;
	2578	slp->ns_flag \|= SLP_GETSTREAM;
	2579
	2580	/*
	2581	* Do soreceive(). Pull out as much data as possible without
	2582	* blocking.
	2583	*/
	2584	sbinit(&sio, 1000000000);
	2585	flags = MSG_DONTWAIT;
	2586	error = so_pru_soreceive(so, &nam, NULL, &sio, NULL, &flags);
	2587	if (error \|\| sio.sb_mb == NULL) {
	2588	if (error != EWOULDBLOCK)
	2589	slp->ns_flag \|= SLP_DISCONN;
	2590	slp->ns_flag &= ~(SLP_GETSTREAM \| SLP_NEEDQ);
	2591	goto done;
	2592	}
	2593	m = sio.sb_mb;
	2594	if (slp->ns_rawend) {
	2595	slp->ns_rawend->m_next = m;
	2596	slp->ns_cc += sio.sb_cc;
	2597	} else {
	2598	slp->ns_raw = m;
	2599	slp->ns_cc = sio.sb_cc;
	2600	}
	2601	while (m->m_next)
	2602	m = m->m_next;
	2603	slp->ns_rawend = m;
	2604
	2605	/*
	2606	* Now try and parse as many record(s) as we can out of the
	2607	* raw stream data. This will set SLP_DOREC.
	2608	*/
	2609	error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
	2610	if (error && error != EWOULDBLOCK)
	2611	slp->ns_flag \|= SLP_DISCONN;
	2612	slp->ns_flag &= ~SLP_GETSTREAM;
	2613	} else {
	2614	/*
	2615	* For UDP soreceive typically pulls just one packet, loop
	2616	* to get the whole batch.
	2617	*/
	2618	do {
	2619	sbinit(&sio, 1000000000);
	2620	flags = MSG_DONTWAIT;
	2621	error = so_pru_soreceive(so, &nam, NULL, &sio,
	2622	NULL, &flags);
	2623	if (sio.sb_mb) {
	2624	struct nfsrv_rec *rec;
	2625	int mf = (waitflag & MB_DONTWAIT) ?
	2626	M_NOWAIT : M_WAITOK;
	2627	rec = kmalloc(sizeof(struct nfsrv_rec),
	2628	M_NFSRVDESC, mf);
	2629	if (!rec) {
	2630	if (nam)
	2631	FREE(nam, M_SONAME);
	2632	m_freem(sio.sb_mb);
	2633	continue;
	2634	}
	2635	nfs_realign(&sio.sb_mb, 10 * NFSX_UNSIGNED);
	2636	rec->nr_address = nam;
	2637	rec->nr_packet = sio.sb_mb;
	2638	STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
	2639	++slp->ns_numrec;
	2640	slp->ns_flag \|= SLP_DOREC;
	2641	++nparallel_wakeup;
	2642	} else {
	2643	slp->ns_flag &= ~SLP_NEEDQ;
	2644	}
	2645	if (error) {
	2646	if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
	2647	&& error != EWOULDBLOCK) {
	2648	slp->ns_flag \|= SLP_DISCONN;
	2649	break;
	2650	}
	2651	}
	2652	if (NFSRV_RECLIMIT(slp))
	2653	break;
	2654	} while (sio.sb_mb);
	2655	}
	2656
	2657	/*
	2658	* If we were upcalled from the tcp protocol layer and we have
	2659	* fully parsed records ready to go, or there is new data pending,
	2660	* or something went wrong, try to wake up a nfsd thread to deal
	2661	* with it.
	2662	*/
	2663	done:
	2664	/* XXX this code is currently not executed (nfsrv_rcv_upcall) */
	2665	if (waitflag == MB_DONTWAIT && (slp->ns_flag & SLP_ACTION_MASK)) {
	2666	lwkt_gettoken(&nfs_token);
	2667	nfsrv_wakenfsd(slp, nparallel_wakeup);
	2668	lwkt_reltoken(&nfs_token);
	2669	}
	2670	}
	2671
	2672	/*
	2673	* Try and extract an RPC request from the mbuf data list received on a
	2674	* stream socket. The "waitflag" argument indicates whether or not it
	2675	* can sleep.
	2676	*/
	2677	static int
	2678	nfsrv_getstream(struct nfssvc_sock slp, int waitflag, int countp)
	2679	{
	2680	struct mbuf m, *mpp;
	2681	char cp1, cp2;
	2682	int len;
	2683	struct mbuf om, m2, *recm;
	2684	u_int32_t recmark;
	2685
	2686	for (;;) {
	2687	if (slp->ns_reclen == 0) {
	2688	if (slp->ns_cc < NFSX_UNSIGNED)
	2689	return (0);
	2690	m = slp->ns_raw;
	2691	if (m->m_len >= NFSX_UNSIGNED) {
	2692	bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
	2693	m->m_data += NFSX_UNSIGNED;
	2694	m->m_len -= NFSX_UNSIGNED;
	2695	} else {
	2696	cp1 = (caddr_t)&recmark;
	2697	cp2 = mtod(m, caddr_t);
	2698	while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
	2699	while (m->m_len == 0) {
	2700	m = m->m_next;
	2701	cp2 = mtod(m, caddr_t);
	2702	}
	2703	cp1++ = cp2++;
	2704	m->m_data++;
	2705	m->m_len--;
	2706	}
	2707	}
	2708	slp->ns_cc -= NFSX_UNSIGNED;
	2709	recmark = ntohl(recmark);
	2710	slp->ns_reclen = recmark & ~0x80000000;
	2711	if (recmark & 0x80000000)
	2712	slp->ns_flag \|= SLP_LASTFRAG;
	2713	else
	2714	slp->ns_flag &= ~SLP_LASTFRAG;
	2715	if (slp->ns_reclen > NFS_MAXPACKET \|\| slp->ns_reclen <= 0) {
	2716	log(LOG_ERR, "%s (%d) from nfs client\n",
	2717	"impossible packet length",
	2718	slp->ns_reclen);
	2719	return (EPERM);
	2720	}
	2721	}
	2722
	2723	/*
	2724	* Now get the record part.
	2725	*
	2726	* Note that slp->ns_reclen may be 0. Linux sometimes
	2727	* generates 0-length RPCs
	2728	*/
	2729	recm = NULL;
	2730	if (slp->ns_cc == slp->ns_reclen) {
	2731	recm = slp->ns_raw;
	2732	slp->ns_raw = slp->ns_rawend = NULL;
	2733	slp->ns_cc = slp->ns_reclen = 0;
	2734	} else if (slp->ns_cc > slp->ns_reclen) {
	2735	len = 0;
	2736	m = slp->ns_raw;
	2737	om = NULL;
	2738
	2739	while (len < slp->ns_reclen) {
	2740	if ((len + m->m_len) > slp->ns_reclen) {
	2741	m2 = m_copym(m, 0, slp->ns_reclen - len,
	2742	waitflag);
	2743	if (m2) {
	2744	if (om) {
	2745	om->m_next = m2;
	2746	recm = slp->ns_raw;
	2747	} else
	2748	recm = m2;
	2749	m->m_data += slp->ns_reclen - len;
	2750	m->m_len -= slp->ns_reclen - len;
	2751	len = slp->ns_reclen;
	2752	} else {
	2753	return (EWOULDBLOCK);
	2754	}
	2755	} else if ((len + m->m_len) == slp->ns_reclen) {
	2756	om = m;
	2757	len += m->m_len;
	2758	m = m->m_next;
	2759	recm = slp->ns_raw;
	2760	om->m_next = NULL;
	2761	} else {
	2762	om = m;
	2763	len += m->m_len;
	2764	m = m->m_next;
	2765	}
	2766	}
	2767	slp->ns_raw = m;
	2768	slp->ns_cc -= len;
	2769	slp->ns_reclen = 0;
	2770	} else {
	2771	return (0);
	2772	}
	2773
	2774	/*
	2775	* Accumulate the fragments into a record.
	2776	*/
	2777	mpp = &slp->ns_frag;
	2778	while (*mpp)
	2779	mpp = &((*mpp)->m_next);
	2780	*mpp = recm;
	2781	if (slp->ns_flag & SLP_LASTFRAG) {
	2782	struct nfsrv_rec *rec;
	2783	int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
	2784	rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
	2785	if (!rec) {
	2786	m_freem(slp->ns_frag);
	2787	} else {
	2788	nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
	2789	rec->nr_address = NULL;
	2790	rec->nr_packet = slp->ns_frag;
	2791	STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
	2792	++slp->ns_numrec;
	2793	slp->ns_flag \|= SLP_DOREC;
	2794	++*countp;
	2795	}
	2796	slp->ns_frag = NULL;
	2797	}
	2798	}
	2799	}
	2800
	2801	#ifdef INVARIANTS
	2802
	2803	/*
	2804	* Sanity check our mbuf chain.
	2805	*/
	2806	static void
	2807	nfs_checkpkt(struct mbuf *m, int len)
	2808	{
	2809	int xlen = 0;
	2810	while (m) {
	2811	xlen += m->m_len;
	2812	m = m->m_next;
	2813	}
	2814	if (xlen != len) {
	2815	panic("nfs_checkpkt: len mismatch %d/%d mbuf %p\n",
	2816	xlen, len, m);
	2817	}
	2818	}
	2819
	2820	#else
	2821
	2822	static void
	2823	nfs_checkpkt(struct mbuf *m __unused, int len __unused)
	2824	{
	2825	}
	2826
	2827	#endif
	2828
	2829	/*
	2830	* Parse an RPC header.
	2831	*
	2832	* If the socket is invalid or no records are pending we return ENOBUFS.
	2833	* The caller must deal with NEEDQ races.
	2834	*/
	2835	int
	2836	nfsrv_dorec(struct nfssvc_sock slp, struct nfsd nfsd,
	2837	struct nfsrv_descript **ndp)
	2838	{
	2839	struct nfsrv_rec *rec;
	2840	struct mbuf *m;
	2841	struct sockaddr *nam;
	2842	struct nfsrv_descript *nd;
	2843	int error;
	2844
	2845	*ndp = NULL;
	2846	if ((slp->ns_flag & SLP_VALID) == 0 \|\| !STAILQ_FIRST(&slp->ns_rec))
	2847	return (ENOBUFS);
	2848	rec = STAILQ_FIRST(&slp->ns_rec);
	2849	STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
	2850	KKASSERT(slp->ns_numrec > 0);
	2851	if (--slp->ns_numrec == 0)
	2852	slp->ns_flag &= ~SLP_DOREC;
	2853	nam = rec->nr_address;
	2854	m = rec->nr_packet;
	2855	kfree(rec, M_NFSRVDESC);
	2856	MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
	2857	M_NFSRVDESC, M_WAITOK);
	2858	nd->nd_md = nd->nd_mrep = m;
	2859	nd->nd_nam2 = nam;
	2860	nd->nd_dpos = mtod(m, caddr_t);
	2861	error = nfs_getreq(nd, nfsd, TRUE);
	2862	if (error) {
	2863	if (nam) {
	2864	FREE(nam, M_SONAME);
	2865	}
	2866	kfree((caddr_t)nd, M_NFSRVDESC);
	2867	return (error);
	2868	}
	2869	*ndp = nd;
	2870	nfsd->nfsd_nd = nd;
	2871	return (0);
	2872	}
	2873
	2874	/*
	2875	* Try to assign service sockets to nfsd threads based on the number
	2876	* of new rpc requests that have been queued on the service socket.
	2877	*
	2878	* If no nfsd's are available or additonal requests are pending, set the
	2879	* NFSD_CHECKSLP flag so that one of the running nfsds will go look for
	2880	* the work in the nfssvc_sock list when it is finished processing its
	2881	* current work. This flag is only cleared when an nfsd can not find
	2882	* any new work to perform.
	2883	*/
	2884	void
	2885	nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
	2886	{
	2887	struct nfsd *nd;
	2888
	2889	if ((slp->ns_flag & SLP_VALID) == 0)
	2890	return;
	2891	if (nparallel <= 1)
	2892	nparallel = 1;
	2893	TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
	2894	if (nd->nfsd_flag & NFSD_WAITING) {
	2895	nd->nfsd_flag &= ~NFSD_WAITING;
	2896	if (nd->nfsd_slp)
	2897	panic("nfsd wakeup");
	2898	nfsrv_slpref(slp);
	2899	nd->nfsd_slp = slp;
	2900	wakeup((caddr_t)nd);
	2901	if (--nparallel == 0)
	2902	break;
	2903	}
	2904	}
	2905
	2906	/*
	2907	* If we couldn't assign slp then the NFSDs are all busy and
	2908	* we set a flag indicating that there is pending work.
	2909	*/
	2910	if (nparallel)
	2911	nfsd_head_flag \|= NFSD_CHECKSLP;
	2912	}
	2913	#endif /* NFS_NOSERVER */