Allow the nominal NFS io block size to be set with a sysctl vfs.nfs.nfs_io_size
[dragonfly.git] / sys / vfs / nfs / nfs_socket.c
1 /*
2  * Copyright (c) 1989, 1991, 1993, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the University of
19  *      California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
37  * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38  * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.13 2004/03/05 16:57:16 hsu Exp $
39  */
40
41 /*
42  * Socket operations for use by nfs
43  */
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/protosw.h>
54 #include <sys/resourcevar.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/socketops.h>
58 #include <sys/syslog.h>
59 #include <sys/thread.h>
60 #include <sys/tprintf.h>
61 #include <sys/sysctl.h>
62 #include <sys/signalvar.h>
63
64 #include <netinet/in.h>
65 #include <netinet/tcp.h>
66
67 #include "rpcv2.h"
68 #include "nfsproto.h"
69 #include "nfs.h"
70 #include "xdr_subs.h"
71 #include "nfsm_subs.h"
72 #include "nfsmount.h"
73 #include "nfsnode.h"
74 #include "nfsrtt.h"
75 #include "nqnfs.h"
76
#define TRUE	1
#define FALSE	0

/*
 * Estimate rto for an nfs rpc sent via an unreliable datagram.
 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 * for the frequent rpcs and a default for the others.
 * The justification for doing "other" this way is that these rpcs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these rpcs are
 * non-idempotent, a conservative timeout is desired.
 * getattr, lookup - A+2D
 * read, write     - A+4D
 * other           - nm_timeo
 *
 * NFS_RTO(n, t):
 *	n - pointer to the mount structure holding nm_timeo, nm_srtt[]
 *	    and nm_sdrtt[]
 *	t - timer number; 0 selects nm_timeo, any other value selects
 *	    slot (t - 1) of the srtt/sdrtt arrays
 * Both arguments are evaluated more than once, so they must not have
 * side effects.  They are fully parenthesized in the expansion, so any
 * expression may be passed for "t" without disturbing the array index.
 */
#define NFS_RTO(n, t) \
	((t) == 0 ? (n)->nm_timeo : \
	 ((t) < 3 ? \
	  (((((n)->nm_srtt[(t) - 1] + 3) >> 2) + \
	    (n)->nm_sdrtt[(t) - 1] + 1) >> 1) : \
	  ((((n)->nm_srtt[(t) - 1] + 7) >> 3) + \
	    (n)->nm_sdrtt[(t) - 1] + 1)))

/*
 * Smoothed rtt / rtt deviation slot for the timer assigned (via proct[])
 * to the procedure number of request "r".  Both expand to lvalues.
 */
#define NFS_SRTT(r)	((r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1])
#define NFS_SDRTT(r)	((r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1])
99 /*
100  * External data, mostly RPC constants in XDR form
101  */
102 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
103         rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
104         rpc_auth_kerb;
105 extern u_int32_t nfs_prog, nqnfs_prog;
106 extern time_t nqnfsstarttime;
107 extern struct nfsstats nfsstats;
108 extern int nfsv3_procid[NFS_NPROCS];
109 extern int nfs_ticks;
110
111 /*
112  * Defines which timer to use for the procnum.
113  * 0 - default
114  * 1 - getattr
115  * 2 - lookup
116  * 3 - read
117  * 4 - write
118  */
119 static int proct[NFS_NPROCS] = {
120         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
121         0, 0, 0,
122 };
123
124 static int nfs_realign_test;
125 static int nfs_realign_count;
126 static int nfs_bufpackets = 4;
127
128 SYSCTL_DECL(_vfs_nfs);
129
130 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
131 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
132 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
133
134
135 /*
136  * There is a congestion window for outstanding rpcs maintained per mount
137  * point. The cwnd size is adjusted in roughly the way that:
138  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
139  * SIGCOMM '88". ACM, August 1988.
140  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
141  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
142  * of rpcs is in progress.
143  * (The sent count and cwnd are scaled for integer arith.)
144  * Variants of "slow start" were tried and were found to be too much of a
145  * performance hit (ave. rtt 3 times larger),
146  * I suspect due to the large rtt that nfs rpcs have.
147  */
148 #define NFS_CWNDSCALE   256
149 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
150 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
151 int nfsrtton = 0;
152 struct nfsrtt nfsrtt;
153 struct callout_handle   nfs_timer_handle;
154
/* Forward declarations for routines private to this file. */
static int	nfs_msg(struct thread *, char *, char *);
static int	nfs_rcvlock(struct nfsreq *);
static void	nfs_rcvunlock(struct nfsreq *);
static void	nfs_realign(struct mbuf **pm, int hsiz);
static int	nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
		    struct mbuf **mp);
static void	nfs_softterm(struct nfsreq *rep);
static int	nfs_reconnect(struct nfsreq *rep);
163 #ifndef NFS_NOSERVER 
164 static int      nfsrv_getstream (struct nfssvc_sock *,int);
165
166 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
167                                     struct nfssvc_sock *slp,
168                                     struct thread *td,
169                                     struct mbuf **mreqp) = {
170         nfsrv_null,
171         nfsrv_getattr,
172         nfsrv_setattr,
173         nfsrv_lookup,
174         nfsrv3_access,
175         nfsrv_readlink,
176         nfsrv_read,
177         nfsrv_write,
178         nfsrv_create,
179         nfsrv_mkdir,
180         nfsrv_symlink,
181         nfsrv_mknod,
182         nfsrv_remove,
183         nfsrv_rmdir,
184         nfsrv_rename,
185         nfsrv_link,
186         nfsrv_readdir,
187         nfsrv_readdirplus,
188         nfsrv_statfs,
189         nfsrv_fsinfo,
190         nfsrv_pathconf,
191         nfsrv_commit,
192         nqnfsrv_getlease,
193         nqnfsrv_vacated,
194         nfsrv_noop,
195         nfsrv_noop
196 };
197 #endif /* NFS_NOSERVER */
198
199 /*
200  * Initialize sockets and congestion for a new NFS connection.
201  * We do not free the sockaddr if error.
202  */
203 int
204 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
205 {
206         struct socket *so;
207         int s, error, rcvreserve, sndreserve;
208         int pktscale;
209         struct sockaddr *saddr;
210         struct sockaddr_in *sin;
211         struct thread *td = &thread0; /* only used for socreate and sobind */
212
213         nmp->nm_so = (struct socket *)0;
214         saddr = nmp->nm_nam;
215         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
216                 nmp->nm_soproto, td);
217         if (error)
218                 goto bad;
219         so = nmp->nm_so;
220         nmp->nm_soflags = so->so_proto->pr_flags;
221
222         /*
223          * Some servers require that the client port be a reserved port number.
224          */
225         if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
226                 struct sockopt sopt;
227                 int ip;
228                 struct sockaddr_in ssin;
229
230                 bzero(&sopt, sizeof sopt);
231                 ip = IP_PORTRANGE_LOW;
232                 sopt.sopt_dir = SOPT_SET;
233                 sopt.sopt_level = IPPROTO_IP;
234                 sopt.sopt_name = IP_PORTRANGE;
235                 sopt.sopt_val = (void *)&ip;
236                 sopt.sopt_valsize = sizeof(ip);
237                 sopt.sopt_td = NULL;
238                 error = sosetopt(so, &sopt);
239                 if (error)
240                         goto bad;
241                 bzero(&ssin, sizeof ssin);
242                 sin = &ssin;
243                 sin->sin_len = sizeof (struct sockaddr_in);
244                 sin->sin_family = AF_INET;
245                 sin->sin_addr.s_addr = INADDR_ANY;
246                 sin->sin_port = htons(0);
247                 error = sobind(so, (struct sockaddr *)sin, td);
248                 if (error)
249                         goto bad;
250                 bzero(&sopt, sizeof sopt);
251                 ip = IP_PORTRANGE_DEFAULT;
252                 sopt.sopt_dir = SOPT_SET;
253                 sopt.sopt_level = IPPROTO_IP;
254                 sopt.sopt_name = IP_PORTRANGE;
255                 sopt.sopt_val = (void *)&ip;
256                 sopt.sopt_valsize = sizeof(ip);
257                 sopt.sopt_td = NULL;
258                 error = sosetopt(so, &sopt);
259                 if (error)
260                         goto bad;
261         }
262
263         /*
264          * Protocols that do not require connections may be optionally left
265          * unconnected for servers that reply from a port other than NFS_PORT.
266          */
267         if (nmp->nm_flag & NFSMNT_NOCONN) {
268                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
269                         error = ENOTCONN;
270                         goto bad;
271                 }
272         } else {
273                 error = soconnect(so, nmp->nm_nam, td);
274                 if (error)
275                         goto bad;
276
277                 /*
278                  * Wait for the connection to complete. Cribbed from the
279                  * connect system call but with the wait timing out so
280                  * that interruptible mounts don't hang here for a long time.
281                  */
282                 s = splnet();
283                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
284                         (void) tsleep((caddr_t)&so->so_timeo, 0,
285                                 "nfscon", 2 * hz);
286                         if ((so->so_state & SS_ISCONNECTING) &&
287                             so->so_error == 0 && rep &&
288                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
289                                 so->so_state &= ~SS_ISCONNECTING;
290                                 splx(s);
291                                 goto bad;
292                         }
293                 }
294                 if (so->so_error) {
295                         error = so->so_error;
296                         so->so_error = 0;
297                         splx(s);
298                         goto bad;
299                 }
300                 splx(s);
301         }
302         so->so_rcv.sb_timeo = (5 * hz);
303         so->so_snd.sb_timeo = (5 * hz);
304
305         /*
306          * Get buffer reservation size from sysctl, but impose reasonable
307          * limits.
308          */
309         pktscale = nfs_bufpackets;
310         if (pktscale < 2)
311                 pktscale = 2;
312         if (pktscale > 64)
313                 pktscale = 64;
314
315         if (nmp->nm_sotype == SOCK_DGRAM) {
316                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
317                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
318                     NFS_MAXPKTHDR) * pktscale;
319         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
320                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
321                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
322                     NFS_MAXPKTHDR) * pktscale;
323         } else {
324                 if (nmp->nm_sotype != SOCK_STREAM)
325                         panic("nfscon sotype");
326                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
327                         struct sockopt sopt;
328                         int val;
329
330                         bzero(&sopt, sizeof sopt);
331                         sopt.sopt_level = SOL_SOCKET;
332                         sopt.sopt_name = SO_KEEPALIVE;
333                         sopt.sopt_val = &val;
334                         sopt.sopt_valsize = sizeof val;
335                         val = 1;
336                         sosetopt(so, &sopt);
337                 }
338                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
339                         struct sockopt sopt;
340                         int val;
341
342                         bzero(&sopt, sizeof sopt);
343                         sopt.sopt_level = IPPROTO_TCP;
344                         sopt.sopt_name = TCP_NODELAY;
345                         sopt.sopt_val = &val;
346                         sopt.sopt_valsize = sizeof val;
347                         val = 1;
348                         sosetopt(so, &sopt);
349                 }
350                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
351                     sizeof (u_int32_t)) * pktscale;
352                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
353                     sizeof (u_int32_t)) * pktscale;
354         }
355         error = soreserve(so, sndreserve, rcvreserve,
356                           &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
357         if (error)
358                 goto bad;
359         so->so_rcv.sb_flags |= SB_NOINTR;
360         so->so_snd.sb_flags |= SB_NOINTR;
361
362         /* Initialize other non-zero congestion variables */
363         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = 
364                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
365         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
366                 nmp->nm_sdrtt[3] = 0;
367         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
368         nmp->nm_sent = 0;
369         nmp->nm_timeouts = 0;
370         return (0);
371
372 bad:
373         nfs_disconnect(nmp);
374         return (error);
375 }
376
377 /*
378  * Reconnect routine:
379  * Called when a connection is broken on a reliable protocol.
380  * - clean up the old socket
381  * - nfs_connect() again
382  * - set R_MUSTRESEND for all outstanding requests on mount point
383  * If this fails the mount point is DEAD!
384  * nb: Must be called with the nfs_sndlock() set on the mount point.
385  */
386 static int
387 nfs_reconnect(rep)
388         struct nfsreq *rep;
389 {
390         struct nfsreq *rp;
391         struct nfsmount *nmp = rep->r_nmp;
392         int error;
393
394         nfs_disconnect(nmp);
395         while ((error = nfs_connect(nmp, rep)) != 0) {
396                 if (error == EINTR || error == ERESTART)
397                         return (EINTR);
398                 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
399         }
400
401         /*
402          * Loop through outstanding request list and fix up all requests
403          * on old socket.
404          */
405         for (rp = nfs_reqq.tqh_first; rp != 0; rp = rp->r_chain.tqe_next) {
406                 if (rp->r_nmp == nmp)
407                         rp->r_flags |= R_MUSTRESEND;
408         }
409         return (0);
410 }
411
412 /*
413  * NFS disconnect. Clean up and unlink.
414  */
415 void
416 nfs_disconnect(nmp)
417         struct nfsmount *nmp;
418 {
419         struct socket *so;
420
421         if (nmp->nm_so) {
422                 so = nmp->nm_so;
423                 nmp->nm_so = (struct socket *)0;
424                 soshutdown(so, 2);
425                 soclose(so);
426         }
427 }
428
429 void
430 nfs_safedisconnect(nmp)
431         struct nfsmount *nmp;
432 {
433         struct nfsreq dummyreq;
434
435         bzero(&dummyreq, sizeof(dummyreq));
436         dummyreq.r_nmp = nmp;
437         dummyreq.r_td = NULL;
438         nfs_rcvlock(&dummyreq);
439         nfs_disconnect(nmp);
440         nfs_rcvunlock(&dummyreq);
441 }
442
443 /*
444  * This is the nfs send routine. For connection based socket types, it
445  * must be called with an nfs_sndlock() on the socket.
446  * "rep == NULL" indicates that it has been called from a server.
447  * For the client side:
448  * - return EINTR if the RPC is terminated, 0 otherwise
449  * - set R_MUSTRESEND if the send fails for any reason
450  * - do any cleanup required by recoverable socket errors (?)
451  * For the server side:
452  * - return EINTR or ERESTART if interrupted by a signal
453  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
454  * - do any cleanup required by recoverable socket errors (?)
455  */
456 int
457 nfs_send(so, nam, top, rep)
458         struct socket *so;
459         struct sockaddr *nam;
460         struct mbuf *top;
461         struct nfsreq *rep;
462 {
463         struct sockaddr *sendnam;
464         int error, soflags, flags;
465
466         if (rep) {
467                 if (rep->r_flags & R_SOFTTERM) {
468                         m_freem(top);
469                         return (EINTR);
470                 }
471                 if ((so = rep->r_nmp->nm_so) == NULL) {
472                         rep->r_flags |= R_MUSTRESEND;
473                         m_freem(top);
474                         return (0);
475                 }
476                 rep->r_flags &= ~R_MUSTRESEND;
477                 soflags = rep->r_nmp->nm_soflags;
478         } else
479                 soflags = so->so_proto->pr_flags;
480         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
481                 sendnam = (struct sockaddr *)0;
482         else
483                 sendnam = nam;
484         if (so->so_type == SOCK_SEQPACKET)
485                 flags = MSG_EOR;
486         else
487                 flags = 0;
488
489         error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
490             curthread /*XXX*/);
491         /*
492          * ENOBUFS for dgram sockets is transient and non fatal.
493          * No need to log, and no need to break a soft mount.
494          */
495         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
496                 error = 0;
497                 if (rep)                /* do backoff retransmit on client */
498                         rep->r_flags |= R_MUSTRESEND;
499         }
500
501         if (error) {
502                 if (rep) {
503                         log(LOG_INFO, "nfs send error %d for server %s\n",error,
504                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
505                         /*
506                          * Deal with errors for the client side.
507                          */
508                         if (rep->r_flags & R_SOFTTERM)
509                                 error = EINTR;
510                         else
511                                 rep->r_flags |= R_MUSTRESEND;
512                 } else
513                         log(LOG_INFO, "nfsd send error %d\n", error);
514
515                 /*
516                  * Handle any recoverable (soft) socket errors here. (?)
517                  */
518                 if (error != EINTR && error != ERESTART &&
519                         error != EWOULDBLOCK && error != EPIPE)
520                         error = 0;
521         }
522         return (error);
523 }
524
525 /*
526  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
527  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
528  * Mark and consolidate the data into a new mbuf list.
529  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
530  *     small mbufs.
531  * For SOCK_STREAM we must be very careful to read an entire record once
532  * we have read any of it, even if the system call has been interrupted.
533  */
534 static int
535 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
536 {
537         struct socket *so;
538         struct uio auio;
539         struct iovec aio;
540         struct mbuf *m;
541         struct mbuf *control;
542         u_int32_t len;
543         struct sockaddr **getnam;
544         int error, sotype, rcvflg;
545         struct thread *td = curthread;  /* XXX */
546
547         /*
548          * Set up arguments for soreceive()
549          */
550         *mp = (struct mbuf *)0;
551         *aname = (struct sockaddr *)0;
552         sotype = rep->r_nmp->nm_sotype;
553
554         /*
555          * For reliable protocols, lock against other senders/receivers
556          * in case a reconnect is necessary.
557          * For SOCK_STREAM, first get the Record Mark to find out how much
558          * more there is to get.
559          * We must lock the socket against other receivers
560          * until we have an entire rpc request/reply.
561          */
562         if (sotype != SOCK_DGRAM) {
563                 error = nfs_sndlock(rep);
564                 if (error)
565                         return (error);
566 tryagain:
567                 /*
568                  * Check for fatal errors and resending request.
569                  */
570                 /*
571                  * Ugh: If a reconnect attempt just happened, nm_so
572                  * would have changed. NULL indicates a failed
573                  * attempt that has essentially shut down this
574                  * mount point.
575                  */
576                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
577                         nfs_sndunlock(rep);
578                         return (EINTR);
579                 }
580                 so = rep->r_nmp->nm_so;
581                 if (!so) {
582                         error = nfs_reconnect(rep);
583                         if (error) {
584                                 nfs_sndunlock(rep);
585                                 return (error);
586                         }
587                         goto tryagain;
588                 }
589                 while (rep->r_flags & R_MUSTRESEND) {
590                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
591                         nfsstats.rpcretries++;
592                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
593                         if (error) {
594                                 if (error == EINTR || error == ERESTART ||
595                                     (error = nfs_reconnect(rep)) != 0) {
596                                         nfs_sndunlock(rep);
597                                         return (error);
598                                 }
599                                 goto tryagain;
600                         }
601                 }
602                 nfs_sndunlock(rep);
603                 if (sotype == SOCK_STREAM) {
604                         aio.iov_base = (caddr_t) &len;
605                         aio.iov_len = sizeof(u_int32_t);
606                         auio.uio_iov = &aio;
607                         auio.uio_iovcnt = 1;
608                         auio.uio_segflg = UIO_SYSSPACE;
609                         auio.uio_rw = UIO_READ;
610                         auio.uio_offset = 0;
611                         auio.uio_resid = sizeof(u_int32_t);
612                         auio.uio_td = td;
613                         do {
614                            rcvflg = MSG_WAITALL;
615                            error = so_pru_soreceive(so, NULL, &auio, NULL,
616                                NULL, &rcvflg);
617                            if (error == EWOULDBLOCK && rep) {
618                                 if (rep->r_flags & R_SOFTTERM)
619                                         return (EINTR);
620                            }
621                         } while (error == EWOULDBLOCK);
622                         if (!error && auio.uio_resid > 0) {
623                             /*
624                              * Don't log a 0 byte receive; it means
625                              * that the socket has been closed, and
626                              * can happen during normal operation
627                              * (forcible unmount or Solaris server).
628                              */
629                             if (auio.uio_resid != sizeof (u_int32_t))
630                             log(LOG_INFO,
631                                  "short receive (%d/%d) from nfs server %s\n",
632                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
633                                  (int)sizeof(u_int32_t),
634                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
635                             error = EPIPE;
636                         }
637                         if (error)
638                                 goto errout;
639                         len = ntohl(len) & ~0x80000000;
640                         /*
641                          * This is SERIOUS! We are out of sync with the sender
642                          * and forcing a disconnect/reconnect is all I can do.
643                          */
644                         if (len > NFS_MAXPACKET) {
645                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
646                                 "impossible packet length",
647                                 len,
648                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
649                             error = EFBIG;
650                             goto errout;
651                         }
652                         auio.uio_resid = len;
653                         do {
654                             rcvflg = MSG_WAITALL;
655                             error =  so_pru_soreceive(so, NULL, &auio, mp,
656                                 NULL, &rcvflg);
657                         } while (error == EWOULDBLOCK || error == EINTR ||
658                                  error == ERESTART);
659                         if (!error && auio.uio_resid > 0) {
660                             if (len != auio.uio_resid)
661                             log(LOG_INFO,
662                                 "short receive (%d/%d) from nfs server %s\n",
663                                 len - auio.uio_resid, len,
664                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
665                             error = EPIPE;
666                         }
667                 } else {
668                         /*
669                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
670                          * and soreceive() will return when it has either a
671                          * control msg or a data msg.
672                          * We have no use for control msg., but must grab them
673                          * and then throw them away so we know what is going
674                          * on.
675                          */
676                         auio.uio_resid = len = 100000000; /* Anything Big */
677                         auio.uio_td = td;
678                         do {
679                             rcvflg = 0;
680                             error =  so_pru_soreceive(so, NULL, &auio, mp,
681                                 &control, &rcvflg);
682                             if (control)
683                                 m_freem(control);
684                             if (error == EWOULDBLOCK && rep) {
685                                 if (rep->r_flags & R_SOFTTERM)
686                                         return (EINTR);
687                             }
688                         } while (error == EWOULDBLOCK ||
689                                  (!error && *mp == NULL && control));
690                         if ((rcvflg & MSG_EOR) == 0)
691                                 printf("Egad!!\n");
692                         if (!error && *mp == NULL)
693                                 error = EPIPE;
694                         len -= auio.uio_resid;
695                 }
696 errout:
697                 if (error && error != EINTR && error != ERESTART) {
698                         m_freem(*mp);
699                         *mp = (struct mbuf *)0;
700                         if (error != EPIPE)
701                                 log(LOG_INFO,
702                                     "receive error %d from nfs server %s\n",
703                                     error,
704                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
705                         error = nfs_sndlock(rep);
706                         if (!error) {
707                                 error = nfs_reconnect(rep);
708                                 if (!error)
709                                         goto tryagain;
710                                 else
711                                         nfs_sndunlock(rep);
712                         }
713                 }
714         } else {
715                 if ((so = rep->r_nmp->nm_so) == NULL)
716                         return (EACCES);
717                 if (so->so_state & SS_ISCONNECTED)
718                         getnam = (struct sockaddr **)0;
719                 else
720                         getnam = aname;
721                 auio.uio_resid = len = 1000000;
722                 auio.uio_td = td;
723                 do {
724                         rcvflg = 0;
725                         error =  so_pru_soreceive(so, getnam, &auio, mp, NULL,
726                             &rcvflg);
727                         if (error == EWOULDBLOCK &&
728                             (rep->r_flags & R_SOFTTERM))
729                                 return (EINTR);
730                 } while (error == EWOULDBLOCK);
731                 len -= auio.uio_resid;
732         }
733         if (error) {
734                 m_freem(*mp);
735                 *mp = (struct mbuf *)0;
736         }
737         /*
738          * Search for any mbufs that are not a multiple of 4 bytes long
739          * or with m_data not longword aligned.
740          * These could cause pointer alignment problems, so copy them to
741          * well aligned mbufs.
742          */
743         nfs_realign(mp, 5 * NFSX_UNSIGNED);
744         return (error);
745 }
746
747 /*
748  * Implement receipt of reply on a socket.
749  * We must search through the list of received datagrams matching them
750  * with outstanding requests using the xid, until ours is found.
     *
     * myrep is the request the caller is waiting on.  Every reply pulled
     * off the socket is attached (r_mrep/r_md/r_dpos) to whichever entry
     * on nfs_reqq carries the same xid and has no reply yet, so this
     * routine may complete other outstanding requests while looking for
     * its own.  For a matched request the mount's congestion window and
     * round trip estimates are updated as well.
     *
     * Returns 0 once myrep has its reply, when nfs_rcvlock() reports
     * EALREADY, or after handling a single message if R_GETONEREP is set
     * in myrep->r_flags; otherwise returns the error from nfs_rcvlock()
     * or nfs_receive().
751  */
752 /* ARGSUSED */
753 int
754 nfs_reply(myrep)
755         struct nfsreq *myrep;
756 {
757         struct nfsreq *rep;
758         struct nfsmount *nmp = myrep->r_nmp;
759         int32_t t1;
760         struct mbuf *mrep, *md;
761         struct sockaddr *nam;
762         u_int32_t rxid, *tl;
763         caddr_t dpos, cp2;
764         int error;
765
766         /*
767          * Loop around until we get our own reply
768          */
769         for (;;) {
770                 /*
771                  * Lock against other receivers so that I don't get stuck in
772                  * sbwait() after someone else has received my reply for me.
773                  * Also necessary for connection based protocols to avoid
774                  * race conditions during a reconnect.
775                  * If nfs_rcvlock() returns EALREADY, that means that
776                  * the reply has already been received by another
777                  * process and we can return immediately.  In this
778                  * case, the lock is not taken to avoid races with
779                  * other processes.
780                  */
781                 error = nfs_rcvlock(myrep);
782                 if (error == EALREADY)
783                         return (0);
784                 if (error)
785                         return (error);
786                 /*
787                  * Get the next Rpc reply off the socket
788                  */
789                 error = nfs_receive(myrep, &nam, &mrep);
790                 nfs_rcvunlock(myrep);
791                 if (error) {
792
793                         /*
794                          * Ignore routing errors on connectionless protocols??
795                          */
796                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
797                                 nmp->nm_so->so_error = 0;
798                                 if (myrep->r_flags & R_GETONEREP)
799                                         return (0);
800                                 continue;
801                         }
802                         return (error);
803                 }
                    /* The sender's address is not used for matching; release it. */
804                 if (nam)
805                         FREE(nam, M_SONAME);
806
807                 /*
808                  * Get the xid and check that it is an rpc reply
809                  */
810                 md = mrep;
811                 dpos = mtod(md, caddr_t);
812                 nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
                    /* rxid is compared as-is against r_xid below; no conversion here. */
813                 rxid = *tl++;
814                 if (*tl != rpc_reply) {
815 #ifndef NFS_NOSERVER
                            /*
                             * Not a reply.  On an nqnfs mount hand the message
                             * to nqnfs_callback(); otherwise count it as invalid
                             * and drop it.
                             */
816                         if (nmp->nm_flag & NFSMNT_NQNFS) {
817                                 if (nqnfs_callback(nmp, mrep, md, dpos))
818                                         nfsstats.rpcinvalid++;
819                         } else {
820                                 nfsstats.rpcinvalid++;
821                                 m_freem(mrep);
822                         }
823 #else
824                         nfsstats.rpcinvalid++;
825                         m_freem(mrep);
826 #endif
    /*
     * NOTE(review): nfsmout is presumably also the bail-out target of the
     * nfsm_dissect() macro used above (macro defined elsewhere) -- confirm.
     */
827 nfsmout:
828                         if (myrep->r_flags & R_GETONEREP)
829                                 return (0);
830                         continue;
831                 }
832
833                 /*
834                  * Loop through the request list to match up the reply
835                  * Iff no match, just drop the datagram
836                  */
837                 for (rep = nfs_reqq.tqh_first; rep != 0;
838                     rep = rep->r_chain.tqe_next) {
                            /* Requests that already hold a reply are skipped. */
839                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
840                                 /* Found it.. */
841                                 rep->r_mrep = mrep;
842                                 rep->r_md = md;
843                                 rep->r_dpos = dpos;
                                    /*
                                     * When nfsrtton is set, record this reply in
                                     * the circular nfsrtt log (NFSRTTLOGSIZ slots).
                                     */
844                                 if (nfsrtton) {
845                                         struct rttl *rt;
846
847                                         rt = &nfsrtt.rttl[nfsrtt.pos];
848                                         rt->proc = rep->r_procnum;
849                                         rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
850                                         rt->sent = nmp->nm_sent;
851                                         rt->cwnd = nmp->nm_cwnd;
852                                         rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
853                                         rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
854                                         rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
855                                         getmicrotime(&rt->tstamp);
                                            /* Untimed requests log a fixed rtt of 1000000. */
856                                         if (rep->r_flags & R_TIMING)
857                                                 rt->rtt = rep->r_rtt;
858                                         else
859                                                 rt->rtt = 1000000;
860                                         nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
861                                 }
862                                 /*
863                                  * Update congestion window.
864                                  * Do the additive increase of
865                                  * one rpc/rtt.
866                                  */
867                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
868                                         nmp->nm_cwnd +=
869                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
870                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
871                                         if (nmp->nm_cwnd > NFS_MAXCWND)
872                                                 nmp->nm_cwnd = NFS_MAXCWND;
873                                 }
                                    /* The request no longer counts as in flight. */
874                                 if (rep->r_flags & R_SENT) {
875                                         rep->r_flags &= ~R_SENT;
876                                         nmp->nm_sent -= NFS_CWNDSCALE;
877                                 }
878                                 /*
879                                  * Update rtt using a gain of 0.125 on the mean
880                                  * and a gain of 0.25 on the deviation.
881                                  */
882                                 if (rep->r_flags & R_TIMING) {
883                                         /*
884                                          * Since the timer resolution of
885                                          * NFS_HZ is so coarse, it can often
886                                          * result in r_rtt == 0. Since
887                                          * r_rtt == N means that the actual
888                                          * rtt is between N+dt and N+2-dt ticks,
889                                          * add 1.
890                                          */
891                                         t1 = rep->r_rtt + 1;
892                                         t1 -= (NFS_SRTT(rep) >> 3);
893                                         NFS_SRTT(rep) += t1;
894                                         if (t1 < 0)
895                                                 t1 = -t1;
896                                         t1 -= (NFS_SDRTT(rep) >> 2);
897                                         NFS_SDRTT(rep) += t1;
898                                 }
899                                 nmp->nm_timeouts = 0;
900                                 break;
901                         }
902                 }
903                 /*
904                  * If not matched to a request, drop it.
905                  * If it's mine, get out.
906                  */
907                 if (rep == 0) {
908                         nfsstats.rpcunexpected++;
909                         m_freem(mrep);
910                 } else if (rep == myrep) {
911                         if (rep->r_mrep == NULL)
912                                 panic("nfsreply nil");
913                         return (0);
914                 }
                    /* Caller asked for at most one message to be processed. */
915                 if (myrep->r_flags & R_GETONEREP)
916                         return (0);
917         }
918 }
919
920 /*
921  * nfs_request - goes something like this
922  *      - fill in request struct
923  *      - links it into list
924  *      - calls nfs_send() for first transmit
925  *      - calls nfs_receive() to get reply
926  *      - break down rpc header and return with nfs reply pointed to
927  *        by mrep or error
928  * nb: always frees up mreq mbuf list
     *
     * Arguments:
     *      vp      - vnode the request is for; its mount selects the nfsmount
     *      mrest   - mbuf chain holding the procedure arguments (consumed)
     *      procnum - NFS procedure number
     *      td      - thread recorded in the request (r_td)
     *      cred    - credentials used to build the RPC authenticator
     *      mrp, mdp, dposp - out: reply mbuf chain, current mbuf and current
     *                data position, set when a reply is handed back
     *
     * Returns 0 on success with *mrp, *mdp and *dposp set.  On failure
     * returns ESTALE (forced unmount in progress), ENOBUFS, an error from
     * nfs_getauth()/nfs_sndlock()/nfs_send()/nfs_reply(), EOPNOTSUPP, EAUTH
     * or EACCES for a denied RPC, EPROTONOSUPPORT for a non-zero accept
     * status, or the server's status.  For NFSv3 mounts a server status is
     * returned or'ed with NFSERR_RETERR and the reply is still passed back
     * through mrp/mdp/dposp.
929  */
930 int
931 nfs_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp)
932         struct vnode *vp;
933         struct mbuf *mrest;
934         int procnum;
935         struct thread *td;
936         struct ucred *cred;
937         struct mbuf **mrp;
938         struct mbuf **mdp;
939         caddr_t *dposp;
940 {
941         struct mbuf *mrep, *m2;
942         struct nfsreq *rep;
943         u_int32_t *tl;
944         int i;
945         struct nfsmount *nmp;
946         struct mbuf *m, *md, *mheadend;
947         struct nfsnode *np;
948         char nickv[RPCX_NICKVERF];
949         time_t reqtime, waituntil;
950         caddr_t dpos, cp2;
951         int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type;
952         int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0;
953         int verf_len, verf_type;
954         u_int32_t xid;
955         u_quad_t frev;
956         char *auth_str, *verf_str;
957         NFSKERBKEY_T key;               /* save session key */
958
959         /* Reject requests while attempting a forced unmount. */
960         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
961                 m_freem(mrest);
962                 return (ESTALE);
963         }
964         nmp = VFSTONFS(vp->v_mount);
965         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
966         rep->r_nmp = nmp;
967         rep->r_vp = vp;
968         rep->r_td = td;
969         rep->r_procnum = procnum;
            /* Add up the byte length of the caller's argument chain. */
970         i = 0;
971         m = mrest;
972         while (m) {
973                 i += m->m_len;
974                 m = m->m_next;
975         }
976         mrest_len = i;
977
978         /*
979          * Get the RPC header with authorization.
980          */
981 kerbauth:
982         verf_str = auth_str = (char *)0;
983         if (nmp->nm_flag & NFSMNT_KERB) {
984                 verf_str = nickv;
985                 verf_len = sizeof (nickv);
986                 auth_type = RPCAUTH_KERB4;
987                 bzero((caddr_t)key, sizeof (key));
                    /*
                     * Use a cached nickname authenticator unless one already
                     * failed (failed_auth) or none is available; in that case
                     * obtain a full authenticator via nfs_getauth().
                     */
988                 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
989                         &auth_len, verf_str, verf_len)) {
990                         error = nfs_getauth(nmp, rep, cred, &auth_str,
991                                 &auth_len, verf_str, &verf_len, key);
992                         if (error) {
993                                 free((caddr_t)rep, M_NFSREQ);
994                                 m_freem(mrest);
995                                 return (error);
996                         }
997                 }
998         } else {
999                 auth_type = RPCAUTH_UNIX;
1000                 if (cred->cr_ngroups < 1)
1001                         panic("nfsreq nogrps");
                     /*
                      * Four bytes for each group beyond the first, capped at
                      * nm_numgrps, plus five fixed words.
                      */
1002                 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1003                         nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1004                         5 * NFSX_UNSIGNED;
1005         }
1006         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1007              auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
1008         if (auth_str)
1009                 free(auth_str, M_TEMP);
1010
1011         /*
1012          * For stream protocols, insert a Sun RPC Record Mark.
              * The mark is the record length (excluding the mark itself) with
              * the high bit set.
1013          */
1014         if (nmp->nm_sotype == SOCK_STREAM) {
1015                 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
1016                 if (m == NULL) {
                             /*
                              * The request structure was allocated above and is
                              * not yet on nfs_reqq; release it here so it is
                              * not leaked.  The mbuf chain cannot be freed at
                              * this point since m is NULL (presumably
                              * M_PREPEND already released it -- confirm).
                              */
                             free((caddr_t)rep, M_NFSREQ);
1017                         return (ENOBUFS);
                     }
1018                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1019                          (m->m_pkthdr.len - NFSX_UNSIGNED));
1020         }
1021         rep->r_mreq = m;
1022         rep->r_xid = xid;
1023 tryagain:
1024         if (nmp->nm_flag & NFSMNT_SOFT)
1025                 rep->r_retry = nmp->nm_retry;
1026         else
1027                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1028         rep->r_rtt = rep->r_rexmit = 0;
1029         if (proct[procnum] > 0)
1030                 rep->r_flags = R_TIMING;
1031         else
1032                 rep->r_flags = 0;
1033         rep->r_mrep = NULL;
1034
1035         /*
1036          * Do the client side RPC.
1037          */
1038         nfsstats.rpcrequests++;
1039         /*
1040          * Chain request into list of outstanding requests. Be sure
1041          * to put it LAST so timer finds oldest requests first.
1042          */
1043         s = splsoftclock();
1044         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1045
1046         /* Get send time for nqnfs */
1047         reqtime = time_second;
1048
1049         /*
1050          * If backing off another request or avoiding congestion, don't
1051          * send this one now but let timer do it. If not timing a request,
1052          * do it now.
1053          */
1054         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1055                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1056                 nmp->nm_sent < nmp->nm_cwnd)) {
1057                 splx(s);
1058                 if (nmp->nm_soflags & PR_CONNREQUIRED)
1059                         error = nfs_sndlock(rep);
1060                 if (!error) {
                             /* Send a copy; the original stays in r_mreq. */
1061                         m2 = m_copym(m, 0, M_COPYALL, M_WAIT);
1062                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1063                         if (nmp->nm_soflags & PR_CONNREQUIRED)
1064                                 nfs_sndunlock(rep);
1065                 }
1066                 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
1067                         nmp->nm_sent += NFS_CWNDSCALE;
1068                         rep->r_flags |= R_SENT;
1069                 }
1070         } else {
1071                 splx(s);
                     /* Not sent now; a negative r_rtt makes nfs_timer() send it. */
1072                 rep->r_rtt = -1;
1073         }
1074
1075         /*
1076          * Wait for the reply from our send or the timer's.
              * An EPIPE from the send does not prevent waiting for a reply.
1077          */
1078         if (!error || error == EPIPE)
1079                 error = nfs_reply(rep);
1080
1081         /*
1082          * RPC done, unlink the request.
1083          */
1084         s = splsoftclock();
1085         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1086         splx(s);
1087
1088         /*
1089          * Decrement the outstanding request count.
1090          */
1091         if (rep->r_flags & R_SENT) {
1092                 rep->r_flags &= ~R_SENT;        /* paranoia */
1093                 nmp->nm_sent -= NFS_CWNDSCALE;
1094         }
1095
1096         /*
1097          * If there was a successful reply and a tprintf msg.
1098          * tprintf a response.
1099          */
1100         if (!error && (rep->r_flags & R_TPRINTFMSG))
1101                 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1102                     "is alive again");
1103         mrep = rep->r_mrep;
1104         md = rep->r_md;
1105         dpos = rep->r_dpos;
1106         if (error) {
1107                 m_freem(rep->r_mreq);
1108                 free((caddr_t)rep, M_NFSREQ);
1109                 return (error);
1110         }
1111
1112         /*
1113          * break down the rpc header and check if ok
              * NOTE(review): nfsm_dissect()/nfsm_adv() are macros defined
              * elsewhere; presumably they jump to nfsmout on a short reply.
1114          */
1115         nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1116         if (*tl++ == rpc_msgdenied) {
1117                 if (*tl == rpc_mismatch)
1118                         error = EOPNOTSUPP;
1119                 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1120                         if (!failed_auth) {
                                     /*
                                      * First authentication failure: retry once
                                      * from kerbauth.  The chain is cut at
                                      * mheadend before r_mreq is freed,
                                      * presumably so that the caller's mrest
                                      * survives for the retry -- confirm.
                                      */
1121                                 failed_auth++;
1122                                 mheadend->m_next = (struct mbuf *)0;
1123                                 m_freem(mrep);
1124                                 m_freem(rep->r_mreq);
1125                                 goto kerbauth;
1126                         } else
1127                                 error = EAUTH;
1128                 } else
1129                         error = EACCES;
1130                 m_freem(mrep);
1131                 m_freem(rep->r_mreq);
1132                 free((caddr_t)rep, M_NFSREQ);
1133                 return (error);
1134         }
1135
1136         /*
1137          * Grab any Kerberos verifier, otherwise just throw it away.
1138          */
1139         verf_type = fxdr_unsigned(int, *tl++);
1140         i = fxdr_unsigned(int32_t, *tl);
1141         if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1142                 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1143                 if (error)
1144                         goto nfsmout;
1145         } else if (i > 0)
1146                 nfsm_adv(nfsm_rndup(i));
1147         nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1148         /* 0 == ok */
1149         if (*tl == 0) {
1150                 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1151                 if (*tl != 0) {
1152                         error = fxdr_unsigned(int, *tl);
                             /*
                              * NFSv3 "try later": sleep for trylater_delay
                              * seconds, grow the delay using nfs_backoff[],
                              * and resubmit the same request.
                              */
1153                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1154                                 error == NFSERR_TRYLATER) {
1155                                 m_freem(mrep);
1156                                 error = 0;
1157                                 waituntil = time_second + trylater_delay;
1158                                 while (time_second < waituntil)
1159                                         (void) tsleep((caddr_t)&lbolt,
1160                                                 0, "nqnfstry", 0);
1161                                 trylater_delay *= nfs_backoff[trylater_cnt];
1162                                 if (trylater_cnt < 7)
1163                                         trylater_cnt++;
1164                                 goto tryagain;
1165                         }
1166
1167                         /*
1168                          * If the File Handle was stale, invalidate the
1169                          * lookup cache, just in case.
1170                          */
1171                         if (error == ESTALE)
1172                                 cache_purge(vp);
                             /*
                              * For NFSv3 the reply is handed back together
                              * with the error, flagged by NFSERR_RETERR;
                              * otherwise the reply is dropped here.
                              */
1173                         if (nmp->nm_flag & NFSMNT_NFSV3) {
1174                                 *mrp = mrep;
1175                                 *mdp = md;
1176                                 *dposp = dpos;
1177                                 error |= NFSERR_RETERR;
1178                         } else
1179                                 m_freem(mrep);
1180                         m_freem(rep->r_mreq);
1181                         free((caddr_t)rep, M_NFSREQ);
1182                         return (error);
1183                 }
1184
1185                 /*
1186                  * For nqnfs, get any lease in reply
1187                  */
1188                 if (nmp->nm_flag & NFSMNT_NQNFS) {
1189                         nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1190                         if (*tl) {
1191                                 np = VTONFS(vp);
1192                                 nqlflag = fxdr_unsigned(int, *tl);
1193                                 nfsm_dissect(tl, u_int32_t *, 4*NFSX_UNSIGNED);
1194                                 cachable = fxdr_unsigned(int, *tl++);
                                     /* reqtime becomes send time plus the value from the reply. */
1195                                 reqtime += fxdr_unsigned(int, *tl++);
1196                                 if (reqtime > time_second) {
1197                                     frev = fxdr_hyper(tl);
1198                                     nqnfs_clientlease(nmp, np, nqlflag,
1199                                         cachable, reqtime, frev);
1200                                 }
1201                         }
1202                 }
1203                 *mrp = mrep;
1204                 *mdp = md;
1205                 *dposp = dpos;
1206                 m_freem(rep->r_mreq);
1207                 FREE((caddr_t)rep, M_NFSREQ);
1208                 return (0);
1209         }
             /* Accept status was non-zero: the server did not run the call. */
1210         m_freem(mrep);
1211         error = EPROTONOSUPPORT;
1212 nfsmout:
1213         m_freem(rep->r_mreq);
1214         free((caddr_t)rep, M_NFSREQ);
1215         return (error);
1216 }
1217
1218 #ifndef NFS_NOSERVER
1219 /*
1220  * Generate the rpc reply header
1221  * siz arg. is used to decide if adding a cluster is worthwhile
      *
      * Arguments:
      *      siz   - expected size of the reply body; RPC_REPLYSIZ is added
      *              before deciding between a cluster and a plain mbuf
      *      nd    - request descriptor (supplies nd_retxid, nd_flag, nd_cr,
      *              nd_nam2, nd_duration)
      *      slp   - server socket, used for the Kerberos nickname lookup
      *      err   - error to encode in the reply; 0 for success
      *      cache, frev - lease values piggybacked on nqnfs replies
      *      mrq   - out, may be NULL: head of the reply mbuf chain
      *      mbp   - out: mbuf pointer (mb) for further building
      *      bposp - out: build position (bpos) for further building
      *
      * Always returns 0.  Bumps nfsstats.srvrpc_errs when err is neither 0
      * nor NFSERR_RETVOID.
      *
      * NOTE(review): the nfsm_build() macro is defined elsewhere; it
      * presumably uses and updates mb, mb2 and bpos -- confirm.
1222  */
1223 int
1224 nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp)
1225         int siz;
1226         struct nfsrv_descript *nd;
1227         struct nfssvc_sock *slp;
1228         int err;
1229         int cache;
1230         u_quad_t *frev;
1231         struct mbuf **mrq;
1232         struct mbuf **mbp;
1233         caddr_t *bposp;
1234 {
1235         u_int32_t *tl;
1236         struct mbuf *mreq;
1237         caddr_t bpos;
1238         struct mbuf *mb, *mb2;
1239
1240         MGETHDR(mreq, M_WAIT, MT_DATA);
1241         mb = mreq;
1242         /*
1243          * If this is a big reply, use a cluster else
1244          * try and leave leading space for the lower level headers.
1245          */
1246         siz += RPC_REPLYSIZ;
1247         if ((max_hdr + siz) >= MINCLSIZE) {
1248                 MCLGET(mreq, M_WAIT);
1249         } else
1250                 mreq->m_data += max_hdr;
             /* Start with six words; the paths below fill or trim them. */
1251         tl = mtod(mreq, u_int32_t *);
1252         mreq->m_len = 6 * NFSX_UNSIGNED;
1253         bpos = ((caddr_t)tl) + mreq->m_len;
1254         *tl++ = txdr_unsigned(nd->nd_retxid);
1255         *tl++ = rpc_reply;
1256         if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1257                 *tl++ = rpc_msgdenied;
1258                 if (err & NFSERR_AUTHERR) {
                             /* Auth error uses five words; give one back. */
1259                         *tl++ = rpc_autherr;
1260                         *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1261                         mreq->m_len -= NFSX_UNSIGNED;
1262                         bpos -= NFSX_UNSIGNED;
1263                 } else {
1264                         *tl++ = rpc_mismatch;
1265                         *tl++ = txdr_unsigned(RPC_VER2);
1266                         *tl = txdr_unsigned(RPC_VER2);
1267                 }
1268         } else {
1269                 *tl++ = rpc_msgaccepted;
1270
1271                 /*
1272                  * For Kerberos authentication, we must send the nickname
1273                  * verifier back, otherwise just RPCAUTH_NULL.
1274                  */
1275                 if (nd->nd_flag & ND_KERBFULL) {
1276                     struct nfsuid *nuidp;
1277                     struct timeval ktvin, ktvout;
1278
                         /* Find the cached uid entry matching uid and, if known, client address. */
1279                     for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1280                         nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1281                         if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1282                             (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1283                              &nuidp->nu_haddr, nd->nd_nam2)))
1284                             break;
1285                     }
1286                     if (nuidp) {
1287                         ktvin.tv_sec =
1288                             txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1289                         ktvin.tv_usec =
1290                             txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1291
1292                         /*
1293                          * Encrypt the timestamp in ecb mode using the
1294                          * session key.
                              *
                              * NOTE(review): no visible code assigns ktvout;
                              * the NFSKERB section below is only an XXX
                              * placeholder, so ktvout is read uninitialized
                              * further down -- confirm.
1295                          */
1296 #ifdef NFSKERB
1297                         XXX
1298 #endif
1299
1300                         *tl++ = rpc_auth_kerb;
1301                         *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1302                         *tl = ktvout.tv_sec;
1303                         nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1304                         *tl++ = ktvout.tv_usec;
1305                         *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1306                     } else {
1307                         *tl++ = 0;
1308                         *tl++ = 0;
1309                     }
1310                 } else {
1311                         *tl++ = 0;
1312                         *tl++ = 0;
1313                 }
                     /* Map err onto the accept status word. */
1314                 switch (err) {
1315                 case EPROGUNAVAIL:
1316                         *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1317                         break;
1318                 case EPROGMISMATCH:
1319                         *tl = txdr_unsigned(RPC_PROGMISMATCH);
                             /* Two more words: 3,3 for nqnfs requests, otherwise 2,3. */
1320                         nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1321                         if (nd->nd_flag & ND_NQNFS) {
1322                                 *tl++ = txdr_unsigned(3);
1323                                 *tl = txdr_unsigned(3);
1324                         } else {
1325                                 *tl++ = txdr_unsigned(2);
1326                                 *tl = txdr_unsigned(3);
1327                         }
1328                         break;
1329                 case EPROCUNAVAIL:
1330                         *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1331                         break;
1332                 case EBADRPC:
1333                         *tl = txdr_unsigned(RPC_GARBAGE);
1334                         break;
1335                 default:
                             /*
                              * Accept status 0, followed by a status word
                              * (mapped through nfsrv_errmap() when err is
                              * non-zero) unless err is NFSERR_RETVOID.
                              */
1336                         *tl = 0;
1337                         if (err != NFSERR_RETVOID) {
1338                                 nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1339                                 if (err)
1340                                     *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1341                                 else
1342                                     *tl = 0;
1343                         }
1344                         break;
1345                 };
1346         }
1347
1348         /*
1349          * For nqnfs, piggyback lease as requested.
1350          */
1351         if ((nd->nd_flag & ND_NQNFS) && err == 0) {
1352                 if (nd->nd_flag & ND_LEASE) {
1353                         nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
1354                         *tl++ = txdr_unsigned(nd->nd_flag & ND_LEASE);
1355                         *tl++ = txdr_unsigned(cache);
1356                         *tl++ = txdr_unsigned(nd->nd_duration);
1357                         txdr_hyper(*frev, tl);
1358                 } else {
1359                         nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1360                         *tl = 0;
1361                 }
1362         }
             /* mrq is optional; mbp and bposp are always written. */
1363         if (mrq != NULL)
1364             *mrq = mreq;
1365         *mbp = mb;
1366         *bposp = bpos;
1367         if (err != 0 && err != NFSERR_RETVOID)
1368                 nfsstats.srvrpc_errs++;
1369         return (0);
1370 }
1371
1372
1373 #endif /* NFS_NOSERVER */
1374 /*
1375  * Nfs timer routine
1376  * Scan the nfsreq list and retransmit any requests that have timed out
1377  * To avoid retransmission attempts on STREAM sockets (in the future) make
1378  * sure to set the r_retry field to 0 (implies nm_retry == 0).
      *
      * The routine walks nfs_reqq at splnet, soft-terminates requests that
      * were interrupted or exceeded their retry count, and resends timed
      * out datagram requests when socket space and the congestion window
      * allow.  Unless NFS_NOSERVER is defined it also calls nqnfs_serverd()
      * when time_second has changed and wakes nfsd for server sockets
      * whose oldest gathered write is due.  It re-arms itself with
      * timeout() before returning.
1379  */
1380 void
1381 nfs_timer(arg)
1382         void *arg;      /* never used */
1383 {
1384         struct nfsreq *rep;
1385         struct mbuf *m;
1386         struct socket *so;
1387         struct nfsmount *nmp;
1388         int timeo;
1389         int s, error;
1390 #ifndef NFS_NOSERVER
1391         static long lasttime = 0;
1392         struct nfssvc_sock *slp;
1393         u_quad_t cur_usec;
1394 #endif /* NFS_NOSERVER */
1395         struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
1396
1397         s = splnet();
1398         for (rep = nfs_reqq.tqh_first; rep != 0; rep = rep->r_chain.tqe_next) {
1399                 nmp = rep->r_nmp;
                     /* Skip requests already answered or already soft-terminated. */
1400                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1401                         continue;
1402                 if (nfs_sigintr(nmp, rep, rep->r_td)) {
1403                         nfs_softterm(rep);
1404                         continue;
1405                 }
                     /*
                      * A negative r_rtt marks a request that still has to be
                      * sent; it bypasses the timeout test and falls through
                      * to the send code below.
                      */
1406                 if (rep->r_rtt >= 0) {
1407                         rep->r_rtt++;
1408                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1409                                 timeo = nmp->nm_timeo;
1410                         else
1411                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
                             /* Scale the timeout by the mount's current backoff step. */
1412                         if (nmp->nm_timeouts > 0)
1413                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1414                         if (rep->r_rtt <= timeo)
1415                                 continue;
1416                         if (nmp->nm_timeouts < 8)
1417                                 nmp->nm_timeouts++;
1418                 }
1419                 /*
1420                  * Check for server not responding
1421                  */
1422                 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
1423                      rep->r_rexmit > nmp->nm_deadthresh) {
1424                         nfs_msg(rep->r_td,
1425                             nmp->nm_mountp->mnt_stat.f_mntfromname,
1426                             "not responding");
1427                         rep->r_flags |= R_TPRINTFMSG;
1428                 }
1429                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1430                         nfsstats.rpctimeouts++;
1431                         nfs_softterm(rep);
1432                         continue;
1433                 }
                     /*
                      * Non-datagram sockets are not resent from here; only
                      * the retransmit count is advanced (clipped at
                      * NFS_MAXREXMIT).
                      */
1434                 if (nmp->nm_sotype != SOCK_DGRAM) {
1435                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1436                                 rep->r_rexmit = NFS_MAXREXMIT;
1437                         continue;
1438                 }
1439                 if ((so = nmp->nm_so) == NULL)
1440                         continue;
1441
1442                 /*
1443                  * If there is enough space and the window allows..
1444                  *      Resend it
1445                  * Set r_rtt to -1 in case we fail to send it now.
1446                  */
1447                 rep->r_rtt = -1;
1448                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1449                    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1450                     (rep->r_flags & R_SENT) ||
1451                     nmp->nm_sent < nmp->nm_cwnd) &&
1452                    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
                             /* With NFSMNT_NOCONN the destination address is passed explicitly. */
1453                         if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1454                             error = so_pru_send(so, 0, m, (struct sockaddr *)0,
1455                                      (struct mbuf *)0, td);
1456                         else
1457                             error = so_pru_send(so, 0, m, nmp->nm_nam,
1458                                 (struct mbuf *)0, td);
1459                         if (error) {
1460                                 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1461                                         so->so_error = 0;
1462                         } else {
1463                                 /*
1464                                  * Iff first send, start timing
1465                                  * else turn timing off, backoff timer
1466                                  * and divide congestion window by 2.
1467                                  */
1468                                 if (rep->r_flags & R_SENT) {
1469                                         rep->r_flags &= ~R_TIMING;
1470                                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1471                                                 rep->r_rexmit = NFS_MAXREXMIT;
1472                                         nmp->nm_cwnd >>= 1;
1473                                         if (nmp->nm_cwnd < NFS_CWNDSCALE)
1474                                                 nmp->nm_cwnd = NFS_CWNDSCALE;
1475                                         nfsstats.rpcretries++;
1476                                 } else {
1477                                         rep->r_flags |= R_SENT;
1478                                         nmp->nm_sent += NFS_CWNDSCALE;
1479                                 }
1480                                 rep->r_rtt = 0;
1481                         }
1482                 }
1483         }
1484 #ifndef NFS_NOSERVER
1485         /*
1486          * Call the nqnfs server timer once a second to handle leases.
1487          */
1488         if (lasttime != time_second) {
1489                 lasttime = time_second;
1490                 nqnfs_serverd();
1491         }
1492
1493         /*
1494          * Scan the write gathering queues for writes that need to be
1495          * completed now.
1496          */
1497         cur_usec = nfs_curusec();
1498         for (slp = nfssvc_sockhead.tqh_first; slp != 0;
1499             slp = slp->ns_chain.tqe_next) {
1500             if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec)
1501                 nfsrv_wakenfsd(slp);
1502         }
1503 #endif /* NFS_NOSERVER */
1504         splx(s);
             /* Re-arm: run again after nfs_ticks. */
1505         nfs_timer_handle = timeout(nfs_timer, (void *)0, nfs_ticks);
1506 }
1507
1508 /*
1509  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1510  * wait for all requests to complete. This is used by forced unmounts
1511  * to terminate any outstanding RPCs.
1512  */
1513 int
1514 nfs_nmcancelreqs(nmp)
1515         struct nfsmount *nmp;
1516 {
1517         struct nfsreq *req;
1518         int i, s;
1519
1520         s = splnet();
1521         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1522                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1523                     (req->r_flags & R_SOFTTERM))
1524                         continue;
1525                 nfs_softterm(req);
1526         }
1527         splx(s);
1528
1529         for (i = 0; i < 30; i++) {
1530                 s = splnet();
1531                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1532                         if (nmp == req->r_nmp)
1533                                 break;
1534                 }
1535                 splx(s);
1536                 if (req == NULL)
1537                         return (0);
1538                 tsleep(&lbolt, 0, "nfscancel", 0);
1539         }
1540         return (EBUSY);
1541 }
1542
1543 /*
1544  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1545  * The nm_send count is decremented now to avoid deadlocks when the process in
1546  * soreceive() hasn't yet managed to send its own request.
1547  */
1548
1549 static void
1550 nfs_softterm(rep)
1551         struct nfsreq *rep;
1552 {
1553         rep->r_flags |= R_SOFTTERM;
1554
1555         if (rep->r_flags & R_SENT) {
1556                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1557                 rep->r_flags &= ~R_SENT;
1558         }
1559 }
1560
1561 /*
1562  * Test for a termination condition pending on the process.
1563  * This is used for NFSMNT_INT mounts.
1564  */
1565 int
1566 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1567 {
1568         sigset_t tmpset;
1569         struct proc *p;
1570
1571         if (rep && (rep->r_flags & R_SOFTTERM))
1572                 return (EINTR);
1573         /* Terminate all requests while attempting a forced unmount. */
1574         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1575                 return (EINTR);
1576         if (!(nmp->nm_flag & NFSMNT_INT))
1577                 return (0);
1578         /* td might be NULL YYY */
1579         if (td == NULL || (p = td->td_proc) == NULL)
1580                 return (0);
1581
1582         tmpset = p->p_siglist;
1583         SIGSETNAND(tmpset, p->p_sigmask);
1584         SIGSETNAND(tmpset, p->p_sigignore);
1585         if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset))
1586                 return (EINTR);
1587
1588         return (0);
1589 }
1590
1591 /*
1592  * Lock a socket against others.
1593  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1594  * and also to avoid race conditions between the processes with nfs requests
1595  * in progress when a reconnect is necessary.
1596  */
1597 int
1598 nfs_sndlock(struct nfsreq *rep)
1599 {
1600         int *statep = &rep->r_nmp->nm_state;
1601         struct thread *td;
1602         int slpflag = 0, slptimeo = 0;
1603
1604         td = rep->r_td;
1605         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1606                 slpflag = PCATCH;
1607         while (*statep & NFSSTA_SNDLOCK) {
1608                 if (nfs_sigintr(rep->r_nmp, rep, td))
1609                         return (EINTR);
1610                 *statep |= NFSSTA_WANTSND;
1611                 (void) tsleep((caddr_t)statep, slpflag,
1612                         "nfsndlck", slptimeo);
1613                 if (slpflag == PCATCH) {
1614                         slpflag = 0;
1615                         slptimeo = 2 * hz;
1616                 }
1617         }
1618         /* Always fail if our request has been cancelled. */
1619         if ((rep->r_flags & R_SOFTTERM))
1620                 return (EINTR);
1621         *statep |= NFSSTA_SNDLOCK;
1622         return (0);
1623 }
1624
1625 /*
1626  * Unlock the stream socket for others.
1627  */
1628 void
1629 nfs_sndunlock(rep)
1630         struct nfsreq *rep;
1631 {
1632         int *statep = &rep->r_nmp->nm_state;
1633
1634         if ((*statep & NFSSTA_SNDLOCK) == 0)
1635                 panic("nfs sndunlock");
1636         *statep &= ~NFSSTA_SNDLOCK;
1637         if (*statep & NFSSTA_WANTSND) {
1638                 *statep &= ~NFSSTA_WANTSND;
1639                 wakeup((caddr_t)statep);
1640         }
1641 }
1642
1643 static int
1644 nfs_rcvlock(rep)
1645         struct nfsreq *rep;
1646 {
1647         int *statep = &rep->r_nmp->nm_state;
1648         int slpflag, slptimeo = 0;
1649
1650         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1651                 slpflag = PCATCH;
1652         else
1653                 slpflag = 0;
1654         while (*statep & NFSSTA_RCVLOCK) {
1655                 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td))
1656                         return (EINTR);
1657                 *statep |= NFSSTA_WANTRCV;
1658                 (void) tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
1659                 /*
1660                  * If our reply was recieved while we were sleeping,
1661                  * then just return without taking the lock to avoid a
1662                  * situation where a single iod could 'capture' the
1663                  * recieve lock.
1664                  */
1665                 if (rep->r_mrep != NULL)
1666                         return (EALREADY);
1667                 if (slpflag == PCATCH) {
1668                         slpflag = 0;
1669                         slptimeo = 2 * hz;
1670                 }
1671         }
1672         *statep |= NFSSTA_RCVLOCK;
1673         return (0);
1674 }
1675
1676 /*
1677  * Unlock the stream socket for others.
1678  */
1679 static void
1680 nfs_rcvunlock(rep)
1681         struct nfsreq *rep;
1682 {
1683         int *statep = &rep->r_nmp->nm_state;
1684
1685         if ((*statep & NFSSTA_RCVLOCK) == 0)
1686                 panic("nfs rcvunlock");
1687         *statep &= ~NFSSTA_RCVLOCK;
1688         if (*statep & NFSSTA_WANTRCV) {
1689                 *statep &= ~NFSSTA_WANTRCV;
1690                 wakeup((caddr_t)statep);
1691         }
1692 }
1693
1694 /*
1695  *      nfs_realign:
1696  *
1697  *      Check for badly aligned mbuf data and realign by copying the unaligned
1698  *      portion of the data into a new mbuf chain and freeing the portions
1699  *      of the old chain that were replaced.
1700  *
1701  *      We cannot simply realign the data within the existing mbuf chain
1702  *      because the underlying buffers may contain other rpc commands and
1703  *      we cannot afford to overwrite them.
1704  *
1705  *      We would prefer to avoid this situation entirely.  The situation does
1706  *      not occur with NFS/UDP and is supposed to only occassionally occur
1707  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
1708  */
1709 static void
1710 nfs_realign(pm, hsiz)
1711         struct mbuf **pm;
1712         int hsiz;
1713 {
1714         struct mbuf *m;
1715         struct mbuf *n = NULL;
1716         int off = 0;
1717
1718         ++nfs_realign_test;
1719
1720         while ((m = *pm) != NULL) {
1721                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1722                         MGET(n, M_WAIT, MT_DATA);
1723                         if (m->m_len >= MINCLSIZE) {
1724                                 MCLGET(n, M_WAIT);
1725                         }
1726                         n->m_len = 0;
1727                         break;
1728                 }
1729                 pm = &m->m_next;
1730         }
1731
1732         /*
1733          * If n is non-NULL, loop on m copying data, then replace the
1734          * portion of the chain that had to be realigned.
1735          */
1736         if (n != NULL) {
1737                 ++nfs_realign_count;
1738                 while (m) {
1739                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1740                         off += m->m_len;
1741                         m = m->m_next;
1742                 }
1743                 m_freem(*pm);
1744                 *pm = n;
1745         }
1746 }
1747
1748 #ifndef NFS_NOSERVER
1749
1750 /*
1751  * Parse an RPC request
1752  * - verify it
1753  * - fill in the cred struct.
1754  */
1755 int
1756 nfs_getreq(nd, nfsd, has_header)
1757         struct nfsrv_descript *nd;
1758         struct nfsd *nfsd;
1759         int has_header;
1760 {
1761         int len, i;
1762         u_int32_t *tl;
1763         int32_t t1;
1764         struct uio uio;
1765         struct iovec iov;
1766         caddr_t dpos, cp2, cp;
1767         u_int32_t nfsvers, auth_type;
1768         uid_t nickuid;
1769         int error = 0, nqnfs = 0, ticklen;
1770         struct mbuf *mrep, *md;
1771         struct nfsuid *nuidp;
1772         struct timeval tvin, tvout;
1773 #if 0                           /* until encrypted keys are implemented */
1774         NFSKERBKEYSCHED_T keys; /* stores key schedule */
1775 #endif
1776
1777         mrep = nd->nd_mrep;
1778         md = nd->nd_md;
1779         dpos = nd->nd_dpos;
1780         if (has_header) {
1781                 nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
1782                 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
1783                 if (*tl++ != rpc_call) {
1784                         m_freem(mrep);
1785                         return (EBADRPC);
1786                 }
1787         } else
1788                 nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
1789         nd->nd_repstat = 0;
1790         nd->nd_flag = 0;
1791         if (*tl++ != rpc_vers) {
1792                 nd->nd_repstat = ERPCMISMATCH;
1793                 nd->nd_procnum = NFSPROC_NOOP;
1794                 return (0);
1795         }
1796         if (*tl != nfs_prog) {
1797                 if (*tl == nqnfs_prog)
1798                         nqnfs++;
1799                 else {
1800                         nd->nd_repstat = EPROGUNAVAIL;
1801                         nd->nd_procnum = NFSPROC_NOOP;
1802                         return (0);
1803                 }
1804         }
1805         tl++;
1806         nfsvers = fxdr_unsigned(u_int32_t, *tl++);
1807         if (((nfsvers < NFS_VER2 || nfsvers > NFS_VER3) && !nqnfs) ||
1808                 (nfsvers != NQNFS_VER3 && nqnfs)) {
1809                 nd->nd_repstat = EPROGMISMATCH;
1810                 nd->nd_procnum = NFSPROC_NOOP;
1811                 return (0);
1812         }
1813         if (nqnfs)
1814                 nd->nd_flag = (ND_NFSV3 | ND_NQNFS);
1815         else if (nfsvers == NFS_VER3)
1816                 nd->nd_flag = ND_NFSV3;
1817         nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
1818         if (nd->nd_procnum == NFSPROC_NULL)
1819                 return (0);
1820         if (nd->nd_procnum >= NFS_NPROCS ||
1821                 (!nqnfs && nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
1822                 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
1823                 nd->nd_repstat = EPROCUNAVAIL;
1824                 nd->nd_procnum = NFSPROC_NOOP;
1825                 return (0);
1826         }
1827         if ((nd->nd_flag & ND_NFSV3) == 0)
1828                 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
1829         auth_type = *tl++;
1830         len = fxdr_unsigned(int, *tl++);
1831         if (len < 0 || len > RPCAUTH_MAXSIZ) {
1832                 m_freem(mrep);
1833                 return (EBADRPC);
1834         }
1835
1836         nd->nd_flag &= ~ND_KERBAUTH;
1837         /*
1838          * Handle auth_unix or auth_kerb.
1839          */
1840         if (auth_type == rpc_auth_unix) {
1841                 len = fxdr_unsigned(int, *++tl);
1842                 if (len < 0 || len > NFS_MAXNAMLEN) {
1843                         m_freem(mrep);
1844                         return (EBADRPC);
1845                 }
1846                 nfsm_adv(nfsm_rndup(len));
1847                 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1848                 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
1849                 nd->nd_cr.cr_ref = 1;
1850                 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
1851                 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
1852                 len = fxdr_unsigned(int, *tl);
1853                 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
1854                         m_freem(mrep);
1855                         return (EBADRPC);
1856                 }
1857                 nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
1858                 for (i = 1; i <= len; i++)
1859                     if (i < NGROUPS)
1860                         nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
1861                     else
1862                         tl++;
1863                 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
1864                 if (nd->nd_cr.cr_ngroups > 1)
1865                     nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
1866                 len = fxdr_unsigned(int, *++tl);
1867                 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1868                         m_freem(mrep);
1869                         return (EBADRPC);
1870                 }
1871                 if (len > 0)
1872                         nfsm_adv(nfsm_rndup(len));
1873         } else if (auth_type == rpc_auth_kerb) {
1874                 switch (fxdr_unsigned(int, *tl++)) {
1875                 case RPCAKN_FULLNAME:
1876                         ticklen = fxdr_unsigned(int, *tl);
1877                         *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
1878                         uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
1879                         nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
1880                         if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
1881                                 m_freem(mrep);
1882                                 return (EBADRPC);
1883                         }
1884                         uio.uio_offset = 0;
1885                         uio.uio_iov = &iov;
1886                         uio.uio_iovcnt = 1;
1887                         uio.uio_segflg = UIO_SYSSPACE;
1888                         iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
1889                         iov.iov_len = RPCAUTH_MAXSIZ - 4;
1890                         nfsm_mtouio(&uio, uio.uio_resid);
1891                         nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1892                         if (*tl++ != rpc_auth_kerb ||
1893                                 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
1894                                 printf("Bad kerb verifier\n");
1895                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1896                                 nd->nd_procnum = NFSPROC_NOOP;
1897                                 return (0);
1898                         }
1899                         nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
1900                         tl = (u_int32_t *)cp;
1901                         if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
1902                                 printf("Not fullname kerb verifier\n");
1903                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1904                                 nd->nd_procnum = NFSPROC_NOOP;
1905                                 return (0);
1906                         }
1907                         cp += NFSX_UNSIGNED;
1908                         bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
1909                         nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
1910                         nd->nd_flag |= ND_KERBFULL;
1911                         nfsd->nfsd_flag |= NFSD_NEEDAUTH;
1912                         break;
1913                 case RPCAKN_NICKNAME:
1914                         if (len != 2 * NFSX_UNSIGNED) {
1915                                 printf("Kerb nickname short\n");
1916                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
1917                                 nd->nd_procnum = NFSPROC_NOOP;
1918                                 return (0);
1919                         }
1920                         nickuid = fxdr_unsigned(uid_t, *tl);
1921                         nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1922                         if (*tl++ != rpc_auth_kerb ||
1923                                 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
1924                                 printf("Kerb nick verifier bad\n");
1925                                 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1926                                 nd->nd_procnum = NFSPROC_NOOP;
1927                                 return (0);
1928                         }
1929                         nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1930                         tvin.tv_sec = *tl++;
1931                         tvin.tv_usec = *tl;
1932
1933                         for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
1934                             nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1935                                 if (nuidp->nu_cr.cr_uid == nickuid &&
1936                                     (!nd->nd_nam2 ||
1937                                      netaddr_match(NU_NETFAM(nuidp),
1938                                       &nuidp->nu_haddr, nd->nd_nam2)))
1939                                         break;
1940                         }
1941                         if (!nuidp) {
1942                                 nd->nd_repstat =
1943                                         (NFSERR_AUTHERR|AUTH_REJECTCRED);
1944                                 nd->nd_procnum = NFSPROC_NOOP;
1945                                 return (0);
1946                         }
1947
1948                         /*
1949                          * Now, decrypt the timestamp using the session key
1950                          * and validate it.
1951                          */
1952 #ifdef NFSKERB
1953                         XXX
1954 #endif
1955
1956                         tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
1957                         tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
1958                         if (nuidp->nu_expire < time_second ||
1959                             nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
1960                             (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
1961                              nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
1962                                 nuidp->nu_expire = 0;
1963                                 nd->nd_repstat =
1964                                     (NFSERR_AUTHERR|AUTH_REJECTVERF);
1965                                 nd->nd_procnum = NFSPROC_NOOP;
1966                                 return (0);
1967                         }
1968                         nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
1969                         nd->nd_flag |= ND_KERBNICK;
1970                 };
1971         } else {
1972                 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
1973                 nd->nd_procnum = NFSPROC_NOOP;
1974                 return (0);
1975         }
1976
1977         /*
1978          * For nqnfs, get piggybacked lease request.
1979          */
1980         if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) {
1981                 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1982                 nd->nd_flag |= fxdr_unsigned(int, *tl);
1983                 if (nd->nd_flag & ND_LEASE) {
1984                         nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1985                         nd->nd_duration = fxdr_unsigned(int32_t, *tl);
1986                 } else
1987                         nd->nd_duration = NQ_MINLEASE;
1988         } else
1989                 nd->nd_duration = NQ_MINLEASE;
1990         nd->nd_md = md;
1991         nd->nd_dpos = dpos;
1992         return (0);
1993 nfsmout:
1994         return (error);
1995 }
1996
1997 #endif
1998
1999 /*
2000  * Send a message to the originating process's terminal.  The thread and/or
2001  * process may be NULL.  YYY the thread should not be NULL but there may
2002  * still be some uio_td's that are still being passed as NULL through to
2003  * nfsm_request().
2004  */
2005 static int
2006 nfs_msg(struct thread *td, char *server, char *msg)
2007 {
2008         tpr_t tpr;
2009
2010         if (td && td->td_proc)
2011                 tpr = tprintf_open(td->td_proc);
2012         else
2013                 tpr = NULL;
2014         tprintf(tpr, "nfs server %s: %s\n", server, msg);
2015         tprintf_close(tpr);
2016         return (0);
2017 }
2018
2019 #ifndef NFS_NOSERVER
2020 /*
2021  * Socket upcall routine for the nfsd sockets.
2022  * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2023  * Essentially do as much as possible non-blocking, else punt and it will
2024  * be called with M_WAIT from an nfsd.
2025  */
2026 void
2027 nfsrv_rcv(so, arg, waitflag)
2028         struct socket *so;
2029         void *arg;
2030         int waitflag;
2031 {
2032         struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2033         struct mbuf *m;
2034         struct mbuf *mp;
2035         struct sockaddr *nam;
2036         struct uio auio;
2037         int flags, error;
2038
2039         if ((slp->ns_flag & SLP_VALID) == 0)
2040                 return;
2041 #ifdef notdef
2042         /*
2043          * Define this to test for nfsds handling this under heavy load.
2044          */
2045         if (waitflag == M_DONTWAIT) {
2046                 slp->ns_flag |= SLP_NEEDQ; goto dorecs;
2047         }
2048 #endif
2049         auio.uio_td = NULL;
2050         if (so->so_type == SOCK_STREAM) {
2051                 /*
2052                  * If there are already records on the queue, defer soreceive()
2053                  * to an nfsd so that there is feedback to the TCP layer that
2054                  * the nfs servers are heavily loaded.
2055                  */
2056                 if (STAILQ_FIRST(&slp->ns_rec) && waitflag == M_DONTWAIT) {
2057                         slp->ns_flag |= SLP_NEEDQ;
2058                         goto dorecs;
2059                 }
2060
2061                 /*
2062                  * Do soreceive().
2063                  */
2064                 auio.uio_resid = 1000000000;
2065                 flags = MSG_DONTWAIT;
2066                 error = so_pru_soreceive(so, &nam, &auio, &mp, NULL, &flags);
2067                 if (error || mp == (struct mbuf *)0) {
2068                         if (error == EWOULDBLOCK)
2069                                 slp->ns_flag |= SLP_NEEDQ;
2070                         else
2071                                 slp->ns_flag |= SLP_DISCONN;
2072                         goto dorecs;
2073                 }
2074                 m = mp;
2075                 if (slp->ns_rawend) {
2076                         slp->ns_rawend->m_next = m;
2077                         slp->ns_cc += 1000000000 - auio.uio_resid;
2078                 } else {
2079                         slp->ns_raw = m;
2080                         slp->ns_cc = 1000000000 - auio.uio_resid;
2081                 }
2082                 while (m->m_next)
2083                         m = m->m_next;
2084                 slp->ns_rawend = m;
2085
2086                 /*
2087                  * Now try and parse record(s) out of the raw stream data.
2088                  */
2089                 error = nfsrv_getstream(slp, waitflag);
2090                 if (error) {
2091                         if (error == EPERM)
2092                                 slp->ns_flag |= SLP_DISCONN;
2093                         else
2094                                 slp->ns_flag |= SLP_NEEDQ;
2095                 }
2096         } else {
2097                 do {
2098                         auio.uio_resid = 1000000000;
2099                         flags = MSG_DONTWAIT;
2100                         error = so_pru_soreceive(so, &nam, &auio, &mp, NULL,
2101                             &flags);
2102                         if (mp) {
2103                                 struct nfsrv_rec *rec;
2104                                 int mf = (waitflag & M_DONTWAIT) ?
2105                                             M_NOWAIT : M_WAITOK;
2106                                 rec = malloc(sizeof(struct nfsrv_rec),
2107                                              M_NFSRVDESC, mf);
2108                                 if (!rec) {
2109                                         if (nam)
2110                                                 FREE(nam, M_SONAME);
2111                                         m_freem(mp);
2112                                         continue;
2113                                 }
2114                                 nfs_realign(&mp, 10 * NFSX_UNSIGNED);
2115                                 rec->nr_address = nam;
2116                                 rec->nr_packet = mp;
2117                                 STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2118                         }
2119                         if (error) {
2120                                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
2121                                         && error != EWOULDBLOCK) {
2122                                         slp->ns_flag |= SLP_DISCONN;
2123                                         goto dorecs;
2124                                 }
2125                         }
2126                 } while (mp);
2127         }
2128
2129         /*
2130          * Now try and process the request records, non-blocking.
2131          */
2132 dorecs:
2133         if (waitflag == M_DONTWAIT &&
2134                 (STAILQ_FIRST(&slp->ns_rec)
2135                  || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
2136                 nfsrv_wakenfsd(slp);
2137 }
2138
2139 /*
2140  * Try and extract an RPC request from the mbuf data list received on a
2141  * stream socket. The "waitflag" argument indicates whether or not it
2142  * can sleep.
2143  */
2144 static int
2145 nfsrv_getstream(slp, waitflag)
2146         struct nfssvc_sock *slp;
2147         int waitflag;
2148 {
2149         struct mbuf *m, **mpp;
2150         char *cp1, *cp2;
2151         int len;
2152         struct mbuf *om, *m2, *recm;
2153         u_int32_t recmark;
2154
2155         if (slp->ns_flag & SLP_GETSTREAM)
2156                 panic("nfs getstream");
2157         slp->ns_flag |= SLP_GETSTREAM;
2158         for (;;) {
2159             if (slp->ns_reclen == 0) {
2160                 if (slp->ns_cc < NFSX_UNSIGNED) {
2161                         slp->ns_flag &= ~SLP_GETSTREAM;
2162                         return (0);
2163                 }
2164                 m = slp->ns_raw;
2165                 if (m->m_len >= NFSX_UNSIGNED) {
2166                         bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
2167                         m->m_data += NFSX_UNSIGNED;
2168                         m->m_len -= NFSX_UNSIGNED;
2169                 } else {
2170                         cp1 = (caddr_t)&recmark;
2171                         cp2 = mtod(m, caddr_t);
2172                         while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2173                                 while (m->m_len == 0) {
2174                                         m = m->m_next;
2175                                         cp2 = mtod(m, caddr_t);
2176                                 }
2177                                 *cp1++ = *cp2++;
2178                                 m->m_data++;
2179                                 m->m_len--;
2180                         }
2181                 }
2182                 slp->ns_cc -= NFSX_UNSIGNED;
2183                 recmark = ntohl(recmark);
2184                 slp->ns_reclen = recmark & ~0x80000000;
2185                 if (recmark & 0x80000000)
2186                         slp->ns_flag |= SLP_LASTFRAG;
2187                 else
2188                         slp->ns_flag &= ~SLP_LASTFRAG;
2189                 if (slp->ns_reclen > NFS_MAXPACKET) {
2190                         slp->ns_flag &= ~SLP_GETSTREAM;
2191                         return (EPERM);
2192                 }
2193             }
2194
2195             /*
2196              * Now get the record part.
2197              *
2198              * Note that slp->ns_reclen may be 0.  Linux sometimes
2199              * generates 0-length RPCs
2200              */
2201             recm = NULL;
2202             if (slp->ns_cc == slp->ns_reclen) {
2203                 recm = slp->ns_raw;
2204                 slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
2205                 slp->ns_cc = slp->ns_reclen = 0;
2206             } else if (slp->ns_cc > slp->ns_reclen) {
2207                 len = 0;
2208                 m = slp->ns_raw;
2209                 om = (struct mbuf *)0;
2210
2211                 while (len < slp->ns_reclen) {
2212                         if ((len + m->m_len) > slp->ns_reclen) {
2213                                 m2 = m_copym(m, 0, slp->ns_reclen - len,
2214                                         waitflag);
2215                                 if (m2) {
2216                                         if (om) {
2217                                                 om->m_next = m2;
2218                                                 recm = slp->ns_raw;
2219                                         } else
2220                                                 recm = m2;
2221                                         m->m_data += slp->ns_reclen - len;
2222                                         m->m_len -= slp->ns_reclen - len;
2223                                         len = slp->ns_reclen;
2224                                 } else {
2225                                         slp->ns_flag &= ~SLP_GETSTREAM;
2226                                         return (EWOULDBLOCK);
2227                                 }
2228                         } else if ((len + m->m_len) == slp->ns_reclen) {
2229                                 om = m;
2230                                 len += m->m_len;
2231                                 m = m->m_next;
2232                                 recm = slp->ns_raw;
2233                                 om->m_next = (struct mbuf *)0;
2234                         } else {
2235                                 om = m;
2236                                 len += m->m_len;
2237                                 m = m->m_next;
2238                         }
2239                 }
2240                 slp->ns_raw = m;
2241                 slp->ns_cc -= len;
2242                 slp->ns_reclen = 0;
2243             } else {
2244                 slp->ns_flag &= ~SLP_GETSTREAM;
2245                 return (0);
2246             }
2247
2248             /*
2249              * Accumulate the fragments into a record.
2250              */
2251             mpp = &slp->ns_frag;
2252             while (*mpp)
2253                 mpp = &((*mpp)->m_next);
2254             *mpp = recm;
2255             if (slp->ns_flag & SLP_LASTFRAG) {
2256                 struct nfsrv_rec *rec;
2257                 int mf = (waitflag & M_DONTWAIT) ? M_NOWAIT : M_WAITOK;
2258                 rec = malloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
2259                 if (!rec) {
2260                     m_freem(slp->ns_frag);
2261                 } else {
2262                     nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
2263                     rec->nr_address = (struct sockaddr *)0;
2264                     rec->nr_packet = slp->ns_frag;
2265                     STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2266                 }
2267                 slp->ns_frag = (struct mbuf *)0;
2268             }
2269         }
2270 }
2271
2272 /*
2273  * Parse an RPC header.
2274  */
2275 int
2276 nfsrv_dorec(slp, nfsd, ndp)
2277         struct nfssvc_sock *slp;
2278         struct nfsd *nfsd;
2279         struct nfsrv_descript **ndp;
2280 {
2281         struct nfsrv_rec *rec;
2282         struct mbuf *m;
2283         struct sockaddr *nam;
2284         struct nfsrv_descript *nd;
2285         int error;
2286
2287         *ndp = NULL;
2288         if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
2289                 return (ENOBUFS);
2290         rec = STAILQ_FIRST(&slp->ns_rec);
2291         STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
2292         nam = rec->nr_address;
2293         m = rec->nr_packet;
2294         free(rec, M_NFSRVDESC);
2295         MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
2296                 M_NFSRVDESC, M_WAITOK);
2297         nd->nd_md = nd->nd_mrep = m;
2298         nd->nd_nam2 = nam;
2299         nd->nd_dpos = mtod(m, caddr_t);
2300         error = nfs_getreq(nd, nfsd, TRUE);
2301         if (error) {
2302                 if (nam) {
2303                         FREE(nam, M_SONAME);
2304                 }
2305                 free((caddr_t)nd, M_NFSRVDESC);
2306                 return (error);
2307         }
2308         *ndp = nd;
2309         nfsd->nfsd_nd = nd;
2310         return (0);
2311 }
2312
2313 /*
2314  * Search for a sleeping nfsd and wake it up.
2315  * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2316  * running nfsds will go look for the work in the nfssvc_sock list.
2317  */
2318 void
2319 nfsrv_wakenfsd(slp)
2320         struct nfssvc_sock *slp;
2321 {
2322         struct nfsd *nd;
2323
2324         if ((slp->ns_flag & SLP_VALID) == 0)
2325                 return;
2326         for (nd = nfsd_head.tqh_first; nd != 0; nd = nd->nfsd_chain.tqe_next) {
2327                 if (nd->nfsd_flag & NFSD_WAITING) {
2328                         nd->nfsd_flag &= ~NFSD_WAITING;
2329                         if (nd->nfsd_slp)
2330                                 panic("nfsd wakeup");
2331                         slp->ns_sref++;
2332                         nd->nfsd_slp = slp;
2333                         wakeup((caddr_t)nd);
2334                         return;
2335                 }
2336         }
2337         slp->ns_flag |= SLP_DOREC;
2338         nfsd_head_flag |= NFSD_CHECKSLP;
2339 }
2340 #endif /* NFS_NOSERVER */