Bring the BFE(4) manual page up-to-date with FreeBSD RELENG_4.
[dragonfly.git] / sys / vfs / nfs / nfs_socket.c
1 /*
2  * Copyright (c) 1989, 1991, 1993, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the University of
19  *      California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
37  * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38  * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.12 2004/03/04 10:29:24 hsu Exp $
39  */
40
41 /*
42  * Socket operations for use by nfs
43  */
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/socketops.h>
57 #include <sys/syslog.h>
58 #include <sys/tprintf.h>
59 #include <sys/sysctl.h>
60 #include <sys/signalvar.h>
61
62 #include <netinet/in.h>
63 #include <netinet/tcp.h>
64
65 #include "rpcv2.h"
66 #include "nfsproto.h"
67 #include "nfs.h"
68 #include "xdr_subs.h"
69 #include "nfsm_subs.h"
70 #include "nfsmount.h"
71 #include "nfsnode.h"
72 #include "nfsrtt.h"
73 #include "nqnfs.h"
74
75 #define TRUE    1
76 #define FALSE   0
77
78 /*
79  * Estimate rto for an nfs rpc sent via an unreliable datagram.
80  * Use the mean and mean deviation of rtt for the appropriate type of rpc
81  * for the frequent rpcs and a default for the others.
82  * The justification for doing "other" this way is that these rpcs
83  * happen so infrequently that timer est. would probably be stale.
84  * Also, since many of these rpcs are
85  * non-idempotent, a conservative timeout is desired.
86  * getattr, lookup - A+2D
87  * read, write     - A+4D
88  * other           - nm_timeo
89  */
/*
 * NFS_RTO(n, t): retransmit timeout for mount "n" and timer class "t".
 *   t == 0       -> n->nm_timeo
 *   t == 1 or 2  -> ((srtt + 3) >> 2  +  sdrtt + 1) >> 1
 *   t >= 3       ->  (srtt + 7) >> 3  +  sdrtt + 1
 * where srtt/sdrtt are n->nm_srtt[t - 1] and n->nm_sdrtt[t - 1].
 * Every use of "t" is parenthesized so that an expression argument
 * (e.g. "x & 3") selects the intended array slot.
 *
 * NFS_SRTT(r) / NFS_SDRTT(r): the nm_srtt / nm_sdrtt slot chosen by
 * proct[r->r_procnum] - 1 on the request's mount.  Both expand to a
 * parenthesized lvalue.  No check for a proct[] value of 0 is made here,
 * which would index slot -1.
 */
#define NFS_RTO(n, t) \
        ((t) == 0 ? (n)->nm_timeo : \
         ((t) < 3 ? \
          (((((n)->nm_srtt[(t) - 1] + 3) >> 2) + \
            (n)->nm_sdrtt[(t) - 1] + 1) >> 1) : \
          ((((n)->nm_srtt[(t) - 1] + 7) >> 3) + \
            (n)->nm_sdrtt[(t) - 1] + 1)))
#define NFS_SRTT(r)     ((r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1])
#define NFS_SDRTT(r)    ((r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1])
97 /*
98  * External data, mostly RPC constants in XDR form
99  */
100 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
101         rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
102         rpc_auth_kerb;
103 extern u_int32_t nfs_prog, nqnfs_prog;
104 extern time_t nqnfsstarttime;
105 extern struct nfsstats nfsstats;
106 extern int nfsv3_procid[NFS_NPROCS];
107 extern int nfs_ticks;
108
109 /*
110  * Defines which timer to use for the procnum.
111  * 0 - default
112  * 1 - getattr
113  * 2 - lookup
114  * 3 - read
115  * 4 - write
116  */
117 static int proct[NFS_NPROCS] = {
118         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
119         0, 0, 0,
120 };
121
/*
 * Integers exported read/write through SYSCTL_INT under _vfs_nfs.
 * nfs_bufpackets (default 4) is the multiplier nfs_connect() applies to
 * its socket buffer reservations; nfs_connect() clamps it to 2..64.
 * NOTE(review): nfs_realign_test and nfs_realign_count are presumably
 * counters maintained by nfs_realign(); that code is not visible here,
 * so confirm before relying on this.
 */
122 static int nfs_realign_test;
123 static int nfs_realign_count;
124 static int nfs_bufpackets = 4;
125
126 SYSCTL_DECL(_vfs_nfs);
127
128 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
129 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
130 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
131
132
133 /*
134  * There is a congestion window for outstanding rpcs maintained per mount
135  * point. The cwnd size is adjusted in roughly the way that:
136  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
137  * SIGCOMM '88". ACM, August 1988.
138  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
139  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
140  * of rpcs is in progress.
141  * (The sent count and cwnd are scaled for integer arith.)
142  * Variants of "slow start" were tried and were found to be too much of a
143  * performance hit (ave. rtt 3 times larger),
144  * I suspect due to the large rtt that nfs rpcs have.
145  */
/*
 * NFS_CWNDSCALE is the integer scale factor for the congestion window;
 * NFS_MAXCWND is 32 scaled units.  nfs_connect() starts each mount at
 * NFS_MAXCWND / 2.
 */
146 #define NFS_CWNDSCALE   256
147 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
/*
 * NOTE(review): presumably retransmit backoff multipliers (powers of two,
 * 2..256); the code that indexes this table is not visible here.
 */
148 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
/*
 * NOTE(review): the users of the three globals below are outside this
 * part of the file; their exact roles should be confirmed there.
 */
149 int nfsrtton = 0;
150 struct nfsrtt nfsrtt;
151 struct callout_handle   nfs_timer_handle;
152
153 static int      nfs_msg (struct thread *,char *,char *);
154 static int      nfs_rcvlock (struct nfsreq *);
155 static void     nfs_rcvunlock (struct nfsreq *);
156 static void     nfs_realign (struct mbuf **pm, int hsiz);
157 static int      nfs_receive (struct nfsreq *rep, struct sockaddr **aname,
158                                  struct mbuf **mp);
159 static void     nfs_softterm (struct nfsreq *rep);
160 static int      nfs_reconnect (struct nfsreq *rep);
161 #ifndef NFS_NOSERVER 
162 static int      nfsrv_getstream (struct nfssvc_sock *,int);
163
/*
 * Table of NFS_NPROCS server handler functions.  Every entry takes
 * (nd, slp, td, mreqp) and returns int.  The initializer lists 26
 * handlers; the last two slots are nfsrv_noop.  Only compiled when
 * NFS_NOSERVER is not defined.
 * NOTE(review): presumably indexed by procedure number by the server
 * dispatch code, which is not visible here.
 */
164 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
165                                     struct nfssvc_sock *slp,
166                                     struct thread *td,
167                                     struct mbuf **mreqp) = {
168         nfsrv_null,
169         nfsrv_getattr,
170         nfsrv_setattr,
171         nfsrv_lookup,
172         nfsrv3_access,
173         nfsrv_readlink,
174         nfsrv_read,
175         nfsrv_write,
176         nfsrv_create,
177         nfsrv_mkdir,
178         nfsrv_symlink,
179         nfsrv_mknod,
180         nfsrv_remove,
181         nfsrv_rmdir,
182         nfsrv_rename,
183         nfsrv_link,
184         nfsrv_readdir,
185         nfsrv_readdirplus,
186         nfsrv_statfs,
187         nfsrv_fsinfo,
188         nfsrv_pathconf,
189         nfsrv_commit,
190         nqnfsrv_getlease,
191         nqnfsrv_vacated,
192         nfsrv_noop,
193         nfsrv_noop
194 };
195 #endif /* NFS_NOSERVER */
196
197 /*
198  * Initialize sockets and congestion for a new NFS connection.
199  * We do not free the sockaddr if error.
200  */
201 int
202 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
203 {
204         struct socket *so;
205         int s, error, rcvreserve, sndreserve;
206         int pktscale;
207         struct sockaddr *saddr;
208         struct sockaddr_in *sin;
209         struct thread *td = &thread0; /* only used for socreate and sobind */
210
211         nmp->nm_so = (struct socket *)0;
212         saddr = nmp->nm_nam;
213         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
214                 nmp->nm_soproto, td);
215         if (error)
216                 goto bad;
217         so = nmp->nm_so;
218         nmp->nm_soflags = so->so_proto->pr_flags;
219
220         /*
221          * Some servers require that the client port be a reserved port number.
222          */
223         if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
224                 struct sockopt sopt;
225                 int ip;
226                 struct sockaddr_in ssin;
227
228                 bzero(&sopt, sizeof sopt);
229                 ip = IP_PORTRANGE_LOW;
230                 sopt.sopt_dir = SOPT_SET;
231                 sopt.sopt_level = IPPROTO_IP;
232                 sopt.sopt_name = IP_PORTRANGE;
233                 sopt.sopt_val = (void *)&ip;
234                 sopt.sopt_valsize = sizeof(ip);
235                 sopt.sopt_td = NULL;
236                 error = sosetopt(so, &sopt);
237                 if (error)
238                         goto bad;
239                 bzero(&ssin, sizeof ssin);
240                 sin = &ssin;
241                 sin->sin_len = sizeof (struct sockaddr_in);
242                 sin->sin_family = AF_INET;
243                 sin->sin_addr.s_addr = INADDR_ANY;
244                 sin->sin_port = htons(0);
245                 error = sobind(so, (struct sockaddr *)sin, td);
246                 if (error)
247                         goto bad;
248                 bzero(&sopt, sizeof sopt);
249                 ip = IP_PORTRANGE_DEFAULT;
250                 sopt.sopt_dir = SOPT_SET;
251                 sopt.sopt_level = IPPROTO_IP;
252                 sopt.sopt_name = IP_PORTRANGE;
253                 sopt.sopt_val = (void *)&ip;
254                 sopt.sopt_valsize = sizeof(ip);
255                 sopt.sopt_td = NULL;
256                 error = sosetopt(so, &sopt);
257                 if (error)
258                         goto bad;
259         }
260
261         /*
262          * Protocols that do not require connections may be optionally left
263          * unconnected for servers that reply from a port other than NFS_PORT.
264          */
265         if (nmp->nm_flag & NFSMNT_NOCONN) {
266                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
267                         error = ENOTCONN;
268                         goto bad;
269                 }
270         } else {
271                 error = soconnect(so, nmp->nm_nam, td);
272                 if (error)
273                         goto bad;
274
275                 /*
276                  * Wait for the connection to complete. Cribbed from the
277                  * connect system call but with the wait timing out so
278                  * that interruptible mounts don't hang here for a long time.
279                  */
280                 s = splnet();
281                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
282                         (void) tsleep((caddr_t)&so->so_timeo, 0,
283                                 "nfscon", 2 * hz);
284                         if ((so->so_state & SS_ISCONNECTING) &&
285                             so->so_error == 0 && rep &&
286                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
287                                 so->so_state &= ~SS_ISCONNECTING;
288                                 splx(s);
289                                 goto bad;
290                         }
291                 }
292                 if (so->so_error) {
293                         error = so->so_error;
294                         so->so_error = 0;
295                         splx(s);
296                         goto bad;
297                 }
298                 splx(s);
299         }
300         so->so_rcv.sb_timeo = (5 * hz);
301         so->so_snd.sb_timeo = (5 * hz);
302
303         /*
304          * Get buffer reservation size from sysctl, but impose reasonable
305          * limits.
306          */
307         pktscale = nfs_bufpackets;
308         if (pktscale < 2)
309                 pktscale = 2;
310         if (pktscale > 64)
311                 pktscale = 64;
312
313         if (nmp->nm_sotype == SOCK_DGRAM) {
314                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
315                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
316                     NFS_MAXPKTHDR) * pktscale;
317         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
318                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
319                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
320                     NFS_MAXPKTHDR) * pktscale;
321         } else {
322                 if (nmp->nm_sotype != SOCK_STREAM)
323                         panic("nfscon sotype");
324                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
325                         struct sockopt sopt;
326                         int val;
327
328                         bzero(&sopt, sizeof sopt);
329                         sopt.sopt_level = SOL_SOCKET;
330                         sopt.sopt_name = SO_KEEPALIVE;
331                         sopt.sopt_val = &val;
332                         sopt.sopt_valsize = sizeof val;
333                         val = 1;
334                         sosetopt(so, &sopt);
335                 }
336                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
337                         struct sockopt sopt;
338                         int val;
339
340                         bzero(&sopt, sizeof sopt);
341                         sopt.sopt_level = IPPROTO_TCP;
342                         sopt.sopt_name = TCP_NODELAY;
343                         sopt.sopt_val = &val;
344                         sopt.sopt_valsize = sizeof val;
345                         val = 1;
346                         sosetopt(so, &sopt);
347                 }
348                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
349                     sizeof (u_int32_t)) * pktscale;
350                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
351                     sizeof (u_int32_t)) * pktscale;
352         }
353         error = soreserve(so, sndreserve, rcvreserve);
354         if (error)
355                 goto bad;
356         so->so_rcv.sb_flags |= SB_NOINTR;
357         so->so_snd.sb_flags |= SB_NOINTR;
358
359         /* Initialize other non-zero congestion variables */
360         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = 
361                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
362         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
363                 nmp->nm_sdrtt[3] = 0;
364         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
365         nmp->nm_sent = 0;
366         nmp->nm_timeouts = 0;
367         return (0);
368
369 bad:
370         nfs_disconnect(nmp);
371         return (error);
372 }
373
374 /*
375  * Reconnect routine:
376  * Called when a connection is broken on a reliable protocol.
377  * - clean up the old socket
378  * - nfs_connect() again
379  * - set R_MUSTRESEND for all outstanding requests on mount point
380  * If this fails the mount point is DEAD!
381  * nb: Must be called with the nfs_sndlock() set on the mount point.
382  */
383 static int
384 nfs_reconnect(rep)
385         struct nfsreq *rep;
386 {
387         struct nfsreq *rp;
388         struct nfsmount *nmp = rep->r_nmp;
389         int error;
390
391         nfs_disconnect(nmp);
392         while ((error = nfs_connect(nmp, rep)) != 0) {
393                 if (error == EINTR || error == ERESTART)
394                         return (EINTR);
395                 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
396         }
397
398         /*
399          * Loop through outstanding request list and fix up all requests
400          * on old socket.
401          */
402         for (rp = nfs_reqq.tqh_first; rp != 0; rp = rp->r_chain.tqe_next) {
403                 if (rp->r_nmp == nmp)
404                         rp->r_flags |= R_MUSTRESEND;
405         }
406         return (0);
407 }
408
409 /*
410  * NFS disconnect. Clean up and unlink.
411  */
412 void
413 nfs_disconnect(nmp)
414         struct nfsmount *nmp;
415 {
416         struct socket *so;
417
418         if (nmp->nm_so) {
419                 so = nmp->nm_so;
420                 nmp->nm_so = (struct socket *)0;
421                 soshutdown(so, 2);
422                 soclose(so);
423         }
424 }
425
426 void
427 nfs_safedisconnect(nmp)
428         struct nfsmount *nmp;
429 {
430         struct nfsreq dummyreq;
431
432         bzero(&dummyreq, sizeof(dummyreq));
433         dummyreq.r_nmp = nmp;
434         dummyreq.r_td = NULL;
435         nfs_rcvlock(&dummyreq);
436         nfs_disconnect(nmp);
437         nfs_rcvunlock(&dummyreq);
438 }
439
440 /*
441  * This is the nfs send routine. For connection based socket types, it
442  * must be called with an nfs_sndlock() on the socket.
443  * "rep == NULL" indicates that it has been called from a server.
444  * For the client side:
445  * - return EINTR if the RPC is terminated, 0 otherwise
446  * - set R_MUSTRESEND if the send fails for any reason
447  * - do any cleanup required by recoverable socket errors (?)
448  * For the server side:
449  * - return EINTR or ERESTART if interrupted by a signal
450  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
451  * - do any cleanup required by recoverable socket errors (?)
452  */
453 int
454 nfs_send(so, nam, top, rep)
455         struct socket *so;
456         struct sockaddr *nam;
457         struct mbuf *top;
458         struct nfsreq *rep;
459 {
460         struct sockaddr *sendnam;
461         int error, soflags, flags;
462
463         if (rep) {
464                 if (rep->r_flags & R_SOFTTERM) {
465                         m_freem(top);
466                         return (EINTR);
467                 }
468                 if ((so = rep->r_nmp->nm_so) == NULL) {
469                         rep->r_flags |= R_MUSTRESEND;
470                         m_freem(top);
471                         return (0);
472                 }
473                 rep->r_flags &= ~R_MUSTRESEND;
474                 soflags = rep->r_nmp->nm_soflags;
475         } else
476                 soflags = so->so_proto->pr_flags;
477         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
478                 sendnam = (struct sockaddr *)0;
479         else
480                 sendnam = nam;
481         if (so->so_type == SOCK_SEQPACKET)
482                 flags = MSG_EOR;
483         else
484                 flags = 0;
485
486         error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
487             curthread /*XXX*/);
488         /*
489          * ENOBUFS for dgram sockets is transient and non fatal.
490          * No need to log, and no need to break a soft mount.
491          */
492         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
493                 error = 0;
494                 if (rep)                /* do backoff retransmit on client */
495                         rep->r_flags |= R_MUSTRESEND;
496         }
497
498         if (error) {
499                 if (rep) {
500                         log(LOG_INFO, "nfs send error %d for server %s\n",error,
501                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
502                         /*
503                          * Deal with errors for the client side.
504                          */
505                         if (rep->r_flags & R_SOFTTERM)
506                                 error = EINTR;
507                         else
508                                 rep->r_flags |= R_MUSTRESEND;
509                 } else
510                         log(LOG_INFO, "nfsd send error %d\n", error);
511
512                 /*
513                  * Handle any recoverable (soft) socket errors here. (?)
514                  */
515                 if (error != EINTR && error != ERESTART &&
516                         error != EWOULDBLOCK && error != EPIPE)
517                         error = 0;
518         }
519         return (error);
520 }
521
522 /*
523  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
524  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
525  * Mark and consolidate the data into a new mbuf list.
526  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
527  *     small mbufs.
528  * For SOCK_STREAM we must be very careful to read an entire record once
529  * we have read any of it, even if the system call has been interrupted.
530  */
/*
 * rep   - request being waited on; rep->r_nmp supplies the socket and
 *         socket type.
 * aname - cleared on entry; handed to the socket layer only on the
 *         SOCK_DGRAM path when the socket is not connected.
 * mp    - cleared on entry; receives the mbuf chain that was read.
 *
 * Returns 0 on success.  EINTR is returned when the request already has
 * a reply (r_mrep) or carries R_SOFTTERM; EACCES when a datagram mount
 * has no socket.  On any error *mp is freed and set to NULL.
 */
531 static int
532 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
533 {
534         struct socket *so;
535         struct uio auio;
536         struct iovec aio;
537         struct mbuf *m;
538         struct mbuf *control;
539         u_int32_t len;
540         struct sockaddr **getnam;
541         int error, sotype, rcvflg;
542         struct thread *td = curthread;  /* XXX */
543
544         /*
545          * Set up arguments for soreceive()
546          */
547         *mp = (struct mbuf *)0;
548         *aname = (struct sockaddr *)0;
549         sotype = rep->r_nmp->nm_sotype;
550
551         /*
552          * For reliable protocols, lock against other senders/receivers
553          * in case a reconnect is necessary.
554          * For SOCK_STREAM, first get the Record Mark to find out how much
555          * more there is to get.
556          * We must lock the socket against other receivers
557          * until we have an entire rpc request/reply.
558          */
559         if (sotype != SOCK_DGRAM) {
560                 error = nfs_sndlock(rep);
561                 if (error)
562                         return (error);
            /* Every "goto tryagain" below arrives with the send lock held. */
563 tryagain:
564                 /*
565                  * Check for fatal errors and resending request.
566                  */
567                 /*
568                  * Ugh: If a reconnect attempt just happened, nm_so
569                  * would have changed. NULL indicates a failed
570                  * attempt that has essentially shut down this
571                  * mount point.
572                  */
573                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
574                         nfs_sndunlock(rep);
575                         return (EINTR);
576                 }
577                 so = rep->r_nmp->nm_so;
578                 if (!so) {
579                         error = nfs_reconnect(rep);
580                         if (error) {
581                                 nfs_sndunlock(rep);
582                                 return (error);
583                         }
584                         goto tryagain;
585                 }
                    /*
                     * Resend a copy of the request while R_MUSTRESEND is
                     * set.  A send error other than EINTR/ERESTART
                     * triggers a reconnect and a restart from tryagain.
                     */
586                 while (rep->r_flags & R_MUSTRESEND) {
587                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
588                         nfsstats.rpcretries++;
589                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
590                         if (error) {
591                                 if (error == EINTR || error == ERESTART ||
592                                     (error = nfs_reconnect(rep)) != 0) {
593                                         nfs_sndunlock(rep);
594                                         return (error);
595                                 }
596                                 goto tryagain;
597                         }
598                 }
                    /* The send lock is dropped before receiving. */
599                 nfs_sndunlock(rep);
600                 if (sotype == SOCK_STREAM) {
                            /* First read: the 4-byte record mark into len. */
601                         aio.iov_base = (caddr_t) &len;
602                         aio.iov_len = sizeof(u_int32_t);
603                         auio.uio_iov = &aio;
604                         auio.uio_iovcnt = 1;
605                         auio.uio_segflg = UIO_SYSSPACE;
606                         auio.uio_rw = UIO_READ;
607                         auio.uio_offset = 0;
608                         auio.uio_resid = sizeof(u_int32_t);
609                         auio.uio_td = td;
610                         do {
611                            rcvflg = MSG_WAITALL;
612                            error = so_pru_soreceive(so, NULL, &auio, NULL,
613                                NULL, &rcvflg);
614                            if (error == EWOULDBLOCK && rep) {
615                                 if (rep->r_flags & R_SOFTTERM)
616                                         return (EINTR);
617                            }
618                         } while (error == EWOULDBLOCK);
619                         if (!error && auio.uio_resid > 0) {
620                             /*
621                              * Don't log a 0 byte receive; it means
622                              * that the socket has been closed, and
623                              * can happen during normal operation
624                              * (forcible unmount or Solaris server).
625                              */
626                             if (auio.uio_resid != sizeof (u_int32_t))
627                             log(LOG_INFO,
628                                  "short receive (%d/%d) from nfs server %s\n",
629                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
630                                  (int)sizeof(u_int32_t),
631                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
632                             error = EPIPE;
633                         }
634                         if (error)
635                                 goto errout;
                            /*
                             * Convert to host order and clear the top bit,
                             * leaving the record length (presumably the top
                             * bit is the record-mark "last fragment" flag).
                             */
636                         len = ntohl(len) & ~0x80000000;
637                         /*
638                          * This is SERIOUS! We are out of sync with the sender
639                          * and forcing a disconnect/reconnect is all I can do.
640                          */
641                         if (len > NFS_MAXPACKET) {
642                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
643                                 "impossible packet length",
644                                 len,
645                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
646                             error = EFBIG;
647                             goto errout;
648                         }
                            /*
                             * Second read: the record body.  Unlike the
                             * first read, this loop also retries on EINTR
                             * and ERESTART and does not check R_SOFTTERM.
                             */
649                         auio.uio_resid = len;
650                         do {
651                             rcvflg = MSG_WAITALL;
652                             error =  so_pru_soreceive(so, NULL, &auio, mp,
653                                 NULL, &rcvflg);
654                         } while (error == EWOULDBLOCK || error == EINTR ||
655                                  error == ERESTART);
656                         if (!error && auio.uio_resid > 0) {
657                             if (len != auio.uio_resid)
658                             log(LOG_INFO,
659                                 "short receive (%d/%d) from nfs server %s\n",
660                                 len - auio.uio_resid, len,
661                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
662                             error = EPIPE;
663                         }
664                 } else {
665                         /*
666                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
667                          * and soreceive() will return when it has either a
668                          * control msg or a data msg.
669                          * We have no use for control msg., but must grab them
670                          * and then throw them away so we know what is going
671                          * on.
672                          */
673                         auio.uio_resid = len = 100000000; /* Anything Big */
674                         auio.uio_td = td;
675                         do {
676                             rcvflg = 0;
677                             error =  so_pru_soreceive(so, NULL, &auio, mp,
678                                 &control, &rcvflg);
679                             if (control)
680                                 m_freem(control);
681                             if (error == EWOULDBLOCK && rep) {
682                                 if (rep->r_flags & R_SOFTTERM)
683                                         return (EINTR);
684                             }
685                         } while (error == EWOULDBLOCK ||
686                                  (!error && *mp == NULL && control));
687                         if ((rcvflg & MSG_EOR) == 0)
688                                 printf("Egad!!\n");
689                         if (!error && *mp == NULL)
690                                 error = EPIPE;
                            /* len becomes the number of bytes received. */
691                         len -= auio.uio_resid;
692                 }
            /*
             * Any error other than EINTR/ERESTART: discard what was read,
             * retake the send lock, reconnect and start over at tryagain.
             */
693 errout:
694                 if (error && error != EINTR && error != ERESTART) {
695                         m_freem(*mp);
696                         *mp = (struct mbuf *)0;
697                         if (error != EPIPE)
698                                 log(LOG_INFO,
699                                     "receive error %d from nfs server %s\n",
700                                     error,
701                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
702                         error = nfs_sndlock(rep);
703                         if (!error) {
704                                 error = nfs_reconnect(rep);
705                                 if (!error)
706                                         goto tryagain;
707                                 else
708                                         nfs_sndunlock(rep);
709                         }
710                 }
711         } else {
                    /* SOCK_DGRAM: no send lock, no reconnect handling. */
712                 if ((so = rep->r_nmp->nm_so) == NULL)
713                         return (EACCES);
714                 if (so->so_state & SS_ISCONNECTED)
715                         getnam = (struct sockaddr **)0;
716                 else
717                         getnam = aname;
718                 auio.uio_resid = len = 1000000;
719                 auio.uio_td = td;
720                 do {
721                         rcvflg = 0;
722                         error =  so_pru_soreceive(so, getnam, &auio, mp, NULL,
723                             &rcvflg);
724                         if (error == EWOULDBLOCK &&
725                             (rep->r_flags & R_SOFTTERM))
726                                 return (EINTR);
727                 } while (error == EWOULDBLOCK);
728                 len -= auio.uio_resid;
729         }
730         if (error) {
731                 m_freem(*mp);
732                 *mp = (struct mbuf *)0;
733         }
734         /*
735          * Search for any mbufs that are not a multiple of 4 bytes long
736          * or with m_data not longword aligned.
737          * These could cause pointer alignment problems, so copy them to
738          * well aligned mbufs.
739          */
            /* Called on the error path too, where *mp has been set to NULL. */
740         nfs_realign(mp, 5 * NFSX_UNSIGNED);
741         return (error);
742 }
743
744 /*
745  * Implement receipt of reply on a socket.
746  * We must search through the list of received datagrams matching them
747  * with outstanding requests using the xid, until ours is found.
748  */
749 /* ARGSUSED */
750 int
751 nfs_reply(myrep)
752         struct nfsreq *myrep;
753 {
754         struct nfsreq *rep;
755         struct nfsmount *nmp = myrep->r_nmp;
756         int32_t t1;
757         struct mbuf *mrep, *md;
758         struct sockaddr *nam;
759         u_int32_t rxid, *tl;
760         caddr_t dpos, cp2;
761         int error;
762
763         /*
764          * Loop around until we get our own reply
765          */
766         for (;;) {
767                 /*
768                  * Lock against other receivers so that I don't get stuck in
769                  * sbwait() after someone else has received my reply for me.
770                  * Also necessary for connection based protocols to avoid
771                  * race conditions during a reconnect.
772                  * If nfs_rcvlock() returns EALREADY, that means that
773                  * the reply has already been recieved by another
774                  * process and we can return immediately.  In this
775                  * case, the lock is not taken to avoid races with
776                  * other processes.
777                  */
778                 error = nfs_rcvlock(myrep);
779                 if (error == EALREADY)
780                         return (0);
781                 if (error)
782                         return (error);
783                 /*
784                  * Get the next Rpc reply off the socket
785                  */
786                 error = nfs_receive(myrep, &nam, &mrep);
787                 nfs_rcvunlock(myrep);
788                 if (error) {
789
790                         /*
791                          * Ignore routing errors on connectionless protocols??
792                          */
793                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
794                                 nmp->nm_so->so_error = 0;
795                                 if (myrep->r_flags & R_GETONEREP)
796                                         return (0);
797                                 continue;
798                         }
799                         return (error);
800                 }
801                 if (nam)
802                         FREE(nam, M_SONAME);
803
804                 /*
805                  * Get the xid and check that it is an rpc reply
806                  */
807                 md = mrep;
808                 dpos = mtod(md, caddr_t);
809                 nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
810                 rxid = *tl++;
811                 if (*tl != rpc_reply) {
812 #ifndef NFS_NOSERVER
813                         if (nmp->nm_flag & NFSMNT_NQNFS) {
814                                 if (nqnfs_callback(nmp, mrep, md, dpos))
815                                         nfsstats.rpcinvalid++;
816                         } else {
817                                 nfsstats.rpcinvalid++;
818                                 m_freem(mrep);
819                         }
820 #else
821                         nfsstats.rpcinvalid++;
822                         m_freem(mrep);
823 #endif
824 nfsmout:
825                         if (myrep->r_flags & R_GETONEREP)
826                                 return (0);
827                         continue;
828                 }
829
830                 /*
831                  * Loop through the request list to match up the reply
832                  * Iff no match, just drop the datagram
833                  */
834                 for (rep = nfs_reqq.tqh_first; rep != 0;
835                     rep = rep->r_chain.tqe_next) {
836                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
837                                 /* Found it.. */
838                                 rep->r_mrep = mrep;
839                                 rep->r_md = md;
840                                 rep->r_dpos = dpos;
841                                 if (nfsrtton) {
842                                         struct rttl *rt;
843
844                                         rt = &nfsrtt.rttl[nfsrtt.pos];
845                                         rt->proc = rep->r_procnum;
846                                         rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
847                                         rt->sent = nmp->nm_sent;
848                                         rt->cwnd = nmp->nm_cwnd;
849                                         rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
850                                         rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
851                                         rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
852                                         getmicrotime(&rt->tstamp);
853                                         if (rep->r_flags & R_TIMING)
854                                                 rt->rtt = rep->r_rtt;
855                                         else
856                                                 rt->rtt = 1000000;
857                                         nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
858                                 }
859                                 /*
860                                  * Update congestion window.
861                                  * Do the additive increase of
862                                  * one rpc/rtt.
863                                  */
864                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
865                                         nmp->nm_cwnd +=
866                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
867                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
868                                         if (nmp->nm_cwnd > NFS_MAXCWND)
869                                                 nmp->nm_cwnd = NFS_MAXCWND;
870                                 }
871                                 if (rep->r_flags & R_SENT) {
872                                         rep->r_flags &= ~R_SENT;
873                                         nmp->nm_sent -= NFS_CWNDSCALE;
874                                 }
875                                 /*
876                                  * Update rtt using a gain of 0.125 on the mean
877                                  * and a gain of 0.25 on the deviation.
878                                  */
879                                 if (rep->r_flags & R_TIMING) {
880                                         /*
881                                          * Since the timer resolution of
882                                          * NFS_HZ is so course, it can often
883                                          * result in r_rtt == 0. Since
884                                          * r_rtt == N means that the actual
885                                          * rtt is between N+dt and N+2-dt ticks,
886                                          * add 1.
887                                          */
888                                         t1 = rep->r_rtt + 1;
889                                         t1 -= (NFS_SRTT(rep) >> 3);
890                                         NFS_SRTT(rep) += t1;
891                                         if (t1 < 0)
892                                                 t1 = -t1;
893                                         t1 -= (NFS_SDRTT(rep) >> 2);
894                                         NFS_SDRTT(rep) += t1;
895                                 }
896                                 nmp->nm_timeouts = 0;
897                                 break;
898                         }
899                 }
900                 /*
901                  * If not matched to a request, drop it.
902                  * If it's mine, get out.
903                  */
904                 if (rep == 0) {
905                         nfsstats.rpcunexpected++;
906                         m_freem(mrep);
907                 } else if (rep == myrep) {
908                         if (rep->r_mrep == NULL)
909                                 panic("nfsreply nil");
910                         return (0);
911                 }
912                 if (myrep->r_flags & R_GETONEREP)
913                         return (0);
914         }
915 }
916
917 /*
918  * nfs_request - goes something like this
919  *      - fill in request struct
920  *      - links it into list
921  *      - calls nfs_send() for first transmit
922  *      - calls nfs_receive() to get reply
923  *      - break down rpc header and return with nfs reply pointed to
924  *        by mrep or error
925  * nb: always frees up mreq mbuf list
926  */
927 int
928 nfs_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp)
929         struct vnode *vp;
930         struct mbuf *mrest;
931         int procnum;
932         struct thread *td;
933         struct ucred *cred;
934         struct mbuf **mrp;
935         struct mbuf **mdp;
936         caddr_t *dposp;
937 {
938         struct mbuf *mrep, *m2;
939         struct nfsreq *rep;
940         u_int32_t *tl;
941         int i;
942         struct nfsmount *nmp;
943         struct mbuf *m, *md, *mheadend;
944         struct nfsnode *np;
945         char nickv[RPCX_NICKVERF];
946         time_t reqtime, waituntil;
947         caddr_t dpos, cp2;
948         int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type;
949         int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0;
950         int verf_len, verf_type;
951         u_int32_t xid;
952         u_quad_t frev;
953         char *auth_str, *verf_str;
954         NFSKERBKEY_T key;               /* save session key */
955
956         /* Reject requests while attempting a forced unmount. */
957         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
958                 m_freem(mrest);
959                 return (ESTALE);
960         }
961         nmp = VFSTONFS(vp->v_mount);
962         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
963         rep->r_nmp = nmp;
964         rep->r_vp = vp;
965         rep->r_td = td;
966         rep->r_procnum = procnum;
967         i = 0;
968         m = mrest;
969         while (m) {
970                 i += m->m_len;
971                 m = m->m_next;
972         }
973         mrest_len = i;
974
975         /*
976          * Get the RPC header with authorization.
977          */
978 kerbauth:
979         verf_str = auth_str = (char *)0;
980         if (nmp->nm_flag & NFSMNT_KERB) {
981                 verf_str = nickv;
982                 verf_len = sizeof (nickv);
983                 auth_type = RPCAUTH_KERB4;
984                 bzero((caddr_t)key, sizeof (key));
985                 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
986                         &auth_len, verf_str, verf_len)) {
987                         error = nfs_getauth(nmp, rep, cred, &auth_str,
988                                 &auth_len, verf_str, &verf_len, key);
989                         if (error) {
990                                 free((caddr_t)rep, M_NFSREQ);
991                                 m_freem(mrest);
992                                 return (error);
993                         }
994                 }
995         } else {
996                 auth_type = RPCAUTH_UNIX;
997                 if (cred->cr_ngroups < 1)
998                         panic("nfsreq nogrps");
999                 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1000                         nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1001                         5 * NFSX_UNSIGNED;
1002         }
1003         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1004              auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
1005         if (auth_str)
1006                 free(auth_str, M_TEMP);
1007
1008         /*
1009          * For stream protocols, insert a Sun RPC Record Mark.
1010          */
1011         if (nmp->nm_sotype == SOCK_STREAM) {
1012                 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
1013                 if (m == NULL)
1014                         return (ENOBUFS);
1015                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1016                          (m->m_pkthdr.len - NFSX_UNSIGNED));
1017         }
1018         rep->r_mreq = m;
1019         rep->r_xid = xid;
1020 tryagain:
1021         if (nmp->nm_flag & NFSMNT_SOFT)
1022                 rep->r_retry = nmp->nm_retry;
1023         else
1024                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1025         rep->r_rtt = rep->r_rexmit = 0;
1026         if (proct[procnum] > 0)
1027                 rep->r_flags = R_TIMING;
1028         else
1029                 rep->r_flags = 0;
1030         rep->r_mrep = NULL;
1031
1032         /*
1033          * Do the client side RPC.
1034          */
1035         nfsstats.rpcrequests++;
1036         /*
1037          * Chain request into list of outstanding requests. Be sure
1038          * to put it LAST so timer finds oldest requests first.
1039          */
1040         s = splsoftclock();
1041         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1042
1043         /* Get send time for nqnfs */
1044         reqtime = time_second;
1045
1046         /*
1047          * If backing off another request or avoiding congestion, don't
1048          * send this one now but let timer do it. If not timing a request,
1049          * do it now.
1050          */
1051         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1052                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1053                 nmp->nm_sent < nmp->nm_cwnd)) {
1054                 splx(s);
1055                 if (nmp->nm_soflags & PR_CONNREQUIRED)
1056                         error = nfs_sndlock(rep);
1057                 if (!error) {
1058                         m2 = m_copym(m, 0, M_COPYALL, M_WAIT);
1059                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1060                         if (nmp->nm_soflags & PR_CONNREQUIRED)
1061                                 nfs_sndunlock(rep);
1062                 }
1063                 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
1064                         nmp->nm_sent += NFS_CWNDSCALE;
1065                         rep->r_flags |= R_SENT;
1066                 }
1067         } else {
1068                 splx(s);
1069                 rep->r_rtt = -1;
1070         }
1071
1072         /*
1073          * Wait for the reply from our send or the timer's.
1074          */
1075         if (!error || error == EPIPE)
1076                 error = nfs_reply(rep);
1077
1078         /*
1079          * RPC done, unlink the request.
1080          */
1081         s = splsoftclock();
1082         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1083         splx(s);
1084
1085         /*
1086          * Decrement the outstanding request count.
1087          */
1088         if (rep->r_flags & R_SENT) {
1089                 rep->r_flags &= ~R_SENT;        /* paranoia */
1090                 nmp->nm_sent -= NFS_CWNDSCALE;
1091         }
1092
1093         /*
1094          * If there was a successful reply and a tprintf msg.
1095          * tprintf a response.
1096          */
1097         if (!error && (rep->r_flags & R_TPRINTFMSG))
1098                 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1099                     "is alive again");
1100         mrep = rep->r_mrep;
1101         md = rep->r_md;
1102         dpos = rep->r_dpos;
1103         if (error) {
1104                 m_freem(rep->r_mreq);
1105                 free((caddr_t)rep, M_NFSREQ);
1106                 return (error);
1107         }
1108
1109         /*
1110          * break down the rpc header and check if ok
1111          */
1112         nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1113         if (*tl++ == rpc_msgdenied) {
1114                 if (*tl == rpc_mismatch)
1115                         error = EOPNOTSUPP;
1116                 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1117                         if (!failed_auth) {
1118                                 failed_auth++;
1119                                 mheadend->m_next = (struct mbuf *)0;
1120                                 m_freem(mrep);
1121                                 m_freem(rep->r_mreq);
1122                                 goto kerbauth;
1123                         } else
1124                                 error = EAUTH;
1125                 } else
1126                         error = EACCES;
1127                 m_freem(mrep);
1128                 m_freem(rep->r_mreq);
1129                 free((caddr_t)rep, M_NFSREQ);
1130                 return (error);
1131         }
1132
1133         /*
1134          * Grab any Kerberos verifier, otherwise just throw it away.
1135          */
1136         verf_type = fxdr_unsigned(int, *tl++);
1137         i = fxdr_unsigned(int32_t, *tl);
1138         if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1139                 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1140                 if (error)
1141                         goto nfsmout;
1142         } else if (i > 0)
1143                 nfsm_adv(nfsm_rndup(i));
1144         nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1145         /* 0 == ok */
1146         if (*tl == 0) {
1147                 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1148                 if (*tl != 0) {
1149                         error = fxdr_unsigned(int, *tl);
1150                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1151                                 error == NFSERR_TRYLATER) {
1152                                 m_freem(mrep);
1153                                 error = 0;
1154                                 waituntil = time_second + trylater_delay;
1155                                 while (time_second < waituntil)
1156                                         (void) tsleep((caddr_t)&lbolt,
1157                                                 0, "nqnfstry", 0);
1158                                 trylater_delay *= nfs_backoff[trylater_cnt];
1159                                 if (trylater_cnt < 7)
1160                                         trylater_cnt++;
1161                                 goto tryagain;
1162                         }
1163
1164                         /*
1165                          * If the File Handle was stale, invalidate the
1166                          * lookup cache, just in case.
1167                          */
1168                         if (error == ESTALE)
1169                                 cache_purge(vp);
1170                         if (nmp->nm_flag & NFSMNT_NFSV3) {
1171                                 *mrp = mrep;
1172                                 *mdp = md;
1173                                 *dposp = dpos;
1174                                 error |= NFSERR_RETERR;
1175                         } else
1176                                 m_freem(mrep);
1177                         m_freem(rep->r_mreq);
1178                         free((caddr_t)rep, M_NFSREQ);
1179                         return (error);
1180                 }
1181
1182                 /*
1183                  * For nqnfs, get any lease in reply
1184                  */
1185                 if (nmp->nm_flag & NFSMNT_NQNFS) {
1186                         nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1187                         if (*tl) {
1188                                 np = VTONFS(vp);
1189                                 nqlflag = fxdr_unsigned(int, *tl);
1190                                 nfsm_dissect(tl, u_int32_t *, 4*NFSX_UNSIGNED);
1191                                 cachable = fxdr_unsigned(int, *tl++);
1192                                 reqtime += fxdr_unsigned(int, *tl++);
1193                                 if (reqtime > time_second) {
1194                                     frev = fxdr_hyper(tl);
1195                                     nqnfs_clientlease(nmp, np, nqlflag,
1196                                         cachable, reqtime, frev);
1197                                 }
1198                         }
1199                 }
1200                 *mrp = mrep;
1201                 *mdp = md;
1202                 *dposp = dpos;
1203                 m_freem(rep->r_mreq);
1204                 FREE((caddr_t)rep, M_NFSREQ);
1205                 return (0);
1206         }
1207         m_freem(mrep);
1208         error = EPROTONOSUPPORT;
1209 nfsmout:
1210         m_freem(rep->r_mreq);
1211         free((caddr_t)rep, M_NFSREQ);
1212         return (error);
1213 }
1214
1215 #ifndef NFS_NOSERVER
1216 /*
1217  * Generate the rpc reply header
1218  * siz arg. is used to decide if adding a cluster is worthwhile
1219  */
1220 int
1221 nfs_rephead(siz, nd, slp, err, cache, frev, mrq, mbp, bposp)
1222         int siz;
1223         struct nfsrv_descript *nd;
1224         struct nfssvc_sock *slp;
1225         int err;
1226         int cache;
1227         u_quad_t *frev;
1228         struct mbuf **mrq;
1229         struct mbuf **mbp;
1230         caddr_t *bposp;
1231 {
1232         u_int32_t *tl;
1233         struct mbuf *mreq;
1234         caddr_t bpos;
1235         struct mbuf *mb, *mb2;
1236
1237         MGETHDR(mreq, M_WAIT, MT_DATA);
1238         mb = mreq;
1239         /*
1240          * If this is a big reply, use a cluster else
1241          * try and leave leading space for the lower level headers.
1242          */
1243         siz += RPC_REPLYSIZ;
1244         if ((max_hdr + siz) >= MINCLSIZE) {
1245                 MCLGET(mreq, M_WAIT);
1246         } else
1247                 mreq->m_data += max_hdr;
1248         tl = mtod(mreq, u_int32_t *);
1249         mreq->m_len = 6 * NFSX_UNSIGNED;
1250         bpos = ((caddr_t)tl) + mreq->m_len;
1251         *tl++ = txdr_unsigned(nd->nd_retxid);
1252         *tl++ = rpc_reply;
1253         if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1254                 *tl++ = rpc_msgdenied;
1255                 if (err & NFSERR_AUTHERR) {
1256                         *tl++ = rpc_autherr;
1257                         *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1258                         mreq->m_len -= NFSX_UNSIGNED;
1259                         bpos -= NFSX_UNSIGNED;
1260                 } else {
1261                         *tl++ = rpc_mismatch;
1262                         *tl++ = txdr_unsigned(RPC_VER2);
1263                         *tl = txdr_unsigned(RPC_VER2);
1264                 }
1265         } else {
1266                 *tl++ = rpc_msgaccepted;
1267
1268                 /*
1269                  * For Kerberos authentication, we must send the nickname
1270                  * verifier back, otherwise just RPCAUTH_NULL.
1271                  */
1272                 if (nd->nd_flag & ND_KERBFULL) {
1273                     struct nfsuid *nuidp;
1274                     struct timeval ktvin, ktvout;
1275
1276                     for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1277                         nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1278                         if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1279                             (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1280                              &nuidp->nu_haddr, nd->nd_nam2)))
1281                             break;
1282                     }
1283                     if (nuidp) {
1284                         ktvin.tv_sec =
1285                             txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1286                         ktvin.tv_usec =
1287                             txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1288
1289                         /*
1290                          * Encrypt the timestamp in ecb mode using the
1291                          * session key.
1292                          */
1293 #ifdef NFSKERB
1294                         XXX
1295 #endif
1296
1297                         *tl++ = rpc_auth_kerb;
1298                         *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1299                         *tl = ktvout.tv_sec;
1300                         nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1301                         *tl++ = ktvout.tv_usec;
1302                         *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1303                     } else {
1304                         *tl++ = 0;
1305                         *tl++ = 0;
1306                     }
1307                 } else {
1308                         *tl++ = 0;
1309                         *tl++ = 0;
1310                 }
1311                 switch (err) {
1312                 case EPROGUNAVAIL:
1313                         *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1314                         break;
1315                 case EPROGMISMATCH:
1316                         *tl = txdr_unsigned(RPC_PROGMISMATCH);
1317                         nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1318                         if (nd->nd_flag & ND_NQNFS) {
1319                                 *tl++ = txdr_unsigned(3);
1320                                 *tl = txdr_unsigned(3);
1321                         } else {
1322                                 *tl++ = txdr_unsigned(2);
1323                                 *tl = txdr_unsigned(3);
1324                         }
1325                         break;
1326                 case EPROCUNAVAIL:
1327                         *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1328                         break;
1329                 case EBADRPC:
1330                         *tl = txdr_unsigned(RPC_GARBAGE);
1331                         break;
1332                 default:
1333                         *tl = 0;
1334                         if (err != NFSERR_RETVOID) {
1335                                 nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1336                                 if (err)
1337                                     *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1338                                 else
1339                                     *tl = 0;
1340                         }
1341                         break;
1342                 };
1343         }
1344
1345         /*
1346          * For nqnfs, piggyback lease as requested.
1347          */
1348         if ((nd->nd_flag & ND_NQNFS) && err == 0) {
1349                 if (nd->nd_flag & ND_LEASE) {
1350                         nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
1351                         *tl++ = txdr_unsigned(nd->nd_flag & ND_LEASE);
1352                         *tl++ = txdr_unsigned(cache);
1353                         *tl++ = txdr_unsigned(nd->nd_duration);
1354                         txdr_hyper(*frev, tl);
1355                 } else {
1356                         nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1357                         *tl = 0;
1358                 }
1359         }
1360         if (mrq != NULL)
1361             *mrq = mreq;
1362         *mbp = mb;
1363         *bposp = bpos;
1364         if (err != 0 && err != NFSERR_RETVOID)
1365                 nfsstats.srvrpc_errs++;
1366         return (0);
1367 }
1368
1369
1370 #endif /* NFS_NOSERVER */
1371 /*
1372  * Nfs timer routine
1373  * Scan the nfsreq list and retranmit any requests that have timed out
1374  * To avoid retransmission attempts on STREAM sockets (in the future) make
1375  * sure to set the r_retry field to 0 (implies nm_retry == 0).
1376  */
1377 void
1378 nfs_timer(arg)
1379         void *arg;      /* never used */
1380 {
1381         struct nfsreq *rep;
1382         struct mbuf *m;
1383         struct socket *so;
1384         struct nfsmount *nmp;
1385         int timeo;
1386         int s, error;
1387 #ifndef NFS_NOSERVER
1388         static long lasttime = 0;
1389         struct nfssvc_sock *slp;
1390         u_quad_t cur_usec;
1391 #endif /* NFS_NOSERVER */
1392         struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
1393
1394         s = splnet();
1395         for (rep = nfs_reqq.tqh_first; rep != 0; rep = rep->r_chain.tqe_next) {
1396                 nmp = rep->r_nmp;
1397                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1398                         continue;
1399                 if (nfs_sigintr(nmp, rep, rep->r_td)) {
1400                         nfs_softterm(rep);
1401                         continue;
1402                 }
1403                 if (rep->r_rtt >= 0) {
1404                         rep->r_rtt++;
1405                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1406                                 timeo = nmp->nm_timeo;
1407                         else
1408                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1409                         if (nmp->nm_timeouts > 0)
1410                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1411                         if (rep->r_rtt <= timeo)
1412                                 continue;
1413                         if (nmp->nm_timeouts < 8)
1414                                 nmp->nm_timeouts++;
1415                 }
1416                 /*
1417                  * Check for server not responding
1418                  */
1419                 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
1420                      rep->r_rexmit > nmp->nm_deadthresh) {
1421                         nfs_msg(rep->r_td,
1422                             nmp->nm_mountp->mnt_stat.f_mntfromname,
1423                             "not responding");
1424                         rep->r_flags |= R_TPRINTFMSG;
1425                 }
1426                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1427                         nfsstats.rpctimeouts++;
1428                         nfs_softterm(rep);
1429                         continue;
1430                 }
1431                 if (nmp->nm_sotype != SOCK_DGRAM) {
1432                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1433                                 rep->r_rexmit = NFS_MAXREXMIT;
1434                         continue;
1435                 }
1436                 if ((so = nmp->nm_so) == NULL)
1437                         continue;
1438
1439                 /*
1440                  * If there is enough space and the window allows..
1441                  *      Resend it
1442                  * Set r_rtt to -1 in case we fail to send it now.
1443                  */
1444                 rep->r_rtt = -1;
1445                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1446                    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1447                     (rep->r_flags & R_SENT) ||
1448                     nmp->nm_sent < nmp->nm_cwnd) &&
1449                    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
1450                         if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1451                             error = so_pru_send(so, 0, m, (struct sockaddr *)0,
1452                                      (struct mbuf *)0, td);
1453                         else
1454                             error = so_pru_send(so, 0, m, nmp->nm_nam,
1455                                 (struct mbuf *)0, td);
1456                         if (error) {
1457                                 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1458                                         so->so_error = 0;
1459                         } else {
1460                                 /*
1461                                  * Iff first send, start timing
1462                                  * else turn timing off, backoff timer
1463                                  * and divide congestion window by 2.
1464                                  */
1465                                 if (rep->r_flags & R_SENT) {
1466                                         rep->r_flags &= ~R_TIMING;
1467                                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1468                                                 rep->r_rexmit = NFS_MAXREXMIT;
1469                                         nmp->nm_cwnd >>= 1;
1470                                         if (nmp->nm_cwnd < NFS_CWNDSCALE)
1471                                                 nmp->nm_cwnd = NFS_CWNDSCALE;
1472                                         nfsstats.rpcretries++;
1473                                 } else {
1474                                         rep->r_flags |= R_SENT;
1475                                         nmp->nm_sent += NFS_CWNDSCALE;
1476                                 }
1477                                 rep->r_rtt = 0;
1478                         }
1479                 }
1480         }
1481 #ifndef NFS_NOSERVER
1482         /*
1483          * Call the nqnfs server timer once a second to handle leases.
1484          */
1485         if (lasttime != time_second) {
1486                 lasttime = time_second;
1487                 nqnfs_serverd();
1488         }
1489
1490         /*
1491          * Scan the write gathering queues for writes that need to be
1492          * completed now.
1493          */
1494         cur_usec = nfs_curusec();
1495         for (slp = nfssvc_sockhead.tqh_first; slp != 0;
1496             slp = slp->ns_chain.tqe_next) {
1497             if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec)
1498                 nfsrv_wakenfsd(slp);
1499         }
1500 #endif /* NFS_NOSERVER */
1501         splx(s);
1502         nfs_timer_handle = timeout(nfs_timer, (void *)0, nfs_ticks);
1503 }
1504
1505 /*
1506  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1507  * wait for all requests to complete. This is used by forced unmounts
1508  * to terminate any outstanding RPCs.
1509  */
1510 int
1511 nfs_nmcancelreqs(nmp)
1512         struct nfsmount *nmp;
1513 {
1514         struct nfsreq *req;
1515         int i, s;
1516
1517         s = splnet();
1518         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1519                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1520                     (req->r_flags & R_SOFTTERM))
1521                         continue;
1522                 nfs_softterm(req);
1523         }
1524         splx(s);
1525
1526         for (i = 0; i < 30; i++) {
1527                 s = splnet();
1528                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1529                         if (nmp == req->r_nmp)
1530                                 break;
1531                 }
1532                 splx(s);
1533                 if (req == NULL)
1534                         return (0);
1535                 tsleep(&lbolt, 0, "nfscancel", 0);
1536         }
1537         return (EBUSY);
1538 }
1539
1540 /*
1541  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1542  * The nm_send count is decremented now to avoid deadlocks when the process in
1543  * soreceive() hasn't yet managed to send its own request.
1544  */
1545
1546 static void
1547 nfs_softterm(rep)
1548         struct nfsreq *rep;
1549 {
1550         rep->r_flags |= R_SOFTTERM;
1551
1552         if (rep->r_flags & R_SENT) {
1553                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1554                 rep->r_flags &= ~R_SENT;
1555         }
1556 }
1557
1558 /*
1559  * Test for a termination condition pending on the process.
1560  * This is used for NFSMNT_INT mounts.
1561  */
1562 int
1563 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1564 {
1565         sigset_t tmpset;
1566         struct proc *p;
1567
1568         if (rep && (rep->r_flags & R_SOFTTERM))
1569                 return (EINTR);
1570         /* Terminate all requests while attempting a forced unmount. */
1571         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1572                 return (EINTR);
1573         if (!(nmp->nm_flag & NFSMNT_INT))
1574                 return (0);
1575         /* td might be NULL YYY */
1576         if (td == NULL || (p = td->td_proc) == NULL)
1577                 return (0);
1578
1579         tmpset = p->p_siglist;
1580         SIGSETNAND(tmpset, p->p_sigmask);
1581         SIGSETNAND(tmpset, p->p_sigignore);
1582         if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset))
1583                 return (EINTR);
1584
1585         return (0);
1586 }
1587
1588 /*
1589  * Lock a socket against others.
1590  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1591  * and also to avoid race conditions between the processes with nfs requests
1592  * in progress when a reconnect is necessary.
1593  */
1594 int
1595 nfs_sndlock(struct nfsreq *rep)
1596 {
1597         int *statep = &rep->r_nmp->nm_state;
1598         struct thread *td;
1599         int slpflag = 0, slptimeo = 0;
1600
1601         td = rep->r_td;
1602         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1603                 slpflag = PCATCH;
1604         while (*statep & NFSSTA_SNDLOCK) {
1605                 if (nfs_sigintr(rep->r_nmp, rep, td))
1606                         return (EINTR);
1607                 *statep |= NFSSTA_WANTSND;
1608                 (void) tsleep((caddr_t)statep, slpflag,
1609                         "nfsndlck", slptimeo);
1610                 if (slpflag == PCATCH) {
1611                         slpflag = 0;
1612                         slptimeo = 2 * hz;
1613                 }
1614         }
1615         /* Always fail if our request has been cancelled. */
1616         if ((rep->r_flags & R_SOFTTERM))
1617                 return (EINTR);
1618         *statep |= NFSSTA_SNDLOCK;
1619         return (0);
1620 }
1621
1622 /*
1623  * Unlock the stream socket for others.
1624  */
1625 void
1626 nfs_sndunlock(rep)
1627         struct nfsreq *rep;
1628 {
1629         int *statep = &rep->r_nmp->nm_state;
1630
1631         if ((*statep & NFSSTA_SNDLOCK) == 0)
1632                 panic("nfs sndunlock");
1633         *statep &= ~NFSSTA_SNDLOCK;
1634         if (*statep & NFSSTA_WANTSND) {
1635                 *statep &= ~NFSSTA_WANTSND;
1636                 wakeup((caddr_t)statep);
1637         }
1638 }
1639
1640 static int
1641 nfs_rcvlock(rep)
1642         struct nfsreq *rep;
1643 {
1644         int *statep = &rep->r_nmp->nm_state;
1645         int slpflag, slptimeo = 0;
1646
1647         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1648                 slpflag = PCATCH;
1649         else
1650                 slpflag = 0;
1651         while (*statep & NFSSTA_RCVLOCK) {
1652                 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td))
1653                         return (EINTR);
1654                 *statep |= NFSSTA_WANTRCV;
1655                 (void) tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
1656                 /*
1657                  * If our reply was recieved while we were sleeping,
1658                  * then just return without taking the lock to avoid a
1659                  * situation where a single iod could 'capture' the
1660                  * recieve lock.
1661                  */
1662                 if (rep->r_mrep != NULL)
1663                         return (EALREADY);
1664                 if (slpflag == PCATCH) {
1665                         slpflag = 0;
1666                         slptimeo = 2 * hz;
1667                 }
1668         }
1669         *statep |= NFSSTA_RCVLOCK;
1670         return (0);
1671 }
1672
1673 /*
1674  * Unlock the stream socket for others.
1675  */
1676 static void
1677 nfs_rcvunlock(rep)
1678         struct nfsreq *rep;
1679 {
1680         int *statep = &rep->r_nmp->nm_state;
1681
1682         if ((*statep & NFSSTA_RCVLOCK) == 0)
1683                 panic("nfs rcvunlock");
1684         *statep &= ~NFSSTA_RCVLOCK;
1685         if (*statep & NFSSTA_WANTRCV) {
1686                 *statep &= ~NFSSTA_WANTRCV;
1687                 wakeup((caddr_t)statep);
1688         }
1689 }
1690
1691 /*
1692  *      nfs_realign:
1693  *
1694  *      Check for badly aligned mbuf data and realign by copying the unaligned
1695  *      portion of the data into a new mbuf chain and freeing the portions
1696  *      of the old chain that were replaced.
1697  *
1698  *      We cannot simply realign the data within the existing mbuf chain
1699  *      because the underlying buffers may contain other rpc commands and
1700  *      we cannot afford to overwrite them.
1701  *
1702  *      We would prefer to avoid this situation entirely.  The situation does
1703  *      not occur with NFS/UDP and is supposed to only occassionally occur
1704  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
1705  */
1706 static void
1707 nfs_realign(pm, hsiz)
1708         struct mbuf **pm;
1709         int hsiz;
1710 {
1711         struct mbuf *m;
1712         struct mbuf *n = NULL;
1713         int off = 0;
1714
1715         ++nfs_realign_test;
1716
1717         while ((m = *pm) != NULL) {
1718                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1719                         MGET(n, M_WAIT, MT_DATA);
1720                         if (m->m_len >= MINCLSIZE) {
1721                                 MCLGET(n, M_WAIT);
1722                         }
1723                         n->m_len = 0;
1724                         break;
1725                 }
1726                 pm = &m->m_next;
1727         }
1728
1729         /*
1730          * If n is non-NULL, loop on m copying data, then replace the
1731          * portion of the chain that had to be realigned.
1732          */
1733         if (n != NULL) {
1734                 ++nfs_realign_count;
1735                 while (m) {
1736                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1737                         off += m->m_len;
1738                         m = m->m_next;
1739                 }
1740                 m_freem(*pm);
1741                 *pm = n;
1742         }
1743 }
1744
1745 #ifndef NFS_NOSERVER
1746
/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 *
 * nd supplies the request chain and parse position (nd_mrep, nd_md,
 * nd_dpos) and receives the results (nd_retxid, nd_flag, nd_procnum,
 * nd_repstat, nd_cr, nd_duration, updated nd_md/nd_dpos).  nfsd receives
 * the Kerberos authenticator and verifier strings for RPCAKN_FULLNAME
 * credentials.  has_header is non-zero when two extra words (the xid and
 * the call/reply word) still precede the RPC version in the chain.
 *
 * Returns 0 when parsing ran to completion; that includes requests that
 * were rejected by setting nd_repstat and forcing nd_procnum to
 * NFSPROC_NOOP.  Returns EBADRPC, after freeing mrep, when a length field
 * is out of range or the message is not a call.  Returns the value of
 * 'error' when control reaches the nfsmout label (presumably jumped to
 * by the nfsm_* macros on a short chain -- confirm against their
 * definitions).
 */
int
nfs_getreq(nd, nfsd, has_header)
        struct nfsrv_descript *nd;
        struct nfsd *nfsd;
        int has_header;
{
        int len, i;
        u_int32_t *tl;
        int32_t t1;
        struct uio uio;
        struct iovec iov;
        caddr_t dpos, cp2, cp;
        u_int32_t nfsvers, auth_type;
        uid_t nickuid;
        int error = 0, nqnfs = 0, ticklen;
        struct mbuf *mrep, *md;
        struct nfsuid *nuidp;
        struct timeval tvin, tvout;
#if 0                           /* until encrypted keys are implemented */
        NFSKERBKEYSCHED_T keys; /* stores key schedule */
#endif

        mrep = nd->nd_mrep;
        md = nd->nd_md;
        dpos = nd->nd_dpos;
        /*
         * Pull the fixed part of the call header: 10 words when the xid
         * and call/reply word are present, 8 otherwise.
         */
        if (has_header) {
                nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
                nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
                if (*tl++ != rpc_call) {
                        m_freem(mrep);
                        return (EBADRPC);
                }
        } else
                nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
        nd->nd_repstat = 0;
        nd->nd_flag = 0;
        if (*tl++ != rpc_vers) {
                nd->nd_repstat = ERPCMISMATCH;
                nd->nd_procnum = NFSPROC_NOOP;
                return (0);
        }
        /* Program number: plain NFS or NQNFS, anything else is rejected. */
        if (*tl != nfs_prog) {
                if (*tl == nqnfs_prog)
                        nqnfs++;
                else {
                        nd->nd_repstat = EPROGUNAVAIL;
                        nd->nd_procnum = NFSPROC_NOOP;
                        return (0);
                }
        }
        tl++;
        nfsvers = fxdr_unsigned(u_int32_t, *tl++);
        if (((nfsvers < NFS_VER2 || nfsvers > NFS_VER3) && !nqnfs) ||
                (nfsvers != NQNFS_VER3 && nqnfs)) {
                nd->nd_repstat = EPROGMISMATCH;
                nd->nd_procnum = NFSPROC_NOOP;
                return (0);
        }
        /* nd_flag stays 0 for NFSv2; the procedure check below relies on that. */
        if (nqnfs)
                nd->nd_flag = (ND_NFSV3 | ND_NQNFS);
        else if (nfsvers == NFS_VER3)
                nd->nd_flag = ND_NFSV3;
        nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
        /* The NULL procedure returns before any credential parsing. */
        if (nd->nd_procnum == NFSPROC_NULL)
                return (0);
        if (nd->nd_procnum >= NFS_NPROCS ||
                (!nqnfs && nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
                (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
                nd->nd_repstat = EPROCUNAVAIL;
                nd->nd_procnum = NFSPROC_NOOP;
                return (0);
        }
        /* Non-V3 procedure numbers are translated through nfsv3_procid[]. */
        if ((nd->nd_flag & ND_NFSV3) == 0)
                nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
        auth_type = *tl++;
        len = fxdr_unsigned(int, *tl++);
        if (len < 0 || len > RPCAUTH_MAXSIZ) {
                m_freem(mrep);
                return (EBADRPC);
        }

        nd->nd_flag &= ~ND_KERBAUTH;
        /*
         * Handle auth_unix or auth_kerb.
         */
        if (auth_type == rpc_auth_unix) {
                /*
                 * The pre-increment skips one word (presumably the
                 * AUTH_UNIX stamp -- confirm) and reads the next one as
                 * a length, bounded by NFS_MAXNAMLEN.
                 */
                len = fxdr_unsigned(int, *++tl);
                if (len < 0 || len > NFS_MAXNAMLEN) {
                        m_freem(mrep);
                        return (EBADRPC);
                }
                nfsm_adv(nfsm_rndup(len));
                /* Three words: uid, gid, and the group count. */
                nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
                bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
                nd->nd_cr.cr_ref = 1;
                nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
                nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
                len = fxdr_unsigned(int, *tl);
                if (len < 0 || len > RPCAUTH_UNIXGIDS) {
                        m_freem(mrep);
                        return (EBADRPC);
                }
                /* The group list plus two further words (used below). */
                nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
                /*
                 * Groups are stored starting at cr_groups[1]; entries
                 * that would land at index NGROUPS or beyond are stepped
                 * over and dropped.
                 */
                for (i = 1; i <= len; i++)
                    if (i < NGROUPS)
                        nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
                    else
                        tl++;
                nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
                if (nd->nd_cr.cr_ngroups > 1)
                    nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
                /*
                 * Skip one more word and read a length (presumably the
                 * verifier flavor and verifier length -- confirm); the
                 * body of that length is skipped, not examined.
                 */
                len = fxdr_unsigned(int, *++tl);
                if (len < 0 || len > RPCAUTH_MAXSIZ) {
                        m_freem(mrep);
                        return (EBADRPC);
                }
                if (len > 0)
                        nfsm_adv(nfsm_rndup(len));
        } else if (auth_type == rpc_auth_kerb) {
                /*
                 * NOTE(review): this switch has no default case.  A value
                 * other than RPCAKN_FULLNAME or RPCAKN_NICKNAME falls out
                 * of the switch without nd_cr or any ND_KERB* flag being
                 * set by this block -- confirm that callers cope.
                 */
                switch (fxdr_unsigned(int, *tl++)) {
                case RPCAKN_FULLNAME:
                        /*
                         * Copy the ticket length word plus the ticket
                         * into nfsd_authstr; the length word goes in the
                         * first 4 bytes and the uio fills from offset 4.
                         */
                        ticklen = fxdr_unsigned(int, *tl);
                        *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
                        uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
                        nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
                        if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
                                m_freem(mrep);
                                return (EBADRPC);
                        }
                        uio.uio_offset = 0;
                        uio.uio_iov = &iov;
                        uio.uio_iovcnt = 1;
                        uio.uio_segflg = UIO_SYSSPACE;
                        iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
                        iov.iov_len = RPCAUTH_MAXSIZ - 4;
                        nfsm_mtouio(&uio, uio.uio_resid);
                        /* Verifier: must be kerb flavor, exactly 4 words long. */
                        nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
                        if (*tl++ != rpc_auth_kerb ||
                                fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
                                printf("Bad kerb verifier\n");
                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
                        nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
                        tl = (u_int32_t *)cp;
                        if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
                                printf("Not fullname kerb verifier\n");
                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
                        /* Keep the 3 words after the kind word as the verifier. */
                        cp += NFSX_UNSIGNED;
                        bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
                        nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
                        nd->nd_flag |= ND_KERBFULL;
                        nfsd->nfsd_flag |= NFSD_NEEDAUTH;
                        break;
                case RPCAKN_NICKNAME:
                        if (len != 2 * NFSX_UNSIGNED) {
                                printf("Kerb nickname short\n");
                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
                        nickuid = fxdr_unsigned(uid_t, *tl);
                        /* Verifier: must be kerb flavor, exactly 3 words long. */
                        nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
                        if (*tl++ != rpc_auth_kerb ||
                                fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
                                printf("Kerb nick verifier bad\n");
                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
                        nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
                        tvin.tv_sec = *tl++;
                        tvin.tv_usec = *tl;

                        /*
                         * Look up the cached credential for this nickname
                         * uid; when nd_nam2 is set the cached address
                         * must match it as well.
                         */
                        for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
                            nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
                                if (nuidp->nu_cr.cr_uid == nickuid &&
                                    (!nd->nd_nam2 ||
                                     netaddr_match(NU_NETFAM(nuidp),
                                      &nuidp->nu_haddr, nd->nd_nam2)))
                                        break;
                        }
                        if (!nuidp) {
                                nd->nd_repstat =
                                        (NFSERR_AUTHERR|AUTH_REJECTCRED);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }

                        /*
                         * Now, decrypt the timestamp using the session key
                         * and validate it.
                         */
#ifdef NFSKERB
                        XXX
#endif

                        /*
                         * NOTE(review): tvout is read below, but nothing
                         * visible in this function assigns it (the
                         * decryption step above is only an XXX
                         * placeholder), and tvin is loaded above but not
                         * used afterwards.
                         */
                        tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
                        tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
                        /*
                         * Reject when the cached entry has expired or its
                         * stored timestamp is later than tvout; the entry
                         * is expired (nu_expire = 0) in that case.
                         */
                        if (nuidp->nu_expire < time_second ||
                            nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
                            (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
                             nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
                                nuidp->nu_expire = 0;
                                nd->nd_repstat =
                                    (NFSERR_AUTHERR|AUTH_REJECTVERF);
                                nd->nd_procnum = NFSPROC_NOOP;
                                return (0);
                        }
                        nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
                        nd->nd_flag |= ND_KERBNICK;
                };
        } else {
                /* Any other authentication flavor is rejected. */
                nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
                nd->nd_procnum = NFSPROC_NOOP;
                return (0);
        }

        /*
         * For nqnfs, get piggybacked lease request.
         * The flag word is OR'ed into nd_flag; a duration word follows
         * only when ND_LEASE is set.  Otherwise NQ_MINLEASE is used.
         */
        if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) {
                nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
                nd->nd_flag |= fxdr_unsigned(int, *tl);
                if (nd->nd_flag & ND_LEASE) {
                        nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
                        nd->nd_duration = fxdr_unsigned(int32_t, *tl);
                } else
                        nd->nd_duration = NQ_MINLEASE;
        } else
                nd->nd_duration = NQ_MINLEASE;
        /* Hand the advanced parse position back to the caller. */
        nd->nd_md = md;
        nd->nd_dpos = dpos;
        return (0);
nfsmout:
        return (error);
}
1993
1994 #endif
1995
1996 /*
1997  * Send a message to the originating process's terminal.  The thread and/or
1998  * process may be NULL.  YYY the thread should not be NULL but there may
1999  * still be some uio_td's that are still being passed as NULL through to
2000  * nfsm_request().
2001  */
2002 static int
2003 nfs_msg(struct thread *td, char *server, char *msg)
2004 {
2005         tpr_t tpr;
2006
2007         if (td && td->td_proc)
2008                 tpr = tprintf_open(td->td_proc);
2009         else
2010                 tpr = NULL;
2011         tprintf(tpr, "nfs server %s: %s\n", server, msg);
2012         tprintf_close(tpr);
2013         return (0);
2014 }
2015
2016 #ifndef NFS_NOSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 *
 * so is the socket with data pending, arg the nfssvc_sock it belongs to,
 * and waitflag is M_DONTWAIT when the routine must not sleep.  Received
 * requests are appended to slp->ns_rec; SLP_NEEDQ or SLP_DISCONN is set
 * in slp->ns_flag when work has to be deferred or the receive failed.
 * Nothing is done at all unless SLP_VALID is set.
 */
void
nfsrv_rcv(so, arg, waitflag)
        struct socket *so;
        void *arg;
        int waitflag;
{
        struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
        struct mbuf *m;
        struct mbuf *mp;
        struct sockaddr *nam;
        struct uio auio;
        int flags, error;

        if ((slp->ns_flag & SLP_VALID) == 0)
                return;
#ifdef notdef
        /*
         * Define this to test for nfsds handling this under heavy load.
         */
        if (waitflag == M_DONTWAIT) {
                slp->ns_flag |= SLP_NEEDQ; goto dorecs;
        }
#endif
        auio.uio_td = NULL;
        if (so->so_type == SOCK_STREAM) {
                /*
                 * If there are already records on the queue, defer soreceive()
                 * to an nfsd so that there is feedback to the TCP layer that
                 * the nfs servers are heavily loaded.
                 */
                if (STAILQ_FIRST(&slp->ns_rec) && waitflag == M_DONTWAIT) {
                        slp->ns_flag |= SLP_NEEDQ;
                        goto dorecs;
                }

                /*
                 * Do soreceive().
                 * uio_resid is preset to a large constant so the amount
                 * actually received can be computed from what is left.
                 */
                auio.uio_resid = 1000000000;
                flags = MSG_DONTWAIT;
                error = so_pru_soreceive(so, &nam, &auio, &mp, NULL, &flags);
                if (error || mp == (struct mbuf *)0) {
                        /* EWOULDBLOCK means try later; anything else disconnects. */
                        if (error == EWOULDBLOCK)
                                slp->ns_flag |= SLP_NEEDQ;
                        else
                                slp->ns_flag |= SLP_DISCONN;
                        goto dorecs;
                }
                /* Append the new data to the raw chain and update its byte count. */
                m = mp;
                if (slp->ns_rawend) {
                        slp->ns_rawend->m_next = m;
                        slp->ns_cc += 1000000000 - auio.uio_resid;
                } else {
                        slp->ns_raw = m;
                        slp->ns_cc = 1000000000 - auio.uio_resid;
                }
                /* Find the new last mbuf of the raw chain. */
                while (m->m_next)
                        m = m->m_next;
                slp->ns_rawend = m;

                /*
                 * Now try and parse record(s) out of the raw stream data.
                 */
                error = nfsrv_getstream(slp, waitflag);
                if (error) {
                        if (error == EPERM)
                                slp->ns_flag |= SLP_DISCONN;
                        else
                                slp->ns_flag |= SLP_NEEDQ;
                }
        } else {
                /*
                 * Non-stream socket: each successful receive is queued as
                 * one record together with the sender address.
                 */
                do {
                        auio.uio_resid = 1000000000;
                        flags = MSG_DONTWAIT;
                        error = so_pru_soreceive(so, &nam, &auio, &mp, NULL,
                            &flags);
                        if (mp) {
                                struct nfsrv_rec *rec;
                                int mf = (waitflag & M_DONTWAIT) ?
                                            M_NOWAIT : M_WAITOK;
                                rec = malloc(sizeof(struct nfsrv_rec),
                                             M_NFSRVDESC, mf);
                                if (!rec) {
                                        /*
                                         * No record available: drop this
                                         * packet.  The continue jumps to
                                         * the while (mp) test with mp
                                         * still holding its old non-NULL
                                         * value, so another receive is
                                         * attempted; the error check
                                         * below is skipped for this pass.
                                         */
                                        if (nam)
                                                FREE(nam, M_SONAME);
                                        m_freem(mp);
                                        continue;
                                }
                                nfs_realign(&mp, 10 * NFSX_UNSIGNED);
                                rec->nr_address = nam;
                                rec->nr_packet = mp;
                                STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
                        }
                        if (error) {
                                /*
                                 * Only protocols flagged PR_CONNREQUIRED
                                 * treat a hard error as a disconnect.
                                 */
                                if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
                                        && error != EWOULDBLOCK) {
                                        slp->ns_flag |= SLP_DISCONN;
                                        goto dorecs;
                                }
                        }
                } while (mp);
        }

        /*
         * Now try and process the request records, non-blocking.
         * An nfsd is woken only for M_DONTWAIT calls, and only when a
         * record is queued or SLP_NEEDQ/SLP_DISCONN is set.
         */
dorecs:
        if (waitflag == M_DONTWAIT &&
                (STAILQ_FIRST(&slp->ns_rec)
                 || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
                nfsrv_wakenfsd(slp);
}
2135
/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 *
 * Works on the raw chain slp->ns_raw (slp->ns_cc bytes).  Each pass reads
 * a 4-byte record mark if no fragment is in progress, cuts the fragment
 * off the raw chain, appends it to slp->ns_frag and, on the last
 * fragment, queues the assembled record on slp->ns_rec.  SLP_GETSTREAM
 * is held in ns_flag for the duration; entering with it already set
 * panics.
 *
 * Returns 0 when the raw data does not yet hold a complete record mark
 * or fragment, EPERM when a record mark announces more than
 * NFS_MAXPACKET bytes, and EWOULDBLOCK when m_copym() could not supply
 * an mbuf for splitting.
 */
static int
nfsrv_getstream(slp, waitflag)
        struct nfssvc_sock *slp;
        int waitflag;
{
        struct mbuf *m, **mpp;
        char *cp1, *cp2;
        int len;
        struct mbuf *om, *m2, *recm;
        u_int32_t recmark;

        if (slp->ns_flag & SLP_GETSTREAM)
                panic("nfs getstream");
        slp->ns_flag |= SLP_GETSTREAM;
        for (;;) {
            /* ns_reclen == 0: no fragment in progress, read a record mark. */
            if (slp->ns_reclen == 0) {
                if (slp->ns_cc < NFSX_UNSIGNED) {
                        slp->ns_flag &= ~SLP_GETSTREAM;
                        return (0);
                }
                m = slp->ns_raw;
                if (m->m_len >= NFSX_UNSIGNED) {
                        /* The whole mark sits in the first mbuf. */
                        bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
                        m->m_data += NFSX_UNSIGNED;
                        m->m_len -= NFSX_UNSIGNED;
                } else {
                        /*
                         * The mark straddles mbufs: gather it one byte at
                         * a time, stepping over exhausted mbufs.
                         */
                        cp1 = (caddr_t)&recmark;
                        cp2 = mtod(m, caddr_t);
                        while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
                                while (m->m_len == 0) {
                                        m = m->m_next;
                                        cp2 = mtod(m, caddr_t);
                                }
                                *cp1++ = *cp2++;
                                m->m_data++;
                                m->m_len--;
                        }
                }
                slp->ns_cc -= NFSX_UNSIGNED;
                /*
                 * The top bit of the mark flags the last fragment; the
                 * remaining 31 bits are the fragment length.
                 */
                recmark = ntohl(recmark);
                slp->ns_reclen = recmark & ~0x80000000;
                if (recmark & 0x80000000)
                        slp->ns_flag |= SLP_LASTFRAG;
                else
                        slp->ns_flag &= ~SLP_LASTFRAG;
                if (slp->ns_reclen > NFS_MAXPACKET) {
                        slp->ns_flag &= ~SLP_GETSTREAM;
                        return (EPERM);
                }
            }

            /*
             * Now get the record part.
             *
             * Note that slp->ns_reclen may be 0.  Linux sometimes
             * generates 0-length RPCs
             */
            recm = NULL;
            if (slp->ns_cc == slp->ns_reclen) {
                /* The raw chain is exactly this fragment: take all of it. */
                recm = slp->ns_raw;
                slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
                slp->ns_cc = slp->ns_reclen = 0;
            } else if (slp->ns_cc > slp->ns_reclen) {
                /*
                 * More data than the fragment needs: walk the chain and
                 * cut it after ns_reclen bytes.  om trails one mbuf
                 * behind m.
                 */
                len = 0;
                m = slp->ns_raw;
                om = (struct mbuf *)0;

                while (len < slp->ns_reclen) {
                        if ((len + m->m_len) > slp->ns_reclen) {
                                /*
                                 * The boundary falls inside m: copy the
                                 * leading part for the fragment and trim
                                 * it off m, which stays on the raw chain.
                                 */
                                m2 = m_copym(m, 0, slp->ns_reclen - len,
                                        waitflag);
                                if (m2) {
                                        if (om) {
                                                om->m_next = m2;
                                                recm = slp->ns_raw;
                                        } else
                                                recm = m2;
                                        m->m_data += slp->ns_reclen - len;
                                        m->m_len -= slp->ns_reclen - len;
                                        len = slp->ns_reclen;
                                } else {
                                        /*
                                         * Copy failed; ns_raw, ns_cc and
                                         * ns_reclen are still unmodified
                                         * by this split attempt.
                                         */
                                        slp->ns_flag &= ~SLP_GETSTREAM;
                                        return (EWOULDBLOCK);
                                }
                        } else if ((len + m->m_len) == slp->ns_reclen) {
                                /* The boundary is at the end of m: unlink after it. */
                                om = m;
                                len += m->m_len;
                                m = m->m_next;
                                recm = slp->ns_raw;
                                om->m_next = (struct mbuf *)0;
                        } else {
                                /* m lies wholly inside the fragment: keep walking. */
                                om = m;
                                len += m->m_len;
                                m = m->m_next;
                        }
                }
                slp->ns_raw = m;
                slp->ns_cc -= len;
                slp->ns_reclen = 0;
            } else {
                /* Fragment not complete yet; ns_reclen is kept for the next call. */
                slp->ns_flag &= ~SLP_GETSTREAM;
                return (0);
            }

            /*
             * Accumulate the fragments into a record.
             */
            mpp = &slp->ns_frag;
            while (*mpp)
                mpp = &((*mpp)->m_next);
            *mpp = recm;
            if (slp->ns_flag & SLP_LASTFRAG) {
                struct nfsrv_rec *rec;
                int mf = (waitflag & M_DONTWAIT) ? M_NOWAIT : M_WAITOK;
                rec = malloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
                if (!rec) {
                    /* No record available: the assembled request is dropped. */
                    m_freem(slp->ns_frag);
                } else {
                    nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
                    rec->nr_address = (struct sockaddr *)0;
                    rec->nr_packet = slp->ns_frag;
                    STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
                }
                slp->ns_frag = (struct mbuf *)0;
            }
        }
}
2268
2269 /*
2270  * Parse an RPC header.
2271  */
2272 int
2273 nfsrv_dorec(slp, nfsd, ndp)
2274         struct nfssvc_sock *slp;
2275         struct nfsd *nfsd;
2276         struct nfsrv_descript **ndp;
2277 {
2278         struct nfsrv_rec *rec;
2279         struct mbuf *m;
2280         struct sockaddr *nam;
2281         struct nfsrv_descript *nd;
2282         int error;
2283
2284         *ndp = NULL;
2285         if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
2286                 return (ENOBUFS);
2287         rec = STAILQ_FIRST(&slp->ns_rec);
2288         STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
2289         nam = rec->nr_address;
2290         m = rec->nr_packet;
2291         free(rec, M_NFSRVDESC);
2292         MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
2293                 M_NFSRVDESC, M_WAITOK);
2294         nd->nd_md = nd->nd_mrep = m;
2295         nd->nd_nam2 = nam;
2296         nd->nd_dpos = mtod(m, caddr_t);
2297         error = nfs_getreq(nd, nfsd, TRUE);
2298         if (error) {
2299                 if (nam) {
2300                         FREE(nam, M_SONAME);
2301                 }
2302                 free((caddr_t)nd, M_NFSRVDESC);
2303                 return (error);
2304         }
2305         *ndp = nd;
2306         nfsd->nfsd_nd = nd;
2307         return (0);
2308 }
2309
2310 /*
2311  * Search for a sleeping nfsd and wake it up.
2312  * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2313  * running nfsds will go look for the work in the nfssvc_sock list.
2314  */
2315 void
2316 nfsrv_wakenfsd(slp)
2317         struct nfssvc_sock *slp;
2318 {
2319         struct nfsd *nd;
2320
2321         if ((slp->ns_flag & SLP_VALID) == 0)
2322                 return;
2323         for (nd = nfsd_head.tqh_first; nd != 0; nd = nd->nfsd_chain.tqe_next) {
2324                 if (nd->nfsd_flag & NFSD_WAITING) {
2325                         nd->nfsd_flag &= ~NFSD_WAITING;
2326                         if (nd->nfsd_slp)
2327                                 panic("nfsd wakeup");
2328                         slp->ns_sref++;
2329                         nd->nfsd_slp = slp;
2330                         wakeup((caddr_t)nd);
2331                         return;
2332                 }
2333         }
2334         slp->ns_flag |= SLP_DOREC;
2335         nfsd_head_flag |= NFSD_CHECKSLP;
2336 }
2337 #endif /* NFS_NOSERVER */