nrelease - fix/improve livecd
[dragonfly.git] / sys / netinet / tcp_usrreq.c
1 /*
2  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1993
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      From: @(#)tcp_usrreq.c  8.2 (Berkeley) 1/3/94
63  * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $
64  */
65
66 #include "opt_inet.h"
67 #include "opt_inet6.h"
68 #include "opt_tcpdebug.h"
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/malloc.h>
74 #include <sys/sysctl.h>
75 #include <sys/globaldata.h>
76 #include <sys/thread.h>
77
78 #include <sys/mbuf.h>
79 #ifdef INET6
80 #include <sys/domain.h>
81 #endif /* INET6 */
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/socketops.h>
85 #include <sys/protosw.h>
86 #include <sys/jail.h>
87
88 #include <sys/msgport2.h>
89 #include <sys/socketvar2.h>
90
91 #include <net/if.h>
92 #include <net/netisr.h>
93 #include <net/route.h>
94
95 #include <net/netmsg2.h>
96 #include <net/netisr2.h>
97
98 #include <netinet/in.h>
99 #include <netinet/in_systm.h>
100 #ifdef INET6
101 #include <netinet/ip6.h>
102 #endif
103 #include <netinet/in_pcb.h>
104 #ifdef INET6
105 #include <netinet6/in6_pcb.h>
106 #endif
107 #include <netinet/in_var.h>
108 #include <netinet/ip_var.h>
109 #ifdef INET6
110 #include <netinet6/ip6_var.h>
111 #include <netinet6/tcp6_var.h>
112 #endif
113 #include <netinet/tcp.h>
114 #include <netinet/tcp_fsm.h>
115 #include <netinet/tcp_seq.h>
116 #include <netinet/tcp_timer.h>
117 #include <netinet/tcp_timer2.h>
118 #include <netinet/tcp_var.h>
119 #include <netinet/tcpip.h>
120 #ifdef TCPDEBUG
121 #include <netinet/tcp_debug.h>
122 #endif
123 #include <machine/limits.h>
124
/*
 * Limits for the TCP_KEEP* socket options (we adopt the same limits
 * that Linux uses).
 */
129 #define MAXKEEPALIVE            32767
130 #define MAXKEEPCNT              127
131
132 /*
133  * TCP protocol interface to socket abstraction.
134  */
135 extern  char *tcpstates[];      /* XXX ??? */
136
137 static int      tcp_attach (struct socket *, struct pru_attach_info *);
138 static void     tcp_connect (netmsg_t msg);
139 #ifdef INET6
140 static void     tcp6_connect (netmsg_t msg);
141 static int      tcp6_connect_oncpu(struct tcpcb *tp, int flags,
142                                 struct mbuf **mp,
143                                 struct sockaddr_in6 *sin6,
144                                 struct in6_addr *addr6);
145 #endif /* INET6 */
146 static struct tcpcb *
147                 tcp_disconnect (struct tcpcb *);
148 static struct tcpcb *
149                 tcp_usrclosed (struct tcpcb *);
150
/*
 * Debug-trace helpers used by the pru_*() handlers.
 *
 * TCPDEBUG0      declares the local (ostate), initialised to 0.
 * TCPDEBUG1()    stores tp->t_state into (ostate), or 0 when tp is NULL.
 * TCPDEBUG2(req) calls tcp_trace() when tp is non-NULL and SO_DEBUG is
 *                set in so->so_options.
 *
 * The macros assume locals named (tp) and (so) are in scope, and expand
 * to nothing unless TCPDEBUG is defined.  TCPDEBUG1/TCPDEBUG2 are wrapped
 * in do { } while (0) so that each one is a single statement; a bare
 * "if" expansion would capture a following "else" at the call site.
 */
#ifdef TCPDEBUG
#define TCPDEBUG0	int ostate = 0
#define TCPDEBUG1()	do {						\
	ostate = tp ? tp->t_state : 0;					\
} while (0)
#define TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, (req));		\
} while (0)
#else
#define TCPDEBUG0
#define TCPDEBUG1()
#define TCPDEBUG2(req)
#endif
161
/*
 * Some poorly optimized programs use TCP_NOPUSH in an attempt to
 * improve performance and end up leaving a small amount of data
 * sitting in the send buffer.  That data will _not_ be pushed onto
 * the network until more data is written to the socket or the write
 * side of the socket is shut down.
 */
169 static int      tcp_disable_nopush = 1;
170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW,
171     &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect");
172
173 /*
174  * Allocate socket buffer space.
175  */
176 static int
177 tcp_usr_preattach(struct socket *so, int proto __unused,
178     struct pru_attach_info *ai)
179 {
180         int error;
181
182         if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
183                 error = soreserve(so, tcp_sendspace, tcp_recvspace,
184                                   ai->sb_rlimit);
185                 if (error)
186                         return (error);
187         }
188         atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
189         atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC);
190
191         return 0;
192 }
193
194 /*
195  * TCP attaches to socket via pru_attach(), reserving space,
196  * and an internet control block.  This socket may move to
197  * other CPU later when we bind/connect.
198  */
199 static void
200 tcp_usr_attach(netmsg_t msg)
201 {
202         struct socket *so = msg->base.nm_so;
203         struct pru_attach_info *ai = msg->attach.nm_ai;
204         int error;
205         struct inpcb *inp;
206         struct tcpcb *tp = NULL;
207         TCPDEBUG0;
208
209         inp = so->so_pcb;
210         KASSERT(inp == NULL, ("tcp socket attached"));
211         TCPDEBUG1();
212
213         error = tcp_attach(so, ai);
214         if (error)
215                 goto out;
216
217         if ((so->so_options & SO_LINGER) && so->so_linger == 0)
218                 so->so_linger = TCP_LINGERTIME;
219         tp = sototcpcb(so);
220 out:
221         TCPDEBUG2(PRU_ATTACH);
222         lwkt_replymsg(&msg->lmsg, error);
223 }
224
225 /*
226  * pru_detach() detaches the TCP protocol from the socket.
227  * If the protocol state is non-embryonic, then can't
228  * do this directly: have to initiate a pru_disconnect(),
229  * which may finish later; embryonic TCB's can just
230  * be discarded here.
231  */
232 static void
233 tcp_usr_detach(netmsg_t msg)
234 {
235         struct socket *so = msg->base.nm_so;
236         int error = 0;
237         struct inpcb *inp;
238         struct tcpcb *tp;
239         TCPDEBUG0;
240
241         inp = so->so_pcb;
242
243         /*
244          * If the inp is already detached or never attached, it may have
245          * been due to an async close or async attach failure.  Just return
246          * as if no error occured.
247          */
248         if (inp) {
249                 tp = intotcpcb(inp);
250                 KASSERT(tp != NULL, ("tcp_usr_detach: tp is NULL"));
251                 TCPDEBUG1();
252                 tp = tcp_disconnect(tp);
253                 TCPDEBUG2(PRU_DETACH);
254         }
255         lwkt_replymsg(&msg->lmsg, error);
256 }
257
/*
 * Shared prologue/epilogue for the pru_*() message handlers.
 *
 * COMMON_START(so, inp, ignore_error)
 *	Loads (inp) from (so)->so_pcb.  When it is NULL, sets (error) to
 *	0 if ignore_error is non-zero and to EINVAL otherwise, clears
 *	(tp) and jumps to the "out" label.  Otherwise sets (tp) from
 *	(inp).
 *
 * COMMON_END1(req, noreply)
 *	Defines the "out" label, replies to (msg) with (error) unless
 *	noreply is non-zero, and returns from the handler.
 *
 * COMMON_END(req)
 *	COMMON_END1() that always replies.
 *
 * NOTE: ignore_error is non-zero for certain disconnection races
 * which we want to silently allow, otherwise close() may return
 * an unexpected error.
 *
 * NOTE: The variables (msg), (tp) and (error) are assumed to exist in
 * the calling function; the TCPDEBUG macros additionally reference (so).
 *
 * Macro arguments are parenthesized so that an expression argument
 * cannot change the meaning of the expansion.
 */
#define COMMON_START(so, inp, ignore_error)				\
	TCPDEBUG0;							\
									\
	(inp) = (so)->so_pcb;						\
	do {								\
		if ((inp) == NULL) {					\
			error = (ignore_error) ? 0 : EINVAL;		\
			tp = NULL;					\
			goto out;					\
		}							\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

#define COMMON_END1(req, noreply)					\
	out: do {							\
		TCPDEBUG2(req);						\
		if (!(noreply))						\
			lwkt_replymsg(&msg->lmsg, error);		\
		return;							\
	} while (0)

#define COMMON_END(req)		COMMON_END1((req), 0)
288
289 static void
290 tcp_sosetport(struct lwkt_msg *msg, lwkt_port_t port)
291 {
292         sosetport(((struct netmsg_base *)msg)->nm_so, port);
293 }
294
295 /*
296  * Give the socket an address.
297  */
298 static void
299 tcp_usr_bind(netmsg_t msg)
300 {
301         struct socket *so = msg->bind.base.nm_so;
302         struct sockaddr *nam = msg->bind.nm_nam;
303         struct thread *td = msg->bind.nm_td;
304         int error = 0;
305         struct inpcb *inp;
306         struct tcpcb *tp;
307         struct sockaddr_in *sinp;
308         lwkt_port_t port0 = netisr_cpuport(0);
309
310         COMMON_START(so, inp, 0);
311
312         /*
313          * Must check for multicast addresses and disallow binding
314          * to them.
315          */
316         sinp = (struct sockaddr_in *)nam;
317         if (sinp->sin_family == AF_INET &&
318             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
319                 error = EAFNOSUPPORT;
320                 goto out;
321         }
322
323         /*
324          * Check "already bound" here (in_pcbbind() does the same check
325          * though), so we don't forward a connected socket to netisr0,
326          * which would panic in the following in_pcbunlink().
327          */
328         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) {
329                 error = EINVAL; /* already bound */
330                 goto out;
331         }
332
333         /*
334          * Use netisr0 to serialize in_pcbbind(), so that pru_detach and
335          * pru_bind for different sockets on the same local port could be
336          * properly ordered.  The original race is illustrated here for
337          * reference.
338          *
339          * s1 = socket();
340          * bind(s1, *.PORT);
341          * close(s1);  <----- asynchronous
342          * s2 = socket();
343          * bind(s2, *.PORT);
344          *
345          * All will expect bind(s2, *.PORT) to succeed.  However, it will
346          * fail, if following sequence happens due to random socket initial
347          * msgport and asynchronous close(2):
348          *
349          *    netisrN                  netisrM
350          *       :                        :
351          *       :                    pru_bind(s2) [*.PORT is used by s1]
352          *  pru_detach(s1)                :
353          */
354         if (&curthread->td_msgport != port0) {
355                 lwkt_msg_t lmsg = &msg->bind.base.lmsg;
356
357                 KASSERT((msg->bind.nm_flags & PRUB_RELINK) == 0,
358                     ("already asked to relink"));
359
360                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]);
361                 msg->bind.nm_flags |= PRUB_RELINK;
362
363                 TCP_STATE_MIGRATE_START(tp);
364
365                 /* See the related comment in tcp_connect() */
366                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
367                 lwkt_forwardmsg(port0, lmsg);
368                 /* msg invalid now */
369                 return;
370         }
371         KASSERT(so->so_port == port0, ("so_port is not netisr0"));
372
373         if (msg->bind.nm_flags & PRUB_RELINK) {
374                 msg->bind.nm_flags &= ~PRUB_RELINK;
375                 TCP_STATE_MIGRATE_END(tp);
376                 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]);
377         }
378         KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0"));
379
380         error = in_pcbbind(inp, nam, td);
381         if (error)
382                 goto out;
383
384         COMMON_END(PRU_BIND);
385 }
386
387 #ifdef INET6
388
389 static void
390 tcp6_usr_bind(netmsg_t msg)
391 {
392         struct socket *so = msg->bind.base.nm_so;
393         struct sockaddr *nam = msg->bind.nm_nam;
394         struct thread *td = msg->bind.nm_td;
395         int error = 0;
396         struct inpcb *inp;
397         struct tcpcb *tp;
398         struct sockaddr_in6 *sin6p;
399
400         COMMON_START(so, inp, 0);
401
402         /*
403          * Must check for multicast addresses and disallow binding
404          * to them.
405          */
406         sin6p = (struct sockaddr_in6 *)nam;
407         if (sin6p->sin6_family == AF_INET6 &&
408             IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
409                 error = EAFNOSUPPORT;
410                 goto out;
411         }
412         error = in6_pcbbind(inp, nam, td);
413         if (error)
414                 goto out;
415         COMMON_END(PRU_BIND);
416 }
417 #endif /* INET6 */
418
419 struct netmsg_inswildcard {
420         struct netmsg_base      base;
421         struct inpcb            *nm_inp;
422 };
423
424 static void
425 in_pcbinswildcardhash_handler(netmsg_t msg)
426 {
427         struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg;
428         int cpu = mycpuid, nextcpu;
429
430         in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]);
431
432         nextcpu = cpu + 1;
433         if (nextcpu < netisr_ncpus)
434                 lwkt_forwardmsg(netisr_cpuport(nextcpu), &nm->base.lmsg);
435         else
436                 lwkt_replymsg(&nm->base.lmsg, 0);
437 }
438
439 /*
440  * Prepare to accept connections.
441  */
442 static void
443 tcp_usr_listen(netmsg_t msg)
444 {
445         struct socket *so = msg->listen.base.nm_so;
446         struct thread *td = msg->listen.nm_td;
447         int error = 0;
448         struct inpcb *inp;
449         struct tcpcb *tp;
450         struct netmsg_inswildcard nm;
451         lwkt_port_t port0 = netisr_cpuport(0);
452
453         COMMON_START(so, inp, 0);
454
455         if (&curthread->td_msgport != port0) {
456                 lwkt_msg_t lmsg = &msg->listen.base.lmsg;
457
458                 KASSERT((msg->listen.nm_flags & PRUL_RELINK) == 0,
459                     ("already asked to relink"));
460
461                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]);
462                 msg->listen.nm_flags |= PRUL_RELINK;
463
464                 TCP_STATE_MIGRATE_START(tp);
465
466                 /* See the related comment in tcp_connect() */
467                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
468                 lwkt_forwardmsg(port0, lmsg);
469                 /* msg invalid now */
470                 return;
471         }
472         KASSERT(so->so_port == port0, ("so_port is not netisr0"));
473
474         if (msg->listen.nm_flags & PRUL_RELINK) {
475                 msg->listen.nm_flags &= ~PRUL_RELINK;
476                 TCP_STATE_MIGRATE_END(tp);
477                 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]);
478         }
479         KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0"));
480
481         if (tp->t_flags & TF_LISTEN)
482                 goto out;
483
484         if (inp->inp_lport == 0) {
485                 error = in_pcbbind(inp, NULL, td);
486                 if (error)
487                         goto out;
488         }
489
490         TCP_STATE_CHANGE(tp, TCPS_LISTEN);
491         tp->t_flags |= TF_LISTEN;
492         tp->tt_msg = NULL; /* Catch any invalid timer usage */
493
494         /*
495          * Create tcpcb per-cpu port cache
496          *
497          * NOTE:
498          * This _must_ be done before installing this inpcb into
499          * wildcard hash.
500          */
501         tcp_pcbport_create(tp);
502
503         if (netisr_ncpus > 1) {
504                 /*
505                  * Put this inpcb into wildcard hash on other cpus.
506                  */
507                 ASSERT_INP_NOTINHASH(inp);
508                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
509                             MSGF_PRIORITY, in_pcbinswildcardhash_handler);
510                 nm.nm_inp = inp;
511                 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
512         }
513         in_pcbinswildcardhash(inp);
514         COMMON_END(PRU_LISTEN);
515 }
516
517 #ifdef INET6
518
519 static void
520 tcp6_usr_listen(netmsg_t msg)
521 {
522         struct socket *so = msg->listen.base.nm_so;
523         struct thread *td = msg->listen.nm_td;
524         int error = 0;
525         struct inpcb *inp;
526         struct tcpcb *tp;
527         struct netmsg_inswildcard nm;
528
529         COMMON_START(so, inp, 0);
530
531         if (tp->t_flags & TF_LISTEN)
532                 goto out;
533
534         if (inp->inp_lport == 0) {
535                 error = in6_pcbbind(inp, NULL, td);
536                 if (error)
537                         goto out;
538         }
539
540         TCP_STATE_CHANGE(tp, TCPS_LISTEN);
541         tp->t_flags |= TF_LISTEN;
542         tp->tt_msg = NULL; /* Catch any invalid timer usage */
543
544         /*
545          * Create tcpcb per-cpu port cache
546          *
547          * NOTE:
548          * This _must_ be done before installing this inpcb into
549          * wildcard hash.
550          */
551         tcp_pcbport_create(tp);
552
553         if (netisr_ncpus > 1) {
554                 /*
555                  * Put this inpcb into wildcard hash on other cpus.
556                  */
557                 KKASSERT(so->so_port == netisr_cpuport(0));
558                 ASSERT_NETISR0;
559                 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]);
560                 ASSERT_INP_NOTINHASH(inp);
561
562                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
563                             MSGF_PRIORITY, in_pcbinswildcardhash_handler);
564                 nm.nm_inp = inp;
565                 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
566         }
567         in_pcbinswildcardhash(inp);
568         COMMON_END(PRU_LISTEN);
569 }
570 #endif /* INET6 */
571
572 /*
573  * Initiate connection to peer.
574  * Create a template for use in transmissions on this connection.
575  * Enter SYN_SENT state, and mark socket as connecting.
576  * Start keep-alive timer, and seed output sequence space.
577  * Send initial segment on connection.
578  */
579 static void
580 tcp_usr_connect(netmsg_t msg)
581 {
582         struct socket *so = msg->connect.base.nm_so;
583         struct sockaddr *nam = msg->connect.nm_nam;
584         struct thread *td = msg->connect.nm_td;
585         int error = 0;
586         struct inpcb *inp;
587         struct tcpcb *tp;
588         struct sockaddr_in *sinp;
589
590         ASSERT_NETISR_NCPUS(mycpuid);
591
592         COMMON_START(so, inp, 0);
593
594         /*
595          * Must disallow TCP ``connections'' to multicast addresses.
596          */
597         sinp = (struct sockaddr_in *)nam;
598         if (sinp->sin_family == AF_INET
599             && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
600                 error = EAFNOSUPPORT;
601                 goto out;
602         }
603         tcp_connect(msg);
604         /* msg is invalid now */
605         return;
606 out:
607         if (msg->connect.nm_m) {
608                 m_freem(msg->connect.nm_m);
609                 msg->connect.nm_m = NULL;
610         }
611         if (msg->connect.nm_flags & PRUC_HELDTD)
612                 lwkt_rele(td);
613         if (error && (msg->connect.nm_flags & PRUC_ASYNC)) {
614                 so->so_error = error;
615                 soisdisconnected(so);
616         }
617         lwkt_replymsg(&msg->lmsg, error);
618 }
619
620 #ifdef INET6
621
622 static void
623 tcp6_usr_connect(netmsg_t msg)
624 {
625         struct socket *so = msg->connect.base.nm_so;
626         struct sockaddr *nam = msg->connect.nm_nam;
627         struct thread *td = msg->connect.nm_td;
628         int error = 0;
629         struct inpcb *inp;
630         struct tcpcb *tp;
631         struct sockaddr_in6 *sin6p;
632
633         ASSERT_NETISR_NCPUS(mycpuid);
634
635         COMMON_START(so, inp, 0);
636
637         /*
638          * Must disallow TCP ``connections'' to multicast addresses.
639          */
640         sin6p = (struct sockaddr_in6 *)nam;
641         if (sin6p->sin6_family == AF_INET6
642             && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
643                 error = EAFNOSUPPORT;
644                 goto out;
645         }
646
647         if (!prison_remote_ip(td, nam)) {
648                 error = EAFNOSUPPORT;   /* Illegal jail IP */
649                 goto out;
650         }
651
652         /* Reject v4-mapped address */
653         if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
654                 error = EADDRNOTAVAIL;
655                 goto out;
656         }
657
658         inp->inp_inc.inc_isipv6 = 1;
659         tcp6_connect(msg);
660         /* msg is invalid now */
661         return;
662 out:
663         if (msg->connect.nm_m) {
664                 m_freem(msg->connect.nm_m);
665                 msg->connect.nm_m = NULL;
666         }
667         lwkt_replymsg(&msg->lmsg, error);
668 }
669
670 #endif /* INET6 */
671
672 /*
673  * Initiate disconnect from peer.
674  * If connection never passed embryonic stage, just drop;
675  * else if don't need to let data drain, then can just drop anyways,
676  * else have to begin TCP shutdown process: mark socket disconnecting,
677  * drain unread data, state switch to reflect user close, and
678  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
679  * when peer sends FIN and acks ours.
680  *
681  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
682  */
683 static void
684 tcp_usr_disconnect(netmsg_t msg)
685 {
686         struct socket *so = msg->disconnect.base.nm_so;
687         int error = 0;
688         struct inpcb *inp;
689         struct tcpcb *tp;
690
691         COMMON_START(so, inp, 1);
692         tp = tcp_disconnect(tp);
693         COMMON_END(PRU_DISCONNECT);
694 }
695
696 /*
697  * Accept a connection.  Essentially all the work is
698  * done at higher levels; just return the address
699  * of the peer, storing through addr.
700  */
701 static void
702 tcp_usr_accept(netmsg_t msg)
703 {
704         struct socket *so = msg->accept.base.nm_so;
705         struct sockaddr **nam = msg->accept.nm_nam;
706         int error = 0;
707         struct inpcb *inp;
708         struct tcpcb *tp = NULL;
709         TCPDEBUG0;
710
711         inp = so->so_pcb;
712         if (so->so_state & SS_ISDISCONNECTED) {
713                 error = ECONNABORTED;
714                 goto out;
715         }
716         if (inp == NULL) {
717                 error = EINVAL;
718                 goto out;
719         }
720
721         tp = intotcpcb(inp);
722         TCPDEBUG1();
723         in_setpeeraddr(so, nam);
724         COMMON_END(PRU_ACCEPT);
725 }
726
727 #ifdef INET6
728 static void
729 tcp6_usr_accept(netmsg_t msg)
730 {
731         struct socket *so = msg->accept.base.nm_so;
732         struct sockaddr **nam = msg->accept.nm_nam;
733         int error = 0;
734         struct inpcb *inp;
735         struct tcpcb *tp = NULL;
736         TCPDEBUG0;
737
738         inp = so->so_pcb;
739
740         if (so->so_state & SS_ISDISCONNECTED) {
741                 error = ECONNABORTED;
742                 goto out;
743         }
744         if (inp == NULL) {
745                 error = EINVAL;
746                 goto out;
747         }
748         tp = intotcpcb(inp);
749         TCPDEBUG1();
750         in6_setpeeraddr(so, nam);
751         COMMON_END(PRU_ACCEPT);
752 }
753 #endif /* INET6 */
754
755 /*
756  * Mark the connection as being incapable of further output.
757  */
758 static void
759 tcp_usr_shutdown(netmsg_t msg)
760 {
761         struct socket *so = msg->shutdown.base.nm_so;
762         int error = 0;
763         struct inpcb *inp;
764         struct tcpcb *tp;
765
766         COMMON_START(so, inp, 0);
767         socantsendmore(so);
768         tp = tcp_usrclosed(tp);
769         if (tp)
770                 error = tcp_output(tp);
771         COMMON_END(PRU_SHUTDOWN);
772 }
773
774 /*
775  * After a receive, possibly send window update to peer.
776  */
777 static void
778 tcp_usr_rcvd(netmsg_t msg)
779 {
780         struct socket *so = msg->rcvd.base.nm_so;
781         int error = 0, noreply = 0;
782         struct inpcb *inp;
783         struct tcpcb *tp;
784
785         COMMON_START(so, inp, 0);
786
787         if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) {
788                 noreply = 1;
789                 so_async_rcvd_reply(so);
790         }
791         tcp_output(tp);
792
793         COMMON_END1(PRU_RCVD, noreply);
794 }
795
796 /*
797  * Do a send by putting data in output queue and updating urgent
798  * marker if URG set.  Possibly send more data.  Unlike the other
799  * pru_*() routines, the mbuf chains are our responsibility.  We
800  * must either enqueue them or free them.  The other pru_* routines
801  * generally are caller-frees.
802  */
803 static void
804 tcp_usr_send(netmsg_t msg)
805 {
806         struct socket *so = msg->send.base.nm_so;
807         int flags = msg->send.nm_flags;
808         struct mbuf *m = msg->send.nm_m;
809         int error = 0;
810         struct inpcb *inp;
811         struct tcpcb *tp;
812         TCPDEBUG0;
813
814         KKASSERT(msg->send.nm_control == NULL);
815         KKASSERT(msg->send.nm_addr == NULL);
816         KKASSERT((flags & PRUS_FREEADDR) == 0);
817
818         inp = so->so_pcb;
819
820         if (inp == NULL) {
821                 /*
822                  * OOPS! we lost a race, the TCP session got reset after
823                  * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
824                  * network interrupt in the non-critical section of sosend().
825                  */
826                 m_freem(m);
827                 error = ECONNRESET;     /* XXX EPIPE? */
828                 tp = NULL;
829                 TCPDEBUG1();
830                 goto out;
831         }
832         tp = intotcpcb(inp);
833         TCPDEBUG1();
834
835 #ifdef foo
836         /*
837          * This is no longer necessary, since:
838          * - sosendtcp() has already checked it for us
839          * - It does not work with asynchronized send
840          */
841
842         /*
843          * Don't let too much OOB data build up
844          */
845         if (flags & PRUS_OOB) {
846                 if (ssb_space(&so->so_snd) < -512) {
847                         m_freem(m);
848                         error = ENOBUFS;
849                         goto out;
850                 }
851         }
852 #endif
853
854         /*
855          * Pump the data into the socket.
856          */
857         if (m) {
858                 ssb_appendstream(&so->so_snd, m);
859                 sowwakeup(so);
860         }
861         if (flags & PRUS_OOB) {
862                 /*
863                  * According to RFC961 (Assigned Protocols),
864                  * the urgent pointer points to the last octet
865                  * of urgent data.  We continue, however,
866                  * to consider it to indicate the first octet
867                  * of data past the urgent section.
868                  * Otherwise, snd_up should be one lower.
869                  */
870                 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
871                 tp->t_flags |= TF_FORCE;
872                 error = tcp_output(tp);
873                 tp->t_flags &= ~TF_FORCE;
874         } else {
875                 if (flags & PRUS_EOF) {
876                         /*
877                          * Close the send side of the connection after
878                          * the data is sent.
879                          */
880                         socantsendmore(so);
881                         tp = tcp_usrclosed(tp);
882                 }
883                 if (tp != NULL && !tcp_output_pending(tp)) {
884                         if (flags & PRUS_MORETOCOME)
885                                 tp->t_flags |= TF_MORETOCOME;
886                         error = tcp_output_fair(tp);
887                         if (flags & PRUS_MORETOCOME)
888                                 tp->t_flags &= ~TF_MORETOCOME;
889                 }
890         }
891         COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB :
892                    ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND),
893                    (flags & PRUS_NOREPLY));
894 }
895
896 /*
897  * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort()
898  *       will sofree() it when we return.
899  */
900 static void
901 tcp_usr_abort(netmsg_t msg)
902 {
903         struct socket *so = msg->abort.base.nm_so;
904         int error = 0;
905         struct inpcb *inp;
906         struct tcpcb *tp;
907
908         COMMON_START(so, inp, 1);
909         tp = tcp_drop(tp, ECONNABORTED);
910         COMMON_END(PRU_ABORT);
911 }
912
913 /*
914  * Receive out-of-band data.
915  */
916 static void
917 tcp_usr_rcvoob(netmsg_t msg)
918 {
919         struct socket *so = msg->rcvoob.base.nm_so;
920         struct mbuf *m = msg->rcvoob.nm_m;
921         int flags = msg->rcvoob.nm_flags;
922         int error = 0;
923         struct inpcb *inp;
924         struct tcpcb *tp;
925
926         COMMON_START(so, inp, 0);
927         if ((so->so_oobmark == 0 &&
928              (so->so_state & SS_RCVATMARK) == 0) ||
929             so->so_options & SO_OOBINLINE ||
930             tp->t_oobflags & TCPOOB_HADDATA) {
931                 error = EINVAL;
932                 goto out;
933         }
934         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
935                 error = EWOULDBLOCK;
936                 goto out;
937         }
938         m->m_len = 1;
939         *mtod(m, caddr_t) = tp->t_iobc;
940         if ((flags & MSG_PEEK) == 0)
941                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
942         COMMON_END(PRU_RCVOOB);
943 }
944
/*
 * pru_savefaddr hook for tcp_usrreqs: hands the foreign address
 * straight to in_savefaddr().
 */
static void
tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{
	in_savefaddr(so, faddr);
}
950
951 #ifdef INET6
/*
 * pru_savefaddr hook for tcp6_usrreqs: hands the foreign address
 * straight to in6_savefaddr().
 */
static void
tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{
	in6_savefaddr(so, faddr);
}
957 #endif
958
959 static int
960 tcp_usr_preconnect(struct socket *so, const struct sockaddr *nam,
961     struct thread *td __unused)
962 {
963         const struct sockaddr_in *sinp;
964
965         sinp = (const struct sockaddr_in *)nam;
966         if (sinp->sin_family == AF_INET &&
967             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
968                 return EAFNOSUPPORT;
969
970         soisconnecting(so);
971         return 0;
972 }
973
974 /* xxx - should be const */
975 struct pr_usrreqs tcp_usrreqs = {
976         .pru_abort = tcp_usr_abort,
977         .pru_accept = tcp_usr_accept,
978         .pru_attach = tcp_usr_attach,
979         .pru_bind = tcp_usr_bind,
980         .pru_connect = tcp_usr_connect,
981         .pru_connect2 = pr_generic_notsupp,
982         .pru_control = in_control_dispatch,
983         .pru_detach = tcp_usr_detach,
984         .pru_disconnect = tcp_usr_disconnect,
985         .pru_listen = tcp_usr_listen,
986         .pru_peeraddr = in_setpeeraddr_dispatch,
987         .pru_rcvd = tcp_usr_rcvd,
988         .pru_rcvoob = tcp_usr_rcvoob,
989         .pru_send = tcp_usr_send,
990         .pru_sense = pru_sense_null,
991         .pru_shutdown = tcp_usr_shutdown,
992         .pru_sockaddr = in_setsockaddr_dispatch,
993         .pru_sosend = sosendtcp,
994         .pru_soreceive = sorecvtcp,
995         .pru_savefaddr = tcp_usr_savefaddr,
996         .pru_preconnect = tcp_usr_preconnect,
997         .pru_preattach = tcp_usr_preattach
998 };
999
1000 #ifdef INET6
1001 struct pr_usrreqs tcp6_usrreqs = {
1002         .pru_abort = tcp_usr_abort,
1003         .pru_accept = tcp6_usr_accept,
1004         .pru_attach = tcp_usr_attach,
1005         .pru_bind = tcp6_usr_bind,
1006         .pru_connect = tcp6_usr_connect,
1007         .pru_connect2 = pr_generic_notsupp,
1008         .pru_control = in6_control_dispatch,
1009         .pru_detach = tcp_usr_detach,
1010         .pru_disconnect = tcp_usr_disconnect,
1011         .pru_listen = tcp6_usr_listen,
1012         .pru_peeraddr = in6_setpeeraddr_dispatch,
1013         .pru_rcvd = tcp_usr_rcvd,
1014         .pru_rcvoob = tcp_usr_rcvoob,
1015         .pru_send = tcp_usr_send,
1016         .pru_sense = pru_sense_null,
1017         .pru_shutdown = tcp_usr_shutdown,
1018         .pru_sockaddr = in6_setsockaddr_dispatch,
1019         .pru_sosend = sosendtcp,
1020         .pru_soreceive = sorecvtcp,
1021         .pru_savefaddr = tcp6_usr_savefaddr
1022 };
1023 #endif /* INET6 */
1024
1025 static int
1026 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
1027                   const struct sockaddr_in *sin, struct sockaddr_in *if_sin,
1028                   uint16_t hash)
1029 {
1030         struct inpcb *inp = tp->t_inpcb, *oinp;
1031         struct socket *so = inp->inp_socket;
1032         struct route *ro = &inp->inp_route;
1033
1034         KASSERT(inp->inp_pcbinfo == &tcbinfo[mycpu->gd_cpuid],
1035             ("pcbinfo mismatch"));
1036
1037         oinp = in_pcblookup_hash(inp->inp_pcbinfo,
1038                                  sin->sin_addr, sin->sin_port,
1039                                  (inp->inp_laddr.s_addr != INADDR_ANY ?
1040                                   inp->inp_laddr : if_sin->sin_addr),
1041                                 inp->inp_lport, 0, NULL);
1042         if (oinp != NULL) {
1043                 m_freem(m);
1044                 return (EADDRINUSE);
1045         }
1046         if (inp->inp_laddr.s_addr == INADDR_ANY)
1047                 inp->inp_laddr = if_sin->sin_addr;
1048         KASSERT(inp->inp_faddr.s_addr == sin->sin_addr.s_addr,
1049             ("faddr mismatch for reconnect"));
1050         KASSERT(inp->inp_fport == sin->sin_port,
1051             ("fport mismatch for reconnect"));
1052         in_pcbinsconnhash(inp);
1053
1054         inp->inp_flags |= INP_HASH;
1055         inp->inp_hashval = hash;
1056
1057         /*
1058          * We are now on the inpcb's owner CPU, if the cached route was
1059          * freed because the rtentry's owner CPU is not the current CPU
1060          * (e.g. in tcp_connect()), then we try to reallocate it here with
1061          * the hope that a rtentry may be cloned from a RTF_PRCLONING
1062          * rtentry.
1063          */
1064         if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
1065             ro->ro_rt == NULL) {
1066                 bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
1067                 ro->ro_dst.sa_family = AF_INET;
1068                 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1069                 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr =
1070                         sin->sin_addr;
1071                 rtalloc(ro);
1072         }
1073
1074         /*
1075          * Now that no more errors can occur, change the protocol processing
1076          * port to the current thread (which is the correct thread).
1077          *
1078          * Create TCP timer message now; we are on the tcpcb's owner
1079          * CPU/thread.
1080          */
1081         tcp_create_timermsg(tp, &curthread->td_msgport);
1082
1083         /*
1084          * Compute window scaling to request.  Use a larger scaling then
1085          * needed for the initial receive buffer in case the receive buffer
1086          * gets expanded.
1087          */
1088         if (tp->request_r_scale < TCP_MIN_WINSHIFT)
1089                 tp->request_r_scale = TCP_MIN_WINSHIFT;
1090         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1091                (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat
1092         ) {
1093                 tp->request_r_scale++;
1094         }
1095
1096         soisconnecting(so);
1097         tcpstat.tcps_connattempt++;
1098         TCP_STATE_CHANGE(tp, TCPS_SYN_SENT);
1099         tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep);
1100         tp->iss = tcp_new_isn(tp);
1101         tcp_sendseqinit(tp);
1102         if (m) {
1103                 ssb_appendstream(&so->so_snd, m);
1104                 m = NULL;
1105                 if (flags & PRUS_OOB)
1106                         tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
1107         }
1108
1109         /*
1110          * Close the send side of the connection after
1111          * the data is sent if flagged.
1112          */
1113         if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
1114                 socantsendmore(so);
1115                 tp = tcp_usrclosed(tp);
1116         }
1117         return (tcp_output(tp));
1118 }
1119
1120 /*
1121  * Common subroutine to open a TCP connection to remote host specified
1122  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
1123  * port number if needed.  Call in_pcbladdr to do the routing and to choose
1124  * a local host address (interface).
1125  * Initialize connection parameters and enter SYN-SENT state.
1126  */
1127 static void
1128 tcp_connect(netmsg_t msg)
1129 {
1130         struct socket *so = msg->connect.base.nm_so;
1131         struct sockaddr *nam = msg->connect.nm_nam;
1132         struct thread *td = msg->connect.nm_td;
1133         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1134         struct sockaddr_in *if_sin = NULL;
1135         struct inpcb *inp;
1136         struct tcpcb *tp;
1137         int error;
1138         uint16_t hash;
1139         lwkt_port_t port;
1140
1141         COMMON_START(so, inp, 0);
1142
1143         /*
1144          * Reconnect our pcb if we have to
1145          */
1146         if (msg->connect.nm_flags & PRUC_RECONNECT) {
1147                 msg->connect.nm_flags &= ~PRUC_RECONNECT;
1148                 TCP_STATE_MIGRATE_END(tp);
1149                 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1150         } else {
1151                 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1152                         error = EISCONN;
1153                         if (so->so_state & SS_ISCONNECTING)
1154                                 error = EALREADY;
1155                         goto out;
1156                 }
1157                 KASSERT(inp->inp_fport == 0, ("invalid fport"));
1158         }
1159
1160         /*
1161          * Select local port, if it is not yet selected.
1162          */
1163         if (inp->inp_lport == 0) {
1164                 KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY);
1165
1166                 error = in_pcbladdr(inp, nam, &if_sin, td);
1167                 if (error)
1168                         goto out;
1169                 inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr;
1170                 msg->connect.nm_flags |= PRUC_HASLADDR;
1171
1172                 /*
1173                  * Install faddr/fport earlier, so that when this
1174                  * inpcb is installed on to the lport hash, the
1175                  * 4-tuple contains correct value.
1176                  *
1177                  * NOTE: The faddr/fport will have to be installed
1178                  * after the in_pcbladdr(), which may change them.
1179                  */
1180                 inp->inp_faddr = sin->sin_addr;
1181                 inp->inp_fport = sin->sin_port;
1182
1183                 error = in_pcbbind_remote(inp, nam, td);
1184                 if (error)
1185                         goto out;
1186         }
1187
1188         if ((msg->connect.nm_flags & PRUC_HASLADDR) == 0) {
1189                 /*
1190                  * Rarely used path:
1191                  * This inpcb was bound before this connect.
1192                  */
1193                 error = in_pcbladdr(inp, nam, &if_sin, td);
1194                 if (error)
1195                         goto out;
1196
1197                 /*
1198                  * Save or refresh the faddr/fport, since they may
1199                  * be changed by in_pcbladdr().
1200                  */
1201                 inp->inp_faddr = sin->sin_addr;
1202                 inp->inp_fport = sin->sin_port;
1203         }
1204 #ifdef INVARIANTS
1205         else {
1206                 KASSERT(inp->inp_faddr.s_addr == sin->sin_addr.s_addr,
1207                     ("faddr mismatch for reconnect"));
1208                 KASSERT(inp->inp_fport == sin->sin_port,
1209                     ("fport mismatch for reconnect"));
1210         }
1211 #endif
1212         KKASSERT(inp->inp_socket == so);
1213
1214         hash = tcp_addrhash(sin->sin_addr.s_addr, sin->sin_port,
1215                             (inp->inp_laddr.s_addr != INADDR_ANY ?
1216                              inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr),
1217                             inp->inp_lport);
1218         port = netisr_hashport(hash);
1219
1220         if (port != &curthread->td_msgport) {
1221                 lwkt_msg_t lmsg = &msg->connect.base.lmsg;
1222
1223                 /*
1224                  * in_pcbladdr() may have allocated a route entry for us
1225                  * on the current CPU, but we need a route entry on the
1226                  * inpcb's owner CPU, so free it here.
1227                  */
1228                 in_pcbresetroute(inp);
1229
1230                 /*
1231                  * We are moving the protocol processing port the socket
1232                  * is on, we have to unlink here and re-link on the
1233                  * target cpu.
1234                  */
1235                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1236                 msg->connect.nm_flags |= PRUC_RECONNECT;
1237                 msg->connect.base.nm_dispatch = tcp_connect;
1238
1239                 TCP_STATE_MIGRATE_START(tp);
1240
1241                 /*
1242                  * Use message put done receipt to change this socket's
1243                  * so_port, i.e. _after_ this message was put onto the
1244                  * target netisr's msgport but _before_ the message could
1245                  * be pulled from the target netisr's msgport, so that:
1246                  * - The upper half (socket code) will not see the new
1247                  *   msgport before this message reaches the new msgport
1248                  *   and messages for this socket will be ordered.
1249                  * - This message will see the new msgport, when its
1250                  *   handler is called in the target netisr.
1251                  *
1252                  * NOTE:
1253                  * We MUST use messege put done receipt to change this
1254                  * socket's so_port:
1255                  * If we changed the so_port in this netisr after the
1256                  * lwkt_forwardmsg (so messages for this socket will be
1257                  * ordered) and changed the so_port in the target netisr
1258                  * at the very beginning of this message's handler, we
1259                  * would suffer so_port overwritten race, given this
1260                  * message might be forwarded again.
1261                  *
1262                  * NOTE:
1263                  * This mechanism depends on that the netisr's msgport
1264                  * is spin msgport (currently it is :).
1265                  *
1266                  * If the upper half saw the new msgport before this
1267                  * message reached the target netisr's msgport, the
1268                  * messages sent from the upper half could reach the new
1269                  * msgport before this message, thus there would be
1270                  * message reordering.  The worst case could be soclose()
1271                  * saw the new msgport and the detach message could reach
1272                  * the new msgport before this message, i.e. the inpcb
1273                  * could have been destroyed when this message was still
1274                  * pending on or on its way to the new msgport.  Other
1275                  * weird cases could also happen, e.g. inpcb->inp_pcbinfo,
1276                  * since we have unlinked this inpcb from the current
1277                  * pcbinfo first.
1278                  */
1279                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
1280                 lwkt_forwardmsg(port, lmsg);
1281                 /* msg invalid now */
1282                 return;
1283         } else if (msg->connect.nm_flags & PRUC_HELDTD) {
1284                 /*
1285                  * The original thread is no longer needed; release it.
1286                  */
1287                 lwkt_rele(td);
1288                 msg->connect.nm_flags &= ~PRUC_HELDTD;
1289         }
1290         error = tcp_connect_oncpu(tp, msg->connect.nm_sndflags,
1291                                   msg->connect.nm_m, sin, if_sin, hash);
1292         msg->connect.nm_m = NULL;
1293 out:
1294         if (msg->connect.nm_m) {
1295                 m_freem(msg->connect.nm_m);
1296                 msg->connect.nm_m = NULL;
1297         }
1298         if (msg->connect.nm_flags & PRUC_HELDTD)
1299                 lwkt_rele(td);
1300         if (error && (msg->connect.nm_flags & PRUC_ASYNC)) {
1301                 so->so_error = error;
1302                 soisdisconnected(so);
1303         }
1304         lwkt_replymsg(&msg->connect.base.lmsg, error);
1305         /* msg invalid now */
1306 }
1307
1308 #ifdef INET6
1309
1310 static void
1311 tcp6_connect(netmsg_t msg)
1312 {
1313         struct tcpcb *tp;
1314         struct socket *so = msg->connect.base.nm_so;
1315         struct sockaddr *nam = msg->connect.nm_nam;
1316         struct thread *td = msg->connect.nm_td;
1317         struct inpcb *inp;
1318         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
1319         struct in6_addr *addr6;
1320         lwkt_port_t port;
1321         int error;
1322
1323         COMMON_START(so, inp, 0);
1324
1325         /*
1326          * Reconnect our pcb if we have to
1327          */
1328         if (msg->connect.nm_flags & PRUC_RECONNECT) {
1329                 msg->connect.nm_flags &= ~PRUC_RECONNECT;
1330                 TCP_STATE_MIGRATE_END(tp);
1331                 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1332         }
1333
1334         /*
1335          * Bind if we have to
1336          */
1337         if (inp->inp_lport == 0) {
1338                 error = in6_pcbbind(inp, NULL, td);
1339                 if (error)
1340                         goto out;
1341         }
1342
1343         /*
1344          * Cannot simply call in_pcbconnect, because there might be an
1345          * earlier incarnation of this same connection still in
1346          * TIME_WAIT state, creating an ADDRINUSE error.
1347          */
1348         error = in6_pcbladdr(inp, nam, &addr6, td);
1349         if (error)
1350                 goto out;
1351
1352         port = tcp6_addrport(); /* XXX hack for now, always cpu0 */
1353
1354         if (port != &curthread->td_msgport) {
1355                 lwkt_msg_t lmsg = &msg->connect.base.lmsg;
1356
1357                 /*
1358                  * in_pcbladdr() may have allocated a route entry for us
1359                  * on the current CPU, but we need a route entry on the
1360                  * inpcb's owner CPU, so free it here.
1361                  */
1362                 in_pcbresetroute(inp);
1363
1364                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1365                 msg->connect.nm_flags |= PRUC_RECONNECT;
1366                 msg->connect.base.nm_dispatch = tcp6_connect;
1367
1368                 TCP_STATE_MIGRATE_START(tp);
1369
1370                 /* See the related comment in tcp_connect() */
1371                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
1372                 lwkt_forwardmsg(port, lmsg);
1373                 /* msg invalid now */
1374                 return;
1375         }
1376         error = tcp6_connect_oncpu(tp, msg->connect.nm_sndflags,
1377                                    &msg->connect.nm_m, sin6, addr6);
1378         /* nm_m may still be intact */
1379 out:
1380         if (msg->connect.nm_m) {
1381                 m_freem(msg->connect.nm_m);
1382                 msg->connect.nm_m = NULL;
1383         }
1384         lwkt_replymsg(&msg->connect.base.lmsg, error);
1385         /* msg invalid now */
1386 }
1387
1388 static int
1389 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp,
1390                    struct sockaddr_in6 *sin6, struct in6_addr *addr6)
1391 {
1392         struct mbuf *m = *mp;
1393         struct inpcb *inp = tp->t_inpcb;
1394         struct socket *so = inp->inp_socket;
1395         struct inpcb *oinp;
1396
1397         /*
1398          * Cannot simply call in_pcbconnect, because there might be an
1399          * earlier incarnation of this same connection still in
1400          * TIME_WAIT state, creating an ADDRINUSE error.
1401          */
1402         oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
1403                                   &sin6->sin6_addr, sin6->sin6_port,
1404                                   (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ?
1405                                       addr6 : &inp->in6p_laddr),
1406                                   inp->inp_lport,  0, NULL);
1407         if (oinp)
1408                 return (EADDRINUSE);
1409
1410         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1411                 inp->in6p_laddr = *addr6;
1412         inp->in6p_faddr = sin6->sin6_addr;
1413         inp->inp_fport = sin6->sin6_port;
1414         if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
1415                 inp->in6p_flowinfo = sin6->sin6_flowinfo;
1416         in_pcbinsconnhash(inp);
1417
1418         /*
1419          * Now that no more errors can occur, change the protocol processing
1420          * port to the current thread (which is the correct thread).
1421          *
1422          * Create TCP timer message now; we are on the tcpcb's owner
1423          * CPU/thread.
1424          */
1425         tcp_create_timermsg(tp, &curthread->td_msgport);
1426
1427         /* Compute window scaling to request.  */
1428         if (tp->request_r_scale < TCP_MIN_WINSHIFT)
1429                 tp->request_r_scale = TCP_MIN_WINSHIFT;
1430         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1431             (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) {
1432                 tp->request_r_scale++;
1433         }
1434
1435         soisconnecting(so);
1436         tcpstat.tcps_connattempt++;
1437         TCP_STATE_CHANGE(tp, TCPS_SYN_SENT);
1438         tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep);
1439         tp->iss = tcp_new_isn(tp);
1440         tcp_sendseqinit(tp);
1441         if (m) {
1442                 ssb_appendstream(&so->so_snd, m);
1443                 *mp = NULL;
1444                 if (flags & PRUS_OOB)
1445                         tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
1446         }
1447
1448         /*
1449          * Close the send side of the connection after
1450          * the data is sent if flagged.
1451          */
1452         if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
1453                 socantsendmore(so);
1454                 tp = tcp_usrclosed(tp);
1455         }
1456         return (tcp_output(tp));
1457 }
1458
1459 #endif /* INET6 */
1460
1461 /*
1462  * The new sockopt interface makes it possible for us to block in the
1463  * copyin/out step (if we take a page fault).  Taking a page fault while
1464  * in a critical section is probably a Bad Thing.  (Since sockets and pcbs
1465  * both now use TSM, there probably isn't any need for this function to 
1466  * run in a critical section any more.  This needs more examination.)
1467  */
1468 void
1469 tcp_ctloutput(netmsg_t msg)
1470 {
1471         struct socket *so = msg->base.nm_so;
1472         struct sockopt *sopt = msg->ctloutput.nm_sopt;
1473         struct thread *td = NULL;
1474         int     error, opt, optval, opthz;
1475         struct  inpcb *inp;
1476         struct  tcpcb *tp;
1477
1478         if (msg->ctloutput.nm_flags & PRCO_HELDTD)
1479                 td = sopt->sopt_td;
1480
1481         error = 0;
1482         inp = so->so_pcb;
1483         if (inp == NULL) {
1484                 error = ECONNRESET;
1485                 goto done;
1486         }
1487         tp = intotcpcb(inp);
1488
1489         /* Get socket's owner cpuid hint */
1490         if (sopt->sopt_level == SOL_SOCKET &&
1491             sopt->sopt_dir == SOPT_GET &&
1492             sopt->sopt_name == SO_CPUHINT) {
1493                 if (tp->t_flags & TF_LISTEN) {
1494                         /*
1495                          * Listen sockets owner cpuid is always 0,
1496                          * which does not make sense if SO_REUSEPORT
1497                          * is not set.
1498                          *
1499                          * NOTE: inp_lgrpindex is _not_ assigned in jail.
1500                          */
1501                         if ((so->so_options & SO_REUSEPORT) &&
1502                             inp->inp_lgrpindex >= 0)
1503                                 optval = inp->inp_lgrpindex % netisr_ncpus;
1504                         else
1505                                 optval = -1; /* no hint */
1506                 } else {
1507                         optval = mycpuid;
1508                 }
1509                 soopt_from_kbuf(sopt, &optval, sizeof(optval));
1510                 goto done;
1511         }
1512
1513         if (sopt->sopt_level != IPPROTO_TCP) {
1514                 if (sopt->sopt_level == IPPROTO_IP) {
1515                         switch (sopt->sopt_name) {
1516                         case IP_MULTICAST_IF:
1517                         case IP_MULTICAST_VIF:
1518                         case IP_MULTICAST_TTL:
1519                         case IP_MULTICAST_LOOP:
1520                         case IP_ADD_MEMBERSHIP:
1521                         case IP_DROP_MEMBERSHIP:
1522                                 /*
1523                                  * Multicast does not make sense on
1524                                  * TCP sockets.
1525                                  */
1526                                 error = EOPNOTSUPP;
1527                                 goto done;
1528                         }
1529                 }
1530 #ifdef INET6
1531                 if (INP_CHECK_SOCKAF(so, AF_INET6))
1532                         ip6_ctloutput_dispatch(msg);
1533                 else
1534 #endif /* INET6 */
1535                 ip_ctloutput(msg);
1536                 /* msg invalid now */
1537                 if (td != NULL)
1538                         lwkt_rele(td);
1539                 return;
1540         }
1541
1542         switch (sopt->sopt_dir) {
1543         case SOPT_SET:
1544                 error = soopt_to_kbuf(sopt, &optval, sizeof optval,
1545                                       sizeof optval);
1546                 if (error)
1547                         break;
1548                 switch (sopt->sopt_name) {
1549                 case TCP_FASTKEEP:
1550                         if (optval > 0)
1551                                 tp->t_keepidle = tp->t_keepintvl;
1552                         else
1553                                 tp->t_keepidle = tcp_keepidle;
1554                         tcp_timer_keep_activity(tp, 0);
1555                         break;
1556 #ifdef TCP_SIGNATURE
1557                 case TCP_SIGNATURE_ENABLE:
1558                         if (tp->t_state == TCPS_CLOSED) {
1559                                 /*
1560                                  * This is the only safe state that this
1561                                  * option could be changed.  Some segments
1562                                  * could already have been sent in other
1563                                  * states.
1564                                  */
1565                                 if (optval > 0)
1566                                         tp->t_flags |= TF_SIGNATURE;
1567                                 else
1568                                         tp->t_flags &= ~TF_SIGNATURE;
1569                         } else {
1570                                 error = EOPNOTSUPP;
1571                         }
1572                         break;
1573 #endif /* TCP_SIGNATURE */
1574                 case TCP_NODELAY:
1575                 case TCP_NOOPT:
1576                         switch (sopt->sopt_name) {
1577                         case TCP_NODELAY:
1578                                 opt = TF_NODELAY;
1579                                 break;
1580                         case TCP_NOOPT:
1581                                 opt = TF_NOOPT;
1582                                 break;
1583                         default:
1584                                 opt = 0; /* dead code to fool gcc */
1585                                 break;
1586                         }
1587
1588                         if (optval)
1589                                 tp->t_flags |= opt;
1590                         else
1591                                 tp->t_flags &= ~opt;
1592                         break;
1593
1594                 case TCP_NOPUSH:
1595                         if (tcp_disable_nopush)
1596                                 break;
1597                         if (optval)
1598                                 tp->t_flags |= TF_NOPUSH;
1599                         else {
1600                                 tp->t_flags &= ~TF_NOPUSH;
1601                                 error = tcp_output(tp);
1602                         }
1603                         break;
1604
1605                 case TCP_MAXSEG:
1606                         /*
1607                          * Must be between 0 and maxseg.  If the requested
1608                          * maxseg is too small to satisfy the desired minmss,
1609                          * pump it up (silently so sysctl modifications of
1610                          * minmss do not create unexpected program failures).
1611                          * Handle degenerate cases.
1612                          */
1613                         if (optval > 0 && optval <= tp->t_maxseg) {
1614                                 if (optval + 40 < tcp_minmss) {
1615                                         optval = tcp_minmss - 40;
1616                                         if (optval < 0)
1617                                                 optval = 1;
1618                                 }
1619                                 tp->t_maxseg = optval;
1620                         } else {
1621                                 error = EINVAL;
1622                         }
1623                         break;
1624
1625                 case TCP_KEEPINIT:
1626                 case TCP_KEEPIDLE:
1627                 case TCP_KEEPINTVL:
1628                         if (optval < 1 || optval > MAXKEEPALIVE) {
1629                                 error = EINVAL;
1630                                 break;
1631                         }
1632                         opthz = optval * hz;
1633
1634                         switch (sopt->sopt_name) {
1635                         case TCP_KEEPINIT:
1636                                 tp->t_keepinit = opthz;
1637                                 break;
1638                         case TCP_KEEPIDLE:
1639                                 tp->t_keepidle = opthz;
1640                                 tcp_timer_keep_activity(tp, 0);
1641                                 break;
1642                         case TCP_KEEPINTVL:
1643                                 tp->t_keepintvl = opthz;
1644                                 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt;
1645                                 break;
1646                         }
1647                         break;
1648
1649                 case TCP_KEEPCNT:
1650                         if (optval < 1 || optval > MAXKEEPCNT) {
1651                                 error = EINVAL;
1652                                 break;
1653                         }
1654                         tp->t_keepcnt = optval;
1655                         tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt;
1656                         break;
1657
1658                 default:
1659                         error = ENOPROTOOPT;
1660                         break;
1661                 }
1662                 break;
1663
1664         case SOPT_GET:
1665                 switch (sopt->sopt_name) {
1666 #ifdef TCP_SIGNATURE
1667                 case TCP_SIGNATURE_ENABLE:
1668                         optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1669                         break;
1670 #endif /* TCP_SIGNATURE */
1671                 case TCP_NODELAY:
1672                         optval = tp->t_flags & TF_NODELAY;
1673                         break;
1674                 case TCP_MAXSEG:
1675                         optval = tp->t_maxseg;
1676                         break;
1677                 case TCP_NOOPT:
1678                         optval = tp->t_flags & TF_NOOPT;
1679                         break;
1680                 case TCP_NOPUSH:
1681                         optval = tp->t_flags & TF_NOPUSH;
1682                         break;
1683                 case TCP_KEEPINIT:
1684                         optval = tp->t_keepinit / hz;
1685                         break;
1686                 case TCP_KEEPIDLE:
1687                         optval = tp->t_keepidle / hz;
1688                         break;
1689                 case TCP_KEEPINTVL:
1690                         optval = tp->t_keepintvl / hz;
1691                         break;
1692                 case TCP_KEEPCNT:
1693                         optval = tp->t_keepcnt;
1694                         break;
1695                 default:
1696                         error = ENOPROTOOPT;
1697                         break;
1698                 }
1699                 if (error == 0)
1700                         soopt_from_kbuf(sopt, &optval, sizeof optval);
1701                 break;
1702         }
1703 done:
1704         if (td != NULL)
1705                 lwkt_rele(td);
1706         lwkt_replymsg(&msg->lmsg, error);
1707 }
1708
1709 struct netmsg_tcp_ctloutput {
1710         struct netmsg_pr_ctloutput ctloutput;
1711         struct sockopt          sopt;
1712         int                     sopt_val;
1713 };
1714
1715 /*
1716  * Allocate netmsg_pr_ctloutput for asynchronous tcp_ctloutput.
1717  */
1718 struct netmsg_pr_ctloutput *
1719 tcp_ctloutmsg(struct sockopt *sopt)
1720 {
1721         struct netmsg_tcp_ctloutput *msg;
1722         int flags = 0, error;
1723
1724         KASSERT(sopt->sopt_dir == SOPT_SET, ("not from ctloutput"));
1725
1726         /* Only small set of options allows asynchronous setting. */
1727         if (sopt->sopt_level != IPPROTO_TCP)
1728                 return NULL;
1729         switch (sopt->sopt_name) {
1730         case TCP_NODELAY:
1731         case TCP_NOOPT:
1732         case TCP_NOPUSH:
1733         case TCP_FASTKEEP:
1734                 break;
1735         default:
1736                 return NULL;
1737         }
1738
1739         msg = kmalloc(sizeof(*msg), M_LWKTMSG, M_WAITOK | M_NULLOK);
1740         if (msg == NULL) {
1741                 /* Fallback to synchronous tcp_ctloutput */
1742                 return NULL;
1743         }
1744
1745         /* Save the sockopt */
1746         msg->sopt = *sopt;
1747
1748         /* Fixup the sopt.sopt_val ptr */
1749         error = sooptcopyin(sopt, &msg->sopt_val,
1750             sizeof(msg->sopt_val), sizeof(msg->sopt_val));
1751         if (error) {
1752                 kfree(msg, M_LWKTMSG);
1753                 return NULL;
1754         }
1755         msg->sopt.sopt_val = &msg->sopt_val;
1756
1757         /* Hold the current thread */
1758         if (msg->sopt.sopt_td != NULL) {
1759                 flags |= PRCO_HELDTD;
1760                 lwkt_hold(msg->sopt.sopt_td);
1761         }
1762
1763         msg->ctloutput.nm_flags = flags;
1764         msg->ctloutput.nm_sopt = &msg->sopt;
1765
1766         return &msg->ctloutput;
1767 }
1768
1769 /*
1770  * tcp_sendspace and tcp_recvspace are the default send and receive window
1771  * sizes, respectively.  These are obsolescent (this information should
1772  * be set by the route).
1773  *
1774  * Use a default that does not require tcp window scaling to be turned
1775  * on.  Individual programs or the administrator can increase the default.
1776  */
1777 u_long  tcp_sendspace = 57344;  /* largest multiple of PAGE_SIZE < 64k */
1778 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1779     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1780 u_long  tcp_recvspace = 57344;  /* largest multiple of PAGE_SIZE < 64k */
1781 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1782     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1783
1784 /*
1785  * Attach TCP protocol to socket, allocating internet protocol control
1786  * block, tcp control block, buffer space, and entering CLOSED state.
1787  */
1788 static int
1789 tcp_attach(struct socket *so, struct pru_attach_info *ai)
1790 {
1791         struct inpcb *inp;
1792         int error;
1793         int cpu;
1794 #ifdef INET6
1795         boolean_t isipv6 = INP_CHECK_SOCKAF(so, AF_INET6);
1796 #endif
1797
1798         if (ai != NULL) {
1799                 error = tcp_usr_preattach(so, 0 /* don't care */, ai);
1800                 if (error)
1801                         return (error);
1802         } else {
1803                 /* Post attach; do nothing */
1804         }
1805
1806         cpu = mycpu->gd_cpuid;
1807
1808         /*
1809          * Set the default pcbinfo.  This will likely change when we
1810          * bind/connect.
1811          */
1812         error = in_pcballoc(so, &tcbinfo[cpu]);
1813         if (error)
1814                 return (error);
1815         inp = so->so_pcb;
1816 #ifdef INET6
1817         if (isipv6)
1818                 inp->in6p_hops = -1;    /* use kernel default */
1819 #endif
1820         tcp_newtcpcb(inp);
1821         /* Keep a reference for asynchronized pru_rcvd */
1822         soreference(so);
1823         return (0);
1824 }
1825
1826 /*
1827  * Initiate (or continue) disconnect.
1828  * If embryonic state, just send reset (once).
1829  * If in ``let data drain'' option and linger null, just drop.
1830  * Otherwise (hard), mark socket disconnecting and drop
1831  * current input data; switch states based on user close, and
1832  * send segment to peer (with FIN).
1833  */
1834 static struct tcpcb *
1835 tcp_disconnect(struct tcpcb *tp)
1836 {
1837         struct socket *so = tp->t_inpcb->inp_socket;
1838
1839         if (tp->t_state < TCPS_ESTABLISHED) {
1840                 tp = tcp_close(tp);
1841         } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
1842                 tp = tcp_drop(tp, 0);
1843         } else {
1844                 lwkt_gettoken(&so->so_rcv.ssb_token);
1845                 soisdisconnecting(so);
1846                 sbflush(&so->so_rcv.sb);
1847                 tp = tcp_usrclosed(tp);
1848                 if (tp)
1849                         tcp_output(tp);
1850                 lwkt_reltoken(&so->so_rcv.ssb_token);
1851         }
1852         return (tp);
1853 }
1854
1855 /*
1856  * User issued close, and wish to trail through shutdown states:
1857  * if never received SYN, just forget it.  If got a SYN from peer,
1858  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1859  * If already got a FIN from peer, then almost done; go to LAST_ACK
1860  * state.  In all other cases, have already sent FIN to peer (e.g.
1861  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1862  * for peer to send FIN or not respond to keep-alives, etc.
1863  * We can let the user exit from the close as soon as the FIN is acked.
1864  */
1865 static struct tcpcb *
1866 tcp_usrclosed(struct tcpcb *tp)
1867 {
1868
1869         switch (tp->t_state) {
1870
1871         case TCPS_CLOSED:
1872         case TCPS_LISTEN:
1873                 TCP_STATE_CHANGE(tp, TCPS_CLOSED);
1874                 tp = tcp_close(tp);
1875                 break;
1876
1877         case TCPS_SYN_SENT:
1878         case TCPS_SYN_RECEIVED:
1879                 tp->t_flags |= TF_NEEDFIN;
1880                 break;
1881
1882         case TCPS_ESTABLISHED:
1883                 TCP_STATE_CHANGE(tp, TCPS_FIN_WAIT_1);
1884                 break;
1885
1886         case TCPS_CLOSE_WAIT:
1887                 TCP_STATE_CHANGE(tp, TCPS_LAST_ACK);
1888                 break;
1889         }
1890         if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1891                 soisdisconnected(tp->t_inpcb->inp_socket);
1892                 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
1893                 if (tp->t_state == TCPS_FIN_WAIT_2) {
1894                         tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle,
1895                             tcp_timer_2msl);
1896                 }
1897         }
1898         return (tp);
1899 }