tcp/usrreq: tcp_newtcpcb() never fails
[dragonfly.git] / sys / netinet / tcp_usrreq.c
1 /*
2  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1993
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      From: @(#)tcp_usrreq.c  8.2 (Berkeley) 1/3/94
63  * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $
64  */
65
66 #include "opt_ipsec.h"
67 #include "opt_inet.h"
68 #include "opt_inet6.h"
69 #include "opt_tcpdebug.h"
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/malloc.h>
75 #include <sys/sysctl.h>
76 #include <sys/globaldata.h>
77 #include <sys/thread.h>
78
79 #include <sys/mbuf.h>
80 #ifdef INET6
81 #include <sys/domain.h>
82 #endif /* INET6 */
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/socketops.h>
86 #include <sys/protosw.h>
87
88 #include <sys/thread2.h>
89 #include <sys/msgport2.h>
90 #include <sys/socketvar2.h>
91
92 #include <net/if.h>
93 #include <net/netisr.h>
94 #include <net/route.h>
95
96 #include <net/netmsg2.h>
97 #include <net/netisr2.h>
98
99 #include <netinet/in.h>
100 #include <netinet/in_systm.h>
101 #ifdef INET6
102 #include <netinet/ip6.h>
103 #endif
104 #include <netinet/in_pcb.h>
105 #ifdef INET6
106 #include <netinet6/in6_pcb.h>
107 #endif
108 #include <netinet/in_var.h>
109 #include <netinet/ip_var.h>
110 #ifdef INET6
111 #include <netinet6/ip6_var.h>
112 #include <netinet6/tcp6_var.h>
113 #endif
114 #include <netinet/tcp.h>
115 #include <netinet/tcp_fsm.h>
116 #include <netinet/tcp_seq.h>
117 #include <netinet/tcp_timer.h>
118 #include <netinet/tcp_timer2.h>
119 #include <netinet/tcp_var.h>
120 #include <netinet/tcpip.h>
121 #ifdef TCPDEBUG
122 #include <netinet/tcp_debug.h>
123 #endif
124
125 #ifdef IPSEC
126 #include <netinet6/ipsec.h>
127 #endif /*IPSEC*/
128
129 /*
130  * TCP protocol interface to socket abstraction.
131  */
132 extern  char *tcpstates[];      /* XXX ??? */
133
134 static int      tcp_attach (struct socket *, struct pru_attach_info *);
135 static void     tcp_connect (netmsg_t msg);
136 #ifdef INET6
137 static void     tcp6_connect (netmsg_t msg);
138 static int      tcp6_connect_oncpu(struct tcpcb *tp, int flags,
139                                 struct mbuf **mp,
140                                 struct sockaddr_in6 *sin6,
141                                 struct in6_addr *addr6);
142 #endif /* INET6 */
143 static struct tcpcb *
144                 tcp_disconnect (struct tcpcb *);
145 static struct tcpcb *
146                 tcp_usrclosed (struct tcpcb *);
147
/*
 * Debug tracing hooks used by the pru_*() handlers in this file.  All
 * three expand to nothing unless TCPDEBUG is defined.
 *
 * TCPDEBUG0	declares the local (ostate), initialized to 0.
 * TCPDEBUG1()	stores tp->t_state in (ostate), or 0 when tp is NULL.
 * TCPDEBUG2()	calls tcp_trace() for (req) when tp is non-NULL and the
 *		socket has SO_DEBUG set.
 *
 * NOTE: The variables (so) and (tp) are assumed.
 */
#ifndef TCPDEBUG
#define TCPDEBUG0
#define TCPDEBUG1()
#define TCPDEBUG2(req)
#else
#define TCPDEBUG0	int ostate = 0
#define TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define TCPDEBUG2(req)							\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, req)
#endif
158
159 static int      tcp_lport_extension = 1;
160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW,
161     &tcp_lport_extension, 0, "");
162
163 /*
164  * For some ill optimized programs, which try to use TCP_NOPUSH
165  * to improve performance, will have small amount of data sits
166  * in the sending buffer.  These small amount of data will _not_
167  * be pushed into the network until more data are written into
168  * the socket or the socket write side is shutdown.
169  */ 
170 static int      tcp_disable_nopush = 1;
171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW,
172     &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect");
173
174 /*
175  * Allocate socket buffer space.
176  */
177 static int
178 tcp_usr_preattach(struct socket *so, int proto __unused,
179     struct pru_attach_info *ai)
180 {
181         int error;
182
183         if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
184                 error = soreserve(so, tcp_sendspace, tcp_recvspace,
185                                   ai->sb_rlimit);
186                 if (error)
187                         return (error);
188         }
189         atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC);
190         atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC);
191
192         return 0;
193 }
194
195 /*
196  * TCP attaches to socket via pru_attach(), reserving space,
197  * and an internet control block.  This socket may move to
198  * other CPU later when we bind/connect.
199  */
200 static void
201 tcp_usr_attach(netmsg_t msg)
202 {
203         struct socket *so = msg->base.nm_so;
204         struct pru_attach_info *ai = msg->attach.nm_ai;
205         int error;
206         struct inpcb *inp;
207         struct tcpcb *tp = NULL;
208         TCPDEBUG0;
209
210         inp = so->so_pcb;
211         KASSERT(inp == NULL, ("tcp socket attached"));
212         TCPDEBUG1();
213
214         error = tcp_attach(so, ai);
215         if (error)
216                 goto out;
217
218         if ((so->so_options & SO_LINGER) && so->so_linger == 0)
219                 so->so_linger = TCP_LINGERTIME;
220         tp = sototcpcb(so);
221 out:
222         TCPDEBUG2(PRU_ATTACH);
223         lwkt_replymsg(&msg->lmsg, error);
224 }
225
226 /*
227  * pru_detach() detaches the TCP protocol from the socket.
228  * If the protocol state is non-embryonic, then can't
229  * do this directly: have to initiate a pru_disconnect(),
230  * which may finish later; embryonic TCB's can just
231  * be discarded here.
232  */
233 static void
234 tcp_usr_detach(netmsg_t msg)
235 {
236         struct socket *so = msg->base.nm_so;
237         int error = 0;
238         struct inpcb *inp;
239         struct tcpcb *tp;
240         TCPDEBUG0;
241
242         inp = so->so_pcb;
243
244         /*
245          * If the inp is already detached or never attached, it may have
246          * been due to an async close or async attach failure.  Just return
247          * as if no error occured.
248          *
249          * It's possible for the tcpcb (tp) to disconnect from the inp due
250          * to tcp_drop()->tcp_close() being called.  This may occur *after*
251          * the detach message has been queued so we may find a NULL tp here.
252          */
253         if (inp) {
254                 if ((tp = intotcpcb(inp)) != NULL) {
255                         TCPDEBUG1();
256                         tp = tcp_disconnect(tp);
257                         TCPDEBUG2(PRU_DETACH);
258                 }
259         }
260         lwkt_replymsg(&msg->lmsg, error);
261 }
262
/*
 * Shared prologue/epilogue for the pru_*() message handlers.
 *
 * COMMON_START() loads (inp) from so->so_pcb.  When it is NULL, (tp) is
 * cleared, (error) is set to 0 or EINVAL depending on ignore_error, and
 * control jumps to the out: label; otherwise (tp) is loaded through
 * intotcpcb().
 *
 * COMMON_END1() defines the out: label, emits the debug trace for (req),
 * replies to the message with (error) unless (noreply) is non-zero, and
 * returns.  COMMON_END() is COMMON_END1() with the reply always sent.
 *
 * NOTE: ignore_error is non-zero for certain disconnection races
 * which we want to silently allow, otherwise close() may return
 * an unexpected error.
 *
 * NOTE: The variables (msg), (tp) and (error) are assumed.
 */
#define COMMON_START(so, inp, ignore_error)			\
	TCPDEBUG0;						\
								\
	inp = so->so_pcb;					\
	do {							\
		if (inp != NULL) {				\
			tp = intotcpcb(inp);			\
			TCPDEBUG1();				\
		} else {					\
			tp = NULL;				\
			error = ignore_error ? 0 : EINVAL;	\
			goto out;				\
		}						\
	} while(0)

#define COMMON_END1(req, noreply)				\
	out: do {						\
		TCPDEBUG2(req);					\
		if ((noreply) == 0)				\
			lwkt_replymsg(&msg->lmsg, error);	\
		return;						\
	} while(0)

#define COMMON_END(req)		COMMON_END1((req), 0)
293
294 /*
295  * Give the socket an address.
296  */
297 static void
298 tcp_usr_bind(netmsg_t msg)
299 {
300         struct socket *so = msg->bind.base.nm_so;
301         struct sockaddr *nam = msg->bind.nm_nam;
302         struct thread *td = msg->bind.nm_td;
303         int error = 0;
304         struct inpcb *inp;
305         struct tcpcb *tp;
306         struct sockaddr_in *sinp;
307
308         COMMON_START(so, inp, 0);
309
310         /*
311          * Must check for multicast addresses and disallow binding
312          * to them.
313          */
314         sinp = (struct sockaddr_in *)nam;
315         if (sinp->sin_family == AF_INET &&
316             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
317                 error = EAFNOSUPPORT;
318                 goto out;
319         }
320         error = in_pcbbind(inp, nam, td);
321         if (error)
322                 goto out;
323
324         COMMON_END(PRU_BIND);
325 }
326
327 #ifdef INET6
328
329 static void
330 tcp6_usr_bind(netmsg_t msg)
331 {
332         struct socket *so = msg->bind.base.nm_so;
333         struct sockaddr *nam = msg->bind.nm_nam;
334         struct thread *td = msg->bind.nm_td;
335         int error = 0;
336         struct inpcb *inp;
337         struct tcpcb *tp;
338         struct sockaddr_in6 *sin6p;
339
340         COMMON_START(so, inp, 0);
341
342         /*
343          * Must check for multicast addresses and disallow binding
344          * to them.
345          */
346         sin6p = (struct sockaddr_in6 *)nam;
347         if (sin6p->sin6_family == AF_INET6 &&
348             IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
349                 error = EAFNOSUPPORT;
350                 goto out;
351         }
352         error = in6_pcbbind(inp, nam, td);
353         if (error)
354                 goto out;
355         COMMON_END(PRU_BIND);
356 }
357 #endif /* INET6 */
358
359 struct netmsg_inswildcard {
360         struct netmsg_base      base;
361         struct inpcb            *nm_inp;
362 };
363
364 static void
365 in_pcbinswildcardhash_handler(netmsg_t msg)
366 {
367         struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg;
368         int cpu = mycpuid, nextcpu;
369
370         in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]);
371
372         nextcpu = cpu + 1;
373         if (nextcpu < ncpus2)
374                 lwkt_forwardmsg(netisr_cpuport(nextcpu), &nm->base.lmsg);
375         else
376                 lwkt_replymsg(&nm->base.lmsg, 0);
377 }
378
379 static void
380 tcp_sosetport(struct lwkt_msg *msg, lwkt_port_t port)
381 {
382         sosetport(((struct netmsg_base *)msg)->nm_so, port);
383 }
384
385 /*
386  * Prepare to accept connections.
387  */
388 static void
389 tcp_usr_listen(netmsg_t msg)
390 {
391         struct socket *so = msg->listen.base.nm_so;
392         struct thread *td = msg->listen.nm_td;
393         int error = 0;
394         struct inpcb *inp;
395         struct tcpcb *tp;
396         struct netmsg_inswildcard nm;
397         lwkt_port_t port0 = netisr_cpuport(0);
398
399         COMMON_START(so, inp, 0);
400
401         if (&curthread->td_msgport != port0) {
402                 lwkt_msg_t lmsg = &msg->listen.base.lmsg;
403
404                 KASSERT((msg->listen.nm_flags & PRUL_RELINK) == 0,
405                     ("already asked to relink"));
406
407                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]);
408                 msg->listen.nm_flags |= PRUL_RELINK;
409
410                 /* See the related comment in tcp_connect() */
411                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
412                 lwkt_forwardmsg(port0, lmsg);
413                 /* msg invalid now */
414                 return;
415         }
416         KASSERT(so->so_port == port0, ("so_port is not netisr0"));
417
418         if (msg->listen.nm_flags & PRUL_RELINK) {
419                 msg->listen.nm_flags &= ~PRUL_RELINK;
420                 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]);
421         }
422         KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0"));
423
424         if (tp->t_flags & TF_LISTEN)
425                 goto out;
426
427         if (inp->inp_lport == 0) {
428                 error = in_pcbbind(inp, NULL, td);
429                 if (error)
430                         goto out;
431         }
432
433         tp->t_state = TCPS_LISTEN;
434         tp->t_flags |= TF_LISTEN;
435         tp->tt_msg = NULL; /* Catch any invalid timer usage */
436
437         if (ncpus2 > 1) {
438                 /*
439                  * Put this inpcb into wildcard hash on other cpus.
440                  */
441                 ASSERT_INP_NOTINHASH(inp);
442                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
443                             MSGF_PRIORITY, in_pcbinswildcardhash_handler);
444                 nm.nm_inp = inp;
445                 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
446         }
447         in_pcbinswildcardhash(inp);
448         COMMON_END(PRU_LISTEN);
449 }
450
451 #ifdef INET6
452
453 static void
454 tcp6_usr_listen(netmsg_t msg)
455 {
456         struct socket *so = msg->listen.base.nm_so;
457         struct thread *td = msg->listen.nm_td;
458         int error = 0;
459         struct inpcb *inp;
460         struct tcpcb *tp;
461         struct netmsg_inswildcard nm;
462
463         COMMON_START(so, inp, 0);
464
465         if (tp->t_flags & TF_LISTEN)
466                 goto out;
467
468         if (inp->inp_lport == 0) {
469                 error = in6_pcbbind(inp, NULL, td);
470                 if (error)
471                         goto out;
472         }
473
474         tp->t_state = TCPS_LISTEN;
475         tp->t_flags |= TF_LISTEN;
476         tp->tt_msg = NULL; /* Catch any invalid timer usage */
477
478         if (ncpus2 > 1) {
479                 /*
480                  * Put this inpcb into wildcard hash on other cpus.
481                  */
482                 KKASSERT(so->so_port == netisr_cpuport(0));
483                 ASSERT_IN_NETISR(0);
484                 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]);
485                 ASSERT_INP_NOTINHASH(inp);
486
487                 netmsg_init(&nm.base, NULL, &curthread->td_msgport,
488                             MSGF_PRIORITY, in_pcbinswildcardhash_handler);
489                 nm.nm_inp = inp;
490                 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
491         }
492         in_pcbinswildcardhash(inp);
493         COMMON_END(PRU_LISTEN);
494 }
495 #endif /* INET6 */
496
497 /*
498  * Initiate connection to peer.
499  * Create a template for use in transmissions on this connection.
500  * Enter SYN_SENT state, and mark socket as connecting.
501  * Start keep-alive timer, and seed output sequence space.
502  * Send initial segment on connection.
503  */
504 static void
505 tcp_usr_connect(netmsg_t msg)
506 {
507         struct socket *so = msg->connect.base.nm_so;
508         struct sockaddr *nam = msg->connect.nm_nam;
509         struct thread *td = msg->connect.nm_td;
510         int error = 0;
511         struct inpcb *inp;
512         struct tcpcb *tp;
513         struct sockaddr_in *sinp;
514
515         COMMON_START(so, inp, 0);
516
517         /*
518          * Must disallow TCP ``connections'' to multicast addresses.
519          */
520         sinp = (struct sockaddr_in *)nam;
521         if (sinp->sin_family == AF_INET
522             && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
523                 error = EAFNOSUPPORT;
524                 goto out;
525         }
526
527         if (!prison_remote_ip(td, (struct sockaddr*)sinp)) {
528                 error = EAFNOSUPPORT; /* IPv6 only jail */
529                 goto out;
530         }
531
532         tcp_connect(msg);
533         /* msg is invalid now */
534         return;
535 out:
536         if (msg->connect.nm_m) {
537                 m_freem(msg->connect.nm_m);
538                 msg->connect.nm_m = NULL;
539         }
540         if (msg->connect.nm_flags & PRUC_HELDTD)
541                 lwkt_rele(td);
542         if (error && (msg->connect.nm_flags & PRUC_ASYNC)) {
543                 so->so_error = error;
544                 soisdisconnected(so);
545         }
546         lwkt_replymsg(&msg->lmsg, error);
547 }
548
549 #ifdef INET6
550
551 static void
552 tcp6_usr_connect(netmsg_t msg)
553 {
554         struct socket *so = msg->connect.base.nm_so;
555         struct sockaddr *nam = msg->connect.nm_nam;
556         struct thread *td = msg->connect.nm_td;
557         int error = 0;
558         struct inpcb *inp;
559         struct tcpcb *tp;
560         struct sockaddr_in6 *sin6p;
561
562         COMMON_START(so, inp, 0);
563
564         /*
565          * Must disallow TCP ``connections'' to multicast addresses.
566          */
567         sin6p = (struct sockaddr_in6 *)nam;
568         if (sin6p->sin6_family == AF_INET6
569             && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
570                 error = EAFNOSUPPORT;
571                 goto out;
572         }
573
574         if (!prison_remote_ip(td, nam)) {
575                 error = EAFNOSUPPORT; /* IPv4 only jail */
576                 goto out;
577         }
578
579         /* Reject v4-mapped address */
580         if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
581                 error = EADDRNOTAVAIL;
582                 goto out;
583         }
584
585         inp->inp_inc.inc_isipv6 = 1;
586         tcp6_connect(msg);
587         /* msg is invalid now */
588         return;
589 out:
590         if (msg->connect.nm_m) {
591                 m_freem(msg->connect.nm_m);
592                 msg->connect.nm_m = NULL;
593         }
594         lwkt_replymsg(&msg->lmsg, error);
595 }
596
597 #endif /* INET6 */
598
599 /*
600  * Initiate disconnect from peer.
601  * If connection never passed embryonic stage, just drop;
602  * else if don't need to let data drain, then can just drop anyways,
603  * else have to begin TCP shutdown process: mark socket disconnecting,
604  * drain unread data, state switch to reflect user close, and
605  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
606  * when peer sends FIN and acks ours.
607  *
608  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
609  */
610 static void
611 tcp_usr_disconnect(netmsg_t msg)
612 {
613         struct socket *so = msg->disconnect.base.nm_so;
614         int error = 0;
615         struct inpcb *inp;
616         struct tcpcb *tp;
617
618         COMMON_START(so, inp, 1);
619         tp = tcp_disconnect(tp);
620         COMMON_END(PRU_DISCONNECT);
621 }
622
623 /*
624  * Accept a connection.  Essentially all the work is
625  * done at higher levels; just return the address
626  * of the peer, storing through addr.
627  */
628 static void
629 tcp_usr_accept(netmsg_t msg)
630 {
631         struct socket *so = msg->accept.base.nm_so;
632         struct sockaddr **nam = msg->accept.nm_nam;
633         int error = 0;
634         struct inpcb *inp;
635         struct tcpcb *tp = NULL;
636         TCPDEBUG0;
637
638         inp = so->so_pcb;
639         if (so->so_state & SS_ISDISCONNECTED) {
640                 error = ECONNABORTED;
641                 goto out;
642         }
643         if (inp == NULL) {
644                 error = EINVAL;
645                 goto out;
646         }
647
648         tp = intotcpcb(inp);
649         TCPDEBUG1();
650         in_setpeeraddr(so, nam);
651         COMMON_END(PRU_ACCEPT);
652 }
653
654 #ifdef INET6
655 static void
656 tcp6_usr_accept(netmsg_t msg)
657 {
658         struct socket *so = msg->accept.base.nm_so;
659         struct sockaddr **nam = msg->accept.nm_nam;
660         int error = 0;
661         struct inpcb *inp;
662         struct tcpcb *tp = NULL;
663         TCPDEBUG0;
664
665         inp = so->so_pcb;
666
667         if (so->so_state & SS_ISDISCONNECTED) {
668                 error = ECONNABORTED;
669                 goto out;
670         }
671         if (inp == NULL) {
672                 error = EINVAL;
673                 goto out;
674         }
675         tp = intotcpcb(inp);
676         TCPDEBUG1();
677         in6_setpeeraddr(so, nam);
678         COMMON_END(PRU_ACCEPT);
679 }
680 #endif /* INET6 */
681
682 /*
683  * Mark the connection as being incapable of further output.
684  */
685 static void
686 tcp_usr_shutdown(netmsg_t msg)
687 {
688         struct socket *so = msg->shutdown.base.nm_so;
689         int error = 0;
690         struct inpcb *inp;
691         struct tcpcb *tp;
692
693         COMMON_START(so, inp, 0);
694         socantsendmore(so);
695         tp = tcp_usrclosed(tp);
696         if (tp)
697                 error = tcp_output(tp);
698         COMMON_END(PRU_SHUTDOWN);
699 }
700
701 /*
702  * After a receive, possibly send window update to peer.
703  */
704 static void
705 tcp_usr_rcvd(netmsg_t msg)
706 {
707         struct socket *so = msg->rcvd.base.nm_so;
708         int error = 0, noreply = 0;
709         struct inpcb *inp;
710         struct tcpcb *tp;
711
712         COMMON_START(so, inp, 0);
713
714         if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) {
715                 noreply = 1;
716                 so_async_rcvd_reply(so);
717         }
718         tcp_output(tp);
719
720         COMMON_END1(PRU_RCVD, noreply);
721 }
722
723 /*
724  * Do a send by putting data in output queue and updating urgent
725  * marker if URG set.  Possibly send more data.  Unlike the other
726  * pru_*() routines, the mbuf chains are our responsibility.  We
727  * must either enqueue them or free them.  The other pru_* routines
728  * generally are caller-frees.
729  */
730 static void
731 tcp_usr_send(netmsg_t msg)
732 {
733         struct socket *so = msg->send.base.nm_so;
734         int flags = msg->send.nm_flags;
735         struct mbuf *m = msg->send.nm_m;
736         int error = 0;
737         struct inpcb *inp;
738         struct tcpcb *tp;
739         TCPDEBUG0;
740
741         KKASSERT(msg->send.nm_control == NULL);
742         KKASSERT(msg->send.nm_addr == NULL);
743         KKASSERT((flags & PRUS_FREEADDR) == 0);
744
745         inp = so->so_pcb;
746
747         if (inp == NULL) {
748                 /*
749                  * OOPS! we lost a race, the TCP session got reset after
750                  * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
751                  * network interrupt in the non-critical section of sosend().
752                  */
753                 m_freem(m);
754                 error = ECONNRESET;     /* XXX EPIPE? */
755                 tp = NULL;
756                 TCPDEBUG1();
757                 goto out;
758         }
759         tp = intotcpcb(inp);
760         TCPDEBUG1();
761
762 #ifdef foo
763         /*
764          * This is no longer necessary, since:
765          * - sosendtcp() has already checked it for us
766          * - It does not work with asynchronized send
767          */
768
769         /*
770          * Don't let too much OOB data build up
771          */
772         if (flags & PRUS_OOB) {
773                 if (ssb_space(&so->so_snd) < -512) {
774                         m_freem(m);
775                         error = ENOBUFS;
776                         goto out;
777                 }
778         }
779 #endif
780
781         /*
782          * Pump the data into the socket.
783          */
784         if (m) {
785                 ssb_appendstream(&so->so_snd, m);
786                 sowwakeup(so);
787         }
788         if (flags & PRUS_OOB) {
789                 /*
790                  * According to RFC961 (Assigned Protocols),
791                  * the urgent pointer points to the last octet
792                  * of urgent data.  We continue, however,
793                  * to consider it to indicate the first octet
794                  * of data past the urgent section.
795                  * Otherwise, snd_up should be one lower.
796                  */
797                 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
798                 tp->t_flags |= TF_FORCE;
799                 error = tcp_output(tp);
800                 tp->t_flags &= ~TF_FORCE;
801         } else {
802                 if (flags & PRUS_EOF) {
803                         /*
804                          * Close the send side of the connection after
805                          * the data is sent.
806                          */
807                         socantsendmore(so);
808                         tp = tcp_usrclosed(tp);
809                 }
810                 if (tp != NULL && !tcp_output_pending(tp)) {
811                         if (flags & PRUS_MORETOCOME)
812                                 tp->t_flags |= TF_MORETOCOME;
813                         error = tcp_output_fair(tp);
814                         if (flags & PRUS_MORETOCOME)
815                                 tp->t_flags &= ~TF_MORETOCOME;
816                 }
817         }
818         COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB :
819                    ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND),
820                    (flags & PRUS_NOREPLY));
821 }
822
823 /*
824  * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort()
825  *       will sofree() it when we return.
826  */
827 static void
828 tcp_usr_abort(netmsg_t msg)
829 {
830         struct socket *so = msg->abort.base.nm_so;
831         int error = 0;
832         struct inpcb *inp;
833         struct tcpcb *tp;
834
835         COMMON_START(so, inp, 1);
836         tp = tcp_drop(tp, ECONNABORTED);
837         COMMON_END(PRU_ABORT);
838 }
839
840 /*
841  * Receive out-of-band data.
842  */
843 static void
844 tcp_usr_rcvoob(netmsg_t msg)
845 {
846         struct socket *so = msg->rcvoob.base.nm_so;
847         struct mbuf *m = msg->rcvoob.nm_m;
848         int flags = msg->rcvoob.nm_flags;
849         int error = 0;
850         struct inpcb *inp;
851         struct tcpcb *tp;
852
853         COMMON_START(so, inp, 0);
854         if ((so->so_oobmark == 0 &&
855              (so->so_state & SS_RCVATMARK) == 0) ||
856             so->so_options & SO_OOBINLINE ||
857             tp->t_oobflags & TCPOOB_HADDATA) {
858                 error = EINVAL;
859                 goto out;
860         }
861         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
862                 error = EWOULDBLOCK;
863                 goto out;
864         }
865         m->m_len = 1;
866         *mtod(m, caddr_t) = tp->t_iobc;
867         if ((flags & MSG_PEEK) == 0)
868                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
869         COMMON_END(PRU_RCVOOB);
870 }
871
/*
 * pru_savefaddr hook for IPv4 TCP sockets; forwards to in_savefaddr().
 */
static void
tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{

	in_savefaddr(so, faddr);
}
877
878 #ifdef INET6
/*
 * pru_savefaddr hook for IPv6 TCP sockets; forwards to in6_savefaddr().
 */
static void
tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr)
{

	in6_savefaddr(so, faddr);
}
884 #endif
885
886 static int
887 tcp_usr_preconnect(struct socket *so, const struct sockaddr *nam,
888     struct thread *td __unused)
889 {
890         const struct sockaddr_in *sinp;
891
892         sinp = (const struct sockaddr_in *)nam;
893         if (sinp->sin_family == AF_INET &&
894             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
895                 return EAFNOSUPPORT;
896
897         soisconnecting(so);
898         return 0;
899 }
900
901 /* xxx - should be const */
902 struct pr_usrreqs tcp_usrreqs = {
903         .pru_abort = tcp_usr_abort,
904         .pru_accept = tcp_usr_accept,
905         .pru_attach = tcp_usr_attach,
906         .pru_bind = tcp_usr_bind,
907         .pru_connect = tcp_usr_connect,
908         .pru_connect2 = pr_generic_notsupp,
909         .pru_control = in_control_dispatch,
910         .pru_detach = tcp_usr_detach,
911         .pru_disconnect = tcp_usr_disconnect,
912         .pru_listen = tcp_usr_listen,
913         .pru_peeraddr = in_setpeeraddr_dispatch,
914         .pru_rcvd = tcp_usr_rcvd,
915         .pru_rcvoob = tcp_usr_rcvoob,
916         .pru_send = tcp_usr_send,
917         .pru_sense = pru_sense_null,
918         .pru_shutdown = tcp_usr_shutdown,
919         .pru_sockaddr = in_setsockaddr_dispatch,
920         .pru_sosend = sosendtcp,
921         .pru_soreceive = sorecvtcp,
922         .pru_savefaddr = tcp_usr_savefaddr,
923         .pru_preconnect = tcp_usr_preconnect,
924         .pru_preattach = tcp_usr_preattach
925 };
926
927 #ifdef INET6
928 struct pr_usrreqs tcp6_usrreqs = {
929         .pru_abort = tcp_usr_abort,
930         .pru_accept = tcp6_usr_accept,
931         .pru_attach = tcp_usr_attach,
932         .pru_bind = tcp6_usr_bind,
933         .pru_connect = tcp6_usr_connect,
934         .pru_connect2 = pr_generic_notsupp,
935         .pru_control = in6_control_dispatch,
936         .pru_detach = tcp_usr_detach,
937         .pru_disconnect = tcp_usr_disconnect,
938         .pru_listen = tcp6_usr_listen,
939         .pru_peeraddr = in6_setpeeraddr_dispatch,
940         .pru_rcvd = tcp_usr_rcvd,
941         .pru_rcvoob = tcp_usr_rcvoob,
942         .pru_send = tcp_usr_send,
943         .pru_sense = pru_sense_null,
944         .pru_shutdown = tcp_usr_shutdown,
945         .pru_sockaddr = in6_setsockaddr_dispatch,
946         .pru_sosend = sosendtcp,
947         .pru_soreceive = sorecvtcp,
948         .pru_savefaddr = tcp6_usr_savefaddr
949 };
950 #endif /* INET6 */
951
952 static int
953 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
954                   struct sockaddr_in *sin, struct sockaddr_in *if_sin)
955 {
956         struct inpcb *inp = tp->t_inpcb, *oinp;
957         struct socket *so = inp->inp_socket;
958         struct route *ro = &inp->inp_route;
959
960         KASSERT(inp->inp_pcbinfo == &tcbinfo[mycpu->gd_cpuid],
961             ("pcbinfo mismatch"));
962
963         oinp = in_pcblookup_hash(inp->inp_pcbinfo,
964                                  sin->sin_addr, sin->sin_port,
965                                  (inp->inp_laddr.s_addr != INADDR_ANY ?
966                                   inp->inp_laddr : if_sin->sin_addr),
967                                 inp->inp_lport, 0, NULL);
968         if (oinp != NULL) {
969                 m_freem(m);
970                 return (EADDRINUSE);
971         }
972         if (inp->inp_laddr.s_addr == INADDR_ANY)
973                 inp->inp_laddr = if_sin->sin_addr;
974         inp->inp_faddr = sin->sin_addr;
975         inp->inp_fport = sin->sin_port;
976         in_pcbinsconnhash(inp);
977
978         /*
979          * We are now on the inpcb's owner CPU, if the cached route was
980          * freed because the rtentry's owner CPU is not the current CPU
981          * (e.g. in tcp_connect()), then we try to reallocate it here with
982          * the hope that a rtentry may be cloned from a RTF_PRCLONING
983          * rtentry.
984          */
985         if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
986             ro->ro_rt == NULL) {
987                 bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
988                 ro->ro_dst.sa_family = AF_INET;
989                 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
990                 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr =
991                         sin->sin_addr;
992                 rtalloc(ro);
993         }
994
995         /*
996          * Now that no more errors can occur, change the protocol processing
997          * port to the current thread (which is the correct thread).
998          *
999          * Create TCP timer message now; we are on the tcpcb's owner
1000          * CPU/thread.
1001          */
1002         tcp_create_timermsg(tp, &curthread->td_msgport);
1003
1004         /*
1005          * Compute window scaling to request.  Use a larger scaling then
1006          * needed for the initial receive buffer in case the receive buffer
1007          * gets expanded.
1008          */
1009         if (tp->request_r_scale < TCP_MIN_WINSHIFT)
1010                 tp->request_r_scale = TCP_MIN_WINSHIFT;
1011         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1012                (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat
1013         ) {
1014                 tp->request_r_scale++;
1015         }
1016
1017         soisconnecting(so);
1018         tcpstat.tcps_connattempt++;
1019         tp->t_state = TCPS_SYN_SENT;
1020         tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep);
1021         tp->iss = tcp_new_isn(tp);
1022         tcp_sendseqinit(tp);
1023         if (m) {
1024                 ssb_appendstream(&so->so_snd, m);
1025                 m = NULL;
1026                 if (flags & PRUS_OOB)
1027                         tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
1028         }
1029
1030         /*
1031          * Close the send side of the connection after
1032          * the data is sent if flagged.
1033          */
1034         if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
1035                 socantsendmore(so);
1036                 tp = tcp_usrclosed(tp);
1037         }
1038         return (tcp_output(tp));
1039 }
1040
1041 /*
1042  * Common subroutine to open a TCP connection to remote host specified
1043  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
1044  * port number if needed.  Call in_pcbladdr to do the routing and to choose
1045  * a local host address (interface).
1046  * Initialize connection parameters and enter SYN-SENT state.
1047  */
1048 static void
1049 tcp_connect(netmsg_t msg)
1050 {
1051         struct socket *so = msg->connect.base.nm_so;
1052         struct sockaddr *nam = msg->connect.nm_nam;
1053         struct thread *td = msg->connect.nm_td;
1054         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1055         struct sockaddr_in *if_sin = NULL;
1056         struct inpcb *inp;
1057         struct tcpcb *tp;
1058         int error;
1059         lwkt_port_t port;
1060
1061         COMMON_START(so, inp, 0);
1062
1063         /*
1064          * Reconnect our pcb if we have to
1065          */
1066         if (msg->connect.nm_flags & PRUC_RECONNECT) {
1067                 msg->connect.nm_flags &= ~PRUC_RECONNECT;
1068                 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1069         }
1070
1071         /*
1072          * Bind if we have to
1073          */
1074         if (inp->inp_lport == 0) {
1075                 if (tcp_lport_extension) {
1076                         KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY);
1077
1078                         error = in_pcbladdr(inp, nam, &if_sin, td);
1079                         if (error)
1080                                 goto out;
1081                         inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr;
1082
1083                         error = in_pcbbind_remote(inp, nam, td);
1084                         if (error)
1085                                 goto out;
1086
1087                         msg->connect.nm_flags |= PRUC_HASLADDR;
1088                 } else {
1089                         error = in_pcbbind(inp, NULL, td);
1090                         if (error)
1091                                 goto out;
1092                 }
1093         }
1094
1095         if ((msg->connect.nm_flags & PRUC_HASLADDR) == 0) {
1096                 /*
1097                  * Calculate the correct protocol processing thread.  The
1098                  * connect operation must run there.  Set the forwarding
1099                  * port before we forward the message or it will get bounced
1100                  * right back to us.
1101                  */
1102                 error = in_pcbladdr(inp, nam, &if_sin, td);
1103                 if (error)
1104                         goto out;
1105         }
1106         KKASSERT(inp->inp_socket == so);
1107
1108         port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port,
1109                             (inp->inp_laddr.s_addr != INADDR_ANY ?
1110                              inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr),
1111                             inp->inp_lport);
1112
1113         if (port != &curthread->td_msgport) {
1114                 lwkt_msg_t lmsg = &msg->connect.base.lmsg;
1115
1116                 /*
1117                  * in_pcbladdr() may have allocated a route entry for us
1118                  * on the current CPU, but we need a route entry on the
1119                  * inpcb's owner CPU, so free it here.
1120                  */
1121                 in_pcbresetroute(inp);
1122
1123                 /*
1124                  * We are moving the protocol processing port the socket
1125                  * is on, we have to unlink here and re-link on the
1126                  * target cpu.
1127                  */
1128                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1129                 msg->connect.nm_flags |= PRUC_RECONNECT;
1130                 msg->connect.base.nm_dispatch = tcp_connect;
1131
1132                 /*
1133                  * Use message put done receipt to change this socket's
1134                  * so_port, i.e. _after_ this message was put onto the
1135                  * target netisr's msgport but _before_ the message could
1136                  * be pulled from the target netisr's msgport, so that:
1137                  * - The upper half (socket code) will not see the new
1138                  *   msgport before this message reaches the new msgport
1139                  *   and messages for this socket will be ordered.
1140                  * - This message will see the new msgport, when its
1141                  *   handler is called in the target netisr.
1142                  *
1143                  * NOTE:
1144                  * We MUST use messege put done receipt to change this
1145                  * socket's so_port:
1146                  * If we changed the so_port in this netisr after the
1147                  * lwkt_forwardmsg (so messages for this socket will be
1148                  * ordered) and changed the so_port in the target netisr
1149                  * at the very beginning of this message's handler, we
1150                  * would suffer so_port overwritten race, given this
1151                  * message might be forwarded again.
1152                  *
1153                  * NOTE:
1154                  * This mechanism depends on that the netisr's msgport
1155                  * is spin msgport (currently it is :).
1156                  *
1157                  * If the upper half saw the new msgport before this
1158                  * message reached the target netisr's msgport, the
1159                  * messages sent from the upper half could reach the new
1160                  * msgport before this message, thus there would be
1161                  * message reordering.  The worst case could be soclose()
1162                  * saw the new msgport and the detach message could reach
1163                  * the new msgport before this message, i.e. the inpcb
1164                  * could have been destroyed when this message was still
1165                  * pending on or on its way to the new msgport.  Other
1166                  * weird cases could also happen, e.g. inpcb->inp_pcbinfo,
1167                  * since we have unlinked this inpcb from the current
1168                  * pcbinfo first.
1169                  */
1170                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
1171                 lwkt_forwardmsg(port, lmsg);
1172                 /* msg invalid now */
1173                 return;
1174         } else if (msg->connect.nm_flags & PRUC_HELDTD) {
1175                 /*
1176                  * The original thread is no longer needed; release it.
1177                  */
1178                 lwkt_rele(td);
1179                 msg->connect.nm_flags &= ~PRUC_HELDTD;
1180         }
1181         error = tcp_connect_oncpu(tp, msg->connect.nm_sndflags,
1182                                   msg->connect.nm_m, sin, if_sin);
1183         msg->connect.nm_m = NULL;
1184 out:
1185         if (msg->connect.nm_m) {
1186                 m_freem(msg->connect.nm_m);
1187                 msg->connect.nm_m = NULL;
1188         }
1189         if (msg->connect.nm_flags & PRUC_HELDTD)
1190                 lwkt_rele(td);
1191         if (error && (msg->connect.nm_flags & PRUC_ASYNC)) {
1192                 so->so_error = error;
1193                 soisdisconnected(so);
1194         }
1195         lwkt_replymsg(&msg->connect.base.lmsg, error);
1196         /* msg invalid now */
1197 }
1198
1199 #ifdef INET6
1200
1201 static void
1202 tcp6_connect(netmsg_t msg)
1203 {
1204         struct tcpcb *tp;
1205         struct socket *so = msg->connect.base.nm_so;
1206         struct sockaddr *nam = msg->connect.nm_nam;
1207         struct thread *td = msg->connect.nm_td;
1208         struct inpcb *inp;
1209         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
1210         struct in6_addr *addr6;
1211         lwkt_port_t port;
1212         int error;
1213
1214         COMMON_START(so, inp, 0);
1215
1216         /*
1217          * Reconnect our pcb if we have to
1218          */
1219         if (msg->connect.nm_flags & PRUC_RECONNECT) {
1220                 msg->connect.nm_flags &= ~PRUC_RECONNECT;
1221                 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1222         }
1223
1224         /*
1225          * Bind if we have to
1226          */
1227         if (inp->inp_lport == 0) {
1228                 error = in6_pcbbind(inp, NULL, td);
1229                 if (error)
1230                         goto out;
1231         }
1232
1233         /*
1234          * Cannot simply call in_pcbconnect, because there might be an
1235          * earlier incarnation of this same connection still in
1236          * TIME_WAIT state, creating an ADDRINUSE error.
1237          */
1238         error = in6_pcbladdr(inp, nam, &addr6, td);
1239         if (error)
1240                 goto out;
1241
1242         port = tcp6_addrport(); /* XXX hack for now, always cpu0 */
1243
1244         if (port != &curthread->td_msgport) {
1245                 lwkt_msg_t lmsg = &msg->connect.base.lmsg;
1246
1247                 /*
1248                  * in_pcbladdr() may have allocated a route entry for us
1249                  * on the current CPU, but we need a route entry on the
1250                  * inpcb's owner CPU, so free it here.
1251                  */
1252                 in_pcbresetroute(inp);
1253
1254                 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
1255                 msg->connect.nm_flags |= PRUC_RECONNECT;
1256                 msg->connect.base.nm_dispatch = tcp6_connect;
1257
1258                 /* See the related comment in tcp_connect() */
1259                 lwkt_setmsg_receipt(lmsg, tcp_sosetport);
1260                 lwkt_forwardmsg(port, lmsg);
1261                 /* msg invalid now */
1262                 return;
1263         }
1264         error = tcp6_connect_oncpu(tp, msg->connect.nm_sndflags,
1265                                    &msg->connect.nm_m, sin6, addr6);
1266         /* nm_m may still be intact */
1267 out:
1268         if (msg->connect.nm_m) {
1269                 m_freem(msg->connect.nm_m);
1270                 msg->connect.nm_m = NULL;
1271         }
1272         lwkt_replymsg(&msg->connect.base.lmsg, error);
1273         /* msg invalid now */
1274 }
1275
1276 static int
1277 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp,
1278                    struct sockaddr_in6 *sin6, struct in6_addr *addr6)
1279 {
1280         struct mbuf *m = *mp;
1281         struct inpcb *inp = tp->t_inpcb;
1282         struct socket *so = inp->inp_socket;
1283         struct inpcb *oinp;
1284
1285         /*
1286          * Cannot simply call in_pcbconnect, because there might be an
1287          * earlier incarnation of this same connection still in
1288          * TIME_WAIT state, creating an ADDRINUSE error.
1289          */
1290         oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
1291                                   &sin6->sin6_addr, sin6->sin6_port,
1292                                   (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ?
1293                                       addr6 : &inp->in6p_laddr),
1294                                   inp->inp_lport,  0, NULL);
1295         if (oinp)
1296                 return (EADDRINUSE);
1297
1298         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1299                 inp->in6p_laddr = *addr6;
1300         inp->in6p_faddr = sin6->sin6_addr;
1301         inp->inp_fport = sin6->sin6_port;
1302         if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
1303                 inp->in6p_flowinfo = sin6->sin6_flowinfo;
1304         in_pcbinsconnhash(inp);
1305
1306         /*
1307          * Now that no more errors can occur, change the protocol processing
1308          * port to the current thread (which is the correct thread).
1309          *
1310          * Create TCP timer message now; we are on the tcpcb's owner
1311          * CPU/thread.
1312          */
1313         tcp_create_timermsg(tp, &curthread->td_msgport);
1314
1315         /* Compute window scaling to request.  */
1316         if (tp->request_r_scale < TCP_MIN_WINSHIFT)
1317                 tp->request_r_scale = TCP_MIN_WINSHIFT;
1318         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1319             (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) {
1320                 tp->request_r_scale++;
1321         }
1322
1323         soisconnecting(so);
1324         tcpstat.tcps_connattempt++;
1325         tp->t_state = TCPS_SYN_SENT;
1326         tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep);
1327         tp->iss = tcp_new_isn(tp);
1328         tcp_sendseqinit(tp);
1329         if (m) {
1330                 ssb_appendstream(&so->so_snd, m);
1331                 *mp = NULL;
1332                 if (flags & PRUS_OOB)
1333                         tp->snd_up = tp->snd_una + so->so_snd.ssb_cc;
1334         }
1335
1336         /*
1337          * Close the send side of the connection after
1338          * the data is sent if flagged.
1339          */
1340         if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) {
1341                 socantsendmore(so);
1342                 tp = tcp_usrclosed(tp);
1343         }
1344         return (tcp_output(tp));
1345 }
1346
1347 #endif /* INET6 */
1348
1349 /*
1350  * The new sockopt interface makes it possible for us to block in the
1351  * copyin/out step (if we take a page fault).  Taking a page fault while
1352  * in a critical section is probably a Bad Thing.  (Since sockets and pcbs
1353  * both now use TSM, there probably isn't any need for this function to 
1354  * run in a critical section any more.  This needs more examination.)
1355  */
1356 void
1357 tcp_ctloutput(netmsg_t msg)
1358 {
1359         struct socket *so = msg->base.nm_so;
1360         struct sockopt *sopt = msg->ctloutput.nm_sopt;
1361         int     error, opt, optval, opthz;
1362         struct  inpcb *inp;
1363         struct  tcpcb *tp;
1364
1365         error = 0;
1366         inp = so->so_pcb;
1367         if (inp == NULL) {
1368                 error = ECONNRESET;
1369                 goto done;
1370         }
1371         tp = intotcpcb(inp);
1372
1373         /* Get socket's owner cpuid hint */
1374         if (sopt->sopt_level == SOL_SOCKET &&
1375             sopt->sopt_dir == SOPT_GET &&
1376             sopt->sopt_name == SO_CPUHINT) {
1377                 if (tp->t_flags & TF_LISTEN) {
1378                         /*
1379                          * Listen sockets owner cpuid is always 0,
1380                          * which does not make sense if SO_REUSEPORT
1381                          * is not set.
1382                          */
1383                         if (so->so_options & SO_REUSEPORT)
1384                                 optval = (inp->inp_lgrpindex & ncpus2_mask);
1385                         else
1386                                 optval = -1; /* no hint */
1387                 } else {
1388                         optval = mycpuid;
1389                 }
1390                 soopt_from_kbuf(sopt, &optval, sizeof(optval));
1391                 goto done;
1392         }
1393
1394         if (sopt->sopt_level != IPPROTO_TCP) {
1395                 if (sopt->sopt_level == IPPROTO_IP) {
1396                         switch (sopt->sopt_name) {
1397                         case IP_MULTICAST_IF:
1398                         case IP_MULTICAST_VIF:
1399                         case IP_MULTICAST_TTL:
1400                         case IP_MULTICAST_LOOP:
1401                         case IP_ADD_MEMBERSHIP:
1402                         case IP_DROP_MEMBERSHIP:
1403                                 /*
1404                                  * Multicast does not make sense on
1405                                  * TCP sockets.
1406                                  */
1407                                 error = EOPNOTSUPP;
1408                                 goto done;
1409                         }
1410                 }
1411 #ifdef INET6
1412                 if (INP_CHECK_SOCKAF(so, AF_INET6))
1413                         ip6_ctloutput_dispatch(msg);
1414                 else
1415 #endif /* INET6 */
1416                 ip_ctloutput(msg);
1417                 /* msg invalid now */
1418                 return;
1419         }
1420
1421         switch (sopt->sopt_dir) {
1422         case SOPT_SET:
1423                 error = soopt_to_kbuf(sopt, &optval, sizeof optval,
1424                                       sizeof optval);
1425                 if (error)
1426                         break;
1427                 switch (sopt->sopt_name) {
1428                 case TCP_FASTKEEP:
1429                         if (optval > 0)
1430                                 tp->t_keepidle = tp->t_keepintvl;
1431                         else
1432                                 tp->t_keepidle = tcp_keepidle;
1433                         tcp_timer_keep_activity(tp, 0);
1434                         break;
1435 #ifdef TCP_SIGNATURE
1436                 case TCP_SIGNATURE_ENABLE:
1437                         if (tp->t_state == TCPS_CLOSED) {
1438                                 /*
1439                                  * This is the only safe state that this
1440                                  * option could be changed.  Some segments
1441                                  * could already have been sent in other
1442                                  * states.
1443                                  */
1444                                 if (optval > 0)
1445                                         tp->t_flags |= TF_SIGNATURE;
1446                                 else
1447                                         tp->t_flags &= ~TF_SIGNATURE;
1448                         } else {
1449                                 error = EOPNOTSUPP;
1450                         }
1451                         break;
1452 #endif /* TCP_SIGNATURE */
1453                 case TCP_NODELAY:
1454                 case TCP_NOOPT:
1455                         switch (sopt->sopt_name) {
1456                         case TCP_NODELAY:
1457                                 opt = TF_NODELAY;
1458                                 break;
1459                         case TCP_NOOPT:
1460                                 opt = TF_NOOPT;
1461                                 break;
1462                         default:
1463                                 opt = 0; /* dead code to fool gcc */
1464                                 break;
1465                         }
1466
1467                         if (optval)
1468                                 tp->t_flags |= opt;
1469                         else
1470                                 tp->t_flags &= ~opt;
1471                         break;
1472
1473                 case TCP_NOPUSH:
1474                         if (tcp_disable_nopush)
1475                                 break;
1476                         if (optval)
1477                                 tp->t_flags |= TF_NOPUSH;
1478                         else {
1479                                 tp->t_flags &= ~TF_NOPUSH;
1480                                 error = tcp_output(tp);
1481                         }
1482                         break;
1483
1484                 case TCP_MAXSEG:
1485                         /*
1486                          * Must be between 0 and maxseg.  If the requested
1487                          * maxseg is too small to satisfy the desired minmss,
1488                          * pump it up (silently so sysctl modifications of
1489                          * minmss do not create unexpected program failures).
1490                          * Handle degenerate cases.
1491                          */
1492                         if (optval > 0 && optval <= tp->t_maxseg) {
1493                                 if (optval + 40 < tcp_minmss) {
1494                                         optval = tcp_minmss - 40;
1495                                         if (optval < 0)
1496                                                 optval = 1;
1497                                 }
1498                                 tp->t_maxseg = optval;
1499                         } else {
1500                                 error = EINVAL;
1501                         }
1502                         break;
1503
1504                 case TCP_KEEPINIT:
1505                         opthz = ((int64_t)optval * hz) / 1000;
1506                         if (opthz >= 1)
1507                                 tp->t_keepinit = opthz;
1508                         else
1509                                 error = EINVAL;
1510                         break;
1511
1512                 case TCP_KEEPIDLE:
1513                         opthz = ((int64_t)optval * hz) / 1000;
1514                         if (opthz >= 1) {
1515                                 tp->t_keepidle = opthz;
1516                                 tcp_timer_keep_activity(tp, 0);
1517                         } else {
1518                                 error = EINVAL;
1519                         }
1520                         break;
1521
1522                 case TCP_KEEPINTVL:
1523                         opthz = ((int64_t)optval * hz) / 1000;
1524                         if (opthz >= 1) {
1525                                 tp->t_keepintvl = opthz;
1526                                 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt;
1527                         } else {
1528                                 error = EINVAL;
1529                         }
1530                         break;
1531
1532                 case TCP_KEEPCNT:
1533                         if (optval > 0) {
1534                                 tp->t_keepcnt = optval;
1535                                 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt;
1536                         } else {
1537                                 error = EINVAL;
1538                         }
1539                         break;
1540
1541                 default:
1542                         error = ENOPROTOOPT;
1543                         break;
1544                 }
1545                 break;
1546
1547         case SOPT_GET:
1548                 switch (sopt->sopt_name) {
1549 #ifdef TCP_SIGNATURE
1550                 case TCP_SIGNATURE_ENABLE:
1551                         optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1552                         break;
1553 #endif /* TCP_SIGNATURE */
1554                 case TCP_NODELAY:
1555                         optval = tp->t_flags & TF_NODELAY;
1556                         break;
1557                 case TCP_MAXSEG:
1558                         optval = tp->t_maxseg;
1559                         break;
1560                 case TCP_NOOPT:
1561                         optval = tp->t_flags & TF_NOOPT;
1562                         break;
1563                 case TCP_NOPUSH:
1564                         optval = tp->t_flags & TF_NOPUSH;
1565                         break;
1566                 case TCP_KEEPINIT:
1567                         optval = ((int64_t)tp->t_keepinit * 1000) / hz;
1568                         break;
1569                 case TCP_KEEPIDLE:
1570                         optval = ((int64_t)tp->t_keepidle * 1000) / hz;
1571                         break;
1572                 case TCP_KEEPINTVL:
1573                         optval = ((int64_t)tp->t_keepintvl * 1000) / hz;
1574                         break;
1575                 case TCP_KEEPCNT:
1576                         optval = tp->t_keepcnt;
1577                         break;
1578                 default:
1579                         error = ENOPROTOOPT;
1580                         break;
1581                 }
1582                 if (error == 0)
1583                         soopt_from_kbuf(sopt, &optval, sizeof optval);
1584                 break;
1585         }
1586 done:
1587         lwkt_replymsg(&msg->lmsg, error);
1588 }
1589
1590 /*
1591  * tcp_sendspace and tcp_recvspace are the default send and receive window
1592  * sizes, respectively.  These are obsolescent (this information should
1593  * be set by the route).
1594  *
1595  * Use a default that does not require tcp window scaling to be turned
1596  * on.  Individual programs or the administrator can increase the default.
1597  */
1598 u_long  tcp_sendspace = 57344;  /* largest multiple of PAGE_SIZE < 64k */
1599 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1600     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1601 u_long  tcp_recvspace = 57344;  /* largest multiple of PAGE_SIZE < 64k */
1602 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1603     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1604
1605 /*
1606  * Attach TCP protocol to socket, allocating internet protocol control
1607  * block, tcp control block, buffer space, and entering CLOSED state.
1608  */
1609 static int
1610 tcp_attach(struct socket *so, struct pru_attach_info *ai)
1611 {
1612         struct tcpcb *tp;
1613         struct inpcb *inp;
1614         int error;
1615         int cpu;
1616 #ifdef INET6
1617         boolean_t isipv6 = INP_CHECK_SOCKAF(so, AF_INET6);
1618 #endif
1619
1620         if (ai != NULL) {
1621                 error = tcp_usr_preattach(so, 0 /* don't care */, ai);
1622                 if (error)
1623                         return (error);
1624         } else {
1625                 /* Post attach; do nothing */
1626         }
1627
1628         cpu = mycpu->gd_cpuid;
1629
1630         /*
1631          * Set the default pcbinfo.  This will likely change when we
1632          * bind/connect.
1633          */
1634         error = in_pcballoc(so, &tcbinfo[cpu]);
1635         if (error)
1636                 return (error);
1637         inp = so->so_pcb;
1638 #ifdef INET6
1639         if (isipv6)
1640                 inp->in6p_hops = -1;    /* use kernel default */
1641 #endif
1642         tp = tcp_newtcpcb(inp);
1643         KASSERT(tp != NULL, ("tcp_newtcpcb failed"));
1644         tp->t_state = TCPS_CLOSED;
1645         /* Keep a reference for asynchronized pru_rcvd */
1646         soreference(so);
1647         return (0);
1648 }
1649
1650 /*
1651  * Initiate (or continue) disconnect.
1652  * If embryonic state, just send reset (once).
1653  * If in ``let data drain'' option and linger null, just drop.
1654  * Otherwise (hard), mark socket disconnecting and drop
1655  * current input data; switch states based on user close, and
1656  * send segment to peer (with FIN).
1657  */
1658 static struct tcpcb *
1659 tcp_disconnect(struct tcpcb *tp)
1660 {
1661         struct socket *so = tp->t_inpcb->inp_socket;
1662
1663         if (tp->t_state < TCPS_ESTABLISHED) {
1664                 tp = tcp_close(tp);
1665         } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
1666                 tp = tcp_drop(tp, 0);
1667         } else {
1668                 lwkt_gettoken(&so->so_rcv.ssb_token);
1669                 soisdisconnecting(so);
1670                 sbflush(&so->so_rcv.sb);
1671                 tp = tcp_usrclosed(tp);
1672                 if (tp)
1673                         tcp_output(tp);
1674                 lwkt_reltoken(&so->so_rcv.ssb_token);
1675         }
1676         return (tp);
1677 }
1678
1679 /*
1680  * User issued close, and wish to trail through shutdown states:
1681  * if never received SYN, just forget it.  If got a SYN from peer,
1682  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1683  * If already got a FIN from peer, then almost done; go to LAST_ACK
1684  * state.  In all other cases, have already sent FIN to peer (e.g.
1685  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1686  * for peer to send FIN or not respond to keep-alives, etc.
1687  * We can let the user exit from the close as soon as the FIN is acked.
1688  */
1689 static struct tcpcb *
1690 tcp_usrclosed(struct tcpcb *tp)
1691 {
1692
1693         switch (tp->t_state) {
1694
1695         case TCPS_CLOSED:
1696         case TCPS_LISTEN:
1697                 tp->t_state = TCPS_CLOSED;
1698                 tp = tcp_close(tp);
1699                 break;
1700
1701         case TCPS_SYN_SENT:
1702         case TCPS_SYN_RECEIVED:
1703                 tp->t_flags |= TF_NEEDFIN;
1704                 break;
1705
1706         case TCPS_ESTABLISHED:
1707                 tp->t_state = TCPS_FIN_WAIT_1;
1708                 break;
1709
1710         case TCPS_CLOSE_WAIT:
1711                 tp->t_state = TCPS_LAST_ACK;
1712                 break;
1713         }
1714         if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1715                 soisdisconnected(tp->t_inpcb->inp_socket);
1716                 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
1717                 if (tp->t_state == TCPS_FIN_WAIT_2) {
1718                         tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle,
1719                             tcp_timer_2msl);
1720                 }
1721         }
1722         return (tp);
1723 }