gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2004 Jeffrey M. Hsu. All rights reserved.
	3	* Copyright (c) 1982, 1986, 1988, 1993
	4	* The Regents of the University of California. All rights reserved.
	5	*
	6	* Redistribution and use in source and binary forms, with or without
	7	* modification, are permitted provided that the following conditions
	8	* are met:
	9	* 1. Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* 2. Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* 3. All advertising materials mentioning features or use of this software
	15	* must display the following acknowledgement:
	16	* This product includes software developed by the University of
	17	* California, Berkeley and its contributors.
	18	* 4. Neither the name of the University nor the names of its contributors
	19	* may be used to endorse or promote products derived from this software
	20	* without specific prior written permission.
	21	*
	22	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	23	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	24	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	25	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	26	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	27	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	28	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	29	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	30	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	31	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
	35	* $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $
	36	* $DragonFly: src/sys/netinet/tcp_usrreq.c,v 1.24 2004/07/02 04:41:01 hsu Exp $
	37	*/
	38
	39	#include "opt_ipsec.h"
	40	#include "opt_inet6.h"
	41	#include "opt_tcpdebug.h"
	42
	43	#include <sys/param.h>
	44	#include <sys/systm.h>
	45	#include <sys/kernel.h>
	46	#include <sys/malloc.h>
	47	#include <sys/sysctl.h>
	48	#include <sys/globaldata.h>
	49	#include <sys/thread.h>
	50
	51	#include <sys/mbuf.h>
	52	#ifdef INET6
	53	#include <sys/domain.h>
	54	#endif /* INET6 */
	55	#include <sys/socket.h>
	56	#include <sys/socketvar.h>
	57	#include <sys/protosw.h>
	58
	59	#include <sys/msgport2.h>
	60
	61	#include <net/if.h>
	62	#include <net/netisr.h>
	63	#include <net/route.h>
	64
	65	#include <netinet/in.h>
	66	#include <netinet/in_systm.h>
	67	#ifdef INET6
	68	#include <netinet/ip6.h>
	69	#endif
	70	#include <netinet/in_pcb.h>
	71	#ifdef INET6
	72	#include <netinet6/in6_pcb.h>
	73	#endif
	74	#include <netinet/in_var.h>
	75	#include <netinet/ip_var.h>
	76	#ifdef INET6
	77	#include <netinet6/ip6_var.h>
	78	#endif
	79	#include <netinet/tcp.h>
	80	#include <netinet/tcp_fsm.h>
	81	#include <netinet/tcp_seq.h>
	82	#include <netinet/tcp_timer.h>
	83	#include <netinet/tcp_var.h>
	84	#include <netinet/tcpip.h>
	85	#ifdef TCPDEBUG
	86	#include <netinet/tcp_debug.h>
	87	#endif
	88
	89	#ifdef IPSEC
	90	#include <netinet6/ipsec.h>
	91	#endif /* IPSEC */
	92
	93	/*
	94	* TCP protocol interface to socket abstraction.
	95	*/
	96	extern char tcpstates[]; / XXX ??? */
	97
	98	static int tcp_attach (struct socket , struct pru_attach_info );
	99	static int tcp_connect (struct tcpcb , struct sockaddr ,
	100	struct thread *);
	101	#ifdef INET6
	102	static int tcp6_connect (struct tcpcb , struct sockaddr ,
	103	struct thread *);
	104	#endif /* INET6 */
	105	static struct tcpcb *
	106	tcp_disconnect (struct tcpcb *);
	107	static struct tcpcb *
	108	tcp_usrclosed (struct tcpcb *);
	109
	110	#ifdef TCPDEBUG
	111	#define TCPDEBUG0 int ostate = 0
	112	#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
	113	#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
	114	tcp_trace(TA_USER, ostate, tp, 0, 0, req)
	115	#else
	116	#define TCPDEBUG0
	117	#define TCPDEBUG1()
	118	#define TCPDEBUG2(req)
	119	#endif
	120
	121	/*
	122	* TCP attaches to socket via pru_attach(), reserving space,
	123	* and an internet control block.
	124	*/
	125	static int
	126	tcp_usr_attach(struct socket so, int proto, struct pru_attach_info ai)
	127	{
	128	int s = splnet();
	129	int error;
	130	struct inpcb *inp = sotoinpcb(so);
	131	struct tcpcb *tp = 0;
	132	TCPDEBUG0;
	133
	134	TCPDEBUG1();
	135	if (inp) {
	136	error = EISCONN;
	137	goto out;
	138	}
	139
	140	error = tcp_attach(so, ai);
	141	if (error)
	142	goto out;
	143
	144	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
	145	so->so_linger = TCP_LINGERTIME;
	146	tp = sototcpcb(so);
	147	out:
	148	TCPDEBUG2(PRU_ATTACH);
	149	splx(s);
	150	return error;
	151	}
	152
	153	/*
	154	* pru_detach() detaches the TCP protocol from the socket.
	155	* If the protocol state is non-embryonic, then can't
	156	* do this directly: have to initiate a pru_disconnect(),
	157	* which may finish later; embryonic TCB's can just
	158	* be discarded here.
	159	*/
	160	static int
	161	tcp_usr_detach(struct socket *so)
	162	{
	163	int s = splnet();
	164	int error = 0;
	165	struct inpcb *inp = sotoinpcb(so);
	166	struct tcpcb *tp;
	167	TCPDEBUG0;
	168
	169	if (inp == 0) {
	170	splx(s);
	171	return EINVAL; /* XXX */
	172	}
	173	tp = intotcpcb(inp);
	174	TCPDEBUG1();
	175	tp = tcp_disconnect(tp);
	176
	177	TCPDEBUG2(PRU_DETACH);
	178	splx(s);
	179	return error;
	180	}
	181
	182	#define COMMON_START() TCPDEBUG0; \
	183	do { \
	184	if (inp == 0) { \
	185	splx(s); \
	186	return EINVAL; \
	187	} \
	188	tp = intotcpcb(inp); \
	189	TCPDEBUG1(); \
	190	} while(0)
	191
	192	#define COMMON_END(req) out: TCPDEBUG2(req); splx(s); return error; goto out
	193
	194
	195	/*
	196	* Give the socket an address.
	197	*/
	198	static int
	199	tcp_usr_bind(struct socket so, struct sockaddr nam, struct thread *td)
	200	{
	201	int s = splnet();
	202	int error = 0;
	203	struct inpcb *inp = sotoinpcb(so);
	204	struct tcpcb *tp;
	205	struct sockaddr_in *sinp;
	206
	207	COMMON_START();
	208
	209	/*
	210	* Must check for multicast addresses and disallow binding
	211	* to them.
	212	*/
	213	sinp = (struct sockaddr_in *)nam;
	214	if (sinp->sin_family == AF_INET &&
	215	IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
	216	error = EAFNOSUPPORT;
	217	goto out;
	218	}
	219	error = in_pcbbind(inp, nam, td);
	220	if (error)
	221	goto out;
	222	COMMON_END(PRU_BIND);
	223
	224	}
	225
	226	#ifdef INET6
	227	static int
	228	tcp6_usr_bind(struct socket so, struct sockaddr nam, struct thread *td)
	229	{
	230	int s = splnet();
	231	int error = 0;
	232	struct inpcb *inp = sotoinpcb(so);
	233	struct tcpcb *tp;
	234	struct sockaddr_in6 *sin6p;
	235
	236	COMMON_START();
	237
	238	/*
	239	* Must check for multicast addresses and disallow binding
	240	* to them.
	241	*/
	242	sin6p = (struct sockaddr_in6 *)nam;
	243	if (sin6p->sin6_family == AF_INET6 &&
	244	IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
	245	error = EAFNOSUPPORT;
	246	goto out;
	247	}
	248	inp->inp_vflag &= ~INP_IPV4;
	249	inp->inp_vflag \|= INP_IPV6;
	250	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
	251	if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
	252	inp->inp_vflag \|= INP_IPV4;
	253	else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
	254	struct sockaddr_in sin;
	255
	256	in6_sin6_2_sin(&sin, sin6p);
	257	inp->inp_vflag \|= INP_IPV4;
	258	inp->inp_vflag &= ~INP_IPV6;
	259	error = in_pcbbind(inp, (struct sockaddr *)&sin, td);
	260	goto out;
	261	}
	262	}
	263	error = in6_pcbbind(inp, nam, td);
	264	if (error)
	265	goto out;
	266	COMMON_END(PRU_BIND);
	267	}
	268	#endif /* INET6 */
	269
	270	#ifdef SMP
	271	struct netmsg_inswildcard {
	272	struct lwkt_msg nm_lmsg;
	273	struct inpcb *nm_inp;
	274	struct inpcbinfo *nm_pcbinfo;
	275	};
	276
	277	static int
	278	in_pcbinswildcardhash_handler(struct lwkt_msg *msg0)
	279	{
	280	struct netmsg_inswildcard msg = (struct netmsg_inswildcard )msg0;
	281
	282	in_pcbinswildcardhash_oncpu(msg->nm_inp, msg->nm_pcbinfo);
	283	lwkt_replymsg(&msg->nm_lmsg, 0);
	284	return (EASYNC);
	285	}
	286	#endif
	287
	288	/*
	289	* Prepare to accept connections.
	290	*/
	291	static int
	292	tcp_usr_listen(struct socket so, struct thread td)
	293	{
	294	int s = splnet();
	295	int error = 0;
	296	struct inpcb *inp = sotoinpcb(so);
	297	struct tcpcb *tp;
	298	#ifdef SMP
	299	int cpu;
	300	#endif
	301
	302	COMMON_START();
	303	if (inp->inp_lport == 0) {
	304	error = in_pcbbind(inp, NULL, td);
	305	if (error != 0)
	306	goto out;
	307	}
	308
	309	tp->t_state = TCPS_LISTEN;
	310	#ifdef SMP
	311	for (cpu = 0; cpu < ncpus2; cpu++) {
	312	struct netmsg_inswildcard *msg;
	313
	314	if (cpu == mycpu->gd_cpuid) {
	315	in_pcbinswildcardhash_oncpu(inp, &tcbinfo[cpu]);
	316	continue;
	317	}
	318
	319	msg = malloc(sizeof(struct netmsg_inswildcard), M_LWKTMSG,
	320	M_INTWAIT);
	321	lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0,
	322	lwkt_cmd_func(in_pcbinswildcardhash_handler),
	323	lwkt_cmd_op_none);
	324	msg->nm_inp = inp;
	325	msg->nm_pcbinfo = &tcbinfo[cpu];
	326	lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg);
	327	}
	328	inp->inp_flags \|= INP_WILDCARD_MP;
	329	#else
	330	in_pcbinswildcardhash(inp);
	331	#endif
	332	COMMON_END(PRU_LISTEN);
	333	}
	334
	335	#ifdef INET6
	336	static int
	337	tcp6_usr_listen(struct socket so, struct thread td)
	338	{
	339	int s = splnet();
	340	int error = 0;
	341	struct inpcb *inp = sotoinpcb(so);
	342	struct tcpcb *tp;
	343
	344	COMMON_START();
	345	if (inp->inp_lport == 0) {
	346	inp->inp_vflag &= ~INP_IPV4;
	347	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
	348	inp->inp_vflag \|= INP_IPV4;
	349	error = in6_pcbbind(inp, (struct sockaddr *)0, td);
	350	}
	351	if (error == 0)
	352	tp->t_state = TCPS_LISTEN;
	353	in_pcbinswildcardhash(inp);
	354	COMMON_END(PRU_LISTEN);
	355	}
	356	#endif /* INET6 */
	357
	358	/*
	359	* Initiate connection to peer.
	360	* Create a template for use in transmissions on this connection.
	361	* Enter SYN_SENT state, and mark socket as connecting.
	362	* Start keep-alive timer, and seed output sequence space.
	363	* Send initial segment on connection.
	364	*/
	365	static int
	366	tcp_usr_connect(struct socket so, struct sockaddr nam, struct thread *td)
	367	{
	368	int s = splnet();
	369	int error = 0;
	370	struct inpcb *inp = sotoinpcb(so);
	371	struct tcpcb *tp;
	372	struct sockaddr_in *sinp;
	373
	374	COMMON_START();
	375
	376	/*
	377	* Must disallow TCP ``connections'' to multicast addresses.
	378	*/
	379	sinp = (struct sockaddr_in *)nam;
	380	if (sinp->sin_family == AF_INET
	381	&& IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
	382	error = EAFNOSUPPORT;
	383	goto out;
	384	}
	385
	386	prison_remote_ip(td, 0, &sinp->sin_addr.s_addr);
	387
	388	if ((error = tcp_connect(tp, nam, td)) != 0)
	389	goto out;
	390	error = tcp_output(tp);
	391	COMMON_END(PRU_CONNECT);
	392	}
	393
	394	#ifdef INET6
	395	static int
	396	tcp6_usr_connect(struct socket so, struct sockaddr nam, struct thread *td)
	397	{
	398	int s = splnet();
	399	int error = 0;
	400	struct inpcb *inp = sotoinpcb(so);
	401	struct tcpcb *tp;
	402	struct sockaddr_in6 *sin6p;
	403
	404	COMMON_START();
	405
	406	/*
	407	* Must disallow TCP ``connections'' to multicast addresses.
	408	*/
	409	sin6p = (struct sockaddr_in6 *)nam;
	410	if (sin6p->sin6_family == AF_INET6
	411	&& IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
	412	error = EAFNOSUPPORT;
	413	goto out;
	414	}
	415
	416	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
	417	struct sockaddr_in sin;
	418
	419	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
	420	error = EINVAL;
	421	goto out;
	422	}
	423
	424	in6_sin6_2_sin(&sin, sin6p);
	425	inp->inp_vflag \|= INP_IPV4;
	426	inp->inp_vflag &= ~INP_IPV6;
	427	if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
	428	goto out;
	429	error = tcp_output(tp);
	430	goto out;
	431	}
	432	inp->inp_vflag &= ~INP_IPV4;
	433	inp->inp_vflag \|= INP_IPV6;
	434	inp->inp_inc.inc_isipv6 = 1;
	435	if ((error = tcp6_connect(tp, nam, td)) != 0)
	436	goto out;
	437	error = tcp_output(tp);
	438	COMMON_END(PRU_CONNECT);
	439	}
	440	#endif /* INET6 */
	441
	442	/*
	443	* Initiate disconnect from peer.
	444	* If connection never passed embryonic stage, just drop;
	445	* else if don't need to let data drain, then can just drop anyways,
	446	* else have to begin TCP shutdown process: mark socket disconnecting,
	447	* drain unread data, state switch to reflect user close, and
	448	* send segment (e.g. FIN) to peer. Socket will be really disconnected
	449	* when peer sends FIN and acks ours.
	450	*
	451	* SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	452	*/
	453	static int
	454	tcp_usr_disconnect(struct socket *so)
	455	{
	456	int s = splnet();
	457	int error = 0;
	458	struct inpcb *inp = sotoinpcb(so);
	459	struct tcpcb *tp;
	460
	461	COMMON_START();
	462	tp = tcp_disconnect(tp);
	463	COMMON_END(PRU_DISCONNECT);
	464	}
	465
	466	/*
	467	* Accept a connection. Essentially all the work is
	468	* done at higher levels; just return the address
	469	* of the peer, storing through addr.
	470	*/
	471	static int
	472	tcp_usr_accept(struct socket so, struct sockaddr *nam)
	473	{
	474	int s = splnet();
	475	int error = 0;
	476	struct inpcb *inp = sotoinpcb(so);
	477	struct tcpcb *tp = NULL;
	478	TCPDEBUG0;
	479
	480	if (so->so_state & SS_ISDISCONNECTED) {
	481	error = ECONNABORTED;
	482	goto out;
	483	}
	484	if (inp == 0) {
	485	splx(s);
	486	return (EINVAL);
	487	}
	488	tp = intotcpcb(inp);
	489	TCPDEBUG1();
	490	in_setpeeraddr(so, nam);
	491	COMMON_END(PRU_ACCEPT);
	492	}
	493
	494	#ifdef INET6
	495	static int
	496	tcp6_usr_accept(struct socket so, struct sockaddr *nam)
	497	{
	498	int s = splnet();
	499	int error = 0;
	500	struct inpcb *inp = sotoinpcb(so);
	501	struct tcpcb *tp = NULL;
	502	TCPDEBUG0;
	503
	504	if (so->so_state & SS_ISDISCONNECTED) {
	505	error = ECONNABORTED;
	506	goto out;
	507	}
	508	if (inp == 0) {
	509	splx(s);
	510	return (EINVAL);
	511	}
	512	tp = intotcpcb(inp);
	513	TCPDEBUG1();
	514	in6_mapped_peeraddr(so, nam);
	515	COMMON_END(PRU_ACCEPT);
	516	}
	517	#endif /* INET6 */
	518	/*
	519	* Mark the connection as being incapable of further output.
	520	*/
	521	static int
	522	tcp_usr_shutdown(struct socket *so)
	523	{
	524	int s = splnet();
	525	int error = 0;
	526	struct inpcb *inp = sotoinpcb(so);
	527	struct tcpcb *tp;
	528
	529	COMMON_START();
	530	socantsendmore(so);
	531	tp = tcp_usrclosed(tp);
	532	if (tp)
	533	error = tcp_output(tp);
	534	COMMON_END(PRU_SHUTDOWN);
	535	}
	536
	537	/*
	538	* After a receive, possibly send window update to peer.
	539	*/
	540	static int
	541	tcp_usr_rcvd(struct socket *so, int flags)
	542	{
	543	int s = splnet();
	544	int error = 0;
	545	struct inpcb *inp = sotoinpcb(so);
	546	struct tcpcb *tp;
	547
	548	COMMON_START();
	549	tcp_output(tp);
	550	COMMON_END(PRU_RCVD);
	551	}
	552
	553	/*
	554	* Do a send by putting data in output queue and updating urgent
	555	* marker if URG set. Possibly send more data. Unlike the other
	556	* pru_*() routines, the mbuf chains are our responsibility. We
	557	* must either enqueue them or free them. The other pru_* routines
	558	* generally are caller-frees.
	559	*/
	560	static int
	561	tcp_usr_send(struct socket so, int flags, struct mbuf m,
	562	struct sockaddr nam, struct mbuf control, struct thread *td)
	563	{
	564	int s = splnet();
	565	int error = 0;
	566	struct inpcb *inp = sotoinpcb(so);
	567	struct tcpcb *tp;
	568	#ifdef INET6
	569	int isipv6;
	570	#endif
	571	TCPDEBUG0;
	572
	573	if (inp == NULL) {
	574	/*
	575	* OOPS! we lost a race, the TCP session got reset after
	576	* we checked SS_CANTSENDMORE, eg: while doing uiomove or a
	577	* network interrupt in the non-splnet() section of sosend().
	578	*/
	579	if (m)
	580	m_freem(m);
	581	if (control)
	582	m_freem(control);
	583	error = ECONNRESET; /* XXX EPIPE? */
	584	tp = NULL;
	585	TCPDEBUG1();
	586	goto out;
	587	}
	588	#ifdef INET6
	589	isipv6 = nam && nam->sa_family == AF_INET6;
	590	#endif /* INET6 */
	591	tp = intotcpcb(inp);
	592	TCPDEBUG1();
	593	if (control) {
	594	/* TCP doesn't do control messages (rights, creds, etc) */
	595	if (control->m_len) {
	596	m_freem(control);
	597	if (m)
	598	m_freem(m);
	599	error = EINVAL;
	600	goto out;
	601	}
	602	m_freem(control); /* empty control, just free it */
	603	}
	604	if(!(flags & PRUS_OOB)) {
	605	sbappend(&so->so_snd, m);
	606	if (nam && tp->t_state < TCPS_SYN_SENT) {
	607	/*
	608	* Do implied connect if not yet connected,
	609	* initialize window to default value, and
	610	* initialize maxseg/maxopd using peer's cached
	611	* MSS.
	612	*/
	613	#ifdef INET6
	614	if (isipv6)
	615	error = tcp6_connect(tp, nam, td);
	616	else
	617	#endif /* INET6 */
	618	error = tcp_connect(tp, nam, td);
	619	if (error)
	620	goto out;
	621	tp->snd_wnd = TTCP_CLIENT_SND_WND;
	622	tcp_mss(tp, -1);
	623	}
	624
	625	if (flags & PRUS_EOF) {
	626	/*
	627	* Close the send side of the connection after
	628	* the data is sent.
	629	*/
	630	socantsendmore(so);
	631	tp = tcp_usrclosed(tp);
	632	}
	633	if (tp != NULL) {
	634	if (flags & PRUS_MORETOCOME)
	635	tp->t_flags \|= TF_MORETOCOME;
	636	error = tcp_output(tp);
	637	if (flags & PRUS_MORETOCOME)
	638	tp->t_flags &= ~TF_MORETOCOME;
	639	}
	640	} else {
	641	if (sbspace(&so->so_snd) < -512) {
	642	m_freem(m);
	643	error = ENOBUFS;
	644	goto out;
	645	}
	646	/*
	647	* According to RFC961 (Assigned Protocols),
	648	* the urgent pointer points to the last octet
	649	* of urgent data. We continue, however,
	650	* to consider it to indicate the first octet
	651	* of data past the urgent section.
	652	* Otherwise, snd_up should be one lower.
	653	*/
	654	sbappend(&so->so_snd, m);
	655	if (nam && tp->t_state < TCPS_SYN_SENT) {
	656	/*
	657	* Do implied connect if not yet connected,
	658	* initialize window to default value, and
	659	* initialize maxseg/maxopd using peer's cached
	660	* MSS.
	661	*/
	662	#ifdef INET6
	663	if (isipv6)
	664	error = tcp6_connect(tp, nam, td);
	665	else
	666	#endif /* INET6 */
	667	error = tcp_connect(tp, nam, td);
	668	if (error)
	669	goto out;
	670	tp->snd_wnd = TTCP_CLIENT_SND_WND;
	671	tcp_mss(tp, -1);
	672	}
	673	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	674	tp->t_flags \|= TF_FORCE;
	675	error = tcp_output(tp);
	676	tp->t_flags &= ~TF_FORCE;
	677	}
	678	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
	679	((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	680	}
	681
	682	/*
	683	* Abort the TCP.
	684	*/
	685	static int
	686	tcp_usr_abort(struct socket *so)
	687	{
	688	int s = splnet();
	689	int error = 0;
	690	struct inpcb *inp = sotoinpcb(so);
	691	struct tcpcb *tp;
	692
	693	COMMON_START();
	694	tp = tcp_drop(tp, ECONNABORTED);
	695	COMMON_END(PRU_ABORT);
	696	}
	697
	698	/*
	699	* Receive out-of-band data.
	700	*/
	701	static int
	702	tcp_usr_rcvoob(struct socket so, struct mbuf m, int flags)
	703	{
	704	int s = splnet();
	705	int error = 0;
	706	struct inpcb *inp = sotoinpcb(so);
	707	struct tcpcb *tp;
	708
	709	COMMON_START();
	710	if ((so->so_oobmark == 0 &&
	711	(so->so_state & SS_RCVATMARK) == 0) \|\|
	712	so->so_options & SO_OOBINLINE \|\|
	713	tp->t_oobflags & TCPOOB_HADDATA) {
	714	error = EINVAL;
	715	goto out;
	716	}
	717	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
	718	error = EWOULDBLOCK;
	719	goto out;
	720	}
	721	m->m_len = 1;
	722	*mtod(m, caddr_t) = tp->t_iobc;
	723	if ((flags & MSG_PEEK) == 0)
	724	tp->t_oobflags ^= (TCPOOB_HAVEDATA \| TCPOOB_HADDATA);
	725	COMMON_END(PRU_RCVOOB);
	726	}
	727
	728	/* xxx - should be const */
	729	struct pr_usrreqs tcp_usrreqs = {
	730	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
	731	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
	732	tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
	733	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
	734	in_setsockaddr, sosend, soreceive, sopoll
	735	};
	736
	737	#ifdef INET6
	738	struct pr_usrreqs tcp6_usrreqs = {
	739	tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind,
	740	tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach,
	741	tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd,
	742	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
	743	in6_mapped_sockaddr, sosend, soreceive, sopoll
	744	};
	745	#endif /* INET6 */
	746
	747	static int
	748	tcp_connect_oncpu(struct tcpcb tp, struct sockaddr_in sin,
	749	struct sockaddr_in *if_sin)
	750	{
	751	struct inpcb inp = tp->t_inpcb, oinp;
	752	struct socket *so = inp->inp_socket;
	753	struct tcpcb *otp;
	754	struct rmxp_tao *taop;
	755	struct rmxp_tao tao_noncached;
	756
	757	oinp = in_pcblookup_hash(&tcbinfo[mycpu->gd_cpuid],
	758	sin->sin_addr, sin->sin_port,
	759	inp->inp_laddr.s_addr != INADDR_ANY ?
	760	inp->inp_laddr : if_sin->sin_addr,
	761	inp->inp_lport, 0, NULL);
	762	if (oinp != NULL) {
	763	if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
	764	otp->t_state == TCPS_TIME_WAIT &&
	765	(ticks - otp->t_starttime) < tcp_msl &&
	766	(otp->t_flags & TF_RCVD_CC))
	767	(void) tcp_close(otp);
	768	else
	769	return (EADDRINUSE);
	770	}
	771	if (inp->inp_laddr.s_addr == INADDR_ANY)
	772	inp->inp_laddr = if_sin->sin_addr;
	773	inp->inp_faddr = sin->sin_addr;
	774	inp->inp_fport = sin->sin_port;
	775	inp->inp_cpcbinfo = &tcbinfo[mycpu->gd_cpuid];
	776	in_pcbinsconnhash(inp);
	777
	778	/* Compute window scaling to request. */
	779	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	780	(TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
	781	tp->request_r_scale++;
	782
	783	soisconnecting(so);
	784	tcpstat.tcps_connattempt++;
	785	tp->t_state = TCPS_SYN_SENT;
	786	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	787	tp->iss = tcp_new_isn(tp);
	788	tp->t_bw_rtseq = tp->iss;
	789	tcp_sendseqinit(tp);
	790
	791	/*
	792	* Generate a CC value for this connection and
	793	* check whether CC or CCnew should be used.
	794	*/
	795	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
	796	taop = &tao_noncached;
	797	bzero(taop, sizeof(*taop));
	798	}
	799
	800	tp->cc_send = CC_INC(tcp_ccgen);
	801	if (taop->tao_ccsent != 0 &&
	802	CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
	803	taop->tao_ccsent = tp->cc_send;
	804	} else {
	805	taop->tao_ccsent = 0;
	806	tp->t_flags \|= TF_SENDCCNEW;
	807	}
	808
	809	return (0);
	810	}
	811
	812	#ifdef SMP
	813
	814	struct netmsg_tcp_connect {
	815	struct lwkt_msg nm_lmsg;
	816	struct tcpcb *nm_tp;
	817	struct sockaddr_in *nm_sin;
	818	struct sockaddr_in *nm_ifsin;
	819	};
	820
	821	static int
	822	tcp_connect_handler(lwkt_msg_t lmsg)
	823	{
	824	struct netmsg_tcp_connect msg = (void )lmsg;
	825	int error;
	826
	827	error = tcp_connect_oncpu(msg->nm_tp, msg->nm_sin, msg->nm_ifsin);
	828	lwkt_replymsg(lmsg, error);
	829	return(EASYNC);
	830	}
	831
	832	#endif
	833
	834	/*
	835	* Common subroutine to open a TCP connection to remote host specified
	836	* by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
	837	* port number if needed. Call in_pcbladdr to do the routing and to choose
	838	* a local host address (interface). If there is an existing incarnation
	839	* of the same connection in TIME-WAIT state and if the remote host was
	840	* sending CC options and if the connection duration was < MSL, then
	841	* truncate the previous TIME-WAIT state and proceed.
	842	* Initialize connection parameters and enter SYN-SENT state.
	843	*/
	844	static int
	845	tcp_connect(struct tcpcb tp, struct sockaddr nam, struct thread *td)
	846	{
	847	struct inpcb *inp = tp->t_inpcb;
	848	struct sockaddr_in sin = (struct sockaddr_in )nam;
	849	struct sockaddr_in *if_sin;
	850	int error;
	851	#ifdef SMP
	852	lwkt_port_t port;
	853	#endif
	854
	855	if (inp->inp_lport == 0) {
	856	error = in_pcbbind(inp, (struct sockaddr *)NULL, td);
	857	if (error)
	858	return (error);
	859	}
	860
	861	/*
	862	* Cannot simply call in_pcbconnect, because there might be an
	863	* earlier incarnation of this same connection still in
	864	* TIME_WAIT state, creating an ADDRINUSE error.
	865	*/
	866	error = in_pcbladdr(inp, nam, &if_sin);
	867	if (error)
	868	return (error);
	869
	870	#ifdef SMP
	871	port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port,
	872	inp->inp_laddr.s_addr ?
	873	inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr,
	874	inp->inp_lport);
	875
	876	if (port->mp_td != curthread) {
	877	struct netmsg_tcp_connect msg;
	878
	879	lwkt_initmsg(&msg.nm_lmsg, &curthread->td_msgport, 0,
	880	lwkt_cmd_func(tcp_connect_handler), lwkt_cmd_op_none);
	881	msg.nm_tp = tp;
	882	msg.nm_sin = sin;
	883	msg.nm_ifsin = if_sin;
	884	error = lwkt_domsg(port, &msg.nm_lmsg);
	885	} else
	886	#endif
	887	error = tcp_connect_oncpu(tp, sin, if_sin);
	888
	889	return (error);
	890	}
	891
	892	#ifdef INET6
	893	static int
	894	tcp6_connect(struct tcpcb tp, struct sockaddr nam, struct thread *td)
	895	{
	896	struct inpcb inp = tp->t_inpcb, oinp;
	897	struct socket *so = inp->inp_socket;
	898	struct tcpcb *otp;
	899	struct sockaddr_in6 sin6 = (struct sockaddr_in6 )nam;
	900	struct in6_addr *addr6;
	901	struct rmxp_tao *taop;
	902	struct rmxp_tao tao_noncached;
	903	int error;
	904
	905	if (inp->inp_lport == 0) {
	906	error = in6_pcbbind(inp, (struct sockaddr *)0, td);
	907	if (error)
	908	return error;
	909	}
	910
	911	/*
	912	* Cannot simply call in_pcbconnect, because there might be an
	913	* earlier incarnation of this same connection still in
	914	* TIME_WAIT state, creating an ADDRINUSE error.
	915	*/
	916	error = in6_pcbladdr(inp, nam, &addr6);
	917	if (error)
	918	return error;
	919	oinp = in6_pcblookup_hash(inp->inp_cpcbinfo,
	920	&sin6->sin6_addr, sin6->sin6_port,
	921	IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ?
	922	addr6 : &inp->in6p_laddr,
	923	inp->inp_lport, 0, NULL);
	924	if (oinp) {
	925	if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
	926	otp->t_state == TCPS_TIME_WAIT &&
	927	(ticks - otp->t_starttime) < tcp_msl &&
	928	(otp->t_flags & TF_RCVD_CC))
	929	otp = tcp_close(otp);
	930	else
	931	return (EADDRINUSE);
	932	}
	933	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
	934	inp->in6p_laddr = *addr6;
	935	inp->in6p_faddr = sin6->sin6_addr;
	936	inp->inp_fport = sin6->sin6_port;
	937	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != NULL)
	938	inp->in6p_flowinfo = sin6->sin6_flowinfo;
	939	in_pcbinsconnhash(inp);
	940
	941	/* Compute window scaling to request. */
	942	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	943	(TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
	944	tp->request_r_scale++;
	945
	946	soisconnecting(so);
	947	tcpstat.tcps_connattempt++;
	948	tp->t_state = TCPS_SYN_SENT;
	949	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	950	tp->iss = tcp_new_isn(tp);
	951	tp->t_bw_rtseq = tp->iss;
	952	tcp_sendseqinit(tp);
	953
	954	/*
	955	* Generate a CC value for this connection and
	956	* check whether CC or CCnew should be used.
	957	*/
	958	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
	959	taop = &tao_noncached;
	960	bzero(taop, sizeof(*taop));
	961	}
	962
	963	tp->cc_send = CC_INC(tcp_ccgen);
	964	if (taop->tao_ccsent != 0 &&
	965	CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
	966	taop->tao_ccsent = tp->cc_send;
	967	} else {
	968	taop->tao_ccsent = 0;
	969	tp->t_flags \|= TF_SENDCCNEW;
	970	}
	971
	972	return (0);
	973	}
	974	#endif /* INET6 */
	975
	976	/*
	977	* The new sockopt interface makes it possible for us to block in the
	978	* copyin/out step (if we take a page fault). Taking a page fault at
	979	* splnet() is probably a Bad Thing. (Since sockets and pcbs both now
	980	* use TSM, there probably isn't any need for this function to run at
	981	* splnet() any more. This needs more examination.)
	982	*/
	983	int
	984	tcp_ctloutput(so, sopt)
	985	struct socket *so;
	986	struct sockopt *sopt;
	987	{
	988	int error, opt, optval, s;
	989	struct inpcb *inp;
	990	struct tcpcb *tp;
	991
	992	error = 0;
	993	s = splnet(); /* XXX */
	994	inp = sotoinpcb(so);
	995	if (inp == NULL) {
	996	splx(s);
	997	return (ECONNRESET);
	998	}
	999	if (sopt->sopt_level != IPPROTO_TCP) {
	1000	#ifdef INET6
	1001	if (INP_CHECK_SOCKAF(so, AF_INET6))
	1002	error = ip6_ctloutput(so, sopt);
	1003	else
	1004	#endif /* INET6 */
	1005	error = ip_ctloutput(so, sopt);
	1006	splx(s);
	1007	return (error);
	1008	}
	1009	tp = intotcpcb(inp);
	1010
	1011	switch (sopt->sopt_dir) {
	1012	case SOPT_SET:
	1013	switch (sopt->sopt_name) {
	1014	case TCP_NODELAY:
	1015	case TCP_NOOPT:
	1016	error = sooptcopyin(sopt, &optval, sizeof optval,
	1017	sizeof optval);
	1018	if (error)
	1019	break;
	1020
	1021	switch (sopt->sopt_name) {
	1022	case TCP_NODELAY:
	1023	opt = TF_NODELAY;
	1024	break;
	1025	case TCP_NOOPT:
	1026	opt = TF_NOOPT;
	1027	break;
	1028	default:
	1029	opt = 0; /* dead code to fool gcc */
	1030	break;
	1031	}
	1032
	1033	if (optval)
	1034	tp->t_flags \|= opt;
	1035	else
	1036	tp->t_flags &= ~opt;
	1037	break;
	1038
	1039	case TCP_NOPUSH:
	1040	error = sooptcopyin(sopt, &optval, sizeof optval,
	1041	sizeof optval);
	1042	if (error)
	1043	break;
	1044
	1045	if (optval)
	1046	tp->t_flags \|= TF_NOPUSH;
	1047	else {
	1048	tp->t_flags &= ~TF_NOPUSH;
	1049	error = tcp_output(tp);
	1050	}
	1051	break;
	1052
	1053	case TCP_MAXSEG:
	1054	error = sooptcopyin(sopt, &optval, sizeof optval,
	1055	sizeof optval);
	1056	if (error)
	1057	break;
	1058
	1059	if (optval > 0 && optval <= tp->t_maxseg)
	1060	tp->t_maxseg = optval;
	1061	else
	1062	error = EINVAL;
	1063	break;
	1064
	1065	default:
	1066	error = ENOPROTOOPT;
	1067	break;
	1068	}
	1069	break;
	1070
	1071	case SOPT_GET:
	1072	switch (sopt->sopt_name) {
	1073	case TCP_NODELAY:
	1074	optval = tp->t_flags & TF_NODELAY;
	1075	break;
	1076	case TCP_MAXSEG:
	1077	optval = tp->t_maxseg;
	1078	break;
	1079	case TCP_NOOPT:
	1080	optval = tp->t_flags & TF_NOOPT;
	1081	break;
	1082	case TCP_NOPUSH:
	1083	optval = tp->t_flags & TF_NOPUSH;
	1084	break;
	1085	default:
	1086	error = ENOPROTOOPT;
	1087	break;
	1088	}
	1089	if (error == 0)
	1090	error = sooptcopyout(sopt, &optval, sizeof optval);
	1091	break;
	1092	}
	1093	splx(s);
	1094	return (error);
	1095	}
	1096
	1097	/*
	1098	* tcp_sendspace and tcp_recvspace are the default send and receive window
	1099	* sizes, respectively. These are obsolescent (this information should
	1100	* be set by the route).
	1101	*/
	1102	u_long tcp_sendspace = 1024*32;
	1103	SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
	1104	&tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
	1105	u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */
	1106	SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
	1107	&tcp_recvspace , 0, "Maximum incoming TCP datagram size");
	1108
	1109	/*
	1110	* Attach TCP protocol to socket, allocating
	1111	* internet protocol control block, tcp control block,
	1112	* bufer space, and entering LISTEN state if to accept connections.
	1113	*/
	1114	static int
	1115	tcp_attach(struct socket so, struct pru_attach_info ai)
	1116	{
	1117	struct tcpcb *tp;
	1118	struct inpcb *inp;
	1119	int error;
	1120	int cpu;
	1121	#ifdef INET6
	1122	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL;
	1123	#endif
	1124
	1125	if (so->so_snd.sb_hiwat == 0 \|\| so->so_rcv.sb_hiwat == 0) {
	1126	error = soreserve(so, tcp_sendspace, tcp_recvspace,
	1127	ai->sb_rlimit);
	1128	if (error)
	1129	return (error);
	1130	}
	1131	cpu = mycpu->gd_cpuid;
	1132	error = in_pcballoc(so, &tcbinfo[cpu]);
	1133	if (error)
	1134	return (error);
	1135	inp = sotoinpcb(so);
	1136	#ifdef INET6
	1137	if (isipv6) {
	1138	inp->inp_vflag \|= INP_IPV6;
	1139	inp->in6p_hops = -1; /* use kernel default */
	1140	}
	1141	else
	1142	#endif
	1143	inp->inp_vflag \|= INP_IPV4;
	1144	tp = tcp_newtcpcb(inp);
	1145	if (tp == 0) {
	1146	int nofd = so->so_state & SS_NOFDREF; /* XXX */
	1147
	1148	so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
	1149	#ifdef INET6
	1150	if (isipv6)
	1151	in6_pcbdetach(inp);
	1152	else
	1153	#endif
	1154	in_pcbdetach(inp);
	1155	so->so_state \|= nofd;
	1156	return (ENOBUFS);
	1157	}
	1158	tp->t_state = TCPS_CLOSED;
	1159	return (0);
	1160	}
	1161
	1162	/*
	1163	* Initiate (or continue) disconnect.
	1164	* If embryonic state, just send reset (once).
	1165	* If in ``let data drain'' option and linger null, just drop.
	1166	* Otherwise (hard), mark socket disconnecting and drop
	1167	* current input data; switch states based on user close, and
	1168	* send segment to peer (with FIN).
	1169	*/
	1170	static struct tcpcb *
	1171	tcp_disconnect(tp)
	1172	struct tcpcb *tp;
	1173	{
	1174	struct socket *so = tp->t_inpcb->inp_socket;
	1175
	1176	if (tp->t_state < TCPS_ESTABLISHED)
	1177	tp = tcp_close(tp);
	1178	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
	1179	tp = tcp_drop(tp, 0);
	1180	else {
	1181	soisdisconnecting(so);
	1182	sbflush(&so->so_rcv);
	1183	tp = tcp_usrclosed(tp);
	1184	if (tp)
	1185	(void) tcp_output(tp);
	1186	}
	1187	return (tp);
	1188	}
	1189
	1190	/*
	1191	* User issued close, and wish to trail through shutdown states:
	1192	* if never received SYN, just forget it. If got a SYN from peer,
	1193	* but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
	1194	* If already got a FIN from peer, then almost done; go to LAST_ACK
	1195	* state. In all other cases, have already sent FIN to peer (e.g.
	1196	* after PRU_SHUTDOWN), and just have to play tedious game waiting
	1197	* for peer to send FIN or not respond to keep-alives, etc.
	1198	* We can let the user exit from the close as soon as the FIN is acked.
	1199	*/
	1200	static struct tcpcb *
	1201	tcp_usrclosed(tp)
	1202	struct tcpcb *tp;
	1203	{
	1204
	1205	switch (tp->t_state) {
	1206
	1207	case TCPS_CLOSED:
	1208	case TCPS_LISTEN:
	1209	tp->t_state = TCPS_CLOSED;
	1210	tp = tcp_close(tp);
	1211	break;
	1212
	1213	case TCPS_SYN_SENT:
	1214	case TCPS_SYN_RECEIVED:
	1215	tp->t_flags \|= TF_NEEDFIN;
	1216	break;
	1217
	1218	case TCPS_ESTABLISHED:
	1219	tp->t_state = TCPS_FIN_WAIT_1;
	1220	break;
	1221
	1222	case TCPS_CLOSE_WAIT:
	1223	tp->t_state = TCPS_LAST_ACK;
	1224	break;
	1225	}
	1226	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
	1227	soisdisconnected(tp->t_inpcb->inp_socket);
	1228	/* To prevent the connection hanging in FIN_WAIT_2 forever. */
	1229	if (tp->t_state == TCPS_FIN_WAIT_2)
	1230	callout_reset(tp->tt_2msl, tcp_maxidle,
	1231	tcp_timer_2msl, tp);
	1232	}
	1233	return (tp);
	1234	}