gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. All advertising materials mentioning features or use of this software
	14	* must display the following acknowledgement:
	15	* This product includes software developed by the University of
	16	* California, Berkeley and its contributors.
	17	* 4. Neither the name of the University nor the names of its contributors
	18	* may be used to endorse or promote products derived from this software
	19	* without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	22	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	24	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	25	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	26	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	27	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	28	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	29	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	30	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	31	* SUCH DAMAGE.
	32	*
	33	* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
	34	* $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
	35	* $DragonFly: src/sys/netinet/tcp_input.c,v 1.8 2003/08/13 18:34:25 hsu Exp $
	36	*/
	37
	38	#include "opt_ipfw.h" /* for ipfw_fwd */
	39	#include "opt_inet6.h"
	40	#include "opt_ipsec.h"
	41	#include "opt_tcpdebug.h"
	42	#include "opt_tcp_input.h"
	43
	44	#include <sys/param.h>
	45	#include <sys/systm.h>
	46	#include <sys/kernel.h>
	47	#include <sys/sysctl.h>
	48	#include <sys/malloc.h>
	49	#include <sys/mbuf.h>
	50	#include <sys/proc.h> /* for proc0 declaration */
	51	#include <sys/protosw.h>
	52	#include <sys/socket.h>
	53	#include <sys/socketvar.h>
	54	#include <sys/syslog.h>
	55
	56	#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
	57
	58	#include <net/if.h>
	59	#include <net/route.h>
	60
	61	#include <netinet/in.h>
	62	#include <netinet/in_systm.h>
	63	#include <netinet/ip.h>
	64	#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
	65	#include <netinet/in_var.h>
	66	#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
	67	#include <netinet/in_pcb.h>
	68	#include <netinet/ip_var.h>
	69	#include <netinet/ip6.h>
	70	#include <netinet/icmp6.h>
	71	#include <netinet6/nd6.h>
	72	#include <netinet6/ip6_var.h>
	73	#include <netinet6/in6_pcb.h>
	74	#include <netinet/tcp.h>
	75	#include <netinet/tcp_fsm.h>
	76	#include <netinet/tcp_seq.h>
	77	#include <netinet/tcp_timer.h>
	78	#include <netinet/tcp_var.h>
	79	#include <netinet6/tcp6_var.h>
	80	#include <netinet/tcpip.h>
	81	#ifdef TCPDEBUG
	82	#include <netinet/tcp_debug.h>
	83
	84	u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
	85	struct tcphdr tcp_savetcp;
	86	#endif /* TCPDEBUG */
	87
	88	#ifdef FAST_IPSEC
	89	#include <netipsec/ipsec.h>
	90	#include <netipsec/ipsec6.h>
	91	#endif
	92
	93	#ifdef IPSEC
	94	#include <netinet6/ipsec.h>
	95	#include <netinet6/ipsec6.h>
	96	#include <netproto/key/key.h>
	97	#endif /*IPSEC*/
	98
	99	#include <machine/in_cksum.h>
	100
	101	MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
	102
	103	static const int tcprexmtthresh = 3;
	104	tcp_cc tcp_ccgen;
	105
	106	struct tcpstat tcpstat;
	107	SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
	108	&tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
	109
	110	static int log_in_vain = 0;
	111	SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
	112	&log_in_vain, 0, "Log all incoming TCP connections");
	113
	114	static int blackhole = 0;
	115	SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
	116	&blackhole, 0, "Do not send RST when dropping refused connections");
	117
	118	int tcp_delack_enabled = 1;
	119	SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
	120	&tcp_delack_enabled, 0,
	121	"Delay ACK to try and piggyback it onto a data packet");
	122
	123	#ifdef TCP_DROP_SYNFIN
	124	static int drop_synfin = 0;
	125	SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
	126	&drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
	127	#endif
	128
	129	static int tcp_do_limitedtransmit = 1;
	130	SYSCTL_INT(_net_inet_tcp, OID_AUTO, limitedtransmit, CTLFLAG_RW,
	131	&tcp_do_limitedtransmit, 0, "Enable RFC 3042 (Limited Transmit)");
	132
	133	static int tcp_do_rfc3390 = 1;
	134	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
	135	&tcp_do_rfc3390, 0,
	136	"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
	137
	138	static int tcp_do_eifel_detect = 1;
	139	SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
	140	&tcp_do_eifel_detect, 0, "Eifel detection algorithm (RFC 3522)");
	141
	142	struct inpcbhead tcb;
	143	#define tcb6 tcb /* for KAME src sync over BSD's /
	144	struct inpcbinfo tcbinfo;
	145
	146	static void tcp_dooptions(struct tcpopt , u_char , int, int);
	147	static void tcp_pulloutofband(struct socket *,
	148	struct tcphdr , struct mbuf , int);
	149	static int tcp_reass(struct tcpcb , struct tcphdr , int *,
	150	struct mbuf *);
	151	static void tcp_xmit_timer(struct tcpcb *, int);
	152	static void tcp_newreno_partial_ack(struct tcpcb , struct tcphdr );
	153
	/*
	 * Upper-layer hint for IPv6 Neighbor Discovery / Neighbor Unreachability
	 * Detection.  When the connection has an IPv6 pcb with a cached route,
	 * pass that route to nd6_nud_hint().  Expands to nothing when the kernel
	 * is built without INET6.
	 */
	#ifdef INET6
	#define ND6_HINT(tp) \
	do { \
		if ((tp) != NULL && (tp)->t_inpcb != NULL && \
		    ((tp)->t_inpcb->inp_vflag & INP_IPV6) && \
		    (tp)->t_inpcb->in6p_route.ro_rt != NULL) { \
			nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
		} \
	} while (0)
	#else
	#define ND6_HINT(tp)
	#endif
	166
	/*
	 * Indicate whether this ack should be delayed.  We can delay the ack if
	 *	- delayed acks are enabled and
	 *	- there is no delayed ack timer in progress and
	 *	- our last ack wasn't a 0-sized window.  We never want to delay
	 *	  the ack that opens up a 0-sized window.
	 *
	 * The argument is parenthesized in the expansion so that any pointer
	 * expression (not just a plain identifier) can be passed safely.
	 */
	#define DELAY_ACK(tp) \
		(tcp_delack_enabled && !callout_pending((tp)->tt_delack) && \
		((tp)->t_flags & TF_RXWIN0SENT) == 0)
	177
	178	static int
	179	tcp_reass(tp, th, tlenp, m)
	180	struct tcpcb *tp;
	181	struct tcphdr *th;
	182	int *tlenp;
	183	struct mbuf *m;
	184	{
	185	struct tseg_qent *q;
	186	struct tseg_qent *p = NULL;
	187	struct tseg_qent *nq;
	188	struct tseg_qent *te;
	189	struct socket *so = tp->t_inpcb->inp_socket;
	190	int flags;
	191
	192	/*
	193	* Call with th==0 after become established to
	194	* force pre-ESTABLISHED data up to user socket.
	195	*/
	196	if (th == 0)
	197	goto present;
	198
	199	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	200	MALLOC(te, struct tseg_qent *, sizeof(struct tseg_qent), M_TSEGQ,
	201	M_NOWAIT);
	202	if (te == NULL) {
	203	tcpstat.tcps_rcvmemdrop++;
	204	m_freem(m);
	205	return (0);
	206	}
	207
	208	/*
	209	* Find a segment which begins after this one does.
	210	*/
	211	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
	212	if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
	213	break;
	214	p = q;
	215	}
	216
	217	/*
	218	* If there is a preceding segment, it may provide some of
	219	* our data already. If so, drop the data from the incoming
	220	* segment. If it provides all of our data, drop us.
	221	*/
	222	if (p != NULL) {
	223	int i;
	224	/* conversion to int (in i) handles seq wraparound */
	225	i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
	226	if (i > 0) {
	227	if (i >= *tlenp) {
	228	tcpstat.tcps_rcvduppack++;
	229	tcpstat.tcps_rcvdupbyte += *tlenp;
	230	m_freem(m);
	231	free(te, M_TSEGQ);
	232	/*
	233	* Try to present any queued data
	234	* at the left window edge to the user.
	235	* This is needed after the 3-WHS
	236	* completes.
	237	*/
	238	goto present; /* ??? */
	239	}
	240	m_adj(m, i);
	241	*tlenp -= i;
	242	th->th_seq += i;
	243	}
	244	}
	245	tcpstat.tcps_rcvoopack++;
	246	tcpstat.tcps_rcvoobyte += *tlenp;
	247
	248	/*
	249	* While we overlap succeeding segments trim them or,
	250	* if they are completely covered, dequeue them.
	251	*/
	252	while (q) {
	253	int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
	254	if (i <= 0)
	255	break;
	256	if (i < q->tqe_len) {
	257	q->tqe_th->th_seq += i;
	258	q->tqe_len -= i;
	259	m_adj(q->tqe_m, i);
	260	break;
	261	}
	262
	263	nq = LIST_NEXT(q, tqe_q);
	264	LIST_REMOVE(q, tqe_q);
	265	m_freem(q->tqe_m);
	266	free(q, M_TSEGQ);
	267	q = nq;
	268	}
	269
	270	/* Insert the new segment queue entry into place. */
	271	te->tqe_m = m;
	272	te->tqe_th = th;
	273	te->tqe_len = *tlenp;
	274
	275	if (p == NULL) {
	276	LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	277	} else {
	278	LIST_INSERT_AFTER(p, te, tqe_q);
	279	}
	280
	281	present:
	282	/*
	283	* Present data to user, advancing rcv_nxt through
	284	* completed sequence space.
	285	*/
	286	if (!TCPS_HAVEESTABLISHED(tp->t_state))
	287	return (0);
	288	q = LIST_FIRST(&tp->t_segq);
	289	if (!q \|\| q->tqe_th->th_seq != tp->rcv_nxt)
	290	return (0);
	291	do {
	292	tp->rcv_nxt += q->tqe_len;
	293	flags = q->tqe_th->th_flags & TH_FIN;
	294	nq = LIST_NEXT(q, tqe_q);
	295	LIST_REMOVE(q, tqe_q);
	296	if (so->so_state & SS_CANTRCVMORE)
	297	m_freem(q->tqe_m);
	298	else
	299	sbappend(&so->so_rcv, q->tqe_m);
	300	free(q, M_TSEGQ);
	301	q = nq;
	302	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	303	ND6_HINT(tp);
	304	sorwakeup(so);
	305	return (flags);
	306	}
	307
	308	/*
	309	* TCP input routine, follows pages 65-76 of the
	310	* protocol specification dated September, 1981 very closely.
	311	*/
	#ifdef INET6
	/*
	 * IPv6 entry point for TCP input.
	 *
	 *	mp	address of the pointer to the received mbuf chain.
	 *	offp	pointer to the offset of the TCP header within the packet.
	 *	proto	protocol number, passed through to tcp_input().
	 *
	 * Segments addressed to an anycast address are answered with an ICMPv6
	 * "address unreachable" error; everything else is handed to tcp_input().
	 * Every return statement in this function returns IPPROTO_DONE.
	 */
	int
	tcp6_input(struct mbuf **mp, int *offp, int proto)
	{
		struct mbuf *m = *mp;
		struct in6_ifaddr *ia6;

		/*
		 * NOTE(review): presumably makes the macro return IPPROTO_DONE from
		 * this function when the TCP header is not available -- confirm
		 * against the IP6_EXTHDR_CHECK definition.
		 */
		IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

		/*
		 * draft-itojun-ipv6-tcp-to-anycast
		 * better place to put this in?
		 */
		ia6 = ip6_getdstifaddr(m);
		if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
			struct ip6_hdr *ip6;

			ip6 = mtod(m, struct ip6_hdr *);
			/*
			 * The last argument is the byte offset of the destination
			 * address within the IPv6 header.  m is not used again
			 * here; presumably icmp6_error() takes ownership of it --
			 * confirm.
			 */
			icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
			return IPPROTO_DONE;
		}

		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}
	#endif
	341
	342	void
	343	tcp_input(m, off0, proto)
	344	struct mbuf *m;
	345	int off0, proto;
	346	{
	347	struct tcphdr *th;
	348	struct ip *ip = NULL;
	349	struct ipovly *ipov;
	350	struct inpcb *inp = NULL;
	351	u_char *optp = NULL;
	352	int optlen = 0;
	353	int len, tlen, off;
	354	int drop_hdrlen;
	355	struct tcpcb *tp = NULL;
	356	int thflags;
	357	struct socket *so = 0;
	358	int todrop, acked, ourfinisacked, needoutput = 0;
	359	u_long tiwin;
	360	struct tcpopt to; /* options in this segment */
	361	struct rmxp_tao taop; / pointer to our TAO cache entry */
	362	struct rmxp_tao tao_noncached; /* in case there's no cached entry */
	363	struct sockaddr_in *next_hop = NULL;
	364	int rstreason; /* For badport_bandlim accounting purposes */
	365	int useTS; /* use timestamps in Eifel detection */
	366	struct ip6_hdr *ip6 = NULL;
	367	#ifdef INET6
	368	int isipv6;
	369	#else
	370	const int isipv6 = 0;
	371	#endif
	372	#ifdef TCPDEBUG
	373	short ostate = 0;
	374	#endif
	375
	376	/* Grab info from MT_TAG mbufs prepended to the chain. */
	377	for (;m && m->m_type == MT_TAG; m = m->m_next) {
	378	if (m->_m_tag_id == PACKET_TAG_IPFORWARD)
	379	next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
	380	}
	381	#ifdef INET6
	382	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
	383	#endif
	384	bzero((char *)&to, sizeof(to));
	385
	386	tcpstat.tcps_rcvtotal++;
	387
	388	if (isipv6) {
	389	/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
	390	ip6 = mtod(m, struct ip6_hdr *);
	391	tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
	392	if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
	393	tcpstat.tcps_rcvbadsum++;
	394	goto drop;
	395	}
	396	th = (struct tcphdr *)((caddr_t)ip6 + off0);
	397
	398	/*
	399	* Be proactive about unspecified IPv6 address in source.
	400	* As we use all-zero to indicate unbounded/unconnected pcb,
	401	* unspecified IPv6 address can be used to confuse us.
	402	*
	403	* Note that packets with unspecified IPv6 destination is
	404	* already dropped in ip6_input.
	405	*/
	406	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
	407	/* XXX stat */
	408	goto drop;
	409	}
	410	} else {
	411	/*
	412	* Get IP and TCP header together in first mbuf.
	413	* Note: IP leaves IP header in first mbuf.
	414	*/
	415	if (off0 > sizeof(struct ip)) {
	416	ip_stripoptions(m, (struct mbuf *)0);
	417	off0 = sizeof(struct ip);
	418	}
	419	if (m->m_len < sizeof(struct tcpiphdr)) {
	420	if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) {
	421	tcpstat.tcps_rcvshort++;
	422	return;
	423	}
	424	}
	425	ip = mtod(m, struct ip *);
	426	ipov = (struct ipovly *)ip;
	427	th = (struct tcphdr *)((caddr_t)ip + off0);
	428	tlen = ip->ip_len;
	429
	430	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
	431	if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
	432	th->th_sum = m->m_pkthdr.csum_data;
	433	else
	434	th->th_sum = in_pseudo(ip->ip_src.s_addr,
	435	ip->ip_dst.s_addr,
	436	htonl(m->m_pkthdr.csum_data +
	437	ip->ip_len +
	438	IPPROTO_TCP));
	439	th->th_sum ^= 0xffff;
	440	} else {
	441	/*
	442	* Checksum extended TCP header and data.
	443	*/
	444	len = sizeof(struct ip) + tlen;
	445	bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
	446	ipov->ih_len = (u_short)tlen;
	447	ipov->ih_len = htons(ipov->ih_len);
	448	th->th_sum = in_cksum(m, len);
	449	}
	450	if (th->th_sum) {
	451	tcpstat.tcps_rcvbadsum++;
	452	goto drop;
	453	}
	454	#ifdef INET6
	455	/* Re-initialization for later version check */
	456	ip->ip_v = IPVERSION;
	457	#endif
	458	}
	459
	460	/*
	461	* Check that TCP offset makes sense,
	462	* pull out TCP options and adjust length. XXX
	463	*/
	464	off = th->th_off << 2;
	465	if (off < sizeof(struct tcphdr) \|\| off > tlen) {
	466	tcpstat.tcps_rcvbadoff++;
	467	goto drop;
	468	}
	469	tlen -= off; /* tlen is used instead of ti->ti_len */
	470	if (off > sizeof(struct tcphdr)) {
	471	if (isipv6) {
	472	IP6_EXTHDR_CHECK(m, off0, off, );
	473	ip6 = mtod(m, struct ip6_hdr *);
	474	th = (struct tcphdr *)((caddr_t)ip6 + off0);
	475	} else {
	476	if (m->m_len < sizeof(struct ip) + off) {
	477	if ((m = m_pullup(m, sizeof(struct ip) + off))
	478	== 0) {
	479	tcpstat.tcps_rcvshort++;
	480	return;
	481	}
	482	ip = mtod(m, struct ip *);
	483	ipov = (struct ipovly *)ip;
	484	th = (struct tcphdr *)((caddr_t)ip + off0);
	485	}
	486	}
	487	optlen = off - sizeof(struct tcphdr);
	488	optp = (u_char *)(th + 1);
	489	}
	490	thflags = th->th_flags;
	491
	492	#ifdef TCP_DROP_SYNFIN
	493	/*
	494	* If the drop_synfin option is enabled, drop all packets with
	495	* both the SYN and FIN bits set. This prevents e.g. nmap from
	496	* identifying the TCP/IP stack.
	497	*
	498	* This is a violation of the TCP specification.
	499	*/
	500	if (drop_synfin && (thflags & (TH_SYN\|TH_FIN)) == (TH_SYN\|TH_FIN))
	501	goto drop;
	502	#endif
	503
	504	/*
	505	* Convert TCP protocol specific fields to host format.
	506	*/
	507	th->th_seq = ntohl(th->th_seq);
	508	th->th_ack = ntohl(th->th_ack);
	509	th->th_win = ntohs(th->th_win);
	510	th->th_urp = ntohs(th->th_urp);
	511
	512	/*
	513	* Delay droping TCP, IP headers, IPv6 ext headers, and TCP options,
	514	* until after ip6_savecontrol() is called and before other functions
	515	* which don't want those proto headers.
	516	* Because ip6_savecontrol() is going to parse the mbuf to
	517	* search for data to be passed up to user-land, it wants mbuf
	518	* parameters to be unchanged.
	519	* XXX: the call of ip6_savecontrol() has been obsoleted based on
	520	* latest version of the advanced API (20020110).
	521	*/
	522	drop_hdrlen = off0 + off;
	523
	524	/*
	525	* Locate pcb for segment.
	526	*/
	527	findpcb:
	528	/* IPFIREWALL_FORWARD section */
	529	if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */
	530	/*
	531	* Transparently forwarded. Pretend to be the destination.
	532	* already got one like this?
	533	*/
	534	inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
	535	ip->ip_dst, th->th_dport,
	536	0, m->m_pkthdr.rcvif);
	537	if (!inp) {
	538	/* It's new. Try find the ambushing socket. */
	539	inp = in_pcblookup_hash(&tcbinfo,
	540	ip->ip_src, th->th_sport,
	541	next_hop->sin_addr,
	542	next_hop->sin_port ?
	543	ntohs(next_hop->sin_port) :
	544	th->th_dport,
	545	1, m->m_pkthdr.rcvif);
	546	}
	547	} else {
	548	if (isipv6)
	549	inp = in6_pcblookup_hash(&tcbinfo,
	550	&ip6->ip6_src, th->th_sport,
	551	&ip6->ip6_dst, th->th_dport,
	552	1, m->m_pkthdr.rcvif);
	553	else
	554	inp = in_pcblookup_hash(&tcbinfo,
	555	ip->ip_src, th->th_sport,
	556	ip->ip_dst, th->th_dport,
	557	1, m->m_pkthdr.rcvif);
	558	}
	559
	560	#ifdef IPSEC
	561	if (isipv6) {
	562	if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
	563	ipsec6stat.in_polvio++;
	564	goto drop;
	565	}
	566	} else {
	567	if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
	568	ipsecstat.in_polvio++;
	569	goto drop;
	570	}
	571	}
	572	#endif
	573	#ifdef FAST_IPSEC
	574	if (isipv6) {
	575	if (inp != NULL && ipsec6_in_reject(m, inp)) {
	576	goto drop;
	577	}
	578	} else {
	579	if (inp != NULL && ipsec4_in_reject(m, inp)) {
	580	goto drop;
	581	}
	582	}
	583	#endif
	584
	585	/*
	586	* If the state is CLOSED (i.e., TCB does not exist) then
	587	* all data in the incoming segment is discarded.
	588	* If the TCB exists but is in CLOSED state, it is embryonic,
	589	* but should either do a listen or a connect soon.
	590	*/
	591	if (inp == NULL) {
	592	if (log_in_vain) {
	593	#ifdef INET6
	594	char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2];
	595	#else
	596	char dbuf[4sizeof "123"], sbuf[4sizeof "123"];
	597	#endif
	598	if (isipv6) {
	599	strcpy(dbuf, "[");
	600	strcpy(sbuf, "[");
	601	strcat(dbuf, ip6_sprintf(&ip6->ip6_dst));
	602	strcat(sbuf, ip6_sprintf(&ip6->ip6_src));
	603	strcat(dbuf, "]");
	604	strcat(sbuf, "]");
	605	} else {
	606	strcpy(dbuf, inet_ntoa(ip->ip_dst));
	607	strcpy(sbuf, inet_ntoa(ip->ip_src));
	608	}
	609	switch (log_in_vain) {
	610	case 1:
	611	if ((thflags & TH_SYN) == 0)
	612	break;
	613	case 2:
	614	log(LOG_INFO,
	615	"Connection attempt to TCP %s:%d "
	616	"from %s:%d flags:0x%02x\n",
	617	dbuf, ntohs(th->th_dport), sbuf,
	618	ntohs(th->th_sport), thflags);
	619	break;
	620	default:
	621	break;
	622	}
	623	}
	624	if (blackhole) {
	625	switch (blackhole) {
	626	case 1:
	627	if (thflags & TH_SYN)
	628	goto drop;
	629	break;
	630	case 2:
	631	goto drop;
	632	default:
	633	goto drop;
	634	}
	635	}
	636	rstreason = BANDLIM_RST_CLOSEDPORT;
	637	goto dropwithreset;
	638	}
	639	tp = intotcpcb(inp);
	640	if (tp == NULL) {
	641	rstreason = BANDLIM_RST_CLOSEDPORT;
	642	goto dropwithreset;
	643	}
	644	if (tp->t_state == TCPS_CLOSED)
	645	goto drop;
	646
	647	/* Unscale the window into a 32-bit value. */
	648	if ((thflags & TH_SYN) == 0)
	649	tiwin = th->th_win << tp->snd_scale;
	650	else
	651	tiwin = th->th_win;
	652
	653	so = inp->inp_socket;
	654	if (so->so_options & (SO_DEBUG\|SO_ACCEPTCONN)) {
	655	struct in_conninfo inc;
	656	#ifdef TCPDEBUG
	657	if (so->so_options & SO_DEBUG) {
	658	ostate = tp->t_state;
	659	if (isipv6)
	660	bcopy((char )ip6, (char )tcp_saveipgen,
	661	sizeof(*ip6));
	662	else
	663	bcopy((char )ip, (char )tcp_saveipgen,
	664	sizeof(*ip));
	665	tcp_savetcp = *th;
	666	}
	667	#endif
	668	/* skip if this isn't a listen socket */
	669	if ((so->so_options & SO_ACCEPTCONN) == 0)
	670	goto after_listen;
	671	#ifdef INET6
	672	inc.inc_isipv6 = isipv6;
	673	#endif
	674	if (isipv6) {
	675	inc.inc6_faddr = ip6->ip6_src;
	676	inc.inc6_laddr = ip6->ip6_dst;
	677	inc.inc6_route.ro_rt = NULL; /* XXX */
	678	} else {
	679	inc.inc_faddr = ip->ip_src;
	680	inc.inc_laddr = ip->ip_dst;
	681	inc.inc_route.ro_rt = NULL; /* XXX */
	682	}
	683	inc.inc_fport = th->th_sport;
	684	inc.inc_lport = th->th_dport;
	685
	686	/*
	687	* If the state is LISTEN then ignore segment if it contains
	688	* a RST. If the segment contains an ACK then it is bad and
	689	* send a RST. If it does not contain a SYN then it is not
	690	* interesting; drop it.
	691	*
	692	* If the state is SYN_RECEIVED (syncache) and seg contains
	693	* an ACK, but not for our SYN/ACK, send a RST. If the seg
	694	* contains a RST, check the sequence number to see if it
	695	* is a valid reset segment.
	696	*/
	697	if ((thflags & (TH_RST\|TH_ACK\|TH_SYN)) != TH_SYN) {
	698	if ((thflags & (TH_RST\|TH_ACK\|TH_SYN)) == TH_ACK) {
	699	if (!syncache_expand(&inc, th, &so, m)) {
	700	/*
	701	* No syncache entry, or ACK was not
	702	* for our SYN/ACK. Send a RST.
	703	*/
	704	tcpstat.tcps_badsyn++;
	705	rstreason = BANDLIM_RST_OPENPORT;
	706	goto dropwithreset;
	707	}
	708	if (so == NULL)
	709	/*
	710	* Could not complete 3-way handshake,
	711	* connection is being closed down, and
	712	* syncache will free mbuf.
	713	*/
	714	return;
	715	/*
	716	* Socket is created in state SYN_RECEIVED.
	717	* Continue processing segment.
	718	*/
	719	inp = sotoinpcb(so);
	720	tp = intotcpcb(inp);
	721	/*
	722	* This is what would have happened in
	723	* tcp_output() when the SYN,ACK was sent.
	724	*/
	725	tp->snd_up = tp->snd_una;
	726	tp->snd_max = tp->snd_nxt = tp->iss + 1;
	727	tp->last_ack_sent = tp->rcv_nxt;
	728	/*
	729	* XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled
	730	* until the _second_ ACK is received:
	731	* rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window.
	732	* rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale,
	733	* move to ESTAB, set snd_wnd to tiwin.
	734	*/
	735	tp->snd_wnd = tiwin; /* unscaled */
	736	goto after_listen;
	737	}
	738	if (thflags & TH_RST) {
	739	syncache_chkrst(&inc, th);
	740	goto drop;
	741	}
	742	if (thflags & TH_ACK) {
	743	syncache_badack(&inc);
	744	tcpstat.tcps_badsyn++;
	745	rstreason = BANDLIM_RST_OPENPORT;
	746	goto dropwithreset;
	747	}
	748	goto drop;
	749	}
	750
	751	/*
	752	* Segment's flags are (SYN) or (SYN\|FIN).
	753	*/
	754	#ifdef INET6
	755	/*
	756	* If deprecated address is forbidden,
	757	* we do not accept SYN to deprecated interface
	758	* address to prevent any new inbound connection from
	759	* getting established.
	760	* When we do not accept SYN, we send a TCP RST,
	761	* with deprecated source address (instead of dropping
	762	* it). We compromise it as it is much better for peer
	763	* to send a RST, and RST will be the final packet
	764	* for the exchange.
	765	*
	766	* If we do not forbid deprecated addresses, we accept
	767	* the SYN packet. RFC2462 does not suggest dropping
	768	* SYN in this case.
	769	* If we decipher RFC2462 5.5.4, it says like this:
	770	* 1. use of deprecated addr with existing
	771	* communication is okay - "SHOULD continue to be
	772	* used"
	773	* 2. use of it with new communication:
	774	* (2a) "SHOULD NOT be used if alternate address
	775	* with sufficient scope is available"
	776	* (2b) nothing mentioned otherwise.
	777	* Here we fall into (2b) case as we have no choice in
	778	* our source address selection - we must obey the peer.
	779	*
	780	* The wording in RFC2462 is confusing, and there are
	781	* multiple description text for deprecated address
	782	* handling - worse, they are not exactly the same.
	783	* I believe 5.5.4 is the best one, so we follow 5.5.4.
	784	*/
	785	if (isipv6 && !ip6_use_deprecated) {
	786	struct in6_ifaddr *ia6;
	787
	788	if ((ia6 = ip6_getdstifaddr(m)) &&
	789	(ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
	790	tp = NULL;
	791	rstreason = BANDLIM_RST_OPENPORT;
	792	goto dropwithreset;
	793	}
	794	}
	795	#endif
	796	/*
	797	* If it is from this socket, drop it, it must be forged.
	798	* Don't bother responding if the destination was a broadcast.
	799	*/
	800	if (th->th_dport == th->th_sport) {
	801	if (isipv6) {
	802	if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
	803	&ip6->ip6_src))
	804	goto drop;
	805	} else {
	806	if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
	807	goto drop;
	808	}
	809	}
	810	/*
	811	* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
	812	*
	813	* Note that it is quite possible to receive unicast
	814	* link-layer packets with a broadcast IP address. Use
	815	* in_broadcast() to find them.
	816	*/
	817	if (m->m_flags & (M_BCAST\|M_MCAST))
	818	goto drop;
	819	if (isipv6) {
	820	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) \|\|
	821	IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
	822	goto drop;
	823	} else {
	824	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) \|\|
	825	IN_MULTICAST(ntohl(ip->ip_src.s_addr)) \|\|
	826	ip->ip_src.s_addr == htonl(INADDR_BROADCAST) \|\|
	827	in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
	828	goto drop;
	829	}
	830	/*
	831	* SYN appears to be valid; create compressed TCP state
	832	* for syncache, or perform t/tcp connection.
	833	*/
	834	if (so->so_qlen <= so->so_qlimit) {
	835	tcp_dooptions(&to, optp, optlen, 1);
	836	if (!syncache_add(&inc, &to, th, &so, m))
	837	goto drop;
	838	if (so == NULL)
	839	/*
	840	* Entry added to syncache, mbuf used to
	841	* send SYN,ACK packet.
	842	*/
	843	return;
	844	/*
	845	* Segment passed TAO tests.
	846	*/
	847	inp = sotoinpcb(so);
	848	tp = intotcpcb(inp);
	849	tp->snd_wnd = tiwin;
	850	tp->t_starttime = ticks;
	851	tp->t_state = TCPS_ESTABLISHED;
	852
	853	/*
	854	* If there is a FIN, or if there is data and the
	855	* connection is local, then delay SYN,ACK(SYN) in
	856	* the hope of piggy-backing it on a response
	857	* segment. Otherwise must send ACK now in case
	858	* the other side is slow starting.
	859	*/
	860	if (DELAY_ACK(tp) &&
	861	((thflags & TH_FIN) \|\|
	862	(tlen != 0 &&
	863	((isipv6 && in6_localaddr(&inp->in6p_faddr)) \|\|
	864	(!isipv6 && in_localaddr(inp->inp_faddr)))))) {
	865	callout_reset(tp->tt_delack, tcp_delacktime,
	866	tcp_timer_delack, tp);
	867	tp->t_flags \|= TF_NEEDSYN;
	868	} else
	869	tp->t_flags \|= (TF_ACKNOW \| TF_NEEDSYN);
	870
	871	tcpstat.tcps_connects++;
	872	soisconnected(so);
	873	goto trimthenstep6;
	874	}
	875	goto drop;
	876	}
	877	after_listen:
	878
	879	/* XXX temp debugging */
	880	/* should not happen - syncache should pick up these connections */
	881	if (tp->t_state == TCPS_LISTEN)
	882	panic("tcp_input: TCPS_LISTEN");
	883
	884	/*
	885	* Segment received on connection.
	886	* Reset idle time and keep-alive timer.
	887	*/
	888	tp->t_rcvtime = ticks;
	889	if (TCPS_HAVEESTABLISHED(tp->t_state))
	890	callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
	891
	892	/*
	893	* Process options.
	894	* XXX this is tradtitional behavior, may need to be cleaned up.
	895	*/
	896	tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
	897	if (thflags & TH_SYN) {
	898	if (to.to_flags & TOF_SCALE) {
	899	tp->t_flags \|= TF_RCVD_SCALE;
	900	tp->requested_s_scale = to.to_requested_s_scale;
	901	}
	902	if (to.to_flags & TOF_TS) {
	903	tp->t_flags \|= TF_RCVD_TSTMP;
	904	tp->ts_recent = to.to_tsval;
	905	tp->ts_recent_age = ticks;
	906	}
	907	if (to.to_flags & (TOF_CC\|TOF_CCNEW))
	908	tp->t_flags \|= TF_RCVD_CC;
	909	if (to.to_flags & TOF_MSS)
	910	tcp_mss(tp, to.to_mss);
	911	}
	912
	913	/*
	914	* Header prediction: check for the two common cases
	915	* of a uni-directional data xfer. If the packet has
	916	* no control flags, is in-sequence, the window didn't
	917	* change and we're not retransmitting, it's a
	918	* candidate. If the length is zero and the ack moved
	919	* forward, we're the sender side of the xfer. Just
	920	* free the data acked & wake any higher level process
	921	* that was blocked waiting for space. If the length
	922	* is non-zero and the ack didn't move, we're the
	923	* receiver side. If we're getting packets in-order
	924	* (the reassembly queue is empty), add the data to
	925	* the socket buffer and note that we need a delayed ack.
	926	* Make sure that the hidden state-flags are also off.
	927	* Since we check for TCPS_ESTABLISHED above, it can only
	928	* be TH_NEEDSYN.
	929	*/
	930	if (tp->t_state == TCPS_ESTABLISHED &&
	931	(thflags & (TH_SYN\|TH_FIN\|TH_RST\|TH_URG\|TH_ACK)) == TH_ACK &&
	932	((tp->t_flags & (TF_NEEDSYN\|TF_NEEDFIN)) == 0) &&
	933	((to.to_flags & TOF_TS) == 0 \|\|
	934	TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
	935	/*
	936	* Using the CC option is compulsory if once started:
	937	* the segment is OK if no T/TCP was negotiated or
	938	* if the segment has a CC option equal to CCrecv
	939	*/
	940	((tp->t_flags & (TF_REQ_CC\|TF_RCVD_CC)) != (TF_REQ_CC\|TF_RCVD_CC) \|\|
	941	((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
	942	th->th_seq == tp->rcv_nxt &&
	943	tiwin && tiwin == tp->snd_wnd &&
	944	tp->snd_nxt == tp->snd_max) {
	945
	946	/*
	947	* If last ACK falls within this segment's sequence numbers,
	948	* record the timestamp.
	949	* NOTE that the test is modified according to the latest
	950	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
	951	*/
	952	if ((to.to_flags & TOF_TS) != 0 &&
	953	SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
	954	tp->ts_recent_age = ticks;
	955	tp->ts_recent = to.to_tsval;
	956	}
	957
	958	if (tlen == 0) {
	959	if (SEQ_GT(th->th_ack, tp->snd_una) &&
	960	SEQ_LEQ(th->th_ack, tp->snd_max) &&
	961	tp->snd_cwnd >= tp->snd_wnd &&
	962	((!tcp_do_newreno &&
	963	tp->t_dupacks < tcprexmtthresh) \|\|
	964	(tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
	965	/*
	966	* this is a pure ack for outstanding data.
	967	*/
	968	++tcpstat.tcps_predack;
	969	/*
	970	* "bad retransmit" recovery
	971	*/
	972	useTS = tcp_do_eifel_detect &&
	973	(to.to_flags & TOF_TS) &&
	974	to.to_tsecr;
	975	if ((useTS &&
	976	(tp->t_flags & TF_FIRSTACCACK) &&
	977	(to.to_tsecr < tp->t_rexmtTS)) \|\|
	978	(!useTS &&
	979	(tp->t_rxtshift == 1 &&
	980	ticks < tp->t_badrxtwin))) {
	981	tp->snd_cwnd = tp->snd_cwnd_prev;
	982	tp->snd_ssthresh =
	983	tp->snd_ssthresh_prev;
	984	tp->snd_recover = tp->snd_recover_prev;
	985	if (tp->t_flags & TF_WASFRECOVERY)
	986	ENTER_FASTRECOVERY(tp);
	987	tp->snd_nxt = tp->snd_max;
	988	tp->t_badrxtwin = 0;
	989	tp->t_rxtshift = 0;
	990	if (tp->t_flags & TF_FASTREXMT)
	991	++tcpstat.tcps_sndfastrexmitbad;
	992	else
	993	++tcpstat.tcps_sndrtobad;
	994	}
	995	tp->t_flags &= ~(TF_FIRSTACCACK \| TF_FASTREXMT);
	996	/*
	997	* Recalculate the retransmit timer / rtt.
	998	*
	999	* Some machines (certain windows boxes)
	1000	* send broken timestamp replies during the
	1001	* SYN+ACK phase, ignore timestamps of 0.
	1002	*/
	1003	if ((to.to_flags & TOF_TS) != 0 &&
	1004	to.to_tsecr) {
	1005	tcp_xmit_timer(tp,
	1006	ticks - to.to_tsecr + 1);
	1007	} else if (tp->t_rtttime &&
	1008	SEQ_GT(th->th_ack, tp->t_rtseq)) {
	1009	tcp_xmit_timer(tp,
	1010	ticks - tp->t_rtttime);
	1011	}
	1012	tcp_xmit_bandwidth_limit(tp, th->th_ack);
	1013	acked = th->th_ack - tp->snd_una;
	1014	tcpstat.tcps_rcvackpack++;
	1015	tcpstat.tcps_rcvackbyte += acked;
	1016	sbdrop(&so->so_snd, acked);
	1017	if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
	1018	SEQ_LEQ(th->th_ack, tp->snd_recover))
	1019	tp->snd_recover = th->th_ack - 1;
	1020	tp->snd_una = th->th_ack;
	1021	tp->t_dupacks = 0;
	1022	m_freem(m);
	1023	ND6_HINT(tp); /* some progress has been done */
	1024
	1025	/*
	1026	* If all outstanding data are acked, stop
	1027	* retransmit timer, otherwise restart timer
	1028	* using current (possibly backed-off) value.
	1029	* If process is waiting for space,
	1030	* wakeup/selwakeup/signal. If data
	1031	* are ready to send, let tcp_output
	1032	* decide between more output or persist.
	1033	*/
	1034	if (tp->snd_una == tp->snd_max)
	1035	callout_stop(tp->tt_rexmt);
	1036	else if (!callout_active(tp->tt_persist))
	1037	callout_reset(tp->tt_rexmt,
	1038	tp->t_rxtcur,
	1039	tcp_timer_rexmt, tp);
	1040
	1041	sowwakeup(so);
	1042	if (so->so_snd.sb_cc)
	1043	(void) tcp_output(tp);
	1044	return;
	1045	}
	1046	} else if (th->th_ack == tp->snd_una &&
	1047	LIST_EMPTY(&tp->t_segq) &&
	1048	tlen <= sbspace(&so->so_rcv)) {
	1049	/*
	1050	* this is a pure, in-sequence data packet
	1051	* with nothing on the reassembly queue and
	1052	* we have enough buffer space to take it.
	1053	*/
	1054	++tcpstat.tcps_preddat;
	1055	tp->rcv_nxt += tlen;
	1056	tcpstat.tcps_rcvpack++;
	1057	tcpstat.tcps_rcvbyte += tlen;
	1058	ND6_HINT(tp); /* some progress has been done */
	1059	/*
	1060	* Add data to socket buffer.
	1061	*/
	1062	if (so->so_state & SS_CANTRCVMORE) {
	1063	m_freem(m);
	1064	} else {
	1065	m_adj(m, drop_hdrlen); /* delayed header drop */
	1066	sbappend(&so->so_rcv, m);
	1067	}
	1068	sorwakeup(so);
	1069	if (DELAY_ACK(tp)) {
	1070	callout_reset(tp->tt_delack, tcp_delacktime,
	1071	tcp_timer_delack, tp);
	1072	} else {
	1073	tp->t_flags \|= TF_ACKNOW;
	1074	tcp_output(tp);
	1075	}
	1076	return;
	1077	}
	1078	}
	1079
	1080	/*
	1081	* Calculate amount of space in receive window,
	1082	* and then do TCP input processing.
	1083	* Receive window is amount of space in rcv queue,
	1084	* but not less than advertised window.
	1085	*/
	1086	{ int win;
	1087
	1088	win = sbspace(&so->so_rcv);
	1089	if (win < 0)
	1090	win = 0;
	1091	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	1092	}
	1093
	1094	switch (tp->t_state) {
	1095
	1096	/*
	1097	* If the state is SYN_RECEIVED:
	1098	* if seg contains an ACK, but not for our SYN/ACK, send a RST.
	1099	*/
	1100	case TCPS_SYN_RECEIVED:
	1101	if ((thflags & TH_ACK) &&
	1102	(SEQ_LEQ(th->th_ack, tp->snd_una) \|\|
	1103	SEQ_GT(th->th_ack, tp->snd_max))) {
	1104	rstreason = BANDLIM_RST_OPENPORT;
	1105	goto dropwithreset;
	1106	}
	1107	break;
	1108
	1109	/*
	1110	* If the state is SYN_SENT:
	1111	* if seg contains an ACK, but not for our SYN, drop the input.
	1112	* if seg contains a RST, then drop the connection.
	1113	* if seg does not contain SYN, then drop it.
	1114	* Otherwise this is an acceptable SYN segment
	1115	* initialize tp->rcv_nxt and tp->irs
	1116	* if seg contains ack then advance tp->snd_una
	1117	* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	1118	* arrange for segment to be acked (eventually)
	1119	* continue processing rest of data/controls, beginning with URG
	1120	*/
	1121	case TCPS_SYN_SENT:
	1122	if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
	1123	taop = &tao_noncached;
	1124	bzero(taop, sizeof(*taop));
	1125	}
	1126
	1127	if ((thflags & TH_ACK) &&
	1128	(SEQ_LEQ(th->th_ack, tp->iss) \|\|
	1129	SEQ_GT(th->th_ack, tp->snd_max))) {
	1130	/*
	1131	* If we have a cached CCsent for the remote host,
	1132	* hence we haven't just crashed and restarted,
	1133	* do not send a RST. This may be a retransmission
	1134	* from the other side after our earlier ACK was lost.
	1135	* Our new SYN, when it arrives, will serve as the
	1136	* needed ACK.
	1137	*/
	1138	if (taop->tao_ccsent != 0)
	1139	goto drop;
	1140	else {
	1141	rstreason = BANDLIM_UNLIMITED;
	1142	goto dropwithreset;
	1143	}
	1144	}
	1145	if (thflags & TH_RST) {
	1146	if (thflags & TH_ACK)
	1147	tp = tcp_drop(tp, ECONNREFUSED);
	1148	goto drop;
	1149	}
	1150	if ((thflags & TH_SYN) == 0)
	1151	goto drop;
	1152	tp->snd_wnd = th->th_win; /* initial send window */
	1153	tp->cc_recv = to.to_cc; /* foreign CC */
	1154
	1155	tp->irs = th->th_seq;
	1156	tcp_rcvseqinit(tp);
	1157	if (thflags & TH_ACK) {
	1158	/*
	1159	* Our SYN was acked. If segment contains CC.ECHO
	1160	* option, check it to make sure this segment really
	1161	* matches our SYN. If not, just drop it as old
	1162	* duplicate, but send an RST if we're still playing
	1163	* by the old rules. If no CC.ECHO option, make sure
	1164	* we don't get fooled into using T/TCP.
	1165	*/
	1166	if (to.to_flags & TOF_CCECHO) {
	1167	if (tp->cc_send != to.to_ccecho) {
	1168	if (taop->tao_ccsent != 0)
	1169	goto drop;
	1170	else {
	1171	rstreason = BANDLIM_UNLIMITED;
	1172	goto dropwithreset;
	1173	}
	1174	}
	1175	} else
	1176	tp->t_flags &= ~TF_RCVD_CC;
	1177	tcpstat.tcps_connects++;
	1178	soisconnected(so);
	1179	/* Do window scaling on this connection? */
	1180	if ((tp->t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
	1181	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
	1182	tp->snd_scale = tp->requested_s_scale;
	1183	tp->rcv_scale = tp->request_r_scale;
	1184	}
	1185	/* Segment is acceptable, update cache if undefined. */
	1186	if (taop->tao_ccsent == 0)
	1187	taop->tao_ccsent = to.to_ccecho;
	1188
	1189	tp->rcv_adv += tp->rcv_wnd;
	1190	tp->snd_una++; /* SYN is acked */
	1191	/*
	1192	* If there's data, delay ACK; if there's also a FIN
	1193	* ACKNOW will be turned on later.
	1194	*/
	1195	if (DELAY_ACK(tp) && tlen != 0)
	1196	callout_reset(tp->tt_delack, tcp_delacktime,
	1197	tcp_timer_delack, tp);
	1198	else
	1199	tp->t_flags \|= TF_ACKNOW;
	1200	/*
	1201	* Received <SYN,ACK> in SYN_SENT[*] state.
	1202	* Transitions:
	1203	* SYN_SENT --> ESTABLISHED
	1204	* SYN_SENT* --> FIN_WAIT_1
	1205	*/
	1206	tp->t_starttime = ticks;
	1207	if (tp->t_flags & TF_NEEDFIN) {
	1208	tp->t_state = TCPS_FIN_WAIT_1;
	1209	tp->t_flags &= ~TF_NEEDFIN;
	1210	thflags &= ~TH_SYN;
	1211	} else {
	1212	tp->t_state = TCPS_ESTABLISHED;
	1213	callout_reset(tp->tt_keep, tcp_keepidle,
	1214	tcp_timer_keep, tp);
	1215	}
	1216	} else {
	1217	/*
	1218	* Received initial SYN in SYN-SENT[*] state =>
	1219	* simultaneous open. If segment contains CC option
	1220	* and there is a cached CC, apply TAO test.
	1221	* If it succeeds, connection is half-synchronized.
	1222	* Otherwise, do 3-way handshake:
	1223	* SYN-SENT -> SYN-RECEIVED
	1224	* SYN-SENT* -> SYN-RECEIVED*
	1225	* If there was no CC option, clear cached CC value.
	1226	*/
	1227	tp->t_flags \|= TF_ACKNOW;
	1228	callout_stop(tp->tt_rexmt);
	1229	if (to.to_flags & TOF_CC) {
	1230	if (taop->tao_cc != 0 &&
	1231	CC_GT(to.to_cc, taop->tao_cc)) {
	1232	/*
	1233	* update cache and make transition:
	1234	* SYN-SENT -> ESTABLISHED*
	1235	* SYN-SENT* -> FIN-WAIT-1*
	1236	*/
	1237	taop->tao_cc = to.to_cc;
	1238	tp->t_starttime = ticks;
	1239	if (tp->t_flags & TF_NEEDFIN) {
	1240	tp->t_state = TCPS_FIN_WAIT_1;
	1241	tp->t_flags &= ~TF_NEEDFIN;
	1242	} else {
	1243	tp->t_state = TCPS_ESTABLISHED;
	1244	callout_reset(tp->tt_keep,
	1245	tcp_keepidle,
	1246	tcp_timer_keep,
	1247	tp);
	1248	}
	1249	tp->t_flags \|= TF_NEEDSYN;
	1250	} else
	1251	tp->t_state = TCPS_SYN_RECEIVED;
	1252	} else {
	1253	/* CC.NEW or no option => invalidate cache */
	1254	taop->tao_cc = 0;
	1255	tp->t_state = TCPS_SYN_RECEIVED;
	1256	}
	1257	}
	1258
	1259	trimthenstep6:
	1260	/*
	1261	* Advance th->th_seq to correspond to first data byte.
	1262	* If data, trim to stay within window,
	1263	* dropping FIN if necessary.
	1264	*/
	1265	th->th_seq++;
	1266	if (tlen > tp->rcv_wnd) {
	1267	todrop = tlen - tp->rcv_wnd;
	1268	m_adj(m, -todrop);
	1269	tlen = tp->rcv_wnd;
	1270	thflags &= ~TH_FIN;
	1271	tcpstat.tcps_rcvpackafterwin++;
	1272	tcpstat.tcps_rcvbyteafterwin += todrop;
	1273	}
	1274	tp->snd_wl1 = th->th_seq - 1;
	1275	tp->rcv_up = th->th_seq;
	1276	/*
	1277	* Client side of transaction: already sent SYN and data.
	1278	* If the remote host used T/TCP to validate the SYN,
	1279	* our data will be ACK'd; if so, enter normal data segment
	1280	* processing in the middle of step 5, ack processing.
	1281	* Otherwise, goto step 6.
	1282	*/
	1283	if (thflags & TH_ACK)
	1284	goto process_ACK;
	1285
	1286	goto step6;
	1287
	1288	/*
	1289	* If the state is LAST_ACK or CLOSING or TIME_WAIT:
	1290	* if segment contains a SYN and CC [not CC.NEW] option:
	1291	* if state == TIME_WAIT and connection duration > MSL,
	1292	* drop packet and send RST;
	1293	*
	1294	* if SEG.CC > CCrecv then is new SYN, and can implicitly
	1295	* ack the FIN (and data) in retransmission queue.
	1296	* Complete close and delete TCPCB. Then reprocess
	1297	* segment, hoping to find new TCPCB in LISTEN state;
	1298	*
	1299	* else must be old SYN; drop it.
	1300	* else do normal processing.
	1301	*/
	1302	case TCPS_LAST_ACK:
	1303	case TCPS_CLOSING:
	1304	case TCPS_TIME_WAIT:
	1305	if ((thflags & TH_SYN) &&
	1306	(to.to_flags & TOF_CC) && tp->cc_recv != 0) {
	1307	if (tp->t_state == TCPS_TIME_WAIT &&
	1308	(ticks - tp->t_starttime) > tcp_msl) {
	1309	rstreason = BANDLIM_UNLIMITED;
	1310	goto dropwithreset;
	1311	}
	1312	if (CC_GT(to.to_cc, tp->cc_recv)) {
	1313	tp = tcp_close(tp);
	1314	goto findpcb;
	1315	}
	1316	else
	1317	goto drop;
	1318	}
	1319	break; /* continue normal processing */
	1320	}
	1321
	1322	/*
	1323	* States other than LISTEN or SYN_SENT.
	1324	* First check the RST flag and sequence number since reset segments
	1325	* are exempt from the timestamp and connection count tests. This
	1326	* fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	1327	* below which allowed reset segments in half the sequence space
	1328	* to fall through and be processed (which gives forged reset
	1329	* segments with a random sequence number a 50 percent chance of
	1330	* killing a connection).
	1331	* Then check timestamp, if present.
	1332	* Then check the connection count, if present.
	1333	* Then check that at least some bytes of segment are within
	1334	* receive window. If segment begins before rcv_nxt,
	1335	* drop leading data (and SYN); if nothing left, just ack.
	1336	*
	1337	*
	1338	* If the RST bit is set, check the sequence number to see
	1339	* if this is a valid reset segment.
	1340	* RFC 793 page 37:
	1341	* In all states except SYN-SENT, all reset (RST) segments
	1342	* are validated by checking their SEQ-fields. A reset is
	1343	* valid if its sequence number is in the window.
	1344	* Note: this does not take into account delayed ACKs, so
	1345	* we should test against last_ack_sent instead of rcv_nxt.
	1346	* The sequence number in the reset segment is normally an
	1347	* echo of our outgoing acknowledgement numbers, but some hosts
	1348	* send a reset with the sequence number at the rightmost edge
	1349	* of our receive window, and we have to handle this case.
	1350	* If we have multiple segments in flight, the initial reset
	1351	* segment sequence numbers will be to the left of last_ack_sent,
	1352	* but they will eventually catch up.
	1353	* In any case, it never made sense to trim reset segments to
	1354	* fit the receive window since RFC 1122 says:
	1355	* 4.2.2.12 RST Segment: RFC-793 Section 3.4
	1356	*
	1357	* A TCP SHOULD allow a received RST segment to include data.
	1358	*
	1359	* DISCUSSION
	1360	* It has been suggested that a RST segment could contain
	1361	* ASCII text that encoded and explained the cause of the
	1362	* RST. No standard has yet been established for such
	1363	* data.
	1364	*
	1365	* If the reset segment passes the sequence number test examine
	1366	* the state:
	1367	* SYN_RECEIVED STATE:
	1368	* If passive open, return to LISTEN state.
	1369	* If active open, inform user that connection was refused.
	1370	* ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	1371	* Inform user that connection was reset, and close tcb.
	1372	* CLOSING, LAST_ACK STATES:
	1373	* Close the tcb.
	1374	* TIME_WAIT STATE:
	1375	* Drop the segment - see Stevens, vol. 2, p. 964 and
	1376	* RFC 1337.
	1377	*/
	1378	if (thflags & TH_RST) {
	1379	if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
	1380	SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
	1381	switch (tp->t_state) {
	1382
	1383	case TCPS_SYN_RECEIVED:
	1384	so->so_error = ECONNREFUSED;
	1385	goto close;
	1386
	1387	case TCPS_ESTABLISHED:
	1388	case TCPS_FIN_WAIT_1:
	1389	case TCPS_FIN_WAIT_2:
	1390	case TCPS_CLOSE_WAIT:
	1391	so->so_error = ECONNRESET;
	1392	close:
	1393	tp->t_state = TCPS_CLOSED;
	1394	tcpstat.tcps_drops++;
	1395	tp = tcp_close(tp);
	1396	break;
	1397
	1398	case TCPS_CLOSING:
	1399	case TCPS_LAST_ACK:
	1400	tp = tcp_close(tp);
	1401	break;
	1402
	1403	case TCPS_TIME_WAIT:
	1404	break;
	1405	}
	1406	}
	1407	goto drop;
	1408	}
	1409
	1410	/*
	1411	* RFC 1323 PAWS: If we have a timestamp reply on this segment
	1412	* and it's less than ts_recent, drop it.
	1413	*/
	1414	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
	1415	TSTMP_LT(to.to_tsval, tp->ts_recent)) {
	1416
	1417	/* Check to see if ts_recent is over 24 days old. */
	1418	if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
	1419	/*
	1420	* Invalidate ts_recent. If this segment updates
	1421	* ts_recent, the age will be reset later and ts_recent
	1422	* will get a valid value. If it does not, setting
	1423	* ts_recent to zero will at least satisfy the
	1424	* requirement that zero be placed in the timestamp
	1425	* echo reply when ts_recent isn't valid. The
	1426	* age isn't reset until we get a valid ts_recent
	1427	* because we don't want out-of-order segments to be
	1428	* dropped when ts_recent is old.
	1429	*/
	1430	tp->ts_recent = 0;
	1431	} else {
	1432	tcpstat.tcps_rcvduppack++;
	1433	tcpstat.tcps_rcvdupbyte += tlen;
	1434	tcpstat.tcps_pawsdrop++;
	1435	if (tlen)
	1436	goto dropafterack;
	1437	goto drop;
	1438	}
	1439	}
	1440
	1441	/*
	1442	* T/TCP mechanism
	1443	* If T/TCP was negotiated and the segment doesn't have CC,
	1444	* or if its CC is wrong then drop the segment.
	1445	* RST segments do not have to comply with this.
	1446	*/
	1447	if ((tp->t_flags & (TF_REQ_CC\|TF_RCVD_CC)) == (TF_REQ_CC\|TF_RCVD_CC) &&
	1448	((to.to_flags & TOF_CC) == 0 \|\| tp->cc_recv != to.to_cc))
	1449	goto dropafterack;
	1450
	1451	/*
	1452	* In the SYN-RECEIVED state, validate that the packet belongs to
	1453	* this connection before trimming the data to fit the receive
	1454	* window. Check the sequence number versus IRS since we know
	1455	* the sequence numbers haven't wrapped. This is a partial fix
	1456	* for the "LAND" DoS attack.
	1457	*/
	1458	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
	1459	rstreason = BANDLIM_RST_OPENPORT;
	1460	goto dropwithreset;
	1461	}
	1462
	1463	todrop = tp->rcv_nxt - th->th_seq;
	1464	if (todrop > 0) {
	1465	if (thflags & TH_SYN) {
	1466	thflags &= ~TH_SYN;
	1467	th->th_seq++;
	1468	if (th->th_urp > 1)
	1469	th->th_urp--;
	1470	else
	1471	thflags &= ~TH_URG;
	1472	todrop--;
	1473	}
	1474	/*
	1475	* Following if statement from Stevens, vol. 2, p. 960.
	1476	*/
	1477	if (todrop > tlen
	1478	\|\| (todrop == tlen && (thflags & TH_FIN) == 0)) {
	1479	/*
	1480	* Any valid FIN must be to the left of the window.
	1481	* At this point the FIN must be a duplicate or out
	1482	* of sequence; drop it.
	1483	*/
	1484	thflags &= ~TH_FIN;
	1485
	1486	/*
	1487	* Send an ACK to resynchronize and drop any data.
	1488	* But keep on processing for RST or ACK.
	1489	*/
	1490	tp->t_flags \|= TF_ACKNOW;
	1491	todrop = tlen;
	1492	tcpstat.tcps_rcvduppack++;
	1493	tcpstat.tcps_rcvdupbyte += todrop;
	1494	} else {
	1495	tcpstat.tcps_rcvpartduppack++;
	1496	tcpstat.tcps_rcvpartdupbyte += todrop;
	1497	}
	1498	drop_hdrlen += todrop; /* drop from the top afterwards */
	1499	th->th_seq += todrop;
	1500	tlen -= todrop;
	1501	if (th->th_urp > todrop)
	1502	th->th_urp -= todrop;
	1503	else {
	1504	thflags &= ~TH_URG;
	1505	th->th_urp = 0;
	1506	}
	1507	}
	1508
	1509	/*
	1510	* If new data are received on a connection after the
	1511	* user processes are gone, then RST the other end.
	1512	*/
	1513	if ((so->so_state & SS_NOFDREF) &&
	1514	tp->t_state > TCPS_CLOSE_WAIT && tlen) {
	1515	tp = tcp_close(tp);
	1516	tcpstat.tcps_rcvafterclose++;
	1517	rstreason = BANDLIM_UNLIMITED;
	1518	goto dropwithreset;
	1519	}
	1520
	1521	/*
	1522	* If segment ends after window, drop trailing data
	1523	* (and PUSH and FIN); if nothing left, just ACK.
	1524	*/
	1525	todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
	1526	if (todrop > 0) {
	1527	tcpstat.tcps_rcvpackafterwin++;
	1528	if (todrop >= tlen) {
	1529	tcpstat.tcps_rcvbyteafterwin += tlen;
	1530	/*
	1531	* If a new connection request is received
	1532	* while in TIME_WAIT, drop the old connection
	1533	* and start over if the sequence numbers
	1534	* are above the previous ones.
	1535	*/
	1536	if (thflags & TH_SYN &&
	1537	tp->t_state == TCPS_TIME_WAIT &&
	1538	SEQ_GT(th->th_seq, tp->rcv_nxt)) {
	1539	tp = tcp_close(tp);
	1540	goto findpcb;
	1541	}
	1542	/*
	1543	* If window is closed can only take segments at
	1544	* window edge, and have to drop data and PUSH from
	1545	* incoming segments. Continue processing, but
	1546	* remember to ack. Otherwise, drop segment
	1547	* and ack.
	1548	*/
	1549	if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
	1550	tp->t_flags \|= TF_ACKNOW;
	1551	tcpstat.tcps_rcvwinprobe++;
	1552	} else
	1553	goto dropafterack;
	1554	} else
	1555	tcpstat.tcps_rcvbyteafterwin += todrop;
	1556	m_adj(m, -todrop);
	1557	tlen -= todrop;
	1558	thflags &= ~(TH_PUSH\|TH_FIN);
	1559	}
	1560
	1561	/*
	1562	* If last ACK falls within this segment's sequence numbers,
	1563	* record its timestamp.
	1564	* NOTE that the test is modified according to the latest
	1565	* proposal of the tcplw@cray.com list (Braden 1993/04/26).
	1566	*/
	1567	if ((to.to_flags & TOF_TS) != 0 &&
	1568	SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
	1569	tp->ts_recent_age = ticks;
	1570	tp->ts_recent = to.to_tsval;
	1571	}
	1572
	1573	/*
	1574	* If a SYN is in the window, then this is an
	1575	* error and we send an RST and drop the connection.
	1576	*/
	1577	if (thflags & TH_SYN) {
	1578	tp = tcp_drop(tp, ECONNRESET);
	1579	rstreason = BANDLIM_UNLIMITED;
	1580	goto dropwithreset;
	1581	}
	1582
	1583	/*
	1584	* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
	1585	* flag is on (half-synchronized state), then queue data for
	1586	* later processing; else drop segment and return.
	1587	*/
	1588	if ((thflags & TH_ACK) == 0) {
	1589	if (tp->t_state == TCPS_SYN_RECEIVED \|\|
	1590	(tp->t_flags & TF_NEEDSYN))
	1591	goto step6;
	1592	else
	1593	goto drop;
	1594	}
	1595
	1596	/*
	1597	* Ack processing.
	1598	*/
	1599	switch (tp->t_state) {
	1600
	1601	/*
	1602	* In SYN_RECEIVED state, the ack ACKs our SYN, so enter
	1603	* ESTABLISHED state and continue processing.
	1604	* The ACK was checked above.
	1605	*/
	1606	case TCPS_SYN_RECEIVED:
	1607
	1608	tcpstat.tcps_connects++;
	1609	soisconnected(so);
	1610	/* Do window scaling? */
	1611	if ((tp->t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
	1612	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
	1613	tp->snd_scale = tp->requested_s_scale;
	1614	tp->rcv_scale = tp->request_r_scale;
	1615	}
	1616	/*
	1617	* Upon successful completion of 3-way handshake,
	1618	* update cache.CC if it was undefined, pass any queued
	1619	* data to the user, and advance state appropriately.
	1620	*/
	1621	if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
	1622	taop->tao_cc == 0)
	1623	taop->tao_cc = tp->cc_recv;
	1624
	1625	/*
	1626	* Make transitions:
	1627	* SYN-RECEIVED -> ESTABLISHED
	1628	* SYN-RECEIVED* -> FIN-WAIT-1
	1629	*/
	1630	tp->t_starttime = ticks;
	1631	if (tp->t_flags & TF_NEEDFIN) {
	1632	tp->t_state = TCPS_FIN_WAIT_1;
	1633	tp->t_flags &= ~TF_NEEDFIN;
	1634	} else {
	1635	tp->t_state = TCPS_ESTABLISHED;
	1636	callout_reset(tp->tt_keep, tcp_keepidle,
	1637	tcp_timer_keep, tp);
	1638	}
	1639	/*
	1640	* If segment contains data or FIN, will call tcp_reass()
	1641	* later; if not, do so now to pass queued data to user.
	1642	*/
	1643	if (tlen == 0 && (thflags & TH_FIN) == 0)
	1644	(void) tcp_reass(tp, (struct tcphdr *)0, 0,
	1645	(struct mbuf *)0);
	1646	tp->snd_wl1 = th->th_seq - 1;
	1647	/* fall into ... */
	1648
	1649	/*
	1650	* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	1651	* ACKs. If the ack is in the range
	1652	* tp->snd_una < th->th_ack <= tp->snd_max
	1653	* then advance tp->snd_una to th->th_ack and drop
	1654	* data from the retransmission queue. If this ACK reflects
	1655	* more up to date window information we update our window information.
	1656	*/
	1657	case TCPS_ESTABLISHED:
	1658	case TCPS_FIN_WAIT_1:
	1659	case TCPS_FIN_WAIT_2:
	1660	case TCPS_CLOSE_WAIT:
	1661	case TCPS_CLOSING:
	1662	case TCPS_LAST_ACK:
	1663	case TCPS_TIME_WAIT:
	1664
	1665	if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
	1666	if (tlen == 0 && tiwin == tp->snd_wnd) {
	1667	tcpstat.tcps_rcvdupack++;
	1668	/*
	1669	* If we have outstanding data (other than
	1670	* a window probe), this is a completely
	1671	* duplicate ack (ie, window info didn't
	1672	* change), the ack is the biggest we've
	1673	* seen and we've seen exactly our rexmt
	1674	* threshold of them, assume a packet
	1675	* has been dropped and retransmit it.
	1676	* Kludge snd_nxt & the congestion
	1677	* window so we send only this one
	1678	* packet.
	1679	*
	1680	* We know we're losing at the current
	1681	* window size so do congestion avoidance
	1682	* (set ssthresh to half the current window
	1683	* and pull our congestion window back to
	1684	* the new ssthresh).
	1685	*
	1686	* Dup acks mean that packets have left the
	1687	* network (they're now cached at the receiver)
	1688	* so bump cwnd by the amount in the receiver
	1689	* to keep a constant cwnd packets in the
	1690	* network.
	1691	*/
	1692	if (!callout_active(tp->tt_rexmt) \|\|
	1693	th->th_ack != tp->snd_una)
	1694	tp->t_dupacks = 0;
	1695	else if (++tp->t_dupacks > tcprexmtthresh \|\|
	1696	(tcp_do_newreno &&
	1697	IN_FASTRECOVERY(tp))) {
	1698	tp->snd_cwnd += tp->t_maxseg;
	1699	(void) tcp_output(tp);
	1700	goto drop;
	1701	} else if (tp->t_dupacks == tcprexmtthresh) {
	1702	tcp_seq onxt = tp->snd_nxt;
	1703	u_int win;
	1704	if (tcp_do_newreno &&
	1705	SEQ_LEQ(th->th_ack,
	1706	tp->snd_recover)) {
	1707	tp->t_dupacks = 0;
	1708	break;
	1709	}
	1710	if (tcp_do_eifel_detect &&
	1711	(tp->t_flags & TF_RCVD_TSTMP)) {
	1712	tcp_save_congestion_state(tp);
	1713	tp->t_flags \|= TF_FASTREXMT;
	1714	}
	1715	win = min(tp->snd_wnd, tp->snd_cwnd) /
	1716	2 / tp->t_maxseg;
	1717	if (win < 2)
	1718	win = 2;
	1719	tp->snd_ssthresh = win * tp->t_maxseg;
	1720	ENTER_FASTRECOVERY(tp);
	1721	tp->snd_recover = tp->snd_max;
	1722	callout_stop(tp->tt_rexmt);
	1723	tp->t_rtttime = 0;
	1724	tp->snd_nxt = th->th_ack;
	1725	tp->snd_cwnd = tp->t_maxseg;
	1726	(void) tcp_output(tp);
	1727	KASSERT(tp->snd_limited <= 2,
	1728	("tp->snd_limited too big"));
	1729	tp->snd_cwnd = tp->snd_ssthresh +
	1730	(tp->t_maxseg *
	1731	(tp->t_dupacks - tp->snd_limited));
	1732	if (SEQ_GT(onxt, tp->snd_nxt))
	1733	tp->snd_nxt = onxt;
	1734	goto drop;
	1735	} else if (tcp_do_limitedtransmit) {
	1736	u_long oldcwnd = tp->snd_cwnd;
	1737	tcp_seq oldsndmax = tp->snd_max;
	1738	u_int sent;
	1739	KASSERT(tp->t_dupacks == 1 \|\|
	1740	tp->t_dupacks == 2,
	1741	("dupacks not 1 or 2"));
	1742	if (tp->t_dupacks == 1) {
	1743	tp->snd_limited = 0;
	1744	tp->snd_cwnd += tp->t_maxseg;
	1745	} else {
	1746	tp->snd_cwnd +=
	1747	tp->t_maxseg * 2;
	1748	}
	1749	(void) tcp_output(tp);
	1750	sent = tp->snd_max - oldsndmax;
	1751	if (sent > tp->t_maxseg) {
	1752	KASSERT(tp->snd_limited == 0 &&
	1753	tp->t_dupacks == 2,
	1754	("sent too much"));
	1755	tp->snd_limited = 2;
	1756	} else if (sent > 0)
	1757	++tp->snd_limited;
	1758	tp->snd_cwnd = oldcwnd;
	1759	goto drop;
	1760	}
	1761	} else
	1762	tp->t_dupacks = 0;
	1763	break;
	1764	}
	1765
	1766	KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una"));
	1767
	1768	/*
	1769	* If the congestion window was inflated to account
	1770	* for the other side's cached packets, retract it.
	1771	*/
	1772	if (tcp_do_newreno) {
	1773	if (IN_FASTRECOVERY(tp)) {
	1774	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
	1775	tcp_newreno_partial_ack(tp, th);
	1776	} else {
	1777	/*
	1778	* Window inflation should have left us
	1779	* with approximately snd_ssthresh
	1780	* outstanding data.
	1781	* But in case we would be inclined to
	1782	* send a burst, better to do it via
	1783	* the slow start mechanism.
	1784	*/
	1785	if (SEQ_GT(th->th_ack +
	1786	tp->snd_ssthresh,
	1787	tp->snd_max))
	1788	tp->snd_cwnd = tp->snd_max -
	1789	th->th_ack +
	1790	tp->t_maxseg;
	1791	else
	1792	tp->snd_cwnd = tp->snd_ssthresh;
	1793	}
	1794	}
	1795	} else {
	1796	if (tp->t_dupacks >= tcprexmtthresh &&
	1797	tp->snd_cwnd > tp->snd_ssthresh)
	1798	tp->snd_cwnd = tp->snd_ssthresh;
	1799	}
	1800	tp->t_dupacks = 0;
	1801	if (SEQ_GT(th->th_ack, tp->snd_max)) {
	1802	tcpstat.tcps_rcvacktoomuch++;
	1803	goto dropafterack;
	1804	}
	1805	/*
	1806	* If we reach this point, ACK is not a duplicate,
	1807	* i.e., it ACKs something we sent.
	1808	*/
	1809	if (tp->t_flags & TF_NEEDSYN) {
	1810	/*
	1811	* T/TCP: Connection was half-synchronized, and our
	1812	* SYN has been ACK'd (so connection is now fully
	1813	* synchronized). Go to non-starred state,
	1814	* increment snd_una for ACK of SYN, and check if
	1815	* we can do window scaling.
	1816	*/
	1817	tp->t_flags &= ~TF_NEEDSYN;
	1818	tp->snd_una++;
	1819	/* Do window scaling? */
	1820	if ((tp->t_flags & (TF_RCVD_SCALE\|TF_REQ_SCALE)) ==
	1821	(TF_RCVD_SCALE\|TF_REQ_SCALE)) {
	1822	tp->snd_scale = tp->requested_s_scale;
	1823	tp->rcv_scale = tp->request_r_scale;
	1824	}
	1825	}
	1826
	1827	process_ACK:
	1828	acked = th->th_ack - tp->snd_una;
	1829	tcpstat.tcps_rcvackpack++;
	1830	tcpstat.tcps_rcvackbyte += acked;
	1831
	1832	/*
	1833	* If we just performed our first retransmit, and the ACK
	1834	* arrives within our recovery window, then it was a mistake
	1835	* to do the retransmit in the first place. Recover our
	1836	* original cwnd and ssthresh, and proceed to transmit where
	1837	* we left off.
	1838	*/
	1839	useTS = tcp_do_eifel_detect && (to.to_flags & TOF_TS) &&
	1840	to.to_tsecr;
	1841	if ((useTS && (tp->t_flags & TF_FIRSTACCACK) && acked &&
	1842	(to.to_tsecr < tp->t_rexmtTS)) \|\|
	1843	(!useTS &&
	1844	(tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin))) {
	1845	tp->snd_cwnd = tp->snd_cwnd_prev;
	1846	tp->snd_ssthresh = tp->snd_ssthresh_prev;
	1847	tp->snd_recover = tp->snd_recover_prev;
	1848	if (tp->t_flags & TF_WASFRECOVERY)
	1849	ENTER_FASTRECOVERY(tp);
	1850	tp->snd_nxt = tp->snd_max;
	1851	tp->t_badrxtwin = 0; /* XXX probably not required */
	1852	tp->t_rxtshift = 0;
	1853	if (tp->t_flags & TF_FASTREXMT)
	1854	++tcpstat.tcps_sndfastrexmitbad;
	1855	else
	1856	++tcpstat.tcps_sndrtobad;
	1857	}
	1858
	1859	/*
	1860	* If we have a timestamp reply, update smoothed
	1861	* round trip time. If no timestamp is present but
	1862	* transmit timer is running and timed sequence
	1863	* number was acked, update smoothed round trip time.
	1864	* Since we now have an rtt measurement, cancel the
	1865	* timer backoff (cf., Phil Karn's retransmit alg.).
	1866	* Recompute the initial retransmit timer.
	1867	*
	1868	* Some machines (certain windows boxes) send broken
	1869	* timestamp replies during the SYN+ACK phase, ignore
	1870	* timestamps of 0.
	1871	*/
	1872	if ((to.to_flags & TOF_TS) != 0 &&
	1873	to.to_tsecr) {
	1874	tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
	1875	} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
	1876	tcp_xmit_timer(tp, ticks - tp->t_rtttime);
	1877	}
	1878	tcp_xmit_bandwidth_limit(tp, th->th_ack);
	1879
	1880	/*
	1881	* If all outstanding data is acked, stop retransmit
	1882	* timer and remember to restart (more output or persist).
	1883	* If there is more data to be acked, restart retransmit
	1884	* timer, using current (possibly backed-off) value.
	1885	*/
	1886	if (th->th_ack == tp->snd_max) {
	1887	callout_stop(tp->tt_rexmt);
	1888	needoutput = 1;
	1889	} else if (!callout_active(tp->tt_persist))
	1890	callout_reset(tp->tt_rexmt, tp->t_rxtcur,
	1891	tcp_timer_rexmt, tp);
	1892
	1893	/*
	1894	* If no data (only SYN) was ACK'd,
	1895	* skip rest of ACK processing.
	1896	*/
	1897	if (acked == 0)
	1898	goto step6;
	1899
	1900	/* Stop looking for an acceptable ACK since one was received. */
	1901	tp->t_flags &= ~(TF_FIRSTACCACK \| TF_FASTREXMT);
	1902
	1903	/*
	1904	* When new data is acked, open the congestion window.
	1905	* If the window gives us less than ssthresh packets
	1906	* in flight, open exponentially (maxseg per packet).
	1907	* Otherwise open linearly: maxseg per window
	1908	* (maxseg^2 / cwnd per packet).
	1909	*/
	1910	if (!tcp_do_newreno \|\| !IN_FASTRECOVERY(tp)) {
	1911	u_int cw = tp->snd_cwnd;
	1912	u_int incr = tp->t_maxseg;
	1913	if (cw > tp->snd_ssthresh)
	1914	incr = incr * incr / cw;
	1915	tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
	1916	}
	1917	if (acked > so->so_snd.sb_cc) {
	1918	tp->snd_wnd -= so->so_snd.sb_cc;
	1919	sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
	1920	ourfinisacked = 1;
	1921	} else {
	1922	sbdrop(&so->so_snd, acked);
	1923	tp->snd_wnd -= acked;
	1924	ourfinisacked = 0;
	1925	}
	1926	sowwakeup(so);
	1927	/* detect una wraparound */
	1928	if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
	1929	SEQ_GT(tp->snd_una, tp->snd_recover) &&
	1930	SEQ_LEQ(th->th_ack, tp->snd_recover))
	1931	tp->snd_recover = th->th_ack - 1;
	1932	if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
	1933	SEQ_GEQ(th->th_ack, tp->snd_recover))
	1934	EXIT_FASTRECOVERY(tp);
	1935	tp->snd_una = th->th_ack;
	1936	if (SEQ_LT(tp->snd_nxt, tp->snd_una))
	1937	tp->snd_nxt = tp->snd_una;
	1938
	1939	switch (tp->t_state) {
	1940
	1941	/*
	1942	* In FIN_WAIT_1 STATE in addition to the processing
	1943	* for the ESTABLISHED state if our FIN is now acknowledged
	1944	* then enter FIN_WAIT_2.
	1945	*/
	1946	case TCPS_FIN_WAIT_1:
	1947	if (ourfinisacked) {
	1948	/*
	1949	* If we can't receive any more
	1950	* data, then closing user can proceed.
	1951	* Starting the timer is contrary to the
	1952	* specification, but if we don't get a FIN
	1953	* we'll hang forever.
	1954	*/
	1955	if (so->so_state & SS_CANTRCVMORE) {
	1956	soisdisconnected(so);
	1957	callout_reset(tp->tt_2msl, tcp_maxidle,
	1958	tcp_timer_2msl, tp);
	1959	}
	1960	tp->t_state = TCPS_FIN_WAIT_2;
	1961	}
	1962	break;
	1963
	1964	/*
	1965	* In CLOSING STATE in addition to the processing for
	1966	* the ESTABLISHED state if the ACK acknowledges our FIN
	1967	* then enter the TIME-WAIT state, otherwise ignore
	1968	* the segment.
	1969	*/
	1970	case TCPS_CLOSING:
	1971	if (ourfinisacked) {
	1972	tp->t_state = TCPS_TIME_WAIT;
	1973	tcp_canceltimers(tp);
	1974	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	1975	if (tp->cc_recv != 0 &&
	1976	(ticks - tp->t_starttime) < tcp_msl)
	1977	callout_reset(tp->tt_2msl,
	1978	tp->t_rxtcur *
	1979	TCPTV_TWTRUNC,
	1980	tcp_timer_2msl, tp);
	1981	else
	1982	callout_reset(tp->tt_2msl, 2 * tcp_msl,
	1983	tcp_timer_2msl, tp);
	1984	soisdisconnected(so);
	1985	}
	1986	break;
	1987
	1988	/*
	1989	* In LAST_ACK, we may still be waiting for data to drain
	1990	* and/or to be acked, as well as for the ack of our FIN.
	1991	* If our FIN is now acknowledged, delete the TCB,
	1992	* enter the closed state and return.
	1993	*/
	1994	case TCPS_LAST_ACK:
	1995	if (ourfinisacked) {
	1996	tp = tcp_close(tp);
	1997	goto drop;
	1998	}
	1999	break;
	2000
	2001	/*
	2002	* In TIME_WAIT state the only thing that should arrive
	2003	* is a retransmission of the remote FIN. Acknowledge
	2004	* it and restart the finack timer.
	2005	*/
	2006	case TCPS_TIME_WAIT:
	2007	callout_reset(tp->tt_2msl, 2 * tcp_msl,
	2008	tcp_timer_2msl, tp);
	2009	goto dropafterack;
	2010	}
	2011	}
	2012
	2013	step6:
	2014	/*
	2015	* Update window information.
	2016	* Don't look at window if no ACK: TAC's send garbage on first SYN.
	2017	*/
	2018	if ((thflags & TH_ACK) &&
	2019	(SEQ_LT(tp->snd_wl1, th->th_seq) \|\|
	2020	(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) \|\|
	2021	(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
	2022	/* keep track of pure window updates */
	2023	if (tlen == 0 &&
	2024	tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
	2025	tcpstat.tcps_rcvwinupd++;
	2026	tp->snd_wnd = tiwin;
	2027	tp->snd_wl1 = th->th_seq;
	2028	tp->snd_wl2 = th->th_ack;
	2029	if (tp->snd_wnd > tp->max_sndwnd)
	2030	tp->max_sndwnd = tp->snd_wnd;
	2031	needoutput = 1;
	2032	}
	2033
	2034	/*
	2035	* Process segments with URG.
	2036	*/
	2037	if ((thflags & TH_URG) && th->th_urp &&
	2038	TCPS_HAVERCVDFIN(tp->t_state) == 0) {
	2039	/*
	2040	* This is a kludge, but if we receive and accept
	2041	* random urgent pointers, we'll crash in
	2042	* soreceive. It's hard to imagine someone
	2043	* actually wanting to send this much urgent data.
	2044	*/
	2045	if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
	2046	th->th_urp = 0; /* XXX */
	2047	thflags &= ~TH_URG; /* XXX */
	2048	goto dodata; /* XXX */
	2049	}
	2050	/*
	2051	* If this segment advances the known urgent pointer,
	2052	* then mark the data stream. This should not happen
	2053	* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
	2054	* a FIN has been received from the remote side.
	2055	* In these states we ignore the URG.
	2056	*
	2057	* According to RFC961 (Assigned Protocols),
	2058	* the urgent pointer points to the last octet
	2059	* of urgent data. We continue, however,
	2060	* to consider it to indicate the first octet
	2061	* of data past the urgent section as the original
	2062	* spec states (in one of two places).
	2063	*/
	2064	if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
	2065	tp->rcv_up = th->th_seq + th->th_urp;
	2066	so->so_oobmark = so->so_rcv.sb_cc +
	2067	(tp->rcv_up - tp->rcv_nxt) - 1;
	2068	if (so->so_oobmark == 0)
	2069	so->so_state \|= SS_RCVATMARK;
	2070	sohasoutofband(so);
	2071	tp->t_oobflags &= ~(TCPOOB_HAVEDATA \| TCPOOB_HADDATA);
	2072	}
	2073	/*
	2074	* Remove out of band data so it doesn't get presented to the user.
	2075	* This can happen independent of advancing the URG pointer,
	2076	* but if two URG's are pending at once, some out-of-band
	2077	* data may creep in... ick.
	2078	*/
	2079	if (th->th_urp <= (u_long)tlen
	2080	#ifdef SO_OOBINLINE
	2081	&& (so->so_options & SO_OOBINLINE) == 0
	2082	#endif
	2083	)
	2084	tcp_pulloutofband(so, th, m,
	2085	drop_hdrlen); /* hdr drop is delayed */
	2086	} else {
	2087	/*
	2088	* If no out of band data is expected,
	2089	* pull receive urgent pointer along
	2090	* with the receive window.
	2091	*/
	2092	if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
	2093	tp->rcv_up = tp->rcv_nxt;
	2094	}
	2095	dodata: /* XXX */
	2096
	2097	/*
	2098	* Process the segment text, merging it into the TCP sequencing queue,
	2099	* and arranging for acknowledgment of receipt if necessary.
	2100	* This process logically involves adjusting tp->rcv_wnd as data
	2101	* is presented to the user (this happens in tcp_usrreq.c,
	2102	* case PRU_RCVD). If a FIN has already been received on this
	2103	* connection then we just ignore the text.
	2104	*/
	2105	if ((tlen \|\| (thflags & TH_FIN)) &&
	2106	TCPS_HAVERCVDFIN(tp->t_state) == 0) {
	2107	m_adj(m, drop_hdrlen); /* delayed header drop */
	2108	/*
	2109	* Insert segment which includes th into TCP reassembly queue
	2110	* with control block tp. Set thflags to whether reassembly now
	2111	* includes a segment with FIN. This handles the common case
	2112	* inline (segment is the next to be received on an established
	2113	* connection, and the queue is empty), avoiding linkage into
	2114	* and removal from the queue and repetition of various
	2115	* conversions.
	2116	* Set DELACK for segments received in order, but ack
	2117	* immediately when segments are out of order (so
	2118	* fast retransmit can work).
	2119	*/
	2120	if (th->th_seq == tp->rcv_nxt &&
	2121	LIST_EMPTY(&tp->t_segq) &&
	2122	TCPS_HAVEESTABLISHED(tp->t_state)) {
	2123	if (DELAY_ACK(tp))
	2124	callout_reset(tp->tt_delack, tcp_delacktime,
	2125	tcp_timer_delack, tp);
	2126	else
	2127	tp->t_flags \|= TF_ACKNOW;
	2128	tp->rcv_nxt += tlen;
	2129	thflags = th->th_flags & TH_FIN;
	2130	tcpstat.tcps_rcvpack++;
	2131	tcpstat.tcps_rcvbyte += tlen;
	2132	ND6_HINT(tp);
	2133	if (so->so_state & SS_CANTRCVMORE)
	2134	m_freem(m);
	2135	else
	2136	sbappend(&so->so_rcv, m);
	2137	sorwakeup(so);
	2138	} else {
	2139	thflags = tcp_reass(tp, th, &tlen, m);
	2140	tp->t_flags \|= TF_ACKNOW;
	2141	}
	2142
	2143	/*
	2144	* Note the amount of data that peer has sent into
	2145	* our window, in order to estimate the sender's
	2146	* buffer size.
	2147	*/
	2148	len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
	2149	} else {
	2150	m_freem(m);
	2151	thflags &= ~TH_FIN;
	2152	}
	2153
	2154	/*
	2155	* If FIN is received ACK the FIN and let the user know
	2156	* that the connection is closing.
	2157	*/
	2158	if (thflags & TH_FIN) {
	2159	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
	2160	socantrcvmore(so);
	2161	/*
	2162	* If connection is half-synchronized
	2163	* (ie NEEDSYN flag on) then delay ACK,
	2164	* so it may be piggybacked when SYN is sent.
	2165	* Otherwise, since we received a FIN then no
	2166	* more input can be expected, send ACK now.
	2167	*/
	2168	if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN))
	2169	callout_reset(tp->tt_delack, tcp_delacktime,
	2170	tcp_timer_delack, tp);
	2171	else
	2172	tp->t_flags \|= TF_ACKNOW;
	2173	tp->rcv_nxt++;
	2174	}
	2175	switch (tp->t_state) {
	2176
	2177	/*
	2178	* In SYN_RECEIVED and ESTABLISHED STATES
	2179	* enter the CLOSE_WAIT state.
	2180	*/
	2181	case TCPS_SYN_RECEIVED:
	2182	tp->t_starttime = ticks;
	2183	/* FALLTHROUGH */
	2184	case TCPS_ESTABLISHED:
	2185	tp->t_state = TCPS_CLOSE_WAIT;
	2186	break;
	2187
	2188	/*
	2189	* If still in FIN_WAIT_1 STATE FIN has not been acked so
	2190	* enter the CLOSING state.
	2191	*/
	2192	case TCPS_FIN_WAIT_1:
	2193	tp->t_state = TCPS_CLOSING;
	2194	break;
	2195
	2196	/*
	2197	* In FIN_WAIT_2 state enter the TIME_WAIT state,
	2198	* starting the time-wait timer, turning off the other
	2199	* standard timers.
	2200	*/
	2201	case TCPS_FIN_WAIT_2:
	2202	tp->t_state = TCPS_TIME_WAIT;
	2203	tcp_canceltimers(tp);
	2204	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	2205	if (tp->cc_recv != 0 &&
	2206	(ticks - tp->t_starttime) < tcp_msl) {
	2207	callout_reset(tp->tt_2msl,
	2208	tp->t_rxtcur * TCPTV_TWTRUNC,
	2209	tcp_timer_2msl, tp);
	2210	/* For transaction client, force ACK now. */
	2211	tp->t_flags \|= TF_ACKNOW;
	2212	}
	2213	else
	2214	callout_reset(tp->tt_2msl, 2 * tcp_msl,
	2215	tcp_timer_2msl, tp);
	2216	soisdisconnected(so);
	2217	break;
	2218
	2219	/*
	2220	* In TIME_WAIT state restart the 2 MSL time_wait timer.
	2221	*/
	2222	case TCPS_TIME_WAIT:
	2223	callout_reset(tp->tt_2msl, 2 * tcp_msl,
	2224	tcp_timer_2msl, tp);
	2225	break;
	2226	}
	2227	}
	2228	#ifdef TCPDEBUG
	2229	if (so->so_options & SO_DEBUG)
	2230	tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
	2231	&tcp_savetcp, 0);
	2232	#endif
	2233
	2234	/*
	2235	* Return any desired output.
	2236	*/
	2237	if (needoutput \|\| (tp->t_flags & TF_ACKNOW))
	2238	(void) tcp_output(tp);
	2239	return;
	2240
	2241	dropafterack:
	2242	/*
	2243	* Generate an ACK dropping incoming segment if it occupies
	2244	* sequence space, where the ACK reflects our state.
	2245	*
	2246	* We can now skip the test for the RST flag since all
	2247	* paths to this code happen after packets containing
	2248	* RST have been dropped.
	2249	*
	2250	* In the SYN-RECEIVED state, don't send an ACK unless the
	2251	* segment we received passes the SYN-RECEIVED ACK test.
	2252	* If it fails send a RST. This breaks the loop in the
	2253	* "LAND" DoS attack, and also prevents an ACK storm
	2254	* between two listening ports that have been sent forged
	2255	* SYN segments, each with the source address of the other.
	2256	*/
	2257	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
	2258	(SEQ_GT(tp->snd_una, th->th_ack) \|\|
	2259	SEQ_GT(th->th_ack, tp->snd_max)) ) {
	2260	rstreason = BANDLIM_RST_OPENPORT;
	2261	goto dropwithreset;
	2262	}
	2263	#ifdef TCPDEBUG
	2264	if (so->so_options & SO_DEBUG)
	2265	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	2266	&tcp_savetcp, 0);
	2267	#endif
	2268	m_freem(m);
	2269	tp->t_flags \|= TF_ACKNOW;
	2270	(void) tcp_output(tp);
	2271	return;
	2272
	2273	dropwithreset:
	2274	/*
	2275	* Generate a RST, dropping incoming segment.
	2276	* Make ACK acceptable to originator of segment.
	2277	* Don't bother to respond if destination was broadcast/multicast.
	2278	*/
	2279	if ((thflags & TH_RST) \|\| m->m_flags & (M_BCAST\|M_MCAST))
	2280	goto drop;
	2281	if (isipv6) {
	2282	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) \|\|
	2283	IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
	2284	goto drop;
	2285	} else {
	2286	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) \|\|
	2287	IN_MULTICAST(ntohl(ip->ip_src.s_addr)) \|\|
	2288	ip->ip_src.s_addr == htonl(INADDR_BROADCAST) \|\|
	2289	in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
	2290	goto drop;
	2291	}
	2292	/* IPv6 anycast check is done at tcp6_input() */
	2293
	2294	/*
	2295	* Perform bandwidth limiting.
	2296	*/
	2297	#ifdef ICMP_BANDLIM
	2298	if (badport_bandlim(rstreason) < 0)
	2299	goto drop;
	2300	#endif
	2301
	2302	#ifdef TCPDEBUG
	2303	if (tp == NULL \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
	2304	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	2305	&tcp_savetcp, 0);
	2306	#endif
	2307	if (thflags & TH_ACK)
	2308	/* mtod() below is safe as long as hdr dropping is delayed */
	2309	tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
	2310	TH_RST);
	2311	else {
	2312	if (thflags & TH_SYN)
	2313	tlen++;
	2314	/* mtod() below is safe as long as hdr dropping is delayed */
	2315	tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
	2316	(tcp_seq)0, TH_RST\|TH_ACK);
	2317	}
	2318	return;
	2319
	2320	drop:
	2321	/*
	2322	* Drop space held by incoming segment and return.
	2323	*/
	2324	#ifdef TCPDEBUG
	2325	if (tp == NULL \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
	2326	tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
	2327	&tcp_savetcp, 0);
	2328	#endif
	2329	m_freem(m);
	2330	return;
	2331	}
	2332
	2333	/*
	2334	* Parse TCP options and place in tcpopt.
	2335	*/
	2336	static void
	2337	tcp_dooptions(to, cp, cnt, is_syn)
	2338	struct tcpopt *to;
	2339	u_char *cp;
	2340	int cnt;
	2341	{
	2342	int opt, optlen;
	2343
	2344	to->to_flags = 0;
	2345	for (; cnt > 0; cnt -= optlen, cp += optlen) {
	2346	opt = cp[0];
	2347	if (opt == TCPOPT_EOL)
	2348	break;
	2349	if (opt == TCPOPT_NOP)
	2350	optlen = 1;
	2351	else {
	2352	if (cnt < 2)
	2353	break;
	2354	optlen = cp[1];
	2355	if (optlen < 2 \|\| optlen > cnt)
	2356	break;
	2357	}
	2358	switch (opt) {
	2359	case TCPOPT_MAXSEG:
	2360	if (optlen != TCPOLEN_MAXSEG)
	2361	continue;
	2362	if (!is_syn)
	2363	continue;
	2364	to->to_flags \|= TOF_MSS;
	2365	bcopy((char *)cp + 2,
	2366	(char *)&to->to_mss, sizeof(to->to_mss));
	2367	to->to_mss = ntohs(to->to_mss);
	2368	break;
	2369	case TCPOPT_WINDOW:
	2370	if (optlen != TCPOLEN_WINDOW)
	2371	continue;
	2372	if (! is_syn)
	2373	continue;
	2374	to->to_flags \|= TOF_SCALE;
	2375	to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
	2376	break;
	2377	case TCPOPT_TIMESTAMP:
	2378	if (optlen != TCPOLEN_TIMESTAMP)
	2379	continue;
	2380	to->to_flags \|= TOF_TS;
	2381	bcopy((char *)cp + 2,
	2382	(char *)&to->to_tsval, sizeof(to->to_tsval));
	2383	to->to_tsval = ntohl(to->to_tsval);
	2384	bcopy((char *)cp + 6,
	2385	(char *)&to->to_tsecr, sizeof(to->to_tsecr));
	2386	to->to_tsecr = ntohl(to->to_tsecr);
	2387	break;
	2388	case TCPOPT_CC:
	2389	if (optlen != TCPOLEN_CC)
	2390	continue;
	2391	to->to_flags \|= TOF_CC;
	2392	bcopy((char *)cp + 2,
	2393	(char *)&to->to_cc, sizeof(to->to_cc));
	2394	to->to_cc = ntohl(to->to_cc);
	2395	break;
	2396	case TCPOPT_CCNEW:
	2397	if (optlen != TCPOLEN_CC)
	2398	continue;
	2399	if (!is_syn)
	2400	continue;
	2401	to->to_flags \|= TOF_CCNEW;
	2402	bcopy((char *)cp + 2,
	2403	(char *)&to->to_cc, sizeof(to->to_cc));
	2404	to->to_cc = ntohl(to->to_cc);
	2405	break;
	2406	case TCPOPT_CCECHO:
	2407	if (optlen != TCPOLEN_CC)
	2408	continue;
	2409	if (!is_syn)
	2410	continue;
	2411	to->to_flags \|= TOF_CCECHO;
	2412	bcopy((char *)cp + 2,
	2413	(char *)&to->to_ccecho, sizeof(to->to_ccecho));
	2414	to->to_ccecho = ntohl(to->to_ccecho);
	2415	break;
	2416	default:
	2417	continue;
	2418	}
	2419	}
	2420	}
	2421
	2422	/*
	2423	* Pull out of band byte out of a segment so
	2424	* it doesn't appear in the user's data queue.
	2425	* It is still reflected in the segment length for
	2426	* sequencing purposes.
	2427	*/
	2428	static void
	2429	tcp_pulloutofband(so, th, m, off)
	2430	struct socket *so;
	2431	struct tcphdr *th;
	2432	struct mbuf *m;
	2433	int off; /* delayed to be droped hdrlen */
	2434	{
	2435	int cnt = off + th->th_urp - 1;
	2436
	2437	while (cnt >= 0) {
	2438	if (m->m_len > cnt) {
	2439	char *cp = mtod(m, caddr_t) + cnt;
	2440	struct tcpcb *tp = sototcpcb(so);
	2441
	2442	tp->t_iobc = *cp;
	2443	tp->t_oobflags \|= TCPOOB_HAVEDATA;
	2444	bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
	2445	m->m_len--;
	2446	if (m->m_flags & M_PKTHDR)
	2447	m->m_pkthdr.len--;
	2448	return;
	2449	}
	2450	cnt -= m->m_len;
	2451	m = m->m_next;
	2452	if (m == 0)
	2453	break;
	2454	}
	2455	panic("tcp_pulloutofband");
	2456	}
	2457
	2458	/*
	2459	* Collect new round-trip time estimate
	2460	* and update averages and current timeout.
	2461	*/
	2462	static void
	2463	tcp_xmit_timer(tp, rtt)
	2464	struct tcpcb *tp;
	2465	int rtt;
	2466	{
	2467	int delta;
	2468
	2469	tcpstat.tcps_rttupdated++;
	2470	tp->t_rttupdated++;
	2471	if (tp->t_srtt != 0) {
	2472	/*
	2473	* srtt is stored as fixed point with 5 bits after the
	2474	* binary point (i.e., scaled by 8). The following magic
	2475	* is equivalent to the smoothing algorithm in rfc793 with
	2476	* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
	2477	* point). Adjust rtt to origin 0.
	2478	*/
	2479	delta = ((rtt - 1) << TCP_DELTA_SHIFT)
	2480	- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
	2481
	2482	if ((tp->t_srtt += delta) <= 0)
	2483	tp->t_srtt = 1;
	2484
	2485	/*
	2486	* We accumulate a smoothed rtt variance (actually, a
	2487	* smoothed mean difference), then set the retransmit
	2488	* timer to smoothed rtt + 4 times the smoothed variance.
	2489	* rttvar is stored as fixed point with 4 bits after the
	2490	* binary point (scaled by 16). The following is
	2491	* equivalent to rfc793 smoothing with an alpha of .75
	2492	* (rttvar = rttvar*3/4 + \|delta\| / 4). This replaces
	2493	* rfc793's wired-in beta.
	2494	*/
	2495	if (delta < 0)
	2496	delta = -delta;
	2497	delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
	2498	if ((tp->t_rttvar += delta) <= 0)
	2499	tp->t_rttvar = 1;
	2500	if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
	2501	tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	2502	} else {
	2503	/*
	2504	* No rtt measurement yet - use the unsmoothed rtt.
	2505	* Set the variance to half the rtt (so our first
	2506	* retransmit happens at 3*rtt).
	2507	*/
	2508	tp->t_srtt = rtt << TCP_RTT_SHIFT;
	2509	tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
	2510	tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	2511	}
	2512	tp->t_rtttime = 0;
	2513	tp->t_rxtshift = 0;
	2514
	2515	/*
	2516	* the retransmit should happen at rtt + 4 * rttvar.
	2517	* Because of the way we do the smoothing, srtt and rttvar
	2518	* will each average +1/2 tick of bias. When we compute
	2519	* the retransmit timer, we want 1/2 tick of rounding and
	2520	* 1 extra tick because of +-1/2 tick uncertainty in the
	2521	* firing of the timer. The bias will give us exactly the
	2522	* 1.5 tick we need. But, because the bias is
	2523	* statistical, we have to test that we don't drop below
	2524	* the minimum feasible timer (which is 2 ticks).
	2525	*/
	2526	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	2527	max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
	2528
	2529	/*
	2530	* We received an ack for a packet that wasn't retransmitted;
	2531	* it is probably safe to discard any error indications we've
	2532	* received recently. This isn't quite right, but close enough
	2533	* for now (a route might have failed after we sent a segment,
	2534	* and the return path might not be symmetrical).
	2535	*/
	2536	tp->t_softerror = 0;
	2537	}
	2538
	2539	/*
	2540	* Determine a reasonable value for maxseg size.
	2541	* If the route is known, check route for mtu.
	2542	* If none, use an mss that can be handled on the outgoing
	2543	* interface without forcing IP to fragment; if bigger than
	2544	* an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
	2545	* to utilize large mbufs. If no route is found, route has no mtu,
	2546	* or the destination isn't local, use a default, hopefully conservative
	2547	* size (usually 512 or the default IP max size, but no more than the mtu
	2548	* of the interface), as we can't discover anything about intervening
	2549	* gateways or networks. We also initialize the congestion/slow start
	2550	* window to be a single segment if the destination isn't local.
	2551	* While looking at the routing entry, we also initialize other path-dependent
	2552	* parameters from pre-set or cached values in the routing entry.
	2553	*
	2554	* Also take into account the space needed for options that we
	2555	* send regularly. Make maxseg shorter by that amount to assure
	2556	* that we can send maxseg amount of data even when the options
	2557	* are present. Store the upper limit of the length of options plus
	2558	* data in maxopd.
	2559	*
	2560	* NOTE that this routine is only called when we process an incoming
	2561	* segment, for outgoing segments only tcp_mssopt is called.
	2562	*
	2563	* In case of T/TCP, we call this routine during implicit connection
	2564	* setup as well (offer = -1), to initialize maxseg from the cached
	2565	* MSS of our peer.
	2566	*/
	2567	void
	2568	tcp_mss(tp, offer)
	2569	struct tcpcb *tp;
	2570	int offer;
	2571	{
	2572	struct rtentry *rt;
	2573	struct ifnet *ifp;
	2574	int rtt, mss;
	2575	u_long bufsize;
	2576	struct inpcb *inp = tp->t_inpcb;
	2577	struct socket *so;
	2578	struct rmxp_tao *taop;
	2579	int origoffer = offer;
	2580	#ifdef INET6
	2581	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	2582	size_t min_protoh = isipv6 ?
	2583	sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
	2584	sizeof(struct tcpiphdr);
	2585	#else
	2586	const int isipv6 = 0;
	2587	const size_t min_protoh = sizeof(struct tcpiphdr);
	2588	#endif
	2589
	2590	if (isipv6)
	2591	rt = tcp_rtlookup6(&inp->inp_inc);
	2592	else
	2593	rt = tcp_rtlookup(&inp->inp_inc);
	2594	if (rt == NULL) {
	2595	tp->t_maxopd = tp->t_maxseg =
	2596	isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	2597	return;
	2598	}
	2599	ifp = rt->rt_ifp;
	2600	so = inp->inp_socket;
	2601
	2602	taop = rmx_taop(rt->rt_rmx);
	2603	/*
	2604	* Offer == -1 means that we didn't receive SYN yet,
	2605	* use cached value in that case;
	2606	*/
	2607	if (offer == -1)
	2608	offer = taop->tao_mssopt;
	2609	/*
	2610	* Offer == 0 means that there was no MSS on the SYN segment,
	2611	* in this case we use tcp_mssdflt.
	2612	*/
	2613	if (offer == 0)
	2614	offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	2615	else
	2616	/*
	2617	* Sanity check: make sure that maxopd will be large
	2618	* enough to allow some data on segments even is the
	2619	* all the option space is used (40bytes). Otherwise
	2620	* funny things may happen in tcp_output.
	2621	*/
	2622	offer = max(offer, 64);
	2623	taop->tao_mssopt = offer;
	2624
	2625	/*
	2626	* While we're here, check if there's an initial rtt
	2627	* or rttvar. Convert from the route-table units
	2628	* to scaled multiples of the slow timeout timer.
	2629	*/
	2630	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
	2631	/*
	2632	* XXX the lock bit for RTT indicates that the value
	2633	* is also a minimum value; this is subject to time.
	2634	*/
	2635	if (rt->rt_rmx.rmx_locks & RTV_RTT)
	2636	tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
	2637	tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
	2638	tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
	2639	tcpstat.tcps_usedrtt++;
	2640	if (rt->rt_rmx.rmx_rttvar) {
	2641	tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
	2642	(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
	2643	tcpstat.tcps_usedrttvar++;
	2644	} else {
	2645	/* default variation is +- 1 rtt */
	2646	tp->t_rttvar =
	2647	tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
	2648	}
	2649	TCPT_RANGESET(tp->t_rxtcur,
	2650	((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
	2651	tp->t_rttmin, TCPTV_REXMTMAX);
	2652	}
	2653	/*
	2654	* if there's an mtu associated with the route, use it
	2655	* else, use the link mtu.
	2656	*/
	2657	if (rt->rt_rmx.rmx_mtu)
	2658	mss = rt->rt_rmx.rmx_mtu - min_protoh;
	2659	else {
	2660	if (isipv6) {
	2661	mss = nd_ifinfo[rt->rt_ifp->if_index].linkmtu -
	2662	min_protoh;
	2663	if (!in6_localaddr(&inp->in6p_faddr))
	2664	mss = min(mss, tcp_v6mssdflt);
	2665	} else {
	2666	mss = ifp->if_mtu - min_protoh;
	2667	if (!in_localaddr(inp->inp_faddr))
	2668	mss = min(mss, tcp_mssdflt);
	2669	}
	2670	}
	2671	mss = min(mss, offer);
	2672	/*
	2673	* maxopd stores the maximum length of data AND options
	2674	* in a segment; maxseg is the amount of data in a normal
	2675	* segment. We need to store this value (maxopd) apart
	2676	* from maxseg, because now every segment carries options
	2677	* and thus we normally have somewhat less data in segments.
	2678	*/
	2679	tp->t_maxopd = mss;
	2680
	2681	/*
	2682	* In case of T/TCP, origoffer==-1 indicates, that no segments
	2683	* were received yet. In this case we just guess, otherwise
	2684	* we do the same as before T/TCP.
	2685	*/
	2686	if ((tp->t_flags & (TF_REQ_TSTMP\|TF_NOOPT)) == TF_REQ_TSTMP &&
	2687	(origoffer == -1 \|\|
	2688	(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
	2689	mss -= TCPOLEN_TSTAMP_APPA;
	2690	if ((tp->t_flags & (TF_REQ_CC\|TF_NOOPT)) == TF_REQ_CC &&
	2691	(origoffer == -1 \|\|
	2692	(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
	2693	mss -= TCPOLEN_CC_APPA;
	2694
	2695	#if (MCLBYTES & (MCLBYTES - 1)) == 0
	2696	if (mss > MCLBYTES)
	2697	mss &= ~(MCLBYTES-1);
	2698	#else
	2699	if (mss > MCLBYTES)
	2700	mss = mss / MCLBYTES * MCLBYTES;
	2701	#endif
	2702	/*
	2703	* If there's a pipesize, change the socket buffer
	2704	* to that size. Make the socket buffers an integral
	2705	* number of mss units; if the mss is larger than
	2706	* the socket buffer, decrease the mss.
	2707	*/
	2708	#ifdef RTV_SPIPE
	2709	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
	2710	#endif
	2711	bufsize = so->so_snd.sb_hiwat;
	2712	if (bufsize < mss)
	2713	mss = bufsize;
	2714	else {
	2715	bufsize = roundup(bufsize, mss);
	2716	if (bufsize > sb_max)
	2717	bufsize = sb_max;
	2718	if (bufsize > so->so_snd.sb_hiwat)
	2719	(void)sbreserve(&so->so_snd, bufsize, so, NULL);
	2720	}
	2721	tp->t_maxseg = mss;
	2722
	2723	#ifdef RTV_RPIPE
	2724	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
	2725	#endif
	2726	bufsize = so->so_rcv.sb_hiwat;
	2727	if (bufsize > mss) {
	2728	bufsize = roundup(bufsize, mss);
	2729	if (bufsize > sb_max)
	2730	bufsize = sb_max;
	2731	if (bufsize > so->so_rcv.sb_hiwat)
	2732	(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
	2733	}
	2734
	2735	/*
	2736	* Set the slow-start flight size depending on whether this
	2737	* is a local network or not.
	2738	*/
	2739	if (tcp_do_rfc3390)
	2740	tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
	2741	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) \|\|
	2742	(!isipv6 && in_localaddr(inp->inp_faddr)))
	2743	tp->snd_cwnd = mss * ss_fltsz_local;
	2744	else
	2745	tp->snd_cwnd = mss * ss_fltsz;
	2746
	2747	if (rt->rt_rmx.rmx_ssthresh) {
	2748	/*
	2749	* There's some sort of gateway or interface
	2750	* buffer limit on the path. Use this to set
	2751	* the slow start threshhold, but set the
	2752	* threshold to no less than 2*mss.
	2753	*/
	2754	tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
	2755	tcpstat.tcps_usedssthresh++;
	2756	}
	2757	}
	2758
	2759	/*
	2760	* Determine the MSS option to send on an outgoing SYN.
	2761	*/
	2762	int
	2763	tcp_mssopt(tp)
	2764	struct tcpcb *tp;
	2765	{
	2766	struct rtentry *rt;
	2767	#ifdef INET6
	2768	int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	2769	int min_protoh = isipv6 ?
	2770	sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
	2771	sizeof(struct tcpiphdr);
	2772	#else
	2773	const int isipv6 = 0;
	2774	const size_t min_protoh = sizeof(struct tcpiphdr);
	2775	#endif
	2776
	2777	if (isipv6)
	2778	rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
	2779	else
	2780	rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
	2781	if (rt == NULL)
	2782	return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
	2783
	2784	return (rt->rt_ifp->if_mtu - min_protoh);
	2785	}
	2786
	2787
	2788	/*
	2789	* When a partial ack arrives, force the retransmission of the
	2790	* next unacknowledged segment. Do not clear tp->t_dupacks.
	2791	* By setting snd_nxt to ti_ack, this forces retransmission timer to
	2792	* be started again.
	2793	*/
	2794	static void
	2795	tcp_newreno_partial_ack(tp, th)
	2796	struct tcpcb *tp;
	2797	struct tcphdr *th;
	2798	{
	2799	tcp_seq onxt = tp->snd_nxt;
	2800	u_long ocwnd = tp->snd_cwnd;
	2801
	2802	callout_stop(tp->tt_rexmt);
	2803	tp->t_rtttime = 0;
	2804	tp->snd_nxt = th->th_ack;
	2805	/*
	2806	* Set snd_cwnd to one segment beyond acknowledged offset
	2807	* (tp->snd_una has not yet been updated when this function is called.)
	2808	*/
	2809	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
	2810	tp->t_flags \|= TF_ACKNOW;
	2811	(void) tcp_output(tp);
	2812	tp->snd_cwnd = ocwnd;
	2813	if (SEQ_GT(onxt, tp->snd_nxt))
	2814	tp->snd_nxt = onxt;
	2815	/*
	2816	* Partial window deflation. Relies on fact that tp->snd_una
	2817	* not updated yet.
	2818	*/
	2819	tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
	2820	}