gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved.
	3	* Copyright (c) 2004 The DragonFly Project. All rights reserved.
	4	*
	5	* This code is derived from software contributed to The DragonFly Project
	6	* by Jeffrey M. Hsu.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. Neither the name of The DragonFly Project nor the names of its
	17	* contributors may be used to endorse or promote products derived
	18	* from this software without specific, prior written permission.
	19	*
	20	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	21	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	22	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	23	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	24	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	25	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	26	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	27	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	28	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	29	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	30	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	31	* SUCH DAMAGE.
	32	*/
	33
	34	/*
	35	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
	36	* The Regents of the University of California. All rights reserved.
	37	*
	38	* Redistribution and use in source and binary forms, with or without
	39	* modification, are permitted provided that the following conditions
	40	* are met:
	41	* 1. Redistributions of source code must retain the above copyright
	42	* notice, this list of conditions and the following disclaimer.
	43	* 2. Redistributions in binary form must reproduce the above copyright
	44	* notice, this list of conditions and the following disclaimer in the
	45	* documentation and/or other materials provided with the distribution.
	46	* 3. All advertising materials mentioning features or use of this software
	47	* must display the following acknowledgement:
	48	* This product includes software developed by the University of
	49	* California, Berkeley and its contributors.
	50	* 4. Neither the name of the University nor the names of its contributors
	51	* may be used to endorse or promote products derived from this software
	52	* without specific prior written permission.
	53	*
	54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	64	* SUCH DAMAGE.
	65	*
	66	* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
	67	* $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.20 2003/01/29 22:45:36 hsu Exp $
	68	* $DragonFly: src/sys/netinet/tcp_output.c,v 1.34 2007/04/22 01:13:14 dillon Exp $
	69	*/
	70
	71	#include "opt_inet.h"
	72	#include "opt_inet6.h"
	73	#include "opt_ipsec.h"
	74	#include "opt_tcpdebug.h"
	75
	76	#include <sys/param.h>
	77	#include <sys/systm.h>
	78	#include <sys/kernel.h>
	79	#include <sys/sysctl.h>
	80	#include <sys/mbuf.h>
	81	#include <sys/domain.h>
	82	#include <sys/protosw.h>
	83	#include <sys/socket.h>
	84	#include <sys/socketvar.h>
	85	#include <sys/in_cksum.h>
	86	#include <sys/thread.h>
	87	#include <sys/globaldata.h>
	88
	89	#include <net/route.h>
	90
	91	#include <netinet/in.h>
	92	#include <netinet/in_systm.h>
	93	#include <netinet/ip.h>
	94	#include <netinet/in_pcb.h>
	95	#include <netinet/ip_var.h>
	96	#include <netinet6/in6_pcb.h>
	97	#include <netinet/ip6.h>
	98	#include <netinet6/ip6_var.h>
	99	#include <netinet/tcp.h>
	100	#define TCPOUTFLAGS
	101	#include <netinet/tcp_fsm.h>
	102	#include <netinet/tcp_seq.h>
	103	#include <netinet/tcp_timer.h>
	104	#include <netinet/tcp_timer2.h>
	105	#include <netinet/tcp_var.h>
	106	#include <netinet/tcpip.h>
	107	#ifdef TCPDEBUG
	108	#include <netinet/tcp_debug.h>
	109	#endif
	110
	111	#ifdef IPSEC
	112	#include <netinet6/ipsec.h>
	113	#endif /*IPSEC*/
	114
	115	#ifdef FAST_IPSEC
	116	#include <netproto/ipsec/ipsec.h>
	117	#define IPSEC
	118	#endif /*FAST_IPSEC*/
	119
	120	#ifdef notyet
	121	extern struct mbuf *m_copypack();
	122	#endif
	123
	124	int path_mtu_discovery = 0;	/* Path MTU Discovery toggle; initialized to 0 and registered below with CTLFLAG_RW */
	125	SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
	126	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
	127
	128	static int avoid_pure_win_update = 1;	/* when nonzero, tcp_output() skips the 2-segment pure window update unless TF_RXRESIZED is set */
	129	SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
	130	&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
	131
	132	int tcp_do_autosndbuf = 1;	/* nonzero lets tcp_output() grow send buffers that have SSB_AUTOSIZE set */
	133	SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
	134	&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
	135
	136	int tcp_autosndbuf_inc = 8*1024;	/* amount added to so_snd.ssb_hiwat on each automatic growth step in tcp_output() */
	137	SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
	138	&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
	139
	140	int tcp_autosndbuf_max = 2*1024*1024;	/* upper bound (2 MB) for automatically sized send buffers; tcp_output() stops growing at this size */
	141	SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
	142	&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
	143
	144	/*
	145	* Tcp output routine: figure out what should be sent and send it.
	146	*/
	147	int
	148	tcp_output(struct tcpcb *tp)
	149	{
	150	struct inpcb * const inp = tp->t_inpcb;
	151	struct socket *so = inp->inp_socket;
	152	long len, recvwin, sendwin;
	153	int nsacked = 0;
	154	int off, flags, error;
	155	#ifdef TCP_SIGNATURE
	156	int sigoff = 0;
	157	#endif
	158	struct mbuf *m;
	159	struct ip *ip = NULL;
	160	struct ipovly *ipov = NULL;
	161	struct tcphdr *th;
	162	u_char opt[TCP_MAXOLEN];
	163	unsigned int ipoptlen, optlen, hdrlen;
	164	int idle;
	165	boolean_t sendalot;
	166	struct ip6_hdr *ip6 = NULL;
	167	#ifdef INET6
	168	const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
	169	#else
	170	const boolean_t isipv6 = FALSE;
	171	#endif
	172
	173	KKASSERT(so->so_port == &curthread->td_msgport);
	174
	175	/*
	176	* Determine length of data that should be transmitted,
	177	* and flags that will be used.
	178	* If there is some data or critical controls (SYN, RST)
	179	* to send, then transmit; otherwise, investigate further.
	180	*/
	181
	182	/*
	183	* If we have been idle for a while, the send congestion window
	184	* could be no longer representative of the current state of the link.
	185	* So unless we are expecting more acks to come in, slow-start from
	186	* scratch to re-determine the send congestion window.
	187	*/
	188	if (tp->snd_max == tp->snd_una &&
	189	(ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
	190	if (tcp_do_rfc3390) {
	191	int initial_cwnd =
	192	min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
	193
	194	tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd);
	195	} else {
	196	tp->snd_cwnd = tp->t_maxseg;
	197	}
	198	tp->snd_wacked = 0;
	199	}
	200
	201	/*
	202	* Calculate whether the transmit stream was previously idle
	203	* and adjust TF_LASTIDLE for the next time.
	204	*/
	205	idle = (tp->t_flags & TF_LASTIDLE) \|\| (tp->snd_max == tp->snd_una);
	206	if (idle && (tp->t_flags & TF_MORETOCOME))
	207	tp->t_flags \|= TF_LASTIDLE;
	208	else
	209	tp->t_flags &= ~TF_LASTIDLE;
	210
	211	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	212	!IN_FASTRECOVERY(tp))
	213	nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);
	214
	215	again:
	216	/* Make use of SACK information when slow-starting after a RTO. */
	217	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	218	!IN_FASTRECOVERY(tp)) {
	219	tcp_seq old_snd_nxt = tp->snd_nxt;
	220
	221	tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
	222	nsacked += tp->snd_nxt - old_snd_nxt;
	223	}
	224
	225	sendalot = FALSE;
	226	off = tp->snd_nxt - tp->snd_una;
	227	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	228	sendwin = min(sendwin, tp->snd_bwnd);
	229
	230	flags = tcp_outflags[tp->t_state];
	231	/*
	232	* Get standard flags, and add SYN or FIN if requested by 'hidden'
	233	* state flags.
	234	*/
	235	if (tp->t_flags & TF_NEEDFIN)
	236	flags \|= TH_FIN;
	237	if (tp->t_flags & TF_NEEDSYN)
	238	flags \|= TH_SYN;
	239
	240	/*
	241	* If in persist timeout with window of 0, send 1 byte.
	242	* Otherwise, if window is small but nonzero
	243	* and timer expired, we will send what we can
	244	* and go to transmit state.
	245	*/
	246	if (tp->t_flags & TF_FORCE) {
	247	if (sendwin == 0) {
	248	/*
	249	* If we still have some data to send, then
	250	* clear the FIN bit. Usually this would
	251	* happen below when it realizes that we
	252	* aren't sending all the data. However,
	253	* if we have exactly 1 byte of unsent data,
	254	* then it won't clear the FIN bit below,
	255	* and if we are in persist state, we wind
	256	* up sending the packet without recording
	257	* that we sent the FIN bit.
	258	*
	259	* We can't just blindly clear the FIN bit,
	260	* because if we don't have any more data
	261	* to send then the probe will be the FIN
	262	* itself.
	263	*/
	264	if (off < so->so_snd.ssb_cc)
	265	flags &= ~TH_FIN;
	266	sendwin = 1;
	267	} else {
	268	tcp_callout_stop(tp, tp->tt_persist);
	269	tp->t_rxtshift = 0;
	270	}
	271	}
	272
	273	/*
	274	* If snd_nxt == snd_max and we have transmitted a FIN, the
	275	* offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	276	* a negative length. This can also occur when TCP opens up
	277	* its congestion window while receiving additional duplicate
	278	* acks after fast-retransmit because TCP will reset snd_nxt
	279	* to snd_max after the fast-retransmit.
	280	*
	281	* A negative length can also occur when we are in the
	282	* TCPS_SYN_RECEIVED state due to a simultaneous connect where
	283	* our SYN has not been acked yet.
	284	*
	285	* In the normal retransmit-FIN-only case, however, snd_nxt will
	286	* be set to snd_una, the offset will be 0, and the length may
	287	* wind up 0.
	288	*/
	289	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;
	290
	291	/*
	292	* Lop off SYN bit if it has already been sent. However, if this
	293	* is SYN-SENT state and if segment contains data, suppress sending
	294	* segment (sending the segment would be an option if we still
	295	* did TAO and the remote host supported it).
	296	*/
	297	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
	298	flags &= ~TH_SYN;
	299	off--, len++;
	300	if (len > 0 && tp->t_state == TCPS_SYN_SENT)
	301	return 0;
	302	}
	303
	304	/*
	305	* Be careful not to send data and/or FIN on SYN segments.
	306	* This measure is needed to prevent interoperability problems
	307	* with not fully conformant TCP implementations.
	308	*/
	309	if (flags & TH_SYN) {
	310	len = 0;
	311	flags &= ~TH_FIN;
	312	}
	313
	314	if (len < 0) {
	315	/*
	316	* A negative len can occur if our FIN has been sent but not
	317	* acked, or if we are in a simultaneous connect in the
	318	* TCPS_SYN_RECEIVED state with our SYN sent but not yet
	319	* acked.
	320	*
	321	* If our window has contracted to 0 in the FIN case
	322	* (which can only occur if we have NOT been called to
	323	* retransmit as per code a few paragraphs up) then we
	324	* want to shift the retransmit timer over to the
	325	* persist timer.
	326	*
	327	* However, if we are in the TCPS_SYN_RECEIVED state
	328	* (the SYN case) we will be in a simultaneous connect and
	329	* the window may be zero degeneratively. In this case we
	330	* do not want to shift to the persist timer after the SYN
	331	* or the SYN+ACK transmission.
	332	*/
	333	len = 0;
	334	if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
	335	tcp_callout_stop(tp, tp->tt_rexmt);
	336	tp->t_rxtshift = 0;
	337	tp->snd_nxt = tp->snd_una;
	338	if (!tcp_callout_active(tp, tp->tt_persist))
	339	tcp_setpersist(tp);
	340	}
	341	}
	342
	343	KASSERT(len >= 0, ("%s: len < 0", __func__));
	344	/*
	345	* Automatic sizing of send socket buffer. Often the send buffer
	346	* size is not optimally adjusted to the actual network conditions
	347	* at hand (delay bandwidth product). Setting the buffer size too
	348	* small limits throughput on links with high bandwidth and high
	349	* delay (eg. trans-continental/oceanic links). Setting the
	350	* buffer size too big consumes too much real kernel memory,
	351	* especially with many connections on busy servers.
	352	*
	353	* The criteria to step up the send buffer one notch are:
	354	* 1. receive window of remote host is larger than send buffer
	355	* (with a fudge factor of 5/4th);
	356	* 2. send buffer is filled to 7/8th with data (so we actually
	357	* have data to make use of it);
	358	* 3. send buffer fill has not hit maximal automatic size;
	359	* 4. our send window (slow start and congestion controlled) is
	360	* larger than sent but unacknowledged data in send buffer.
	361	*
	362	* The remote host receive window scaling factor may limit the
	363	* growing of the send buffer before it reaches its allowed
	364	* maximum.
	365	*
	366	* It scales directly with slow start or congestion window
	367	* and does at most one step per received ACK. This fast
	368	* scaling has the drawback of growing the send buffer beyond
	369	* what is strictly necessary to make full use of a given
	370	* delay*bandwidth product. However testing has shown this not
	371	* to be much of a problem. At worst we are trading wasting
	372	* of available bandwidth (the non-use of it) for wasting some
	373	* socket buffer memory.
	374	*
	375	* TODO: Shrink send buffer during idle periods together
	376	* with congestion window. Requires another timer. Has to
	377	* wait for upcoming tcp timer rewrite.
	378	*/
	379	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
	380	if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
	381	so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
	382	so->so_snd.ssb_cc < tcp_autosndbuf_max &&
	383	sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
	384	u_long newsize;
	385
	386	newsize = ulmin(so->so_snd.ssb_hiwat +
	387	tcp_autosndbuf_inc,
	388	tcp_autosndbuf_max);
	389	if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
	390	atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
	391	if (newsize >= (TCP_MAXWIN << tp->snd_scale))
	392	atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
	393	}
	394	}
	395
	396	/*
	397	* Truncate to the maximum segment length and ensure that FIN is
	398	* removed if the length no longer contains the last data byte.
	399	*/
	400	if (len > tp->t_maxseg) {
	401	len = tp->t_maxseg;
	402	sendalot = TRUE;
	403	}
	404	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
	405	flags &= ~TH_FIN;
	406
	407	recvwin = ssb_space(&so->so_rcv);
	408
	409	/*
	410	* Sender silly window avoidance. We transmit under the following
	411	* conditions when len is non-zero:
	412	*
	413	* - We have a full segment
	414	* - This is the last buffer in a write()/send() and we are
	415	* either idle or running NODELAY
	416	* - we've timed out (e.g. persist timer)
	417	* - we have more than 1/2 the maximum send window's worth of
	418	* data (receiver may be limiting the window size)
	419	* - we need to retransmit
	420	*/
	421	if (len) {
	422	if (len == tp->t_maxseg)
	423	goto send;
	424	/*
	425	* NOTE! on localhost connections an 'ack' from the remote
	426	* end may occur synchronously with the output and cause
	427	* us to flush a buffer queued with moretocome. XXX
	428	*
	429	* note: the len + off check is almost certainly unnecessary.
	430	*/
	431	if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
	432	(idle \|\| (tp->t_flags & TF_NODELAY)) &&
	433	len + off >= so->so_snd.ssb_cc &&
	434	!(tp->t_flags & TF_NOPUSH)) {
	435	goto send;
	436	}
	437	if (tp->t_flags & TF_FORCE) /* typ. timeout case */
	438	goto send;
	439	if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
	440	goto send;
	441	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
	442	goto send;
	443	}
	444
	445	/*
	446	* Compare available window to amount of window
	447	* known to peer (as advertised window less
	448	* next expected input). If the difference is at least two
	449	* max size segments, or at least 50% of the maximum possible
	450	* window, then want to send a window update to peer.
	451	*/
	452	if (recvwin > 0) {
	453	/*
	454	* "adv" is the amount we can increase the window,
	455	* taking into account that we are limited by
	456	* TCP_MAXWIN << tp->rcv_scale.
	457	*/
	458	long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
	459	(tp->rcv_adv - tp->rcv_nxt);
	460	long hiwat;
	461
	462	/*
	463	* This ack case typically occurs when the user has drained
	464	* the TCP socket buffer sufficiently to warrant an ack
	465	* containing a 'pure window update'... that is, an ack that
	466	* ONLY updates the tcp window.
	467	*
	468	* It is unclear why we would need to do a pure window update
	469	* past 2 segments if we are going to do one at 1/2 the high
	470	* water mark anyway, especially since under normal conditions
	471	* the user program will drain the socket buffer quickly.
	472	* The 2-segment pure window update will often add a large
	473	* number of extra, unnecessary acks to the stream.
	474	*
	475	* avoid_pure_win_update now defaults to 1.
	476	*/
	477	if (avoid_pure_win_update == 0 \|\|
	478	(tp->t_flags & TF_RXRESIZED)) {
	479	if (adv >= (long) (2 * tp->t_maxseg)) {
	480	goto send;
	481	}
	482	}
	483	hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
	484	if (hiwat > (long)so->so_rcv.ssb_hiwat)
	485	hiwat = (long)so->so_rcv.ssb_hiwat;
	486	if (adv >= hiwat / 2)
	487	goto send;
	488	}
	489
	490	/*
	491	* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
	492	* is also a catch-all for the retransmit timer timeout case.
	493	*/
	494	if (tp->t_flags & TF_ACKNOW)
	495	goto send;
	496	if ((flags & TH_RST) \|\|
	497	((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
	498	goto send;
	499	if (SEQ_GT(tp->snd_up, tp->snd_una))
	500	goto send;
	501	/*
	502	* If our state indicates that FIN should be sent
	503	* and we have not yet done so, then we need to send.
	504	*/
	505	if (flags & TH_FIN &&
	506	(!(tp->t_flags & TF_SENTFIN) \|\| tp->snd_nxt == tp->snd_una))
	507	goto send;
	508
	509	/*
	510	* TCP window updates are not reliable, rather a polling protocol
	511	* using ``persist'' packets is used to insure receipt of window
	512	* updates. The three ``states'' for the output side are:
	513	* idle not doing retransmits or persists
	514	* persisting to move a small or zero window
	515	* (re)transmitting and thereby not persisting
	516	*
	517	* tcp_callout_active(tp, tp->tt_persist)
	518	* is true when we are in persist state.
	519	* The TF_FORCE flag in tp->t_flags
	520	* is set when we are called to send a persist packet.
	521	* tcp_callout_active(tp, tp->tt_rexmt)
	522	* is set when we are retransmitting
	523	* The output side is idle when both timers are zero.
	524	*
	525	* If send window is too small, there is data to transmit, and no
	526	* retransmit or persist is pending, then go to persist state.
	527	* If nothing happens soon, send when timer expires:
	528	* if window is nonzero, transmit what we can,
	529	* otherwise force out a byte.
	530	*/
	531	if (so->so_snd.ssb_cc > 0 &&
	532	!tcp_callout_active(tp, tp->tt_rexmt) &&
	533	!tcp_callout_active(tp, tp->tt_persist)) {
	534	tp->t_rxtshift = 0;
	535	tcp_setpersist(tp);
	536	}
	537
	538	/*
	539	* No reason to send a segment, just return.
	540	*/
	541	return (0);
	542
	543	send:
	544	/*
	545	* Before ESTABLISHED, force sending of initial options
	546	* unless TCP set not to do any options.
	547	* NOTE: we assume that the IP/TCP header plus TCP options
	548	* always fit in a single mbuf, leaving room for a maximum
	549	* link header, i.e.
	550	* max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	551	*/
	552	optlen = 0;
	553	if (isipv6)
	554	hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	555	else
	556	hdrlen = sizeof(struct tcpiphdr);
	557	if (flags & TH_SYN) {
	558	tp->snd_nxt = tp->iss;
	559	if (!(tp->t_flags & TF_NOOPT)) {
	560	u_short mss;
	561
	562	opt[0] = TCPOPT_MAXSEG;
	563	opt[1] = TCPOLEN_MAXSEG;
	564	mss = htons((u_short) tcp_mssopt(tp));
	565	memcpy(opt + 2, &mss, sizeof mss);
	566	optlen = TCPOLEN_MAXSEG;
	567
	568	if ((tp->t_flags & TF_REQ_SCALE) &&
	569	(!(flags & TH_ACK) \|\|
	570	(tp->t_flags & TF_RCVD_SCALE))) {
	571	((u_int32_t )(opt + optlen)) = htonl(
	572	TCPOPT_NOP << 24 \|
	573	TCPOPT_WINDOW << 16 \|
	574	TCPOLEN_WINDOW << 8 \|
	575	tp->request_r_scale);
	576	optlen += 4;
	577	}
	578
	579	if ((tcp_do_sack && !(flags & TH_ACK)) \|\|
	580	tp->t_flags & TF_SACK_PERMITTED) {
	581	uint32_t lp = (uint32_t )(opt + optlen);
	582
	583	*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
	584	optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
	585	}
	586	}
	587	}
	588
	589	/*
	590	* Send a timestamp and echo-reply if this is a SYN and our side
	591	* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	592	* and our peer have sent timestamps in our SYN's.
	593	*/
	594	if ((tp->t_flags & (TF_REQ_TSTMP \| TF_NOOPT)) == TF_REQ_TSTMP &&
	595	!(flags & TH_RST) &&
	596	(!(flags & TH_ACK) \|\| (tp->t_flags & TF_RCVD_TSTMP))) {
	597	u_int32_t lp = (u_int32_t )(opt + optlen);
	598
	599	/* Form timestamp option as shown in appendix A of RFC 1323. */
	600	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
	601	*lp++ = htonl(ticks);
	602	*lp = htonl(tp->ts_recent);
	603	optlen += TCPOLEN_TSTAMP_APPA;
	604	}
	605
	606	/* Set receive buffer autosizing timestamp. */
	607	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
	608	tp->rfbuf_ts = ticks;
	609
	610	/*
	611	* If this is a SACK connection and we have a block to report,
	612	* fill in the SACK blocks in the TCP options.
	613	*/
	614	if ((tp->t_flags & (TF_SACK_PERMITTED \| TF_NOOPT)) ==
	615	TF_SACK_PERMITTED &&
	616	(!LIST_EMPTY(&tp->t_segq) \|\|
	617	tp->reportblk.rblk_start != tp->reportblk.rblk_end))
	618	tcp_sack_fill_report(tp, opt, &optlen);
	619
	620	#ifdef TCP_SIGNATURE
	621	if (tp->t_flags & TF_SIGNATURE) {
	622	int i;
	623	u_char *bp;
	624	/*
	625	* Initialize TCP-MD5 option (RFC2385)
	626	*/
	627	bp = (u_char *)opt + optlen;
	628	*bp++ = TCPOPT_SIGNATURE;
	629	*bp++ = TCPOLEN_SIGNATURE;
	630	sigoff = optlen + 2;
	631	for (i = 0; i < TCP_SIGLEN; i++)
	632	*bp++ = 0;
	633	optlen += TCPOLEN_SIGNATURE;
	634	/*
	635	* Terminate options list and maintain 32-bit alignment.
	636	*/
	637	*bp++ = TCPOPT_NOP;
	638	*bp++ = TCPOPT_EOL;
	639	optlen += 2;
	640	}
	641	#endif /* TCP_SIGNATURE */
	642	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	643	hdrlen += optlen;
	644
	645	if (isipv6) {
	646	ipoptlen = ip6_optlen(inp);
	647	} else {
	648	if (inp->inp_options) {
	649	ipoptlen = inp->inp_options->m_len -
	650	offsetof(struct ipoption, ipopt_list);
	651	} else {
	652	ipoptlen = 0;
	653	}
	654	}
	655	#ifdef IPSEC
	656	ipoptlen += ipsec_hdrsiz_tcp(tp);
	657	#endif
	658
	659	/*
	660	* Adjust data length if insertion of options will bump the packet
	661	* length beyond the t_maxopd length. Clear FIN to prevent premature
	662	* closure since there is still more data to send after this (now
	663	* truncated) packet.
	664	*
	665	* If just the options do not fit we are in a no-win situation and
	666	* we treat it as an unreachable host.
	667	*/
	668	if (len + optlen + ipoptlen > tp->t_maxopd) {
	669	if (tp->t_maxopd <= optlen + ipoptlen) {
	670	static time_t last_optlen_report;
	671
	672	if (last_optlen_report != time_second) {
	673	last_optlen_report = time_second;
	674	kprintf("tcpcb %p: MSS (%d) too small to hold options!\n", tp, tp->t_maxopd);
	675	}
	676	error = EHOSTUNREACH;
	677	goto out;
	678	} else {
	679	flags &= ~TH_FIN;
	680	len = tp->t_maxopd - optlen - ipoptlen;
	681	sendalot = TRUE;
	682	}
	683	}
	684
	685	#ifdef INET6
	686	KASSERT(max_linkhdr + hdrlen <= MCLBYTES, ("tcphdr too big"));
	687	#else
	688	KASSERT(max_linkhdr + hdrlen <= MHLEN, ("tcphdr too big"));
	689	#endif
	690
	691	/*
	692	* Grab a header mbuf, attaching a copy of data to
	693	* be transmitted, and initialize the header from
	694	* the template for sends on this connection.
	695	*/
	696	if (len) {
	697	if ((tp->t_flags & TF_FORCE) && len == 1)
	698	tcpstat.tcps_sndprobe++;
	699	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
	700	if (tp->snd_nxt == tp->snd_una)
	701	tp->snd_max_rexmt = tp->snd_max;
	702	tcpstat.tcps_sndrexmitpack++;
	703	tcpstat.tcps_sndrexmitbyte += len;
	704	} else {
	705	tcpstat.tcps_sndpack++;
	706	tcpstat.tcps_sndbyte += len;
	707	}
	708	#ifdef notyet
	709	if ((m = m_copypack(so->so_snd.ssb_mb, off, (int)len,
	710	max_linkhdr + hdrlen)) == NULL) {
	711	error = ENOBUFS;
	712	goto out;
	713	}
	714	/*
	715	* m_copypack left space for our hdr; use it.
	716	*/
	717	m->m_len += hdrlen;
	718	m->m_data -= hdrlen;
	719	#else
	720	#ifndef INET6
	721	m = m_gethdr(MB_DONTWAIT, MT_HEADER);
	722	#else
	723	m = m_getl(hdrlen + max_linkhdr, MB_DONTWAIT, MT_HEADER,
	724	M_PKTHDR, NULL);
	725	#endif
	726	if (m == NULL) {
	727	error = ENOBUFS;
	728	goto out;
	729	}
	730	m->m_data += max_linkhdr;
	731	m->m_len = hdrlen;
	732	if (len <= MHLEN - hdrlen - max_linkhdr) {
	733	m_copydata(so->so_snd.ssb_mb, off, (int) len,
	734	mtod(m, caddr_t) + hdrlen);
	735	m->m_len += len;
	736	} else {
	737	m->m_next = m_copy(so->so_snd.ssb_mb, off, (int) len);
	738	if (m->m_next == NULL) {
	739	m_free(m);
	740	error = ENOBUFS;
	741	goto out;
	742	}
	743	}
	744	#endif
	745	/*
	746	* If we're sending everything we've got, set PUSH.
	747	* (This will keep happy those implementations which only
	748	* give data to the user when a buffer fills or
	749	* a PUSH comes in.)
	750	*/
	751	if (off + len == so->so_snd.ssb_cc)
	752	flags \|= TH_PUSH;
	753	} else {
	754	if (tp->t_flags & TF_ACKNOW)
	755	tcpstat.tcps_sndacks++;
	756	else if (flags & (TH_SYN \| TH_FIN \| TH_RST))
	757	tcpstat.tcps_sndctrl++;
	758	else if (SEQ_GT(tp->snd_up, tp->snd_una))
	759	tcpstat.tcps_sndurg++;
	760	else
	761	tcpstat.tcps_sndwinup++;
	762
	763	MGETHDR(m, MB_DONTWAIT, MT_HEADER);
	764	if (m == NULL) {
	765	error = ENOBUFS;
	766	goto out;
	767	}
	768	if (isipv6 &&
	769	(hdrlen + max_linkhdr > MHLEN) && hdrlen <= MHLEN)
	770	MH_ALIGN(m, hdrlen);
	771	else
	772	m->m_data += max_linkhdr;
	773	m->m_len = hdrlen;
	774	}
	775	m->m_pkthdr.rcvif = NULL;
	776	if (isipv6) {
	777	ip6 = mtod(m, struct ip6_hdr *);
	778	th = (struct tcphdr *)(ip6 + 1);
	779	tcp_fillheaders(tp, ip6, th);
	780	} else {
	781	ip = mtod(m, struct ip *);
	782	ipov = (struct ipovly *)ip;
	783	th = (struct tcphdr *)(ip + 1);
	784	/* this picks up the pseudo header (w/o the length) */
	785	tcp_fillheaders(tp, ip, th);
	786	}
	787
	788	/*
	789	* Fill in fields, remembering maximum advertised
	790	* window for use in delaying messages about window sizes.
	791	* If resending a FIN, be sure not to use a new sequence number.
	792	*/
	793	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
	794	tp->snd_nxt == tp->snd_max)
	795	tp->snd_nxt--;
	796	/*
	797	* If we are doing retransmissions, then snd_nxt will
	798	* not reflect the first unsent octet. For ACK only
	799	* packets, we do not want the sequence number of the
	800	* retransmitted packet, we want the sequence number
	801	* of the next unsent octet. So, if there is no data
	802	* (and no SYN or FIN), use snd_max instead of snd_nxt
	803	* when filling in ti_seq. But if we are in persist
	804	* state, snd_max might reflect one byte beyond the
	805	* right edge of the window, so use snd_nxt in that
	806	* case, since we know we aren't doing a retransmission.
	807	* (retransmit and persist are mutually exclusive...)
	808	*/
	809	if (len \|\| (flags & (TH_SYN\|TH_FIN)) \|\|
	810	tcp_callout_active(tp, tp->tt_persist))
	811	th->th_seq = htonl(tp->snd_nxt);
	812	else
	813	th->th_seq = htonl(tp->snd_max);
	814	th->th_ack = htonl(tp->rcv_nxt);
	815	if (optlen) {
	816	bcopy(opt, th + 1, optlen);
	817	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	818	}
	819	th->th_flags = flags;
	820
	821	/*
	822	* Calculate receive window. Don't shrink window, but avoid
	823	* silly window syndrome by sending a 0 window if the actual
	824	* window is less than one segment.
	825	*/
	826	if (recvwin < (long)(so->so_rcv.ssb_hiwat / 4) &&
	827	recvwin < (long)tp->t_maxseg)
	828	recvwin = 0;
	829	if (recvwin < (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt))
	830	recvwin = (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt);
	831	if (recvwin > (long)TCP_MAXWIN << tp->rcv_scale)
	832	recvwin = (long)TCP_MAXWIN << tp->rcv_scale;
	833	th->th_win = htons((u_short) (recvwin>>tp->rcv_scale));
	834
	835	/*
	836	* Adjust the RXWIN0SENT flag - indicate that we have advertised
	837	* a 0 window. This may cause the remote transmitter to stall. This
	838	* flag tells soreceive() to disable delayed acknowledgements when
	839	* draining the buffer. This can occur if the receiver is attempting
	840	* to read more data than can be buffered prior to transmitting on
	841	* the connection.
	842	*/
	843	if (recvwin == 0)
	844	tp->t_flags \|= TF_RXWIN0SENT;
	845	else
	846	tp->t_flags &= ~TF_RXWIN0SENT;
	847
	848	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
	849	th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
	850	th->th_flags \|= TH_URG;
	851	} else {
	852	/*
	853	* If no urgent pointer to send, then we pull
	854	* the urgent pointer to the left edge of the send window
	855	* so that it doesn't drift into the send window on sequence
	856	* number wraparound.
	857	*/
	858	tp->snd_up = tp->snd_una; /* drag it along */
	859	}
	860
	861	#ifdef TCP_SIGNATURE
	862	if (tp->t_flags & TF_SIGNATURE)
	863	tcpsignature_compute(m, len, optlen,
	864	(u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
	865	#endif /* TCP_SIGNATURE */
	866
	867	/*
	868	* Put TCP length in extended header, and then
	869	* checksum extended header and data.
	870	*/
	871	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
	872	if (isipv6) {
	873	/*
	874	* ip6_plen does not need to be filled now, and will be filled
	875	* in ip6_output().
	876	*/
	877	th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
	878	sizeof(struct tcphdr) + optlen + len);
	879	} else {
	880	m->m_pkthdr.csum_flags = CSUM_TCP;
	881	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	882	if (len + optlen)
	883	th->th_sum = in_addword(th->th_sum,
	884	htons((u_short)(optlen + len)));
	885
	886	/* IP version must be set here for ipv4/ipv6 checking later */
	887	KASSERT(ip->ip_v == IPVERSION,
	888	("%s: IP version incorrect: %d", __func__, ip->ip_v));
	889	}
	890
	891	/*
	892	* In transmit state, time the transmission and arrange for
	893	* the retransmit. In persist state, just set snd_max.
	894	*/
	895	if (!(tp->t_flags & TF_FORCE) \|\|
	896	!tcp_callout_active(tp, tp->tt_persist)) {
	897	tcp_seq startseq = tp->snd_nxt;
	898
	899	/*
	900	* Advance snd_nxt over sequence space of this segment.
	901	*/
	902	if (flags & (TH_SYN \| TH_FIN)) {
	903	if (flags & TH_SYN)
	904	tp->snd_nxt++;
	905	if (flags & TH_FIN) {
	906	tp->snd_nxt++;
	907	tp->t_flags \|= TF_SENTFIN;
	908	}
	909	}
	910	tp->snd_nxt += len;
	911	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
	912	tp->snd_max = tp->snd_nxt;
	913	/*
	914	* Time this transmission if not a retransmission and
	915	* not currently timing anything.
	916	*/
	917	if (tp->t_rtttime == 0) {
	918	tp->t_rtttime = ticks;
	919	tp->t_rtseq = startseq;
	920	tcpstat.tcps_segstimed++;
	921	}
	922	}
	923
	924	/*
	925	* Set retransmit timer if not currently set,
	926	* and not doing a pure ack or a keep-alive probe.
	927	* Initial value for retransmit timer is smoothed
	928	* round-trip time + 2 * round-trip time variance.
	929	* Initialize shift counter which is used for backoff
	930	* of retransmit time.
	931	*/
	932	if (!tcp_callout_active(tp, tp->tt_rexmt) &&
	933	tp->snd_nxt != tp->snd_una) {
	934	if (tcp_callout_active(tp, tp->tt_persist)) {
	935	tcp_callout_stop(tp, tp->tt_persist);
	936	tp->t_rxtshift = 0;
	937	}
	938	tcp_callout_reset(tp, tp->tt_rexmt, tp->t_rxtcur,
	939	tcp_timer_rexmt);
	940	}
	941	} else {
	942	/*
	943	* Persist case, update snd_max but since we are in
	944	* persist mode (no window) we do not update snd_nxt.
	945	*/
	946	int xlen = len;
	947	if (flags & TH_SYN)
	948	panic("tcp_output: persist timer to send SYN\n");
	949	if (flags & TH_FIN) {
	950	++xlen;
	951	tp->t_flags \|= TF_SENTFIN;
	952	}
	953	if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
	954	tp->snd_max = tp->snd_nxt + xlen;
	955	}
	956
	957	#ifdef TCPDEBUG
	958	/*
	959	* Trace.
	960	*/
	961	if (so->so_options & SO_DEBUG)
	962	tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
	963	#endif
	964
	965	/*
	966	* Fill in IP length and desired time to live and
	967	* send to IP level. There should be a better way
	968	* to handle ttl and tos; we could keep them in
	969	* the template, but need a way to checksum without them.
	970	*/
	971	/*
	972	* m->m_pkthdr.len should have been set before checksum calculation,
	973	* because in6_cksum() needs it.
	974	*/
	975	if (isipv6) {
	976	/*
	977	* we separately set hoplimit for every segment, since the
	978	* user might want to change the value via setsockopt.
	979	* Also, desired default hop limit might be changed via
	980	* Neighbor Discovery.
	981	*/
	982	ip6->ip6_hlim = in6_selecthlim(inp,
	983	(inp->in6p_route.ro_rt ?
	984	inp->in6p_route.ro_rt->rt_ifp : NULL));
	985
	986	/* TODO: IPv6 IP6TOS_ECT bit on */
	987	error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
	988	(so->so_options & SO_DONTROUTE), NULL, NULL,
	989	inp);
	990	} else {
	991	struct rtentry *rt;
	992	ip->ip_len = m->m_pkthdr.len;
	993	#ifdef INET6
	994	if (INP_CHECK_SOCKAF(so, AF_INET6))
	995	ip->ip_ttl = in6_selecthlim(inp,
	996	(inp->in6p_route.ro_rt ?
	997	inp->in6p_route.ro_rt->rt_ifp : NULL));
	998	else
	999	#endif
	1000	ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
	1001
	1002	ip->ip_tos = inp->inp_ip_tos; /* XXX */
	1003	/*
	1004	* See if we should do MTU discovery.
	1005	* We do it only if the following are true:
	1006	* 1) we have a valid route to the destination
	1007	* 2) the MTU is not locked (if it is,
	1008	* then discovery has been disabled)
	1009	*/
	1010	if (path_mtu_discovery &&
	1011	(rt = inp->inp_route.ro_rt) && (rt->rt_flags & RTF_UP) &&
	1012	!(rt->rt_rmx.rmx_locks & RTV_MTU))
	1013	ip->ip_off \|= IP_DF;
	1014
	1015	error = ip_output(m, inp->inp_options, &inp->inp_route,
	1016	(so->so_options & SO_DONTROUTE) \|
	1017	IP_DEBUGROUTE, NULL, inp);
	1018	}
	1019	if (error) {
	1020
	1021	/*
	1022	* We know that the packet was lost, so back out the
	1023	* sequence number advance, if any.
	1024	*/
	1025	if (!(tp->t_flags & TF_FORCE) \|\|
	1026	!tcp_callout_active(tp, tp->tt_persist)) {
	1027	/*
	1028	* No need to check for TH_FIN here because
	1029	* the TF_SENTFIN flag handles that case.
	1030	*/
	1031	if (!(flags & TH_SYN))
	1032	tp->snd_nxt -= len;
	1033	}
	1034
	1035	out:
	1036	if (error == ENOBUFS) {
	1037	/*
	1038	* If we can't send, make sure there is something
	1039	* to get us going again later. Persist state
	1040	* is not necessarily right, but it is close enough.
	1041	*/
	1042	if (!tcp_callout_active(tp, tp->tt_rexmt) &&
	1043	!tcp_callout_active(tp, tp->tt_persist)) {
	1044	tp->t_rxtshift = 0;
	1045	tcp_setpersist(tp);
	1046	}
	1047	tcp_quench(inp, 0);
	1048	return (0);
	1049	}
	1050	if (error == EMSGSIZE) {
	1051	/*
	1052	* ip_output() will have already fixed the route
	1053	* for us. tcp_mtudisc() will, as its last action,
	1054	* initiate retransmission, so it is important to
	1055	* not do so here.
	1056	*/
	1057	tcp_mtudisc(inp, 0);
	1058	return 0;
	1059	}
	1060	if ((error == EHOSTUNREACH \|\| error == ENETDOWN) &&
	1061	TCPS_HAVERCVDSYN(tp->t_state)) {
	1062	tp->t_softerror = error;
	1063	return (0);
	1064	}
	1065	return (error);
	1066	}
	1067	tcpstat.tcps_sndtotal++;
	1068
	1069	/*
	1070	* Data sent (as far as we can tell).
	1071	*
	1072	* If this advertises a larger window than any other segment,
	1073	* then remember the size of the advertised window.
	1074	*
	1075	* Any pending ACK has now been sent.
	1076	*/
	1077	if (recvwin > 0 && SEQ_GT(tp->rcv_nxt + recvwin, tp->rcv_adv)) {
	1078	tp->rcv_adv = tp->rcv_nxt + recvwin;
	1079	tp->t_flags &= ~TF_RXRESIZED;
	1080	}
	1081	tp->last_ack_sent = tp->rcv_nxt;
	1082	tp->t_flags &= ~TF_ACKNOW;
	1083	if (tcp_delack_enabled)
	1084	tcp_callout_stop(tp, tp->tt_delack);
	1085	if (sendalot)
	1086	goto again;
	1087	return (0);
	1088	}
	1089
	1090	void
	1091	tcp_setpersist(struct tcpcb *tp)
	1092	{
	1093	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
	1094	int tt;
	1095
	1096	if (tp->t_state == TCPS_SYN_SENT \|\|
	1097	tp->t_state == TCPS_SYN_RECEIVED) {
	1098	panic("tcp_setpersist: not established yet, current %s\n",
	1099	tp->t_state == TCPS_SYN_SENT ?
	1100	"SYN_SENT" : "SYN_RECEIVED");
	1101	}
	1102
	1103	if (tcp_callout_active(tp, tp->tt_rexmt))
	1104	panic("tcp_setpersist: retransmit pending");
	1105	/*
	1106	* Start/restart persistance timer.
	1107	*/
	1108	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN,
	1109	TCPTV_PERSMAX);
	1110	tcp_callout_reset(tp, tp->tt_persist, tt, tcp_timer_persist);
	1111	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
	1112	tp->t_rxtshift++;
	1113	}