* $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
*/
-#include "opt_ipfw.h" /* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
SYSCTL_INT(_net_inet_tcp, OID_AUTO, aggregate_acks, CTLFLAG_RW,
&tcp_aggregate_acks, 0, "Aggregate built-up acks into one ack");
-int tcp_do_rfc3390 = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
- &tcp_do_rfc3390, 0,
- "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
-
static int tcp_do_eifel_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
&tcp_do_eifel_detect, 0, "Eifel detection algorithm (RFC 3522)");
&tcp_do_abc, 0,
"TCP Appropriate Byte Counting (RFC 3465)");
+/*
+ * The following value effectively falls in the range [25ms, 250ms],
+ * given that most modern systems use 1ms ~ 10ms as the unit of the
+ * timestamp option.
+ */
+static u_int tcp_paws_tolerance = 25;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
+ &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
+
/*
* Define as tunable for easy testing with SACK on and off.
* Warning: do not change setting in the middle of an existing active TCP flow,
SYSCTL_INT(_net_inet_tcp, OID_AUTO, smartsack, CTLFLAG_RW,
&tcp_do_smartsack, 0, "Enable Smart SACK Algorithms");
+int tcp_do_rescuesack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
+ &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
+
+int tcp_aggressive_rescuesack = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
+ &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
+
+int tcp_do_rfc3517bis = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW,
+ &tcp_do_rfc3517bis, 0, "Enable RFC3517 update");
+
+int tcp_rfc3517bis_rxt = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis_rxt, CTLFLAG_RW,
+ &tcp_rfc3517bis_rxt, 0, "Enable RFC3517 retransmit update");
+
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
&tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
-int tcp_sosnd_agglim = 2;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_agglim, CTLFLAG_RW,
- &tcp_sosnd_agglim, 0, "TCP sosend mbuf aggregation limit");
+int tcp_sosend_agglim = 2;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_agglim, CTLFLAG_RW,
+ &tcp_sosend_agglim, 0, "TCP sosend mbuf aggregation limit");
-int tcp_sosnd_async = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_async, CTLFLAG_RW,
- &tcp_sosnd_async, 0, "TCP asynchronized pru_send");
+int tcp_sosend_async = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
+ &tcp_sosend_async, 0, "TCP asynchronized pru_send");
-static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
+static int tcp_ignore_redun_dsack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
+ &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
+
+static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t,
+ tcp_seq);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
struct mbuf *);
-static void tcp_xmit_timer(struct tcpcb *, int);
+static void tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
-static void tcp_sack_rexmt(struct tcpcb *, struct tcphdr *);
+static void tcp_sack_rexmt(struct tcpcb *);
+static boolean_t tcp_sack_limitedxmit(struct tcpcb *);
static int tcp_rmx_msl(const struct tcpcb *);
+static void tcp_established(struct tcpcb *);
+static boolean_t tcp_fast_recovery(struct tcpcb *, tcp_seq,
+ const struct tcpopt *);
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
(SEQ_LT(tp->snd_wl2, th->th_ack) || \
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))
+#define iceildiv(n, d) (((n)+(d)-1) / (d))
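+/*
+ * Early retransmit heuristic (in the spirit of RFC 5827): when the
+ * outstanding window is too small to ever generate the full duplicate
+ * ACK threshold, let the few duplicate ACKs we can get trigger fast
+ * retransmit.  Timestamps are required so that Eifel detection can
+ * later revert a spurious retransmit.
+ */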
+#define need_early_retransmit(tp, ownd) \
+ (tcp_do_early_retransmit && \
+ (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \
+ ownd < ((tp->t_rxtthresh + 1) * tp->t_maxseg) && \
+ tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \
+ (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \
+ tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg)))
+
+/*
+ * Returns TRUE if this segment can be merged with the last
+ * pending segment in the reassemble queue and this segment
+ * does not overlap with the pending segment immediately
+ * preceding the last pending segment.
+ */
+static __inline boolean_t
+tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen)
+{
+ const struct tseg_qent *last, *prev;
+
+ last = TAILQ_LAST(&tp->t_segq, tsegqe_head);
+ if (last == NULL)
+ return FALSE;
+
+ /* This segment comes immediately after the last pending segment */
+ if (last->tqe_th->th_seq + last->tqe_len == th->th_seq)
+ return TRUE;
+
+ if (th->th_seq + tlen != last->tqe_th->th_seq)
+ return FALSE;
+ /* This segment comes immediately before the last pending segment */
+
+ prev = TAILQ_PREV(last, tsegqe_head, tqe_q);
+ if (prev == NULL) {
+ /*
+ * No preceding pending segment; we assume this segment
+ * can be reassembled.
+ */
+ return TRUE;
+ }
+
+ /* This segment does not overlap with the preceding segment */
+ if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len))
+ return TRUE;
+
+ return FALSE;
+}
+
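+/*
+ * Non-Congestion Robustness (cf. RFC 4653): scale the duplicate ACK
+ * threshold with the amount of outstanding data, to roughly half of
+ * the outstanding window in segments but never below 3, so that
+ * moderate reordering does not immediately trigger fast retransmit.
+ */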
+static __inline void
+tcp_ncr_update_rxtthresh(struct tcpcb *tp)
+{
+ int old_rxtthresh = tp->t_rxtthresh;
+ uint32_t ownd = tp->snd_max - tp->snd_una;
+
+ tp->t_rxtthresh = max(3, ((ownd / tp->t_maxseg) >> 1));
+ if (tp->t_rxtthresh != old_rxtthresh) {
+ tcp_sack_update_lostseq(&tp->scb, tp->snd_una,
+ tp->t_maxseg, tp->t_rxtthresh);
+ }
+}
+
static int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
/*
* Find a segment which begins after this one does.
*/
- LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+ TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
break;
p = q;
/* conversion to int (in i) handles seq wraparound */
i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
if (i > 0) { /* overlaps preceding segment */
- tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+ tp->sack_flags |=
+ (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
/* enclosing block starts w/ preceding segment */
tp->encloseblk.rblk_start = p->tqe_th->th_seq;
if (i >= *tlenp) {
/* preceding encloses incoming segment */
- tp->encloseblk.rblk_end = p->tqe_th->th_seq +
- p->tqe_len;
+ tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
+ p->tqe_th->th_seq + p->tqe_len,
+ p->tqe_th->th_flags);
tcpstat.tcps_rcvduppack++;
tcpstat.tcps_rcvdupbyte += *tlenp;
m_freem(m);
*tlenp -= i;
th->th_seq += i;
/* incoming segment end is enclosing block end */
- tp->encloseblk.rblk_end = th->th_seq + *tlenp +
- ((th->th_flags & TH_FIN) != 0);
+ tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
+ th->th_seq + *tlenp, th->th_flags);
/* trim end of reported D-SACK block */
tp->reportblk.rblk_end = th->th_seq;
}
while (q) {
tcp_seq_diff_t i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
tcp_seq qend = q->tqe_th->th_seq + q->tqe_len;
+ tcp_seq qend_sack = TCP_SACK_BLKEND(qend, q->tqe_th->th_flags);
struct tseg_qent *nq;
if (i <= 0)
break;
- if (!(tp->t_flags & TF_DUPSEG)) { /* first time through */
- tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+ if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
+ /* first time through */
+ tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
tp->encloseblk = tp->reportblk;
/* report trailing duplicate D-SACK segment */
tp->reportblk.rblk_start = q->tqe_th->th_seq;
}
- if ((tp->t_flags & TF_ENCLOSESEG) &&
- SEQ_GT(qend, tp->encloseblk.rblk_end)) {
+ if ((tp->sack_flags & TSACK_F_ENCLOSESEG) &&
+ SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) {
/* extend enclosing block if one exists */
- tp->encloseblk.rblk_end = qend;
+ tp->encloseblk.rblk_end = qend_sack;
}
if (i < q->tqe_len) {
q->tqe_th->th_seq += i;
break;
}
- nq = LIST_NEXT(q, tqe_q);
- LIST_REMOVE(q, tqe_q);
+ nq = TAILQ_NEXT(q, tqe_q);
+ TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
m_freem(q->tqe_m);
kfree(q, M_TSEGQ);
atomic_add_int(&tcp_reass_qsize, -1);
/* check if can coalesce with following segment */
if (q != NULL && (th->th_seq + *tlenp == q->tqe_th->th_seq)) {
tcp_seq tend = q->tqe_th->th_seq + q->tqe_len;
+ tcp_seq tend_sack = TCP_SACK_BLKEND(tend, te->tqe_th->th_flags);
te->tqe_len += q->tqe_len;
if (q->tqe_th->th_flags & TH_FIN)
te->tqe_th->th_flags |= TH_FIN;
m_cat(te->tqe_m, q->tqe_m);
- tp->encloseblk.rblk_end = tend;
+ tp->encloseblk.rblk_end = tend_sack;
/*
* When not reporting a duplicate segment, use
* the larger enclosing block as the SACK block.
*/
- if (!(tp->t_flags & TF_DUPSEG))
- tp->reportblk.rblk_end = tend;
- LIST_REMOVE(q, tqe_q);
+ if (!(tp->sack_flags & TSACK_F_DUPSEG))
+ tp->reportblk.rblk_end = tend_sack;
+ TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
kfree(q, M_TSEGQ);
atomic_add_int(&tcp_reass_qsize, -1);
}
if (p == NULL) {
- LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
} else {
/* check if can coalesce with preceding segment */
if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) {
* When not reporting a duplicate segment, use
* the larger enclosing block as the SACK block.
*/
- if (!(tp->t_flags & TF_DUPSEG))
+ if (!(tp->sack_flags & TSACK_F_DUPSEG))
tp->reportblk.rblk_start = p->tqe_th->th_seq;
kfree(te, M_TSEGQ);
atomic_add_int(&tcp_reass_qsize, -1);
} else {
- LIST_INSERT_AFTER(p, te, tqe_q);
+ TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
}
}
*/
if (!TCPS_HAVEESTABLISHED(tp->t_state))
return (0);
- q = LIST_FIRST(&tp->t_segq);
+ q = TAILQ_FIRST(&tp->t_segq);
if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt)
return (0);
tp->rcv_nxt += q->tqe_len;
- if (!(tp->t_flags & TF_DUPSEG)) {
+ if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
/* no SACK block to report since ACK advanced */
tp->reportblk.rblk_start = tp->reportblk.rblk_end;
}
/* no enclosing block to report since ACK advanced */
- tp->t_flags &= ~TF_ENCLOSESEG;
+ tp->sack_flags &= ~TSACK_F_ENCLOSESEG;
flags = q->tqe_th->th_flags & TH_FIN;
- LIST_REMOVE(q, tqe_q);
- KASSERT(LIST_EMPTY(&tp->t_segq) ||
- LIST_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
+ TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
+ KASSERT(TAILQ_EMPTY(&tp->t_segq) ||
+ TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
("segment not coalesced"));
if (so->so_state & SS_CANTRCVMORE) {
m_freem(q->tqe_m);
int thflags;
struct socket *so = NULL;
int todrop, acked;
- boolean_t ourfinisacked, needoutput = FALSE;
+ boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE;
+ tcp_seq th_dupack = 0; /* XXX gcc warning */
u_long tiwin;
int recvwin;
struct tcpopt to; /* options in this segment */
if (tp->t_state <= TCPS_CLOSED)
goto drop;
- /* Unscale the window into a 32-bit value. */
- if (!(thflags & TH_SYN))
- tiwin = th->th_win << tp->snd_scale;
- else
- tiwin = th->th_win;
-
so = inp->inp_socket;
#ifdef TCPDEBUG
tp->snd_up = tp->snd_una;
tp->snd_max = tp->snd_nxt = tp->iss + 1;
tp->last_ack_sent = tp->rcv_nxt;
-/*
- * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled
- * until the _second_ ACK is received:
- * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window.
- * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale,
- * move to ESTAB, set snd_wnd to tiwin.
- */
- tp->snd_wnd = tiwin; /* unscaled */
+
goto after_listen;
}
if (thflags & TH_RST) {
* for syncache, or perform t/tcp connection.
*/
if (so->so_qlen <= so->so_qlimit) {
- tcp_dooptions(&to, optp, optlen, TRUE);
- if (!syncache_add(&inc, &to, th, &so, m))
+ tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack);
+ if (!syncache_add(&inc, &to, th, so, m))
goto drop;
/*
* Entry added to syncache, mbuf used to
* send SYN,ACK packet.
*/
- if (so == NULL)
- return(IPPROTO_DONE);
-
- /*
- * We must be in the correct protocol thread for
- * this connection.
- */
- KKASSERT(so->so_port == &curthread->td_msgport);
-
- inp = so->so_pcb;
- tp = intotcpcb(inp);
- tp->snd_wnd = tiwin;
- tp->t_starttime = ticks;
- tp->t_state = TCPS_ESTABLISHED;
-
- /*
- * If there is a FIN, or if there is data and the
- * connection is local, then delay SYN,ACK(SYN) in
- * the hope of piggy-backing it on a response
- * segment. Otherwise must send ACK now in case
- * the other side is slow starting.
- */
- if (DELAY_ACK(tp) &&
- ((thflags & TH_FIN) ||
- (tlen != 0 &&
- ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))))) {
- tcp_callout_reset(tp, tp->tt_delack,
- tcp_delacktime, tcp_timer_delack);
- tp->t_flags |= TF_NEEDSYN;
- } else {
- tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
- }
-
- tcpstat.tcps_connects++;
- soisconnected(so);
- goto trimthenstep6;
+ return(IPPROTO_DONE);
}
goto drop;
}
KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
KKASSERT(so->so_port == &curthread->td_msgport);
+ /* Unscale the window into a 32-bit value. */
+ if (!(thflags & TH_SYN))
+ tiwin = th->th_win << tp->snd_scale;
+ else
+ tiwin = th->th_win;
+
/*
* This is the second part of the MSS DoS prevention code (after
* minmss on the sending side) and it deals with too many too small
* Process options.
* XXX this is traditional behavior, may need to be cleaned up.
*/
- tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0);
+ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
- if (to.to_flags & TOF_SCALE) {
+ if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
tp->t_flags |= TF_RCVD_SCALE;
- tp->requested_s_scale = to.to_requested_s_scale;
+ tp->snd_scale = to.to_requested_s_scale;
}
+
+ /*
+ * Initial send window; will be updated upon next ACK
+ */
+ tp->snd_wnd = th->th_win;
+
if (to.to_flags & TOF_TS) {
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = to.to_tsval;
tp->ts_recent_age = ticks;
}
- if (to.to_flags & TOF_MSS)
- tcp_mss(tp, to.to_mss);
+ if (!(to.to_flags & TOF_MSS))
+ to.to_mss = 0;
+ tcp_mss(tp, to.to_mss);
/*
* Only set the TF_SACK_PERMITTED per-connection flag
* if we got a SACK_PERMITTED option from the other side
*/
if (tcp_do_eifel_detect &&
(to.to_flags & TOF_TS) && to.to_tsecr &&
- (tp->t_flags & TF_FIRSTACCACK)) {
+ (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
/* Eifel detection applicable. */
if (to.to_tsecr < tp->t_rexmtTS) {
tcp_revert_congestion_state(tp);
++tcpstat.tcps_eifeldetected;
+ if (tp->t_rxtshift != 1 ||
+ ticks >= tp->t_badrxtwin)
+ ++tcpstat.tcps_rttcantdetect;
}
} else if (tp->t_rxtshift == 1 &&
ticks < tp->t_badrxtwin) {
tcp_revert_congestion_state(tp);
++tcpstat.tcps_rttdetected;
}
- tp->t_flags &= ~(TF_FIRSTACCACK |
- TF_FASTREXMT | TF_EARLYREXMT);
+ tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+ TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
/*
* Recalculate the retransmit timer / rtt.
*
*/
if ((to.to_flags & TOF_TS) && to.to_tsecr) {
tcp_xmit_timer(tp,
- ticks - to.to_tsecr + 1);
+ ticks - to.to_tsecr + 1,
+ th->th_ack);
} else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq)) {
tcp_xmit_timer(tp,
- ticks - tp->t_rtttime);
+ ticks - tp->t_rtttime,
+ th->th_ack);
}
tcp_xmit_bandwidth_limit(tp, th->th_ack);
acked = th->th_ack - tp->snd_una;
}
} else if (tiwin == tp->snd_wnd &&
th->th_ack == tp->snd_una &&
- LIST_EMPTY(&tp->t_segq) &&
+ TAILQ_EMPTY(&tp->t_segq) &&
tlen <= ssb_space(&so->so_rcv)) {
u_long newsize = 0; /* automatic sockbuf scaling */
/*
}
if (!(thflags & TH_SYN))
goto drop;
- tp->snd_wnd = th->th_win; /* initial send window */
tp->irs = th->th_seq;
tcp_rcvseqinit(tp);
soisconnected(so);
/* Do window scaling on this connection? */
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
- (TF_RCVD_SCALE | TF_REQ_SCALE)) {
- tp->snd_scale = tp->requested_s_scale;
+ (TF_RCVD_SCALE | TF_REQ_SCALE))
tp->rcv_scale = tp->request_r_scale;
- }
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
tcp_callout_stop(tp, tp->tt_rexmt);
tp->t_flags &= ~TF_NEEDFIN;
thflags &= ~TH_SYN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
- tcp_callout_reset(tp, tp->tt_keep,
- tcp_getkeepidle(tp),
- tcp_timer_keep);
+ tcp_established(tp);
}
} else {
/*
tp->t_state = TCPS_SYN_RECEIVED;
}
-trimthenstep6:
/*
* Advance th->th_seq to correspond to first data byte.
* If data, trim to stay within window,
*/
if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
-
/* Check to see if ts_recent is over 24 days old. */
if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
/*
* dropped when ts_recent is old.
*/
tp->ts_recent = 0;
+ } else if (tcp_paws_tolerance && tlen != 0 &&
+ tp->t_state == TCPS_ESTABLISHED &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&&
+ !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
+ th->th_ack == tp->snd_una &&
+ tiwin == tp->snd_wnd &&
+ TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&&
+ (th->th_seq == tp->rcv_nxt ||
+ (SEQ_GT(th->th_seq, tp->rcv_nxt) &&
+ tcp_paws_canreasslast(tp, th, tlen)))) {
+ /*
+ * This tends to prevent valid new segments from being
+ * dropped by the reordered segments sent by the fast
+ * retransmission algorithm on the sending side, i.e.
+ * the fast retransmitted segment w/ larger timestamp
+ * arrives earlier than the previously sent new segments
+ * w/ smaller timestamp.
+ *
+ * If the following conditions are met, the segment is
+ * accepted:
+ * - The segment contains data
+ * - The connection is established
+ * - The header does not contain important flags
+ * - SYN or FIN is not needed
+ * - It does not acknowledge new data
+ * - Receive window is not changed
+ * - The timestamp is within "acceptable" range
+ * - The new segment is what we are expecting or
+ * the new segment could be merged w/ the last
+ * pending segment on the reassemble queue
+ */
+ tcpstat.tcps_pawsaccept++;
} else {
tcpstat.tcps_rcvduppack++;
tcpstat.tcps_rcvdupbyte += tlen;
if (TCP_DO_SACK(tp)) {
/* Report duplicate segment at head of packet. */
tp->reportblk.rblk_start = th->th_seq;
- tp->reportblk.rblk_end = th->th_seq + tlen;
- if (thflags & TH_FIN)
- ++tp->reportblk.rblk_end;
+ tp->reportblk.rblk_end = TCP_SACK_BLKEND(
+ th->th_seq + tlen, thflags);
if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt))
tp->reportblk.rblk_end = tp->rcv_nxt;
- tp->t_flags |= (TF_DUPSEG | TF_SACKLEFT | TF_ACKNOW);
+ tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT);
+ tp->t_flags |= TF_ACKNOW;
}
if (thflags & TH_SYN) {
thflags &= ~TH_SYN;
soisconnected(so);
/* Do window scaling? */
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
- (TF_RCVD_SCALE | TF_REQ_SCALE)) {
- tp->snd_scale = tp->requested_s_scale;
+ (TF_RCVD_SCALE | TF_REQ_SCALE))
tp->rcv_scale = tp->request_r_scale;
- }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
tp->t_state = TCPS_FIN_WAIT_1;
tp->t_flags &= ~TF_NEEDFIN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
- tcp_callout_reset(tp, tp->tt_keep,
- tcp_getkeepidle(tp),
- tcp_timer_keep);
+ tcp_established(tp);
}
/*
* If segment contains data or ACK, will call tcp_reass()
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
if (TCP_DO_SACK(tp))
tcp_sack_update_scoreboard(tp, &to);
- if (tlen != 0 || tiwin != tp->snd_wnd) {
- tp->t_dupacks = 0;
- break;
- }
- tcpstat.tcps_rcvdupack++;
if (!tcp_callout_active(tp, tp->tt_rexmt) ||
th->th_ack != tp->snd_una) {
+ if (tlen == 0 && tiwin == tp->snd_wnd)
+ tcpstat.tcps_rcvdupack++;
tp->t_dupacks = 0;
break;
}
- /*
- * We have outstanding data (other than
- * a window probe), this is a completely
- * duplicate ack (ie, window info didn't
- * change), the ack is the biggest we've
- * seen and we've seen exactly our rexmt
- * threshhold of them, so assume a packet
- * has been dropped and retransmit it.
- * Kludge snd_nxt & the congestion
- * window so we send only this one
- * packet.
- */
- if (IN_FASTRECOVERY(tp)) {
- if (TCP_DO_SACK(tp)) {
- /* No artifical cwnd inflation. */
- tcp_sack_rexmt(tp, th);
+ if (tlen != 0 || tiwin != tp->snd_wnd) {
+ if (!tcp_do_rfc3517bis ||
+ !TCP_DO_SACK(tp) ||
+ (to.to_flags &
+ (TOF_SACK | TOF_SACK_REDUNDANT))
+ != TOF_SACK) {
+ tp->t_dupacks = 0;
} else {
- /*
- * Dup acks mean that packets
- * have left the network
- * (they're now cached at the
- * receiver) so bump cwnd by
- * the amount in the receiver
- * to keep a constant cwnd
- * packets in the network.
- */
- tp->snd_cwnd += tp->t_maxseg;
- tcp_output(tp);
+ delayed_dupack = TRUE;
+ th_dupack = th->th_ack;
}
- } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- tp->t_dupacks = 0;
break;
- } else if (++tp->t_dupacks == tcprexmtthresh) {
- tcp_seq old_snd_nxt;
- u_int win;
-
-fastretransmit:
- if (tcp_do_eifel_detect &&
- (tp->t_flags & TF_RCVD_TSTMP)) {
- tcp_save_congestion_state(tp);
- tp->t_flags |= TF_FASTREXMT;
- }
- /*
- * We know we're losing at the current
- * window size, so do congestion avoidance:
- * set ssthresh to half the current window
- * and pull our congestion window back to the
- * new ssthresh.
- */
- win = min(tp->snd_wnd, tp->snd_cwnd) / 2 /
- tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_ssthresh = win * tp->t_maxseg;
- ENTER_FASTRECOVERY(tp);
- tp->snd_recover = tp->snd_max;
- tcp_callout_stop(tp, tp->tt_rexmt);
- tp->t_rtttime = 0;
- old_snd_nxt = tp->snd_nxt;
- tp->snd_nxt = th->th_ack;
- tp->snd_cwnd = tp->t_maxseg;
- tcp_output(tp);
- ++tcpstat.tcps_sndfastrexmit;
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->rexmt_high = tp->snd_nxt;
- if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
- tp->snd_nxt = old_snd_nxt;
- KASSERT(tp->snd_limited <= 2,
- ("tp->snd_limited too big"));
- if (TCP_DO_SACK(tp))
- tcp_sack_rexmt(tp, th);
- else
- tp->snd_cwnd += tp->t_maxseg *
- (tp->t_dupacks - tp->snd_limited);
- } else if (tcp_do_limitedtransmit) {
- u_long oldcwnd = tp->snd_cwnd;
- tcp_seq oldsndmax = tp->snd_max;
- tcp_seq oldsndnxt = tp->snd_nxt;
- /* outstanding data */
- uint32_t ownd = tp->snd_max - tp->snd_una;
- u_int sent;
-
-#define iceildiv(n, d) (((n)+(d)-1) / (d))
-
- KASSERT(tp->t_dupacks == 1 ||
- tp->t_dupacks == 2,
- ("dupacks not 1 or 2"));
- if (tp->t_dupacks == 1)
- tp->snd_limited = 0;
- tp->snd_nxt = tp->snd_max;
- tp->snd_cwnd = ownd +
- (tp->t_dupacks - tp->snd_limited) *
- tp->t_maxseg;
- tcp_output(tp);
-
- /*
- * Other acks may have been processed,
- * snd_nxt cannot be reset to a value less
- * then snd_una.
- */
- if (SEQ_LT(oldsndnxt, oldsndmax)) {
- if (SEQ_GT(oldsndnxt, tp->snd_una))
- tp->snd_nxt = oldsndnxt;
- else
- tp->snd_nxt = tp->snd_una;
- }
- tp->snd_cwnd = oldcwnd;
- sent = tp->snd_max - oldsndmax;
- if (sent > tp->t_maxseg) {
- KASSERT((tp->t_dupacks == 2 &&
- tp->snd_limited == 0) ||
- (sent == tp->t_maxseg + 1 &&
- tp->t_flags & TF_SENTFIN),
- ("sent too much"));
- KASSERT(sent <= tp->t_maxseg * 2,
- ("sent too many segments"));
- tp->snd_limited = 2;
- tcpstat.tcps_sndlimited += 2;
- } else if (sent > 0) {
- ++tp->snd_limited;
- ++tcpstat.tcps_sndlimited;
- } else if (tcp_do_early_retransmit &&
- (tcp_do_eifel_detect &&
- (tp->t_flags & TF_RCVD_TSTMP)) &&
- ownd < 4 * tp->t_maxseg &&
- tp->t_dupacks + 1 >=
- iceildiv(ownd, tp->t_maxseg) &&
- (!TCP_DO_SACK(tp) ||
- ownd <= tp->t_maxseg ||
- tcp_sack_has_sacked(&tp->scb,
- ownd - tp->t_maxseg))) {
- ++tcpstat.tcps_sndearlyrexmit;
- tp->t_flags |= TF_EARLYREXMT;
- goto fastretransmit;
- }
}
- goto drop;
+ if (tcp_fast_recovery(tp, th->th_ack, &to))
+ goto drop;
+ else
+ break;
}
KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una"));
tp->snd_una++;
/* Do window scaling? */
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
- (TF_RCVD_SCALE | TF_REQ_SCALE)) {
- tp->snd_scale = tp->requested_s_scale;
+ (TF_RCVD_SCALE | TF_REQ_SCALE))
tp->rcv_scale = tp->request_r_scale;
- }
}
process_ACK:
if (tcp_do_eifel_detect && acked > 0 &&
(to.to_flags & TOF_TS) && (to.to_tsecr != 0) &&
- (tp->t_flags & TF_FIRSTACCACK)) {
+ (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
/* Eifel detection applicable. */
if (to.to_tsecr < tp->t_rexmtTS) {
++tcpstat.tcps_eifeldetected;
tcp_revert_congestion_state(tp);
- if (tp->t_rxtshift == 1 &&
+ if (tp->t_rxtshift != 1 ||
ticks >= tp->t_badrxtwin)
++tcpstat.tcps_rttcantdetect;
}
* timestamps of 0.
*/
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0))
- tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+ tcp_xmit_timer(tp, ticks - to.to_tsecr + 1, th->th_ack);
else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
- tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime, th->th_ack);
tcp_xmit_bandwidth_limit(tp, th->th_ack);
/*
goto step6;
/* Stop looking for an acceptable ACK since one was received. */
- tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT);
+ tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+ TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
if (acked > so->so_snd.ssb_cc) {
tp->snd_wnd -= so->so_snd.ssb_cc;
/*
* Update window information.
- * Don't look at window if no ACK:
- * TAC's send garbage on first SYN.
*/
- if (SEQ_LT(tp->snd_wl1, th->th_seq) ||
- (tp->snd_wl1 == th->th_seq &&
- (SEQ_LT(tp->snd_wl2, th->th_ack) ||
- (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)))) {
+ if (acceptable_window_update(tp, th, tiwin)) {
/* keep track of pure window updates */
if (tlen == 0 && tp->snd_wl2 == th->th_ack &&
tiwin > tp->snd_wnd)
} else {
if (TCP_DO_SACK(tp)) {
tp->snd_max_rexmt = tp->snd_max;
- tcp_sack_rexmt(tp, th);
+ tcp_sack_rexmt(tp);
} else {
tcp_newreno_partial_ack(tp, th, acked);
}
* fast retransmit can work).
*/
if (th->th_seq == tp->rcv_nxt &&
- LIST_EMPTY(&tp->t_segq) &&
+ TAILQ_EMPTY(&tp->t_segq) &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
if (DELAY_ACK(tp)) {
tcp_callout_reset(tp, tp->tt_delack,
}
sorwakeup(so);
} else {
- if (!(tp->t_flags & TF_DUPSEG)) {
+ if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
/* Initialize SACK report block. */
tp->reportblk.rblk_start = th->th_seq;
- tp->reportblk.rblk_end = th->th_seq + tlen +
- ((thflags & TH_FIN) != 0);
+ tp->reportblk.rblk_end = TCP_SACK_BLKEND(
+ th->th_seq + tlen, thflags);
}
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
#endif
+ /*
+ * Delayed duplicate ACK processing
+ */
+ if (delayed_dupack && tcp_fast_recovery(tp, th_dupack, &to))
+ needoutput = FALSE;
+
/*
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
tcp_output(tp);
+ tcp_sack_report_cleanup(tp);
return(IPPROTO_DONE);
dropafterack:
m_freem(m);
tp->t_flags |= TF_ACKNOW;
tcp_output(tp);
+ tcp_sack_report_cleanup(tp);
return(IPPROTO_DONE);
dropwithreset:
tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
(tcp_seq)0, TH_RST | TH_ACK);
}
+ if (tp != NULL)
+ tcp_sack_report_cleanup(tp);
return(IPPROTO_DONE);
drop:
tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
#endif
m_freem(m);
+ if (tp != NULL)
+ tcp_sack_report_cleanup(tp);
return(IPPROTO_DONE);
}
* Parse TCP options and place in tcpopt.
*/
static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
+ tcp_seq ack)
{
int opt, optlen, i;
r->rblk_start = ntohl(r->rblk_start);
r->rblk_end = ntohl(r->rblk_end);
+
+ if (SEQ_LEQ(r->rblk_end, r->rblk_start)) {
+ /*
+ * Invalid SACK block; discard all
+ * SACK blocks
+ */
+ tcpstat.tcps_rcvbadsackopt++;
+ to->to_nsackblocks = 0;
+ to->to_sackblocks = NULL;
+ to->to_flags &= ~TOF_SACK;
+ break;
+ }
}
+ if ((to->to_flags & TOF_SACK) &&
+ tcp_sack_ndsack_blocks(to->to_sackblocks,
+ to->to_nsackblocks, ack))
+ to->to_flags |= TOF_DSACK;
break;
#ifdef TCP_SIGNATURE
/*
* and update averages and current timeout.
*/
static void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
{
- int delta;
+ int rebaserto = 0;
tcpstat.tcps_rttupdated++;
tp->t_rttupdated++;
- if (tp->t_srtt != 0) {
+ if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
+ SEQ_GT(ack, tp->snd_max_prev)) {
+#ifdef DEBUG_EIFEL_RESPONSE
+ kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
+ tp->t_srtt_prev, tp->t_rttvar_prev,
+ tp->t_srtt, tp->t_rttvar);
+#endif
+
+ tcpstat.tcps_eifelresponse++;
+ rebaserto = 1;
+ tp->rxt_flags &= ~TRXT_F_REBASERTO;
+ tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
+ tp->t_rttvar = max(tp->t_rttvar_prev,
+ (rtt << (TCP_RTTVAR_SHIFT - 1)));
+ if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+
+#ifdef DEBUG_EIFEL_RESPONSE
+ kprintf("new %d/%d ", tp->t_srtt, tp->t_rttvar);
+#endif
+ } else if (tp->t_srtt != 0) {
+ int delta;
+
/*
* srtt is stored as fixed point with 5 bits after the
* binary point (i.e., scaled by 8). The following magic
tp->t_rtttime = 0;
tp->t_rxtshift = 0;
+#ifdef DEBUG_EIFEL_RESPONSE
+ if (rebaserto) {
+ kprintf("| rxtcur prev %d, old %d, ",
+ tp->t_rxtcur_prev, tp->t_rxtcur);
+ }
+#endif
+
/*
* the retransmit should happen at rtt + 4 * rttvar.
* Because of the way we do the smoothing, srtt and rttvar
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+ if (rebaserto) {
+ if (tp->t_rxtcur < tp->t_rxtcur_prev + tcp_eifel_rtoinc) {
+ /*
+ * RFC4015 requires that the new RTO is at least
+ * 2*G (tcp_eifel_rtoinc) greater than the RTO
+ * (t_rxtcur_prev) when the spurious retransmit
+ * timeout happens.
+ *
+ * The above condition could be true if the SRTT
+ * and RTTVAR used to calculate t_rxtcur_prev
+ * resulted in a value less than t_rttmin.  So
+ * simply increasing SRTT by tcp_eifel_rtoinc when
+ * preparing for the Eifel response in
+ * tcp_save_congestion_state() cannot ensure
+ * that the new RTO will be tcp_eifel_rtoinc greater
+ * than t_rxtcur_prev.
+ */
+ tp->t_rxtcur = tp->t_rxtcur_prev + tcp_eifel_rtoinc;
+ }
+#ifdef DEBUG_EIFEL_RESPONSE
+ kprintf("new %d\n", tp->t_rxtcur);
+#endif
+ }
+
/*
* We received an ack for a packet that wasn't retransmitted;
* it is probably safe to discard any error indications we've
mss -= TCPOLEN_TSTAMP_APPA;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
- if (mss > MCLBYTES)
- mss &= ~(MCLBYTES-1);
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
#else
- if (mss > MCLBYTES)
- mss = mss / MCLBYTES * MCLBYTES;
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
#endif
/*
* If there's a pipesize, change the socket buffer
}
/*
- * Set the slow-start flight size depending on whether this
- * is a local network or not.
+ * Set the slow-start flight size
+ *
+ * NOTE: t_maxseg must have been configured!
*/
- if (tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
- else
- tp->snd_cwnd = mss;
+ tp->snd_cwnd = tcp_initial_window(tp);
if (rt->rt_rmx.rmx_ssthresh) {
/*
* except when retransmitting snd_una.
*/
static void
-tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
+tcp_sack_rexmt(struct tcpcb *tp)
{
- uint32_t pipe, seglen;
- tcp_seq nextrexmt;
- boolean_t lostdup;
tcp_seq old_snd_nxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
+ uint32_t pipe;
int nseg = 0; /* consecutive new segments */
+ int nseg_rexmt = 0; /* retransmitted segments */
#define MAXBURST 4 /* limit burst of new packets on partial ack */
tp->t_rtttime = 0;
pipe = tcp_sack_compute_pipe(tp);
while ((tcp_seq_diff_t)(ocwnd - pipe) >= (tcp_seq_diff_t)tp->t_maxseg &&
- (!tcp_do_smartsack || nseg < MAXBURST) &&
- tcp_sack_nextseg(tp, &nextrexmt, &seglen, &lostdup)) {
- uint32_t sent;
- tcp_seq old_snd_max;
+ (!tcp_do_smartsack || nseg < MAXBURST)) {
+ tcp_seq old_snd_max, old_rexmt_high, nextrexmt;
+ uint32_t sent, seglen;
+ boolean_t rescue;
int error;
+ old_rexmt_high = tp->rexmt_high;
+ if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
+ tp->rexmt_high = old_rexmt_high;
+ break;
+ }
+
+ /*
+ * If the next transmission is a rescue retransmission,
+ * we check whether we have already sent some data
+ * (either new segments or retransmitted segments)
+ * into the network or not.  Since the idea of rescue
+ * retransmission is to sustain the ACK clock, as long as
+ * some segments are in the network, the ACK clock will
+ * keep ticking.
+ */
+ if (rescue && (nseg_rexmt > 0 || nseg > 0)) {
+ tp->rexmt_high = old_rexmt_high;
+ break;
+ }
+
if (nextrexmt == tp->snd_max)
++nseg;
+ else
+ ++nseg_rexmt;
tp->snd_nxt = nextrexmt;
tp->snd_cwnd = nextrexmt - tp->snd_una + seglen;
old_snd_max = tp->snd_max;
if (nextrexmt == tp->snd_una)
tcp_callout_stop(tp, tp->tt_rexmt);
error = tcp_output(tp);
- if (error != 0)
+ if (error != 0) {
+ tp->rexmt_high = old_rexmt_high;
break;
+ }
sent = tp->snd_nxt - nextrexmt;
- if (sent <= 0)
+ if (sent <= 0) {
+ tp->rexmt_high = old_rexmt_high;
break;
- if (!lostdup)
- pipe += sent;
+ }
+ pipe += sent;
tcpstat.tcps_sndsackpack++;
tcpstat.tcps_sndsackbyte += sent;
+
+ if (rescue) {
+ tcpstat.tcps_sackrescue++;
+ tp->rexmt_rescue = tp->snd_nxt;
+ tp->sack_flags |= TSACK_F_SACKRESCUED;
+ break;
+ }
if (SEQ_LT(nextrexmt, old_snd_max) &&
- SEQ_LT(tp->rexmt_high, tp->snd_nxt))
+ SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
+ if (tcp_aggressive_rescuesack &&
+ (tp->sack_flags & TSACK_F_SACKRESCUED) &&
+ SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
+ /* Drag RescueRxt along with HighRxt */
+ tp->rexmt_rescue = tp->rexmt_high;
+ }
+ }
}
if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
tp->snd_nxt = old_snd_nxt;
tp->snd_cwnd = ocwnd;
}
+/*
+ * Return TRUE if some new segments are sent
+ */
+static boolean_t
+tcp_sack_limitedxmit(struct tcpcb *tp)
+{
+ tcp_seq oldsndnxt = tp->snd_nxt;
+ tcp_seq oldsndmax = tp->snd_max;
+ u_long ocwnd = tp->snd_cwnd;
+ uint32_t pipe, sent;
+ boolean_t ret = FALSE;
+ tcp_seq_diff_t cwnd_left;
+ tcp_seq next;
+
+ tp->rexmt_high = tp->snd_una - 1;
+ pipe = tcp_sack_compute_pipe(tp);
+ cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe);
+ if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg)
+ return FALSE;
+
+ next = tp->snd_nxt = tp->snd_max;
+ tp->snd_cwnd = tp->snd_nxt - tp->snd_una +
+ rounddown(cwnd_left, tp->t_maxseg);
+
+ tcp_output(tp);
+
+ sent = tp->snd_nxt - next;
+ if (sent > 0) {
+ tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg);
+ ret = TRUE;
+ }
+
+ if (SEQ_LT(oldsndnxt, oldsndmax)) {
+ KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+ ("snd_una moved in other threads"));
+ tp->snd_nxt = oldsndnxt;
+ }
+ tp->snd_cwnd = ocwnd;
+
+ if (ret && TCP_DO_NCR(tp))
+ tcp_ncr_update_rxtthresh(tp);
+
+ return ret;
+}
+
/*
* Reset idle time and keep-alive timer, typically called when a valid
* tcp packet is received but may also be called when FASTKEEP is set
tp->t_rcvtime = ticks;
tp->t_flags &= ~TF_KEEPALIVE;
tcp_callout_reset(tp, tp->tt_keep,
- tcp_getkeepidle(tp),
+ tp->t_keepidle,
tcp_timer_keep);
}
}
return msl;
}
+
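+/*
+ * Enter the ESTABLISHED state and start the keepalive timer.  If the
+ * SYN had to be retransmitted, apply the RFC6298 minimum RTO of 3
+ * seconds before data transmission begins (see below).
+ */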
+static void
+tcp_established(struct tcpcb *tp)
+{
+ tp->t_state = TCPS_ESTABLISHED;
+ tcp_callout_reset(tp, tp->tt_keep, tp->t_keepidle, tcp_timer_keep);
+
+ if (tp->t_rxtsyn > 0) {
+ /*
+ * RFC6298:
+ * "If the timer expires awaiting the ACK of a SYN segment
+ * and the TCP implementation is using an RTO less than 3
+ * seconds, the RTO MUST be re-initialized to 3 seconds
+ * when data transmission begins"
+ */
+ if (tp->t_rxtcur < TCPTV_RTOBASE3)
+ tp->t_rxtcur = TCPTV_RTOBASE3;
+ }
+}
+
+/*
+ * Returns TRUE if the ACK should be dropped
+ */
+static boolean_t
+tcp_fast_recovery(struct tcpcb *tp, tcp_seq th_ack, const struct tcpopt *to)
+{
+ boolean_t fast_sack_rexmt = TRUE;
+
+ tcpstat.tcps_rcvdupack++;
+
+ /*
+ * We have outstanding data (other than a window probe),
+ * this is a completely duplicate ack (ie, window info
+ * didn't change), the ack is the biggest we've seen and
+ * we've seen exactly our rexmt threshold of them, so
+ * assume a packet has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion window so we send only
+ * this one packet.
+ */
+ if (IN_FASTRECOVERY(tp)) {
+ if (TCP_DO_SACK(tp)) {
+ /* No artificial cwnd inflation. */
+ tcp_sack_rexmt(tp);
+ } else {
+ /*
+ * Dup acks mean that packets have left
+ * the network (they're now cached at the
+ * receiver) so bump cwnd by the amount in
+ * the receiver to keep a constant cwnd
+ * packets in the network.
+ */
+ tp->snd_cwnd += tp->t_maxseg;
+ tcp_output(tp);
+ }
+ return TRUE;
+ } else if (SEQ_LT(th_ack, tp->snd_recover)) {
+ tp->t_dupacks = 0;
+ return FALSE;
+ } else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
+ (to->to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
+ (TOF_DSACK | TOF_SACK_REDUNDANT)) {
+ /*
+ * If the ACK carries DSACK and its other SACK blocks
+ * carry information that we already know, don't count
+ * this ACK as a duplicate ACK.  This prevents spurious
+ * early retransmit and fast retransmit.  This also meets
+ * the requirement of RFC3042 that new segments should
+ * not be sent if the SACK blocks do not contain new
+ * information (XXX we actually loosen the requirement
+ * that only DSACK is checked here).
+ *
+ * ACKs of this kind are usually sent after a spurious
+ * retransmit.
+ */
+ /* Do nothing; don't change t_dupacks */
+ return TRUE;
+ } else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
+ tcp_ncr_update_rxtthresh(tp);
+ }
+
+ if (++tp->t_dupacks == tp->t_rxtthresh) {
+ tcp_seq old_snd_nxt;
+ u_int win;
+
+fastretransmit:
+ if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
+ tcp_save_congestion_state(tp);
+ tp->rxt_flags |= TRXT_F_FASTREXMT;
+ }
+ /*
+ * We know we're losing at the current window size,
+ * so do congestion avoidance: set ssthresh to half
+ * the current window and pull our congestion window
+ * back to the new ssthresh.
+ */
+ win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_recover = tp->snd_max;
+ tcp_callout_stop(tp, tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ old_snd_nxt = tp->snd_nxt;
+ tp->snd_nxt = th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ tcp_output(tp);
+ ++tcpstat.tcps_sndfastrexmit;
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->rexmt_high = tp->snd_nxt;
+ tp->sack_flags &= ~TSACK_F_SACKRESCUED;
+ if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
+ tp->snd_nxt = old_snd_nxt;
+ KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
+ if (TCP_DO_SACK(tp)) {
+ if (fast_sack_rexmt)
+ tcp_sack_rexmt(tp);
+ } else {
+ tp->snd_cwnd += tp->t_maxseg *
+ (tp->t_dupacks - tp->snd_limited);
+ }
+ } else if ((tcp_do_rfc3517bis && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
+ /*
+ * RFC3517bis recommends reducing the byte threshold
+ * and entering fast retransmit if IsLost(snd_una).  However,
+ * if we use IsLost(snd_una) based fast retransmit here,
+ * segment reordering will cause spurious retransmits.  So
+ * we defer the IsLost(snd_una) based fast retransmit until
+ * the extended limited transmit can't send any segments and
+ * early retransmit can't be done.
+ */
+ if (tcp_rfc3517bis_rxt && tcp_do_rfc3517bis &&
+ tcp_sack_islost(&tp->scb, tp->snd_una))
+ goto fastretransmit;
+
+ if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) {
+ if (!tcp_sack_limitedxmit(tp)) {
+ /* outstanding data */
+ uint32_t ownd = tp->snd_max - tp->snd_una;
+
+ if (need_early_retransmit(tp, ownd)) {
+ ++tcpstat.tcps_sndearlyrexmit;
+ tp->rxt_flags |= TRXT_F_EARLYREXMT;
+ goto fastretransmit;
+ } else if (tcp_do_rfc3517bis &&
+ tcp_sack_islost(&tp->scb, tp->snd_una)) {
+ fast_sack_rexmt = FALSE;
+ goto fastretransmit;
+ }
+ }
+ }
+ } else if (tcp_do_limitedtransmit) {
+ u_long oldcwnd = tp->snd_cwnd;
+ tcp_seq oldsndmax = tp->snd_max;
+ tcp_seq oldsndnxt = tp->snd_nxt;
+ /* outstanding data */
+ uint32_t ownd = tp->snd_max - tp->snd_una;
+ u_int sent;
+
+ KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2,
+ ("dupacks not 1 or 2"));
+ if (tp->t_dupacks == 1)
+ tp->snd_limited = 0;
+ tp->snd_nxt = tp->snd_max;
+ tp->snd_cwnd = ownd +
+ (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg;
+ tcp_output(tp);
+
+ if (SEQ_LT(oldsndnxt, oldsndmax)) {
+ KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+ ("snd_una moved in other threads"));
+ tp->snd_nxt = oldsndnxt;
+ }
+ tp->snd_cwnd = oldcwnd;
+ sent = tp->snd_max - oldsndmax;
+ if (sent > tp->t_maxseg) {
+ KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) ||
+ (sent == tp->t_maxseg + 1 &&
+ (tp->t_flags & TF_SENTFIN)),
+ ("sent too much"));
+ KASSERT(sent <= tp->t_maxseg * 2,
+ ("sent too many segments"));
+ tp->snd_limited = 2;
+ tcpstat.tcps_sndlimited += 2;
+ } else if (sent > 0) {
+ ++tp->snd_limited;
+ ++tcpstat.tcps_sndlimited;
+ } else if (need_early_retransmit(tp, ownd)) {
+ ++tcpstat.tcps_sndearlyrexmit;
+ tp->rxt_flags |= TRXT_F_EARLYREXMT;
+ goto fastretransmit;
+ }
+ }
+ return TRUE;
+}