From 9de1f696e640acef72985a955837be902d27c91d Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Wed, 16 May 2012 16:53:37 +0800 Subject: [PATCH] tcp: Make PAWS robust against segments reordering This tends to prevent valid new segments from being dropped by the reordered segments sent by the fast retransmission algorithm on the sending side, i.e. the fast retransmitted segment w/ larger timestamp arrives earlier than the previously sent new segments w/ smaller timestamp, which causes the valid new segments being dropped. --- sys/netinet/tcp_input.c | 82 ++++++++++++++++++++++++++++++++++++++++- sys/netinet/tcp_var.h | 1 + usr.bin/netstat/inet.c | 3 +- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index b3c3a807f7..5fb3c366d9 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -176,6 +176,15 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW, &tcp_do_abc, 0, "TCP Appropriate Byte Counting (RFC 3465)"); +/* + * The following value actually takes range [25ms, 250ms], + * given that most modern systems use 1ms ~ 10ms as the unit + * of timestamp option. + */ +static u_int tcp_paws_tolerance = 25; +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW, + &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance"); + /* * Define as tunable for easy testing with SACK on and off. * Warning: do not change setting in the middle of an existing active TCP flow, @@ -300,6 +309,45 @@ do { \ (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \ tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg))) +/* + * Returns TRUE, if this segment can be merged with the last + * pending segment in the reassembly queue and this segment + * does not overlap with the pending segment immediately + * preceding the last pending segment.
+ */ +static __inline boolean_t +tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen) +{ + const struct tseg_qent *last, *prev; + + last = TAILQ_LAST(&tp->t_segq, tsegqe_head); + if (last == NULL) + return FALSE; + + /* This segment comes immediately after the last pending segment */ + if (last->tqe_th->th_seq + last->tqe_len == th->th_seq) + return TRUE; + + if (th->th_seq + tlen != last->tqe_th->th_seq) + return FALSE; + /* This segment comes immediately before the last pending segment */ + + prev = TAILQ_PREV(last, tsegqe_head, tqe_q); + if (prev == NULL) { + /* + * No pending preceding segment, we assume this segment + * could be reassembled. + */ + return TRUE; + } + + /* This segment does not overlap with the preceding segment */ + if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len)) + return TRUE; + + return FALSE; +} + static int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { @@ -1655,7 +1703,6 @@ after_listen: */ if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 && TSTMP_LT(to.to_tsval, tp->ts_recent)) { - /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* @@ -1670,6 +1717,39 @@ after_listen: * dropped when ts_recent is old. */ tp->ts_recent = 0; + } else if (tcp_paws_tolerance && tlen != 0 && + tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&& + !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) && + th->th_ack == tp->snd_una && + tiwin == tp->snd_wnd && + TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&& + (th->th_seq == tp->rcv_nxt || + (SEQ_GT(th->th_seq, tp->rcv_nxt) && + tcp_paws_canreasslast(tp, th, tlen)))) { + /* + * This tends to prevent valid new segments from being + * dropped by the reordered segments sent by the fast + * retransmission algorithm on the sending side, i.e.
+ * the fast retransmitted segment w/ larger timestamp + * arrives earlier than the previously sent new segments + * w/ smaller timestamp. + * + * If the following conditions are met, the segment is + * accepted: + * - The segment contains data + * - The connection is established + * - The header does not contain important flags + * - SYN or FIN is not needed + * - It does not acknowledge new data + * - Receive window is not changed + * - The timestamp is within "acceptable" range + * - The new segment is what we are expecting or + * the new segment could be merged w/ the last + * pending segment on the reassembly queue + */ + tcpstat.tcps_pawsaccept++; + tcpstat.tcps_pawsdrop++; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 72f1e9a7bc..8002cf7fb3 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -400,6 +400,7 @@ struct tcp_stats { u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ u_long tcps_rcvwinupd; /* rcvd window update packets */ u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_pawsaccept; /* segments accepted, PAWS tolerance */ u_long tcps_predack; /* times hdr predict ok for acks */ u_long tcps_preddat; /* times hdr predict ok for data pkts */ u_long tcps_pcbcachemiss; diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index 9d47b31b0a..64f65f785c 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -479,7 +479,8 @@ tcp_stats(u_long off __unused, const char *name, int af1 __unused) "\t\t%lu packet%s (%lu byte%s) received in-sequence\n"); p2(tcps_rcvduppack, tcps_rcvdupbyte, "\t\t%lu completely duplicate packet%s (%lu byte%s)\n"); - p(tcps_pawsdrop, "\t\t%lu old duplicate packet%s\n"); + p2(tcps_pawsdrop, tcps_pawsaccept, + "\t\t%lu old duplicate packet%s (%lu packet%s accepted)\n"); p2(tcps_rcvpartduppack, tcps_rcvpartdupbyte, "\t\t%lu packet%s with some dup.
data (%lu byte%s duped)\n"); p2(tcps_rcvoopack, tcps_rcvoobyte, -- 2.41.0