From 2b1ce38a26291667d9ed98c77d898c34e1aae541 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 3 Aug 2004 00:04:13 +0000
Subject: [PATCH] tcp_input()'s DELAY_ACK() code checks to see if the delayed
 ack timer is running; if it is not, it starts the timer and returns rather
 than issuing an ack.  If the timer is already running, tcp_input() generates
 an immediate ack, resulting in one ack every other packet.  This
 every-other-packet ack is usually required to ensure that the window does
 not close too much and stall the sender, but it really only exists because
 the tcp stack does not look ahead to see whether there are other incoming
 packets waiting to be processed that might themselves require additional
 acks.

For optimal operation we really want to process all of the pending TCP
packets for a connection before sending any 'normal' acks.  Many Ethernet
interfaces, most especially GigE interfaces, rate-limit their interrupts.
This results in several packets being moved from the RX ring to the TCP/IP
stack all at once, in a batch.

Give the TCP stack its own netisr dispatcher loop rather than using the
generic netisr dispatcher loop.  The TCP dispatcher loop calls an additional
routine, tcp_willblock(), after all messages queued to the TCP protocol
stack have been exhausted.  When tcp_input() needs to send an ack in the
normal header-prediction case it now places the TCPCB on a queue rather
than sending an immediate ack.  tcp_willblock() processes this queue and
calls tcp_output() to send the actual ack.

The result is that on a GigE interface, which typically queues 8+ packets
per interrupt, a TCP stream will only be acked once per ~8 packets rather
than 4 times (every other packet) per ~8 packets.  This *GREATLY* reduces
TCP protocol overhead and network ack traffic on both ends of the
connection.

NOTE: a later commit will deal with pure window-space updates, which
generate an additional ACK per ~8 packets when the user program drains
the buffer.

Reviewed-by: Jeffrey Hsu
---
A short standalone C sketch of the drain-then-flush ack pattern, for
illustration only, follows after the patch.

 sys/netinet/ip_demux.c  |  4 ++--
 sys/netinet/tcp_input.c | 21 +++++++++++++++++----
 sys/netinet/tcp_subr.c  | 42 ++++++++++++++++++++++++++++++++++++++++-
 sys/netinet/tcp_var.h   | 11 +++++++++--
 4 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/sys/netinet/ip_demux.c b/sys/netinet/ip_demux.c
index 30160600d0..9f5cd3e4a9 100644
--- a/sys/netinet/ip_demux.c
+++ b/sys/netinet/ip_demux.c
@@ -30,7 +30,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/netinet/ip_demux.c,v 1.25 2004/07/18 16:26:43 dillon Exp $
+ * $DragonFly: src/sys/netinet/ip_demux.c,v 1.26 2004/08/03 00:04:13 dillon Exp $
  */
 
 /*
@@ -348,7 +348,7 @@ tcp_thread_init(void)
 	int cpu;
 
 	for (cpu = 0; cpu < ncpus2; cpu++) {
-		lwkt_create(netmsg_service_loop, NULL, NULL,
+		lwkt_create(tcpmsg_service_loop, NULL, NULL,
 			&tcp_thread[cpu], 0, cpu, "tcp_thread %d", cpu);
 		tcp_thread[cpu].td_msgport.mp_putport = netmsg_put_port;
 	}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 67dec7e5b8..06300fc02a 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -82,7 +82,7 @@
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
- * $DragonFly: src/sys/netinet/tcp_input.c,v 1.32 2004/07/27 17:57:02 drhodus Exp $
+ * $DragonFly: src/sys/netinet/tcp_input.c,v 1.33 2004/08/03 00:04:13 dillon Exp $
  */
 
 #include "opt_ipfw.h"		/* for ipfw_fwd */
@@ -207,8 +207,6 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
-struct inpcbinfo tcbinfo[MAXCPU];
-
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
@@ -1146,12 +1144,27 @@ after_listen:
 				sbappend(&so->so_rcv, m);
 			}
 			sorwakeup(so);
+
+			/*
+			 * This code is responsible for most of the ACKs
+			 * the TCP stack sends back after receiving a data
+			 * packet.  Note that the DELAY_ACK check fails if
+			 * the delack timer is already running, which results
+			 * in an ack being sent every other packet (which is
+			 * what we want).
+			 */
 			if (DELAY_ACK(tp)) {
 				callout_reset(tp->tt_delack, tcp_delacktime,
 				    tcp_timer_delack, tp);
 			} else {
 				tp->t_flags |= TF_ACKNOW;
-				tcp_output(tp);
+				if ((tp->t_flags & TF_ONOUTPUTQ) == 0) {
+					tp->t_flags |= TF_ONOUTPUTQ;
+					tp->tt_cpu = mycpu->gd_cpuid;
+					TAILQ_INSERT_TAIL(
+					    &tcpcbackq[tp->tt_cpu],
+					    tp, t_outputq);
+				}
 			}
 			return;
 		}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 856ae6855f..e31b0dbce1 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -82,7 +82,7 @@
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.31 2003/01/24 05:11:34 sam Exp $
- * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.36 2004/07/08 22:07:35 hsu Exp $
+ * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.37 2004/08/03 00:04:13 dillon Exp $
  */
 
 #include "opt_compat.h"
@@ -157,6 +157,9 @@
 
 #include 
 
+struct inpcbinfo tcbinfo[MAXCPU];
+struct tcpcbackqhead tcpcbackq[MAXCPU];
+
 int	tcp_mssdflt = TCP_MSS;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
     &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");
@@ -229,6 +232,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
 
 static MALLOC_DEFINE(M_TCPTEMP, "tcptemp", "TCP Templates for Keepalives");
 static struct malloc_pipe tcptemp_mpipe;
 
+static void tcp_willblock(void);
 static void tcp_cleartaocache (void);
 static void tcp_notify (struct inpcb *, int);
@@ -339,6 +343,7 @@ tcp_init()
 		tcbinfo[cpu].wildcardhashbase = hashinit(hashsize, M_PCB,
 		    &tcbinfo[cpu].wildcardhashmask);
 		tcbinfo[cpu].ipi_zone = ipi_zone;
+		TAILQ_INIT(&tcpcbackq[cpu]);
 	}
 
 	tcp_reass_maxseg = nmbclusters / 16;
@@ -374,6 +379,34 @@ tcp_init()
 	tcp_thread_init();
 }
 
+void
+tcpmsg_service_loop(void *dummy)
+{
+	struct netmsg *msg;
+
+	while ((msg = lwkt_waitport(&curthread->td_msgport, NULL))) {
+		do {
+			msg->nm_lmsg.ms_cmd.cm_func(&msg->nm_lmsg);
+		} while ((msg = lwkt_getport(&curthread->td_msgport)) != NULL);
+		tcp_willblock();
+	}
+}
+
+static void
+tcp_willblock(void)
+{
+	struct tcpcb *tp;
+	int cpu = mycpu->gd_cpuid;
+
+	while ((tp = TAILQ_FIRST(&tcpcbackq[cpu])) != NULL) {
+		KKASSERT(tp->t_flags & TF_ONOUTPUTQ);
+		tp->t_flags &= ~TF_ONOUTPUTQ;
+		TAILQ_REMOVE(&tcpcbackq[cpu], tp, t_outputq);
+		tcp_output(tp);
+	}
+}
+
+
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
@@ -739,6 +772,12 @@ tcp_close(struct tcpcb *tp)
 	callout_stop(tp->tt_2msl);
 	callout_stop(tp->tt_delack);
 
+	if (tp->t_flags & TF_ONOUTPUTQ) {
+		KKASSERT(tp->tt_cpu == mycpu->gd_cpuid);
+		TAILQ_REMOVE(&tcpcbackq[tp->tt_cpu], tp, t_outputq);
+		tp->t_flags &= ~TF_ONOUTPUTQ;
+	}
+
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
@@ -859,6 +898,7 @@ no_valid_rt:
 		msg->nm_pcbinfo = &tcbinfo[cpu];
 		lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg);
 	}
+	/* XXX wait? */
 }
 #endif
 
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 0e14651a28..5736ae7b3a 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -82,7 +82,7 @@
  *
  *	@(#)tcp_var.h	8.4 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_var.h,v 1.56.2.13 2003/02/03 02:34:07 hsu Exp $
- * $DragonFly: src/sys/netinet/tcp_var.h,v 1.24 2004/07/17 20:31:31 hsu Exp $
+ * $DragonFly: src/sys/netinet/tcp_var.h,v 1.25 2004/08/03 00:04:13 dillon Exp $
  */
 
 #ifndef _NETINET_TCP_VAR_H_
@@ -124,7 +124,7 @@ struct tcptemp {
 struct tcpcb {
 	struct	tsegqe_head t_segq;
 	int	t_dupacks;		/* consecutive dup acks recd */
-	struct	tcptemp *unused;	/* unused */
+	int	tt_cpu;			/* sanity check the cpu */
 
 	struct	callout *tt_rexmt;	/* retransmit timer */
 	struct	callout *tt_persist;	/* retransmit persistence */
@@ -161,6 +161,7 @@
 #define	TF_FASTREXMT	0x00800000	/* Did Fast Retransmit. */
 #define	TF_EARLYREXMT	0x01000000	/* Did Early (Fast) Retransmit. */
 #define	TF_FORCE	0x02000000	/* Set if forcing out a byte */
+#define	TF_ONOUTPUTQ	0x04000000	/* on t_outputq list */
 
 	tcp_seq	snd_up;			/* send urgent pointer */
 	tcp_seq	snd_una;		/* send unacknowledged */
@@ -235,6 +236,7 @@
 	u_long	t_badrxtwin;		/* window for retransmit recovery */
 	u_long	t_rexmtTS;		/* timestamp of last retransmit */
 	u_char	snd_limited;		/* segments limited transmitted */
+	TAILQ_ENTRY(tcpcb) t_outputq;	/* tcp_output needed list */
 };
 
 #define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
@@ -516,7 +518,11 @@ struct xtcpcb {
 SYSCTL_DECL(_net_inet_tcp);
 #endif
 
+TAILQ_HEAD(tcpcbackqhead,tcpcb);
+
 extern	struct inpcbinfo tcbinfo[];
+extern	struct tcpcbackqhead tcpcbackq[];
+
 extern	int tcp_mssdflt;	/* XXX */
 extern	int tcp_delack_enabled;
 extern	int path_mtu_discovery;
@@ -531,6 +537,7 @@ struct lwkt_port *
 void	 tcp_canceltimers (struct tcpcb *);
 struct tcpcb *
 	 tcp_close (struct tcpcb *);
+void	 tcpmsg_service_loop (void *);
 void	 tcp_ctlinput (int, struct sockaddr *, void *);
 int	 tcp_ctloutput (struct socket *, struct sockopt *);
 struct lwkt_port *
-- 
2.41.0
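
The following is a minimal userspace sketch of the drain-then-flush ack
pattern described in the commit message above, assuming hypothetical
stand-in names (struct conn, input_segment(), flush_acks(), ackq[],
on_ackq) for the tcpcb, the modified tcp_input() fast path,
tcp_willblock(), tcpcbackq[] and TF_ONOUTPUTQ.  It is not DragonFly kernel
code; it only shows why deferring the ack until every queued message has
been processed yields one ack per batch instead of one ack every other
packet.

/*
 * Illustrative userspace analogue, NOT DragonFly kernel code.  All names
 * below are hypothetical stand-ins for the kernel structures named above.
 */
#include <stdio.h>

#define NCONN	2

struct conn {
	int	id;
	int	unacked;	/* segments received since the last ack */
	int	on_ackq;	/* nonzero while queued for a deferred ack */
};

static struct conn conns[NCONN];
static struct conn *ackq[NCONN];	/* analogue of tcpcbackq[cpu] */
static int ackq_len;

/*
 * Analogue of the modified tcp_input() fast path: instead of acking
 * immediately, remember that this connection owes an ack.
 */
static void
input_segment(struct conn *c)
{
	c->unacked++;
	if (!c->on_ackq) {
		c->on_ackq = 1;
		ackq[ackq_len++] = c;
	}
}

/*
 * Analogue of tcp_willblock(): once the message queue has been drained,
 * send one ack per queued connection, covering the whole batch.
 */
static void
flush_acks(void)
{
	struct conn *c;
	int i;

	for (i = 0; i < ackq_len; i++) {
		c = ackq[i];
		printf("conn %d: ack covering %d segment(s)\n",
		    c->id, c->unacked);
		c->unacked = 0;
		c->on_ackq = 0;
	}
	ackq_len = 0;
}

int
main(void)
{
	int i;

	conns[0].id = 0;
	conns[1].id = 1;

	/* One rate-limited interrupt delivers a batch of segments. */
	for (i = 0; i < 8; i++)
		input_segment(&conns[0]);
	input_segment(&conns[1]);

	/* The per-cpu message queue is now empty: flush deferred acks. */
	flush_acks();
	return (0);
}

As with TF_ONOUTPUTQ in the real change, the on_ackq flag ensures a
connection is queued at most once per batch, so the cost of the flush is
bounded by the number of active connections rather than by the number of
segments received.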