X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/72d9cf3aab1ebc7ac81d39eb75fd5e3469161437..697aadcd03f60a7715f28694a62840c6148eacf3:/sys/netinet/tcp_subr.c diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 222258ab91..817c25802f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -65,7 +65,7 @@ * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.31 2003/01/24 05:11:34 sam Exp $ - * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.56 2007/03/04 18:51:59 swildner Exp $ + * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.63 2008/11/11 10:46:58 sephe Exp $ */ #include "opt_compat.h" @@ -85,6 +85,7 @@ #include #endif #include +#include #include #include #include @@ -116,6 +117,7 @@ #include #include #include +#include #include #include #include @@ -143,6 +145,8 @@ #include #include +#include + #if !defined(KTR_TCP) #define KTR_TCP KTR_ALL #endif @@ -155,6 +159,15 @@ KTR_INFO(KTR_TCP, tcp, delayed, 2, "tcp execute delayed ops", 0); struct inpcbinfo tcbinfo[MAXCPU]; struct tcpcbackqhead tcpcbackq[MAXCPU]; +int tcp_mpsafe_proto = 0; +TUNABLE_INT("net.inet.tcp.mpsafe_proto", &tcp_mpsafe_proto); + +static int tcp_mpsafe_thread = NETMSG_SERVICE_ADAPTIVE; +TUNABLE_INT("net.inet.tcp.mpsafe_thread", &tcp_mpsafe_thread); +SYSCTL_INT(_net_inet_tcp, OID_AUTO, mpsafe_thread, CTLFLAG_RW, + &tcp_mpsafe_thread, 0, + "0:BGL, 1:Adaptive BGL, 2:No BGL(experimental)"); + int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); @@ -227,7 +240,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, static MALLOC_DEFINE(M_TCPTEMP, "tcptemp", "TCP Templates for Keepalives"); static struct malloc_pipe tcptemp_mpipe; -static void tcp_willblock(void); +static void tcp_willblock(int); static void tcp_cleartaocache (void); static void tcp_notify (struct inpcb *, int); @@ -282,8 +295,12 @@ struct inp_tp { char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; - struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; - struct callout inp_tp_delack; + struct tcp_callout inp_tp_rexmt; + struct tcp_callout inp_tp_persist; + struct tcp_callout inp_tp_keep; + struct tcp_callout inp_tp_2msl; + struct tcp_callout inp_tp_delack; + struct netmsg_tcp_timer inp_tp_timermsg; }; #undef ALIGNMENT #undef ALIGNM1 @@ -368,7 +385,6 @@ tcp_init(void) #endif syncache_init(); - tcp_sack_init(); tcp_thread_init(); } @@ -376,23 +392,41 @@ void tcpmsg_service_loop(void *dummy) { struct netmsg *msg; + int mplocked; + + /* + * Thread was started with TDF_MPSAFE + */ + mplocked = 0; - while ((msg = lwkt_waitport(&curthread->td_msgport, NULL))) { + while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) { do { logtcp(rxmsg); - msg->nm_lmsg.ms_cmd.cm_func(&msg->nm_lmsg); + mplocked = netmsg_service(msg, tcp_mpsafe_thread, + mplocked); } while ((msg = lwkt_getport(&curthread->td_msgport)) != NULL); + logtcp(delayed); - tcp_willblock(); + tcp_willblock(mplocked); logtcp(wait); } } static void -tcp_willblock(void) +tcp_willblock(int mplocked) { struct tcpcb *tp; int cpu = mycpu->gd_cpuid; + int unlock = 0; + + if (!mplocked && !tcp_mpsafe_proto) { + if (TAILQ_EMPTY(&tcpcbackq[cpu])) + return; + + get_mplock(); + mplocked = 1; + unlock = 1; + } while ((tp = TAILQ_FIRST(&tcpcbackq[cpu])) != NULL) { KKASSERT(tp->t_flags & TF_ONOUTPUTQ); @@ -400,6 +434,9 @@ tcp_willblock(void) TAILQ_REMOVE(&tcpcbackq[cpu], tp, t_outputq); tcp_output(tp); } + + if (unlock) + rel_mplock(); } @@ -509,6 +546,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, struct route_in6 *ro6 = NULL; struct route_in6 sro6; struct ip6_hdr *ip6 = ipgen; + boolean_t use_tmpro = TRUE; #ifdef INET6 boolean_t isipv6 = (IP_VHL_V(ip->ip_vhl) == 6); #else @@ -517,15 +555,23 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tp != NULL) { if (!(flags & TH_RST)) { - win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + win = ssb_space(&tp->t_inpcb->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } - if (isipv6) - ro6 = &tp->t_inpcb->in6p_route; - else - ro = &tp->t_inpcb->inp_route; - } else { + /* + * Don't use the route cache of a listen socket, + * it is not MPSAFE; use temporary route cache. + */ + if (tp->t_state != TCPS_LISTEN) { + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else + ro = &tp->t_inpcb->inp_route; + use_tmpro = FALSE; + } + } + if (use_tmpro) { if (isipv6) { ro6 = &sro6; bzero(ro6, sizeof *ro6); @@ -590,7 +636,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } m->m_len = tlen; m->m_pkthdr.len = tlen; - m->m_pkthdr.rcvif = (struct ifnet *) NULL; + m->m_pkthdr.rcvif = NULL; nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; @@ -627,6 +673,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ro6->ro_rt = NULL; } } else { + ipflags |= IP_DEBUGROUTE; ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); if ((ro == &sro) && (ro->ro_rt != NULL)) { RTFREE(ro->ro_rt); @@ -659,11 +706,20 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; /* Set up our timeouts. */ - callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); - callout_init(tp->tt_persist = &it->inp_tp_persist); - callout_init(tp->tt_keep = &it->inp_tp_keep); - callout_init(tp->tt_2msl = &it->inp_tp_2msl); - callout_init(tp->tt_delack = &it->inp_tp_delack); + tp->tt_rexmt = &it->inp_tp_rexmt; + tp->tt_persist = &it->inp_tp_persist; + tp->tt_keep = &it->inp_tp_keep; + tp->tt_2msl = &it->inp_tp_2msl; + tp->tt_delack = &it->inp_tp_delack; + tcp_inittimers(tp); + + /* + * Zero out timer message. We don't create it here, + * since the current CPU may not be the owner of this + * inpcb. + */ + tp->tt_msg = &it->inp_tp_timermsg; + bzero(tp->tt_msg, sizeof(*tp->tt_msg)); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE | TF_REQ_TSTMP); @@ -720,7 +776,7 @@ tcp_drop(struct tcpcb *tp, int error) #ifdef SMP struct netmsg_remwildcard { - struct lwkt_msg nm_lmsg; + struct netmsg nm_netmsg; struct inpcb *nm_inp; struct inpcbinfo *nm_pcbinfo; #if defined(INET6) @@ -735,8 +791,8 @@ struct netmsg_remwildcard { * inp can be detached. We do this by cycling through the cpus, ending up * on the cpu controlling the inp last and then doing the disconnect. */ -static int -in_pcbremwildcardhash_handler(struct lwkt_msg *msg0) +static void +in_pcbremwildcardhash_handler(struct netmsg *msg0) { struct netmsg_remwildcard *msg = (struct netmsg_remwildcard *)msg0; int cpu; @@ -751,14 +807,13 @@ in_pcbremwildcardhash_handler(struct lwkt_msg *msg0) else #endif in_pcbdetach(msg->nm_inp); - lwkt_replymsg(&msg->nm_lmsg, 0); + lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, 0); } else { in_pcbremwildcardhash_oncpu(msg->nm_inp, msg->nm_pcbinfo); cpu = (cpu + 1) % ncpus2; msg->nm_pcbinfo = &tcbinfo[cpu]; - lwkt_forwardmsg(tcp_cport(cpu), &msg->nm_lmsg); + lwkt_forwardmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg); } - return (EASYNC); } #endif @@ -803,13 +858,17 @@ tcp_close(struct tcpcb *tp) /* * Make sure that all of our timers are stopped before we - * delete the PCB. + * delete the PCB. For listen TCP socket (tp->tt_msg == NULL), + * timers are never used. If timer message is never created + * (tp->tt_msg->tt_tcb == NULL), timers are never used too. */ - callout_stop(tp->tt_rexmt); - callout_stop(tp->tt_persist); - callout_stop(tp->tt_keep); - callout_stop(tp->tt_2msl); - callout_stop(tp->tt_delack); + if (tp->tt_msg != NULL && tp->tt_msg->tt_tcb != NULL) { + tcp_callout_stop(tp, tp->tt_rexmt); + tcp_callout_stop(tp, tp->tt_persist); + tcp_callout_stop(tp, tp->tt_keep); + tcp_callout_stop(tp, tp->tt_2msl); + tcp_callout_stop(tp, tp->tt_delack); + } if (tp->t_flags & TF_ONOUTPUTQ) { KKASSERT(tp->tt_cpu == mycpu->gd_cpuid); @@ -888,7 +947,7 @@ tcp_close(struct tcpcb *tp) if (rt->rt_rmx.rmx_sendpipe != 0) dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe/2); else - dosavessthresh = (i < so->so_snd.sb_hiwat/2); + dosavessthresh = (i < so->so_snd.ssb_hiwat/2); if (dosavessthresh || (!(rt->rt_rmx.rmx_locks & RTV_SSTHRESH) && (i != 0) && (rt->rt_rmx.rmx_ssthresh != 0))) { @@ -926,6 +985,9 @@ no_valid_rt: inp->inp_ppcb = NULL; soisdisconnected(so); + + tcp_destroy_timermsg(tp); + /* * Discard the inp. In the SMP case a wildcard inp's hash (created * by a listen socket or an INADDR_ANY udp socket) is replicated @@ -942,16 +1004,15 @@ no_valid_rt: cpu = (inp->inp_pcbinfo->cpu + 1) % ncpus2; msg = kmalloc(sizeof(struct netmsg_remwildcard), - M_LWKTMSG, M_INTWAIT); - lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0, - lwkt_cmd_func(in_pcbremwildcardhash_handler), - lwkt_cmd_op_none); + M_LWKTMSG, M_INTWAIT); + netmsg_init(&msg->nm_netmsg, &netisr_afree_rport, 0, + in_pcbremwildcardhash_handler); #ifdef INET6 msg->nm_isinet6 = isafinet6; #endif msg->nm_inp = inp; msg->nm_pcbinfo = &tcbinfo[cpu]; - lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg); + lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg); } else #endif { @@ -990,18 +1051,17 @@ tcp_drain_oncpu(struct inpcbhead *head) #ifdef SMP struct netmsg_tcp_drain { - struct lwkt_msg nm_lmsg; + struct netmsg nm_netmsg; struct inpcbhead *nm_head; }; -static int -tcp_drain_handler(lwkt_msg_t lmsg) +static void +tcp_drain_handler(netmsg_t netmsg) { - struct netmsg_tcp_drain *nm = (void *)lmsg; + struct netmsg_tcp_drain *nm = (void *)netmsg; tcp_drain_oncpu(nm->nm_head); - lwkt_replymsg(lmsg, 0); - return(EASYNC); + lwkt_replymsg(&nm->nm_netmsg.nm_lmsg, 0); } #endif @@ -1034,11 +1094,10 @@ tcp_drain(void) M_LWKTMSG, M_NOWAIT); if (msg == NULL) continue; - lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0, - lwkt_cmd_func(tcp_drain_handler), - lwkt_cmd_op_none); + netmsg_init(&msg->nm_netmsg, &netisr_afree_rport, 0, + tcp_drain_handler); msg->nm_head = &tcbinfo[cpu].pcblisthead; - lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg); + lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg); } } #else @@ -1202,7 +1261,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) int cpu; int error; - error = suser(req->td); + error = priv_check(req->td, PRIV_ROOT); if (error != 0) return (error); error = SYSCTL_IN(req, addrs, sizeof addrs); @@ -1235,7 +1294,7 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) int error; boolean_t mapped = FALSE; - error = suser(req->td); + error = priv_check(req->td, PRIV_ROOT); if (error != 0) return (error); error = SYSCTL_IN(req, addrs, sizeof addrs); @@ -1276,6 +1335,29 @@ SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, (CTLTYPE_OPAQUE | CTLFLAG_RW), tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection"); #endif +struct netmsg_tcp_notify { + struct netmsg nm_nmsg; + void (*nm_notify)(struct inpcb *, int); + struct in_addr nm_faddr; + int nm_arg; +}; + +static void +tcp_notifyall_oncpu(struct netmsg *netmsg) +{ + struct netmsg_tcp_notify *nmsg = (struct netmsg_tcp_notify *)netmsg; + int nextcpu; + + in_pcbnotifyall(&tcbinfo[mycpuid].pcblisthead, nmsg->nm_faddr, + nmsg->nm_arg, nmsg->nm_notify); + + nextcpu = mycpuid + 1; + if (nextcpu < ncpus2) + lwkt_forwardmsg(tcp_cport(nextcpu), &netmsg->nm_lmsg); + else + lwkt_replymsg(&netmsg->nm_lmsg, 0); +} + void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { @@ -1346,10 +1428,16 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) } crit_exit(); } else { - for (cpu = 0; cpu < ncpus2; cpu++) { - in_pcbnotifyall(&tcbinfo[cpu].pcblisthead, faddr, arg, - notify); - } + struct netmsg_tcp_notify nmsg; + + KKASSERT(&curthread->td_msgport == cpu_portfn(0)); + netmsg_init(&nmsg.nm_nmsg, &curthread->td_msgport, 0, + tcp_notifyall_oncpu); + nmsg.nm_faddr = faddr; + nmsg.nm_arg = arg; + nmsg.nm_notify = notify; + + lwkt_domsg(tcp_cport(0), &nmsg.nm_nmsg.nm_lmsg, 0); } } @@ -1633,8 +1721,8 @@ tcp_mtudisc(struct inpcb *inp, int mtu) mss = (mss / MCLBYTES) * MCLBYTES; #endif - if (so->so_snd.sb_hiwat < mss) - mss = so->so_snd.sb_hiwat; + if (so->so_snd.ssb_hiwat < mss) + mss = so->so_snd.ssb_hiwat; tp->t_maxseg = mss; tp->t_rtttime = 0;