From 02ad2f0b874fb0a45eb69750219f79f5e8982272 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Wed, 31 Jul 2013 16:31:02 +0800 Subject: [PATCH] tcp: Allow socket/syncache inheritance between SO_REUSEPORT listen sockets This is intend to address the drawback of the following SO_REUSEPORT tcp listen socket usage model: Multiple processses create their own listen socket w/ SO_REUSEPORT, if one of the process crashed, i.e. the listen socket is closed, then any syncache, pending sockets on the completion and incompletion queues are just dropped. Compared w/ multiple processes sharing the same tcp listen socket, this drawback could be unacceptable (e.g. when nginx reload configure). This commit addresses this drawback by allowing another tcp listen socket in the same local group, i.e. bound to the same listen port and address, to inherit the syncache and pending sockets of the to-be-closed tcp listen socket. The tcp listen socket, which inherits the syncache and pending sockets, is the last tcp listen socket in the same local group. --- sys/kern/uipc_socket.c | 67 ++++++++++++++++++++++++++++++++++++++ sys/netinet/in_pcb.c | 40 +++++++++++++++++++++++ sys/netinet/in_pcb.h | 2 ++ sys/netinet/tcp_subr.c | 39 ++++++++++++++++++++-- sys/netinet/tcp_syncache.c | 4 +-- sys/netinet/tcp_var.h | 2 +- sys/sys/socketvar.h | 1 + 7 files changed, 149 insertions(+), 6 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 26a88e9204..856dd1ff7a 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -452,6 +452,73 @@ sodiscard(struct socket *so) sosetstate(so, SS_NOFDREF); /* take ref */ } +void +soinherit(struct socket *so, struct socket *so_inh) +{ + TAILQ_HEAD(, socket) comp, incomp; + struct socket *sp; + int qlen, incqlen; + + KASSERT(so->so_options & SO_ACCEPTCONN, + ("so does not accept connection")); + KASSERT(so_inh->so_options & SO_ACCEPTCONN, + ("so_inh does not accept connection")); + + TAILQ_INIT(&comp); + TAILQ_INIT(&incomp); + + lwkt_getpooltoken(so); + lwkt_getpooltoken(so_inh); + + /* + * Save completed queue and incompleted queue + */ + TAILQ_CONCAT(&comp, &so->so_comp, so_list); + qlen = so->so_qlen; + so->so_qlen = 0; + + TAILQ_CONCAT(&incomp, &so->so_incomp, so_list); + incqlen = so->so_incqlen; + so->so_incqlen = 0; + + /* + * Append the saved completed queue and incompleted + * queue to the socket inherits them. + * + * XXX + * This may temporarily break the inheriting socket's + * so_qlimit. + */ + TAILQ_FOREACH(sp, &comp, so_list) { + sp->so_head = so_inh; + crfree(sp->so_cred); + sp->so_cred = crhold(so_inh->so_cred); + } + + TAILQ_FOREACH(sp, &incomp, so_list) { + sp->so_head = so_inh; + crfree(sp->so_cred); + sp->so_cred = crhold(so_inh->so_cred); + } + + TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list); + so_inh->so_qlen += qlen; + + TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list); + so_inh->so_incqlen += incqlen; + + lwkt_relpooltoken(so_inh); + lwkt_relpooltoken(so); + + if (qlen) { + /* + * "New" connections have arrived + */ + sorwakeup(so_inh); + wakeup(&so_inh->so_timeo); + } +} + static int soclose_sync(struct socket *so, int fflag) { diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index de8465fb25..61912bf2fd 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1239,6 +1239,46 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, return (match); } +struct inpcb * +in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo, + const struct inpcb *inp) +{ + const struct inp_localgrphead *hdr; + const struct inp_localgroup *grp; + int i; + + if (pcbinfo->localgrphashbase == NULL) + return NULL; + + hdr = &pcbinfo->localgrphashbase[ + INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; + + LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL || grp->il_inpcnt == 1) + return NULL; + + KASSERT(grp->il_inpcnt >= 2, + ("invalid localgroup inp count %d", grp->il_inpcnt)); + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] == inp) { + int last = grp->il_inpcnt - 1; + + if (i == last) + last = grp->il_inpcnt - 2; + return grp->il_inp[last]; + } + } + return NULL; +} + static struct inpcb * inp_localgroup_lookup(const struct inpcbinfo *pcbinfo, struct in_addr laddr, uint16_t lport, uint32_t pkt_hash) diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d5394bd744..807b48359b 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -465,6 +465,8 @@ void in_pcbremconnhash(struct inpcb *inp); void in_pcbremlists (struct inpcb *inp); int prison_xinpcb (struct thread *p, struct inpcb *inp); void in_savefaddr (struct socket *so, const struct sockaddr *faddr); +struct inpcb * + in_pcblocalgroup_last(const struct inpcbinfo *, const struct inpcb *); int in_pcblist_global(SYSCTL_HANDLER_ARGS); int in_pcblist_global_nomarker(SYSCTL_HANDLER_ARGS, diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index e834718646..1fb3e51ba3 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -784,6 +784,7 @@ tcp_drop(struct tcpcb *tp, int error) struct netmsg_listen_detach { struct netmsg_base base; struct tcpcb *nm_tp; + struct tcpcb *nm_tp_inh; }; static void @@ -794,7 +795,7 @@ tcp_listen_detach_handler(netmsg_t msg) int cpu = mycpuid, nextcpu; if (tp->t_flags & TF_LISTEN) - syncache_destroy(tp); + syncache_destroy(tp, nmsg->nm_tp_inh); in_pcbremwildcardhash_oncpu(tp->t_inpcb, &tcbinfo[cpu]); @@ -816,6 +817,8 @@ tcp_close(struct tcpcb *tp) { struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; + struct inpcb *inp_inh = NULL; + struct tcpcb *tp_inh = NULL; struct socket *so = inp->inp_socket; struct rtentry *rt; boolean_t dosavessthresh; @@ -826,6 +829,26 @@ tcp_close(struct tcpcb *tp) const boolean_t isipv6 = FALSE; #endif + if (tp->t_flags & TF_LISTEN) { + /* + * Pending socket/syncache inheritance + * + * If this is a listen(2) socket, find another listen(2) + * socket in the same local group, which could inherit + * the syncache and sockets pending on the completion + * and incompletion queues. + * + * NOTE: + * Currently the inheritance could only happen on the + * listen(2) sockets w/ SO_REUSEPORT set. + */ + KASSERT(&curthread->td_msgport == netisr_cpuport(0), + ("listen socket close not in netisr0")); + inp_inh = in_pcblocalgroup_last(&tcbinfo[0], inp); + if (inp_inh != NULL) + tp_inh = intotcpcb(inp_inh); + } + /* * INP_WILDCARD_MP indicates that listen(2) has been called on * this socket. This implies: @@ -854,6 +877,7 @@ tcp_close(struct tcpcb *tp) netmsg_init(&nmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY, tcp_listen_detach_handler); nmsg.nm_tp = tp; + nmsg.nm_tp_inh = tp_inh; lwkt_domsg(netisr_cpuport(1), &nmsg.base.lmsg, 0); inp->inp_flags &= ~INP_WILDCARD_MP; @@ -996,8 +1020,17 @@ no_valid_rt: tcp_destroy_timermsg(tp); tcp_output_cancel(tp); - if (tp->t_flags & TF_LISTEN) - syncache_destroy(tp); + if (tp->t_flags & TF_LISTEN) { + syncache_destroy(tp, tp_inh); + if (inp_inh != NULL && inp_inh->inp_socket != NULL) { + /* + * Pending sockets inheritance only needs + * to be done once in the current thread, + * i.e. netisr0. + */ + soinherit(so, inp_inh->inp_socket); + } + } so_async_rcvd_drop(so); /* Drop the reference for the asynchronized pru_rcvd */ diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index be83d4051a..af1a57f2f6 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -418,7 +418,7 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) } void -syncache_destroy(struct tcpcb *tp) +syncache_destroy(struct tcpcb *tp, struct tcpcb *tp_inh) { struct tcp_syncache_percpu *syncache_percpu; struct syncache_head *bucket; @@ -432,7 +432,7 @@ syncache_destroy(struct tcpcb *tp) bucket = &syncache_percpu->hashbase[i]; TAILQ_FOREACH(sc, &bucket->sch_bucket, sc_hash) { if (sc->sc_tp == tp) - sc->sc_tp = NULL; + sc->sc_tp = tp_inh; } } } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 90922ec349..9f0599408a 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -717,7 +717,7 @@ int syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket *, struct mbuf *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); -void syncache_destroy(struct tcpcb *tp); +void syncache_destroy(struct tcpcb *tp, struct tcpcb *new_tp); #ifdef TCP_SIGNATURE int tcpsignature_apply(void *fstate, void *data, unsigned int len); diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index ce17e31c04..288ccf11f8 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -463,6 +463,7 @@ int solisten (struct socket *so, int backlog, struct thread *td); struct socket *sonewconn (struct socket *head, int connstatus); struct socket *sonewconn_faddr (struct socket *head, int connstatus, const struct sockaddr *faddr); +void soinherit(struct socket *so, struct socket *so_inh); int sooptcopyin (struct sockopt *sopt, void *buf, size_t len, size_t minlen); int soopt_to_kbuf (struct sockopt *sopt, void *buf, size_t len, -- 2.41.0