From 1fe8db0691fe8eab833b067336afd654ac7f8cfc Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Sat, 7 Jun 2014 21:17:08 +0800 Subject: [PATCH] udp: Dispatch UDP datagrams to the correct netisr to perform ip_output() Redispatch UDP datagrams to the netisr, whose owner CPU matches the UDP datagrams hash, to do the ip_output(). As measured bt KTR, the udp_send() spends most of its time in ip_output(). To properly support this, following stuffs are added/changed: - Add a network private lwkt message flag to indicate the so_port of the netmsg should not be matched against the current netisr's msgport. This flag is set when we redispatch the UDP datagrams to other netisrs. - Don't use priority messages for netisr barrier, so at UDP socket close time, UDP datagrams pending on other netisr msgport could be properly sync'ed. The UDP datagrams redispatch itself: - If IP options are ever configured (supposely none), a copy of it will be carried along w/ the UDP datagram to the target netisr. The copy is made mainly because it is not safe to access the IP options of the inpcb in netisr which does not own the inpcb. (*) On the other hand accessing inpcb's multicast options is safe since multicast UDP datagrams output and multicast options configuration all happen in netisr0. - Add nm_priv into netmsg_pru_send, which saves flags to be passed to ip_output() for the UDP datagram. This does not changes the size of netmsg_pru_send on x86_64 due to the implicit 4bytes padding. - udp_addrcpu_pkt() is added to calculate the "real" CPU for the UDP datagrams. - Don't use inpcb route cache for redispatched UDP datagrams. Since: o The cached route is usually not for the UDP datagrams' destination. o Accessing inpcb route cache in the netisr, which is not the owner of the inpcb is not safe. On i7-3770 w/ 82599ES, this increases 18bytes UDP request/response performance by ~19% (1.12M trasactions/s ---> 1.34M transactions/s) This commit also makes lockless firewall state table doable, since input and output of UDP datagrams, which have same hash, are running in the same netisr now! --- sys/net/netisr.c | 17 ++++-- sys/net/netmsg.h | 3 ++ sys/netinet/ip_demux.c | 11 ++++ sys/netinet/udp_usrreq.c | 109 ++++++++++++++++++++++++++++++++++++--- sys/netinet/udp_var.h | 2 + 5 files changed, 132 insertions(+), 10 deletions(-) diff --git a/sys/net/netisr.c b/sys/net/netisr.c index 1e786a2122..e428405895 100644 --- a/sys/net/netisr.c +++ b/sys/net/netisr.c @@ -293,7 +293,12 @@ netmsg_service_loop(void *arg) KASSERT(msg->nm_dispatch != NULL, ("netmsg_service isr %d badmsg", msg->lmsg.u.ms_result)); - if (msg->nm_so && + /* + * Don't match so_port, if the msg explicitly + * asks us to ignore its so_port. + */ + if ((msg->lmsg.ms_flags & MSGF_IGNSOPORT) == 0 && + msg->nm_so && msg->nm_so->so_port != &td->td_msgport) { /* * Sockets undergoing connect or disconnect @@ -655,8 +660,14 @@ netisr_barrier_set(struct netisr_barrier *br) msg = kmalloc(sizeof(struct netmsg_barrier), M_LWKTMSG, M_WAITOK); - netmsg_init(&msg->base, NULL, &netisr_afree_rport, - MSGF_PRIORITY, netisr_barrier_dispatch); + + /* + * Don't use priority message here; mainly to keep + * it ordered w/ the previous data packets sent by + * the caller. + */ + netmsg_init(&msg->base, NULL, &netisr_afree_rport, 0, + netisr_barrier_dispatch); msg->br_cpumask = &other_cpumask; msg->br_done = NETISR_BR_NOTDONE; diff --git a/sys/net/netmsg.h b/sys/net/netmsg.h index 564c8fc97e..ac8be2eb40 100644 --- a/sys/net/netmsg.h +++ b/sys/net/netmsg.h @@ -59,6 +59,8 @@ struct netmsg_base { struct socket *nm_so; }; +#define MSGF_IGNSOPORT MSGF_USER0 /* don't check so_port */ + typedef struct netmsg_base *netmsg_base_t; /* @@ -201,6 +203,7 @@ struct netmsg_pru_rcvoob { struct netmsg_pru_send { struct netmsg_base base; int nm_flags; /* PRUS_xxx */ + int nm_priv; /* proto priv. */ struct mbuf *nm_m; struct sockaddr *nm_addr; struct mbuf *nm_control; diff --git a/sys/netinet/ip_demux.c b/sys/netinet/ip_demux.c index 9db38adf28..aa10858904 100644 --- a/sys/netinet/ip_demux.c +++ b/sys/netinet/ip_demux.c @@ -99,6 +99,17 @@ udp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) #endif } +int +udp_addrcpu_pkt(in_addr_t faddr, in_port_t fport, in_addr_t laddr, + in_port_t lport) +{ + if (IN_MULTICAST(ntohl(faddr))) { + /* XXX handle multicast on CPU0 for now */ + return 0; + } + return (netisr_hashcpu(INP_MPORT_HASH_UDP(faddr, laddr, fport, lport))); +} + /* * If the packet is a valid IP datagram, upon returning of this function * following things are promised: diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 4731da715e..6a88927b38 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -126,9 +126,12 @@ #endif KTR_INFO_MASTER(udp); -KTR_INFO(KTR_UDP, udp, output_beg, 0, UDP_KTR_STRING, UDP_KTR_ARGS); -KTR_INFO(KTR_UDP, udp, output_end, 1, UDP_KTR_STRING, UDP_KTR_ARGS); -KTR_INFO(KTR_UDP, udp, ip_output, 2, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, send_beg, 0, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, send_end, 1, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, send_ipout, 2, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, redisp_ipout_beg, 3, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, redisp_ipout_end, 4, UDP_KTR_STRING, UDP_KTR_ARGS); +KTR_INFO(KTR_UDP, udp, send_redisp, 5, UDP_KTR_STRING, UDP_KTR_ARGS); #define logudp(name, inp) KTR_LOG(udp_##name, inp) @@ -853,6 +856,37 @@ udp_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp_getcred, "S,ucred", "Get the ucred of a UDP connection"); +static void +udp_send_redispatch(netmsg_t msg) +{ + struct mbuf *m = msg->send.nm_m; + int pru_flags = msg->send.nm_flags; + struct inpcb *inp = msg->send.base.nm_so->so_pcb; + struct mbuf *m_opt = msg->send.nm_control; /* XXX save ipopt */ + int flags = msg->send.nm_priv; /* ip_output flags */ + int error; + + logudp(redisp_ipout_beg, inp); + + /* + * - Don't use inp route cache. It should only be used in the + * inp owner netisr. + * - Access to inp_moptions should be safe, since multicast UDP + * datagrams are redispatched to netisr0 and inp_moptions is + * changed only in netisr0. + */ + error = ip_output(m, m_opt, NULL, flags, inp->inp_moptions, inp); + if ((pru_flags & PRUS_NOREPLY) == 0) + lwkt_replymsg(&msg->send.base.lmsg, error); + + if (m_opt != NULL) { + /* Free saved ip options, if any */ + m_freem(m_opt); + } + + logudp(redisp_ipout_end, inp); +} + static void udp_send(netmsg_t msg) { @@ -867,12 +901,12 @@ udp_send(netmsg_t msg) struct udpiphdr *ui; int len = m->m_pkthdr.len; struct sockaddr_in *sin; /* really is initialized before use */ - int error = 0; + int error = 0, cpu; KKASSERT(&curthread->td_msgport == netisr_cpuport(0)); KKASSERT(msg->send.nm_control == NULL); - logudp(output_beg, inp); + logudp(send_beg, inp); if (inp == NULL) { error = EINVAL; @@ -1010,7 +1044,68 @@ udp_send(netmsg_t msg) if (pru_flags & PRUS_DONTROUTE) flags |= SO_DONTROUTE; - logudp(ip_output, inp); + cpu = udp_addrcpu_pkt(ui->ui_dst.s_addr, ui->ui_dport, + ui->ui_src.s_addr, ui->ui_sport); + if (cpu != mycpuid) { + struct mbuf *m_opt = NULL; + struct netmsg_pru_send *smsg; + struct lwkt_port *port = netisr_cpuport(cpu); + + /* + * Not on the CPU that matches this UDP datagram hash; + * redispatch to the correct CPU to do the ip_output(). + */ + if (inp->inp_options != NULL) { + /* + * If there are ip options, then save a copy, + * since accessing inp_options on other CPUs' + * is not safe. + * + * XXX optimize this? + */ + m_opt = m_copym(inp->inp_options, 0, M_COPYALL, + MB_WAIT); + } + if ((pru_flags & PRUS_NOREPLY) == 0) { + /* + * Change some parts of the original netmsg and + * forward it to the target netisr. + * + * NOTE: so_port MUST NOT be checked in the target + * netisr. + */ + smsg = &msg->send; + smsg->nm_priv = flags; /* ip_output flags */ + smsg->nm_m = m; + smsg->nm_control = m_opt; /* XXX save ipopt */ + smsg->base.lmsg.ms_flags |= MSGF_IGNSOPORT; + smsg->base.nm_dispatch = udp_send_redispatch; + lwkt_forwardmsg(port, &smsg->base.lmsg); + } else { + /* + * Recreate the netmsg, since the original mbuf + * could have been changed. And send it to the + * target netisr. + * + * NOTE: so_port MUST NOT be checked in the target + * netisr. + */ + smsg = &m->m_hdr.mh_sndmsg; + netmsg_init(&smsg->base, so, &netisr_apanic_rport, + MSGF_IGNSOPORT, udp_send_redispatch); + smsg->nm_priv = flags; /* ip_output flags */ + smsg->nm_flags = pru_flags; + smsg->nm_m = m; + smsg->nm_control = m_opt; /* XXX save ipopt */ + lwkt_sendmsg(port, &smsg->base.lmsg); + } + + /* This UDP datagram is redispatched; done */ + logudp(send_redisp, inp); + return; + } + + logudp(send_ipout, inp); error = ip_output(m, inp->inp_options, &inp->inp_route, flags, inp->inp_moptions, inp); m = NULL; @@ -1026,7 +1121,7 @@ release: if ((pru_flags & PRUS_NOREPLY) == 0) lwkt_replymsg(&msg->send.base.lmsg, error); - logudp(output_end, inp); + logudp(send_end, inp); } u_long udp_sendspace = 9216; /* really max datagram size */ diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index 34575935c6..1e9816b63b 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -155,6 +155,8 @@ extern int log_in_vain; int udp_addrcpu (in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport); +int udp_addrcpu_pkt (in_addr_t faddr, in_port_t fport, + in_addr_t laddr, in_port_t lport); struct lwkt_port *udp_addrport (in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport); void udp_ctlinput(netmsg_t msg); -- 2.41.0