| 1 | /* |
| 2 | * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. |
| 3 | * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. |
| 4 | * |
| 5 | * This code is derived from software contributed to The DragonFly Project |
| 6 | * by Jeffrey M. Hsu. |
| 7 | * |
| 8 | * Redistribution and use in source and binary forms, with or without |
| 9 | * modification, are permitted provided that the following conditions |
| 10 | * are met: |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in the |
| 15 | * documentation and/or other materials provided with the distribution. |
| 16 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 17 | * contributors may be used to endorse or promote products derived |
| 18 | * from this software without specific, prior written permission. |
| 19 | * |
| 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 23 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 24 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 25 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 26 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 27 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 28 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 29 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 30 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 31 | * SUCH DAMAGE. |
| 32 | * |
| 33 | * $DragonFly: src/sys/netinet/ip_demux.c,v 1.45 2008/11/11 10:46:58 sephe Exp $ |
| 34 | */ |
| 35 | |
| 36 | #include "opt_inet.h" |
| 37 | #include "opt_rss.h" |
| 38 | |
| 39 | #include <sys/param.h> |
| 40 | #include <sys/systm.h> |
| 41 | #include <sys/kernel.h> |
| 42 | #include <sys/socket.h> |
| 43 | #include <sys/socketvar.h> |
| 44 | #include <sys/thread.h> |
| 45 | #include <sys/sysctl.h> |
| 46 | #include <sys/globaldata.h> |
| 47 | |
| 48 | #include <net/if.h> |
| 49 | #include <net/netisr.h> |
| 50 | #include <net/toeplitz2.h> |
| 51 | |
| 52 | #include <netinet/in_systm.h> |
| 53 | #include <netinet/in.h> |
| 54 | #include <netinet/in_var.h> |
| 55 | #include <netinet/in_pcb.h> |
| 56 | #include <netinet/ip.h> |
| 57 | #include <netinet/ip_var.h> |
| 58 | #include <netinet/tcp.h> |
| 59 | #include <netinet/tcpip.h> |
| 60 | #include <netinet/tcp_var.h> |
| 61 | #include <netinet/udp.h> |
| 62 | #include <netinet/udp_var.h> |
| 63 | |
| 64 | extern int udp_mpsafe_thread; |
| 65 | |
| 66 | /* |
| 67 | * Toeplitz hash functions - the idea is to match the hardware. |
| 68 | */ |
/*
 * UDP flavor of the hash: only the foreign/local address pair is fed to
 * the Toeplitz hash.  The port arguments are accepted so that the
 * signature parallels the TCP variant, but they are not used.
 */
static __inline int
INP_MPORT_HASH_UDP(in_addr_t faddr, in_addr_t laddr,
    in_port_t fport, in_port_t lport)
{
	return (toeplitz_hash(
	    toeplitz_rawhash_addr(faddr, laddr)));
}
| 75 | |
/*
 * TCP flavor of the hash: the foreign/local addresses and the
 * foreign/local ports are all fed to the Toeplitz hash.
 */
static __inline int
INP_MPORT_HASH_TCP(in_addr_t faddr, in_addr_t laddr,
    in_port_t fport, in_port_t lport)
{
	return (toeplitz_hash(
	    toeplitz_rawhash_addrport(faddr, laddr, fport, lport)));
}
| 83 | |
/*
 * Map a TCP address/port 4-tuple to a processor.  The result is whatever
 * INP_MPORT_HASH_TCP() yields for the tuple.
 */
int
tcp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
{
	int cpu;

	cpu = INP_MPORT_HASH_TCP(faddr, laddr, fport, lport);
	return cpu;
}
| 92 | |
/*
 * Map a UDP address/port 4-tuple to a processor.
 *
 * Not implemented yet: unless "notyet" is defined at build time, every
 * tuple maps to protocol thread 0.
 */
int
udp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
{
	int cpu = 0;

#ifdef notyet
	cpu = INP_MPORT_HASH_UDP(faddr, laddr, fport, lport);
#endif
	return cpu;
}
| 105 | |
| 106 | /* |
| 107 | * If the packet is a valid IP datagram, upon returning of this function |
| 108 | * following things are promised: |
| 109 | * |
| 110 | * o IP header (including any possible IP options) and any data preceding |
| 111 | * IP header (usually linker layer header) are in one mbuf (m_len). |
| 112 | * o IP header length is not less than the minimum (sizeof(struct ip)). |
| 113 | * o IP total length is not less than IP header length. |
| 114 | * o IP datagram resides completely in the mbuf chain, |
| 115 | * i.e. pkthdr.len >= IP total length. |
| 116 | * |
| 117 | * If the packet is a UDP datagram, |
| 118 | * o IP header (including any possible IP options) and UDP header are in |
| 119 | * one mbuf (m_len). |
| 120 | * o IP total length is not less than (IP header length + UDP header length). |
| 121 | * |
| 122 | * If the packet is a TCP segment, |
| 123 | * o IP header (including any possible IP options) and TCP header (including |
| 124 | * any possible TCP options) are in one mbuf (m_len). |
| 125 | * o TCP header length is not less than the minimum (sizeof(struct tcphdr)). |
| 126 | * o IP total length is not less than (IP header length + TCP header length). |
| 127 | */ |
| 128 | boolean_t |
| 129 | ip_lengthcheck(struct mbuf **mp, int hoff) |
| 130 | { |
| 131 | struct mbuf *m = *mp; |
| 132 | struct ip *ip; |
| 133 | int len, iphlen, iplen; |
| 134 | struct tcphdr *th; |
| 135 | int thoff; /* TCP data offset */ |
| 136 | |
| 137 | len = hoff + sizeof(struct ip); |
| 138 | |
| 139 | /* The packet must be at least the size of an IP header. */ |
| 140 | if (m->m_pkthdr.len < len) { |
| 141 | ipstat.ips_tooshort++; |
| 142 | goto fail; |
| 143 | } |
| 144 | |
| 145 | /* The fixed IP header must reside completely in the first mbuf. */ |
| 146 | if (m->m_len < len) { |
| 147 | m = m_pullup(m, len); |
| 148 | if (m == NULL) { |
| 149 | ipstat.ips_toosmall++; |
| 150 | goto fail; |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | ip = mtodoff(m, struct ip *, hoff); |
| 155 | |
| 156 | /* Bound check the packet's stated IP header length. */ |
| 157 | iphlen = ip->ip_hl << 2; |
| 158 | if (iphlen < sizeof(struct ip)) { /* minimum header length */ |
| 159 | ipstat.ips_badhlen++; |
| 160 | goto fail; |
| 161 | } |
| 162 | |
| 163 | /* The full IP header must reside completely in the one mbuf. */ |
| 164 | if (m->m_len < hoff + iphlen) { |
| 165 | m = m_pullup(m, hoff + iphlen); |
| 166 | if (m == NULL) { |
| 167 | ipstat.ips_badhlen++; |
| 168 | goto fail; |
| 169 | } |
| 170 | ip = mtodoff(m, struct ip *, hoff); |
| 171 | } |
| 172 | |
| 173 | iplen = ntohs(ip->ip_len); |
| 174 | |
| 175 | /* |
| 176 | * Check that the amount of data in the buffers is as |
| 177 | * at least much as the IP header would have us expect. |
| 178 | */ |
| 179 | if (m->m_pkthdr.len < hoff + iplen) { |
| 180 | ipstat.ips_tooshort++; |
| 181 | goto fail; |
| 182 | } |
| 183 | |
| 184 | /* |
| 185 | * Fragments other than the first fragment don't have much |
| 186 | * length information. |
| 187 | */ |
| 188 | if (ntohs(ip->ip_off) & IP_OFFMASK) |
| 189 | goto ipcheckonly; |
| 190 | |
| 191 | /* |
| 192 | * The TCP/IP or UDP/IP header must be entirely contained within |
| 193 | * the first fragment of a packet. Packet filters will break if they |
| 194 | * aren't. |
| 195 | * |
| 196 | * Since the packet will be trimmed to ip_len we must also make sure |
| 197 | * the potentially trimmed down length is still sufficient to hold |
| 198 | * the header(s). |
| 199 | */ |
| 200 | switch (ip->ip_p) { |
| 201 | case IPPROTO_TCP: |
| 202 | if (iplen < iphlen + sizeof(struct tcphdr)) { |
| 203 | ++tcpstat.tcps_rcvshort; |
| 204 | goto fail; |
| 205 | } |
| 206 | if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) { |
| 207 | m = m_pullup(m, hoff + iphlen + sizeof(struct tcphdr)); |
| 208 | if (m == NULL) { |
| 209 | tcpstat.tcps_rcvshort++; |
| 210 | goto fail; |
| 211 | } |
| 212 | ip = mtodoff(m, struct ip *, hoff); |
| 213 | } |
| 214 | th = (struct tcphdr *)((caddr_t)ip + iphlen); |
| 215 | thoff = th->th_off << 2; |
| 216 | if (thoff < sizeof(struct tcphdr) || |
| 217 | thoff + iphlen > ntohs(ip->ip_len)) { |
| 218 | tcpstat.tcps_rcvbadoff++; |
| 219 | goto fail; |
| 220 | } |
| 221 | if (m->m_len < hoff + iphlen + thoff) { |
| 222 | m = m_pullup(m, hoff + iphlen + thoff); |
| 223 | if (m == NULL) { |
| 224 | tcpstat.tcps_rcvshort++; |
| 225 | goto fail; |
| 226 | } |
| 227 | } |
| 228 | break; |
| 229 | case IPPROTO_UDP: |
| 230 | if (iplen < iphlen + sizeof(struct udphdr)) { |
| 231 | ++udpstat.udps_hdrops; |
| 232 | goto fail; |
| 233 | } |
| 234 | if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) { |
| 235 | m = m_pullup(m, hoff + iphlen + sizeof(struct udphdr)); |
| 236 | if (m == NULL) { |
| 237 | udpstat.udps_hdrops++; |
| 238 | goto fail; |
| 239 | } |
| 240 | } |
| 241 | break; |
| 242 | default: |
| 243 | ipcheckonly: |
| 244 | if (iplen < iphlen) { |
| 245 | ++ipstat.ips_badlen; |
| 246 | goto fail; |
| 247 | } |
| 248 | break; |
| 249 | } |
| 250 | |
| 251 | m->m_flags |= M_LENCHECKED; |
| 252 | *mp = m; |
| 253 | return TRUE; |
| 254 | |
| 255 | fail: |
| 256 | if (m != NULL) |
| 257 | m_freem(m); |
| 258 | *mp = NULL; |
| 259 | return FALSE; |
| 260 | } |
| 261 | |
| 262 | /* |
| 263 | * Assign a protocol processing thread to a packet. The IP header is at |
| 264 | * offset (hoff) in the packet (i.e. the mac header might still be intact). |
| 265 | * |
| 266 | * This function can blow away the mbuf if the packet is malformed. |
| 267 | */ |
| 268 | void |
| 269 | ip_cpufn(struct mbuf **mptr, int hoff, int dir) |
| 270 | { |
| 271 | struct ip *ip; |
| 272 | int iphlen; |
| 273 | struct tcphdr *th; |
| 274 | struct udphdr *uh; |
| 275 | struct mbuf *m; |
| 276 | int thoff; /* TCP data offset */ |
| 277 | int cpu; |
| 278 | |
| 279 | if (!ip_lengthcheck(mptr, hoff)) |
| 280 | return; |
| 281 | |
| 282 | m = *mptr; |
| 283 | ip = mtodoff(m, struct ip *, hoff); |
| 284 | iphlen = ip->ip_hl << 2; |
| 285 | |
| 286 | /* |
| 287 | * XXX generic packet handling defrag on CPU 0 for now. |
| 288 | */ |
| 289 | if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { |
| 290 | cpu = 0; |
| 291 | goto back; |
| 292 | } |
| 293 | |
| 294 | switch (ip->ip_p) { |
| 295 | case IPPROTO_TCP: |
| 296 | th = (struct tcphdr *)((caddr_t)ip + iphlen); |
| 297 | thoff = th->th_off << 2; |
| 298 | cpu = INP_MPORT_HASH_TCP(ip->ip_src.s_addr, |
| 299 | ip->ip_dst.s_addr, |
| 300 | th->th_sport, |
| 301 | th->th_dport); |
| 302 | break; |
| 303 | |
| 304 | case IPPROTO_UDP: |
| 305 | uh = (struct udphdr *)((caddr_t)ip + iphlen); |
| 306 | |
| 307 | cpu = INP_MPORT_HASH_UDP(ip->ip_src.s_addr, |
| 308 | ip->ip_dst.s_addr, |
| 309 | uh->uh_sport, |
| 310 | uh->uh_dport); |
| 311 | break; |
| 312 | |
| 313 | default: |
| 314 | cpu = 0; |
| 315 | break; |
| 316 | } |
| 317 | back: |
| 318 | m->m_flags |= M_HASH; |
| 319 | m->m_pkthdr.hash = cpu; |
| 320 | } |
| 321 | |
| 322 | void |
| 323 | ip_cpufn_in(struct mbuf **mptr, int hoff) |
| 324 | { |
| 325 | ip_cpufn(mptr, hoff, IP_MPORT_IN); |
| 326 | } |
| 327 | |
| 328 | /* |
| 329 | * Verify and adjust the hash value of the packet. |
| 330 | * |
| 331 | * Unlike ip_cpufn(), the packet content is not accessed. The packet info |
| 332 | * (pi) and the hash of the packet (m_pkthdr.hash) is used instead. |
| 333 | * |
| 334 | * Caller has already made sure that m_pkthdr.hash is valid, i.e. m_flags |
| 335 | * has M_HASH set. |
| 336 | */ |
| 337 | void |
| 338 | ip_hashcheck(struct mbuf *m, const struct pktinfo *pi) |
| 339 | { |
| 340 | KASSERT((m->m_flags & M_HASH), ("no valid packet hash\n")); |
| 341 | KASSERT(m->m_pkthdr.hash < ncpus2, |
| 342 | ("invalid packet hash %#x\n", m->m_pkthdr.hash)); |
| 343 | |
| 344 | /* |
| 345 | * XXX generic packet handling defrag on CPU 0 for now. |
| 346 | */ |
| 347 | if (pi->pi_flags & PKTINFO_FLAG_FRAG) { |
| 348 | m->m_pkthdr.hash = 0; |
| 349 | return; |
| 350 | } |
| 351 | |
| 352 | switch (pi->pi_l3proto) { |
| 353 | case IPPROTO_TCP: |
| 354 | case IPPROTO_UDP: |
| 355 | break; |
| 356 | |
| 357 | default: |
| 358 | /* Let software calculate the hash */ |
| 359 | m->m_flags &= ~M_HASH; |
| 360 | break; |
| 361 | } |
| 362 | } |
| 363 | |
| 364 | /* |
| 365 | * This is used to map a socket to a message port for sendmsg() and friends. |
| 366 | * It is not called for any other purpose. In the case of TCP we just return |
| 367 | * the port already installed in the socket. |
| 368 | */ |
| 369 | lwkt_port_t |
| 370 | tcp_soport(struct socket *so, struct sockaddr *nam, |
| 371 | struct mbuf **dummy __unused) |
| 372 | { |
| 373 | return(so->so_port); |
| 374 | } |
| 375 | |
| 376 | /* |
| 377 | * Used to route icmp messages to the proper protocol thread for ctlinput |
| 378 | * operation. |
| 379 | */ |
| 380 | lwkt_port_t |
| 381 | tcp_ctlport(int cmd, struct sockaddr *sa, void *vip) |
| 382 | { |
| 383 | struct ip *ip = vip; |
| 384 | struct tcphdr *th; |
| 385 | struct in_addr faddr; |
| 386 | int cpu; |
| 387 | |
| 388 | faddr = ((struct sockaddr_in *)sa)->sin_addr; |
| 389 | if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) |
| 390 | return(NULL); |
| 391 | if (ip == NULL || PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { |
| 392 | /* |
| 393 | * A new message will be allocated later to save necessary |
| 394 | * information and will be forwarded to all network protocol |
| 395 | * threads in the following way: |
| 396 | * |
| 397 | * (the the thread owns the msgport that we return here) |
| 398 | * netisr0 <--+ |
| 399 | * | | |
| 400 | * | | |
| 401 | * | | |
| 402 | * +-------+ |
| 403 | * sendmsg |
| 404 | * [msg is kmalloc()ed] |
| 405 | * |
| 406 | * |
| 407 | * Later on, when the msg is received by netisr0: |
| 408 | * |
| 409 | * forwardmsg forwardmsg |
| 410 | * netisr0 ---------> netisr1 ---------> netisrN |
| 411 | * [msg is kfree()ed] |
| 412 | */ |
| 413 | return cpu0_ctlport(cmd, sa, vip); |
| 414 | } else { |
| 415 | th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); |
| 416 | cpu = tcp_addrcpu(faddr.s_addr, th->th_dport, |
| 417 | ip->ip_src.s_addr, th->th_sport); |
| 418 | } |
| 419 | return(cpu_portfn(cpu)); |
| 420 | } |
| 421 | |
| 422 | lwkt_port_t |
| 423 | tcp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) |
| 424 | { |
| 425 | return(cpu_portfn(tcp_addrcpu(faddr, fport, laddr, lport))); |
| 426 | } |
| 427 | |
| 428 | lwkt_port_t |
| 429 | tcp_addrport0(void) |
| 430 | { |
| 431 | return(cpu_portfn(0)); |
| 432 | } |
| 433 | |
| 434 | lwkt_port_t |
| 435 | udp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport) |
| 436 | { |
| 437 | return(cpu_portfn(udp_addrcpu(faddr, fport, laddr, lport))); |
| 438 | } |
| 439 | |
| 440 | /* |
| 441 | * Used to route icmp messages to the proper protocol thread for ctlinput |
| 442 | * operation. |
| 443 | */ |
| 444 | lwkt_port_t |
| 445 | udp_ctlport(int cmd, struct sockaddr *sa, void *vip) |
| 446 | { |
| 447 | struct ip *ip = vip; |
| 448 | struct udphdr *uh; |
| 449 | struct in_addr faddr; |
| 450 | int cpu; |
| 451 | |
| 452 | faddr = ((struct sockaddr_in *)sa)->sin_addr; |
| 453 | if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) |
| 454 | return(NULL); |
| 455 | if (PRC_IS_REDIRECT(cmd)) { |
| 456 | /* |
| 457 | * See the comment in tcp_ctlport; the only difference |
| 458 | * is that message is forwarded to UDP protocol theads. |
| 459 | */ |
| 460 | return cpu0_ctlport(cmd, sa, vip); |
| 461 | } else if (ip == NULL || cmd == PRC_HOSTDEAD) { |
| 462 | /* |
| 463 | * XXX |
| 464 | * Once UDP inpcbs are CPU localized, we should do |
| 465 | * the same forwarding as PRC_IS_REDIRECT(cmd) |
| 466 | */ |
| 467 | cpu = 0; |
| 468 | } else { |
| 469 | uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); |
| 470 | |
| 471 | cpu = udp_addrcpu(faddr.s_addr, ip->ip_src.s_addr, |
| 472 | uh->uh_dport, uh->uh_sport); |
| 473 | } |
| 474 | return (cpu_portfn(cpu)); |
| 475 | } |