nrelease - fix/improve livecd
[dragonfly.git] / sys / netinet / ip_demux.c
CommitLineData
bf82f9b7 1/*
380d2ea3
JH
2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved.
3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved.
f23061d4 4 *
380d2ea3
JH
5 * This code is derived from software contributed to The DragonFly Project
6 * by Jeffrey M. Hsu.
f23061d4 7 *
380d2ea3
JH
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of The DragonFly Project nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific, prior written permission.
f23061d4 19 *
380d2ea3
JH
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
bf82f9b7
MD
32 */
33
34#include "opt_inet.h"
e99d9a39 35#include "opt_rss.h"
bf82f9b7
MD
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/socket.h>
41#include <sys/socketvar.h>
42#include <sys/thread.h>
43#include <sys/sysctl.h>
2b57d013 44#include <sys/globaldata.h>
bf82f9b7
MD
45
46#include <net/if.h>
5337421c 47#include <net/netisr2.h>
6d5eef1e 48#include <net/toeplitz2.h>
bf82f9b7
MD
49
50#include <netinet/in_systm.h>
51#include <netinet/in.h>
52#include <netinet/in_var.h>
53#include <netinet/in_pcb.h>
54#include <netinet/ip.h>
55#include <netinet/ip_var.h>
56#include <netinet/tcp.h>
57#include <netinet/tcpip.h>
58#include <netinet/tcp_var.h>
59#include <netinet/udp.h>
60#include <netinet/udp_var.h>
61
5cbeb1a5
SZ
62struct initport_index {
63 uint32_t port_index;
64} __cachealign;
65static struct initport_index initport_indices[MAXCPU];
66
c3c96e44
MD
67/*
68 * Toeplitz hash functions - the idea is to match the hardware.
69 */
f861aec9
SZ
70static __inline int
71INP_MPORT_HASH_UDP(in_addr_t faddr, in_addr_t laddr,
72 in_port_t fport, in_port_t lport)
73{
e2bf2ed2
SZ
74 /*
75 * NOTE: laddr could be multicast, since UDP socket could be
76 * bound to multicast address.
77 */
78 if (IN_MULTICAST(ntohl(faddr)) || IN_MULTICAST(ntohl(laddr))) {
79 /* XXX handle multicast on CPU0 for now */
80 return 0;
81 }
b73d4152 82 return toeplitz_hash(toeplitz_rawhash_addr(faddr, laddr));
f861aec9
SZ
83}
84
85static __inline int
86INP_MPORT_HASH_TCP(in_addr_t faddr, in_addr_t laddr,
87 in_port_t fport, in_port_t lport)
88{
b73d4152
SZ
89 return toeplitz_hash(
90 toeplitz_rawhash_addrport(faddr, laddr, fport, lport));
f861aec9
SZ
91}
92
76a9ffca
SZ
93/*
94 * Hash for the network address.
95 */
96int
97tcp_addrhash(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
98{
99 return (INP_MPORT_HASH_TCP(faddr, laddr, fport, lport));
100}
101
102int
103udp_addrhash(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
104{
105 return (INP_MPORT_HASH_UDP(faddr, laddr, fport, lport));
106}
107
c3c96e44
MD
108/*
109 * Map a network address to a processor.
110 */
111int
112tcp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
113{
87561f8f 114 return (netisr_hashcpu(INP_MPORT_HASH_TCP(faddr, laddr, fport, lport)));
c3c96e44
MD
115}
116
117int
118udp_addrcpu(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
119{
1fe8db06
SZ
120 return (netisr_hashcpu(INP_MPORT_HASH_UDP(faddr, laddr, fport, lport)));
121}
122
e9f5b82f
SZ
123/*
124 * If the packet is a valid IP datagram, upon returning of this function
125 * following things are promised:
126 *
dc6a6a0e
SZ
127 * o IP header (including any possible IP options) and any data preceding
128 * IP header (usually link layer header) are in one mbuf (m_len).
e9f5b82f
SZ
129 * o IP header length is not less than the minimum (sizeof(struct ip)).
130 * o IP total length is not less than IP header length.
9b161cc2
SZ
131 * o IP datagram resides completely in the mbuf chain,
132 * i.e. pkthdr.len >= IP total length.
e9f5b82f
SZ
133 *
134 * If the packet is a UDP datagram,
135 * o IP header (including any possible IP options) and UDP header are in
136 * one mbuf (m_len).
137 * o IP total length is not less than (IP header length + UDP header length).
138 *
139 * If the packet is a TCP segment,
140 * o IP header (including any possible IP options) and TCP header (including
141 * any possible TCP options) are in one mbuf (m_len).
142 * o TCP header length is not less than the minimum (sizeof(struct tcphdr)).
143 * o IP total length is not less than (IP header length + TCP header length).
144 */
1e316d14 145boolean_t
c3c96e44 146ip_lengthcheck(struct mbuf **mp, int hoff)
bf82f9b7 147{
1e316d14 148 struct mbuf *m = *mp;
55d829f8 149 struct ip *ip;
c3c96e44 150 int len, iphlen, iplen;
bf82f9b7 151 struct tcphdr *th;
55d829f8 152 int thoff; /* TCP data offset */
bf82f9b7 153
c3c96e44
MD
154 len = hoff + sizeof(struct ip);
155
ead1d3cb 156 /* The packet must be at least the size of an IP header. */
c3c96e44 157 if (m->m_pkthdr.len < len) {
55d829f8 158 ipstat.ips_tooshort++;
dd4df7e9 159 goto fail;
55d829f8
JH
160 }
161
ead1d3cb 162 /* The fixed IP header must reside completely in the first mbuf. */
c3c96e44
MD
163 if (m->m_len < len) {
164 m = m_pullup(m, len);
ead1d3cb
JH
165 if (m == NULL) {
166 ipstat.ips_toosmall++;
dd4df7e9 167 goto fail;
ead1d3cb 168 }
bf82f9b7 169 }
ead1d3cb 170
c3c96e44 171 ip = mtodoff(m, struct ip *, hoff);
55d829f8 172
ead1d3cb 173 /* Bound check the packet's stated IP header length. */
9eeaa8a9 174 iphlen = ip->ip_hl << 2;
9babcab8
JH
175 if (iphlen < sizeof(struct ip)) { /* minimum header length */
176 ipstat.ips_badhlen++;
dd4df7e9 177 goto fail;
9babcab8 178 }
ead1d3cb
JH
179
180 /* The full IP header must reside completely in the one mbuf. */
c3c96e44
MD
181 if (m->m_len < hoff + iphlen) {
182 m = m_pullup(m, hoff + iphlen);
b01ae44a
MD
183 if (m == NULL) {
184 ipstat.ips_badhlen++;
dd4df7e9 185 goto fail;
b01ae44a 186 }
c3c96e44 187 ip = mtodoff(m, struct ip *, hoff);
b01ae44a
MD
188 }
189
ead1d3cb
JH
190 iplen = ntohs(ip->ip_len);
191
9b161cc2
SZ
192 /*
193 * Check that the amount of data in the buffers is as
194 * at least much as the IP header would have us expect.
195 */
c3c96e44 196 if (m->m_pkthdr.len < hoff + iplen) {
9b161cc2
SZ
197 ipstat.ips_tooshort++;
198 goto fail;
199 }
200
ead1d3cb
JH
201 /*
202 * Fragments other than the first fragment don't have much
203 * length information.
204 */
8a93af2a 205 if (ip->ip_off & htons(IP_OFFMASK))
ead1d3cb
JH
206 goto ipcheckonly;
207
b01ae44a
MD
208 /*
209 * The TCP/IP or UDP/IP header must be entirely contained within
210 * the first fragment of a packet. Packet filters will break if they
211 * aren't.
ee7990a0
MD
212 *
213 * Since the packet will be trimmed to ip_len we must also make sure
214 * the potentially trimmed down length is still sufficient to hold
215 * the header(s).
b01ae44a 216 */
ead1d3cb
JH
217 switch (ip->ip_p) {
218 case IPPROTO_TCP:
219 if (iplen < iphlen + sizeof(struct tcphdr)) {
220 ++tcpstat.tcps_rcvshort;
dd4df7e9 221 goto fail;
ead1d3cb 222 }
c3c96e44
MD
223 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) {
224 m = m_pullup(m, hoff + iphlen + sizeof(struct tcphdr));
ead1d3cb
JH
225 if (m == NULL) {
226 tcpstat.tcps_rcvshort++;
dd4df7e9 227 goto fail;
ee7990a0 228 }
c3c96e44 229 ip = mtodoff(m, struct ip *, hoff);
b01ae44a 230 }
9eeaa8a9 231 th = (struct tcphdr *)((caddr_t)ip + iphlen);
55d829f8 232 thoff = th->th_off << 2;
096deca3 233 if (thoff < sizeof(struct tcphdr) ||
cd5e5855 234 thoff + iphlen > ntohs(ip->ip_len)) {
9eeaa8a9 235 tcpstat.tcps_rcvbadoff++;
dd4df7e9 236 goto fail;
9eeaa8a9 237 }
c3c96e44
MD
238 if (m->m_len < hoff + iphlen + thoff) {
239 m = m_pullup(m, hoff + iphlen + thoff);
55d829f8
JH
240 if (m == NULL) {
241 tcpstat.tcps_rcvshort++;
dd4df7e9 242 goto fail;
55d829f8 243 }
bf82f9b7 244 }
1e316d14 245 break;
ead1d3cb
JH
246 case IPPROTO_UDP:
247 if (iplen < iphlen + sizeof(struct udphdr)) {
6e78e7fe 248 ++udp_stat.udps_hdrops;
dd4df7e9 249 goto fail;
ead1d3cb 250 }
c3c96e44
MD
251 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) {
252 m = m_pullup(m, hoff + iphlen + sizeof(struct udphdr));
ead1d3cb 253 if (m == NULL) {
6e78e7fe 254 udp_stat.udps_hdrops++;
dd4df7e9 255 goto fail;
ead1d3cb
JH
256 }
257 }
258 break;
259 default:
260ipcheckonly:
261 if (iplen < iphlen) {
262 ++ipstat.ips_badlen;
dd4df7e9 263 goto fail;
ead1d3cb
JH
264 }
265 break;
1e316d14
JH
266 }
267
8697599b 268 m->m_flags |= M_LENCHECKED;
1e316d14
JH
269 *mp = m;
270 return TRUE;
dd4df7e9
SZ
271
272fail:
273 if (m != NULL)
274 m_freem(m);
275 *mp = NULL;
276 return FALSE;
1e316d14 277}
bf82f9b7 278
1e316d14 279/*
c3c96e44
MD
280 * Assign a protocol processing thread to a packet. The IP header is at
281 * offset (hoff) in the packet (i.e. the mac header might still be intact).
282 *
283 * This function can blow away the mbuf if the packet is malformed.
1e316d14 284 */
c3c96e44 285void
b6615c82 286ip_hashfn(struct mbuf **mptr, int hoff)
1e316d14
JH
287{
288 struct ip *ip;
289 int iphlen;
290 struct tcphdr *th;
291 struct udphdr *uh;
292 struct mbuf *m;
87561f8f 293 int hash;
1e316d14 294
be0bea4a
SZ
295 if (((*mptr)->m_flags & M_LENCHECKED) == 0) {
296 if (!ip_lengthcheck(mptr, hoff))
297 return;
298 }
1e316d14
JH
299
300 m = *mptr;
c3c96e44 301 ip = mtodoff(m, struct ip *, hoff);
1e316d14
JH
302 iphlen = ip->ip_hl << 2;
303
8a93af2a 304 if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
aecff6d1 305 hash = toeplitz_hash(toeplitz_rawhash_addr(
8a93af2a 306 ip->ip_src.s_addr, ip->ip_dst.s_addr));
97a43e72
SZ
307 goto back;
308 }
1e316d14
JH
309
310 switch (ip->ip_p) {
311 case IPPROTO_TCP:
312 th = (struct tcphdr *)((caddr_t)ip + iphlen);
87561f8f
SZ
313 hash = INP_MPORT_HASH_TCP(ip->ip_src.s_addr, ip->ip_dst.s_addr,
314 th->th_sport, th->th_dport);
bf82f9b7 315 break;
f861aec9 316
bf82f9b7 317 case IPPROTO_UDP:
9eeaa8a9 318 uh = (struct udphdr *)((caddr_t)ip + iphlen);
87561f8f
SZ
319 hash = INP_MPORT_HASH_UDP(ip->ip_src.s_addr, ip->ip_dst.s_addr,
320 uh->uh_sport, uh->uh_dport);
bf82f9b7 321 break;
f861aec9 322
bf82f9b7 323 default:
87561f8f 324 hash = 0;
bf82f9b7
MD
325 break;
326 }
97a43e72 327back:
7558541b 328 m_sethash(m, hash);
bf82f9b7
MD
329}
330
8a5c0ed6 331/*
e6f77b88
SZ
332 * Verify and adjust the hash value of the packet.
333 *
ca86d83e 334 * Unlike ip_hashfn(), the packet content is not accessed. The packet info
e6f77b88 335 * (pi) and the hash of the packet (m_pkthdr.hash) are used instead.
8a5c0ed6
SZ
336 *
337 * Caller has already made sure that m_pkthdr.hash is valid, i.e. m_flags
338 * has M_HASH set.
339 */
e6f77b88
SZ
340void
341ip_hashcheck(struct mbuf *m, const struct pktinfo *pi)
8a5c0ed6 342{
ed20d0e3 343 KASSERT((m->m_flags & M_HASH), ("no valid packet hash"));
8a5c0ed6 344
8a5c0ed6
SZ
345 switch (pi->pi_l3proto) {
346 case IPPROTO_TCP:
8a5c0ed6 347 case IPPROTO_UDP:
8a5c0ed6
SZ
348 break;
349
350 default:
e6f77b88
SZ
351 /* Let software calculate the hash */
352 m->m_flags &= ~M_HASH;
8a5c0ed6
SZ
353 break;
354 }
8a5c0ed6
SZ
355}
356
48e7b118
MD
357/*
358 * This is used to map a socket to a message port for sendmsg() and friends.
359 * It is not called for any other purpose. In the case of TCP we just return
360 * the port already installed in the socket.
361 */
362lwkt_port_t
363tcp_soport(struct socket *so, struct sockaddr *nam,
364 struct mbuf **dummy __unused)
365{
366 return(so->so_port);
bf82f9b7
MD
367}
368
e3873585
SZ
369/*
370 * Used to route icmp messages to the proper protocol thread for ctlinput
371 * operation.
372 */
373lwkt_port_t
130b7902 374tcp_ctlport(int cmd, struct sockaddr *sa, void *vip, int *cpuid)
e3873585
SZ
375{
376 struct ip *ip = vip;
ffb15150 377 inp_notify_t notify;
130b7902 378 int arg;
ffb15150 379
130b7902 380 notify = tcp_get_inpnotify(cmd, sa, &arg, &ip, cpuid);
ffb15150
SZ
381 if (notify == NULL)
382 return NULL;
e3873585 383
43dbcc2a 384 if (*cpuid == netisr_ncpus) {
14572273 385 /*
43dbcc2a 386 * Go through all effective netisr CPUs.
ffb15150 387 *
8e0ec33b
SZ
388 * A new message will be allocated later to save necessary
389 * information and will be forwarded to all network protocol
390 * threads in the following way:
14572273 391 *
8e0ec33b
SZ
392 * (the the thread owns the msgport that we return here)
393 * netisr0 <--+
394 * | |
395 * | |
396 * | |
397 * +-------+
398 * sendmsg
399 * [msg is kmalloc()ed]
400 *
401 *
402 * Later on, when the msg is received by netisr0:
403 *
404 * forwardmsg forwardmsg
405 * netisr0 ---------> netisr1 ---------> netisrN
406 * [msg is kfree()ed]
14572273 407 */
130b7902 408 return netisr_cpuport(0);
e3873585 409 } else {
130b7902 410 return netisr_cpuport(*cpuid);
e3873585 411 }
e3873585
SZ
412}
413
7fe56515
JH
414lwkt_port_t
415tcp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
416{
ec7f7fc8 417 return(netisr_cpuport(tcp_addrcpu(faddr, fport, laddr, lport)));
7fe56515
JH
418}
419
65f3e756
MD
420lwkt_port_t
421tcp_addrport0(void)
422{
ec7f7fc8 423 return(netisr_cpuport(0));
65f3e756
MD
424}
425
48e7b118
MD
426lwkt_port_t
427udp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
428{
ec7f7fc8 429 return(netisr_cpuport(udp_addrcpu(faddr, fport, laddr, lport)));
48e7b118
MD
430}
431
e3873585
SZ
432/*
433 * Used to route icmp messages to the proper protocol thread for ctlinput
434 * operation.
435 */
436lwkt_port_t
130b7902 437udp_ctlport(int cmd, struct sockaddr *sa, void *vip, int *cpuid)
e3873585
SZ
438{
439 struct ip *ip = vip;
dfa15443 440 inp_notify_t notify;
e3873585 441
130b7902 442 notify = udp_get_inpnotify(cmd, sa, &ip, cpuid);
dfa15443
SZ
443 if (notify == NULL)
444 return NULL;
445
43dbcc2a 446 if (*cpuid == netisr_ncpus) {
14572273 447 /*
43dbcc2a 448 * Go through all effective netisr CPUs.
dfa15443 449 *
c8d29187 450 * See the comment in tcp_ctlport.
14572273 451 */
130b7902 452 return netisr_cpuport(0);
e3873585 453 } else {
130b7902 454 return netisr_cpuport(*cpuid);
e3873585 455 }
bf82f9b7 456}
ee0be9ca 457
5cbeb1a5 458static __inline struct lwkt_port *
bb36fa2f 459inp_initport(void)
5cbeb1a5
SZ
460{
461 int cpu = mycpuid;
462
bb36fa2f 463 if (cpu < netisr_ncpus) {
5cbeb1a5
SZ
464 return netisr_cpuport(cpu);
465 } else {
466 return netisr_cpuport(
bb36fa2f
SZ
467 ((initport_indices[cpu].port_index++) + (uint32_t)cpu) %
468 netisr_ncpus);
5cbeb1a5
SZ
469 }
470}
471
ee0be9ca
SZ
/*
 * Return the message port chosen by inp_initport().
 */
struct lwkt_port *
tcp_initport(void)
{
	struct lwkt_port *port = inp_initport();

	return port;
}
be4519a2
SZ
477
/*
 * Return the message port chosen by inp_initport().
 */
struct lwkt_port *
udp_initport(void)
{
	struct lwkt_port *port = inp_initport();

	return port;
}