ipfw: Add ipfrag filter.
[dragonfly.git] / sys / net / ipfw / ip_fw2.c
984263bc
MD
1/*
2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26 */
27
984263bc
MD
28/*
29 * Implement IP packet firewall (new version)
30 */
31
984263bc 32#include "opt_ipfw.h"
984263bc
MD
33#include "opt_inet.h"
34#ifndef INET
35#error IPFIREWALL requires INET.
36#endif /* INET */
984263bc 37
984263bc
MD
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/kernel.h>
43#include <sys/proc.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/ucred.h>
3f9db7f8 49#include <sys/in_cksum.h>
2187815d 50#include <sys/limits.h>
997a0e9a 51#include <sys/lock.h>
2187815d 52#include <sys/tree.h>
0049e46a 53
984263bc
MD
54#include <net/if.h>
55#include <net/route.h>
8c6081b9 56#include <net/pfil.h>
b6d66a39 57#include <net/dummynet/ip_dummynet.h>
0049e46a 58
684a93c4
MD
59#include <sys/thread2.h>
60#include <sys/mplock2.h>
61#include <net/netmsg2.h>
62
984263bc
MD
63#include <netinet/in.h>
64#include <netinet/in_systm.h>
65#include <netinet/in_var.h>
66#include <netinet/in_pcb.h>
67#include <netinet/ip.h>
68#include <netinet/ip_var.h>
69#include <netinet/ip_icmp.h>
984263bc 70#include <netinet/tcp.h>
c8c63108 71#include <netinet/tcp_seq.h>
984263bc
MD
72#include <netinet/tcp_timer.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcpip.h>
75#include <netinet/udp.h>
76#include <netinet/udp_var.h>
68edaf54 77#include <netinet/ip_divert.h>
984263bc
MD
78#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79
b6d66a39
SZ
80#include <net/ipfw/ip_fw2.h>
81
8d0865c8
SZ
82#ifdef IPFIREWALL_DEBUG
83#define DPRINTF(fmt, ...) \
84do { \
85 if (fw_debug > 0) \
86 kprintf(fmt, __VA_ARGS__); \
87} while (0)
88#else
89#define DPRINTF(fmt, ...) ((void)0)
90#endif
91
ca12e259
SZ
92/*
93 * Description of per-CPU rule duplication:
94 *
95 * Module loading/unloading and all ioctl operations are serialized
96 * by netisr0, so we don't have any ordering or locking problems.
97 *
98 * Following graph shows how operation on per-CPU rule list is
99 * performed [2 CPU case]:
100 *
101 * CPU0 CPU1
102 *
103 * netisr0 <------------------------------------+
104 * domsg |
388cb6c6 105 * : |
ca12e259
SZ
106 * :(delete/add...) |
107 * : |
388cb6c6
SZ
108 * : netmsg | netmsg
109 * forwardmsg---------->netisr1 |
ca12e259
SZ
110 * : |
111 * :(delete/add...) |
112 * : |
113 * : |
114 * replymsg--------------+
115 *
116 *
117 *
2187815d 118 * Rule structure [2 CPU case]
ca12e259
SZ
119 *
120 * CPU0 CPU1
388cb6c6 121 *
ca12e259
SZ
122 * layer3_chain layer3_chain
123 * | |
124 * V V
125 * +-------+ sibling +-------+ sibling
126 * | rule1 |--------->| rule1 |--------->NULL
127 * +-------+ +-------+
128 * | |
129 * |next |next
130 * V V
131 * +-------+ sibling +-------+ sibling
132 * | rule2 |--------->| rule2 |--------->NULL
133 * +-------+ +-------+
134 *
135 * ip_fw.sibling:
136 * 1) Ease statistics calculation during IP_FW_GET. We only need to
388cb6c6
SZ
137 * iterate layer3_chain in netisr0; the current rule's duplicates
138 * on the other CPUs can safely be accessed read-only through
139 * ip_fw.sibling.
ca12e259 140 * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
388cb6c6
SZ
141 * a) In netisr0 rule3 is determined to be inserted between rule1
142 * and rule2. To make this decision we need to iterate the
143 * layer3_chain in netisr0. The netmsg, which is used to insert
144 * the rule, will contain rule1 in netisr0 as prev_rule and rule2
145 * in netisr0 as next_rule.
146 * b) After the insertion in netisr0 is done, we will move on to
147 * netisr1. But instead of locating rule3's position in
148 * netisr1 by iterating the layer3_chain in netisr1, we set the
149 * netmsg's prev_rule to rule1->sibling and next_rule to
150 * rule2->sibling before the netmsg is forwarded to netisr1 from
151 * netisr0.
2187815d
SZ
152 */
153
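/*
 * Editor's illustrative sketch (not part of the original file): one way
 * the per-CPU duplication described above can be driven from a netmsg
 * handler.  The example_* helpers are hypothetical; the real handlers
 * appear later in this file.
 */
#if 0
static void
example_add_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ip_fw *rule;

	/* Duplicate the rule for this CPU, between prev_rule and next_rule. */
	rule = example_rule_dup(fwmsg->ioc_rule);
	example_rule_link(fwmsg->prev_rule, rule, fwmsg->next_rule);

	/* Chain this CPU's copy to the previous CPU's copy via sibling. */
	if (fwmsg->sibling != NULL)
		fwmsg->sibling->sibling = rule;
	fwmsg->sibling = rule;

	/*
	 * Use the siblings of the insertion hints, so the next netisr
	 * does not have to iterate its own layer3_chain.
	 */
	if (fwmsg->prev_rule != NULL)
		fwmsg->prev_rule = fwmsg->prev_rule->sibling;
	if (fwmsg->next_rule != NULL)
		fwmsg->next_rule = fwmsg->next_rule->sibling;

	/* Forward to the next netisr (the last netisr replies instead). */
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}
#endif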
154/*
155 * Description of states and tracks.
156 *
157 * Both states and tracks are stored in per-cpu RB trees instead of
158 * per-cpu hash tables to avoid the worst case hash degeneration.
159 *
160 * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161 * measured in seconds and depending on the flags.
162 *
163 * When a packet is received, its address fields are first masked with
164 * the mask defined for the rule, then matched against the entries in
165 * the per-cpu state RB tree. States are generated by 'keep-state'
166 * and 'limit' options.
388cb6c6 167 *
2187815d
SZ
168 * The max number of states is ipfw_state_max. When we reach the
169 * maximum number of states, we do not create any more. This is done
170 * to avoid consuming too much memory, and too much time when
171 * searching on each packet.
388cb6c6 172 *
2187815d
SZ
173 * Each state holds a pointer to the parent ipfw rule of the current
174 * CPU so we know what action to perform. States are removed when the
175 * parent rule is deleted. XXX we should make them survive.
ca12e259 176 *
2187815d
SZ
177 * There are some limitations with states -- we do not obey the
178 * 'randomized match', and we do not do multiple passes through the
179 * firewall. XXX check the latter!!!
ca12e259 180 *
2187815d 181 * States grow independently on each CPU, e.g. 2 CPU case:
ca12e259 182 *
2187815d
SZ
183 * CPU0 CPU1
184 * ................... ...................
185 * : state RB tree : : state RB tree :
186 * : : : :
187 * : state1 state2 : : state3 :
188 * : | | : : | :
189 * :.....|....|......: :........|........:
190 * | | |
191 * | | |st_rule
192 * | | |
193 * V V V
194 * +-------+ +-------+
195 * | rule1 | | rule1 |
196 * +-------+ +-------+
ca12e259 197 *
2187815d
SZ
198 * Tracks are used to enforce limits on the number of sessions. Tracks
199 * are generated by 'limit' option.
200 *
201 * The max number of tracks is ipfw_track_max. When we reach the
202 * maximum number of tracks, we do not create any more. This is done
203 * to avoid consuming too much memory.
204 *
205 * Tracks are organized into two layers: the track counter RB tree is
206 * shared between CPUs, while the track RB tree is per-cpu. States
207 * generated by the 'limit' option are linked to the track in addition
208 * to the per-cpu state RB tree, mainly to ease expiration. e.g. 2 CPU case:
209 *
210 * ..............................
211 * : track counter RB tree :
212 * : :
213 * : +-----------+ :
214 * : | trkcnt1 | :
215 * : | | :
216 * : +--->counter<----+ :
217 * : | | | | :
218 * : | +-----------+ | :
219 * :......|................|....:
220 * | |
221 * CPU0 | | CPU1
222 * ................. |t_count | .................
223 * : track RB tree : | | : track RB tree :
224 * : : | | : :
225 * : +-->track1-------+ +--------track2 :
226 * : | A : : :
227 * : | | : : :
228 * :.|.....|.......: :...............:
229 * | +----------------+
230 * | .................... |
231 * | : state RB tree : |st_track
232 * | : : |
233 * +---state1 state2---+
234 * : | | :
235 * :.....|.......|....:
236 * | |
237 * | |st_rule
238 * V V
239 * +----------+
240 * | rule1 |
241 * +----------+
ca12e259
SZ
242 */
243
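/*
 * Editor's illustrative sketch (not part of the original file): how a
 * 'limit' rule consults the shared track counter before installing a
 * new state.  example_limit_check() is hypothetical; the real logic
 * lives in the track/state management code later in this file.
 */
#if 0
static int
example_limit_check(struct ipfw_track *t, int limit)
{
	/* t->t_count points into the trkcnt counter shared by all CPUs. */
	if (*t->t_count >= limit)
		return (1);			/* over limit, deny */
	atomic_add_int(t->t_count, 1);		/* charge the new state */
	return (0);
}
#endif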
9fabc2ac
SZ
244#define IPFW_AUTOINC_STEP_MIN 1
245#define IPFW_AUTOINC_STEP_MAX 1000
246#define IPFW_AUTOINC_STEP_DEF 100
247
d938108c
SZ
248#define IPFW_TABLE_MAX_DEF 64
249
ca12e259
SZ
250#define IPFW_DEFAULT_RULE 65535 /* rulenum for the default rule */
251#define IPFW_DEFAULT_SET 31 /* set number for the default rule */
984263bc 252
2187815d
SZ
253#define MATCH_REVERSE 0
254#define MATCH_FORWARD 1
255#define MATCH_NONE 2
256#define MATCH_UNKNOWN 3
257
258#define IPFW_STATE_TCPFLAGS (TH_SYN | TH_FIN | TH_RST)
259#define IPFW_STATE_TCPSTATES (IPFW_STATE_TCPFLAGS | \
260 (IPFW_STATE_TCPFLAGS << 8))
261
262#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
263#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
264#define BOTH_RST (TH_RST | (TH_RST << 8))
265/* TH_ACK here means FIN was ACKed. */
266#define BOTH_FINACK (TH_ACK | (TH_ACK << 8))
267
268#define IPFW_STATE_TCPCLOSED(s) ((s)->st_proto == IPPROTO_TCP && \
269 (((s)->st_state & BOTH_RST) || \
270 ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
271
272#define O_ANCHOR O_NOP
273
ca12e259 274struct netmsg_ipfw {
2187815d 275 struct netmsg_base base;
ca12e259 276 const struct ipfw_ioc_rule *ioc_rule;
2187815d
SZ
277 struct ip_fw *next_rule;
278 struct ip_fw *prev_rule;
279 struct ip_fw *sibling;
280 uint32_t rule_flags;
6998b243 281 struct ip_fw **cross_rules;
ca12e259
SZ
282};
283
284struct netmsg_del {
2187815d
SZ
285 struct netmsg_base base;
286 struct ip_fw *start_rule;
287 struct ip_fw *prev_rule;
288 uint16_t rulenum;
289 uint8_t from_set;
290 uint8_t to_set;
ca12e259
SZ
291};
292
293struct netmsg_zent {
2187815d
SZ
294 struct netmsg_base base;
295 struct ip_fw *start_rule;
296 uint16_t rulenum;
297 uint16_t log_only;
298};
299
300struct netmsg_cpstate {
301 struct netmsg_base base;
302 struct ipfw_ioc_state *ioc_state;
303 int state_cntmax;
304 int state_cnt;
305};
306
d938108c
SZ
307struct netmsg_tblent {
308 struct netmsg_base base;
309 struct sockaddr *key;
310 struct sockaddr *netmask;
311 struct ipfw_tblent *sibling;
312 int tableid;
313};
314
315struct netmsg_tblflush {
316 struct netmsg_base base;
317 int tableid;
318 int destroy;
319};
320
321struct netmsg_tblexp {
322 struct netmsg_base base;
323 time_t expire;
324 int tableid;
325 int cnt;
326 int expcnt;
327 struct radix_node_head *rnh;
328};
329
330struct ipfw_table_cp {
331 struct ipfw_ioc_tblent *te;
332 int te_idx;
333 int te_cnt;
334};
335
2187815d
SZ
336struct ipfw_addrs {
337 uint32_t addr1;
338 uint32_t addr2;
339};
340
341struct ipfw_ports {
342 uint16_t port1;
343 uint16_t port2;
344};
345
346struct ipfw_key {
347 union {
348 struct ipfw_addrs addrs;
349 uint64_t value;
350 } addr_u;
351 union {
352 struct ipfw_ports ports;
353 uint32_t value;
354 } port_u;
355 uint8_t proto;
356 uint8_t swap; /* IPFW_KEY_SWAP_ */
357 uint16_t rsvd2;
358};
359
360#define IPFW_KEY_SWAP_ADDRS 0x1
361#define IPFW_KEY_SWAP_PORTS 0x2
362#define IPFW_KEY_SWAP_ALL (IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
363
364struct ipfw_trkcnt {
365 RB_ENTRY(ipfw_trkcnt) tc_rblink;
366 struct ipfw_key tc_key;
367 uintptr_t tc_ruleid;
368 int tc_refs;
369 int tc_count;
370 time_t tc_expire; /* userland get-only */
371 uint16_t tc_rulenum; /* userland get-only */
372} __cachealign;
373
374#define tc_addrs tc_key.addr_u.value
375#define tc_ports tc_key.port_u.value
376#define tc_proto tc_key.proto
377#define tc_saddr tc_key.addr_u.addrs.addr1
378#define tc_daddr tc_key.addr_u.addrs.addr2
379#define tc_sport tc_key.port_u.ports.port1
380#define tc_dport tc_key.port_u.ports.port2
381
382RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
383
384struct ipfw_state;
385
386struct ipfw_track {
387 RB_ENTRY(ipfw_track) t_rblink;
388 struct ipfw_key t_key;
389 struct ip_fw *t_rule;
390 time_t t_lastexp;
391 LIST_HEAD(, ipfw_state) t_state_list;
392 time_t t_expire;
393 volatile int *t_count;
394 struct ipfw_trkcnt *t_trkcnt;
395 TAILQ_ENTRY(ipfw_track) t_link;
ca12e259
SZ
396};
397
2187815d
SZ
398#define t_addrs t_key.addr_u.value
399#define t_ports t_key.port_u.value
400#define t_proto t_key.proto
401#define t_saddr t_key.addr_u.addrs.addr1
402#define t_daddr t_key.addr_u.addrs.addr2
403#define t_sport t_key.port_u.ports.port1
404#define t_dport t_key.port_u.ports.port2
405
406RB_HEAD(ipfw_track_tree, ipfw_track);
407TAILQ_HEAD(ipfw_track_list, ipfw_track);
408
409struct ipfw_state {
410 RB_ENTRY(ipfw_state) st_rblink;
411 struct ipfw_key st_key;
412
413 time_t st_expire; /* expire time */
414 struct ip_fw *st_rule;
415
416 uint64_t st_pcnt; /* packets */
417 uint64_t st_bcnt; /* bytes */
418
419 /*
420 * st_state:
421 * State of this rule, typically a combination of TCP flags.
422 *
423 * st_ack_fwd/st_ack_rev:
424 * Most recent ACKs in forward and reverse direction. They
425 * are used to generate keepalives.
426 */
427 uint32_t st_state;
428 uint32_t st_ack_fwd;
429 uint32_t st_seq_fwd;
430 uint32_t st_ack_rev;
431 uint32_t st_seq_rev;
432
433 uint16_t st_flags; /* IPFW_STATE_F_ */
434 uint16_t st_type; /* O_KEEP_STATE/O_LIMIT */
435 struct ipfw_track *st_track;
436
437 LIST_ENTRY(ipfw_state) st_trklink;
438 TAILQ_ENTRY(ipfw_state) st_link;
439};
440
441#define st_addrs st_key.addr_u.value
442#define st_ports st_key.port_u.value
443#define st_proto st_key.proto
444#define st_swap st_key.swap
445
446#define IPFW_STATE_F_ACKFWD 0x0001
447#define IPFW_STATE_F_SEQFWD 0x0002
448#define IPFW_STATE_F_ACKREV 0x0004
449#define IPFW_STATE_F_SEQREV 0x0008
450
451TAILQ_HEAD(ipfw_state_list, ipfw_state);
452RB_HEAD(ipfw_state_tree, ipfw_state);
453
d938108c
SZ
454struct ipfw_tblent {
455 struct radix_node te_nodes[2];
456 struct sockaddr_in te_key;
457 u_long te_use;
458 time_t te_lastuse;
459 struct ipfw_tblent *te_sibling;
460 volatile int te_expired;
461};
462
ca12e259 463struct ipfw_context {
2187815d
SZ
464 struct ip_fw *ipfw_layer3_chain; /* rules for layer3 */
465 struct ip_fw *ipfw_default_rule; /* default rule */
466 uint64_t ipfw_norule_counter; /* ipfw_log(NULL) stat */
ca12e259
SZ
467
468 /*
469 * ipfw_set_disable contains one bit per set value (0..31).
470 * If the bit is set, all rules with the corresponding set
471 * are disabled. Set IPFW_DEFAULT_SET is reserved for the
472 * default rule and CANNOT be disabled.
473 */
2187815d
SZ
474 uint32_t ipfw_set_disable;
475
476 uint8_t ipfw_flags; /* IPFW_FLAG_ */
477
6998b243
SZ
478 struct ip_fw *ipfw_cont_rule;
479
2187815d
SZ
480 struct ipfw_state_tree ipfw_state_tree;
481 struct ipfw_state_list ipfw_state_list;
482 int ipfw_state_loosecnt;
483 int ipfw_state_cnt;
484
485 union {
486 struct ipfw_state state;
487 struct ipfw_track track;
488 struct ipfw_trkcnt trkcnt;
489 } ipfw_tmpkey;
490
491 struct ipfw_track_tree ipfw_track_tree;
492 struct ipfw_track_list ipfw_track_list;
493 struct ipfw_trkcnt *ipfw_trkcnt_spare;
494
495 struct callout ipfw_stateto_ch;
496 time_t ipfw_state_lastexp;
497 struct netmsg_base ipfw_stateexp_nm;
498 struct netmsg_base ipfw_stateexp_more;
499 struct ipfw_state ipfw_stateexp_anch;
500
501 struct callout ipfw_trackto_ch;
502 time_t ipfw_track_lastexp;
503 struct netmsg_base ipfw_trackexp_nm;
504 struct netmsg_base ipfw_trackexp_more;
505 struct ipfw_track ipfw_trackexp_anch;
506
507 struct callout ipfw_keepalive_ch;
508 struct netmsg_base ipfw_keepalive_nm;
509 struct netmsg_base ipfw_keepalive_more;
510 struct ipfw_state ipfw_keepalive_anch;
511
512 /*
513 * Statistics
514 */
515 u_long ipfw_sts_reap;
516 u_long ipfw_sts_reapfailed;
517 u_long ipfw_sts_overflow;
518 u_long ipfw_sts_nomem;
519 u_long ipfw_sts_tcprecycled;
520
521 u_long ipfw_tks_nomem;
522 u_long ipfw_tks_reap;
523 u_long ipfw_tks_reapfailed;
524 u_long ipfw_tks_overflow;
525 u_long ipfw_tks_cntnomem;
d938108c 526
6998b243
SZ
527 u_long ipfw_frags;
528 u_long ipfw_defraged;
529 u_long ipfw_defrag_remote;
530
d938108c
SZ
531 /* Last field */
532 struct radix_node_head *ipfw_tables[];
ca12e259
SZ
533};
534
2187815d
SZ
535#define IPFW_FLAG_KEEPALIVE 0x01
536#define IPFW_FLAG_STATEEXP 0x02
537#define IPFW_FLAG_TRACKEXP 0x04
538#define IPFW_FLAG_STATEREAP 0x08
539#define IPFW_FLAG_TRACKREAP 0x10
540
541#define ipfw_state_tmpkey ipfw_tmpkey.state
542#define ipfw_track_tmpkey ipfw_tmpkey.track
543#define ipfw_trkcnt_tmpkey ipfw_tmpkey.trkcnt
544
545struct ipfw_global {
546 int ipfw_state_loosecnt; /* cache aligned */
547 time_t ipfw_state_globexp __cachealign;
548
549 struct lwkt_token ipfw_trkcnt_token __cachealign;
550 struct ipfw_trkcnt_tree ipfw_trkcnt_tree;
551 int ipfw_trkcnt_cnt;
552 time_t ipfw_track_globexp;
984263bc 553
6998b243
SZ
554 /* Accessed in netisr0. */
555 struct ip_fw *ipfw_crossref_free __cachealign;
556 struct callout ipfw_crossref_ch;
557 struct netmsg_base ipfw_crossref_nm;
558
84a3e25a 559#ifdef KLD_MODULE
2187815d
SZ
560 /*
561 * The module cannot be unloaded if there are references to
562 * certain rules of ipfw(4), e.g. by dummynet(4).
563 */
564 int ipfw_refcnt __cachealign;
84a3e25a 565#endif
2187815d
SZ
566} __cachealign;
567
568static struct ipfw_context *ipfw_ctx[MAXCPU];
84a3e25a 569
ca12e259 570MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
984263bc
MD
571
572/*
388cb6c6
SZ
573 * Following two global variables are accessed and updated only
574 * in netisr0.
984263bc 575 */
ca12e259
SZ
576static uint32_t static_count; /* # of static rules */
577static uint32_t static_ioc_len; /* bytes of static rules */
984263bc 578
ca12e259
SZ
579/*
580 * If 1, then ipfw static rules are being flushed and
581 * ipfw_chk() will skip to the default rule.
582 */
583static int ipfw_flushing;
584
585static int fw_verbose;
586static int verbose_limit;
984263bc 587
8d0865c8 588static int fw_debug;
9fabc2ac 589static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
984263bc 590
d938108c
SZ
591static int ipfw_table_max = IPFW_TABLE_MAX_DEF;
592
2803ec4a 593static int ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
c1aa76bb
SZ
594static int ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
595
d938108c
SZ
596TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
597
984263bc 598SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
2187815d
SZ
599SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
600 "Firewall statistics");
601
2803ec4a
SZ
602SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
603 &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
c1aa76bb
SZ
604SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
605 &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
606 "Rule number autincrement step");
984263bc
MD
607SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
608 &fw_one_pass, 0,
609 "Only do a single pass through ipfw when using dummynet(4)");
610SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
611 &fw_debug, 0, "Enable printing of debug ip_fw statements");
612SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
613 &fw_verbose, 0, "Log matches to ipfw rules");
614SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
615 &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
d938108c
SZ
616SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
617 &ipfw_table_max, 0, "Max # of tables");
984263bc 618
2187815d
SZ
619static int ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
620static int ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
621static int ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
622static int ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
623static int ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
624static int ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
984263bc
MD
625
626/*
2187815d 627 * Timeouts for various events in handing states.
5331fda4
SZ
628 *
629 * NOTE:
630 * 1 == 0~1 second.
631 * 2 == 1~2 second(s).
632 *
633 * We use 2 seconds for the FIN lifetime, so that the states will
634 * not be reaped prematurely.
984263bc 635 */
a998c492
SZ
636static uint32_t dyn_ack_lifetime = 300;
637static uint32_t dyn_syn_lifetime = 20;
2187815d 638static uint32_t dyn_finwait_lifetime = 20;
5331fda4 639static uint32_t dyn_fin_lifetime = 2;
2187815d 640static uint32_t dyn_rst_lifetime = 2;
a998c492 641static uint32_t dyn_udp_lifetime = 10;
2187815d 642static uint32_t dyn_short_lifetime = 5; /* used by tracks too */
984263bc
MD
643
644/*
645 * Keepalives are sent if dyn_keepalive is set. They are sent every
646 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
647 * seconds of lifetime of a rule.
984263bc 648 */
a998c492
SZ
649static uint32_t dyn_keepalive_interval = 20;
650static uint32_t dyn_keepalive_period = 5;
651static uint32_t dyn_keepalive = 1; /* do send keepalives */
984263bc 652
2187815d
SZ
653static struct ipfw_global ipfw_gd;
654static int ipfw_state_loosecnt_updthr;
655static int ipfw_state_max = 4096; /* max # of states */
656static int ipfw_track_max = 4096; /* max # of tracks */
657
658static int ipfw_state_headroom; /* setup at module load time */
659static int ipfw_state_reap_min = 8;
660static int ipfw_state_expire_max = 32;
661static int ipfw_state_scan_max = 256;
662static int ipfw_keepalive_max = 8;
663static int ipfw_track_reap_max = 4;
664static int ipfw_track_expire_max = 16;
665static int ipfw_track_scan_max = 128;
666
667/* Compat */
668SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
669 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
670 "Number of states and tracks");
671SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
672 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
673 "Max number of states and tracks");
674
675SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
676 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
677 "Number of states");
678SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
679 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
680 "Max number of states");
681SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
682 &ipfw_state_headroom, 0, "headroom for state reap");
683SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
684 &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
685SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
686 &ipfw_track_max, 0, "Max number of tracks");
984263bc
MD
687SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
688 &static_count, 0, "Number of static rules");
689SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
690 &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
691SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
692 &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
2187815d
SZ
693SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
694 &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
695SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
696 &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
697SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
698 &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
984263bc
MD
699SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
700 &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
701SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
702 &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
703SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
704 &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
2187815d
SZ
705SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
706 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
707 "I", "# of states to scan for each expire iteration");
708SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
709 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
710 "I", "# of states to expire for each expire iteration");
711SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
712 CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
713 "I", "# of states to expire for each expire iteration");
714SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
715 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
716 "I", "# of states to reap for state shortage");
717SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
718 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
719 "I", "# of tracks to scan for each expire iteration");
720SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
721 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
722 "I", "# of tracks to expire for each expire iteration");
723SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
724 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
725 "I", "# of tracks to reap for track shortage");
726
727SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
728 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
729 __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
730 "LU", "# of state reaps due to states shortage");
731SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
732 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
733 __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
734 "LU", "# of state reap failure");
735SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
736 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
737 __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
738 "LU", "# of state overflow");
739SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
740 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
741 __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
742 "LU", "# of state allocation failure");
743SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
744 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
745 __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
746 "LU", "# of state deleted due to fast TCP port recycling");
747
748SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
749 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
750 __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
751 "LU", "# of track allocation failure");
752SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
753 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
754 __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
755 "LU", "# of track reap due to tracks shortage");
756SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
757 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
758 __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
759 "LU", "# of track reap failure");
760SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
761 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
762 __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
763 "LU", "# of track overflow");
764SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
765 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
766 __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
767 "LU", "# of track counter allocation failure");
6998b243
SZ
768SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
769 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
770 __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
771 "LU", "# of IP fragements defraged");
772SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
773 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
774 __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
775 "LU", "# of IP packets after defrag");
776SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
777 CTLTYPE_ULONG | CTLFLAG_RW, NULL,
778 __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
779 "LU", "# of IP packets after defrag dispatched to remote cpus");
2187815d
SZ
780
781static int ipfw_state_cmp(struct ipfw_state *,
782 struct ipfw_state *);
783static int ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
784 struct ipfw_trkcnt *);
785static int ipfw_track_cmp(struct ipfw_track *,
786 struct ipfw_track *);
787
788RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
789RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
790
791RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
792RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
793
794RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
795RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
984263bc 796
984263bc 797static ip_fw_chk_t ipfw_chk;
2187815d
SZ
798static void ipfw_track_expire_ipifunc(void *);
799static void ipfw_state_expire_ipifunc(void *);
800static void ipfw_keepalive(void *);
801static int ipfw_state_expire_start(struct ipfw_context *,
802 int, int);
6998b243 803static void ipfw_crossref_timeo(void *);
2187815d
SZ
804
805#define IPFW_TRKCNT_TOKGET lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
806#define IPFW_TRKCNT_TOKREL lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
807#define IPFW_TRKCNT_TOKINIT \
808 lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
809
d938108c
SZ
810static void
811sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
812 const struct sockaddr *netmask)
813{
814 const u_char *cp1 = (const u_char *)src;
815 u_char *cp2 = (u_char *)dst;
816 const u_char *cp3 = (const u_char *)netmask;
817 u_char *cplim = cp2 + *cp3;
818 u_char *cplim2 = cp2 + *cp1;
819
820 *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
821 cp3 += 2;
822 if (cplim > cplim2)
823 cplim = cplim2;
824 while (cp2 < cplim)
825 *cp2++ = *cp1++ & *cp3++;
826 if (cp2 < cplim2)
827 bzero(cp2, cplim2 - cp2);
828}
829
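/*
 * Editor's illustrative example (not part of the original file): masking
 * a host address with a /24 netmask, e.g. to derive the network address
 * for a table entry.  The addresses used here are arbitrary.
 */
#if 0
	struct sockaddr_in host, mask, net;

	bzero(&host, sizeof(host));
	host.sin_len = sizeof(host);
	host.sin_family = AF_INET;
	host.sin_addr.s_addr = htonl(0xc0a80164);	/* 192.168.1.100 */

	bzero(&mask, sizeof(mask));
	mask.sin_len = sizeof(mask);
	mask.sin_family = AF_INET;
	mask.sin_addr.s_addr = htonl(0xffffff00);	/* 255.255.255.0 */

	sa_maskedcopy((struct sockaddr *)&host, (struct sockaddr *)&net,
	    (struct sockaddr *)&mask);
	/* net.sin_addr now holds 192.168.1.0 */
#endif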
2187815d
SZ
830static __inline void
831ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
832 in_addr_t daddr, uint16_t dport, uint8_t proto)
833{
834
835 key->proto = proto;
836 key->swap = 0;
837
838 if (saddr < daddr) {
839 key->addr_u.addrs.addr1 = daddr;
840 key->addr_u.addrs.addr2 = saddr;
841 key->swap |= IPFW_KEY_SWAP_ADDRS;
842 } else {
843 key->addr_u.addrs.addr1 = saddr;
844 key->addr_u.addrs.addr2 = daddr;
845 }
846
847 if (sport < dport) {
848 key->port_u.ports.port1 = dport;
849 key->port_u.ports.port2 = sport;
850 key->swap |= IPFW_KEY_SWAP_PORTS;
851 } else {
852 key->port_u.ports.port1 = sport;
853 key->port_u.ports.port2 = dport;
854 }
855
856 if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
857 key->swap |= IPFW_KEY_SWAP_PORTS;
858 if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
859 key->swap |= IPFW_KEY_SWAP_ADDRS;
860}
861
862static __inline void
863ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
864 in_addr_t *daddr, uint16_t *dport)
865{
866
867 if (key->swap & IPFW_KEY_SWAP_ADDRS) {
868 *saddr = key->addr_u.addrs.addr2;
869 *daddr = key->addr_u.addrs.addr1;
870 } else {
871 *saddr = key->addr_u.addrs.addr1;
872 *daddr = key->addr_u.addrs.addr2;
873 }
874
875 if (key->swap & IPFW_KEY_SWAP_PORTS) {
876 *sport = key->port_u.ports.port2;
877 *dport = key->port_u.ports.port1;
878 } else {
879 *sport = key->port_u.ports.port1;
880 *dport = key->port_u.ports.port2;
881 }
882}
883
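/*
 * Editor's illustrative example (not part of the original file): both
 * directions of the same flow canonicalize to identical address/port
 * values, so one RB node covers forward and reverse packets; only the
 * swap flags differ, and ipfw_state_cmp() below treats complementary
 * swap flags as equal.
 */
#if 0
static void
example_key_canon(void)
{
	struct ipfw_key k1, k2;

	/* 10.0.0.1:1024 -> 10.0.0.2:80 */
	ipfw_key_build(&k1, htonl(0x0a000001), 1024,
	    htonl(0x0a000002), 80, IPPROTO_TCP);
	/* 10.0.0.2:80 -> 10.0.0.1:1024 (reply direction) */
	ipfw_key_build(&k2, htonl(0x0a000002), 80,
	    htonl(0x0a000001), 1024, IPPROTO_TCP);

	KKASSERT(k1.addr_u.value == k2.addr_u.value);
	KKASSERT(k1.port_u.value == k2.port_u.value);
	KKASSERT((k1.swap ^ k2.swap) == IPFW_KEY_SWAP_ALL);
}
#endif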
884static int
885ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
886{
887
888 if (s1->st_proto > s2->st_proto)
889 return (1);
890 if (s1->st_proto < s2->st_proto)
891 return (-1);
892
893 if (s1->st_addrs > s2->st_addrs)
894 return (1);
895 if (s1->st_addrs < s2->st_addrs)
896 return (-1);
897
898 if (s1->st_ports > s2->st_ports)
899 return (1);
900 if (s1->st_ports < s2->st_ports)
901 return (-1);
902
903 if (s1->st_swap == s2->st_swap ||
904 (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
905 return (0);
906
907 if (s1->st_swap > s2->st_swap)
908 return (1);
909 else
910 return (-1);
911}
912
913static int
914ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
915{
916
917 if (t1->tc_proto > t2->tc_proto)
918 return (1);
919 if (t1->tc_proto < t2->tc_proto)
920 return (-1);
921
922 if (t1->tc_addrs > t2->tc_addrs)
923 return (1);
924 if (t1->tc_addrs < t2->tc_addrs)
925 return (-1);
926
927 if (t1->tc_ports > t2->tc_ports)
928 return (1);
929 if (t1->tc_ports < t2->tc_ports)
930 return (-1);
931
932 if (t1->tc_ruleid > t2->tc_ruleid)
933 return (1);
934 if (t1->tc_ruleid < t2->tc_ruleid)
935 return (-1);
936
937 return (0);
938}
939
940static int
941ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
942{
943
944 if (t1->t_proto > t2->t_proto)
945 return (1);
946 if (t1->t_proto < t2->t_proto)
947 return (-1);
948
949 if (t1->t_addrs > t2->t_addrs)
950 return (1);
951 if (t1->t_addrs < t2->t_addrs)
952 return (-1);
953
954 if (t1->t_ports > t2->t_ports)
955 return (1);
956 if (t1->t_ports < t2->t_ports)
957 return (-1);
958
959 if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
960 return (1);
961 if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
962 return (-1);
963
964 return (0);
965}
966
967static void
968ipfw_state_max_set(int state_max)
969{
970
971 ipfw_state_max = state_max;
972 /* Allow 5% states over-allocation. */
973 ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
974}
975
976static __inline int
977ipfw_state_cntcoll(void)
978{
979 int cpu, state_cnt = 0;
980
981 for (cpu = 0; cpu < netisr_ncpus; ++cpu)
982 state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
983 return (state_cnt);
984}
985
986static __inline int
987ipfw_state_cntsync(void)
988{
989 int state_cnt;
990
991 state_cnt = ipfw_state_cntcoll();
992 ipfw_gd.ipfw_state_loosecnt = state_cnt;
993 return (state_cnt);
994}
984263bc 995
84a3e25a
SZ
996static __inline int
997ipfw_free_rule(struct ip_fw *rule)
998{
ed20d0e3
SW
999 KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1000 KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
ca12e259
SZ
1001 rule->refcnt--;
1002 if (rule->refcnt == 0) {
6998b243
SZ
1003 if (rule->cross_rules != NULL)
1004 kfree(rule->cross_rules, M_IPFW);
84a3e25a
SZ
1005 kfree(rule, M_IPFW);
1006 return 1;
1007 }
1008 return 0;
1009}
1010
1011static void
1012ipfw_unref_rule(void *priv)
1013{
1014 ipfw_free_rule(priv);
1015#ifdef KLD_MODULE
6998b243
SZ
1016 KASSERT(ipfw_gd.ipfw_refcnt > 0,
1017 ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
2187815d 1018 atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
84a3e25a
SZ
1019#endif
1020}
1021
1022static __inline void
1023ipfw_ref_rule(struct ip_fw *rule)
1024{
ed20d0e3 1025 KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
84a3e25a 1026#ifdef KLD_MODULE
2187815d 1027 atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
84a3e25a 1028#endif
ca12e259 1029 rule->refcnt++;
84a3e25a 1030}
984263bc
MD
1031
1032/*
1033 * This macro maps an ip pointer into a layer3 header pointer of type T
1034 */
a998c492 1035#define L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
984263bc
MD
1036
1037static __inline int
1038icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1039{
1040 int type = L3HDR(struct icmp,ip)->icmp_type;
1041
368024c3 1042 return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
984263bc
MD
1043}
1044
368024c3
SZ
1045#define TT ((1 << ICMP_ECHO) | \
1046 (1 << ICMP_ROUTERSOLICIT) | \
1047 (1 << ICMP_TSTAMP) | \
1048 (1 << ICMP_IREQ) | \
1049 (1 << ICMP_MASKREQ))
984263bc
MD
1050
1051static int
1052is_icmp_query(struct ip *ip)
1053{
1054 int type = L3HDR(struct icmp, ip)->icmp_type;
368024c3
SZ
1055
1056 return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
984263bc 1057}
368024c3 1058
984263bc
MD
1059#undef TT
1060
1061/*
1062 * The following checks use two arrays of 8 or 16 bits to store the
1063 * bits that we want set or clear, respectively. They are in the
1064 * low and high half of cmd->arg1 or cmd->d[0].
1065 *
1066 * We scan options and store the bits we find set. We succeed if
1067 *
1068 * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1069 *
1070 * The code is sometimes optimized not to store additional variables.
1071 */
984263bc 1072static int
a998c492 1073flags_match(ipfw_insn *cmd, uint8_t bits)
984263bc
MD
1074{
1075 u_char want_clear;
1076 bits = ~bits;
1077
368024c3 1078 if (((cmd->arg1 & 0xff) & bits) != 0)
984263bc 1079 return 0; /* some bits we want set were clear */
368024c3 1080
984263bc 1081 want_clear = (cmd->arg1 >> 8) & 0xff;
368024c3 1082 if ((want_clear & bits) != want_clear)
984263bc
MD
1083 return 0; /* some bits we want clear were set */
1084 return 1;
1085}
1086
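/*
 * Editor's illustrative example (not part of the original file): with
 * want_set = TH_SYN in the low byte of arg1 and want_clear = TH_ACK in
 * the high byte, a bare SYN matches while SYN|ACK does not.
 */
#if 0
static void
example_flags_match(void)
{
	ipfw_insn cmd;

	cmd.arg1 = TH_SYN | (TH_ACK << 8);
	KKASSERT(flags_match(&cmd, TH_SYN) == 1);
	KKASSERT(flags_match(&cmd, TH_SYN | TH_ACK) == 0);
}
#endif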
1087static int
1088ipopts_match(struct ip *ip, ipfw_insn *cmd)
1089{
1090 int optlen, bits = 0;
1091 u_char *cp = (u_char *)(ip + 1);
368024c3 1092 int x = (ip->ip_hl << 2) - sizeof(struct ip);
984263bc
MD
1093
1094 for (; x > 0; x -= optlen, cp += optlen) {
1095 int opt = cp[IPOPT_OPTVAL];
1096
1097 if (opt == IPOPT_EOL)
1098 break;
368024c3
SZ
1099
1100 if (opt == IPOPT_NOP) {
984263bc 1101 optlen = 1;
368024c3 1102 } else {
984263bc
MD
1103 optlen = cp[IPOPT_OLEN];
1104 if (optlen <= 0 || optlen > x)
1105 return 0; /* invalid or truncated */
1106 }
984263bc 1107
368024c3 1108 switch (opt) {
984263bc
MD
1109 case IPOPT_LSRR:
1110 bits |= IP_FW_IPOPT_LSRR;
1111 break;
1112
1113 case IPOPT_SSRR:
1114 bits |= IP_FW_IPOPT_SSRR;
1115 break;
1116
1117 case IPOPT_RR:
1118 bits |= IP_FW_IPOPT_RR;
1119 break;
1120
1121 case IPOPT_TS:
1122 bits |= IP_FW_IPOPT_TS;
1123 break;
368024c3
SZ
1124
1125 default:
1126 break;
984263bc
MD
1127 }
1128 }
1129 return (flags_match(cmd, bits));
1130}
1131
1132static int
1133tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1134{
1135 int optlen, bits = 0;
1136 struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1137 u_char *cp = (u_char *)(tcp + 1);
1138 int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1139
1140 for (; x > 0; x -= optlen, cp += optlen) {
1141 int opt = cp[0];
368024c3 1142
984263bc
MD
1143 if (opt == TCPOPT_EOL)
1144 break;
368024c3
SZ
1145
1146 if (opt == TCPOPT_NOP) {
984263bc 1147 optlen = 1;
368024c3 1148 } else {
984263bc
MD
1149 optlen = cp[1];
1150 if (optlen <= 0)
1151 break;
1152 }
1153
1154 switch (opt) {
984263bc
MD
1155 case TCPOPT_MAXSEG:
1156 bits |= IP_FW_TCPOPT_MSS;
1157 break;
1158
1159 case TCPOPT_WINDOW:
1160 bits |= IP_FW_TCPOPT_WINDOW;
1161 break;
1162
1163 case TCPOPT_SACK_PERMITTED:
1164 case TCPOPT_SACK:
1165 bits |= IP_FW_TCPOPT_SACK;
1166 break;
1167
1168 case TCPOPT_TIMESTAMP:
1169 bits |= IP_FW_TCPOPT_TS;
1170 break;
1171
1172 case TCPOPT_CC:
1173 case TCPOPT_CCNEW:
1174 case TCPOPT_CCECHO:
1175 bits |= IP_FW_TCPOPT_CC;
1176 break;
368024c3
SZ
1177
1178 default:
1179 break;
984263bc
MD
1180 }
1181 }
1182 return (flags_match(cmd, bits));
1183}
1184
1185static int
1186iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1187{
1188 if (ifp == NULL) /* no iface with this packet, match fails */
1189 return 0;
368024c3 1190
984263bc
MD
1191 /* Check by name or by IP address */
1192 if (cmd->name[0] != '\0') { /* match by name */
984263bc 1193 /* Check name */
3e4a09e7 1194 if (cmd->p.glob) {
e93690c2 1195 if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
3e4a09e7
MD
1196 return(1);
1197 } else {
1198 if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1199 return(1);
1200 }
984263bc 1201 } else {
b2632176
SZ
1202 struct ifaddr_container *ifac;
1203
1204 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1205 struct ifaddr *ia = ifac->ifa;
984263bc 1206
984263bc
MD
1207 if (ia->ifa_addr == NULL)
1208 continue;
1209 if (ia->ifa_addr->sa_family != AF_INET)
1210 continue;
1211 if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1212 (ia->ifa_addr))->sin_addr.s_addr)
1213 return(1); /* match */
1214 }
1215 }
1216 return(0); /* no match, fail ... */
1217}
1218
984263bc 1219#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
984263bc
MD
1220
1221/*
1222 * We enter here when we have a rule with O_LOG.
1223 * XXX this function alone takes about 2Kbytes of code!
1224 */
1225static void
2187815d
SZ
1226ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1227 struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
984263bc
MD
1228{
1229 char *action;
1230 int limit_reached = 0;
54ba75e6 1231 char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
984263bc
MD
1232
1233 fragment[0] = '\0';
1234 proto[0] = '\0';
1235
1236 if (f == NULL) { /* bogus pkt */
ca12e259
SZ
1237 if (verbose_limit != 0 &&
1238 ctx->ipfw_norule_counter >= verbose_limit)
984263bc 1239 return;
ca12e259
SZ
1240 ctx->ipfw_norule_counter++;
1241 if (ctx->ipfw_norule_counter == verbose_limit)
984263bc
MD
1242 limit_reached = verbose_limit;
1243 action = "Refuse";
1244 } else { /* O_LOG is the first action, find the real one */
1245 ipfw_insn *cmd = ACTION_PTR(f);
1246 ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1247
1248 if (l->max_log != 0 && l->log_left == 0)
1249 return;
1250 l->log_left--;
1251 if (l->log_left == 0)
1252 limit_reached = l->max_log;
1253 cmd += F_LEN(cmd); /* point to first action */
1254 if (cmd->opcode == O_PROB)
1255 cmd += F_LEN(cmd);
1256
1257 action = action2;
1258 switch (cmd->opcode) {
1259 case O_DENY:
1260 action = "Deny";
1261 break;
1262
1263 case O_REJECT:
26ef90a3 1264 if (cmd->arg1==ICMP_REJECT_RST) {
984263bc 1265 action = "Reset";
26ef90a3 1266 } else if (cmd->arg1==ICMP_UNREACH_HOST) {
984263bc 1267 action = "Reject";
26ef90a3 1268 } else {
f8c7a42d 1269 ksnprintf(SNPARGS(action2, 0), "Unreach %d",
26ef90a3
SZ
1270 cmd->arg1);
1271 }
984263bc
MD
1272 break;
1273
1274 case O_ACCEPT:
1275 action = "Accept";
1276 break;
26ef90a3 1277
984263bc
MD
1278 case O_COUNT:
1279 action = "Count";
1280 break;
26ef90a3 1281
984263bc 1282 case O_DIVERT:
26ef90a3 1283 ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
984263bc 1284 break;
26ef90a3 1285
984263bc 1286 case O_TEE:
26ef90a3 1287 ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
984263bc 1288 break;
26ef90a3 1289
984263bc 1290 case O_SKIPTO:
26ef90a3 1291 ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
984263bc 1292 break;
26ef90a3 1293
984263bc 1294 case O_PIPE:
26ef90a3 1295 ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
984263bc 1296 break;
26ef90a3 1297
984263bc 1298 case O_QUEUE:
26ef90a3 1299 ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
984263bc 1300 break;
26ef90a3
SZ
1301
1302 case O_FORWARD_IP:
1303 {
1304 ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1305 int len;
1306
1307 len = ksnprintf(SNPARGS(action2, 0),
54ba75e6
SZ
1308 "Forward to %s",
1309 kinet_ntoa(sa->sa.sin_addr, abuf));
26ef90a3
SZ
1310 if (sa->sa.sin_port) {
1311 ksnprintf(SNPARGS(action2, len), ":%d",
1312 sa->sa.sin_port);
1313 }
984263bc
MD
1314 }
1315 break;
26ef90a3 1316
984263bc
MD
1317 default:
1318 action = "UNKNOWN";
1319 break;
1320 }
1321 }
1322
1323 if (hlen == 0) { /* non-ip */
f8c7a42d 1324 ksnprintf(SNPARGS(proto, 0), "MAC");
984263bc
MD
1325 } else {
1326 struct ip *ip = mtod(m, struct ip *);
1327 /* these three are all aliases to the same thing */
1328 struct icmp *const icmp = L3HDR(struct icmp, ip);
1329 struct tcphdr *const tcp = (struct tcphdr *)icmp;
1330 struct udphdr *const udp = (struct udphdr *)icmp;
1331
1332 int ip_off, offset, ip_len;
984263bc
MD
1333 int len;
1334
1335 if (eh != NULL) { /* layer 2 packets are as on the wire */
1336 ip_off = ntohs(ip->ip_off);
1337 ip_len = ntohs(ip->ip_len);
1338 } else {
1339 ip_off = ip->ip_off;
1340 ip_len = ip->ip_len;
1341 }
1342 offset = ip_off & IP_OFFMASK;
1343 switch (ip->ip_p) {
1344 case IPPROTO_TCP:
f8c7a42d 1345 len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
54ba75e6 1346 kinet_ntoa(ip->ip_src, abuf));
26ef90a3 1347 if (offset == 0) {
f8c7a42d 1348 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
26ef90a3 1349 ntohs(tcp->th_sport),
54ba75e6 1350 kinet_ntoa(ip->ip_dst, abuf),
26ef90a3
SZ
1351 ntohs(tcp->th_dport));
1352 } else {
f8c7a42d 1353 ksnprintf(SNPARGS(proto, len), " %s",
54ba75e6 1354 kinet_ntoa(ip->ip_dst, abuf));
26ef90a3 1355 }
984263bc
MD
1356 break;
1357
1358 case IPPROTO_UDP:
f8c7a42d 1359 len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
54ba75e6 1360 kinet_ntoa(ip->ip_src, abuf));
26ef90a3 1361 if (offset == 0) {
f8c7a42d 1362 ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
26ef90a3 1363 ntohs(udp->uh_sport),
54ba75e6 1364 kinet_ntoa(ip->ip_dst, abuf),
26ef90a3
SZ
1365 ntohs(udp->uh_dport));
1366 } else {
f8c7a42d 1367 ksnprintf(SNPARGS(proto, len), " %s",
54ba75e6 1368 kinet_ntoa(ip->ip_dst, abuf));
26ef90a3 1369 }
984263bc
MD
1370 break;
1371
1372 case IPPROTO_ICMP:
26ef90a3 1373 if (offset == 0) {
f8c7a42d 1374 len = ksnprintf(SNPARGS(proto, 0),
26ef90a3
SZ
1375 "ICMP:%u.%u ",
1376 icmp->icmp_type,
1377 icmp->icmp_code);
1378 } else {
f8c7a42d 1379 len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
26ef90a3 1380 }
f8c7a42d 1381 len += ksnprintf(SNPARGS(proto, len), "%s",
54ba75e6 1382 kinet_ntoa(ip->ip_src, abuf));
f8c7a42d 1383 ksnprintf(SNPARGS(proto, len), " %s",
54ba75e6 1384 kinet_ntoa(ip->ip_dst, abuf));
984263bc
MD
1385 break;
1386
1387 default:
f8c7a42d 1388 len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
54ba75e6 1389 kinet_ntoa(ip->ip_src, abuf));
f8c7a42d 1390 ksnprintf(SNPARGS(proto, len), " %s",
54ba75e6 1391 kinet_ntoa(ip->ip_dst, abuf));
984263bc
MD
1392 break;
1393 }
1394
26ef90a3 1395 if (ip_off & (IP_MF | IP_OFFMASK)) {
f8c7a42d 1396 ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
26ef90a3
SZ
1397 ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1398 offset << 3, (ip_off & IP_MF) ? "+" : "");
1399 }
984263bc 1400 }
26ef90a3
SZ
1401
1402 if (oif || m->m_pkthdr.rcvif) {
984263bc 1403 log(LOG_SECURITY | LOG_INFO,
3e4a09e7 1404 "ipfw: %d %s %s %s via %s%s\n",
984263bc
MD
1405 f ? f->rulenum : -1,
1406 action, proto, oif ? "out" : "in",
3e4a09e7 1407 oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
984263bc 1408 fragment);
26ef90a3 1409 } else {
984263bc
MD
1410 log(LOG_SECURITY | LOG_INFO,
1411 "ipfw: %d %s %s [no if info]%s\n",
1412 f ? f->rulenum : -1,
1413 action, proto, fragment);
26ef90a3
SZ
1414 }
1415
1416 if (limit_reached) {
984263bc
MD
1417 log(LOG_SECURITY | LOG_NOTICE,
1418 "ipfw: limit %d reached on entry %d\n",
1419 limit_reached, f ? f->rulenum : -1);
26ef90a3 1420 }
984263bc
MD
1421}
1422
f2c88f94
SZ
1423#undef SNPARGS
1424
2187815d
SZ
1425#define TIME_LEQ(a, b) ((a) - (b) <= 0)
1426
1427static void
1428ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1429{
1430
1431 KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
1432 ("invalid state type %u", s->st_type));
1433 KASSERT(ctx->ipfw_state_cnt > 0,
1434 ("invalid state count %d", ctx->ipfw_state_cnt));
1435
1436 if (s->st_track != NULL) {
1437 struct ipfw_track *t = s->st_track;
1438
1439 KASSERT(!LIST_EMPTY(&t->t_state_list),
1440 ("track state list is empty"));
1441 LIST_REMOVE(s, st_trklink);
1442
1443 KASSERT(*t->t_count > 0,
1444 ("invalid track count %d", *t->t_count));
1445 atomic_subtract_int(t->t_count, 1);
1446 }
1447
1448 TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1449 RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1450 kfree(s, M_IPFW);
1451
1452 ctx->ipfw_state_cnt--;
1453 if (ctx->ipfw_state_loosecnt > 0)
1454 ctx->ipfw_state_loosecnt--;
1455}
1456
1457static int
1458ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
984263bc 1459{
2187815d
SZ
1460 struct ipfw_state *s, *anchor;
1461 int expired;
1462
1463 if (reap_max < ipfw_state_reap_min)
1464 reap_max = ipfw_state_reap_min;
1465
1466 if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1467 /*
1468 * Kick start state expiring. Ignore scan limit,
1469 * we are short of states.
1470 */
1471 ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1472 expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1473 ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1474 return (expired);
1475 }
1476
1477 /*
1478 * States are being expired.
1479 */
1480
1481 if (ctx->ipfw_state_cnt == 0)
1482 return (0);
1483
1484 expired = 0;
1485 anchor = &ctx->ipfw_stateexp_anch;
1486 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1487 /*
1488 * Ignore scan limit; we are short of states.
1489 */
984263bc 1490
2187815d
SZ
1491 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1492 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1493
1494 if (s->st_type == O_ANCHOR)
1495 continue;
1496
1497 if (IPFW_STATE_TCPCLOSED(s) ||
1498 TIME_LEQ(s->st_expire, time_uptime)) {
1499 ipfw_state_del(ctx, s);
1500 if (++expired >= reap_max)
1501 break;
1502 if ((expired & 0xff) == 0 &&
1503 ipfw_state_cntcoll() + ipfw_state_headroom <=
1504 ipfw_state_max)
1505 break;
1506 }
1507 }
1508 /*
1509 * NOTE:
1510 * Leave the anchor on the list, even if the end of the list has
1511 * been reached. ipfw_state_expire_more_dispatch() will handle
1512 * the removal.
1513 */
1514 return (expired);
984263bc
MD
1515}
1516
2187815d
SZ
1517static void
1518ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1519{
1520 struct ipfw_state *s, *sn;
984263bc 1521
2187815d
SZ
1522 TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1523 if (s->st_type == O_ANCHOR)
1524 continue;
1525 if (rule != NULL && s->st_rule != rule)
1526 continue;
1527 ipfw_state_del(ctx, s);
1528 }
1529}
984263bc 1530
984263bc 1531static void
2187815d 1532ipfw_state_expire_done(struct ipfw_context *ctx)
984263bc 1533{
984263bc 1534
2187815d
SZ
1535 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1536 ("stateexp is not in progress"));
1537 ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1538 callout_reset(&ctx->ipfw_stateto_ch, hz,
1539 ipfw_state_expire_ipifunc, NULL);
1540}
1541
1542static void
1543ipfw_state_expire_more(struct ipfw_context *ctx)
1544{
1545 struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
984263bc 1546
2187815d
SZ
1547 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1548 ("stateexp is not in progress"));
1549 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1550 ("stateexp more did not finish"));
1551 netisr_sendmsg_oncpu(nm);
1552}
984263bc 1553
2187815d
SZ
1554static int
1555ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1556 int scan_max, int expire_max)
1557{
1558 struct ipfw_state *s;
1559 int scanned = 0, expired = 0;
1560
1561 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1562 ("stateexp is not in progress"));
1563
1564 while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1565 if (scanned++ >= scan_max) {
1566 ipfw_state_expire_more(ctx);
1567 return (expired);
1568 }
1569
1570 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1571 TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1572
1573 if (s->st_type == O_ANCHOR)
1574 continue;
1575
1576 if (TIME_LEQ(s->st_expire, time_uptime) ||
1577 ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1578 IPFW_STATE_TCPCLOSED(s))) {
1579 ipfw_state_del(ctx, s);
1580 if (++expired >= expire_max) {
1581 ipfw_state_expire_more(ctx);
1582 return (expired);
1583 }
1584 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1585 (expired & 0xff) == 0 &&
1586 ipfw_state_cntcoll() + ipfw_state_headroom <=
1587 ipfw_state_max) {
1588 ipfw_state_expire_more(ctx);
1589 return (expired);
1590 }
1591 }
1592 }
1593 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1594 ipfw_state_expire_done(ctx);
1595 return (expired);
1596}
1597
1598static void
1599ipfw_state_expire_more_dispatch(netmsg_t nm)
1600{
1601 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1602 struct ipfw_state *anchor;
1603
1604 ASSERT_NETISR_NCPUS(mycpuid);
1605 KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1606 ("statexp is not in progress"));
1607
1608 /* Reply ASAP */
1609 netisr_replymsg(&nm->base, 0);
1610
1611 anchor = &ctx->ipfw_stateexp_anch;
1612 if (ctx->ipfw_state_cnt == 0) {
1613 TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1614 ipfw_state_expire_done(ctx);
984263bc 1615 return;
2187815d
SZ
1616 }
1617 ipfw_state_expire_loop(ctx, anchor,
1618 ipfw_state_scan_max, ipfw_state_expire_max);
1619}
1620
1621static int
1622ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1623{
1624 struct ipfw_state *anchor;
1625
1626 KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1627 ("stateexp is in progress"));
1628 ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1629
1630 if (ctx->ipfw_state_cnt == 0) {
1631 ipfw_state_expire_done(ctx);
1632 return (0);
1633 }
984263bc
MD
1634
1635 /*
2187815d 1636 * Do not expire more than once per second, it is useless.
984263bc 1637 */
2187815d
SZ
1638 if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1639 ctx->ipfw_state_lastexp == time_uptime) {
1640 ipfw_state_expire_done(ctx);
1641 return (0);
1642 }
1643 ctx->ipfw_state_lastexp = time_uptime;
1644
1645 anchor = &ctx->ipfw_stateexp_anch;
1646 TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1647 return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1648}
1649
1650static void
1651ipfw_state_expire_dispatch(netmsg_t nm)
1652{
1653 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1654
1655 ASSERT_NETISR_NCPUS(mycpuid);
1656
1657 /* Reply ASAP */
1658 crit_enter();
1659 netisr_replymsg(&nm->base, 0);
1660 crit_exit();
1661
1662 if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1663 /* Running; done. */
1664 return;
1665 }
1666 ipfw_state_expire_start(ctx,
1667 ipfw_state_scan_max, ipfw_state_expire_max);
1668}
1669
1670static void
1671ipfw_state_expire_ipifunc(void *dummy __unused)
1672{
1673 struct netmsg_base *msg;
1674
1675 KKASSERT(mycpuid < netisr_ncpus);
1676 msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
1677
1678 crit_enter();
1679 if (msg->lmsg.ms_flags & MSGF_DONE)
1680 netisr_sendmsg_oncpu(msg);
1681 crit_exit();
1682}
1683
1684static boolean_t
1685ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
1686{
1687 uint32_t seq = ntohl(tcp->th_seq);
1688 uint32_t ack = ntohl(tcp->th_ack);
1689
1690 if (tcp->th_flags & TH_RST)
1691 return (TRUE);
1692
1693 if (dir == MATCH_FORWARD) {
1694 if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
1695 s->st_flags |= IPFW_STATE_F_SEQFWD;
1696 s->st_seq_fwd = seq;
1697 } else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
1698 s->st_seq_fwd = seq;
1699 } else {
1700 /* Out-of-sequence; done. */
1701 return (FALSE);
1702 }
1703 if (tcp->th_flags & TH_ACK) {
1704 if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
1705 s->st_flags |= IPFW_STATE_F_ACKFWD;
1706 s->st_ack_fwd = ack;
1707 } else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
1708 s->st_ack_fwd = ack;
984263bc 1709 } else {
2187815d
SZ
1710 /* Out-of-sequence; done. */
1711 return (FALSE);
984263bc 1712 }
2187815d
SZ
1713
1714 if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
1715 (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
1716 s->st_state |= (TH_ACK << 8);
1717 }
1718 } else {
1719 if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
1720 s->st_flags |= IPFW_STATE_F_SEQREV;
1721 s->st_seq_rev = seq;
1722 } else if (SEQ_GEQ(seq, s->st_seq_rev)) {
1723 s->st_seq_rev = seq;
1724 } else {
1725 /* Out-of-sequence; done. */
1726 return (FALSE);
1727 }
1728 if (tcp->th_flags & TH_ACK) {
1729 if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
1730 s->st_flags |= IPFW_STATE_F_ACKREV;
1731 s->st_ack_rev = ack;
1732 } else if (SEQ_GEQ(ack, s->st_ack_rev)) {
1733 s->st_ack_rev = ack;
1734 } else {
1735 /* Out-of-sequence; done. */
1736 return (FALSE);
1737 }
1738
1739 if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
1740 s->st_ack_rev == s->st_seq_fwd + 1)
1741 s->st_state |= TH_ACK;
984263bc
MD
1742 }
1743 }
2187815d
SZ
1744 return (TRUE);
1745}
1746
1747static void
1748ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
1749 const struct tcphdr *tcp, struct ipfw_state *s)
1750{
1751
1752 if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
1753 u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
1754
1755 if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
1756 return;
1757
1758 s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
1759 switch (s->st_state & IPFW_STATE_TCPSTATES) {
1760 case TH_SYN: /* opening */
1761 s->st_expire = time_uptime + dyn_syn_lifetime;
1762 break;
1763
1764 case BOTH_SYN: /* move to established */
1765 case BOTH_SYN | TH_FIN: /* one side tries to close */
1766 case BOTH_SYN | (TH_FIN << 8):
1767 s->st_expire = time_uptime + dyn_ack_lifetime;
1768 break;
e26039aa 1769
2187815d
SZ
1770 case BOTH_SYN | BOTH_FIN: /* both sides closed */
1771 if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
1772 /* And both FINs were ACKed. */
1773 s->st_expire = time_uptime + dyn_fin_lifetime;
1774 } else {
1775 s->st_expire = time_uptime +
1776 dyn_finwait_lifetime;
1777 }
1778 break;
c31665e4 1779
2187815d
SZ
1780 default:
1781#if 0
1782 /*
1783 * reset or some invalid combination, but can also
1784 * occur if we use keep-state the wrong way.
1785 */
1786 if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
1787 kprintf("invalid state: 0x%x\n", s->st_state);
1788#endif
1789 s->st_expire = time_uptime + dyn_rst_lifetime;
1790 break;
1791 }
1792 } else if (pkt->proto == IPPROTO_UDP) {
1793 s->st_expire = time_uptime + dyn_udp_lifetime;
1794 } else {
1795 /* other protocols */
1796 s->st_expire = time_uptime + dyn_short_lifetime;
1797 }
984263bc
MD
1798}
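/*
 * Lifetime selection above, in short: half-open TCP states get
 * dyn_syn_lifetime, established ones dyn_ack_lifetime, fully closed
 * ones dyn_fin_lifetime (dyn_finwait_lifetime while the FINs are not
 * both ACKed), anything else dyn_rst_lifetime; UDP states get
 * dyn_udp_lifetime and other protocols dyn_short_lifetime.
 */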
1799
388cb6c6 1800/*
2187815d 1801 * Lookup a state.
984263bc 1802 */
2187815d
SZ
1803static struct ipfw_state *
1804ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1805 int *match_direction, const struct tcphdr *tcp)
1806{
1807 struct ipfw_state *key, *s;
1808 int dir = MATCH_NONE;
1809
1810 key = &ctx->ipfw_state_tmpkey;
1811 ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
1812 pkt->dst_ip, pkt->dst_port, pkt->proto);
1813 s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
1814 if (s == NULL)
1815 goto done; /* not found. */
1816 if (TIME_LEQ(s->st_expire, time_uptime)) {
1817 /* Expired. */
1818 ipfw_state_del(ctx, s);
1819 s = NULL;
1820 goto done;
1821 }
1822 if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
1823 /* TCP ports recycling is too fast. */
1824 ctx->ipfw_sts_tcprecycled++;
1825 ipfw_state_del(ctx, s);
1826 s = NULL;
1827 goto done;
1828 }
1829
1830 if (s->st_swap == key->st_swap) {
1831 dir = MATCH_FORWARD;
1832 } else {
1833 KASSERT((s->st_swap & key->st_swap) == 0,
1834 ("found mismatch state"));
1835 dir = MATCH_REVERSE;
1836 }
1837
1838 /* Update this state. */
1839 ipfw_state_update(pkt, dir, tcp, s);
1840
1841 if (s->st_track != NULL) {
1842 /* This track has been used. */
1843 s->st_track->t_expire = time_uptime + dyn_short_lifetime;
1844 }
1845done:
1846 if (match_direction)
1847 *match_direction = dir;
1848 return (s);
1849}
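/*
 * Direction is derived from st_swap above: ipfw_key_build() stores
 * the two endpoints in a canonical order and remembers whether it
 * had to swap them.  A lookup key swapped the same way as the stored
 * state is the forward half of the flow, anything else is the
 * reverse half (the KASSERT rejects partially-matching swaps).
 * This is a sketch of the convention; the exact encoding lives with
 * ipfw_key_build() elsewhere in this file.
 */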
1850
1851static __inline struct ip_fw *
1852ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1853 int *match_direction, const struct tcphdr *tcp, uint16_t len)
1854{
1855 struct ipfw_state *s;
1856
1857 s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
1858 if (s == NULL)
1859 return (NULL);
1860
1861 KASSERT(s->st_rule->cpuid == mycpuid,
1862 ("rule %p (cpu%d) does not belong to the current cpu%d",
1863 s->st_rule, s->st_rule->cpuid, mycpuid));
1864
1865 s->st_pcnt++;
1866 s->st_bcnt += len;
1867
1868 return (s->st_rule);
1869}
1870
1871static struct ipfw_state *
1872ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
1873 uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
1874 const struct tcphdr *tcp)
984263bc 1875{
2187815d
SZ
1876 struct ipfw_state *s, *dup;
1877
1878 KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
1879 ("invalid state type %u", type));
1880
1881 s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
1882 if (s == NULL) {
1883 ctx->ipfw_sts_nomem++;
1884 return (NULL);
1885 }
1886
1887 ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
1888 id->dst_ip, id->dst_port, id->proto);
1889
1890 s->st_rule = rule;
1891 s->st_type = type;
1892
1893 ctx->ipfw_state_cnt++;
1894 ctx->ipfw_state_loosecnt++;
1895 if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
1896 ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
1897 ctx->ipfw_state_loosecnt = 0;
1898 }
1899
1900 dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1901 if (dup != NULL)
1902 panic("ipfw: state exists");
1903 TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1904
984263bc 1905 /*
2187815d
SZ
1906 * Update this state:
1907 * Set st_expire and st_state.
984263bc 1908 */
2187815d 1909 ipfw_state_update(id, MATCH_FORWARD, tcp, s);
997a0e9a 1910
2187815d
SZ
1911 if (t != NULL) {
1912 /* Keep the track referenced. */
1913 LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
1914 s->st_track = t;
1915 }
1916 return (s);
1917}
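/*
 * The "loose" counting above presumably trades accuracy for speed:
 * each cpu accumulates new states in ipfw_state_loosecnt and only
 * folds them into the global ipfw_gd.ipfw_state_loosecnt once the
 * per-cpu batch reaches ipfw_state_loosecnt_updthr, so the global
 * limit check in ipfw_state_install() works on an approximate count.
 */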
1918
1919static boolean_t
1920ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
1921{
1922 struct ipfw_trkcnt *trk;
1923 boolean_t trk_freed = FALSE;
1924
1925 KASSERT(t->t_count != NULL, ("track anchor"));
1926 KASSERT(LIST_EMPTY(&t->t_state_list),
1927 ("invalid track is still referenced"));
1928
1929 trk = t->t_trkcnt;
1930 KASSERT(trk != NULL, ("track has no trkcnt"));
1931
1932 RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
1933 TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
1934 kfree(t, M_IPFW);
1935
1936 /*
1937 * fdrop() style reference counting.
1938 * See kern/kern_descrip.c fdrop().
1939 */
1940 for (;;) {
1941 int refs = trk->tc_refs;
1942
1943 cpu_ccfence();
1944 KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
1945 if (refs == 1) {
1946 IPFW_TRKCNT_TOKGET;
1947 if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
1948 KASSERT(trk->tc_count == 0,
1949 ("%d states reference this trkcnt",
1950 trk->tc_count));
1951 RB_REMOVE(ipfw_trkcnt_tree,
1952 &ipfw_gd.ipfw_trkcnt_tree, trk);
1953
1954 KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
1955 ("invalid trkcnt cnt %d",
1956 ipfw_gd.ipfw_trkcnt_cnt));
1957 ipfw_gd.ipfw_trkcnt_cnt--;
1958 IPFW_TRKCNT_TOKREL;
1959
1960 if (ctx->ipfw_trkcnt_spare == NULL)
1961 ctx->ipfw_trkcnt_spare = trk;
1962 else
1963 kfree(trk, M_IPFW);
1964 trk_freed = TRUE;
1965 break; /* done! */
984263bc 1966 }
2187815d
SZ
1967 IPFW_TRKCNT_TOKREL;
1968 /* retry */
1969 } else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
1970 break; /* done! */
1971 }
1972 /* retry */
1973 }
1974 return (trk_freed);
1975}
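/*
 * The loop above is the usual lock-free reference drop, modeled on
 * fdrop(): read tc_refs, then either cmpset refs -> refs - 1 without
 * any lock, or, when refs == 1, take the trkcnt token and cmpset
 * 1 -> 0 so the global trkcnt entry can be removed safely.  A failed
 * cmpset just rereads tc_refs and retries.
 */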
1976
1977static void
1978ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
1979{
1980 struct ipfw_track *t, *tn;
1981
1982 TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
1983 if (t->t_count == NULL) /* anchor */
1984 continue;
1985 if (rule != NULL && t->t_rule != rule)
1986 continue;
1987 ipfw_track_free(ctx, t);
1988 }
1989}
1990
1991static boolean_t
1992ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
1993 boolean_t reap)
1994{
1995 struct ipfw_state *s, *sn;
1996 boolean_t ret = FALSE;
1997
1998 KASSERT(t->t_count != NULL, ("track anchor"));
1999
2000 if (LIST_EMPTY(&t->t_state_list))
2001 return (FALSE);
2002
2003 /*
 2004 * Do not expire more than once per second; more frequent runs are useless.
2005 */
2006 if (t->t_lastexp == time_uptime)
2007 return (FALSE);
2008 t->t_lastexp = time_uptime;
2009
2010 LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2011 if (TIME_LEQ(s->st_expire, time_uptime) ||
2012 (reap && IPFW_STATE_TCPCLOSED(s))) {
2013 KASSERT(s->st_track == t,
2014 ("state track %p does not match %p",
2015 s->st_track, t));
2016 ipfw_state_del(ctx, s);
2017 ret = TRUE;
2018 }
2019 }
2020 return (ret);
2021}
2022
2023static __inline struct ipfw_trkcnt *
2024ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2025{
2026 struct ipfw_trkcnt *trk;
2027
2028 if (ctx->ipfw_trkcnt_spare != NULL) {
2029 trk = ctx->ipfw_trkcnt_spare;
2030 ctx->ipfw_trkcnt_spare = NULL;
2031 } else {
2032 trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2033 M_INTWAIT | M_NULLOK);
2034 }
2035 return (trk);
2036}
2037
2038static void
2039ipfw_track_expire_done(struct ipfw_context *ctx)
2040{
2041
2042 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2043 ("trackexp is not in progress"));
2044 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2045 callout_reset(&ctx->ipfw_trackto_ch, hz,
2046 ipfw_track_expire_ipifunc, NULL);
2047}
2048
2049static void
2050ipfw_track_expire_more(struct ipfw_context *ctx)
2051{
2052 struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2053
2054 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2055 ("trackexp is not in progress"));
2056 KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2057 ("trackexp more did not finish"));
2058 netisr_sendmsg_oncpu(nm);
2059}
2060
2061static int
2062ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2063 int scan_max, int expire_max)
2064{
2065 struct ipfw_track *t;
2066 int scanned = 0, expired = 0;
2067 boolean_t reap = FALSE;
2068
2069 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2070 ("trackexp is not in progress"));
2071
2072 if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2073 reap = TRUE;
2074
2075 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2076 if (scanned++ >= scan_max) {
2077 ipfw_track_expire_more(ctx);
2078 return (expired);
984263bc 2079 }
984263bc 2080
2187815d
SZ
2081 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2082 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
984263bc 2083
2187815d
SZ
2084 if (t->t_count == NULL) /* anchor */
2085 continue;
0dbcbe32 2086
2187815d
SZ
2087 ipfw_track_state_expire(ctx, t, reap);
2088 if (!LIST_EMPTY(&t->t_state_list)) {
2089 /* There are states referencing this track. */
2090 continue;
2091 }
984263bc 2092
2187815d
SZ
2093 if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2094 /* Expired. */
2095 if (ipfw_track_free(ctx, t)) {
2096 if (++expired >= expire_max) {
2097 ipfw_track_expire_more(ctx);
2098 return (expired);
984263bc 2099 }
984263bc 2100 }
2187815d
SZ
2101 }
2102 }
2103 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2104 ipfw_track_expire_done(ctx);
2105 return (expired);
2106}
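/*
 * The per-cpu anchor makes the expiration restartable: every
 * iteration re-inserts the anchor right after the track just
 * examined, so when scan_max or expire_max is hit the follow-up
 * netmsg (ipfw_track_expire_more_dispatch()) resumes the scan from
 * the anchor instead of rescanning the whole list.
 */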
984263bc 2107
2187815d
SZ
2108static int
2109ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2110{
2111 struct ipfw_track *anchor;
984263bc 2112
2187815d
SZ
2113 KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2114 ("trackexp is in progress"));
2115 ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2116
2117 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2118 ipfw_track_expire_done(ctx);
2119 return (0);
984263bc 2120 }
2187815d
SZ
2121
2122 /*
 2123 * Do not expire more than once per second; more frequent runs are useless.
2124 */
2125 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2126 ctx->ipfw_track_lastexp == time_uptime) {
2127 ipfw_track_expire_done(ctx);
2128 return (0);
2129 }
2130 ctx->ipfw_track_lastexp = time_uptime;
2131
2132 anchor = &ctx->ipfw_trackexp_anch;
2133 TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2134 return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
984263bc
MD
2135}
2136
2187815d
SZ
2137static void
2138ipfw_track_expire_more_dispatch(netmsg_t nm)
997a0e9a 2139{
2187815d
SZ
2140 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2141 struct ipfw_track *anchor;
997a0e9a 2142
2187815d
SZ
2143 ASSERT_NETISR_NCPUS(mycpuid);
2144 KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2145 ("trackexp is not in progress"));
2146
2147 /* Reply ASAP */
2148 netisr_replymsg(&nm->base, 0);
997a0e9a 2149
2187815d
SZ
2150 anchor = &ctx->ipfw_trackexp_anch;
2151 if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2152 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2153 ipfw_track_expire_done(ctx);
2154 return;
997a0e9a 2155 }
2187815d
SZ
2156 ipfw_track_expire_loop(ctx, anchor,
2157 ipfw_track_scan_max, ipfw_track_expire_max);
997a0e9a
SZ
2158}
2159
984263bc 2160static void
2187815d 2161ipfw_track_expire_dispatch(netmsg_t nm)
984263bc 2162{
2187815d
SZ
2163 struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2164
2165 ASSERT_NETISR_NCPUS(mycpuid);
2166
2167 /* Reply ASAP */
2168 crit_enter();
2169 netisr_replymsg(&nm->base, 0);
2170 crit_exit();
af162095 2171
2187815d
SZ
2172 if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2173 /* Running; done. */
2174 return;
2175 }
2176 ipfw_track_expire_start(ctx,
2177 ipfw_track_scan_max, ipfw_track_expire_max);
2178}
0dbcbe32 2179
2187815d
SZ
2180static void
2181ipfw_track_expire_ipifunc(void *dummy __unused)
2182{
2183 struct netmsg_base *msg;
0dbcbe32 2184
2187815d
SZ
2185 KKASSERT(mycpuid < netisr_ncpus);
2186 msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
af162095 2187
2187815d
SZ
2188 crit_enter();
2189 if (msg->lmsg.ms_flags & MSGF_DONE)
2190 netisr_sendmsg_oncpu(msg);
2191 crit_exit();
984263bc
MD
2192}
2193
2187815d
SZ
2194static int
2195ipfw_track_reap(struct ipfw_context *ctx)
984263bc 2196{
2187815d
SZ
2197 struct ipfw_track *t, *anchor;
2198 int expired;
984263bc 2199
2187815d
SZ
2200 if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2201 /*
2202 * Kick start track expiring. Ignore scan limit,
2203 * we are short of tracks.
2204 */
2205 ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2206 expired = ipfw_track_expire_start(ctx, INT_MAX,
2207 ipfw_track_reap_max);
2208 ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2209 return (expired);
984263bc 2210 }
2187815d
SZ
2211
2212 /*
2213 * Tracks are being expired.
2214 */
2215
2216 if (RB_EMPTY(&ctx->ipfw_track_tree))
2217 return (0);
2218
2219 expired = 0;
2220 anchor = &ctx->ipfw_trackexp_anch;
2221 while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2222 /*
2223 * Ignore scan limit; we are short of tracks.
2224 */
2225
2226 TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2227 TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2228
2229 if (t->t_count == NULL) /* anchor */
2230 continue;
2231
2232 ipfw_track_state_expire(ctx, t, TRUE);
2233 if (!LIST_EMPTY(&t->t_state_list)) {
2234 /* There are states referencing this track. */
2235 continue;
2236 }
2237
2238 if (ipfw_track_free(ctx, t)) {
2239 if (++expired >= ipfw_track_reap_max) {
2240 ipfw_track_expire_more(ctx);
2241 break;
2242 }
2243 }
984263bc 2244 }
2187815d
SZ
2245 /*
2246 * NOTE:
2247 * Leave the anchor on the list, even if the end of the list has
2248 * been reached. ipfw_track_expire_more_dispatch() will handle
2249 * the removal.
2250 */
2251 return (expired);
984263bc
MD
2252}
2253
2187815d
SZ
2254static struct ipfw_track *
2255ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2256 uint16_t limit_mask, struct ip_fw *rule)
984263bc 2257{
2187815d
SZ
2258 struct ipfw_track *key, *t, *dup;
2259 struct ipfw_trkcnt *trk, *ret;
2260 boolean_t do_expire = FALSE;
2261
2262 KASSERT(rule->track_ruleid != 0,
2263 ("rule %u has no track ruleid", rule->rulenum));
2264
2265 key = &ctx->ipfw_track_tmpkey;
2266 key->t_proto = id->proto;
2267 key->t_addrs = 0;
2268 key->t_ports = 0;
2269 key->t_rule = rule;
2270 if (limit_mask & DYN_SRC_ADDR)
2271 key->t_saddr = id->src_ip;
2272 if (limit_mask & DYN_DST_ADDR)
2273 key->t_daddr = id->dst_ip;
2274 if (limit_mask & DYN_SRC_PORT)
2275 key->t_sport = id->src_port;
2276 if (limit_mask & DYN_DST_PORT)
2277 key->t_dport = id->dst_port;
2278
2279 t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2280 if (t != NULL)
2281 goto done;
2282
2283 t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2284 if (t == NULL) {
2285 ctx->ipfw_tks_nomem++;
2286 return (NULL);
2287 }
2288
2289 t->t_key = key->t_key;
2290 t->t_rule = rule;
2291 t->t_lastexp = 0;
2292 LIST_INIT(&t->t_state_list);
2293
2294 if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2295 time_t globexp, uptime;
2296
2297 trk = NULL;
2298 do_expire = TRUE;
984263bc 2299
2187815d
SZ
2300 /*
 2301 * Do not expire globally more than once per second;
 2302 * more frequent runs are useless.
2303 */
2304 uptime = time_uptime;
2305 globexp = ipfw_gd.ipfw_track_globexp;
2306 if (globexp != uptime &&
2307 atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2308 globexp, uptime)) {
2309 int cpu;
2310
2311 /* Expire tracks on other CPUs. */
2312 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2313 if (cpu == mycpuid)
2314 continue;
2315 lwkt_send_ipiq(globaldata_find(cpu),
2316 ipfw_track_expire_ipifunc, NULL);
2317 }
2318 }
2319 } else {
2320 trk = ipfw_trkcnt_alloc(ctx);
2321 }
2322 if (trk == NULL) {
2323 struct ipfw_trkcnt *tkey;
2324
2325 tkey = &ctx->ipfw_trkcnt_tmpkey;
2326 key = NULL; /* tkey overlaps key */
2327
2328 tkey->tc_key = t->t_key;
2329 tkey->tc_ruleid = rule->track_ruleid;
2330
2331 IPFW_TRKCNT_TOKGET;
2332 trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2333 tkey);
2334 if (trk == NULL) {
2335 IPFW_TRKCNT_TOKREL;
2336 if (do_expire) {
2337 ctx->ipfw_tks_reap++;
2338 if (ipfw_track_reap(ctx) > 0) {
2339 if (ipfw_gd.ipfw_trkcnt_cnt <
2340 ipfw_track_max) {
2341 trk = ipfw_trkcnt_alloc(ctx);
2342 if (trk != NULL)
2343 goto install;
2344 ctx->ipfw_tks_cntnomem++;
2345 } else {
2346 ctx->ipfw_tks_overflow++;
2347 }
2348 } else {
2349 ctx->ipfw_tks_reapfailed++;
2350 ctx->ipfw_tks_overflow++;
2351 }
2352 } else {
2353 ctx->ipfw_tks_cntnomem++;
984263bc 2354 }
2187815d
SZ
2355 kfree(t, M_IPFW);
2356 return (NULL);
2357 }
2358 KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2359 ("invalid trkcnt refs %d", trk->tc_refs));
2360 atomic_add_int(&trk->tc_refs, 1);
2361 IPFW_TRKCNT_TOKREL;
2362 } else {
2363install:
2364 trk->tc_key = t->t_key;
2365 trk->tc_ruleid = rule->track_ruleid;
2366 trk->tc_refs = 0;
2367 trk->tc_count = 0;
2368 trk->tc_expire = 0;
2369 trk->tc_rulenum = rule->rulenum;
2370
2371 IPFW_TRKCNT_TOKGET;
2372 ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2373 trk);
2374 if (ret != NULL) {
2375 KASSERT(ret->tc_refs > 0 &&
2376 ret->tc_refs < netisr_ncpus,
2377 ("invalid trkcnt refs %d", ret->tc_refs));
2378 KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2379 ("trkcnt spare was installed"));
2380 ctx->ipfw_trkcnt_spare = trk;
2381 trk = ret;
2382 } else {
2383 ipfw_gd.ipfw_trkcnt_cnt++;
0dbcbe32 2384 }
2187815d
SZ
2385 atomic_add_int(&trk->tc_refs, 1);
2386 IPFW_TRKCNT_TOKREL;
984263bc 2387 }
2187815d
SZ
2388 t->t_count = &trk->tc_count;
2389 t->t_trkcnt = trk;
2390
2391 dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2392 if (dup != NULL)
2393 panic("ipfw: track exists");
2394 TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2395done:
2396 t->t_expire = time_uptime + dyn_short_lifetime;
2397 return (t);
984263bc
MD
2398}
2399
388cb6c6 2400/*
2187815d 2401 * Install state for rule type cmd->o.opcode
984263bc
MD
2402 *
2403 * Returns 1 (failure) if state is not installed because of errors or because
2187815d 2404 * the configured state limit is enforced.
984263bc
MD
2405 */
2406static int
2187815d
SZ
2407ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2408 ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
984263bc 2409{
2187815d
SZ
2410 struct ipfw_state *s;
2411 struct ipfw_track *t;
2412 int count, diff;
984263bc 2413
2187815d
SZ
2414 if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2415 (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2416 boolean_t overflow = TRUE;
984263bc 2417
2187815d
SZ
2418 ctx->ipfw_sts_reap++;
2419 if (ipfw_state_reap(ctx, diff) == 0)
2420 ctx->ipfw_sts_reapfailed++;
2421 if (ipfw_state_cntsync() < ipfw_state_max)
2422 overflow = FALSE;
984263bc 2423
2187815d
SZ
2424 if (overflow) {
2425 time_t globexp, uptime;
2426 int cpu;
984263bc 2427
2187815d
SZ
2428 /*
 2429 * Do not expire globally more than once per second;
 2430 * more frequent runs are useless.
2431 */
2432 uptime = time_uptime;
2433 globexp = ipfw_gd.ipfw_state_globexp;
2434 if (globexp == uptime ||
2435 !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2436 globexp, uptime)) {
2437 ctx->ipfw_sts_overflow++;
2438 return (1);
2439 }
2440
2441 /* Expire states on other CPUs. */
2442 for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2443 if (cpu == mycpuid)
2444 continue;
2445 lwkt_send_ipiq(globaldata_find(cpu),
2446 ipfw_state_expire_ipifunc, NULL);
997a0e9a 2447 }
2187815d
SZ
2448 ctx->ipfw_sts_overflow++;
2449 return (1);
984263bc 2450 }
984263bc
MD
2451 }
2452
2453 switch (cmd->o.opcode) {
2454 case O_KEEP_STATE: /* bidir rule */
2187815d
SZ
2455 s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
2456 tcp);
2457 if (s == NULL)
2458 return (1);
984263bc
MD
2459 break;
2460
2461 case O_LIMIT: /* limit number of sessions */
2187815d
SZ
2462 t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2463 if (t == NULL)
2464 return (1);
0dbcbe32 2465
2187815d
SZ
2466 if (*t->t_count >= cmd->conn_limit) {
2467 if (!ipfw_track_state_expire(ctx, t, TRUE))
2468 return (1);
2469 }
2470 for (;;) {
2471 count = *t->t_count;
2472 if (count >= cmd->conn_limit)
2473 return (1);
2474 if (atomic_cmpset_int(t->t_count, count, count + 1))
2475 break;
2476 }
2477
2478 s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2479 if (s == NULL) {
2480 /* Undo damage. */
2481 atomic_subtract_int(t->t_count, 1);
2482 return (1);
984263bc 2483 }
984263bc 2484 break;
2187815d 2485
984263bc 2486 default:
2187815d 2487 panic("unknown state type %u\n", cmd->o.opcode);
984263bc 2488 }
2187815d 2489 return (0);
997a0e9a
SZ
2490}
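/*
 * For O_LIMIT the session count is bumped with a cmpset loop above,
 * roughly:
 *
 *	do {
 *		count = *t->t_count;
 *		if (count >= cmd->conn_limit)
 *			return (1);
 *	} while (!atomic_cmpset_int(t->t_count, count, count + 1));
 *
 * and the increment is undone if the state itself cannot be added.
 */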
2491
d938108c
SZ
2492static int
2493ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2494 const struct in_addr *in)
2495{
2496 struct radix_node_head *rnh;
2497 struct sockaddr_in sin;
2498 struct ipfw_tblent *te;
2499
2500 KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2501 rnh = ctx->ipfw_tables[tableid];
2502 if (rnh == NULL)
2503 return (0); /* no match */
2504
2505 memset(&sin, 0, sizeof(sin));
2506 sin.sin_family = AF_INET;
2507 sin.sin_len = sizeof(sin);
2508 sin.sin_addr = *in;
2509
2510 te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2511 if (te == NULL)
2512 return (0); /* no match */
2513
2514 te->te_use++;
2515 te->te_lastuse = time_second;
2516 return (1); /* match */
2517}
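/*
 * This is what the O_IP_SRC_TABLE/O_IP_DST_TABLE opcodes call below,
 * roughly:
 *
 *	match = ipfw_table_lookup(ctx, cmd->arg1, &src_ip);
 *
 * with cmd->arg1 holding the table id taken from the rule.
 */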
2518
984263bc
MD
2519/*
2520 * Transmit a TCP packet, containing either a RST or a keepalive.
 2521 * When flags & TH_RST, we are sending a RST packet, because a
2522 * "reset" action matched the packet.
 2523 * Otherwise we are sending a keepalive, and flags & TH_SYN selects the direction.
2187815d
SZ
2524 *
2525 * Only {src,dst}_{ip,port} of "id" are used.
984263bc
MD
2526 */
2527static void
2187815d 2528send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
984263bc
MD
2529{
2530 struct mbuf *m;
2531 struct ip *ip;
2532 struct tcphdr *tcp;
2533 struct route sro; /* fake route */
2534
b5523eac 2535 MGETHDR(m, M_NOWAIT, MT_HEADER);
3f944588 2536 if (m == NULL)
984263bc 2537 return;
6aabd1a4 2538 m->m_pkthdr.rcvif = NULL;
984263bc
MD
2539 m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2540 m->m_data += max_linkhdr;
2541
2542 ip = mtod(m, struct ip *);
2543 bzero(ip, m->m_len);
2544 tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2545 ip->ip_p = IPPROTO_TCP;
2546 tcp->th_off = 5;
6aabd1a4 2547
984263bc
MD
2548 /*
2549 * Assume we are sending a RST (or a keepalive in the reverse
2550 * direction), swap src and destination addresses and ports.
2551 */
2552 ip->ip_src.s_addr = htonl(id->dst_ip);
2553 ip->ip_dst.s_addr = htonl(id->src_ip);
2554 tcp->th_sport = htons(id->dst_port);
2555 tcp->th_dport = htons(id->src_port);
2556 if (flags & TH_RST) { /* we are sending a RST */
2557 if (flags & TH_ACK) {
2558 tcp->th_seq = htonl(ack);
2559 tcp->th_ack = htonl(0);
2560 tcp->th_flags = TH_RST;
2561 } else {
2562 if (flags & TH_SYN)
2563 seq++;
2564 tcp->th_seq = htonl(0);
2565 tcp->th_ack = htonl(seq);
2566 tcp->th_flags = TH_RST | TH_ACK;
2567 }
2568 } else {
2569 /*
2570 * We are sending a keepalive. flags & TH_SYN determines
2571 * the direction, forward if set, reverse if clear.
2572 * NOTE: seq and ack are always assumed to be correct
2573 * as set by the caller. This may be confusing...
2574 */
2575 if (flags & TH_SYN) {
2576 /*
2577 * we have to rewrite the correct addresses!
2578 */
2579 ip->ip_dst.s_addr = htonl(id->dst_ip);
2580 ip->ip_src.s_addr = htonl(id->src_ip);
2581 tcp->th_dport = htons(id->dst_port);
2582 tcp->th_sport = htons(id->src_port);
2583 }
2584 tcp->th_seq = htonl(seq);
2585 tcp->th_ack = htonl(ack);
2586 tcp->th_flags = TH_ACK;
2587 }
6aabd1a4 2588
984263bc
MD
2589 /*
2590 * set ip_len to the payload size so we can compute
2591 * the tcp checksum on the pseudoheader
2592 * XXX check this, could save a couple of words ?
2593 */
2594 ip->ip_len = htons(sizeof(struct tcphdr));
2595 tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
6aabd1a4 2596
984263bc
MD
2597 /*
2598 * now fill fields left out earlier
2599 */
2600 ip->ip_ttl = ip_defttl;
2601 ip->ip_len = m->m_pkthdr.len;
6aabd1a4
SZ
2602
2603 bzero(&sro, sizeof(sro));
984263bc 2604 ip_rtaddr(ip->ip_dst, &sro);
6aabd1a4 2605
f2c2ec09 2606 m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
984263bc
MD
2607 ip_output(m, NULL, &sro, 0, NULL, NULL);
2608 if (sro.ro_rt)
2609 RTFREE(sro.ro_rt);
2610}
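/*
 * Typical use, as in send_reject() below, is answering an offending
 * TCP segment with a RST:
 *
 *	send_pkt(&args->f_id, ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 *	    tcp->th_flags | TH_RST);
 */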
2611
2612/*
388cb6c6 2613 * Send a reject message, consuming the mbuf passed as an argument.
984263bc
MD
2614 */
2615static void
2616send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2617{
984263bc
MD
2618 if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2619 /* We need the IP header in host order for icmp_error(). */
2620 if (args->eh != NULL) {
2621 struct ip *ip = mtod(args->m, struct ip *);
48fabf32 2622
984263bc
MD
2623 ip->ip_len = ntohs(ip->ip_len);
2624 ip->ip_off = ntohs(ip->ip_off);
2625 }
2626 icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2627 } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2628 struct tcphdr *const tcp =
2629 L3HDR(struct tcphdr, mtod(args->m, struct ip *));
48fabf32
SZ
2630
2631 if ((tcp->th_flags & TH_RST) == 0) {
2632 send_pkt(&args->f_id, ntohl(tcp->th_seq),
2633 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2634 }
984263bc 2635 m_freem(args->m);
48fabf32 2636 } else {
984263bc 2637 m_freem(args->m);
48fabf32 2638 }
984263bc
MD
2639 args->m = NULL;
2640}
2641
388cb6c6 2642/*
984263bc
MD
2643 * Given an ip_fw *, lookup_next_rule will return a pointer
2644 * to the next rule, which can be either the jump
2645 * target (for skipto instructions) or the next one in the list (in
2646 * all other cases including a missing jump target).
2647 * The result is also written in the "next_rule" field of the rule.
2648 * Backward jumps are not allowed, so start looking from the next
2649 * rule...
2650 *
2651 * This never returns NULL -- in case we do not have an exact match,
2652 * the next rule is returned. When the ruleset is changed,
2653 * pointers are flushed so we are always correct.
2654 */
984263bc
MD
2655static struct ip_fw *
2656lookup_next_rule(struct ip_fw *me)
2657{
2658 struct ip_fw *rule = NULL;
2659 ipfw_insn *cmd;
2660
2661 /* look for action, in case it is a skipto */
2662 cmd = ACTION_PTR(me);
2663 if (cmd->opcode == O_LOG)
2664 cmd += F_LEN(cmd);
48fabf32
SZ
2665 if (cmd->opcode == O_SKIPTO) {
2666 for (rule = me->next; rule; rule = rule->next) {
984263bc
MD
2667 if (rule->rulenum >= cmd->arg1)
2668 break;
48fabf32
SZ
2669 }
2670 }
984263bc
MD
2671 if (rule == NULL) /* failure or not a skipto */
2672 rule = me->next;
2673 me->next_rule = rule;
2674 return rule;
2675}
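/*
 * The next_rule pointers cached here keep O_SKIPTO cheap on the fast
 * path: ipfw_chk() only calls this linear scan when f->next_rule is
 * NULL, which happens right after ipfw_flush_rule_ptrs() has cleared
 * the cache because the ruleset changed.
 */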
2676
e5f2be89 2677static int
5cfb918a 2678ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
e5f2be89
SZ
2679 enum ipfw_opcodes opcode, uid_t uid)
2680{
2681 struct in_addr src_ip, dst_ip;
2682 struct inpcbinfo *pi;
be4519a2 2683 boolean_t wildcard;
e5f2be89
SZ
2684 struct inpcb *pcb;
2685
2686 if (fid->proto == IPPROTO_TCP) {
be4519a2 2687 wildcard = FALSE;
e5f2be89
SZ
2688 pi = &tcbinfo[mycpuid];
2689 } else if (fid->proto == IPPROTO_UDP) {
be4519a2
SZ
2690 wildcard = TRUE;
2691 pi = &udbinfo[mycpuid];
e5f2be89
SZ
2692 } else {
2693 return 0;
2694 }
2695
2696 /*
2697 * Values in 'fid' are in host byte order
2698 */
2699 dst_ip.s_addr = htonl(fid->dst_ip);
2700 src_ip.s_addr = htonl(fid->src_ip);
2701 if (oif) {
2702 pcb = in_pcblookup_hash(pi,
2703 dst_ip, htons(fid->dst_port),
2704 src_ip, htons(fid->src_port),
2705 wildcard, oif);
2706 } else {
2707 pcb = in_pcblookup_hash(pi,
2708 src_ip, htons(fid->src_port),
2709 dst_ip, htons(fid->dst_port),
2710 wildcard, NULL);
2711 }
2712 if (pcb == NULL || pcb->inp_socket == NULL)
2713 return 0;
2714
2715 if (opcode == O_UID) {
2716#define socheckuid(a,b) ((a)->so_cred->cr_uid != (b))
2717 return !socheckuid(pcb->inp_socket, uid);
2718#undef socheckuid
2719 } else {
2720 return groupmember(uid, pcb->inp_socket->so_cred);
2721 }
2722}
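/*
 * Note that the PCB lookup above swaps local/foreign endpoints
 * depending on whether the packet is outbound (oif != NULL), so the
 * socket found is the one owning the flow regardless of direction;
 * UDP uses a wildcard lookup, TCP an exact one.
 */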
2723
984263bc
MD
2724/*
2725 * The main check routine for the firewall.
2726 *
2727 * All arguments are in args so we can modify them and return them
2728 * back to the caller.
2729 *
2730 * Parameters:
2731 *
2732 * args->m (in/out) The packet; we set to NULL when/if we nuke it.
2733 * Starts with the IP header.
2734 * args->eh (in) Mac header if present, or NULL for layer3 packet.
2735 * args->oif Outgoing interface, or NULL if packet is incoming.
2736 * The incoming interface is in the mbuf. (in)
984263bc
MD
2737 *
2738 * args->rule Pointer to the last matching rule (in/out)
984263bc
MD
2739 * args->f_id Addresses grabbed from the packet (out)
2740 *
2741 * Return value:
2742 *
a237ddbd
SZ
2743 * If the packet was denied/rejected and has been dropped, *m is equal
2744 * to NULL upon return.
984263bc 2745 *
a237ddbd
SZ
2746 * IP_FW_DENY the packet must be dropped.
2747 * IP_FW_PASS The packet is to be accepted and routed normally.
2748 * IP_FW_DIVERT Divert the packet to port (args->cookie)
2749 * IP_FW_TEE Tee the packet to port (args->cookie)
2750 * IP_FW_DUMMYNET Send the packet to pipe/queue (args->cookie)
6998b243 2751 * IP_FW_CONTINUE Continue processing on another cpu.
984263bc 2752 */
984263bc
MD
2753static int
2754ipfw_chk(struct ip_fw_args *args)
2755{
2756 /*
2757 * Local variables hold state during the processing of a packet.
2758 *
2759 * IMPORTANT NOTE: to speed up the processing of rules, there
2760 * are some assumption on the values of the variables, which
2761 * are documented here. Should you change them, please check
2762 * the implementation of the various instructions to make sure
2763 * that they still work.
2764 *
2765 * args->eh The MAC header. It is non-null for a layer2
2766 * packet, it is NULL for a layer-3 packet.
2767 *
2768 * m | args->m Pointer to the mbuf, as received from the caller.
2769 * It may change if ipfw_chk() does an m_pullup, or if it
2770 * consumes the packet because it calls send_reject().
2771 * XXX This has to change, so that ipfw_chk() never modifies
2772 * or consumes the buffer.
2773 * ip is simply an alias of the value of m, and it is kept
2774 * in sync with it (the packet is supposed to start with
2775 * the ip header).
2776 */
2777 struct mbuf *m = args->m;
2778 struct ip *ip = mtod(m, struct ip *);
2779
2780 /*
2781 * oif | args->oif If NULL, ipfw_chk has been called on the
a8d45119 2782 * inbound path (ether_input, ip_input).
984263bc
MD
2783 * If non-NULL, ipfw_chk has been called on the outbound path
2784 * (ether_output, ip_output).
2785 */
2786 struct ifnet *oif = args->oif;
2787
2788 struct ip_fw *f = NULL; /* matching rule */
29b27cb7 2789 int retval = IP_FW_PASS;
e5ecc832 2790 struct m_tag *mtag;
68edaf54 2791 struct divert_info *divinfo;
984263bc
MD
2792
2793 /*
2794 * hlen The length of the IPv4 header.
2795 * hlen >0 means we have an IPv4 packet.
2796 */
2797 u_int hlen = 0; /* hlen >0 means we have an IP pkt */
2798
2799 /*
2800 * offset The offset of a fragment. offset != 0 means that
2801 * we have a fragment at this offset of an IPv4 packet.
2802 * offset == 0 means that (if this is an IPv4 packet)
2803 * this is the first or only fragment.
2804 */
2805 u_short offset = 0;
2806
2807 /*
2808 * Local copies of addresses. They are only valid if we have
2809 * an IP packet.
2810 *
2811 * proto The protocol. Set to 0 for non-ip packets,
2812 * or to the protocol read from the packet otherwise.
2813 * proto != 0 means that we have an IPv4 packet.
2814 *
2815 * src_port, dst_port port numbers, in HOST format. Only
2816 * valid for TCP and UDP packets.
2817 *
2818 * src_ip, dst_ip ip addresses, in NETWORK format.
2819 * Only valid for IPv4 packets.
2820 */
a998c492
SZ
2821 uint8_t proto;
2822 uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */
984263bc 2823 struct in_addr src_ip, dst_ip; /* NOTE: network format */
50050193 2824 uint16_t ip_len = 0;
99216103
SZ
2825
2826 /*
2827 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
2828 * MATCH_NONE when checked and not matched (dyn_f = NULL),
2829 * MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
2830 */
984263bc 2831 int dyn_dir = MATCH_UNKNOWN;
99216103 2832 struct ip_fw *dyn_f = NULL;
1cc3831d
SZ
2833 int cpuid = mycpuid;
2834 struct ipfw_context *ctx;
2835
5204e13c 2836 ASSERT_NETISR_NCPUS(cpuid);
1cc3831d 2837 ctx = ipfw_ctx[cpuid];
984263bc 2838
f2c2ec09 2839 if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
29b27cb7 2840 return IP_FW_PASS; /* accept */
984263bc
MD
2841
2842 if (args->eh == NULL || /* layer 3 packet */
50050193
SZ
2843 (m->m_pkthdr.len >= sizeof(struct ip) &&
2844 ntohs(args->eh->ether_type) == ETHERTYPE_IP))
2845 hlen = ip->ip_hl << 2;
984263bc
MD
2846
2847 /*
2848 * Collect parameters into local variables for faster matching.
2849 */
2850 if (hlen == 0) { /* do not grab addresses for non-ip pkts */
2851 proto = args->f_id.proto = 0; /* mark f_id invalid */
2852 goto after_ip_checks;
2853 }
2854
2855 proto = args->f_id.proto = ip->ip_p;
2856 src_ip = ip->ip_src;
2857 dst_ip = ip->ip_dst;
2858 if (args->eh != NULL) { /* layer 2 packets are as on the wire */
2859 offset = ntohs(ip->ip_off) & IP_OFFMASK;
2860 ip_len = ntohs(ip->ip_len);
2861 } else {
2862 offset = ip->ip_off & IP_OFFMASK;
2863 ip_len = ip->ip_len;
2864 }
2865
50050193
SZ
2866#define PULLUP_TO(len) \
2867do { \
2868 if (m->m_len < (len)) { \
2869 args->m = m = m_pullup(m, (len));\
2870 if (m == NULL) \
2871 goto pullup_failed; \
2872 ip = mtod(m, struct ip *); \
2873 } \
2874} while (0)
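/*
 * PULLUP_TO() guarantees that at least 'len' bytes are contiguous in
 * the first mbuf before an L4 header is dereferenced, e.g.
 *
 *	PULLUP_TO(hlen + sizeof(struct tcphdr));
 *	tcp = L3HDR(struct tcphdr, ip);
 *
 * m_pullup() may replace the mbuf, which is why the macro refreshes
 * both args->m and the local 'ip' pointer.
 */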
984263bc
MD
2875
2876 if (offset == 0) {
2877 switch (proto) {
2878 case IPPROTO_TCP:
50050193
SZ
2879 {
2880 struct tcphdr *tcp;
2881
2882 PULLUP_TO(hlen + sizeof(struct tcphdr));
2883 tcp = L3HDR(struct tcphdr, ip);
2884 dst_port = tcp->th_dport;
2885 src_port = tcp->th_sport;
2886 args->f_id.flags = tcp->th_flags;
984263bc
MD
2887 }
2888 break;
2889
2890 case IPPROTO_UDP:
50050193
SZ
2891 {
2892 struct udphdr *udp;
984263bc 2893
50050193
SZ
2894 PULLUP_TO(hlen + sizeof(struct udphdr));
2895 udp = L3HDR(struct udphdr, ip);
2896 dst_port = udp->uh_dport;
2897 src_port = udp->uh_sport;
984263bc
MD
2898 }
2899 break;
2900
2901 case IPPROTO_ICMP:
2902 PULLUP_TO(hlen + 4); /* type, code and checksum. */
2903 args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
2904 break;
2905
2906 default:
2907 break;
2908 }
984263bc
MD
2909 }
2910
2911 args->f_id.src_ip = ntohl(src_ip.s_addr);
2912 args->f_id.dst_ip = ntohl(dst_ip.s_addr);
2913 args->f_id.src_port = src_port = ntohs(src_port);
2914 args->f_id.dst_port = dst_port = ntohs(dst_port);
2915
2916after_ip_checks:
2917 if (args->rule) {
2918 /*
2919 * Packet has already been tagged. Look for the next rule
2920 * to restart processing.
2921 *
2922 * If fw_one_pass != 0 then just accept it.
2923 * XXX should not happen here, but optimized out in
2924 * the caller.
2925 */
6998b243 2926 if (fw_one_pass && !args->cont)
29b27cb7 2927 return IP_FW_PASS;
6998b243 2928 args->cont = 0;
984263bc 2929
ca12e259
SZ
2930 /* This rule is being/has been flushed */
2931 if (ipfw_flushing)
29b27cb7 2932 return IP_FW_DENY;
ca12e259 2933
1cc3831d
SZ
2934 KASSERT(args->rule->cpuid == cpuid,
2935 ("rule used on cpu%d", cpuid));
ca12e259 2936
84a3e25a
SZ
2937 /* This rule was deleted */
2938 if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
29b27cb7 2939 return IP_FW_DENY;
84a3e25a 2940
984263bc
MD
2941 f = args->rule->next_rule;
2942 if (f == NULL)
2943 f = lookup_next_rule(args->rule);
2944 } else {
2945 /*
2946 * Find the starting rule. It can be either the first
2947 * one, or the one after divert_rule if asked so.
2948 */
e5ecc832
JS
2949 int skipto;
2950
6998b243
SZ
2951 KKASSERT(!args->cont);
2952
e5ecc832 2953 mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
68edaf54
SZ
2954 if (mtag != NULL) {
2955 divinfo = m_tag_data(mtag);
2956 skipto = divinfo->skipto;
2957 } else {
e5ecc832 2958 skipto = 0;
68edaf54 2959 }
984263bc 2960
ca12e259 2961 f = ctx->ipfw_layer3_chain;
984263bc 2962 if (args->eh == NULL && skipto != 0) {
ca12e259
SZ
2963 /* No skipto during rule flushing */
2964 if (ipfw_flushing)
29b27cb7 2965 return IP_FW_DENY;
ca12e259 2966
984263bc 2967 if (skipto >= IPFW_DEFAULT_RULE)
29b27cb7 2968 return IP_FW_DENY; /* invalid */
ca12e259 2969
984263bc
MD
2970 while (f && f->rulenum <= skipto)
2971 f = f->next;
2972 if (f == NULL) /* drop packet */
29b27cb7 2973 return IP_FW_DENY;
ca12e259
SZ
2974 } else if (ipfw_flushing) {
2975 /* Rules are being flushed; skip to default rule */
2976 f = ctx->ipfw_default_rule;
984263bc
MD
2977 }
2978 }
e5ecc832
JS
2979 if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
2980 m_tag_delete(m, mtag);
984263bc
MD
2981
2982 /*
2983 * Now scan the rules, and parse microinstructions for each rule.
2984 */
2985 for (; f; f = f->next) {
2986 int l, cmdlen;
2987 ipfw_insn *cmd;
2988 int skip_or; /* skip rest of OR block */
2989
2990again:
ca12e259 2991 if (ctx->ipfw_set_disable & (1 << f->set))
984263bc
MD
2992 continue;
2993
2994 skip_or = 0;
50050193
SZ
2995 for (l = f->cmd_len, cmd = f->cmd; l > 0;
2996 l -= cmdlen, cmd += cmdlen) {
5cfb918a 2997 int match;
984263bc
MD
2998
2999 /*
3000 * check_body is a jump target used when we find a
3001 * CHECK_STATE, and need to jump to the body of
3002 * the target rule.
3003 */
3004
3005check_body:
3006 cmdlen = F_LEN(cmd);
3007 /*
3008 * An OR block (insn_1 || .. || insn_n) has the
3009 * F_OR bit set in all but the last instruction.
3010 * The first match will set "skip_or", and cause
3011 * the following instructions to be skipped until
3012 * past the one with the F_OR bit clear.
3013 */
3014 if (skip_or) { /* skip this instruction */
3015 if ((cmd->len & F_OR) == 0)
3016 skip_or = 0; /* next one is good */
3017 continue;
3018 }
3019 match = 0; /* set to 1 if we succeed */
3020
3021 switch (cmd->opcode) {
3022 /*
3023 * The first set of opcodes compares the packet's
3024 * fields with some pattern, setting 'match' if a
3025 * match is found. At the end of the loop there is
3026 * logic to deal with F_NOT and F_OR flags associated
3027 * with the opcode.
3028 */
3029 case O_NOP:
3030 match = 1;
3031 break;
3032
3033 case O_FORWARD_MAC:
4b1cf444 3034 kprintf("ipfw: opcode %d unimplemented\n",
50050193 3035 cmd->opcode);
984263bc
MD
3036 break;
3037
3038 case O_GID:
3039 case O_UID:
3040 /*
3041 * We only check offset == 0 && proto != 0,
3042 * as this ensures that we have an IPv4
3043 * packet with the ports info.
3044 */
3045 if (offset!=0)
3046 break;
50050193 3047
e5f2be89
SZ
3048 match = ipfw_match_uid(&args->f_id, oif,
3049 cmd->opcode,
5cfb918a 3050 (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
984263bc
MD
3051 break;
3052
3053 case O_RECV:
3054 match = iface_match(m->m_pkthdr.rcvif,
3055 (ipfw_insn_if *)cmd);
3056 break;
3057
3058 case O_XMIT:
3059 match = iface_match(oif, (ipfw_insn_if *)cmd);
3060 break;
3061
3062 case O_VIA:
3063 match = iface_match(oif ? oif :
3064 m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3065 break;
3066
3067 case O_MACADDR2:
3068 if (args->eh != NULL) { /* have MAC header */
a998c492 3069 uint32_t *want = (uint32_t *)
984263bc 3070 ((ipfw_insn_mac *)cmd)->addr;
a998c492 3071 uint32_t *mask = (uint32_t *)
984263bc 3072 ((ipfw_insn_mac *)cmd)->mask;
a998c492 3073 uint32_t *hdr = (uint32_t *)args->eh;
984263bc
MD
3074
3075 match =
50050193
SZ
3076 (want[0] == (hdr[0] & mask[0]) &&
3077 want[1] == (hdr[1] & mask[1]) &&
3078 want[2] == (hdr[2] & mask[2]));
984263bc
MD
3079 }
3080 break;
3081
3082 case O_MAC_TYPE:
3083 if (args->eh != NULL) {
a998c492 3084 uint16_t t =
984263bc 3085 ntohs(args->eh->ether_type);
a998c492 3086 uint16_t *p =
984263bc
MD
3087 ((ipfw_insn_u16 *)cmd)->ports;
3088 int i;
3089
60f3eea1
SZ
3090 /* Special vlan handling */
3091 if (m->m_flags & M_VLANTAG)
3092 t = ETHERTYPE_VLAN;
3093
50050193
SZ
3094 for (i = cmdlen - 1; !match && i > 0;
3095 i--, p += 2) {
3096 match =
3097 (t >= p[0] && t <= p[1]);
3098 }
984263bc
MD
3099 }
3100 break;
3101
3102 case O_FRAG:
3103 match = (hlen > 0 && offset != 0);
3104 break;
3105
cc9ef3d3
SZ
3106 case O_IPFRAG:
3107 if (hlen > 0) {
3108 uint16_t off;
3109
3110 if (args->eh != NULL)
3111 off = ntohs(ip->ip_off);
3112 else
3113 off = ip->ip_off;
3114 if (off & (IP_MF | IP_OFFMASK))
3115 match = 1;
3116 }
3117 break;
3118
984263bc
MD
3119 case O_IN: /* "out" is "not in" */
3120 match = (oif == NULL);
3121 break;
3122
3123 case O_LAYER2:
3124 match = (args->eh != NULL);
3125 break;
3126
3127 case O_PROTO:
3128 /*
3129 * We do not allow an arg of 0 so the
3130 * check of "proto" only suffices.
3131 */
3132 match = (proto == cmd->arg1);
3133 break;
3134
3135 case O_IP_SRC:
3136 match = (hlen > 0 &&
3137 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3138 src_ip.s_addr);
3139 break;
3140
3141 case O_IP_SRC_MASK:
3142 match = (hlen > 0 &&
3143 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3144 (src_ip.s_addr &
3145 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3146 break;
3147
3148 case O_IP_SRC_ME:
3149 if (hlen > 0) {
3150 struct ifnet *tif;
3151
f8983475 3152 tif = INADDR_TO_IFP(&src_ip);
984263bc
MD
3153 match = (tif != NULL);
3154 }
3155 break;
3156
d938108c
SZ
3157 case O_IP_SRC_TABLE:
3158 match = ipfw_table_lookup(ctx, cmd->arg1,
3159 &src_ip);
3160 break;
3161
984263bc
MD
3162 case O_IP_DST_SET:
3163 case O_IP_SRC_SET:
3164 if (hlen > 0) {
3f944588 3165 uint32_t *d = (uint32_t *)(cmd + 1);
a998c492 3166 uint32_t addr =
984263bc
MD
3167 cmd->opcode == O_IP_DST_SET ?
3168 args->f_id.dst_ip :
3169 args->f_id.src_ip;
3170
50050193
SZ
3171 if (addr < d[0])
3172 break;
3173 addr -= d[0]; /* subtract base */
3174 match =
3175 (addr < cmd->arg1) &&
3176 (d[1 + (addr >> 5)] &
3177 (1 << (addr & 0x1f)));
984263bc
MD
3178 }
3179 break;
3180
3181 case O_IP_DST:
3182 match = (hlen > 0 &&
3183 ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3184 dst_ip.s_addr);
3185 break;
3186
3187 case O_IP_DST_MASK:
3188 match = (hlen > 0) &&
3189 (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3190 (dst_ip.s_addr &
3191 ((ipfw_insn_ip *)cmd)->mask.s_addr));
3192 break;
3193
3194 case O_IP_DST_ME:
3195 if (hlen > 0) {
3196 struct ifnet *tif;
3197
f8983475 3198 tif = INADDR_TO_IFP(&dst_ip);
984263bc
MD
3199 match = (tif != NULL);
3200 }
3201 break;
3202
d938108c
SZ
3203 case O_IP_DST_TABLE:
3204 match = ipfw_table_lookup(ctx, cmd->arg1,
3205 &dst_ip);
3206 break;
3207
984263bc
MD
3208 case O_IP_SRCPORT:
3209 case O_IP_DSTPORT:
3210 /*
3211 * offset == 0 && proto != 0 is enough
3212 * to guarantee that we have an IPv4
3213 * packet with port info.
3214 */
3215 if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
3216 && offset == 0) {
a998c492 3217 uint16_t x =
984263bc
MD
3218 (cmd->opcode == O_IP_SRCPORT) ?
3219 src_port : dst_port ;
a998c492 3220 uint16_t *p =
984263bc
MD
3221 ((ipfw_insn_u16 *)cmd)->ports;
3222 int i;
3223
50050193
SZ
3224 for (i = cmdlen - 1; !match && i > 0;
3225 i--, p += 2) {
3226 match =
3227 (x >= p[0] && x <= p[1]);
3228 }
984263bc
MD
3229 }
3230 break;
3231
3232 case O_ICMPTYPE:
3233 match = (offset == 0 && proto==IPPROTO_ICMP &&
50050193 3234 icmptype_match(ip, (ipfw_insn_u32 *)cmd));
984263bc
MD
3235 break;
3236
3237 case O_IPOPT:
50050193 3238 match = (hlen > 0 && ipopts_match(ip, cmd));
984263bc
MD
3239 break;
3240
3241 case O_IPVER:
3242 match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3243 break;
3244
3245 case O_IPTTL:
3246 match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3247 break;
3248
3249 case O_IPID:
3250 match = (hlen > 0 &&
3251 cmd->arg1 == ntohs(ip->ip_id));
3252 break;
3253
3254 case O_IPLEN:
3255 match = (hlen > 0 && cmd->arg1 == ip_len);
3256 break;
3257
3258 case O_IPPRECEDENCE:
3259 match = (hlen > 0 &&
50050193 3260 (cmd->arg1 == (ip->ip_tos & 0xe0)));
984263bc
MD
3261 break;
3262
3263 case O_IPTOS:
3264 match = (hlen > 0 &&
3265 flags_match(cmd, ip->ip_tos));
3266 break;
3267
3268 case O_TCPFLAGS:
3269 match = (proto == IPPROTO_TCP && offset == 0 &&
3270 flags_match(cmd,
3271 L3HDR(struct tcphdr,ip)->th_flags));
3272 break;
3273
3274 case O_TCPOPTS:
3275 match = (proto == IPPROTO_TCP && offset == 0 &&
3276 tcpopts_match(ip, cmd));
3277 break;
3278
3279 case O_TCPSEQ:
3280 match = (proto == IPPROTO_TCP && offset == 0 &&
3281 ((ipfw_insn_u32 *)cmd)->d[0] ==
3282 L3HDR(struct tcphdr,ip)->th_seq);
3283 break;
3284
3285 case O_TCPACK:
3286 match = (proto == IPPROTO_TCP && offset == 0 &&
3287 ((ipfw_insn_u32 *)cmd)->d[0] ==
3288 L3HDR(struct tcphdr,ip)->th_ack);
3289 break;
3290
3291 case O_TCPWIN:
3292 match = (proto == IPPROTO_TCP && offset == 0 &&
3293 cmd->arg1 ==
3294 L3HDR(struct tcphdr,ip)->th_win);
3295 break;
3296
3297 case O_ESTAB:
3298 /* reject packets which have SYN only */
3299 /* XXX should i also check for TH_ACK ? */
3300 match = (proto == IPPROTO_TCP && offset == 0 &&
3301 (L3HDR(struct tcphdr,ip)->th_flags &
3302 (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3303 break;
3304
3305 case O_LOG:
2187815d
SZ
3306 if (fw_verbose) {
3307 ipfw_log(ctx, f, hlen, args->eh, m,
3308 oif);
3309 }
984263bc
MD
3310 match = 1;
3311 break;
3312
3313 case O_PROB:
cddfb7bb
MD
3314 match = (krandom() <
3315 ((ipfw_insn_u32 *)cmd)->d[0]);
984263bc
MD
3316 break;
3317
3318 /*
3319 * The second set of opcodes represents 'actions',
3320 * i.e. the terminal part of a rule once the packet
3321 * matches all previous patterns.
3322 * Typically there is only one action for each rule,
3323 * and the opcode is stored at the end of the rule
3324 * (but there are exceptions -- see below).
3325 *
3326 * In general, here we set retval and terminate the
3327 * outer loop (would be a 'break 3' in some language,
3328 * but we need to do a 'goto done').
3329 *
3330 * Exceptions:
3331 * O_COUNT and O_SKIPTO actions:
3332 * instead of terminating, we jump to the next rule
3333 * ('goto next_rule', equivalent to a 'break 2'),
3334 * or to the SKIPTO target ('goto again' after
3335 * having set f, cmd and l), respectively.
3336 *
3337 * O_LIMIT and O_KEEP_STATE: these opcodes are
3338 * not real 'actions', and are stored right
3339 * before the 'action' part of the rule.
3340 * These opcodes try to install an entry in the
3341 * state tables; if successful, we continue with
3342 * the next opcode (match=1; break;), otherwise
f5670523
SZ
3343 * the packet must be dropped ('goto done' after
3344 * setting retval). If static rules are changed
3345 * during the state installation, the packet will
29b27cb7
SZ
3346 * be dropped and rule's stats will not beupdated
3347 * ('return IP_FW_DENY').
984263bc
MD
3348 *
3349 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3350 * cause a lookup of the state table, and a jump
3351 * to the 'action' part of the parent rule
3352 * ('goto check_body') if an entry is found, or
3353 * (CHECK_STATE only) a jump to the next rule if
3354 * the entry is not found ('goto next_rule').
3355 * The result of the lookup is cached to make
 3356 * further instances of these opcodes
f5670523
SZ
3357 * effectively NOPs. If static rules are changed
3358 * during the state looking up, the packet will
29b27cb7
SZ
 3359 * be dropped and the rule's stats will not be updated
3360 * ('return IP_FW_DENY').
984263bc
MD
3361 */
3362 case O_LIMIT:
3363 case O_KEEP_STATE:
2187815d
SZ
3364 if (ipfw_state_install(ctx, f,
3365 (ipfw_insn_limit *)cmd, args,
3366 (offset == 0 && proto == IPPROTO_TCP) ?
3367 L3HDR(struct tcphdr, ip) : NULL)) {
29b27cb7 3368 retval = IP_FW_DENY;
984263bc
MD
3369 goto done; /* error/limit violation */
3370 }
3371 match = 1;
3372 break;
3373
3374 case O_PROBE_STATE:
3375 case O_CHECK_STATE:
3376 /*
2187815d
SZ
3377 * States are checked at the first keep-state
3378 * check-state occurrence, with the result
3379 * being stored in dyn_dir. The compiler
3380 * introduces a PROBE_STATE instruction for
3381 * us when we have a KEEP_STATE/LIMIT (because
3382 * PROBE_STATE needs to be run first).
984263bc 3383 */
997a0e9a 3384 if (dyn_dir == MATCH_UNKNOWN) {
2187815d
SZ
3385 dyn_f = ipfw_state_lookup_rule(ctx,
3386 &args->f_id, &dyn_dir,
3387 (offset == 0 &&
3388 proto == IPPROTO_TCP) ?
3389 L3HDR(struct tcphdr, ip) : NULL,
3390 ip_len);
997a0e9a
SZ
3391 if (dyn_f != NULL) {
3392 /*
2187815d
SZ
3393 * Found a rule from a state;
3394 * jump to the 'action' part
3395 * of the rule.
997a0e9a
SZ
3396 */
3397 f = dyn_f;
3398 cmd = ACTION_PTR(f);
3399 l = f->cmd_len - f->act_ofs;
3400 goto check_body;
3401 }
984263bc
MD
3402 }
3403 /*
2187815d
SZ
3404 * State not found. If CHECK_STATE, skip to
3405 * next rule, if PROBE_STATE just ignore and
3406 * continue with next opcode.
984263bc
MD
3407 */
3408 if (cmd->opcode == O_CHECK_STATE)
3409 goto next_rule;
3410 match = 1;
3411 break;
3412
3413 case O_ACCEPT:
29b27cb7 3414 retval = IP_FW_PASS; /* accept */
984263bc
MD
3415 goto done;
3416
6998b243
SZ
3417 case O_DEFRAG:
3418 if (f->cross_rules == NULL) {
3419 /*
3420 * This rule was not completely setup;
3421 * move on to the next rule.
3422 */
3423 goto next_rule;
3424 }
3425
3426 /*
3427 * Don't defrag for l2 packets, output packets
3428 * or non-fragments.
3429 */
3430 if (oif != NULL || args->eh != NULL ||
3431 (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
3432 goto next_rule;
3433
3434 ctx->ipfw_frags++;
3435 m = ip_reass(m);
3436 args->m = m;
3437 if (m == NULL) {
3438 retval = IP_FW_PASS;
3439 goto done;
3440 }
3441 ctx->ipfw_defraged++;
3442 KASSERT((m->m_flags & M_HASH) == 0,
3443 ("hash not cleared"));
3444
3445 /* Update statistics */
3446 f->pcnt++;
3447 f->bcnt += ip_len;
3448 f->timestamp = time_second;
3449
3450 ip = mtod(m, struct ip *);
3451 hlen = ip->ip_hl << 2;
3452 ip->ip_len += hlen;
3453
3454 ip->ip_len = htons(ip->ip_len);
3455 ip->ip_off = htons(ip->ip_off);
3456
3457 ip_hashfn(&m, 0);
3458 args->m = m;
3459 if (m == NULL)
3460 goto pullup_failed;
3461
3462 KASSERT(m->m_flags & M_HASH, ("no hash"));
3463 cpuid = netisr_hashcpu(m->m_pkthdr.hash);
3464 if (cpuid != mycpuid) {
3465 /*
3466 * NOTE:
3467 * ip_len/ip_off are in network byte
3468 * order.
3469 */
3470 ctx->ipfw_defrag_remote++;
3471 args->rule = f;
3472 return (IP_FW_CONTINUE);
3473 }
3474
3475 /* 'm' might be changed by ip_hashfn(). */
3476 ip = mtod(m, struct ip *);
3477 ip->ip_len = ntohs(ip->ip_len);
3478 ip->ip_off = ntohs(ip->ip_off);
3479
3480 ip_len = ip->ip_len;
3481 offset = 0;
3482 proto = args->f_id.proto = ip->ip_p;
3483
3484 switch (proto) {
3485 case IPPROTO_TCP:
3486 {
3487 struct tcphdr *tcp;
3488
3489 PULLUP_TO(hlen +
3490 sizeof(struct tcphdr));
3491 tcp = L3HDR(struct tcphdr, ip);
3492 dst_port = tcp->th_dport;
3493 src_port = tcp->th_sport;
3494 args->f_id.flags =
3495 tcp->th_flags;
3496 }
3497 break;
3498
3499 case IPPROTO_UDP:
3500 {
3501 struct udphdr *udp;
3502
3503 PULLUP_TO(hlen +
3504 sizeof(struct udphdr));
3505 udp = L3HDR(struct udphdr, ip);
3506 dst_port = udp->uh_dport;
3507 src_port = udp->uh_sport;
3508 }
3509 break;
3510
3511 case IPPROTO_ICMP:
3512 /* type, code and checksum. */
3513 PULLUP_TO(hlen + 4);
3514 args->f_id.flags =
3515 L3HDR(struct icmp, ip)->icmp_type;
3516 break;
3517
3518 default:
3519 break;
3520 }
3521 args->f_id.src_port = src_port =
3522 ntohs(src_port);
3523 args->f_id.dst_port = dst_port =
3524 ntohs(dst_port);
3525
3526 /* Move on. */
3527 goto next_rule;
3528
984263bc
MD
3529 case O_PIPE:
3530 case O_QUEUE:
3531 args->rule = f; /* report matching rule */
29b27cb7
SZ
3532 args->cookie = cmd->arg1;
3533 retval = IP_FW_DUMMYNET;
984263bc
MD
3534 goto done;
3535
3536 case O_DIVERT:
3537 case O_TEE:
3538 if (args->eh) /* not on layer 2 */
3539 break;
e5ecc832
JS
3540
3541 mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
aa612e18 3542 sizeof(*divinfo), M_INTWAIT | M_NULLOK);
e5ecc832 3543 if (mtag == NULL) {
29b27cb7 3544 retval = IP_FW_DENY;
e5ecc832
JS
3545 goto done;
3546 }
68edaf54
SZ
3547 divinfo = m_tag_data(mtag);
3548
3549 divinfo->skipto = f->rulenum;
3550 divinfo->port = cmd->arg1;
3551 divinfo->tee = (cmd->opcode == O_TEE);
e5ecc832 3552 m_tag_prepend(m, mtag);
68edaf54 3553
29b27cb7 3554 args->cookie = cmd->arg1;
984263bc 3555 retval = (cmd->opcode == O_DIVERT) ?
29b27cb7 3556 IP_FW_DIVERT : IP_FW_TEE;
984263bc
MD
3557 goto done;
3558
3559 case O_COUNT:
3560 case O_SKIPTO:
3561 f->pcnt++; /* update stats */
3562 f->bcnt += ip_len;
3563 f->timestamp = time_second;
3564 if (cmd->opcode == O_COUNT)
3565 goto next_rule;
3566 /* handle skipto */
3567 if (f->next_rule == NULL)
3568 lookup_next_rule(f);
3569 f = f->next_rule;
3570 goto again;
3571
3572 case O_REJECT:
3573 /*
3574 * Drop the packet and send a reject notice
3575 * if the packet is not ICMP (or is an ICMP
3576 * query), and it is not multicast/broadcast.
3577 */
3578 if (hlen > 0 &&
3579 (proto != IPPROTO_ICMP ||
3580 is_icmp_query(ip)) &&
3581 !(m->m_flags & (M_BCAST|M_MCAST)) &&
d0d5be30 3582 !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
984263bc 3583 send_reject(args, cmd->arg1,
79a7f216
SZ
3584 offset, ip_len);
3585 retval = IP_FW_DENY;
3586 goto done;
984263bc
MD
3587 }
3588 /* FALLTHROUGH */
3589 case O_DENY:
29b27cb7 3590 retval = IP_FW_DENY;
984263bc
MD
3591 goto done;
3592
3593 case O_FORWARD_IP:
3594 if (args->eh) /* not valid on layer2 pkts */
3595 break;
99216103 3596 if (!dyn_f || dyn_dir == MATCH_FORWARD) {
5de23090
SZ
3597 struct sockaddr_in *sin;
3598
3599 mtag = m_tag_get(PACKET_TAG_IPFORWARD,
aa612e18 3600 sizeof(*sin), M_INTWAIT | M_NULLOK);
5de23090 3601 if (mtag == NULL) {
29b27cb7 3602 retval = IP_FW_DENY;
5de23090
SZ
3603 goto done;
3604 }
3605 sin = m_tag_data(mtag);
3606
3607 /* Structure copy */
3608 *sin = ((ipfw_insn_sa *)cmd)->sa;
3609
3610 m_tag_prepend(m, mtag);
3611 m->m_pkthdr.fw_flags |=
3612 IPFORWARD_MBUF_TAGGED;
b7441d0c
MD
3613 m->m_pkthdr.fw_flags &=
3614 ~BRIDGE_MBUF_TAGGED;
50050193 3615 }
29b27cb7 3616 retval = IP_FW_PASS;
984263bc
MD
3617 goto done;
3618
3619 default:
ed20d0e3 3620 panic("-- unknown opcode %d", cmd->opcode);
984263bc
MD
3621 } /* end of switch() on opcodes */
3622
3623 if (cmd->len & F_NOT)
3624 match = !match;
3625
3626 if (match) {
3627 if (cmd->len & F_OR)
3628 skip_or = 1;
3629 } else {
3630 if (!(cmd->len & F_OR)) /* not an OR block, */
3631 break; /* try next rule */
3632 }
3633
3634 } /* end of inner for, scan opcodes */
3635
3636next_rule:; /* try next rule */
3637
3638 } /* end of outer for, scan rules */
4b1cf444 3639 kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
29b27cb7 3640 return IP_FW_DENY;
984263bc
MD
3641
3642done:
3643 /* Update statistics */
3644 f->pcnt++;
3645 f->bcnt += ip_len;
3646 f->timestamp = time_second;
3647 return retval;
3648
3649pullup_failed:
3650 if (fw_verbose)
4b1cf444 3651 kprintf("pullup failed\n");
29b27cb7 3652 return IP_FW_DENY;
6998b243
SZ
3653
3654#undef PULLUP_TO
984263bc
MD
3655}
3656
b089787f 3657static struct mbuf *
84a3e25a
SZ
3658ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
3659{
3660 struct m_tag *mtag;
3661 struct dn_pkt *pkt;
3662 ipfw_insn *cmd;
3663 const struct ipfw_flow_id *id;
3664 struct dn_flow_id *fid;
3665
f849e7f7
SZ
3666 M_ASSERTPKTHDR(m);
3667
aa612e18
SZ
3668 mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
3669 M_INTWAIT | M_NULLOK);
84a3e25a
SZ
3670 if (mtag == NULL) {
3671 m_freem(m);
b089787f 3672 return (NULL);
84a3e25a
SZ
3673 }
3674 m_tag_prepend(m, mtag);
3675
3676 pkt = m_tag_data(mtag);
3677 bzero(pkt, sizeof(*pkt));
3678
3679 cmd = fwa->rule->cmd + fwa->rule->act_ofs;
3680 if (cmd->opcode == O_LOG)
3681 cmd += F_LEN(cmd);
3682 KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
ed20d0e3 3683 ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
84a3e25a
SZ
3684
3685 pkt->dn_m = m;
3686 pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
3687 pkt->ifp = fwa->oif;
84a3e25a
SZ
3688 pkt->pipe_nr = pipe_nr;
3689
e5d90c37 3690 pkt->cpuid = mycpuid;
08b13ea9 3691 pkt->msgport = netisr_curport();
e5d90c37 3692
84a3e25a
SZ
3693 id = &fwa->f_id;
3694 fid = &pkt->id;
3695 fid->fid_dst_ip = id->dst_ip;
3696 fid->fid_src_ip = id->src_ip;
3697 fid->fid_dst_port = id->dst_port;
3698 fid->fid_src_port = id->src_port;
3699 fid->fid_proto = id->proto;
3700 fid->fid_flags = id->flags;
3701
3702 ipfw_ref_rule(fwa->rule);
3703 pkt->dn_priv = fwa->rule;
3704 pkt->dn_unref_priv = ipfw_unref_rule;
3705
3706 if (cmd->opcode == O_PIPE)
3707 pkt->dn_flags |= DN_FLAGS_IS_PIPE;
3708
84a3e25a 3709 m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
b089787f 3710 return (m);
84a3e25a
SZ
3711}
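/*
 * The dummynet tag built above carries everything ip_dummynet needs
 * to reinject the packet later: the flow id, pipe/queue number,
 * direction and target cpu/port, plus a referenced pointer to the
 * matching rule (dn_priv) so rule processing can continue after the
 * pipe; the reference is released through dn_unref_priv().
 */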
3712
984263bc
MD
3713/*
3714 * When a rule is added/deleted, clear the next_rule pointers in all rules.
3715 * These will be reconstructed on the fly as packets are matched.
984263bc
MD
3716 */
3717static void
ca12e259 3718ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
984263bc
MD
3719{
3720 struct ip_fw *rule;
3721
ca12e259 3722 for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
984263bc
MD
3723 rule->next_rule = NULL;
3724}
3725
b30401b3
SZ
3726static __inline void
3727ipfw_inc_static_count(struct ip_fw *rule)
3728{
ac5988d6 3729 /* Static rule's counts are updated only on CPU0 */
ca12e259 3730 KKASSERT(mycpuid == 0);
61224eb9 3731
b30401b3 3732 static_count++;
b78533e2 3733 static_ioc_len += IOC_RULESIZE(rule);
b30401b3
SZ
3734}
3735
9c4d5568
SZ
3736static __inline void
3737ipfw_dec_static_count(struct ip_fw *rule)
3738{
b78533e2 3739 int l = IOC_RULESIZE(rule);
9c4d5568 3740
ac5988d6 3741 /* Static rule's counts are updated only on CPU0 */
ca12e259 3742 KKASSERT(mycpuid == 0);
61224eb9 3743
ed20d0e3 3744 KASSERT(static_count > 0, ("invalid static count %u", static_count));
9c4d5568
SZ
3745 static_count--;
3746
b78533e2 3747 KASSERT(static_ioc_len >= l,
ed20d0e3 3748 ("invalid static len %u", static_ioc_len));
b78533e2
SZ
3749 static_ioc_len -= l;
3750}
3751
ca12e259
SZ
3752static void
3753ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
3754{
3755 if (fwmsg->sibling != NULL) {
3756 KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
3757 fwmsg->sibling->sibling = rule;
3758 }
3759 fwmsg->sibling = rule;
3760}
3761
b78533e2 3762static struct ip_fw *
2187815d 3763ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
b78533e2
SZ
3764{
3765 struct ip_fw *rule;
3766
3767 rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
3768
3769 rule->act_ofs = ioc_rule->act_ofs;
3770 rule->cmd_len = ioc_rule->cmd_len;
3771 rule->rulenum = ioc_rule->rulenum;
3772 rule->set = ioc_rule->set;
3773 rule->usr_flags = ioc_rule->usr_flags;
3774
3775 bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
3776
84a3e25a 3777 rule->refcnt = 1;
ca12e259 3778 rule->cpuid = mycpuid;
2187815d 3779 rule->rule_flags = rule_flags;
84a3e25a 3780
b78533e2 3781 return rule;
9c4d5568
SZ
3782}
3783
ca12e259 3784static void
002c1265